1 
    1 





    1 
    1 
    1 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LOCAL_LOCK_H
# error "Do not include directly, include linux/local_lock.h"
#endif

#include <linux/percpu-defs.h>
#include <linux/lockdep.h>

/*
 * Local lock type, used through this_cpu_ptr() by the __local_lock*()
 * macros below.  Without CONFIG_DEBUG_LOCK_ALLOC the structure has no
 * members; with it, the lock carries the state used for debugging.
 */
typedef struct {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* lockdep state passed to lock_map_acquire()/lock_map_release() */
        struct lockdep_map        dep_map;
        /* task recorded by local_lock_acquire(); NULL while not held */
        struct task_struct        *owner;
#endif
} local_lock_t;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Designated-initializer fragment for the debug members of local_lock_t.
 * The lockdep map is named after the stringified @lockname, uses
 * LD_WAIT_CONFIG as inner wait type and LD_LOCK_PERCPU as lock type, and
 * the owner starts out as NULL.  The expansion ends in a comma so it can
 * be dropped straight into a braced initializer (see INIT_LOCAL_LOCK()).
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)                \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_CONFIG,        \
                .lock_type = LD_LOCK_PERCPU,                \
        },                                                \
        .owner = NULL,

/*
 * Debug bookkeeping for taking a local lock: report the acquisition to
 * lockdep, warn if an owner is already recorded, then record the current
 * task as owner.
 */
static inline void local_lock_acquire(local_lock_t *l)
{
        lock_map_acquire(&l->dep_map);
        /* A non-NULL owner here means the lock is being taken while held. */
        DEBUG_LOCKS_WARN_ON(l->owner);
        l->owner = current;
}

/*
 * Debug bookkeeping for dropping a local lock: warn if the recorded owner
 * is not the current task, clear the owner, then report the release to
 * lockdep.
 */
static inline void local_lock_release(local_lock_t *l)
{
        DEBUG_LOCKS_WARN_ON(l->owner != current);
        l->owner = NULL;
        lock_map_release(&l->dep_map);
}

/*
 * Reset the debug owner of @l to NULL.  Called from __local_lock_init()
 * after the lockdep map has been initialized.
 */
static inline void local_lock_debug_init(local_lock_t *l)
{
        l->owner = NULL;
}
#else /* CONFIG_DEBUG_LOCK_ALLOC */
/*
 * !CONFIG_DEBUG_LOCK_ALLOC: local_lock_t has no members, so the
 * initializer fragment expands to nothing and the debug helpers are
 * empty inline functions.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Static initializer for a local_lock_t: a braced initializer holding the
 * debug fragment, which is empty when CONFIG_DEBUG_LOCK_ALLOC is off.
 */
#define INIT_LOCAL_LOCK(lockname)        { LOCAL_LOCK_DEBUG_INIT(lockname) }

/*
 * Run-time initializer for the local_lock_t that @lock points to.
 *
 * Calls debug_check_no_locks_freed() on the lock's memory, initializes the
 * lockdep map with a key that is static to each expansion site and with
 * the stringified @lock expression as its name, then resets the debug
 * owner through local_lock_debug_init().
 *
 * @lock is parenthesized wherever it is used as an operand so that a
 * compound argument expression is cast and dereferenced as a whole.
 */
#define __local_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        debug_check_no_locks_freed((void *)(lock), sizeof(*(lock)));\
        lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
                              0, LD_WAIT_CONFIG, LD_WAIT_INV,        \
                              LD_LOCK_PERCPU);                        \
        local_lock_debug_init(lock);                                \
} while (0)

/*
 * Take @lock: disable preemption, then run the acquire bookkeeping on
 * this CPU's instance of the lock.
 */
#define __local_lock(lock)                                        \
        do {                                                        \
                preempt_disable();                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/*
 * Take @lock with interrupts off: disable local interrupts, then run the
 * acquire bookkeeping on this CPU's instance of the lock.
 */
#define __local_lock_irq(lock)                                        \
        do {                                                        \
                local_irq_disable();                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/*
 * Take @lock with interrupts off, saving the previous interrupt state in
 * @flags via local_irq_save(); then run the acquire bookkeeping on this
 * CPU's instance of the lock.
 */
#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                local_irq_save(flags);                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/*
 * Drop @lock: run the release bookkeeping on this CPU's instance of the
 * lock, then re-enable preemption (the reverse order of __local_lock()).
 */
#define __local_unlock(lock)                                        \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                preempt_enable();                                \
        } while (0)

/*
 * Drop @lock: run the release bookkeeping on this CPU's instance of the
 * lock, then enable local interrupts unconditionally.
 */
#define __local_unlock_irq(lock)                                \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                local_irq_enable();                                \
        } while (0)

/*
 * Drop @lock: run the release bookkeeping on this CPU's instance of the
 * lock, then restore the interrupt state held in @flags.
 */
#define __local_unlock_irqrestore(lock, flags)                        \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                local_irq_restore(flags);                        \
        } while (0)
























































































































































































































    1 



    1 





    1 


    1 

    1 












































































































    1 






























































































































































































































































































    1 




    1 










    1 



    1 



    1 





























































































































































































































    1 





    1 
    1 



































































    1 


    1 


































































































    1 










    1 








    1 

















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/admin-guide/sysctl/vm.rst.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/*
 * How many pages do we try to swap or page in/out together?
 * NOTE(review): defined without an initializer here; it is presumably set
 * elsewhere - not visible in this part of the file.
 */
int page_cluster;

/*
 * Per-CPU pagevec of pages queued by rotate_reclaimable_page().
 * @lock protects only @pvec and is taken with interrupts disabled
 * (local_lock_irqsave()).
 */
struct lru_rotate {
        local_lock_t lock;
        struct pagevec pvec;
};
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * The following pagevecs are grouped together because they share one
 * lock: @lock is taken with local_lock(), i.e. they are protected by
 * disabling preemption while interrupts remain enabled.
 */
struct lru_pvecs {
        local_lock_t lock;
        /* pages queued by lru_cache_add() */
        struct pagevec lru_add;
        /* drained through lru_deactivate_file_fn() */
        struct pagevec lru_deactivate_file;
        /* drained through lru_deactivate_fn() */
        struct pagevec lru_deactivate;
        /* drained through lru_lazyfree_fn() */
        struct pagevec lru_lazyfree;
#ifdef CONFIG_SMP
        /* pages queued by activate_page(); only batched on SMP builds */
        struct pagevec activate_page;
#endif
};
static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * Take @page off its LRU list, if it is on one, and clear its waiters
 * flag.
 *
 * VM activity normally frees pages through pagevecs, so this path is
 * almost never taken there; networking is what uses it.
 */
static void __page_cache_release(struct page *page)
{
        pg_data_t *node;
        struct lruvec *lruvec;
        unsigned long irqflags;

        if (PageLRU(page)) {
                node = page_pgdat(page);

                /* The LRU list is manipulated under the node's lru_lock. */
                spin_lock_irqsave(&node->lru_lock, irqflags);
                lruvec = mem_cgroup_page_lruvec(page, node);
                VM_BUG_ON_PAGE(!PageLRU(page), page);
                __ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, page_off_lru(page));
                spin_unlock_irqrestore(&node->lru_lock, irqflags);
        }

        __ClearPageWaiters(page);
}

/*
 * Release a non-compound page: detach it from the LRU, uncharge it from
 * its memory cgroup and hand it to free_unref_page().
 */
static void __put_single_page(struct page *page)
{
        __page_cache_release(page);
        mem_cgroup_uncharge(page);
        free_unref_page(page);
}

/*
 * Release a compound page: detach it from the LRU unless it is a hugetlb
 * page, then destroy it via destroy_compound_page().
 */
static void __put_compound_page(struct page *page)
{
        /*
         * __page_cache_release() is meant to be called for THP, not for
         * hugetlb.  A hugetlb page never has PageLRU set (it is never put
         * on any LRU list), and no memcg routines should be called for
         * hugetlb (it has a separate hugetlb_cgroup).
         */
        if (!PageHuge(page))
                __page_cache_release(page);
        destroy_compound_page(page);
}

/*
 * Release @page, dispatching on its kind: ZONE_DEVICE pages only drop
 * their pgmap, compound pages go through __put_compound_page() and
 * everything else through __put_single_page().
 * NOTE(review): presumably reached once the last reference is gone - the
 * caller is not visible here.
 */
void __put_page(struct page *page)
{
        /*
         * A ZONE_DEVICE page belongs to the device that created its
         * pgmap, so it must not be returned to the page allocator.
         */
        if (is_zone_device_page(page)) {
                put_dev_pagemap(page->pgmap);
                return;
        }

        if (likely(!PageCompound(page))) {
                __put_single_page(page);
                return;
        }

        __put_compound_page(page);
}
EXPORT_SYMBOL(__put_page);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Unlink every page strung together on @pages through page.lru and drop
 * one reference to it; @pages is empty on return.  Currently used by
 * read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
        while (!list_empty(pages)) {
                struct page *page = lru_to_page(pages);

                /* Unlink before dropping the reference. */
                list_del(&page->lru);
                put_page(page);
        }
}
EXPORT_SYMBOL(put_pages_list);

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:        An array of struct kvec structures
 * @nr_segs:        number of segments to pin
 * @write:        pinning for read/write, currently ignored
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_segs long.
 *
 * Every segment must be exactly PAGE_SIZE long; the first one that is not
 * triggers a warning and stops the walk.
 *
 * Returns the number of pages pinned, which may be fewer than the number
 * requested.  If nr_segs is 0 or negative, returns 0.  Each page returned
 * must be released with a put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
                struct page **pages)
{
        int pinned = 0;

        while (pinned < nr_segs) {
                const struct kvec *vec = &kiov[pinned];

                if (WARN_ON(vec->iov_len != PAGE_SIZE))
                        break;

                pages[pinned] = kmap_to_page(vec->iov_base);
                get_page(pages[pinned]);
                pinned++;
        }

        return pinned;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:        starting kernel address
 * @write:        pinning for read/write, currently ignored
 * @pages:        array that receives the pointer to the page pinned.
 *                Must have room for at least one entry.
 *
 * Wraps @start in a single PAGE_SIZE kvec and passes it to
 * get_kernel_pages(), whose return value is returned unchanged: 1 when
 * the page was pinned.  The page returned must be released with a
 * put_page() call when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
        const struct kvec kiov = {
                .iov_base = (void *)start,
                .iov_len = PAGE_SIZE
        };

        return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);

/*
 * Run @move_fn on every page in @pvec, then release the pages and reset
 * the pagevec.
 *
 * @move_fn is called with the lru_lock of the page's node held and
 * interrupts saved/disabled; it receives the page, the lruvec looked up
 * for it and the opaque @arg.
 */
static void pagevec_lru_move_fn(struct pagevec *pvec,
        void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
        void *arg)
{
        int i;
        /* Node whose lru_lock is currently held; NULL while none is. */
        struct pglist_data *pgdat = NULL;
        struct lruvec *lruvec;
        unsigned long flags = 0;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct pglist_data *pagepgdat = page_pgdat(page);

                /*
                 * Only switch locks when this page lives on a different
                 * node than the previous one, so that consecutive pages of
                 * the same node share one lock hold.
                 */
                if (pagepgdat != pgdat) {
                        if (pgdat)
                                spin_unlock_irqrestore(&pgdat->lru_lock, flags);
                        pgdat = pagepgdat;
                        spin_lock_irqsave(&pgdat->lru_lock, flags);
                }

                lruvec = mem_cgroup_page_lruvec(page, pgdat);
                (*move_fn)(page, lruvec, arg);
        }
        /* Drop the lock of the last node visited, if any page was seen. */
        if (pgdat)
                spin_unlock_irqrestore(&pgdat->lru_lock, flags);
        /* Pages are released only after the lru_lock has been dropped. */
        release_pages(pvec->pages, pvec->nr);
        pagevec_reinit(pvec);
}

/*
 * pagevec_lru_move_fn() callback: move @page to the tail of its LRU list
 * with the active flag cleared.  @arg points to an int that accumulates
 * the number of base pages moved.  Pages that are not on the LRU, or that
 * are unevictable, are left alone.
 */
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
                                 void *arg)
{
        int *moved = arg;

        if (!PageLRU(page) || PageUnevictable(page))
                return;

        del_page_from_lru_list(page, lruvec, page_lru(page));
        ClearPageActive(page);
        /* page_lru() is looked up again now that the active flag is clear. */
        add_page_to_lru_list_tail(page, lruvec, page_lru(page));
        *moved += thp_nr_pages(page);
}

/*
 * Move every page in @pvec to the tail of its LRU list and account the
 * number of pages moved as PGROTATED events.
 *
 * pagevec_move_tail() must be called with IRQs disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
        /* Filled in by pagevec_move_tail_fn() through the opaque argument. */
        int pgmoved = 0;

        pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
        __count_vm_events(PGROTATED, pgmoved);
}

/*
 * Writeback is about to end against a page which has been marked for
 * immediate reclaim.  If it still looks reclaimable - not locked, not
 * dirty, not unevictable and on the LRU - queue it on this CPU's rotate
 * pagevec so that it ends up at the tail of the inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
        struct pagevec *pvec;
        unsigned long flags;

        if (PageLocked(page) || PageDirty(page) ||
            PageUnevictable(page) || !PageLRU(page))
                return;

        /* The pagevec holds its own reference to the page. */
        get_page(page);
        local_lock_irqsave(&lru_rotate.lock, flags);
        pvec = this_cpu_ptr(&lru_rotate.pvec);
        /*
         * Drain right away when pagevec_add() returns zero (presumably:
         * no room left - confirm) or when the page is compound.
         */
        if (!pagevec_add(pvec, page) || PageCompound(page))
                pagevec_move_tail(pvec);
        local_unlock_irqrestore(&lru_rotate.lock, flags);
}

/*
 * Charge @nr_pages of cost to @lruvec and to each of its ancestors as
 * returned by parent_lruvec(): to file_cost when @file is true, to
 * anon_cost otherwise.
 * NOTE(review): no locking is visible in this function; callers
 * presumably provide it - confirm.
 */
void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
        do {
                unsigned long lrusize;

                /* Record cost event */
                if (file)
                        lruvec->file_cost += nr_pages;
                else
                        lruvec->anon_cost += nr_pages;

                /*
                 * Decay previous events
                 *
                 * Because workloads change over time (and to avoid
                 * overflow) we keep these statistics as a floating
                 * average, which ends up weighing recent refaults
                 * more than old ones.
                 */
                lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
                          lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
                          lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
                          lruvec_page_state(lruvec, NR_ACTIVE_FILE);

                /* Halve both counters once their sum exceeds a quarter of the LRU size. */
                if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
                        lruvec->file_cost /= 2;
                        lruvec->anon_cost /= 2;
                }
        } while ((lruvec = parent_lruvec(lruvec)));
}

void lru_note_cost_page(struct page *page)
{
        lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
                      page_is_file_lru(page), thp_nr_pages(page));
}

/*
 * pagevec_lru_move_fn() callback: move @page from its inactive LRU list
 * to the matching active one and account PGACTIVATE events.  Does nothing
 * unless the page is on the LRU, not yet active and not unevictable.
 * @arg is unused.
 */
static void __activate_page(struct page *page, struct lruvec *lruvec,
                            void *arg)
{
        int base, nr_pages;

        if (!PageLRU(page) || PageActive(page) || PageUnevictable(page))
                return;

        base = page_lru_base_type(page);
        nr_pages = thp_nr_pages(page);

        del_page_from_lru_list(page, lruvec, base);
        SetPageActive(page);
        add_page_to_lru_list(page, lruvec, base + LRU_ACTIVE);
        trace_mm_lru_activate(page);

        __count_vm_events(PGACTIVATE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}

#ifdef CONFIG_SMP
/*
 * Flush the activate_page pagevec of @cpu, activating every page queued
 * on it.  An empty pagevec is left untouched.
 */
static void activate_page_drain(int cpu)
{
        struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);

        if (!pagevec_count(pvec))
                return;

        pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

/* Tell whether @cpu has pages queued on its activate_page pagevec. */
static bool need_activate_page_drain(int cpu)
{
        struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);

        return pagevec_count(pvec) != 0;
}

/*
 * SMP variant: queue the head page of @page on this CPU's activate_page
 * pagevec instead of activating it on the spot.  Pages that are not on
 * the LRU, are already active, or are unevictable are ignored.
 */
static void activate_page(struct page *page)
{
        struct pagevec *pvec;

        page = compound_head(page);
        if (!PageLRU(page) || PageActive(page) || PageUnevictable(page))
                return;

        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.activate_page);
        /* The pagevec holds its own reference to the page. */
        get_page(page);
        /*
         * Drain right away when pagevec_add() returns zero (presumably:
         * no room left - confirm) or when the page is compound.
         */
        if (!pagevec_add(pvec, page) || PageCompound(page))
                pagevec_lru_move_fn(pvec, __activate_page, NULL);
        local_unlock(&lru_pvecs.lock);
}

#else
/*
 * !CONFIG_SMP: there is no activate_page pagevec (activate_page() works
 * on the LRU directly), so there is nothing to drain.
 */
static inline void activate_page_drain(int cpu)
{
}

/*
 * !CONFIG_SMP variant: no batching; activate the head page of @page
 * immediately under the node's lru_lock with interrupts disabled.  The
 * eligibility checks are done by __activate_page() itself.
 */
static void activate_page(struct page *page)
{
        /* The node is looked up from @page as passed in, before compound_head(). */
        pg_data_t *pgdat = page_pgdat(page);

        page = compound_head(page);
        spin_lock_irq(&pgdat->lru_lock);
        __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
        spin_unlock_irq(&pgdat->lru_lock);
}
#endif

/*
 * Mark @page active if it is still sitting on this CPU's lru_add pagevec,
 * so that it is treated as active when that pagevec is drained.  Nothing
 * happens when the page is not found there.
 */
static void __lru_cache_activate_page(struct page *page)
{
        struct pagevec *pvec;
        int idx;

        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_add);

        /*
         * Walk from the newest entry backwards, on the optimistic
         * assumption that the page being activated has only just been
         * added to this pagevec.  Only the local pagevec is examined: a
         * !PageLRU page could be in the process of being released,
         * reclaimed or migrated, or sit on a remote pagevec that is
         * currently being drained.  Furthermore, marking a page on a
         * remote pagevec PageActive could race with that page being added
         * to the inactive list, causing accounting errors and BUG_ON
         * checks to trigger.
         */
        idx = pagevec_count(pvec);
        while (idx-- > 0) {
                if (pvec->pages[idx] != page)
                        continue;

                SetPageActive(page);
                break;
        }

        local_unlock(&lru_pvecs.lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced        ->        inactive,referenced
 * inactive,referenced                ->        active,unreferenced
 * active,unreferenced                ->        active,referenced
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
        page = compound_head(page);

        if (!PageReferenced(page)) {
                SetPageReferenced(page);
        } else if (!PageUnevictable(page) && !PageActive(page)) {
                /*
                 * Unevictable pages are skipped: they sit on the
                 * "LRU_UNEVICTABLE" list, which is never rotated or
                 * maintained, so marking an unevictable page accessed
                 * would have no effect.
                 *
                 * For the rest: if the page is on the LRU, queue it for
                 * activation via lru_pvecs.activate_page.  Otherwise,
                 * assume the page is on a pagevec, mark it active and
                 * it'll be moved to the active LRU on the next drain.
                 */
                if (PageLRU(page))
                        activate_page(page);
                else
                        __lru_cache_activate_page(page);
                ClearPageReferenced(page);
                workingset_activation(page);
        }

        if (page_is_idle(page))
                clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives the caller of lru_cache_add() a chance to
 * have the page added to the active list using mark_page_accessed().
 */
void lru_cache_add(struct page *page)
{
        struct pagevec *pvec;

        VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
        VM_BUG_ON_PAGE(PageLRU(page), page);

        /* The pagevec holds its own reference to the page. */
        get_page(page);
        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_add);
        /*
         * Drain right away when pagevec_add() returns zero (presumably:
         * no room left - confirm) or when the page is compound.
         */
        if (!pagevec_add(pvec, page) || PageCompound(page))
                __pagevec_lru_add(pvec);
        local_unlock(&lru_pvecs.lock);
}
EXPORT_SYMBOL(lru_cache_add);

/**
 * lru_cache_add_inactive_or_unevictable
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the inactive or unevictable LRU list, depending on its
 * evictability.
 */
void lru_cache_add_inactive_or_unevictable(struct page *page,
                                         struct vm_area_struct *vma)
{
        bool unevictable;

        VM_BUG_ON_PAGE(PageLRU(page), page);

        /* Unevictable means: VM_LOCKED is set and no VM_SPECIAL bit is. */
        unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
        /* Account the page only if this call is the one that sets Mlocked. */
        if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
                int nr_pages = thp_nr_pages(page);
                /*
                 * We use the irq-unsafe __mod_zone_page_state() because
                 * this counter is not modified from interrupt context, and
                 * the pte lock (a spinlock) is held, which implies that
                 * preemption is disabled.
                 */
                __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
                count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
        }
        lru_cache_add(page);
}

/*
 * If the page can not be invalidated, it is moved to the inactive list
 * to speed up its reclaim.  A dirty or writeback page goes to the head of
 * the list rather than the tail, to give the flusher threads some time
 * to write it out, as this is much more effective than the single-page
 * writeout from reclaim.
 *
 * If the page isn't page_mapped and is dirty/writeback, it can be
 * reclaimed asap using PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page moves to the head of the inactive list because the
 * VM expects it to be written out by the flusher threads, which is much
 * more effective than the single-page writeout from reclaim.
 *
 * This is a pagevec_lru_move_fn() callback; @arg is unused.
 */
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
                              void *arg)
{
        int base;
        bool was_active;
        int nr_pages = thp_nr_pages(page);

        /*
         * Leave alone pages that are off the LRU, unevictable, or still
         * mapped (some processes are using the page).
         */
        if (!PageLRU(page) || PageUnevictable(page) || page_mapped(page))
                return;

        was_active = PageActive(page);
        base = page_lru_base_type(page);

        del_page_from_lru_list(page, lruvec, base + was_active);
        ClearPageActive(page);
        ClearPageReferenced(page);

        if (PageWriteback(page) || PageDirty(page)) {
                /*
                 * Setting PG_reclaim can race with end_page_writeback(),
                 * which can confuse readahead.  The race window is really
                 * small, though, and the problem is not critical.
                 */
                add_page_to_lru_list(page, lruvec, base);
                SetPageReclaim(page);
        } else {
                /*
                 * The page's writeback finished while it was sitting on
                 * the pagevec, so move it to the tail of the inactive
                 * list.
                 */
                add_page_to_lru_list_tail(page, lruvec, base);
                __count_vm_events(PGROTATED, nr_pages);
        }

        if (!was_active)
                return;

        __count_vm_events(PGDEACTIVATE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}

/*
 * pagevec_lru_move_fn() callback: move an active, evictable LRU page to
 * the matching inactive list, clearing its active and referenced flags,
 * and account PGDEACTIVATE events.  Other pages are left alone.  @arg is
 * unused.
 */
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
                            void *arg)
{
        int base, nr_pages;

        if (!PageLRU(page) || !PageActive(page) || PageUnevictable(page))
                return;

        base = page_lru_base_type(page);
        nr_pages = thp_nr_pages(page);

        del_page_from_lru_list(page, lruvec, base + LRU_ACTIVE);
        ClearPageActive(page);
        ClearPageReferenced(page);
        add_page_to_lru_list(page, lruvec, base);

        __count_vm_events(PGDEACTIVATE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}

/*
 * pagevec_lru_move_fn() callback: turn an anonymous, swap-backed LRU page
 * that is neither in the swap cache nor unevictable into a lazyfree page
 * by moving it to the inactive file list, and account PGLAZYFREE events.
 * Other pages are left alone.  @arg is unused.
 */
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
                            void *arg)
{
        bool was_active;
        int nr_pages;

        if (!PageLRU(page) || !PageAnon(page) || !PageSwapBacked(page) ||
            PageSwapCache(page) || PageUnevictable(page))
                return;

        was_active = PageActive(page);
        nr_pages = thp_nr_pages(page);

        del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + was_active);
        ClearPageActive(page);
        ClearPageReferenced(page);
        /*
         * Lazyfree pages are clean anonymous pages.  Their PG_swapbacked
         * flag is cleared to tell them apart from normal anonymous pages.
         */
        ClearPageSwapBacked(page);
        add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);

        __count_vm_events(PGLAZYFREE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 *
 * The pagevecs are flushed in this order: lru_add, lru_rotate,
 * lru_deactivate_file, lru_deactivate, lru_lazyfree and finally the
 * activate_page pagevec (a no-op on !CONFIG_SMP).  Empty pagevecs are
 * skipped.
 */
void lru_add_drain_cpu(int cpu)
{
        struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);

        if (pagevec_count(pvec))
                __pagevec_lru_add(pvec);

        pvec = &per_cpu(lru_rotate.pvec, cpu);
        /* Disabling interrupts below acts as a compiler barrier. */
        /* The count is peeked at without the lock, hence data_race(). */
        if (data_race(pagevec_count(pvec))) {
                unsigned long flags;

                /* No harm done if a racing interrupt already did this */
                local_lock_irqsave(&lru_rotate.lock, flags);
                pagevec_move_tail(pvec);
                local_unlock_irqrestore(&lru_rotate.lock, flags);
        }

        pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

        pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);

        pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);

        activate_page_drain(cpu);
}

/**
 * deactivate_file_page - forcefully deactivate a file page
 * @page: page to deactivate
 *
 * Hint to the VM that @page is a good candidate for reclaim, e.g. because
 * invalidating it failed as the page was dirty or under writeback.
 *
 * The page is queued on this CPU's lru_deactivate_file pagevec with an
 * extra reference; the pagevec is processed immediately when it has no
 * room left or when @page is a compound page.
 */
void deactivate_file_page(struct page *page)
{
        struct pagevec *pvec;

        /*
         * In a workload with many unevictable pages (such as mprotect),
         * deactivating unevictable pages to accelerate reclaim is
         * pointless.
         */
        if (PageUnevictable(page))
                return;

        /* The page is already on its way to being freed. */
        if (unlikely(!get_page_unless_zero(page)))
                return;

        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);

        if (!pagevec_add(pvec, page) || PageCompound(page))
                pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
        local_unlock(&lru_pvecs.lock);
}

/*
 * deactivate_page - deactivate a page
 * @page: page to deactivate
 *
 * If @page is on the active list and is not unevictable, queue it (with an
 * extra reference) on this CPU's lru_deactivate pagevec so that it gets
 * moved to the inactive list.  This is done to accelerate the reclaim of
 * @page.  The pagevec is processed right away when it has no room left or
 * when @page is a compound page.
 */
void deactivate_page(struct page *page)
{
        struct pagevec *pvec;

        if (!PageLRU(page) || !PageActive(page) || PageUnevictable(page))
                return;

        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
        get_page(page);
        if (!pagevec_add(pvec, page) || PageCompound(page))
                pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
        local_unlock(&lru_pvecs.lock);
}

/**
 * mark_page_lazyfree - make an anon page lazyfree
 * @page: page to mark as lazyfree
 *
 * mark_page_lazyfree() queues @page so that it is moved to the inactive
 * file list.  This is done to accelerate the reclaim of @page.  Only LRU
 * pages that are anonymous, swap-backed, outside the swap cache and not
 * unevictable are queued; anything else is left alone.
 */
void mark_page_lazyfree(struct page *page)
{
        struct pagevec *pvec;

        if (!PageLRU(page) || !PageAnon(page) || !PageSwapBacked(page) ||
            PageSwapCache(page) || PageUnevictable(page))
                return;

        local_lock(&lru_pvecs.lock);
        pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
        get_page(page);
        /* Process at once if the pagevec has no room left or @page is compound. */
        if (!pagevec_add(pvec, page) || PageCompound(page))
                pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
        local_unlock(&lru_pvecs.lock);
}

/*
 * Drain the current CPU's pagevecs: takes the lru_pvecs local lock and
 * calls lru_add_drain_cpu() for the CPU we are running on.
 */
void lru_add_drain(void)
{
        local_lock(&lru_pvecs.lock);
        lru_add_drain_cpu(smp_processor_id());
        local_unlock(&lru_pvecs.lock);
}

/*
 * Like lru_add_drain(), but additionally calls drain_local_pages() for
 * @zone while the lru_pvecs local lock is still held.
 */
void lru_add_drain_cpu_zone(struct zone *zone)
{
        local_lock(&lru_pvecs.lock);
        lru_add_drain_cpu(smp_processor_id());
        drain_local_pages(zone);
        local_unlock(&lru_pvecs.lock);
}

#ifdef CONFIG_SMP

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

/*
 * Work function queued by lru_add_drain_all() on each CPU that has pending
 * pagevec entries; it simply drains the local CPU.  @dummy is unused.
 */
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
        lru_add_drain();
}

/*
 * Drain the LRU pagevecs of every online CPU that has something queued,
 * by scheduling lru_add_drain_per_cpu() on mm_percpu_wq and waiting for
 * the queued work items to finish.
 *
 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
 * kworkers being shut down before our page_alloc_cpu_dead callback is
 * executed on the offlined cpu.
 * Calling this function with cpu hotplug locks held can actually lead
 * to obscure indirect dependencies via WQ context.
 */
void lru_add_drain_all(void)
{
        /*
         * lru_drain_gen - Global pages generation number
         *
         * (A) Definition: global lru_drain_gen = x implies that all generations
         *     0 < n <= x are already *scheduled* for draining.
         *
         * This is an optimization for the highly-contended use case where a
         * user space workload keeps constantly generating a flow of pages for
         * each CPU.
         */
        static unsigned int lru_drain_gen;
        static struct cpumask has_work;
        static DEFINE_MUTEX(lock);
        unsigned cpu, this_gen;

        /*
         * Make sure nobody triggers this path before mm_percpu_wq is fully
         * initialized.
         */
        if (WARN_ON(!mm_percpu_wq))
                return;

        /*
         * Guarantee pagevec counter stores visible by this CPU are visible to
         * other CPUs before loading the current drain generation.
         */
        smp_mb();

        /*
         * (B) Locally cache global LRU draining generation number
         *
         * The read barrier ensures that the counter is loaded before the mutex
         * is taken. It pairs with smp_mb() inside the mutex critical section
         * at (D).
         */
        this_gen = smp_load_acquire(&lru_drain_gen);

        mutex_lock(&lock);

        /*
         * (C) Exit the draining operation if a newer generation, from another
         * lru_add_drain_all(), was already scheduled for draining. Check (A).
         */
        if (unlikely(this_gen != lru_drain_gen))
                goto done;

        /*
         * (D) Increment global generation number
         *
         * Pairs with smp_load_acquire() at (B), outside of the critical
         * section. Use a full memory barrier to guarantee that the new global
         * drain generation number is stored before loading pagevec counters.
         *
         * This pairing must be done here, before the for_each_online_cpu loop
         * below which drains the page vectors.
         *
         * Let x, y, and z represent some system CPU numbers, where x < y < z.
         * Assume CPU #z is in the middle of the for_each_online_cpu loop
         * below and has already reached CPU #y's per-cpu data. CPU #x comes
         * along, adds some pages to its per-cpu vectors, then calls
         * lru_add_drain_all().
         *
         * If the paired barrier is done at any later step, e.g. after the
         * loop, CPU #x will just exit at (C) and miss flushing out all of its
         * added pages.
         */
        WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
        smp_mb();

        /*
         * Queue drain work only on the CPUs whose pagevecs are non-empty
         * (or which need an activate_page drain), and remember them in
         * has_work so that only those are waited for below.
         */
        cpumask_clear(&has_work);
        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

                if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
                    data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
                    need_activate_page_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        queue_work_on(cpu, mm_percpu_wq, work);
                        __cpumask_set_cpu(cpu, &has_work);
                }
        }

        /* Wait for every drain queued above to complete. */
        for_each_cpu(cpu, &has_work)
                flush_work(&per_cpu(lru_add_drain_work, cpu));

done:
        mutex_unlock(&lock);
}
#else
void lru_add_drain_all(void)
{
        lru_add_drain();
}
#endif /* CONFIG_SMP */

/**
 * release_pages - batched put_page()
 * @pages: array of pages to release
 * @nr: number of pages
 *
 * Decrement the reference count on all the pages in @pages.  If it
 * fell to zero, remove the page from the LRU and free it.
 *
 * Pages whose count drops to zero are collected on a local list and
 * handed to mem_cgroup_uncharge_list() and free_unref_page_list() in one
 * go after the loop.  Huge zero pages are skipped, devmap-managed
 * ZONE_DEVICE pages go through put_devmap_managed_page(), and compound
 * pages are released via __put_compound_page().
 */
void release_pages(struct page **pages, int nr)
{
        int i;
        LIST_HEAD(pages_to_free);
        struct pglist_data *locked_pgdat = NULL;
        struct lruvec *lruvec;
        unsigned long flags;
        /*
         * Only read while locked_pgdat is non-NULL, and it is reset to 0
         * every time locked_pgdat is assigned, so it is never read
         * uninitialized.
         */
        unsigned int lock_batch;

        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];

                /*
                 * Make sure the IRQ-safe lock-holding time does not get
                 * excessive with a continuous string of pages from the
                 * same pgdat. The lock is held only if pgdat != NULL.
                 */
                if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
                        spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
                        locked_pgdat = NULL;
                }

                page = compound_head(page);
                if (is_huge_zero_page(page))
                        continue;

                if (is_zone_device_page(page)) {
                        /* Drop the lru_lock before handling a ZONE_DEVICE page. */
                        if (locked_pgdat) {
                                spin_unlock_irqrestore(&locked_pgdat->lru_lock,
                                                       flags);
                                locked_pgdat = NULL;
                        }
                        /*
                         * ZONE_DEVICE pages that return 'false' from
                         * page_is_devmap_managed() do not require special
                         * processing, and instead, expect a call to
                         * put_page_testzero().
                         */
                        if (page_is_devmap_managed(page)) {
                                put_devmap_managed_page(page);
                                continue;
                        }
                }

                /* Somebody else still holds a reference: nothing to free. */
                if (!put_page_testzero(page))
                        continue;

                if (PageCompound(page)) {
                        /* Drop the lru_lock before releasing a compound page. */
                        if (locked_pgdat) {
                                spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
                                locked_pgdat = NULL;
                        }
                        __put_compound_page(page);
                        continue;
                }

                if (PageLRU(page)) {
                        struct pglist_data *pgdat = page_pgdat(page);

                        /* Switch locks when the page belongs to another pgdat. */
                        if (pgdat != locked_pgdat) {
                                if (locked_pgdat)
                                        spin_unlock_irqrestore(&locked_pgdat->lru_lock,
                                                                        flags);
                                lock_batch = 0;
                                locked_pgdat = pgdat;
                                spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
                        }

                        lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
                        VM_BUG_ON_PAGE(!PageLRU(page), page);
                        __ClearPageLRU(page);
                        del_page_from_lru_list(page, lruvec, page_off_lru(page));
                }

                __ClearPageWaiters(page);

                /* Defer the actual free to the batched calls below. */
                list_add(&page->lru, &pages_to_free);
        }
        if (locked_pgdat)
                spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);

        mem_cgroup_uncharge_list(&pages_to_free);
        free_unref_page_list(&pages_to_free);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
        /* Skip the drain if this pagevec is already marked as drained. */
        if (!pvec->percpu_pvec_drained) {
                lru_add_drain();
                pvec->percpu_pvec_drained = true;
        }
        /* Drop the references held by @pvec, then reset it for reuse. */
        release_pages(pvec->pages, pagevec_count(pvec));
        pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Used by __split_huge_page_refcount().
 *
 * Place @page_tail, a former tail page of head page @page, on a list:
 *  - right before @page on @page's list when @page is itself on an LRU;
 *  - else at the tail of the caller-supplied @list (taking a reference);
 *  - else at the tail of the LRU list of @lruvec that page_lru() selects.
 * When no @list is supplied, @page_tail is marked PG_lru first.
 * The caller must hold the lru_lock of @lruvec's pgdat.
 */
void lru_add_page_tail(struct page *page, struct page *page_tail,
                       struct lruvec *lruvec, struct list_head *list)
{
        VM_BUG_ON_PAGE(!PageHead(page), page);
        VM_BUG_ON_PAGE(PageCompound(page_tail), page);
        VM_BUG_ON_PAGE(PageLRU(page_tail), page);
        lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);

        if (!list)
                SetPageLRU(page_tail);

        if (likely(PageLRU(page))) {
                list_add_tail(&page_tail->lru, &page->lru);
                return;
        }

        if (list) {
                /* page reclaim is reclaiming a huge page */
                get_page(page_tail);
                list_add_tail(&page_tail->lru, list);
                return;
        }

        /*
         * Head page has not yet been counted, as an hpage, so each subpage
         * must be accounted for individually.
         *
         * Put page_tail on the list at the correct position so they all
         * end up in order.
         */
        add_page_to_lru_list_tail(page_tail, lruvec, page_lru(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * pagevec_lru_move_fn() callback used by __pagevec_lru_add(): mark @page
 * PG_lru and add it to the evictable or the unevictable list of @lruvec,
 * updating the UNEVICTABLE_PGRESCUED / UNEVICTABLE_PGCULLED counters when
 * the page changes evictability.  @arg is unused.
 */
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
                                 void *arg)
{
        enum lru_list lru;
        int was_unevictable = TestClearPageUnevictable(page);
        int nr_pages = thp_nr_pages(page);

        VM_BUG_ON_PAGE(PageLRU(page), page);

        /*
         * Page becomes evictable in two ways:
         * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
         * 2) Before acquiring LRU lock to put the page to correct LRU and then
         *   a) do PageLRU check with lock [check_move_unevictable_pages]
         *   b) do PageLRU check before lock [clear_page_mlock]
         *
         * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
         * following strict ordering:
         *
         * #0: __pagevec_lru_add_fn                #1: clear_page_mlock
         *
         * SetPageLRU()                                TestClearPageMlocked()
         * smp_mb() // explicit ordering        // above provides strict
         *                                        // ordering
         * PageMlocked()                        PageLRU()
         *
         *
         * if '#1' does not observe setting of PG_lru by '#0' and fails
         * isolation, the explicit barrier will make sure that page_evictable
         * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
         * can be reordered after PageMlocked check and can make '#1' to fail
         * the isolation of the page whose Mlocked bit is cleared (#0 is also
         * looking at the same page) and the evictable page will be stranded
         * in an unevictable LRU.
         */
        SetPageLRU(page);
        smp_mb__after_atomic();

        if (page_evictable(page)) {
                lru = page_lru(page);
                /* Was unevictable, is evictable now: count it as rescued. */
                if (was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        } else {
                lru = LRU_UNEVICTABLE;
                ClearPageActive(page);
                SetPageUnevictable(page);
                /* Newly unevictable: count it as culled. */
                if (!was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
        }

        add_page_to_lru_list(page, lruvec, lru);
        trace_mm_lru_insertion(page, lru);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 *
 * Implemented by running __pagevec_lru_add_fn() over @pvec through
 * pagevec_lru_move_fn().
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
        pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}

/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec:        Where the resulting entries are placed
 * @mapping:        The address_space to search
 * @start:        The starting entry index
 * @nr_entries:        The maximum number of entries
 * @indices:        The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_entries pages and shadow entries in the mapping.  All
 * entries are placed in @pvec.  pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes.  There may be holes in the indices due to
 * not-present entries.
 *
 * Only one subpage of a Transparent Huge Page is returned in one call:
 * allowing truncate_inode_pages_range() to evict the whole THP without
 * cycling through a pagevec of extra references.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
                                struct address_space *mapping,
                                pgoff_t start, unsigned nr_entries,
                                pgoff_t *indices)
{
        /* The lookup itself is done by find_get_entries(). */
        pvec->nr = find_get_entries(mapping, start, nr_entries,
                                    pvec->pages, indices);
        return pagevec_count(pvec);
}

/**
 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 * @pvec:        The pagevec to prune
 *
 * pagevec_lookup_entries() fills both pages and exceptional radix
 * tree entries into the pagevec.  This function compacts @pvec in place,
 * keeping only the entries for which xa_is_value() is false and
 * preserving their order, so that the result can be passed on to
 * page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
        int src;
        int kept = 0;

        for (src = 0; src < pagevec_count(pvec); src++) {
                struct page *entry = pvec->pages[src];

                /* Value entries are not pages: drop them. */
                if (xa_is_value(entry))
                        continue;

                pvec->pages[kept] = entry;
                kept++;
        }
        pvec->nr = kept;
}

/**
 * pagevec_lookup_range - gang pagecache lookup
 * @pvec:        Where the resulting pages are placed
 * @mapping:        The address_space to search
 * @start:        The starting page index
 * @end:        The final page index
 *
 * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
 * pages in the mapping starting from index @start and up to index @end
 * (inclusive).  The pages are placed in @pvec.  pagevec_lookup_range() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages. We
 * also update @start to index the next page for the traversal.
 *
 * pagevec_lookup_range() returns the number of pages which were found. If this
 * number is smaller than PAGEVEC_SIZE, the end of specified range has been
 * reached.
 */
unsigned pagevec_lookup_range(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *start, pgoff_t end)
{
        /* The lookup itself is done by find_get_pages_range(). */
        pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
                                        pvec->pages);
        return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range);

/*
 * Fill @pvec with up to PAGEVEC_SIZE pages obtained from
 * find_get_pages_range_tag() for @mapping, the range *@index..@end and
 * @tag.  Returns the number of pages now in @pvec.
 */
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
                xa_mark_t tag)
{
        pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
                                        PAGEVEC_SIZE, pvec->pages);
        return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range_tag);

/*
 * Same as pagevec_lookup_range_tag(), except that the number of pages
 * requested is additionally capped at @max_pages (it never exceeds
 * PAGEVEC_SIZE).  Returns the number of pages now in @pvec.
 */
unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
                xa_mark_t tag, unsigned max_pages)
{
        pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
                min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
        return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
/*
 * Perform any setup for the swap system.
 *
 * Picks page_cluster from the amount of RAM: 2 on machines with less than
 * 16 megabytes, 3 otherwise.
 */
void __init swap_setup(void)
{
        const unsigned long ram_megs = totalram_pages() >> (20 - PAGE_SHIFT);

        /*
         * Use a smaller cluster for small-memory machines.  Right now
         * other parts of the system mean that we _really_ don't want to
         * cluster much more than this.
         */
        page_cluster = (ram_megs < 16) ? 2 : 3;
}

#ifdef CONFIG_DEV_PAGEMAP_OPS
/*
 * Drop one reference on a devmap-managed @page.
 *
 * Warns once and does nothing if @page is not devmap managed.  Otherwise
 * the refcount is decremented and the page is freed through
 * free_devmap_managed_page() when the count reaches 1, or handed to
 * __put_page() when it reaches 0.
 */
void put_devmap_managed_page(struct page *page)
{
        if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
                return;

        switch (page_ref_dec_return(page)) {
        case 1:
                /*
                 * devmap page refcounts are 1-based, rather than 0-based:
                 * at refcount 1 the page is free, and the refcount is
                 * stable because nobody holds a reference on the page.
                 */
                free_devmap_managed_page(page);
                break;
        case 0:
                __put_page(page);
                break;
        default:
                break;
        }
}
EXPORT_SYMBOL(put_devmap_managed_page);
#endif

























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAPOPS_H
#define _LINUX_SWAPOPS_H

#include <linux/radix-tree.h>
#include <linux/bug.h>
#include <linux/mm_types.h>

#ifdef CONFIG_MMU

/*
 * swapcache pages are stored in the swapper_space radix tree.  We want to
 * get good packing density in that tree, so the index should be dense in
 * the low-order bits.
 *
 * We arrange the `type' and `offset' fields so that `type' is at the seven
 * high-order bits of the swp_entry_t and `offset' is right-aligned in the
 * remaining bits.  Although `type' itself needs only five bits, we allow for
 * shmem/tmpfs to shift it all up a further two bits: see swp_to_radix_entry().
 *
 * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
 */
#define SWP_TYPE_SHIFT        (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
#define SWP_OFFSET_MASK        ((1UL << SWP_TYPE_SHIFT) - 1)

/*
 * Clear all flags but only keep swp_entry_t related information.
 *
 * The flags handled here are the swap-pte soft-dirty bit and the swap-pte
 * uffd-wp bit; each is cleared only if it is currently set.  Returns the
 * resulting pte.
 */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
        if (pte_swp_soft_dirty(pte))
                pte = pte_swp_clear_soft_dirty(pte);
        if (pte_swp_uffd_wp(pte))
                pte = pte_swp_clear_uffd_wp(pte);
        return pte;
}

/*
 * Build an arch-independent swp_entry_t from a type and an offset:
 * @type goes above SWP_TYPE_SHIFT, @offset is masked with SWP_OFFSET_MASK
 * and occupies the bits below it.
 */
static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
{
        swp_entry_t entry = {
                .val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK),
        };

        return entry;
}

/*
 * Extract the `type' field from a swp_entry_t.  The swp_entry_t is in
 * arch-independent format, so the type is simply the bits at and above
 * SWP_TYPE_SHIFT.
 */
static inline unsigned swp_type(swp_entry_t entry)
{
        return (entry.val >> SWP_TYPE_SHIFT);
}

/*
 * Extract the `offset' field from a swp_entry_t.  The swp_entry_t is in
 * arch-independent format, so the offset is the bits selected by
 * SWP_OFFSET_MASK.
 */
static inline pgoff_t swp_offset(swp_entry_t entry)
{
        return entry.val & SWP_OFFSET_MASK;
}

/*
 * Check whether a pte points to a swap entry, i.e. it is neither empty
 * (pte_none) nor present.  Returns 1 if so, 0 otherwise.
 */
static inline int is_swap_pte(pte_t pte)
{
        return !(pte_none(pte) || pte_present(pte));
}

/*
 * Convert the arch-dependent pte representation of a swp_entry_t into an
 * arch-independent swp_entry_t.
 *
 * The swap-pte flag bits are stripped first (pte_swp_clear_flags()), then
 * the arch type and offset are repacked with swp_entry().
 */
static inline swp_entry_t pte_to_swp_entry(pte_t pte)
{
        swp_entry_t arch_entry;

        pte = pte_swp_clear_flags(pte);
        arch_entry = __pte_to_swp_entry(pte);
        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}

/*
 * Convert the arch-independent representation of a swp_entry_t into the
 * arch-dependent pte representation.
 *
 * The type and offset are first repacked into the arch layout with
 * __swp_entry(), which is then turned into a pte.
 */
static inline pte_t swp_entry_to_pte(swp_entry_t entry)
{
        swp_entry_t arch_entry;

        arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
        return __swp_entry_to_pte(arch_entry);
}

static inline swp_entry_t radix_to_swp_entry(void *arg)
{
        swp_entry_t entry;

        entry.val = xa_to_value(arg);
        return entry;
}

/* Encode @entry as a value entry with xa_mk_value(); see radix_to_swp_entry(). */
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
        return xa_mk_value(entry.val);
}

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
/*
 * Build a device-private swap entry for @page: the type is
 * SWP_DEVICE_WRITE when @write is true and SWP_DEVICE_READ otherwise, the
 * offset is the page frame number of @page.
 */
static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
{
        unsigned long type;

        if (write)
                type = SWP_DEVICE_WRITE;
        else
                type = SWP_DEVICE_READ;

        return swp_entry(type, page_to_pfn(page));
}

/* True if @entry has type SWP_DEVICE_READ or SWP_DEVICE_WRITE. */
static inline bool is_device_private_entry(swp_entry_t entry)
{
        int type = swp_type(entry);
        return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
}

/* Rewrite *@entry in place as a SWP_DEVICE_READ entry with the same offset. */
static inline void make_device_private_entry_read(swp_entry_t *entry)
{
        *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry));
}

/* True if @entry has type SWP_DEVICE_WRITE; hinted as the unlikely case. */
static inline bool is_write_device_private_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
}

/*
 * Return the pfn stored in a device-private entry: it is the entry's
 * offset field (see make_device_private_entry()).
 */
static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
{
        return swp_offset(entry);
}

/* Return the struct page for the pfn stored in @entry's offset field. */
static inline struct page *device_private_entry_to_page(swp_entry_t entry)
{
        return pfn_to_page(swp_offset(entry));
}
#else /* CONFIG_DEVICE_PRIVATE */
/* !CONFIG_DEVICE_PRIVATE stub: ignores its arguments, returns swp_entry(0, 0). */
static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
{
        return swp_entry(0, 0);
}

/* !CONFIG_DEVICE_PRIVATE stub: leaves *@entry untouched. */
static inline void make_device_private_entry_read(swp_entry_t *entry)
{
}

/* !CONFIG_DEVICE_PRIVATE stub: no entry is ever device private. */
static inline bool is_device_private_entry(swp_entry_t entry)
{
        return false;
}

/* !CONFIG_DEVICE_PRIVATE stub: always false. */
static inline bool is_write_device_private_entry(swp_entry_t entry)
{
        return false;
}

/* !CONFIG_DEVICE_PRIVATE stub: always returns 0. */
static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
{
        return 0;
}

/* !CONFIG_DEVICE_PRIVATE stub: always returns NULL. */
static inline struct page *device_private_entry_to_page(swp_entry_t entry)
{
        return NULL;
}
#endif /* CONFIG_DEVICE_PRIVATE */

#ifdef CONFIG_MIGRATION
/*
 * Build a migration swap entry for @page: the type is SWP_MIGRATION_WRITE
 * when @write is non-zero and SWP_MIGRATION_READ otherwise, the offset is
 * the page frame number of @page.  The compound head of @page must be
 * locked, otherwise this BUG()s.
 */
static inline swp_entry_t make_migration_entry(struct page *page, int write)
{
        unsigned long type;

        BUG_ON(!PageLocked(compound_head(page)));

        if (write)
                type = SWP_MIGRATION_WRITE;
        else
                type = SWP_MIGRATION_READ;

        return swp_entry(type, page_to_pfn(page));
}

/*
 * Non-zero if @entry has type SWP_MIGRATION_READ or SWP_MIGRATION_WRITE;
 * hinted as the unlikely case.
 */
static inline int is_migration_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return unlikely(type == SWP_MIGRATION_READ ||
                        type == SWP_MIGRATION_WRITE);
}

/* Non-zero if @entry has type SWP_MIGRATION_WRITE; hinted as unlikely. */
static inline int is_write_migration_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
}

/*
 * Return the pfn stored in a migration entry: it is the entry's offset
 * field (see make_migration_entry()).
 */
static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
{
        return swp_offset(entry);
}

/*
 * Return the struct page for the pfn stored in @entry's offset field.
 * BUG()s if the compound head of that page is not locked.
 */
static inline struct page *migration_entry_to_page(swp_entry_t entry)
{
        struct page *p = pfn_to_page(swp_offset(entry));
        /*
         * Any use of migration entries may only occur while the
         * corresponding page is locked
         */
        BUG_ON(!PageLocked(compound_head(p)));
        return p;
}

/* Rewrite *@entry in place as a SWP_MIGRATION_READ entry with the same offset. */
static inline void make_migration_entry_read(swp_entry_t *entry)
{
        *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
}

extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
                                        spinlock_t *ptl);
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long address);
extern void migration_entry_wait_huge(struct vm_area_struct *vma,
                struct mm_struct *mm, pte_t *pte);
#else

/* !CONFIG_MIGRATION: expands to swp_entry(0, 0) without evaluating its arguments. */
#define make_migration_entry(page, write) swp_entry(0, 0)
/* !CONFIG_MIGRATION stub: no entry is ever a migration entry. */
static inline int is_migration_entry(swp_entry_t swp)
{
        return 0;
}

/* !CONFIG_MIGRATION stub: always returns 0. */
static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
{
        return 0;
}

/* !CONFIG_MIGRATION stub: always returns NULL. */
static inline struct page *migration_entry_to_page(swp_entry_t entry)
{
        return NULL;
}

/* !CONFIG_MIGRATION stub: leaves *@entryp untouched. */
static inline void make_migration_entry_read(swp_entry_t *entryp) { }
/* !CONFIG_MIGRATION stub: does nothing. */
static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
                                        spinlock_t *ptl) { }
/* !CONFIG_MIGRATION stub: does nothing. */
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                         unsigned long address) { }
/* !CONFIG_MIGRATION stub: does nothing. */
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
                struct mm_struct *mm, pte_t *pte) { }
/* !CONFIG_MIGRATION stub: always 0. */
static inline int is_write_migration_entry(swp_entry_t entry)
{
        return 0;
}

#endif

struct page_vma_mapped_walk;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page);

extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new);

extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);

/*
 * Convert a swap pmd into an arch-independent swp_entry_t.  The swap
 * soft-dirty and uffd-wp bits are cleared first if set, then the arch type
 * and offset are repacked with swp_entry().
 */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        swp_entry_t arch_entry;

        if (pmd_swp_soft_dirty(pmd))
                pmd = pmd_swp_clear_soft_dirty(pmd);
        if (pmd_swp_uffd_wp(pmd))
                pmd = pmd_swp_clear_uffd_wp(pmd);
        arch_entry = __pmd_to_swp_entry(pmd);
        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}

/*
 * Convert an arch-independent swp_entry_t into its pmd representation:
 * type and offset are repacked into the arch layout with __swp_entry(),
 * which is then turned into a pmd.
 */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        swp_entry_t arch_entry;

        arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
        return __swp_entry_to_pmd(arch_entry);
}

/* Non-zero if @pmd is not present and decodes to a migration entry. */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
        return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
}
#else
/*
 * !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: the body is just BUILD_BUG(),
 * so presumably any call that is not optimized away breaks the build.
 */
static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
{
        BUILD_BUG();
}

/*
 * !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: the body is just BUILD_BUG(),
 * so presumably any call that is not optimized away breaks the build.
 */
static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new)
{
        BUILD_BUG();
}

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: does nothing. */
static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: ignores @pmd, returns swp_entry(0, 0). */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        return swp_entry(0, 0);
}

/* !CONFIG_ARCH_ENABLE_THP_MIGRATION stub: ignores @entry, returns __pmd(0). */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        return __pmd(0);
}

/*
 * Variant used when CONFIG_ARCH_ENABLE_THP_MIGRATION is not set: no pmd is
 * ever reported as a migration entry.
 */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
        return 0;
}
#endif

#ifdef CONFIG_MEMORY_FAILURE

/* Counter adjusted by num_poisoned_pages_inc() / num_poisoned_pages_dec(). */
extern atomic_long_t num_poisoned_pages __read_mostly;

/*
 * Support for hardware poisoned pages
 */
/*
 * Build the SWP_HWPOISON swap entry whose offset is the pfn of @page.
 * BUG()s if @page is not locked.
 */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        unsigned long pfn;

        BUG_ON(!PageLocked(page));

        pfn = page_to_pfn(page);
        return swp_entry(SWP_HWPOISON, pfn);
}

/* Return 1 when @entry has swap type SWP_HWPOISON, 0 otherwise. */
static inline int is_hwpoison_entry(swp_entry_t entry)
{
        if (swp_type(entry) != SWP_HWPOISON)
                return 0;

        return 1;
}

/* Atomically increment the num_poisoned_pages counter. */
static inline void num_poisoned_pages_inc(void)
{
        atomic_long_inc(&num_poisoned_pages);
}

/* Atomically decrement the num_poisoned_pages counter. */
static inline void num_poisoned_pages_dec(void)
{
        atomic_long_dec(&num_poisoned_pages);
}

#else

/*
 * Variant used when CONFIG_MEMORY_FAILURE is not set: @page is ignored and
 * the entry built from type 0, offset 0 is returned.
 */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        return swp_entry(0, 0);
}

/*
 * Variant used when CONFIG_MEMORY_FAILURE is not set: no entry is ever
 * reported as a hwpoison entry.
 */
static inline int is_hwpoison_entry(swp_entry_t swp)
{
        return 0;
}

/*
 * Variants used when CONFIG_MEMORY_FAILURE is not set: there is no
 * num_poisoned_pages counter, so both operations are no-ops.  The decrement
 * stub mirrors the CONFIG_MEMORY_FAILURE branch so that callers see the same
 * inc/dec pair in either configuration.
 */
static inline void num_poisoned_pages_inc(void)
{
}

static inline void num_poisoned_pages_dec(void)
{
}
#endif

#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
    defined(CONFIG_DEVICE_PRIVATE)
/*
 * Return 1 when the swap type of @entry is at or above MAX_SWAPFILES,
 * 0 otherwise.
 */
static inline int non_swap_entry(swp_entry_t entry)
{
        if (swp_type(entry) < MAX_SWAPFILES)
                return 0;

        return 1;
}
#else
/*
 * Variant used when none of CONFIG_MEMORY_FAILURE, CONFIG_MIGRATION or
 * CONFIG_DEVICE_PRIVATE is set: always returns 0.
 */
static inline int non_swap_entry(swp_entry_t entry)
{
        return 0;
}
#endif

#endif /* CONFIG_MMU */
#endif /* _LINUX_SWAPOPS_H */


























































































































































































































































    1 


    1 












































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_CRED_H
#define _LINUX_CRED_H

#include <linux/capability.h>
#include <linux/init.h>
#include <linux/key.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/sched.h>
#include <linux/sched/user.h>

struct cred;
struct inode;

/*
 * COW Supplementary groups list
 *
 * @usage:   reference count, taken by get_group_info() and dropped by
 *           put_group_info()
 * @ngroups: presumably the number of entries in @gid -- confirm against the
 *           allocator
 * @gid:     trailing variable-length array of group IDs; declared as a C99
 *           flexible array member rather than a zero-length array so that
 *           the compiler can diagnose misuse (e.g. sizeof on the array)
 */
struct group_info {
        atomic_t        usage;
        int                ngroups;
        kgid_t                gid[];
} __randomize_layout;

/**
 * get_group_info - Get a reference to a group info structure
 * @gi: The group info to reference
 *
 * This gets a reference to a set of supplementary groups by incrementing
 * @gi->usage.
 *
 * If the caller is accessing a task's credentials, they must hold the RCU read
 * lock when reading.
 *
 * Return: @gi.
 */
static inline struct group_info *get_group_info(struct group_info *gi)
{
        atomic_inc(&gi->usage);
        return gi;
}

/**
 * put_group_info - Release a reference to a group info structure
 * @group_info: The group info to release
 */
#define put_group_info(group_info)                        \
do {                                                        \
        if (atomic_dec_and_test(&(group_info)->usage))        \
                groups_free(group_info);                \
} while (0)

extern struct group_info init_groups;
#ifdef CONFIG_MULTIUSER
/*
 * Supplementary-group helpers that are only declared here (defined out of
 * line) when CONFIG_MULTIUSER is set.
 */
extern struct group_info *groups_alloc(int);
extern void groups_free(struct group_info *);

extern int in_group_p(kgid_t);
extern int in_egroup_p(kgid_t);
extern int groups_search(const struct group_info *, kgid_t);

extern int set_current_groups(struct group_info *);
extern void set_groups(struct cred *, struct group_info *);
extern bool may_setgroups(void);
extern void groups_sort(struct group_info *);
#else
/* !CONFIG_MULTIUSER variant: nothing to release, so this is a no-op. */
static inline void groups_free(struct group_info *group_info) { }

/* !CONFIG_MULTIUSER variant: every group check succeeds (returns 1). */
static inline int in_group_p(kgid_t grp)
{
        return 1;
}
/* !CONFIG_MULTIUSER variant: every group check succeeds (returns 1). */
static inline int in_egroup_p(kgid_t grp)
{
        return 1;
}
/* !CONFIG_MULTIUSER variant: @grp is always reported as found (returns 1). */
static inline int groups_search(const struct group_info *group_info, kgid_t grp)
{
        return 1;
}
#endif

/*
 * The security context of a task
 *
 * The parts of the context break down into two categories:
 *
 *  (1) The objective context of a task.  These parts are used when some other
 *        task is attempting to affect this one.
 *
 *  (2) The subjective context.  These details are used when the task is acting
 *        upon another object, be that a file, a task, a key or whatever.
 *
 * Note that some members of this structure belong to both categories - the
 * LSM security pointer for instance.
 *
 * A task has two security pointers.  task->real_cred points to the objective
 * context that defines that task's actual details.  The objective part of this
 * context is used whenever that task is acted upon.
 *
 * task->cred points to the subjective context that defines the details of how
 * that task is going to act upon another object.  This may be overridden
 * temporarily to point to another security context, but normally points to the
 * same context as task->real_cred.
 */
struct cred {
        /* reference count: raised by get_new_cred()/get_cred()/get_cred_rcu(),
         * dropped by put_cred() */
        atomic_long_t        usage;
#ifdef CONFIG_DEBUG_CREDENTIALS
        atomic_t        subscribers;        /* number of processes subscribed */
        void                *put_addr;
        /* NOTE(review): presumably holds CRED_MAGIC while live and
         * CRED_MAGIC_DEAD once released -- confirm against the validators */
        unsigned        magic;
#define CRED_MAGIC        0x43736564
#define CRED_MAGIC_DEAD        0x44656144
#endif
        kuid_t                uid;                /* real UID of the task */
        kgid_t                gid;                /* real GID of the task */
        kuid_t                suid;                /* saved UID of the task */
        kgid_t                sgid;                /* saved GID of the task */
        kuid_t                euid;                /* effective UID of the task */
        kgid_t                egid;                /* effective GID of the task */
        kuid_t                fsuid;                /* UID for VFS ops */
        kgid_t                fsgid;                /* GID for VFS ops */
        unsigned        securebits;        /* SUID-less security management */
        kernel_cap_t        cap_inheritable; /* caps our children can inherit */
        kernel_cap_t        cap_permitted;        /* caps we're permitted */
        kernel_cap_t        cap_effective;        /* caps we can actually use */
        kernel_cap_t        cap_bset;        /* capability bounding set */
        kernel_cap_t        cap_ambient;        /* Ambient capability set */
#ifdef CONFIG_KEYS
        unsigned char        jit_keyring;        /* default keyring to attach requested
                                         * keys to */
        struct key        *session_keyring; /* keyring inherited over fork */
        struct key        *process_keyring; /* keyring private to this process */
        struct key        *thread_keyring; /* keyring private to this thread */
        struct key        *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
        void                *security;        /* subjective LSM security */
#endif
        struct user_struct *user;        /* real user ID subscription */
        struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
        struct group_info *group_info;        /* supplementary groups for euid/fsgid */
        /* RCU deletion */
        /* non_rcu shares storage with rcu; get_cred()/get_cred_rcu() reset it
         * to 0 whenever they take a reference */
        union {
                int non_rcu;                        /* Can we skip RCU deletion? */
                struct rcu_head        rcu;                /* RCU deletion hook */
        };
} __randomize_layout;

/*
 * Credential management entry points; declared here, defined out of line.
 * __put_cred() is what put_cred() calls once the usage count hits zero.
 */
extern void __put_cred(struct cred *);
extern void exit_creds(struct task_struct *);
extern int copy_creds(struct task_struct *, unsigned long);
extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern const struct cred *override_creds(const struct cred *);
extern void revert_creds(const struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *);
extern int change_create_files_as(struct cred *, struct inode *);
extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);

/*
 * check for validity of credentials
 */
#ifdef CONFIG_DEBUG_CREDENTIALS
/*
 * CONFIG_DEBUG_CREDENTIALS helpers, defined out of line.  The trailing
 * (const char *, unsigned) pair receives __FILE__/__LINE__ from the
 * validate_*() macros below.
 */
extern void __invalid_creds(const struct cred *, const char *, unsigned);
extern void __validate_process_creds(struct task_struct *,
                                     const char *, unsigned);

extern bool creds_are_invalid(const struct cred *cred);

/*
 * Report @cred through __invalid_creds() when creds_are_invalid() flags it;
 * @file and @line identify the call site.
 */
static inline void __validate_creds(const struct cred *cred,
                                    const char *file, unsigned line)
{
        if (likely(!creds_are_invalid(cred)))
                return;

        __invalid_creds(cred, file, line);
}

/* Validate @cred, tagging any report with the caller's file and line. */
#define validate_creds(cred)                                        \
        do {                                                        \
                __validate_creds((cred), __FILE__, __LINE__);        \
        } while (0)

/* Validate the credentials of %current, tagging any report with file and line. */
#define validate_process_creds()                                        \
        do {                                                                \
                __validate_process_creds(current, __FILE__, __LINE__);        \
        } while (0)

/* CONFIG_DEBUG_CREDENTIALS only: declared here, defined out of line. */
extern void validate_creds_for_do_exit(struct task_struct *);
#else
/* !CONFIG_DEBUG_CREDENTIALS variant: no validation is performed. */
static inline void validate_creds(const struct cred *cred)
{
}
/* !CONFIG_DEBUG_CREDENTIALS variant: no validation is performed. */
static inline void validate_creds_for_do_exit(struct task_struct *tsk)
{
}
/* !CONFIG_DEBUG_CREDENTIALS variant: no validation is performed. */
static inline void validate_process_creds(void)
{
}
#endif

static inline bool cap_ambient_invariant_ok(const struct cred *cred)
{
        return cap_issubset(cred->cap_ambient,
                            cap_intersect(cred->cap_permitted,
                                          cred->cap_inheritable));
}

/**
 * get_new_cred - Get a reference on a new set of credentials
 * @cred: The new credentials to reference
 *
 * Get a reference on the specified set of new credentials by incrementing
 * @cred->usage.  The caller must release the reference.
 *
 * Return: @cred.
 */
static inline struct cred *get_new_cred(struct cred *cred)
{
        atomic_long_inc(&cred->usage);
        return cred;
}

/**
 * get_cred - Get a reference on a set of credentials
 * @cred: The credentials to reference
 *
 * Get a reference on the specified set of credentials.  The caller must
 * release the reference.  If %NULL is passed, it is returned with no action.
 *
 * This is used to deal with a committed set of credentials.  Although the
 * pointer is const, this will temporarily discard the const and increment the
 * usage count.  The purpose of this is to attempt to catch at compile time the
 * accidental alteration of a set of credentials that should be considered
 * immutable.
 */
static inline const struct cred *get_cred(const struct cred *cred)
{
        struct cred *nonconst_cred = (struct cred *) cred;
        if (!cred)
                return cred;
        validate_creds(cred);
        nonconst_cred->non_rcu = 0;
        return get_new_cred(nonconst_cred);
}

/*
 * Like get_cred(), but only takes the reference if the usage count is not
 * already zero.  Returns %NULL when @cred is %NULL or the count was zero;
 * otherwise clears ->non_rcu and returns @cred.
 */
static inline const struct cred *get_cred_rcu(const struct cred *cred)
{
        struct cred *writable = (struct cred *) cred;

        if (!cred || !atomic_long_inc_not_zero(&writable->usage))
                return NULL;

        validate_creds(cred);
        writable->non_rcu = 0;

        return cred;
}

/**
 * put_cred - Release a reference to a set of credentials
 * @_cred: The credentials to release
 *
 * Drop one reference on @_cred and call __put_cred() when that was the last
 * one.  Passing %NULL is allowed and does nothing.
 *
 * The parameter is a const pointer because task_struct holds its credentials
 * through const pointers, to guard against accidental modification of
 * credential sets that are otherwise immutable.
 */
static inline void put_cred(const struct cred *_cred)
{
        struct cred *cred = (struct cred *) _cred;

        if (!cred)
                return;

        validate_creds(cred);

        if (atomic_long_dec_and_test(&cred->usage))
                __put_cred(cred);
}

/**
 * current_cred - Access the current task's subjective credentials
 *
 * Yields current->cred.  No RCU read lock is needed: only the task itself
 * can modify its own credentials pointer.
 */
#define current_cred()        rcu_dereference_protected(current->cred, 1)

/**
 * current_real_cred - Access the current task's objective credentials
 *
 * Yields current->real_cred.  No RCU read lock is needed: only the task
 * itself can modify its own credentials pointer.
 */
#define current_real_cred()        rcu_dereference_protected(current->real_cred, 1)

/**
 * __task_cred - Access a task's objective credentials
 * @task: The task to query
 *
 * Yields @task->real_cred via rcu_dereference(); the caller must hold the
 * RCU read lock.
 *
 * Do not feed the result straight into get_cred(); use get_task_cred()
 * for that instead.
 */
#define __task_cred(task)        rcu_dereference((task)->real_cred)

/**
 * get_current_cred - Get the current task's subjective credentials
 *
 * Takes a reference on the current task's subjective credentials via
 * get_cred() so that they cannot go away, and yields the pinned pointer.
 * Accessing the current task's credentials directly is not permitted.
 */
#define get_current_cred()        (get_cred(current_cred()))

/**
 * get_current_user - Get the current task's user_struct
 *
 * Yields the ->user record of the current task's subjective credentials
 * after pinning it with get_uid(), so that it cannot go away.
 */
#define get_current_user()                                        \
({                                                                \
        const struct cred *__cur = current_cred();                \
        struct user_struct *__pinned = get_uid(__cur->user);        \
        __pinned;                                                \
})

/**
 * get_current_groups - Get the current task's supplementary group list
 *
 * Yields the ->group_info of the current task's subjective credentials
 * after pinning it with get_group_info(), so that it cannot go away.
 */
#define get_current_groups()                                                \
({                                                                        \
        const struct cred *__cur = current_cred();                        \
        struct group_info *__pinned = get_group_info(__cur->group_info); \
        __pinned;                                                        \
})

/*
 * Read member @xxx of @task's objective credentials.  The value is copied
 * out while the RCU read lock is held, and the copy is the macro's result.
 */
#define task_cred_xxx(task, xxx)                                \
({                                                                \
        __typeof__(((struct cred *)NULL)->xxx) ___copy;                \
                                                                \
        rcu_read_lock();                                        \
        ___copy = __task_cred((task))->xxx;                        \
        rcu_read_unlock();                                        \
                                                                \
        ___copy;                                                \
})

/* Real / effective UID of @task, read through task_cred_xxx(). */
#define task_uid(task)                (task_cred_xxx((task), uid))
#define task_euid(task)                (task_cred_xxx((task), euid))

/* Read member @xxx of the current task's subjective credentials. */
#define current_cred_xxx(xxx)        ({ current_cred()->xxx; })

/*
 * Single-member accessors for the current task's subjective credentials;
 * each expands to current_cred_xxx() on the struct cred member named in
 * the argument.
 */
#define current_uid()                (current_cred_xxx(uid))
#define current_gid()                (current_cred_xxx(gid))
#define current_euid()                (current_cred_xxx(euid))
#define current_egid()                (current_cred_xxx(egid))
#define current_suid()                (current_cred_xxx(suid))
#define current_sgid()                (current_cred_xxx(sgid))
#define current_fsuid()         (current_cred_xxx(fsuid))
#define current_fsgid()         (current_cred_xxx(fsgid))
#define current_cap()                (current_cred_xxx(cap_effective))
#define current_user()                (current_cred_xxx(user))

/*
 * current_user_ns() - user namespace of the current task.
 *
 * With CONFIG_USER_NS this reads ->user_ns from the current subjective
 * credentials; without it the result is always &init_user_ns.
 */
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
#define current_user_ns()        (current_cred_xxx(user_ns))
#else
static inline struct user_namespace *current_user_ns(void)
{
        return &init_user_ns;
}
#endif


/*
 * Store the real UID and GID of the current task through the pointers
 * @_uid and @_gid, using a single read of the credentials pointer.
 */
#define current_uid_gid(_uid, _gid)                        \
do {                                                        \
        const struct cred *__cur = current_cred();        \
                                                        \
        *(_uid) = __cur->uid;                                \
        *(_gid) = __cur->gid;                                \
} while (0)

/*
 * Store the effective UID and GID of the current task through the pointers
 * @_euid and @_egid, using a single read of the credentials pointer.
 */
#define current_euid_egid(_euid, _egid)                        \
do {                                                        \
        const struct cred *__cur = current_cred();        \
                                                        \
        *(_euid) = __cur->euid;                                \
        *(_egid) = __cur->egid;                                \
} while (0)

/*
 * Store the filesystem UID and GID of the current task through the pointers
 * @_fsuid and @_fsgid, using a single read of the credentials pointer.
 */
#define current_fsuid_fsgid(_fsuid, _fsgid)                \
do {                                                        \
        const struct cred *__cur = current_cred();        \
                                                        \
        *(_fsuid) = __cur->fsuid;                        \
        *(_fsgid) = __cur->fsgid;                        \
} while (0)

#endif /* _LINUX_CRED_H */




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * generic net pointers
 */

#ifndef __NET_GENERIC_H__
#define __NET_GENERIC_H__

#include <linux/bug.h>
#include <linux/rcupdate.h>

/*
 * Generic net pointers are to be used by modules to put some private
 * stuff on the struct net without explicit struct net modification
 *
 * The rules are simple:
 * 1. set pernet_operations->id.  After register_pernet_device you
 *    will have the id of your private pointer.
 * 2. set pernet_operations->size to have the code allocate and free
 *    a private structure pointed to from struct net.
 * 3. do not change this pointer while the net is alive;
 * 4. do not try to have any private reference on the net_generic object.
 *
 * After accomplishing all of the above, the private pointer can be
 * accessed with the net_generic() call.
 */

/*
 * Per-net array of private pointers, indexed by pernet id in net_generic().
 *
 * The header fields in @s and the pointer array @ptr share storage through
 * the union.  NOTE(review): this implies the low ptr[] slots overlap @s and
 * are not usable as ids -- confirm against the id allocator.
 */
struct net_generic {
        union {
                struct {
                        unsigned int len;
                        struct rcu_head rcu;
                } s;

                void *ptr[0];
        };
};

/*
 * Return the private pointer stored for @id in @net's generic array.
 * The array is looked up and indexed inside an RCU read-side section;
 * @id is not range-checked here.
 */
static inline void *net_generic(const struct net *net, unsigned int id)
{
        void *priv;

        rcu_read_lock();
        priv = rcu_dereference(net->gen)->ptr[id];
        rcu_read_unlock();

        return priv;
}
#endif















































































































































    1 




    1 






























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/backing-dev.h
 *
 * low-level device information and state which is propagated up through
 * to high-level code.
 */

#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/writeback.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>

/* Take a reference on @bdi (kref_get() on ->refcnt) and return @bdi. */
static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
{
        kref_get(&bdi->refcnt);
        return bdi;
}

/* bdi lookup and release; declared here, defined out of line. */
struct backing_dev_info *bdi_get_by_id(u64 id);
void bdi_put(struct backing_dev_info *bdi);

/* bdi registration; the __printf() annotations mark @fmt as a printf format. */
__printf(2, 3)
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
__printf(2, 0)
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
                    va_list args);
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner);
void bdi_unregister(struct backing_dev_info *bdi);

struct backing_dev_info *bdi_alloc(int node_id);

/* bdi_writeback operations; declared here, defined out of line. */
void wb_start_background_writeback(struct bdi_writeback *wb);
void wb_workfn(struct work_struct *work);
void wb_wakeup_delayed(struct bdi_writeback *wb);

void wb_wait_for_completion(struct wb_completion *done);

/* NOTE(review): presumably bdi_lock protects bdi_list -- confirm at the definition. */
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;

extern struct workqueue_struct *bdi_wq;
extern struct workqueue_struct *bdi_async_bio_wq;

/* True when the WB_has_dirty_io bit is set in @wb->state. */
static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
        return test_bit(WB_has_dirty_io, &wb->state) != 0;
}

/*
 * True when @bdi has any dirty wbs.  This relies on
 * @bdi->tot_write_bandwidth being guaranteed to be > 0 whenever there are
 * dirty wbs; see wb_update_write_bandwidth().
 */
static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
{
        return atomic_long_read(&bdi->tot_write_bandwidth) != 0;
}

/*
 * Add @amount (may be negative) to the per-cpu counter @wb->stat[@item],
 * using WB_STAT_BATCH as the batch size.
 */
static inline void __add_wb_stat(struct bdi_writeback *wb,
                                 enum wb_stat_item item, s64 amount)
{
        percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
}

/* Increment the @item statistic of @wb by one. */
static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        __add_wb_stat(wb, item, 1);
}

/* Decrement the @item statistic of @wb by one. */
static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        __add_wb_stat(wb, item, -1);
}

/*
 * Read the @item statistic of @wb with percpu_counter_read_positive().
 * See wb_stat_error() for the maximal error of such a counter.
 */
static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        s64 value = percpu_counter_read_positive(&wb->stat[item]);

        return value;
}

/* Read the @item statistic of @wb with percpu_counter_sum_positive(). */
static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
{
        s64 total = percpu_counter_sum_positive(&wb->stat[item]);

        return total;
}

/* Declared here, defined out of line. */
extern void wb_writeout_inc(struct bdi_writeback *wb);

/*
 * Maximal error of a stat counter: nr_cpu_ids * WB_STAT_BATCH on SMP
 * builds, 1 otherwise.
 */
static inline unsigned long wb_stat_error(void)
{
        unsigned long max_err = 1;

#ifdef CONFIG_SMP
        max_err = nr_cpu_ids * WB_STAT_BATCH;
#endif
        return max_err;
}

/* Setters for a bdi's min/max ratio; declared here, defined out of line. */
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);

/*
 * Flags in backing_dev_info::capability
 *
 * BDI_CAP_WRITEBACK:                Supports dirty page writeback, and dirty pages
 *                                should contribute to accounting
 * BDI_CAP_WRITEBACK_ACCT:        Automatically account writeback pages
 * BDI_CAP_STRICTLIMIT:                Keep number of dirty pages below bdi threshold
 *
 * Each flag is a distinct single bit, so they are tested with a bitwise AND
 * against the capabilities word.
 */
#define BDI_CAP_WRITEBACK                (1 << 0)
#define BDI_CAP_WRITEBACK_ACCT                (1 << 1)
#define BDI_CAP_STRICTLIMIT                (1 << 2)

/* The bdi that inode_to_bdi() returns for a NULL inode. */
extern struct backing_dev_info noop_backing_dev_info;

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @wb: bdi_writeback of interest
 *
 * Returns %true when the WB_writeback_running bit is set in @wb->state.
 */
static inline bool writeback_in_progress(struct bdi_writeback *wb)
{
        return test_bit(WB_writeback_running, &wb->state) != 0;
}

static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
        struct super_block *sb;

        if (!inode)
                return &noop_backing_dev_info;

        sb = inode->i_sb;
#ifdef CONFIG_BLOCK
        if (sb_is_blkdev_sb(sb))
                return I_BDEV(inode)->bd_bdi;
#endif
        return sb->s_bdi;
}

/* Return the bits of @cong_bits that are set in @wb->congested (0 if none). */
static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
{
        int masked = wb->congested & cong_bits;

        return masked;
}

/* Congestion wait helpers; declared here, defined out of line. */
long congestion_wait(int sync, long timeout);
long wait_iff_congested(int sync, long timeout);

static inline bool mapping_can_writeback(struct address_space *mapping)
{
        return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
}

/* Call schedule() once and return 0; @word is not used. */
static inline int bdi_sched_wait(void *word)
{
        schedule();
        return 0;
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Cgroup writeback entry points; only declared here (defined out of line)
 * when CONFIG_CGROUP_WRITEBACK is set.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css,
                                    gfp_t gfp);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct blkcg *blkcg);
int inode_congested(struct inode *inode, int cong_bits);

/**
 * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
 * @inode: inode of interest
 *
 * Cgroup writeback requires support from the filesystem.  Also, both memcg and
 * iocg have to be on the default hierarchy.  Test whether all conditions are
 * met.
 *
 * Note that the test result may change dynamically on the same inode
 * depending on how memcg and iocg are configured.
 */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);

        return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
                cgroup_subsys_on_dfl(io_cgrp_subsys) &&
                (bdi->capabilities & BDI_CAP_WRITEBACK) &&
                (inode->i_sb->s_iflags & SB_I_CGROUPWB);
}

/**
 * wb_find_current - find wb for %current on a bdi
 * @bdi: bdi of interest
 *
 * Find the wb of @bdi which matches both the memcg and blkcg of %current.
 * Must be called under rcu_read_lock() which protects the returned wb.
 * NULL if not found.
 */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
        struct cgroup_subsys_state *memcg_css;
        struct bdi_writeback *found;

        memcg_css = task_css(current, memory_cgrp_id);

        /* A memcg css without a parent maps to the bdi's embedded wb. */
        if (!memcg_css->parent)
                return &bdi->wb;

        found = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);

        /*
         * %current's blkcg equals the effective blkcg of its memcg.  No
         * need to use the relatively expensive cgroup_get_e_css().
         */
        if (unlikely(!found ||
                     found->blkcg_css != task_css(current, io_cgrp_id)))
                return NULL;

        return found;
}

/**
 * wb_get_create_current - get or create wb for %current on a bdi
 * @bdi: bdi of interest
 * @gfp: allocation mask
 *
 * Equivalent to wb_get_create() on %current's memcg.  This sits on a
 * relatively hot path, so the common case is served by wb_find_current()
 * plus wb_tryget(); wb_get_create() is only used when that fails.
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        struct cgroup_subsys_state *memcg_css;
        struct bdi_writeback *wb;

        rcu_read_lock();
        wb = wb_find_current(bdi);
        if (wb && unlikely(!wb_tryget(wb)))
                wb = NULL;
        rcu_read_unlock();

        if (likely(wb))
                return wb;

        /* Slow path: pin %current's memcg css around the lookup/creation. */
        memcg_css = task_get_css(current, memory_cgrp_id);
        wb = wb_get_create(bdi, memcg_css, gfp);
        css_put(memcg_css);

        return wb;
}

/**
 * inode_to_wb_is_valid - test whether an inode has a wb associated
 * @inode: inode of interest
 *
 * Returns %true if @inode has a wb associated.  May be called without any
 * locking.
 */
static inline bool inode_to_wb_is_valid(struct inode *inode)
{
        return inode->i_wb;
}

/**
 * inode_to_wb - determine the wb of an inode
 * @inode: inode of interest
 *
 * Returns the wb @inode is currently associated with.  The caller must be
 * holding either @inode->i_lock, the i_pages lock, or the
 * associated wb's list_lock.
 */
static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
        /* Warn once if lockdep sees none of the three permitted locks held. */
        WARN_ON_ONCE(debug_locks &&
                     (!lockdep_is_held(&inode->i_lock) &&
                      !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) &&
                      !lockdep_is_held(&inode->i_wb->list_lock)));
#endif
        return inode->i_wb;
}

/**
 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
 * @inode: target inode
 * @cookie: output param, to be passed to the end function
 *
 * The caller wants to access the wb associated with @inode but isn't
 * holding inode->i_lock, the i_pages lock or wb->list_lock.  This
 * function determines the wb associated with @inode and ensures that the
 * association doesn't change until the transaction is finished with
 * unlocked_inode_to_wb_end().
 *
 * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
 * can't sleep during the transaction.  IRQs may or may not be disabled on
 * return.
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        /* Left again by rcu_read_unlock() in unlocked_inode_to_wb_end(). */
        rcu_read_lock();

        /*
         * Paired with store_release in inode_switch_wbs_work_fn() and
         * ensures that we see the new wb if we see cleared I_WB_SWITCH.
         */
        cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;

        /*
         * I_WB_SWITCH was observed: take the i_pages lock, saving the IRQ
         * state in the cookie so the end function can restore it.
         */
        if (unlikely(cookie->locked))
                xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);

        /*
         * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
         * lock.  inode_to_wb() will bark.  Deref directly.
         */
        return inode->i_wb;
}

/**
 * unlocked_inode_to_wb_end - end inode wb access transaction
 * @inode: target inode
 * @cookie: @cookie from unlocked_inode_to_wb_begin()
 */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
                                            struct wb_lock_cookie *cookie)
{
        /*
         * Undo the begin function in reverse order: drop the i_pages lock
         * (restoring the IRQ state saved in the cookie) only if begin took
         * it, then leave the RCU read-side section.
         */
        if (unlikely(cookie->locked))
                xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags);

        rcu_read_unlock();
}

#else        /* CONFIG_CGROUP_WRITEBACK */

static inline bool inode_cgwb_enabled(struct inode *inode)
{
        /* !CONFIG_CGROUP_WRITEBACK stub: always reports false. */
        return false;
}

static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
        /* !CONFIG_CGROUP_WRITEBACK stub: always the wb embedded in @bdi. */
        return &bdi->wb;
}

static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        /*
         * !CONFIG_CGROUP_WRITEBACK stub: nothing is looked up or allocated
         * (@gfp is unused); the wb embedded in @bdi is returned.
         */
        return &bdi->wb;
}

static inline bool inode_to_wb_is_valid(struct inode *inode)
{
        /* !CONFIG_CGROUP_WRITEBACK stub: always reports true. */
        return true;
}

static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
        /*
         * !CONFIG_CGROUP_WRITEBACK stub: the wb embedded in the inode's
         * bdi, as returned by inode_to_bdi().
         */
        return &inode_to_bdi(inode)->wb;
}

static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        /*
         * !CONFIG_CGROUP_WRITEBACK stub: no locking is done and @cookie is
         * left untouched; simply forwards to inode_to_wb().
         */
        return inode_to_wb(inode);
}

static inline void unlocked_inode_to_wb_end(struct inode *inode,
                                            struct wb_lock_cookie *cookie)
{
        /* !CONFIG_CGROUP_WRITEBACK stub: the matching begin took nothing. */
}

static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
        /* !CONFIG_CGROUP_WRITEBACK stub: intentionally a no-op. */
}

static inline void wb_blkcg_offline(struct blkcg *blkcg)
{
        /* !CONFIG_CGROUP_WRITEBACK stub: intentionally a no-op. */
}

static inline int inode_congested(struct inode *inode, int cong_bits)
{
        /* Without cgroup writeback only the bdi's embedded wb exists. */
        struct bdi_writeback *root_wb = &inode_to_bdi(inode)->wb;

        return wb_congested(root_wb, cong_bits);
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

static inline int inode_read_congested(struct inode *inode)
{
        /* Query only the WB_sync_congested bit. */
        const int sync_bit = 1 << WB_sync_congested;

        return inode_congested(inode, sync_bit);
}

static inline int inode_write_congested(struct inode *inode)
{
        /* Query only the WB_async_congested bit. */
        const int async_bit = 1 << WB_async_congested;

        return inode_congested(inode, async_bit);
}

static inline int inode_rw_congested(struct inode *inode)
{
        /* Query the sync and async congestion bits together. */
        const int sync_bit = 1 << WB_sync_congested;
        const int async_bit = 1 << WB_async_congested;

        return inode_congested(inode, sync_bit | async_bit);
}

static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
{
        /* Congestion of a bdi is that of the wb embedded in it. */
        struct bdi_writeback *root_wb = &bdi->wb;

        return wb_congested(root_wb, cong_bits);
}

static inline int bdi_read_congested(struct backing_dev_info *bdi)
{
        /* Query only the WB_sync_congested bit. */
        const int sync_bit = 1 << WB_sync_congested;

        return bdi_congested(bdi, sync_bit);
}

static inline int bdi_write_congested(struct backing_dev_info *bdi)
{
        /* Query only the WB_async_congested bit. */
        const int async_bit = 1 << WB_async_congested;

        return bdi_congested(bdi, async_bit);
}

static inline int bdi_rw_congested(struct backing_dev_info *bdi)
{
        /* Query the sync and async congestion bits together. */
        const int sync_bit = 1 << WB_sync_congested;
        const int async_bit = 1 << WB_async_congested;

        return bdi_congested(bdi, sync_bit | async_bit);
}

const char *bdi_dev_name(struct backing_dev_info *bdi);

#endif        /* _LINUX_BACKING_DEV_H */


























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Queued spinlock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 *
 * Authors: Waiman Long <waiman.long@hpe.com>
 */
#ifndef __ASM_GENERIC_QSPINLOCK_H
#define __ASM_GENERIC_QSPINLOCK_H

#include <asm-generic/qspinlock_types.h>
#include <linux/atomic.h>

#ifndef queued_spin_is_locked
/**
 * queued_spin_is_locked - is the spinlock locked?
 * @lock: Pointer to queued spinlock structure
 * Return: non-zero if it is locked, 0 otherwise
 */
static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
{
        /*
         * The whole lock word is reported: any !0 state counts as locked,
         * even if _Q_LOCKED_VAL isn't immediately observable.
         */
        int snapshot = atomic_read(&lock->val);

        return snapshot;
}
#endif

/**
 * queued_spin_value_unlocked - is the spinlock structure unlocked?
 * @lock: queued spinlock structure
 * Return: 1 if it is unlocked, 0 otherwise
 *
 * N.B. Whenever there are tasks waiting for the lock, it is considered
 *      locked wrt the lockref code to avoid lock stealing by the lockref
 *      code and change things underneath the lock. This also allows some
 *      optimizations to be applied without conflict with lockref.
 */
static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
{
        /* @lock is a by-value copy; unlocked means the whole word is zero. */
        return lock.val.counter == 0;
}

/**
 * queued_spin_is_contended - check if the lock is contended
 * @lock : Pointer to queued spinlock structure
 * Return: non-zero if lock contended, 0 otherwise
 */
static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
{
        int snapshot = atomic_read(&lock->val);

        /* Contended when any bit outside _Q_LOCKED_MASK is set. */
        return snapshot & ~_Q_LOCKED_MASK;
}
/**
 * queued_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{
        /*
         * Keep the snapshot in a signed temporary: atomic_read() yields an
         * int and atomic_try_cmpxchg_acquire() takes an int * for the
         * expected value, so a u32 here is a pointer-sign mismatch.
         */
        int val = atomic_read(&lock->val);

        /* Any non-zero lock word means the lock is held or queued on. */
        if (unlikely(val))
                return 0;

        /* val is 0 here: succeed only if the lock word is still 0. */
        return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);

#ifndef queued_spin_lock
/**
 * queued_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 */
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
        /*
         * Signed temporary to match the int * expected-value argument of
         * atomic_try_cmpxchg_acquire(); a u32 is a pointer-sign mismatch.
         */
        int val = 0;

        /* Fast path: uncontended 0 -> _Q_LOCKED_VAL transition. */
        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
                return;

        /*
         * The exchange failed, so val now holds the lock word that was
         * observed; pass it on to the slowpath.
         */
        queued_spin_lock_slowpath(lock, val);
}
#endif

#ifndef queued_spin_unlock
/**
 * queued_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void queued_spin_unlock(struct qspinlock *lock)
{
        /*
         * unlock() needs release semantics:
         *
         * Only the ->locked field is written (set to 0); the store is a
         * release so it is ordered after the accesses made while the lock
         * was held.
         */
        smp_store_release(&lock->locked, 0);
}
#endif

#ifndef virt_spin_lock
static __always_inline bool virt_spin_lock(struct qspinlock *lock)
{
        /*
         * Default used when no virt_spin_lock is already defined (see the
         * surrounding #ifndef): does nothing with @lock and reports false.
         */
        return false;
}
#endif

/*
 * Remapping spinlock architecture specific functions to the corresponding
 * queued spinlock functions.
 *
 * Each arch_spin_*() name forwards its argument unchanged to the
 * queued_spin_*() function with the same suffix.  queued_spin_is_locked(),
 * queued_spin_lock() and queued_spin_unlock() are guarded by #ifndef above,
 * so a definition provided before this header is the one that gets used.
 */
#define arch_spin_is_locked(l)                queued_spin_is_locked(l)
#define arch_spin_is_contended(l)        queued_spin_is_contended(l)
#define arch_spin_value_unlocked(l)        queued_spin_value_unlocked(l)
#define arch_spin_lock(l)                queued_spin_lock(l)
#define arch_spin_trylock(l)                queued_spin_trylock(l)
#define arch_spin_unlock(l)                queued_spin_unlock(l)

#endif /* __ASM_GENERIC_QSPINLOCK_H */



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
 *  cgroup interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/kernel_stat.h>

#include <linux/cgroup-defs.h>

struct kernel_clone_args;

#ifdef CONFIG_CGROUPS

/*
 * All weight knobs on the default hierarchy should use the following min,
 * default and max values.  The default value is the logarithmic center of
 * MIN and MAX and allows 100x to be expressed in both directions
 * (1 * 100 == 100 and 100 * 100 == 10000).
 */
#define CGROUP_WEIGHT_MIN                1
#define CGROUP_WEIGHT_DFL                100
#define CGROUP_WEIGHT_MAX                10000

/*
 * Flag bits for css_task_iter; presumably the @flags argument of
 * css_task_iter_start() - confirm against its definition.
 */
/* walk only threadgroup leaders */
#define CSS_TASK_ITER_PROCS                (1U << 0)
/* walk all threaded css_sets in the domain */
#define CSS_TASK_ITER_THREADED                (1U << 1)

/* internal flags (kept at bit 16, away from the two flags above) */
#define CSS_TASK_ITER_SKIPPED                (1U << 16)

/*
 * a css_task_iter should be treated as an opaque object
 *
 * It is the state object passed by pointer to css_task_iter_start(),
 * css_task_iter_next() and css_task_iter_end().  Callers should not touch
 * the fields directly.
 */
struct css_task_iter {
        struct cgroup_subsys                *ss;
        unsigned int                        flags;        /* presumably CSS_TASK_ITER_* bits - confirm */

        struct list_head                *cset_pos;
        struct list_head                *cset_head;

        struct list_head                *tcset_pos;
        struct list_head                *tcset_head;

        struct list_head                *task_pos;

        struct list_head                *cur_tasks_head;
        struct css_set                        *cur_cset;
        struct css_set                        *cur_dcset;
        struct task_struct                *cur_task;
        struct list_head                iters_node;        /* css_set->task_iters */
};

extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;

/*
 * First pass over cgroup_subsys.h: with SUBSYS() defined this way, every
 * SUBSYS(name) use in that header expands to an extern declaration of
 * name_cgrp_subsys.
 */
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/*
 * Second pass over the same header: declare, per subsystem, the two static
 * keys name_cgrp_subsys_enabled_key and name_cgrp_subsys_on_dfl_key.
 */
#define SUBSYS(_x)                                                                \
        extern struct static_key_true _x ## _cgrp_subsys_enabled_key;                \
        extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/**
 * cgroup_subsys_enabled - fast test on whether a subsys is enabled
 * @ss: subsystem in question
 *
 * @ss is token-pasted with _enabled_key, so it must be the bare identifier
 * of the subsystem variable (of the form name_cgrp_subsys), not an
 * arbitrary expression or pointer.
 */
#define cgroup_subsys_enabled(ss)                                                \
        static_branch_likely(&ss ## _enabled_key)

/**
 * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
 * @ss: subsystem in question
 *
 * @ss is token-pasted with _on_dfl_key, so it must be the bare identifier
 * of the subsystem variable (of the form name_cgrp_subsys), not an
 * arbitrary expression or pointer.
 */
#define cgroup_subsys_on_dfl(ss)                                                \
        static_branch_likely(&ss ## _on_dfl_key)

bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
                                         struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
                                             struct cgroup_subsys *ss);
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss);

struct cgroup *cgroup_get_from_path(const char *path);
struct cgroup *cgroup_get_from_fd(int fd);

/* NOTE(review): second parameter was unnamed; name chosen to match the definition - confirm. */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk);
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);

int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);

int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk);

void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p,
                           struct kernel_clone_args *kargs);
extern void cgroup_cancel_fork(struct task_struct *p,
                               struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
                             struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);

int cgroup_init_early(void);
int cgroup_init(void);

int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);

/*
 * Iteration helpers and macros.
 */

struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent);
struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
                                                    struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
                                                     struct cgroup_subsys_state *css);

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp);

void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);

/**
 * css_for_each_child - iterate through children of a css
 * @pos: the css * to use as the loop cursor
 * @parent: css whose children to walk
 *
 * Walk @parent's children.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 *
 * The macro expands to a for statement and evaluates @pos and @parent more
 * than once, so pass expressions without side effects.
 */
#define css_for_each_child(pos, parent)                                        \
        for ((pos) = css_next_child(NULL, (parent)); (pos);                \
             (pos) = css_next_child((pos), (parent)))

/**
 * css_for_each_descendant_pre - pre-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Walk @css's descendants.  @css is included in the iteration and the
 * first node to be visited.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * For example, the following guarantees that a descendant can't escape
 * state updates of its ancestors.
 *
 * my_online(@css)
 * {
 *        Lock @css's parent and @css;
 *        Inherit state from the parent;
 *        Unlock both.
 * }
 *
 * my_update_state(@css)
 * {
 *        css_for_each_descendant_pre(@pos, @css) {
 *                Lock @pos;
 *                if (@pos == @css)
 *                        Update @css's state;
 *                else
 *                        Verify @pos is alive and inherit state from its parent;
 *                Unlock @pos;
 *        }
 * }
 *
 * As long as the inheriting step, including checking the parent state, is
 * enclosed inside @pos locking, double-locking the parent isn't necessary
 * while inheriting.  The state update to the parent is guaranteed to be
 * visible by walking order and, as long as inheriting operations to the
 * same @pos are atomic to each other, multiple updates racing each other
 * still result in the correct state.  It's guaranteed that at least one
 * inheritance happens for any css after the latest update to its parent.
 *
 * If checking parent's state requires locking the parent, each inheriting
 * iteration should lock and unlock both @pos->parent and @pos.
 *
 * Alternatively, a subsystem may choose to use a single global lock to
 * synchronize ->css_online() and ->css_offline() against tree-walking
 * operations.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 */
#define css_for_each_descendant_pre(pos, css)                                \
        for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_pre((pos), (css)))

/**
 * css_for_each_descendant_post - post-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Similar to css_for_each_descendant_pre() but performs post-order
 * traversal instead.  @css is included in the iteration and the last
 * node to be visited.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Note that the walk visibility guarantee example described in pre-order
 * walk doesn't apply the same to post-order walks.
 */
#define css_for_each_descendant_post(pos, css)                                \
        for ((pos) = css_next_descendant_post(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_post((pos), (css)))

/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * @tset may contain multiple tasks and they may belong to multiple
 * processes.
 *
 * On the v2 hierarchy, there may be tasks from multiple processes and they
 * may not share the source or destination csses.
 *
 * On traditional hierarchies, when there are multiple tasks in @tset, if a
 * task of a process is in @tset, all tasks of the process are in @tset.
 * Also, all are guaranteed to share the same source and destination csses.
 *
 * Iteration is not in any specific order.
 *
 * @dst_css must be an lvalue: its address is handed to
 * cgroup_taskset_first() and cgroup_taskset_next() on every step.
 */
#define cgroup_taskset_for_each(task, dst_css, tset)                        \
        for ((task) = cgroup_taskset_first((tset), &(dst_css));                \
             (task);                                                        \
             (task) = cgroup_taskset_next((tset), &(dst_css)))

/**
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * @leader: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * may not contain any.
 *
 * The macro ends in a dangling "if (...) ; else": tasks that are not their
 * own ->group_leader hit the empty statement and are skipped, while the
 * statement the caller writes after the macro becomes the else body.
 */
#define cgroup_taskset_for_each_leader(leader, dst_css, tset)                \
        for ((leader) = cgroup_taskset_first((tset), &(dst_css));        \
             (leader);                                                        \
             (leader) = cgroup_taskset_next((tset), &(dst_css)))        \
                if ((leader) != (leader)->group_leader)                        \
                        ;                                                \
                else

/*
 * Inline functions.
 */

static inline u64 cgroup_id(struct cgroup *cgrp)
{
        /* The cgroup's ID is the ->id of its ->kn member. */
        u64 id = cgrp->kn->id;

        return id;
}

/**
 * css_get - obtain a reference on the specified css
 * @css: target css
 *
 * The caller must already have a reference.
 */
static inline void css_get(struct cgroup_subsys_state *css)
{
        /* CSS_NO_REF csses are not reference counted. */
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get(&css->refcnt);
}

/**
 * css_get_many - obtain references on the specified css
 * @css: target css
 * @n: number of references to get
 *
 * The caller must already have a reference.
 */
static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
{
        /* CSS_NO_REF csses are not reference counted. */
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get_many(&css->refcnt, n);
}

/**
 * css_tryget - try to obtain a reference on the specified css
 * @css: target css
 *
 * Obtain a reference on @css unless it already has reached zero and is
 * being released.  This function doesn't care whether @css is on or
 * offline.  The caller naturally needs to ensure that @css is accessible
 * but doesn't have to be holding a reference on it - IOW, RCU protected
 * access is good enough for this function.  Returns %true if a reference
 * count was successfully obtained; %false otherwise.
 */
static inline bool css_tryget(struct cgroup_subsys_state *css)
{
        /* Nothing to take on a CSS_NO_REF css; report success. */
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget(&css->refcnt);
}

/**
 * css_tryget_online - try to obtain a reference on the specified css if online
 * @css: target css
 *
 * Obtain a reference on @css if it's online.  The caller naturally needs
 * to ensure that @css is accessible but doesn't have to be holding a
 * reference on it - IOW, RCU protected access is good enough for this
 * function.  Returns %true if a reference count was successfully obtained;
 * %false otherwise.
 */
static inline bool css_tryget_online(struct cgroup_subsys_state *css)
{
        /* Nothing to take on a CSS_NO_REF css; report success. */
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget_live(&css->refcnt);
}

/**
 * css_is_dying - test whether the specified css is dying
 * @css: target css
 *
 * Test whether @css is in the process of offlining or already offline.  In
 * most cases, ->css_online() and ->css_offline() callbacks should be
 * enough; however, the actual offline operations are RCU delayed and this
 * test returns %true also when @css is scheduled to be offlined.
 *
 * This is useful, for example, when the use case requires synchronous
 * behavior with respect to cgroup removal.  cgroup removal schedules css
 * offlining but the css can seem alive while the operation is being
 * delayed.  If the delay affects user visible semantics, this test can be
 * used to resolve the situation.
 */
static inline bool css_is_dying(struct cgroup_subsys_state *css)
{
        /* A CSS_NO_REF css is never reported as dying. */
        if (css->flags & CSS_NO_REF)
                return false;

        return percpu_ref_is_dying(&css->refcnt);
}

/**
 * css_put - put a css reference
 * @css: target css
 *
 * Put a reference obtained via css_get() and css_tryget_online().
 */
static inline void css_put(struct cgroup_subsys_state *css)
{
        /* CSS_NO_REF csses are not reference counted. */
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put(&css->refcnt);
}

/**
 * css_put_many - put css references
 * @css: target css
 * @n: number of references to put
 *
 * Put references obtained via css_get() and css_tryget_online().
 */
static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
{
        /* CSS_NO_REF csses are not reference counted. */
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put_many(&css->refcnt, n);
}

static inline void cgroup_get(struct cgroup *cgrp)
{
        /* A cgroup's reference count is that of its ->self css. */
        struct cgroup_subsys_state *self_css = &cgrp->self;

        css_get(self_css);
}

static inline bool cgroup_tryget(struct cgroup *cgrp)
{
        /* A cgroup's reference count is that of its ->self css. */
        struct cgroup_subsys_state *self_css = &cgrp->self;

        return css_tryget(self_css);
}

static inline void cgroup_put(struct cgroup *cgrp)
{
        /* A cgroup's reference count is that of its ->self css. */
        struct cgroup_subsys_state *self_css = &cgrp->self;

        css_put(self_css);
}

/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * A task's css_set is RCU protected, initialized and exited while holding
 * task_lock(), and can only be modified while holding both cgroup_mutex
 * and task_lock() while the task is alive.  This macro verifies that the
 * caller is inside proper critical section and returns @task's css_set.
 *
 * The caller can also specify additional allowed conditions via @__c, such
 * as locks used during the cgroup_subsys::attach() methods.
 *
 * The verification only exists with CONFIG_PROVE_RCU.  Without it the macro
 * is a plain rcu_dereference() and @__c is not evaluated at all, so @__c
 * must not have side effects.
 */
#ifdef CONFIG_PROVE_RCU
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c)                                        \
        rcu_dereference_check((task)->cgroups,                                \
                rcu_read_lock_sched_held() ||                                \
                lockdep_is_held(&cgroup_mutex) ||                        \
                lockdep_is_held(&css_set_lock) ||                        \
                ((task)->flags & PF_EXITING) || (__c))
#else
#define task_css_set_check(task, __c)                                        \
        rcu_dereference((task)->cgroups)
#endif

/**
 * task_css_check - obtain css for (task, subsys) w/ extra access conds
 * @task: the target task
 * @subsys_id: the target subsystem ID
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
 * synchronization rules are the same as task_css_set_check().
 *
 * Expands to the ->subsys[@subsys_id] element of the css_set obtained via
 * task_css_set_check(@task, @__c).
 */
#define task_css_check(task, subsys_id, __c)                                \
        task_css_set_check((task), (__c))->subsys[(subsys_id)]

/**
 * task_css_set - obtain a task's css_set
 * @task: the task to obtain css_set for
 *
 * See task_css_set_check().
 */
static inline struct css_set *task_css_set(struct task_struct *task)
{
        /* No extra access condition beyond the built-in ones. */
        struct css_set *cset = task_css_set_check(task, false);

        return cset;
}

/**
 * task_css - obtain css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * See task_css_check().
 */
static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
                                                   int subsys_id)
{
        /* No extra access condition beyond the built-in ones. */
        struct cgroup_subsys_state *css = task_css_check(task, subsys_id, false);

        return css;
}

/**
 * task_get_css - find and get the css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Look up the css for the (@task, @subsys_id) pair under rcu_read_lock()
 * and take a reference on it, retrying the lookup until css_tryget()
 * succeeds.  This function is guaranteed to return a valid css.  The
 * returned css may already have been offlined.
 */
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = task_css(task, subsys_id);
        /*
         * css_tryget_online() can't be used here.  A task with PF_EXITING
         * set may stay associated with an offline css; if such a task
         * called this function, css_tryget_online() would keep failing.
         */
        while (unlikely(!css_tryget(css))) {
                cpu_relax();
                css = task_css(task, subsys_id);
        }
        rcu_read_unlock();

        return css;
}

/**
 * task_css_is_root - test whether a task belongs to the root css
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Compare @task's css for @subsys_id against the one recorded in
 * init_css_set.  The lookup passes %true as the extra access condition,
 * so this may be invoked in any context.
 */
static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *root_css = init_css_set.subsys[subsys_id];

        return root_css == task_css_check(task, subsys_id, true);
}

static inline struct cgroup *task_cgroup(struct task_struct *task,
                                         int subsys_id)
{
        return task_css(task, subsys_id)->cgroup;
}

static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
        return task_css_set(task)->dfl_cgrp;
}

/*
 * Return the cgroup embedding @cgrp's parent css (->self.parent), or NULL
 * when @cgrp has no parent css.
 */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pcss = cgrp->self.parent;

        if (!pcss)
                return NULL;

        return container_of(pcss, struct cgroup, self);
}

/**
 * cgroup_is_descendant - test ancestry
 * @cgrp: the cgroup to be tested
 * @ancestor: possible ancestor of @cgrp
 *
 * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
 * if @cgrp == @ancestor.  Cgroups on different roots, or a @cgrp that sits
 * at a shallower level than @ancestor, never match.  This function is safe
 * to call as long as @cgrp and @ancestor are accessible.
 */
static inline bool cgroup_is_descendant(struct cgroup *cgrp,
                                        struct cgroup *ancestor)
{
        if (cgrp->root != ancestor->root)
                return false;
        if (cgrp->level < ancestor->level)
                return false;

        /* the ID recorded for @ancestor's level must be @ancestor's own */
        return cgroup_id(ancestor) == cgrp->ancestor_ids[ancestor->level];
}

/**
 * cgroup_ancestor - find ancestor of cgroup
 * @cgrp: cgroup to find ancestor of
 * @ancestor_level: level of ancestor to find starting from root
 *
 * Walk up from @cgrp via cgroup_parent() until a cgroup whose level is not
 * greater than @ancestor_level is reached and return it.  Return NULL if
 * @cgrp doesn't have ancestor at @ancestor_level, i.e. when @cgrp itself is
 * shallower than @ancestor_level.
 *
 * This function is safe to call as long as @cgrp is accessible.
 */
static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
                                             int ancestor_level)
{
        struct cgroup *pos;

        if (ancestor_level > cgrp->level)
                return NULL;

        for (pos = cgrp; pos && pos->level > ancestor_level;
             pos = cgroup_parent(pos))
                ;

        return pos;
}

/**
 * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
 * @task: the task to be tested
 * @ancestor: possible ancestor of @task's cgroup
 *
 * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
 * It follows all the same rules as cgroup_is_descendant, and only applies
 * to the default hierarchy.
 */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        struct css_set *cset = task_css_set(task);

        return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
}

/*
 * Report whether any of @cgrp's populated counters (csets, domain
 * children, threaded children) adds up to a non-zero total.  No
 * synchronization is used, so the result can only be used as a hint.
 */
static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
        return (cgrp->nr_populated_csets +
                cgrp->nr_populated_domain_children +
                cgrp->nr_populated_threaded_children) != 0;
}

/* returns ino associated with a cgroup */
static inline ino_t cgroup_ino(struct cgroup *cgrp)
{
        return kernfs_ino(cgrp->kn);
}

/* cft/css accessors for cftype->write() operation */

/* Return the cftype stored in ->priv of the kernfs node behind @of. */
static inline struct cftype *of_cft(struct kernfs_open_file *of)
{
        struct cftype *cft = of->kn->priv;

        return cft;
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);

/* cft/css accessors for cftype->seq_*() operations */

/* Return the cftype for @seq, whose ->private is the kernfs open file. */
static inline struct cftype *seq_cft(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_cft(of);
}

/* Return the css for @seq by handing its ->private to of_css(). */
static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_css(of);
}

/*
 * Name / path handling functions.  All are thin wrappers around the kernfs
 * counterparts and can be called under any context.
 */

/* Thin wrapper: forward to kernfs_name() on @cgrp's kernfs node. */
static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
{
        int ret = kernfs_name(cgrp->kn, buf, buflen);

        return ret;
}

/* Thin wrapper: forward to kernfs_path() on @cgrp's kernfs node. */
static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
{
        int ret = kernfs_path(cgrp->kn, buf, buflen);

        return ret;
}

/* Thin wrapper: forward to pr_cont_kernfs_name() on @cgrp's kernfs node. */
static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_name(kn);
}

/* Thin wrapper: forward to pr_cont_kernfs_path() on @cgrp's kernfs node. */
static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_path(kn);
}

/* Return the address of the psi_group embedded in @cgrp. */
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
        struct psi_group *psi = &cgrp->psi;

        return psi;
}

/*
 * Set ->no_cgroup_migration on the current task.
 *
 * kthreadd is inherited by all kthreads; keeping it in the root guarantees
 * that new kthreads stay in the root until their initialization is
 * finished.
 */
static inline void cgroup_init_kthreadd(void)
{
        current->no_cgroup_migration = 1;
}

/*
 * Clear ->no_cgroup_migration on the current task.
 *
 * Called once this kthread has finished initialization.  The creator should
 * have set PF_NO_SETAFFINITY if this kthread should stay in the root.
 */
static inline void cgroup_kthread_ready(void)
{
        current->no_cgroup_migration = 0;
}

void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
#else /* !CONFIG_CGROUPS */

struct cgroup_subsys_state;
struct cgroup;

/*
 * !CONFIG_CGROUPS stubs: cgroup_id() reports a constant 1, css reference
 * counting is a no-op, cgroup_attach_task_all() reports success and
 * cgroupstats_build() fails with -EINVAL.
 */
static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
                                         struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
                                    struct dentry *dentry) { return -EINVAL; }

/*
 * !CONFIG_CGROUPS stubs for the fork/exit hooks: all are no-ops and
 * cgroup_can_fork() always reports success (0).
 */
static inline void cgroup_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p,
                                  struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p,
                                      struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
                                    struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}

/*
 * !CONFIG_CGROUPS stubs for initialization: the init calls report success
 * (0) and the kthread hooks do nothing.
 */
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_kthreadd(void) {}
static inline void cgroup_kthread_ready(void) {}

/* !CONFIG_CGROUPS: there is no hierarchy, so no cgroup has a parent. */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        return NULL;
}

/* !CONFIG_CGROUPS: no per-cgroup psi group is available. */
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
        return NULL;
}

/* !CONFIG_CGROUPS: every task is reported as being under @ancestor. */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        return true;
}

/* !CONFIG_CGROUPS: no-op; @buf is left untouched. */
static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{}
#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUPS
/*
 * cgroup scalable recursive statistics.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp);
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
void cgroup_rstat_flush_release(void);

/*
 * Basic resource stats.
 */
#ifdef CONFIG_CGROUP_CPUACCT
void cpuacct_charge(struct task_struct *tsk, u64 cputime);
void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
/* !CONFIG_CGROUP_CPUACCT: cpuacct accounting hooks compile to nothing. */
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_account_field(struct task_struct *tsk, int index,
                                         u64 val) {}
#endif

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec);

/*
 * Charge @delta_exec of CPU time consumed by @task: first to cpuacct via
 * cpuacct_charge(), then to @task's default-hierarchy cgroup unless that
 * cgroup has no parent.
 *
 * NOTE(review): unlike cgroup_account_cputime_field(), no rcu_read_lock()
 * is taken around task_dfl_cgroup() here; presumably callers already run
 * in a context accepted by task_css_set_check() (e.g. an RCU-sched
 * read-side section) -- confirm against the call sites.
 */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec)
{
        struct cgroup *dfl;

        cpuacct_charge(task, delta_exec);

        dfl = task_dfl_cgroup(task);
        if (!cgroup_parent(dfl))
                return;

        __cgroup_account_cputime(dfl, delta_exec);
}

/*
 * Charge @delta_exec to the @index usage bucket for @task: first to cpuacct
 * via cpuacct_account_field(), then, under rcu_read_lock(), to @task's
 * default-hierarchy cgroup unless that cgroup has no parent.
 */
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec)
{
        struct cgroup *dfl;

        cpuacct_account_field(task, index, delta_exec);

        rcu_read_lock();
        dfl = task_dfl_cgroup(task);
        if (cgroup_parent(dfl) != NULL)
                __cgroup_account_cputime_field(dfl, index, delta_exec);
        rcu_read_unlock();
}

#else        /* !CONFIG_CGROUPS */

/* Without cgroups there is nothing to charge; both hooks are no-ops. */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec) {}
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec) {}

#endif        /* !CONFIG_CGROUPS */

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
extern spinlock_t cgroup_sk_update_lock;
#endif

void cgroup_sk_alloc_disable(void);
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);

/*
 * Return the cgroup pointer stored in @skcd.
 *
 * When net_prio or net_cls support is built in, ->val is read once; if
 * either of its two low bits is set, or the value is zero, the default
 * root cgroup is returned instead.  Otherwise ->val is returned as a
 * pointer unchanged.
 */
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
        /*
         * @skcd->val is 64bit but the following is safe on 32bit too as we
         * just need the lower ulong to be written and read atomically.
         */
        unsigned long raw = READ_ONCE(skcd->val);

        if ((raw & 3) || !raw)
                return &cgrp_dfl_root.cgrp;

        return (struct cgroup *)raw;
#else
        return (struct cgroup *)(unsigned long)skcd->val;
#endif
}

#else        /* !CONFIG_SOCK_CGROUP_DATA */

/* No per-socket cgroup data: allocation, clone and free are no-ops. */
static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}

#endif        /* CONFIG_SOCK_CGROUP_DATA */

/*
 * A cgroup namespace.  Lifetime is controlled by @count: get_cgroup_ns()
 * increments it and put_cgroup_ns() calls free_cgroup_ns() once it drops
 * to zero.
 */
struct cgroup_namespace {
        /* reference count, see get_cgroup_ns() / put_cgroup_ns() */
        refcount_t                count;
        /* common namespace data */
        struct ns_common        ns;
        /* presumably the owning user namespace -- TODO confirm */
        struct user_namespace        *user_ns;
        /* presumably the ucounts charged for this namespace -- TODO confirm */
        struct ucounts                *ucounts;
        /* presumably the css_set that acts as this namespace's root -- TODO confirm */
        struct css_set          *root_cset;
};

extern struct cgroup_namespace init_cgroup_ns;

#ifdef CONFIG_CGROUPS

void free_cgroup_ns(struct cgroup_namespace *ns);

struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                        struct user_namespace *user_ns,
                                        struct cgroup_namespace *old_ns);

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
                   struct cgroup_namespace *ns);

#else /* !CONFIG_CGROUPS */

/* !CONFIG_CGROUPS: nothing to free. */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
/* !CONFIG_CGROUPS: no new namespace is created; @old_ns is handed back. */
static inline struct cgroup_namespace *
copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
               struct cgroup_namespace *old_ns)
{
        return old_ns;
}

#endif /* !CONFIG_CGROUPS */

/* Take a reference on @ns; a NULL @ns is silently ignored. */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        refcount_inc(&ns->count);
}

/*
 * Drop a reference on @ns and free it via free_cgroup_ns() when the count
 * reaches zero.  A NULL @ns is silently ignored.
 */
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        if (refcount_dec_and_test(&ns->count))
                free_cgroup_ns(ns);
}

#ifdef CONFIG_CGROUPS

void cgroup_enter_frozen(void);
void cgroup_leave_frozen(bool always_leave);
void cgroup_update_frozen(struct cgroup *cgrp);
void cgroup_freeze(struct cgroup *cgrp, bool freeze);
void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
                                 struct cgroup *dst);

/*
 * Report whether CGRP_FREEZE is set on @task's default-hierarchy cgroup.
 * Tasks with PF_KTHREAD set always report false.  The cgroup lookup is
 * done under rcu_read_lock().
 */
static inline bool cgroup_task_freeze(struct task_struct *task)
{
        bool freezing = false;

        if (!(task->flags & PF_KTHREAD)) {
                rcu_read_lock();
                freezing = test_bit(CGRP_FREEZE,
                                    &task_dfl_cgroup(task)->flags);
                rcu_read_unlock();
        }

        return freezing;
}

/* Return @task's ->frozen flag. */
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        bool frozen = task->frozen;

        return frozen;
}

#else /* !CONFIG_CGROUPS */

/*
 * !CONFIG_CGROUPS stubs for the freezer: entering/leaving the frozen state
 * does nothing and no task is ever reported as freezing or frozen.
 */
static inline void cgroup_enter_frozen(void) { }
static inline void cgroup_leave_frozen(bool always_leave) { }
static inline bool cgroup_task_freeze(struct task_struct *task)
{
        return false;
}
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        return false;
}

#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUP_BPF
/* Take a reference on @cgrp's bpf percpu refcount. */
static inline void cgroup_bpf_get(struct cgroup *cgrp)
{
        struct percpu_ref *ref = &cgrp->bpf.refcnt;

        percpu_ref_get(ref);
}

/* Drop a reference on @cgrp's bpf percpu refcount. */
static inline void cgroup_bpf_put(struct cgroup *cgrp)
{
        struct percpu_ref *ref = &cgrp->bpf.refcnt;

        percpu_ref_put(ref);
}

#else /* !CONFIG_CGROUP_BPF */

/* Without cgroup BPF there is no refcount to manipulate. */
static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}

#endif /* CONFIG_CGROUP_BPF */

#endif /* _LINUX_CGROUP_H */










































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H

#define ARCH_DEFAULT_PKEY        0

/*
 * Number of protection keys available: 16 when the boot CPU reports
 * X86_FEATURE_OSPKE, otherwise 1.
 *
 * If more than 16 keys are ever supported, a thorough audit
 * will be necessary to ensure that the types that store key
 * numbers and masks have sufficient capacity.
 */
#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)

extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);

/* Report whether the boot CPU has X86_FEATURE_OSPKE set. */
static inline bool arch_pkeys_enabled(void)
{
        bool enabled = boot_cpu_has(X86_FEATURE_OSPKE);

        return enabled;
}

/*
 * Try to dedicate one of the protection keys to be used as an
 * execute-only protection key.
 *
 * Without X86_FEATURE_OSPKE the default key is returned and
 * __execute_only_pkey() is never called.
 */
extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
        if (boot_cpu_has(X86_FEATURE_OSPKE))
                return __execute_only_pkey(mm);

        return ARCH_DEFAULT_PKEY;
}

extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
                int prot, int pkey);
/*
 * Return 0 when the boot CPU lacks X86_FEATURE_OSPKE; otherwise defer to
 * __arch_override_mprotect_pkey() and return its result.
 */
static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
                int prot, int pkey)
{
        if (boot_cpu_has(X86_FEATURE_OSPKE))
                return __arch_override_mprotect_pkey(vma, prot, pkey);

        return 0;
}

extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);

#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)

/*
 * Per-mm pkey allocation bitmap helpers.
 *
 * mm_pkey_allocation_map() names the bitmap stored in @mm's context and
 * expands to an lvalue.  mm_set_pkey_allocated() sets bit @pkey in it and
 * mm_set_pkey_free() clears bit @pkey.
 *
 * Every macro argument is parenthesized so that expression arguments
 * (e.g. "&some_mm" or "a | b") expand with the intended precedence.
 * Neither setter range-checks @pkey; callers must pass a value that is a
 * valid shift count for the 32-bit mask.
 */
#define mm_pkey_allocation_map(mm)        ((mm)->context.pkey_allocation_map)
#define mm_set_pkey_allocated(mm, pkey) do {                \
        mm_pkey_allocation_map(mm) |= (1U << (pkey));        \
} while (0)
#define mm_set_pkey_free(mm, pkey) do {                        \
        mm_pkey_allocation_map(mm) &= ~(1U << (pkey));        \
} while (0)

/*
 * Report whether @pkey is allocated in @mm and usable from the user
 * interfaces.
 *
 * "Allocated" pkeys are those that have been returned from pkey_alloc()
 * or pkey 0 which is allocated implicitly when the mm is created.
 */
static inline
bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
{
        /* reject anything outside [0, arch_max_pkey()) */
        if (pkey < 0 || pkey >= arch_max_pkey())
                return false;

        /*
         * The exec-only pkey is set in the allocation map, but
         * is not available to any of the user interfaces like
         * mprotect_pkey().
         */
        if (mm->context.execute_only_pkey == pkey)
                return false;

        return (mm_pkey_allocation_map(mm) & (1U << pkey)) != 0;
}

/*
 * Allocate the lowest-numbered free pkey in @mm's allocation map and mark
 * it allocated.
 *
 * Returns a positive, 4-bit key on success, or -1 on failure.
 */
static inline
int mm_pkey_alloc(struct mm_struct *mm)
{
        /*
         * Note: this is the one and only place we make sure
         * that the pkey is valid as far as the hardware is
         * concerned.  The rest of the kernel trusts that
         * only good, valid pkeys come out of here.
         */
        u16 full_map = ((1U << arch_max_pkey()) - 1);
        int pkey;

        /*
         * A completely full map must be handled up front because
         * ffz() behavior is undefined if there are no zeros.
         */
        if (mm_pkey_allocation_map(mm) == full_map)
                return -1;

        pkey = ffz(mm_pkey_allocation_map(mm));
        mm_set_pkey_allocated(mm, pkey);

        return pkey;
}

/*
 * Release @pkey in @mm's allocation map.
 *
 * Returns 0 on success, or -EINVAL when mm_pkey_is_allocated() does not
 * consider @pkey allocated.
 */
static inline
int mm_pkey_free(struct mm_struct *mm, int pkey)
{
        if (mm_pkey_is_allocated(mm, pkey)) {
                mm_set_pkey_free(mm, pkey);
                return 0;
        }

        return -EINVAL;
}

extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);
extern void copy_init_pkru_to_fpregs(void);

/*
 * Extract the protection key from @vma: mask ->vm_flags with the four
 * VM_PKEY_BIT* flags and shift the result down by VM_PKEY_SHIFT.
 */
static inline int vma_pkey(struct vm_area_struct *vma)
{
        const unsigned long pkey_bits = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
                                        VM_PKEY_BIT2 | VM_PKEY_BIT3;
        unsigned long masked = vma->vm_flags & pkey_bits;

        return masked >> VM_PKEY_SHIFT;
}

#endif /*_ASM_X86_PKEYS_H */





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-fallback.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_FALLBACK_H
#define _LINUX_ATOMIC_FALLBACK_H

#include <linux/compiler.h>

/*
 * arch_xchg() ordering variants.
 *
 * If the architecture provides no arch_xchg_relaxed, the _relaxed,
 * _acquire and _release names all alias the arch_xchg it does provide.
 * Otherwise each variant the architecture left undefined (_acquire,
 * _release and the unsuffixed one) is built by the matching
 * __atomic_op_acquire/__atomic_op_release/__atomic_op_fence helper.
 */
#ifndef arch_xchg_relaxed
#define arch_xchg_relaxed                arch_xchg
#define arch_xchg_acquire                arch_xchg
#define arch_xchg_release                arch_xchg
#else /* arch_xchg_relaxed */

#ifndef arch_xchg_acquire
#define arch_xchg_acquire(...) \
        __atomic_op_acquire(arch_xchg, __VA_ARGS__)
#endif

#ifndef arch_xchg_release
#define arch_xchg_release(...) \
        __atomic_op_release(arch_xchg, __VA_ARGS__)
#endif

#ifndef arch_xchg
#define arch_xchg(...) \
        __atomic_op_fence(arch_xchg, __VA_ARGS__)
#endif

#endif /* arch_xchg_relaxed */

#ifndef arch_cmpxchg_relaxed
/*
 * arch_cmpxchg_relaxed is not defined at this point, so all three
 * ordering-suffixed names simply alias the plain arch_cmpxchg.
 */
#define arch_cmpxchg_relaxed                arch_cmpxchg
#define arch_cmpxchg_acquire                arch_cmpxchg
#define arch_cmpxchg_release                arch_cmpxchg
#else /* arch_cmpxchg_relaxed */

/*
 * arch_cmpxchg_relaxed exists: fill in whichever of the acquire, release
 * and plain forms are still missing by routing through the
 * __atomic_op_*() helper macros.
 * NOTE(review): those helpers are defined outside this file; presumably
 * they wrap the _relaxed form of the named op with the matching fence(s)
 * -- confirm against their definition.
 */
#ifndef arch_cmpxchg_acquire
#define arch_cmpxchg_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__)
#endif

#ifndef arch_cmpxchg_release
#define arch_cmpxchg_release(...) \
        __atomic_op_release(arch_cmpxchg, __VA_ARGS__)
#endif

#ifndef arch_cmpxchg
#define arch_cmpxchg(...) \
        __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
#endif

#endif /* arch_cmpxchg_relaxed */

#ifndef arch_cmpxchg64_relaxed
/*
 * arch_cmpxchg64_relaxed is not defined at this point, so all three
 * ordering-suffixed names simply alias the plain arch_cmpxchg64.
 */
#define arch_cmpxchg64_relaxed                arch_cmpxchg64
#define arch_cmpxchg64_acquire                arch_cmpxchg64
#define arch_cmpxchg64_release                arch_cmpxchg64
#else /* arch_cmpxchg64_relaxed */

/*
 * arch_cmpxchg64_relaxed exists: fill in whichever of the acquire, release
 * and plain forms are still missing by routing through the
 * __atomic_op_*() helper macros.
 * NOTE(review): those helpers are defined outside this file; presumably
 * they wrap the _relaxed form of the named op with the matching fence(s)
 * -- confirm against their definition.
 */
#ifndef arch_cmpxchg64_acquire
#define arch_cmpxchg64_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__)
#endif

#ifndef arch_cmpxchg64_release
#define arch_cmpxchg64_release(...) \
        __atomic_op_release(arch_cmpxchg64, __VA_ARGS__)
#endif

#ifndef arch_cmpxchg64
#define arch_cmpxchg64(...) \
        __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__)
#endif

#endif /* arch_cmpxchg64_relaxed */

#ifndef arch_atomic_read_acquire
/*
 * Fallback read with acquire ordering: loads @v->counter through
 * smp_load_acquire() and returns the loaded value.
 */
static __always_inline int
arch_atomic_read_acquire(const atomic_t *v)
{
        int val = smp_load_acquire(&v->counter);

        return val;
}
#define arch_atomic_read_acquire arch_atomic_read_acquire
#endif

#ifndef arch_atomic_set_release
/*
 * Fallback store with release ordering: writes @i into @v->counter
 * through smp_store_release().
 */
static __always_inline void
arch_atomic_set_release(atomic_t *v, int i)
{
        smp_store_release(&v->counter, i);
}
#define arch_atomic_set_release arch_atomic_set_release
#endif

#ifndef arch_atomic_add_return_relaxed
/* No _relaxed form is defined yet: every ordering variant aliases the base op. */
#define arch_atomic_add_return_acquire arch_atomic_add_return
#define arch_atomic_add_return_release arch_atomic_add_return
#define arch_atomic_add_return_relaxed arch_atomic_add_return
#else /* arch_atomic_add_return_relaxed */

#ifndef arch_atomic_add_return_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_add_return_acquire(int i, atomic_t *v)
{
        int val;

        val = arch_atomic_add_return_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_add_return_acquire arch_atomic_add_return_acquire
#endif

#ifndef arch_atomic_add_return_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_add_return_release(int i, atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_add_return_relaxed(i, v);

        return val;
}
#define arch_atomic_add_return_release arch_atomic_add_return_release
#endif

#ifndef arch_atomic_add_return
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_add_return(int i, atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_add_return_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_add_return arch_atomic_add_return
#endif

#endif /* arch_atomic_add_return_relaxed */

#ifndef arch_atomic_fetch_add_relaxed
/* No _relaxed form is defined yet: every ordering variant aliases the base op. */
#define arch_atomic_fetch_add_acquire arch_atomic_fetch_add
#define arch_atomic_fetch_add_release arch_atomic_fetch_add
#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add
#else /* arch_atomic_fetch_add_relaxed */

#ifndef arch_atomic_fetch_add_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_add_acquire(int i, atomic_t *v)
{
        int val;

        val = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_fetch_add_acquire arch_atomic_fetch_add_acquire
#endif

#ifndef arch_atomic_fetch_add_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_add_release(int i, atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_add_relaxed(i, v);

        return val;
}
#define arch_atomic_fetch_add_release arch_atomic_fetch_add_release
#endif

#ifndef arch_atomic_fetch_add
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_fetch_add(int i, atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_fetch_add arch_atomic_fetch_add
#endif

#endif /* arch_atomic_fetch_add_relaxed */

#ifndef arch_atomic_sub_return_relaxed
/* No _relaxed form is defined yet: every ordering variant aliases the base op. */
#define arch_atomic_sub_return_acquire arch_atomic_sub_return
#define arch_atomic_sub_return_release arch_atomic_sub_return
#define arch_atomic_sub_return_relaxed arch_atomic_sub_return
#else /* arch_atomic_sub_return_relaxed */

#ifndef arch_atomic_sub_return_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_sub_return_acquire(int i, atomic_t *v)
{
        int val;

        val = arch_atomic_sub_return_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_sub_return_acquire arch_atomic_sub_return_acquire
#endif

#ifndef arch_atomic_sub_return_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_sub_return_release(int i, atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_sub_return_relaxed(i, v);

        return val;
}
#define arch_atomic_sub_return_release arch_atomic_sub_return_release
#endif

#ifndef arch_atomic_sub_return
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_sub_return(int i, atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_sub_return_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_sub_return arch_atomic_sub_return
#endif

#endif /* arch_atomic_sub_return_relaxed */

#ifndef arch_atomic_fetch_sub_relaxed
/* No _relaxed form is defined yet: every ordering variant aliases the base op. */
#define arch_atomic_fetch_sub_acquire arch_atomic_fetch_sub
#define arch_atomic_fetch_sub_release arch_atomic_fetch_sub
#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub
#else /* arch_atomic_fetch_sub_relaxed */

#ifndef arch_atomic_fetch_sub_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_sub_acquire(int i, atomic_t *v)
{
        int val;

        val = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_fetch_sub_acquire arch_atomic_fetch_sub_acquire
#endif

#ifndef arch_atomic_fetch_sub_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_sub_release(int i, atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_sub_relaxed(i, v);

        return val;
}
#define arch_atomic_fetch_sub_release arch_atomic_fetch_sub_release
#endif

#ifndef arch_atomic_fetch_sub
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_fetch_sub(int i, atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_fetch_sub arch_atomic_fetch_sub
#endif

#endif /* arch_atomic_fetch_sub_relaxed */

#ifndef arch_atomic_inc
/*
 * Fallback used when arch_atomic_inc is not already defined:
 * implemented as arch_atomic_add() with an increment of 1. No value
 * is returned.
 */
static __always_inline void
arch_atomic_inc(atomic_t *v)
{
        arch_atomic_add(1, v);
}
/* Self-referential define so later #ifndef checks see the op as provided. */
#define arch_atomic_inc arch_atomic_inc
#endif

#ifndef arch_atomic_inc_return_relaxed
#ifdef arch_atomic_inc_return
/* Only the base op is already defined: alias every ordering variant to it. */
#define arch_atomic_inc_return_acquire arch_atomic_inc_return
#define arch_atomic_inc_return_release arch_atomic_inc_return
#define arch_atomic_inc_return_relaxed arch_atomic_inc_return
#endif /* arch_atomic_inc_return */

#ifndef arch_atomic_inc_return
/* Fallback: arch_atomic_add_return() with an increment of 1. */
static __always_inline int
arch_atomic_inc_return(atomic_t *v)
{
        int val = arch_atomic_add_return(1, v);

        return val;
}
#define arch_atomic_inc_return arch_atomic_inc_return
#endif

#ifndef arch_atomic_inc_return_acquire
/* Fallback: arch_atomic_add_return_acquire() with an increment of 1. */
static __always_inline int
arch_atomic_inc_return_acquire(atomic_t *v)
{
        int val = arch_atomic_add_return_acquire(1, v);

        return val;
}
#define arch_atomic_inc_return_acquire arch_atomic_inc_return_acquire
#endif

#ifndef arch_atomic_inc_return_release
/* Fallback: arch_atomic_add_return_release() with an increment of 1. */
static __always_inline int
arch_atomic_inc_return_release(atomic_t *v)
{
        int val = arch_atomic_add_return_release(1, v);

        return val;
}
#define arch_atomic_inc_return_release arch_atomic_inc_return_release
#endif

#ifndef arch_atomic_inc_return_relaxed
/* Fallback: arch_atomic_add_return_relaxed() with an increment of 1. */
static __always_inline int
arch_atomic_inc_return_relaxed(atomic_t *v)
{
        int val = arch_atomic_add_return_relaxed(1, v);

        return val;
}
#define arch_atomic_inc_return_relaxed arch_atomic_inc_return_relaxed
#endif

#else /* arch_atomic_inc_return_relaxed */

#ifndef arch_atomic_inc_return_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_inc_return_acquire(atomic_t *v)
{
        int val;

        val = arch_atomic_inc_return_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_inc_return_acquire arch_atomic_inc_return_acquire
#endif

#ifndef arch_atomic_inc_return_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_inc_return_release(atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_inc_return_relaxed(v);

        return val;
}
#define arch_atomic_inc_return_release arch_atomic_inc_return_release
#endif

#ifndef arch_atomic_inc_return
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_inc_return(atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_inc_return_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_inc_return arch_atomic_inc_return
#endif

#endif /* arch_atomic_inc_return_relaxed */

#ifndef arch_atomic_fetch_inc_relaxed
#ifdef arch_atomic_fetch_inc
/* Only the base op is already defined: alias every ordering variant to it. */
#define arch_atomic_fetch_inc_acquire arch_atomic_fetch_inc
#define arch_atomic_fetch_inc_release arch_atomic_fetch_inc
#define arch_atomic_fetch_inc_relaxed arch_atomic_fetch_inc
#endif /* arch_atomic_fetch_inc */

#ifndef arch_atomic_fetch_inc
/* Fallback: arch_atomic_fetch_add() with an increment of 1. */
static __always_inline int
arch_atomic_fetch_inc(atomic_t *v)
{
        int val = arch_atomic_fetch_add(1, v);

        return val;
}
#define arch_atomic_fetch_inc arch_atomic_fetch_inc
#endif

#ifndef arch_atomic_fetch_inc_acquire
/* Fallback: arch_atomic_fetch_add_acquire() with an increment of 1. */
static __always_inline int
arch_atomic_fetch_inc_acquire(atomic_t *v)
{
        int val = arch_atomic_fetch_add_acquire(1, v);

        return val;
}
#define arch_atomic_fetch_inc_acquire arch_atomic_fetch_inc_acquire
#endif

#ifndef arch_atomic_fetch_inc_release
/* Fallback: arch_atomic_fetch_add_release() with an increment of 1. */
static __always_inline int
arch_atomic_fetch_inc_release(atomic_t *v)
{
        int val = arch_atomic_fetch_add_release(1, v);

        return val;
}
#define arch_atomic_fetch_inc_release arch_atomic_fetch_inc_release
#endif

#ifndef arch_atomic_fetch_inc_relaxed
/* Fallback: arch_atomic_fetch_add_relaxed() with an increment of 1. */
static __always_inline int
arch_atomic_fetch_inc_relaxed(atomic_t *v)
{
        int val = arch_atomic_fetch_add_relaxed(1, v);

        return val;
}
#define arch_atomic_fetch_inc_relaxed arch_atomic_fetch_inc_relaxed
#endif

#else /* arch_atomic_fetch_inc_relaxed */

#ifndef arch_atomic_fetch_inc_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_inc_acquire(atomic_t *v)
{
        int val;

        val = arch_atomic_fetch_inc_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_fetch_inc_acquire arch_atomic_fetch_inc_acquire
#endif

#ifndef arch_atomic_fetch_inc_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_inc_release(atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_inc_relaxed(v);

        return val;
}
#define arch_atomic_fetch_inc_release arch_atomic_fetch_inc_release
#endif

#ifndef arch_atomic_fetch_inc
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_fetch_inc(atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_inc_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_fetch_inc arch_atomic_fetch_inc
#endif

#endif /* arch_atomic_fetch_inc_relaxed */

#ifndef arch_atomic_dec
/*
 * Fallback used when arch_atomic_dec is not already defined:
 * implemented as arch_atomic_sub() with a decrement of 1. No value
 * is returned.
 */
static __always_inline void
arch_atomic_dec(atomic_t *v)
{
        arch_atomic_sub(1, v);
}
/* Self-referential define so later #ifndef checks see the op as provided. */
#define arch_atomic_dec arch_atomic_dec
#endif

#ifndef arch_atomic_dec_return_relaxed
#ifdef arch_atomic_dec_return
/* Only the base op is already defined: alias every ordering variant to it. */
#define arch_atomic_dec_return_acquire arch_atomic_dec_return
#define arch_atomic_dec_return_release arch_atomic_dec_return
#define arch_atomic_dec_return_relaxed arch_atomic_dec_return
#endif /* arch_atomic_dec_return */

#ifndef arch_atomic_dec_return
/* Fallback: arch_atomic_sub_return() with a decrement of 1. */
static __always_inline int
arch_atomic_dec_return(atomic_t *v)
{
        int val = arch_atomic_sub_return(1, v);

        return val;
}
#define arch_atomic_dec_return arch_atomic_dec_return
#endif

#ifndef arch_atomic_dec_return_acquire
/* Fallback: arch_atomic_sub_return_acquire() with a decrement of 1. */
static __always_inline int
arch_atomic_dec_return_acquire(atomic_t *v)
{
        int val = arch_atomic_sub_return_acquire(1, v);

        return val;
}
#define arch_atomic_dec_return_acquire arch_atomic_dec_return_acquire
#endif

#ifndef arch_atomic_dec_return_release
/* Fallback: arch_atomic_sub_return_release() with a decrement of 1. */
static __always_inline int
arch_atomic_dec_return_release(atomic_t *v)
{
        int val = arch_atomic_sub_return_release(1, v);

        return val;
}
#define arch_atomic_dec_return_release arch_atomic_dec_return_release
#endif

#ifndef arch_atomic_dec_return_relaxed
/* Fallback: arch_atomic_sub_return_relaxed() with a decrement of 1. */
static __always_inline int
arch_atomic_dec_return_relaxed(atomic_t *v)
{
        int val = arch_atomic_sub_return_relaxed(1, v);

        return val;
}
#define arch_atomic_dec_return_relaxed arch_atomic_dec_return_relaxed
#endif

#else /* arch_atomic_dec_return_relaxed */

#ifndef arch_atomic_dec_return_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_dec_return_acquire(atomic_t *v)
{
        int val;

        val = arch_atomic_dec_return_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_dec_return_acquire arch_atomic_dec_return_acquire
#endif

#ifndef arch_atomic_dec_return_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_dec_return_release(atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_dec_return_relaxed(v);

        return val;
}
#define arch_atomic_dec_return_release arch_atomic_dec_return_release
#endif

#ifndef arch_atomic_dec_return
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_dec_return(atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_dec_return_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_dec_return arch_atomic_dec_return
#endif

#endif /* arch_atomic_dec_return_relaxed */

#ifndef arch_atomic_fetch_dec_relaxed
#ifdef arch_atomic_fetch_dec
/* Only the base op is already defined: alias every ordering variant to it. */
#define arch_atomic_fetch_dec_acquire arch_atomic_fetch_dec
#define arch_atomic_fetch_dec_release arch_atomic_fetch_dec
#define arch_atomic_fetch_dec_relaxed arch_atomic_fetch_dec
#endif /* arch_atomic_fetch_dec */

#ifndef arch_atomic_fetch_dec
/* Fallback: arch_atomic_fetch_sub() with a decrement of 1. */
static __always_inline int
arch_atomic_fetch_dec(atomic_t *v)
{
        int val = arch_atomic_fetch_sub(1, v);

        return val;
}
#define arch_atomic_fetch_dec arch_atomic_fetch_dec
#endif

#ifndef arch_atomic_fetch_dec_acquire
/* Fallback: arch_atomic_fetch_sub_acquire() with a decrement of 1. */
static __always_inline int
arch_atomic_fetch_dec_acquire(atomic_t *v)
{
        int val = arch_atomic_fetch_sub_acquire(1, v);

        return val;
}
#define arch_atomic_fetch_dec_acquire arch_atomic_fetch_dec_acquire
#endif

#ifndef arch_atomic_fetch_dec_release
/* Fallback: arch_atomic_fetch_sub_release() with a decrement of 1. */
static __always_inline int
arch_atomic_fetch_dec_release(atomic_t *v)
{
        int val = arch_atomic_fetch_sub_release(1, v);

        return val;
}
#define arch_atomic_fetch_dec_release arch_atomic_fetch_dec_release
#endif

#ifndef arch_atomic_fetch_dec_relaxed
/* Fallback: arch_atomic_fetch_sub_relaxed() with a decrement of 1. */
static __always_inline int
arch_atomic_fetch_dec_relaxed(atomic_t *v)
{
        int val = arch_atomic_fetch_sub_relaxed(1, v);

        return val;
}
#define arch_atomic_fetch_dec_relaxed arch_atomic_fetch_dec_relaxed
#endif

#else /* arch_atomic_fetch_dec_relaxed */

#ifndef arch_atomic_fetch_dec_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_dec_acquire(atomic_t *v)
{
        int val;

        val = arch_atomic_fetch_dec_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_fetch_dec_acquire arch_atomic_fetch_dec_acquire
#endif

#ifndef arch_atomic_fetch_dec_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_dec_release(atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_dec_relaxed(v);

        return val;
}
#define arch_atomic_fetch_dec_release arch_atomic_fetch_dec_release
#endif

#ifndef arch_atomic_fetch_dec
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_fetch_dec(atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_dec_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_fetch_dec arch_atomic_fetch_dec
#endif

#endif /* arch_atomic_fetch_dec_relaxed */

#ifndef arch_atomic_fetch_and_relaxed
/* No _relaxed form is defined yet: every ordering variant aliases the base op. */
#define arch_atomic_fetch_and_acquire arch_atomic_fetch_and
#define arch_atomic_fetch_and_release arch_atomic_fetch_and
#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and
#else /* arch_atomic_fetch_and_relaxed */

#ifndef arch_atomic_fetch_and_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_and_acquire(int i, atomic_t *v)
{
        int val;

        val = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_fetch_and_acquire arch_atomic_fetch_and_acquire
#endif

#ifndef arch_atomic_fetch_and_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_and_release(int i, atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_and_relaxed(i, v);

        return val;
}
#define arch_atomic_fetch_and_release arch_atomic_fetch_and_release
#endif

#ifndef arch_atomic_fetch_and
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_fetch_and(int i, atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_fetch_and arch_atomic_fetch_and
#endif

#endif /* arch_atomic_fetch_and_relaxed */

#ifndef arch_atomic_andnot
/*
 * Fallback and-not: passes the bitwise complement of @i to
 * arch_atomic_and(). No value is returned.
 */
static __always_inline void
arch_atomic_andnot(int i, atomic_t *v)
{
        int mask = ~i;

        arch_atomic_and(mask, v);
}
#define arch_atomic_andnot arch_atomic_andnot
#endif

#ifndef arch_atomic_fetch_andnot_relaxed
#ifdef arch_atomic_fetch_andnot
/* Only the base op is already defined: alias every ordering variant to it. */
#define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot
#define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot
#define arch_atomic_fetch_andnot_relaxed arch_atomic_fetch_andnot
#endif /* arch_atomic_fetch_andnot */

#ifndef arch_atomic_fetch_andnot
/* Fallback: arch_atomic_fetch_and() on the complement of @i. */
static __always_inline int
arch_atomic_fetch_andnot(int i, atomic_t *v)
{
        int val = arch_atomic_fetch_and(~i, v);

        return val;
}
#define arch_atomic_fetch_andnot arch_atomic_fetch_andnot
#endif

#ifndef arch_atomic_fetch_andnot_acquire
/* Fallback: arch_atomic_fetch_and_acquire() on the complement of @i. */
static __always_inline int
arch_atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int val = arch_atomic_fetch_and_acquire(~i, v);

        return val;
}
#define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot_acquire
#endif

#ifndef arch_atomic_fetch_andnot_release
/* Fallback: arch_atomic_fetch_and_release() on the complement of @i. */
static __always_inline int
arch_atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int val = arch_atomic_fetch_and_release(~i, v);

        return val;
}
#define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot_release
#endif

#ifndef arch_atomic_fetch_andnot_relaxed
/* Fallback: arch_atomic_fetch_and_relaxed() on the complement of @i. */
static __always_inline int
arch_atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
        int val = arch_atomic_fetch_and_relaxed(~i, v);

        return val;
}
#define arch_atomic_fetch_andnot_relaxed arch_atomic_fetch_andnot_relaxed
#endif

#else /* arch_atomic_fetch_andnot_relaxed */

#ifndef arch_atomic_fetch_andnot_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int val;

        val = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_fetch_andnot_acquire arch_atomic_fetch_andnot_acquire
#endif

#ifndef arch_atomic_fetch_andnot_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_andnot_relaxed(i, v);

        return val;
}
#define arch_atomic_fetch_andnot_release arch_atomic_fetch_andnot_release
#endif

#ifndef arch_atomic_fetch_andnot
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_fetch_andnot(int i, atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_fetch_andnot arch_atomic_fetch_andnot
#endif

#endif /* arch_atomic_fetch_andnot_relaxed */

#ifndef arch_atomic_fetch_or_relaxed
/* No _relaxed form is defined yet: every ordering variant aliases the base op. */
#define arch_atomic_fetch_or_acquire arch_atomic_fetch_or
#define arch_atomic_fetch_or_release arch_atomic_fetch_or
#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or
#else /* arch_atomic_fetch_or_relaxed */

#ifndef arch_atomic_fetch_or_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_or_acquire(int i, atomic_t *v)
{
        int val;

        val = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_fetch_or_acquire arch_atomic_fetch_or_acquire
#endif

#ifndef arch_atomic_fetch_or_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_or_release(int i, atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_or_relaxed(i, v);

        return val;
}
#define arch_atomic_fetch_or_release arch_atomic_fetch_or_release
#endif

#ifndef arch_atomic_fetch_or
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_fetch_or(int i, atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_fetch_or arch_atomic_fetch_or
#endif

#endif /* arch_atomic_fetch_or_relaxed */

#ifndef arch_atomic_fetch_xor_relaxed
/* No _relaxed form is defined yet: every ordering variant aliases the base op. */
#define arch_atomic_fetch_xor_acquire arch_atomic_fetch_xor
#define arch_atomic_fetch_xor_release arch_atomic_fetch_xor
#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor
#else /* arch_atomic_fetch_xor_relaxed */

#ifndef arch_atomic_fetch_xor_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_xor_acquire(int i, atomic_t *v)
{
        int val;

        val = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_fetch_xor_acquire arch_atomic_fetch_xor_acquire
#endif

#ifndef arch_atomic_fetch_xor_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_fetch_xor_release(int i, atomic_t *v)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_xor_relaxed(i, v);

        return val;
}
#define arch_atomic_fetch_xor_release arch_atomic_fetch_xor_release
#endif

#ifndef arch_atomic_fetch_xor
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_fetch_xor(int i, atomic_t *v)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_fetch_xor arch_atomic_fetch_xor
#endif

#endif /* arch_atomic_fetch_xor_relaxed */

#ifndef arch_atomic_xchg_relaxed
/* No _relaxed form is defined yet: every ordering variant aliases the base op. */
#define arch_atomic_xchg_acquire arch_atomic_xchg
#define arch_atomic_xchg_release arch_atomic_xchg
#define arch_atomic_xchg_relaxed arch_atomic_xchg
#else /* arch_atomic_xchg_relaxed */

#ifndef arch_atomic_xchg_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_xchg_acquire(atomic_t *v, int i)
{
        int val;

        val = arch_atomic_xchg_relaxed(v, i);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_xchg_acquire arch_atomic_xchg_acquire
#endif

#ifndef arch_atomic_xchg_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_xchg_release(atomic_t *v, int i)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_xchg_relaxed(v, i);

        return val;
}
#define arch_atomic_xchg_release arch_atomic_xchg_release
#endif

#ifndef arch_atomic_xchg
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_xchg(atomic_t *v, int i)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_xchg_relaxed(v, i);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_xchg arch_atomic_xchg
#endif

#endif /* arch_atomic_xchg_relaxed */

#ifndef arch_atomic_cmpxchg_relaxed
/* No _relaxed form is defined yet: every ordering variant aliases the base op. */
#define arch_atomic_cmpxchg_acquire arch_atomic_cmpxchg
#define arch_atomic_cmpxchg_release arch_atomic_cmpxchg
#define arch_atomic_cmpxchg_relaxed arch_atomic_cmpxchg
#else /* arch_atomic_cmpxchg_relaxed */

#ifndef arch_atomic_cmpxchg_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
        int val;

        val = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic_cmpxchg_acquire arch_atomic_cmpxchg_acquire
#endif

#ifndef arch_atomic_cmpxchg_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline int
arch_atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
        int val;

        __atomic_release_fence();
        val = arch_atomic_cmpxchg_relaxed(v, old, new);

        return val;
}
#define arch_atomic_cmpxchg_release arch_atomic_cmpxchg_release
#endif

#ifndef arch_atomic_cmpxchg
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline int
arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic_cmpxchg arch_atomic_cmpxchg
#endif

#endif /* arch_atomic_cmpxchg_relaxed */

#ifndef arch_atomic_try_cmpxchg_relaxed
#ifdef arch_atomic_try_cmpxchg
/* Only the base op is already defined: alias every ordering variant to it. */
#define arch_atomic_try_cmpxchg_acquire arch_atomic_try_cmpxchg
#define arch_atomic_try_cmpxchg_release arch_atomic_try_cmpxchg
#define arch_atomic_try_cmpxchg_relaxed arch_atomic_try_cmpxchg
#endif /* arch_atomic_try_cmpxchg */

#ifndef arch_atomic_try_cmpxchg
/*
 * Fallback built on arch_atomic_cmpxchg(): true when the value it
 * returned equals *@old; otherwise *@old is overwritten with that
 * returned value and false is reported.
 */
static __always_inline bool
arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        const int expected = *old;
        const int seen = arch_atomic_cmpxchg(v, expected, new);
        const bool swapped = (seen == expected);

        if (unlikely(!swapped))
                *old = seen;

        return likely(swapped);
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
#endif

#ifndef arch_atomic_try_cmpxchg_acquire
/*
 * Fallback built on arch_atomic_cmpxchg_acquire(): true when the value
 * it returned equals *@old; otherwise *@old is overwritten with that
 * returned value and false is reported.
 */
static __always_inline bool
arch_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
        const int expected = *old;
        const int seen = arch_atomic_cmpxchg_acquire(v, expected, new);
        const bool swapped = (seen == expected);

        if (unlikely(!swapped))
                *old = seen;

        return likely(swapped);
}
#define arch_atomic_try_cmpxchg_acquire arch_atomic_try_cmpxchg_acquire
#endif

#ifndef arch_atomic_try_cmpxchg_release
/*
 * Fallback built on arch_atomic_cmpxchg_release(): true when the value
 * it returned equals *@old; otherwise *@old is overwritten with that
 * returned value and false is reported.
 */
static __always_inline bool
arch_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
        const int expected = *old;
        const int seen = arch_atomic_cmpxchg_release(v, expected, new);
        const bool swapped = (seen == expected);

        if (unlikely(!swapped))
                *old = seen;

        return likely(swapped);
}
#define arch_atomic_try_cmpxchg_release arch_atomic_try_cmpxchg_release
#endif

#ifndef arch_atomic_try_cmpxchg_relaxed
/*
 * Fallback built on arch_atomic_cmpxchg_relaxed(): true when the value
 * it returned equals *@old; otherwise *@old is overwritten with that
 * returned value and false is reported.
 */
static __always_inline bool
arch_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
        const int expected = *old;
        const int seen = arch_atomic_cmpxchg_relaxed(v, expected, new);
        const bool swapped = (seen == expected);

        if (unlikely(!swapped))
                *old = seen;

        return likely(swapped);
}
#define arch_atomic_try_cmpxchg_relaxed arch_atomic_try_cmpxchg_relaxed
#endif

#else /* arch_atomic_try_cmpxchg_relaxed */

#ifndef arch_atomic_try_cmpxchg_acquire
/* Relaxed op first, acquire fence second; returns the relaxed op's result. */
static __always_inline bool
arch_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
        bool ok;

        ok = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return ok;
}
#define arch_atomic_try_cmpxchg_acquire arch_atomic_try_cmpxchg_acquire
#endif

#ifndef arch_atomic_try_cmpxchg_release
/* Release fence first, relaxed op second; returns the relaxed op's result. */
static __always_inline bool
arch_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
        bool ok;

        __atomic_release_fence();
        ok = arch_atomic_try_cmpxchg_relaxed(v, old, new);

        return ok;
}
#define arch_atomic_try_cmpxchg_release arch_atomic_try_cmpxchg_release
#endif

#ifndef arch_atomic_try_cmpxchg
/* Relaxed op bracketed by the pre- and post- full fences. */
static __always_inline bool
arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        bool ok;

        __atomic_pre_full_fence();
        ok = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return ok;
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
#endif

#endif /* arch_atomic_try_cmpxchg_relaxed */

#ifndef arch_atomic_sub_and_test
/**
 * arch_atomic_sub_and_test - subtract and check for a zero result
 * @i: integer value to subtract
 * @v: pointer of type atomic_t
 *
 * Subtracts @i from @v through arch_atomic_sub_return() and
 * returns true when that call yields zero, false otherwise.
 */
static __always_inline bool
arch_atomic_sub_and_test(int i, atomic_t *v)
{
        return !arch_atomic_sub_return(i, v);
}
#define arch_atomic_sub_and_test arch_atomic_sub_and_test
#endif

#ifndef arch_atomic_dec_and_test
/**
 * arch_atomic_dec_and_test - decrement and check for a zero result
 * @v: pointer of type atomic_t
 *
 * Decrements @v through arch_atomic_dec_return() and returns
 * true when that call yields zero, false otherwise.
 */
static __always_inline bool
arch_atomic_dec_and_test(atomic_t *v)
{
        return !arch_atomic_dec_return(v);
}
#define arch_atomic_dec_and_test arch_atomic_dec_and_test
#endif

#ifndef arch_atomic_inc_and_test
/**
 * arch_atomic_inc_and_test - increment and check for a zero result
 * @v: pointer of type atomic_t
 *
 * Increments @v through arch_atomic_inc_return() and returns
 * true when that call yields zero, false otherwise.
 */
static __always_inline bool
arch_atomic_inc_and_test(atomic_t *v)
{
        return !arch_atomic_inc_return(v);
}
#define arch_atomic_inc_and_test arch_atomic_inc_and_test
#endif

#ifndef arch_atomic_add_negative
/**
 * arch_atomic_add_negative - add and check for a negative result
 * @i: integer value to add
 * @v: pointer of type atomic_t
 *
 * Adds @i to @v through arch_atomic_add_return() and returns true
 * when that call yields a value below zero, false when it yields
 * zero or more.
 */
static __always_inline bool
arch_atomic_add_negative(int i, atomic_t *v)
{
        int result = arch_atomic_add_return(i, v);

        return result < 0;
}
#define arch_atomic_add_negative arch_atomic_add_negative
#endif

#ifndef arch_atomic_fetch_add_unless
/**
 * arch_atomic_fetch_add_unless - conditional add that reports the prior value
 * @v: pointer of type atomic_t
 * @a: the amount to add to v...
 * @u: ...unless v is equal to u.
 *
 * Reads @v, then retries arch_atomic_try_cmpxchg() with the observed
 * value plus @a until it succeeds or the observed value equals @u.
 * Returns the last observed value of @v.
 */
static __always_inline int
arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
        int cur = arch_atomic_read(v);

        for (;;) {
                /* Forbidden value seen: give up without writing. */
                if (unlikely(cur == u))
                        break;
                /* On failure try_cmpxchg refreshes cur, so just retry. */
                if (arch_atomic_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
}
#define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless
#endif

#ifndef arch_atomic_add_unless
/**
 * arch_atomic_add_unless - conditional add that reports whether it happened
 * @v: pointer of type atomic_t
 * @a: the amount to add to v...
 * @u: ...unless v is equal to u.
 *
 * Delegates to arch_atomic_fetch_add_unless() and returns true when
 * the value it reports differs from @u, false otherwise.
 */
static __always_inline bool
arch_atomic_add_unless(atomic_t *v, int a, int u)
{
        int prev = arch_atomic_fetch_add_unless(v, a, u);

        return prev != u;
}
#define arch_atomic_add_unless arch_atomic_add_unless
#endif

#ifndef arch_atomic_inc_not_zero
/**
 * arch_atomic_inc_not_zero - increment unless the value is zero
 * @v: pointer of type atomic_t
 *
 * Calls arch_atomic_add_unless() with an addend of 1 and a forbidden
 * value of 0, and returns its result.
 */
static __always_inline bool
arch_atomic_inc_not_zero(atomic_t *v)
{
        bool done = arch_atomic_add_unless(v, 1, 0);

        return done;
}
#define arch_atomic_inc_not_zero arch_atomic_inc_not_zero
#endif

#ifndef arch_atomic_inc_unless_negative
/*
 * Increment @v unless the observed value is negative.
 *
 * Reads @v, then retries arch_atomic_try_cmpxchg() with the observed
 * value plus one. Returns false as soon as a negative value is
 * observed, true once a compare-exchange succeeds.
 */
static __always_inline bool
arch_atomic_inc_unless_negative(atomic_t *v)
{
        int cur = arch_atomic_read(v);

        for (;;) {
                if (unlikely(cur < 0))
                        return false;
                /* On failure try_cmpxchg refreshes cur, so just retry. */
                if (arch_atomic_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }
}
#define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative
#endif

#ifndef arch_atomic_dec_unless_positive
/*
 * Decrement @v unless the observed value is positive.
 *
 * Reads @v, then retries arch_atomic_try_cmpxchg() with the observed
 * value minus one. Returns false as soon as a value above zero is
 * observed, true once a compare-exchange succeeds.
 */
static __always_inline bool
arch_atomic_dec_unless_positive(atomic_t *v)
{
        int cur = arch_atomic_read(v);

        for (;;) {
                if (unlikely(cur > 0))
                        return false;
                /* On failure try_cmpxchg refreshes cur, so just retry. */
                if (arch_atomic_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }
}
#define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive
#endif

#ifndef arch_atomic_dec_if_positive
/*
 * Decrement @v only when the result would not be negative.
 *
 * Reads @v, computes the observed value minus one, and stores it with
 * arch_atomic_try_cmpxchg() unless it is below zero. Returns that
 * computed value whether or not it was stored, so a negative return
 * means @v was left untouched.
 */
static __always_inline int
arch_atomic_dec_if_positive(atomic_t *v)
{
        int cur = arch_atomic_read(v);
        int next;

        for (;;) {
                next = cur - 1;
                /* Would go negative: bail out without writing. */
                if (unlikely(next < 0))
                        break;
                /* On failure try_cmpxchg refreshes cur, so recompute. */
                if (arch_atomic_try_cmpxchg(v, &cur, next))
                        break;
        }

        return next;
}
#define arch_atomic_dec_if_positive arch_atomic_dec_if_positive
#endif

#ifdef CONFIG_GENERIC_ATOMIC64
#include <asm-generic/atomic64.h>
#endif

#ifndef arch_atomic64_read_acquire
/* Reads @v->counter through smp_load_acquire() and returns the value. */
static __always_inline s64
arch_atomic64_read_acquire(const atomic64_t *v)
{
        s64 val = smp_load_acquire(&v->counter);

        return val;
}
#define arch_atomic64_read_acquire arch_atomic64_read_acquire
#endif

#ifndef arch_atomic64_set_release
/* Stores @i into @v->counter through smp_store_release(). */
static __always_inline void
arch_atomic64_set_release(atomic64_t *v, s64 i)
{
        smp_store_release(&v->counter, i);
}
#define arch_atomic64_set_release arch_atomic64_set_release
#endif

#ifndef arch_atomic64_add_return_relaxed
#define arch_atomic64_add_return_acquire arch_atomic64_add_return
#define arch_atomic64_add_return_release arch_atomic64_add_return
#define arch_atomic64_add_return_relaxed arch_atomic64_add_return
#else /* arch_atomic64_add_return_relaxed */

#ifndef arch_atomic64_add_return_acquire
/*
 * Acquire variant of add_return: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_add_return_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_add_return_acquire arch_atomic64_add_return_acquire
#endif

#ifndef arch_atomic64_add_return_release
/*
 * Release variant of add_return: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_add_return_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_add_return_relaxed(i, v);

        return val;
}
#define arch_atomic64_add_return_release arch_atomic64_add_return_release
#endif

#ifndef arch_atomic64_add_return
/*
 * Fully-ordered add_return: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_add_return(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_add_return_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_add_return arch_atomic64_add_return
#endif

#endif /* arch_atomic64_add_return_relaxed */

#ifndef arch_atomic64_fetch_add_relaxed
#define arch_atomic64_fetch_add_acquire arch_atomic64_fetch_add
#define arch_atomic64_fetch_add_release arch_atomic64_fetch_add
#define arch_atomic64_fetch_add_relaxed arch_atomic64_fetch_add
#else /* arch_atomic64_fetch_add_relaxed */

#ifndef arch_atomic64_fetch_add_acquire
/*
 * Acquire variant of fetch_add: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_fetch_add_acquire arch_atomic64_fetch_add_acquire
#endif

#ifndef arch_atomic64_fetch_add_release
/*
 * Release variant of fetch_add: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_fetch_add_relaxed(i, v);

        return val;
}
#define arch_atomic64_fetch_add_release arch_atomic64_fetch_add_release
#endif

#ifndef arch_atomic64_fetch_add
/*
 * Fully-ordered fetch_add: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
#endif

#endif /* arch_atomic64_fetch_add_relaxed */

#ifndef arch_atomic64_sub_return_relaxed
#define arch_atomic64_sub_return_acquire arch_atomic64_sub_return
#define arch_atomic64_sub_return_release arch_atomic64_sub_return
#define arch_atomic64_sub_return_relaxed arch_atomic64_sub_return
#else /* arch_atomic64_sub_return_relaxed */

#ifndef arch_atomic64_sub_return_acquire
/*
 * Acquire variant of sub_return: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_sub_return_acquire arch_atomic64_sub_return_acquire
#endif

#ifndef arch_atomic64_sub_return_release
/*
 * Release variant of sub_return: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_sub_return_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_sub_return_relaxed(i, v);

        return val;
}
#define arch_atomic64_sub_return_release arch_atomic64_sub_return_release
#endif

#ifndef arch_atomic64_sub_return
/*
 * Fully-ordered sub_return: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_sub_return(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_sub_return arch_atomic64_sub_return
#endif

#endif /* arch_atomic64_sub_return_relaxed */

#ifndef arch_atomic64_fetch_sub_relaxed
#define arch_atomic64_fetch_sub_acquire arch_atomic64_fetch_sub
#define arch_atomic64_fetch_sub_release arch_atomic64_fetch_sub
#define arch_atomic64_fetch_sub_relaxed arch_atomic64_fetch_sub
#else /* arch_atomic64_fetch_sub_relaxed */

#ifndef arch_atomic64_fetch_sub_acquire
/*
 * Acquire variant of fetch_sub: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_fetch_sub_acquire arch_atomic64_fetch_sub_acquire
#endif

#ifndef arch_atomic64_fetch_sub_release
/*
 * Release variant of fetch_sub: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_fetch_sub_relaxed(i, v);

        return val;
}
#define arch_atomic64_fetch_sub_release arch_atomic64_fetch_sub_release
#endif

#ifndef arch_atomic64_fetch_sub
/*
 * Fully-ordered fetch_sub: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
#endif

#endif /* arch_atomic64_fetch_sub_relaxed */

#ifndef arch_atomic64_inc
/* Adds 1 to @v by delegating to arch_atomic64_add(); nothing is returned. */
static __always_inline void
arch_atomic64_inc(atomic64_t *v)
{
        arch_atomic64_add(1, v);
}
#define arch_atomic64_inc arch_atomic64_inc
#endif

#ifndef arch_atomic64_inc_return_relaxed
#ifdef arch_atomic64_inc_return
#define arch_atomic64_inc_return_acquire arch_atomic64_inc_return
#define arch_atomic64_inc_return_release arch_atomic64_inc_return
#define arch_atomic64_inc_return_relaxed arch_atomic64_inc_return
#endif /* arch_atomic64_inc_return */

#ifndef arch_atomic64_inc_return
/* inc_return expressed as arch_atomic64_add_return() with an operand of 1. */
static __always_inline s64
arch_atomic64_inc_return(atomic64_t *v)
{
        s64 val = arch_atomic64_add_return(1, v);

        return val;
}
#define arch_atomic64_inc_return arch_atomic64_inc_return
#endif

#ifndef arch_atomic64_inc_return_acquire
/* inc_return_acquire expressed as add_return_acquire with an operand of 1. */
static __always_inline s64
arch_atomic64_inc_return_acquire(atomic64_t *v)
{
        s64 val = arch_atomic64_add_return_acquire(1, v);

        return val;
}
#define arch_atomic64_inc_return_acquire arch_atomic64_inc_return_acquire
#endif

#ifndef arch_atomic64_inc_return_release
/* inc_return_release expressed as add_return_release with an operand of 1. */
static __always_inline s64
arch_atomic64_inc_return_release(atomic64_t *v)
{
        s64 val = arch_atomic64_add_return_release(1, v);

        return val;
}
#define arch_atomic64_inc_return_release arch_atomic64_inc_return_release
#endif

#ifndef arch_atomic64_inc_return_relaxed
/* inc_return_relaxed expressed as add_return_relaxed with an operand of 1. */
static __always_inline s64
arch_atomic64_inc_return_relaxed(atomic64_t *v)
{
        s64 val = arch_atomic64_add_return_relaxed(1, v);

        return val;
}
#define arch_atomic64_inc_return_relaxed arch_atomic64_inc_return_relaxed
#endif

#else /* arch_atomic64_inc_return_relaxed */

#ifndef arch_atomic64_inc_return_acquire
/*
 * Acquire variant of inc_return: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_inc_return_acquire(atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_inc_return_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_inc_return_acquire arch_atomic64_inc_return_acquire
#endif

#ifndef arch_atomic64_inc_return_release
/*
 * Release variant of inc_return: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_inc_return_release(atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_inc_return_relaxed(v);

        return val;
}
#define arch_atomic64_inc_return_release arch_atomic64_inc_return_release
#endif

#ifndef arch_atomic64_inc_return
/*
 * Fully-ordered inc_return: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_inc_return(atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_inc_return_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_inc_return arch_atomic64_inc_return
#endif

#endif /* arch_atomic64_inc_return_relaxed */

#ifndef arch_atomic64_fetch_inc_relaxed
#ifdef arch_atomic64_fetch_inc
#define arch_atomic64_fetch_inc_acquire arch_atomic64_fetch_inc
#define arch_atomic64_fetch_inc_release arch_atomic64_fetch_inc
#define arch_atomic64_fetch_inc_relaxed arch_atomic64_fetch_inc
#endif /* arch_atomic64_fetch_inc */

#ifndef arch_atomic64_fetch_inc
/* fetch_inc expressed as arch_atomic64_fetch_add() with an operand of 1. */
static __always_inline s64
arch_atomic64_fetch_inc(atomic64_t *v)
{
        s64 val = arch_atomic64_fetch_add(1, v);

        return val;
}
#define arch_atomic64_fetch_inc arch_atomic64_fetch_inc
#endif

#ifndef arch_atomic64_fetch_inc_acquire
/* fetch_inc_acquire expressed as fetch_add_acquire with an operand of 1. */
static __always_inline s64
arch_atomic64_fetch_inc_acquire(atomic64_t *v)
{
        s64 val = arch_atomic64_fetch_add_acquire(1, v);

        return val;
}
#define arch_atomic64_fetch_inc_acquire arch_atomic64_fetch_inc_acquire
#endif

#ifndef arch_atomic64_fetch_inc_release
/* fetch_inc_release expressed as fetch_add_release with an operand of 1. */
static __always_inline s64
arch_atomic64_fetch_inc_release(atomic64_t *v)
{
        s64 val = arch_atomic64_fetch_add_release(1, v);

        return val;
}
#define arch_atomic64_fetch_inc_release arch_atomic64_fetch_inc_release
#endif

#ifndef arch_atomic64_fetch_inc_relaxed
/* fetch_inc_relaxed expressed as fetch_add_relaxed with an operand of 1. */
static __always_inline s64
arch_atomic64_fetch_inc_relaxed(atomic64_t *v)
{
        s64 val = arch_atomic64_fetch_add_relaxed(1, v);

        return val;
}
#define arch_atomic64_fetch_inc_relaxed arch_atomic64_fetch_inc_relaxed
#endif

#else /* arch_atomic64_fetch_inc_relaxed */

#ifndef arch_atomic64_fetch_inc_acquire
/*
 * Acquire variant of fetch_inc: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_fetch_inc_acquire(atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_fetch_inc_acquire arch_atomic64_fetch_inc_acquire
#endif

#ifndef arch_atomic64_fetch_inc_release
/*
 * Release variant of fetch_inc: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_fetch_inc_release(atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_fetch_inc_relaxed(v);

        return val;
}
#define arch_atomic64_fetch_inc_release arch_atomic64_fetch_inc_release
#endif

#ifndef arch_atomic64_fetch_inc
/*
 * Fully-ordered fetch_inc: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_inc(atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_fetch_inc arch_atomic64_fetch_inc
#endif

#endif /* arch_atomic64_fetch_inc_relaxed */

#ifndef arch_atomic64_dec
/* Subtracts 1 from @v by delegating to arch_atomic64_sub(); nothing is returned. */
static __always_inline void
arch_atomic64_dec(atomic64_t *v)
{
        arch_atomic64_sub(1, v);
}
#define arch_atomic64_dec arch_atomic64_dec
#endif

#ifndef arch_atomic64_dec_return_relaxed
#ifdef arch_atomic64_dec_return
#define arch_atomic64_dec_return_acquire arch_atomic64_dec_return
#define arch_atomic64_dec_return_release arch_atomic64_dec_return
#define arch_atomic64_dec_return_relaxed arch_atomic64_dec_return
#endif /* arch_atomic64_dec_return */

#ifndef arch_atomic64_dec_return
/* dec_return expressed as arch_atomic64_sub_return() with an operand of 1. */
static __always_inline s64
arch_atomic64_dec_return(atomic64_t *v)
{
        s64 val = arch_atomic64_sub_return(1, v);

        return val;
}
#define arch_atomic64_dec_return arch_atomic64_dec_return
#endif

#ifndef arch_atomic64_dec_return_acquire
/* dec_return_acquire expressed as sub_return_acquire with an operand of 1. */
static __always_inline s64
arch_atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 val = arch_atomic64_sub_return_acquire(1, v);

        return val;
}
#define arch_atomic64_dec_return_acquire arch_atomic64_dec_return_acquire
#endif

#ifndef arch_atomic64_dec_return_release
/* dec_return_release expressed as sub_return_release with an operand of 1. */
static __always_inline s64
arch_atomic64_dec_return_release(atomic64_t *v)
{
        s64 val = arch_atomic64_sub_return_release(1, v);

        return val;
}
#define arch_atomic64_dec_return_release arch_atomic64_dec_return_release
#endif

#ifndef arch_atomic64_dec_return_relaxed
/* dec_return_relaxed expressed as sub_return_relaxed with an operand of 1. */
static __always_inline s64
arch_atomic64_dec_return_relaxed(atomic64_t *v)
{
        s64 val = arch_atomic64_sub_return_relaxed(1, v);

        return val;
}
#define arch_atomic64_dec_return_relaxed arch_atomic64_dec_return_relaxed
#endif

#else /* arch_atomic64_dec_return_relaxed */

#ifndef arch_atomic64_dec_return_acquire
/*
 * Acquire variant of dec_return: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_dec_return_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_dec_return_acquire arch_atomic64_dec_return_acquire
#endif

#ifndef arch_atomic64_dec_return_release
/*
 * Release variant of dec_return: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_dec_return_release(atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_dec_return_relaxed(v);

        return val;
}
#define arch_atomic64_dec_return_release arch_atomic64_dec_return_release
#endif

#ifndef arch_atomic64_dec_return
/*
 * Fully-ordered dec_return: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_dec_return(atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_dec_return_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_dec_return arch_atomic64_dec_return
#endif

#endif /* arch_atomic64_dec_return_relaxed */

#ifndef arch_atomic64_fetch_dec_relaxed
#ifdef arch_atomic64_fetch_dec
#define arch_atomic64_fetch_dec_acquire arch_atomic64_fetch_dec
#define arch_atomic64_fetch_dec_release arch_atomic64_fetch_dec
#define arch_atomic64_fetch_dec_relaxed arch_atomic64_fetch_dec
#endif /* arch_atomic64_fetch_dec */

#ifndef arch_atomic64_fetch_dec
/* fetch_dec expressed as arch_atomic64_fetch_sub() with an operand of 1. */
static __always_inline s64
arch_atomic64_fetch_dec(atomic64_t *v)
{
        s64 val = arch_atomic64_fetch_sub(1, v);

        return val;
}
#define arch_atomic64_fetch_dec arch_atomic64_fetch_dec
#endif

#ifndef arch_atomic64_fetch_dec_acquire
/* fetch_dec_acquire expressed as fetch_sub_acquire with an operand of 1. */
static __always_inline s64
arch_atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 val = arch_atomic64_fetch_sub_acquire(1, v);

        return val;
}
#define arch_atomic64_fetch_dec_acquire arch_atomic64_fetch_dec_acquire
#endif

#ifndef arch_atomic64_fetch_dec_release
/* fetch_dec_release expressed as fetch_sub_release with an operand of 1. */
static __always_inline s64
arch_atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 val = arch_atomic64_fetch_sub_release(1, v);

        return val;
}
#define arch_atomic64_fetch_dec_release arch_atomic64_fetch_dec_release
#endif

#ifndef arch_atomic64_fetch_dec_relaxed
/* fetch_dec_relaxed expressed as fetch_sub_relaxed with an operand of 1. */
static __always_inline s64
arch_atomic64_fetch_dec_relaxed(atomic64_t *v)
{
        s64 val = arch_atomic64_fetch_sub_relaxed(1, v);

        return val;
}
#define arch_atomic64_fetch_dec_relaxed arch_atomic64_fetch_dec_relaxed
#endif

#else /* arch_atomic64_fetch_dec_relaxed */

#ifndef arch_atomic64_fetch_dec_acquire
/*
 * Acquire variant of fetch_dec: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_fetch_dec_acquire arch_atomic64_fetch_dec_acquire
#endif

#ifndef arch_atomic64_fetch_dec_release
/*
 * Release variant of fetch_dec: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_fetch_dec_relaxed(v);

        return val;
}
#define arch_atomic64_fetch_dec_release arch_atomic64_fetch_dec_release
#endif

#ifndef arch_atomic64_fetch_dec
/*
 * Fully-ordered fetch_dec: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_dec(atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_fetch_dec arch_atomic64_fetch_dec
#endif

#endif /* arch_atomic64_fetch_dec_relaxed */

#ifndef arch_atomic64_fetch_and_relaxed
#define arch_atomic64_fetch_and_acquire arch_atomic64_fetch_and
#define arch_atomic64_fetch_and_release arch_atomic64_fetch_and
#define arch_atomic64_fetch_and_relaxed arch_atomic64_fetch_and
#else /* arch_atomic64_fetch_and_relaxed */

#ifndef arch_atomic64_fetch_and_acquire
/*
 * Acquire variant of fetch_and: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_fetch_and_acquire arch_atomic64_fetch_and_acquire
#endif

#ifndef arch_atomic64_fetch_and_release
/*
 * Release variant of fetch_and: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_fetch_and_relaxed(i, v);

        return val;
}
#define arch_atomic64_fetch_and_release arch_atomic64_fetch_and_release
#endif

#ifndef arch_atomic64_fetch_and
/*
 * Fully-ordered fetch_and: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
#endif

#endif /* arch_atomic64_fetch_and_relaxed */

#ifndef arch_atomic64_andnot
/* andnot: hands the bitwise complement of @i to arch_atomic64_and(). */
static __always_inline void
arch_atomic64_andnot(s64 i, atomic64_t *v)
{
        s64 mask = ~i;

        arch_atomic64_and(mask, v);
}
#define arch_atomic64_andnot arch_atomic64_andnot
#endif

#ifndef arch_atomic64_fetch_andnot_relaxed
#ifdef arch_atomic64_fetch_andnot
#define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot
#define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot
#define arch_atomic64_fetch_andnot_relaxed arch_atomic64_fetch_andnot
#endif /* arch_atomic64_fetch_andnot */

#ifndef arch_atomic64_fetch_andnot
/* fetch_andnot: arch_atomic64_fetch_and() applied to the complement of @i. */
static __always_inline s64
arch_atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        s64 mask = ~i;

        return arch_atomic64_fetch_and(mask, v);
}
#define arch_atomic64_fetch_andnot arch_atomic64_fetch_andnot
#endif

#ifndef arch_atomic64_fetch_andnot_acquire
/* fetch_andnot_acquire: fetch_and_acquire applied to the complement of @i. */
static __always_inline s64
arch_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        s64 mask = ~i;

        return arch_atomic64_fetch_and_acquire(mask, v);
}
#define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot_acquire
#endif

#ifndef arch_atomic64_fetch_andnot_release
/* fetch_andnot_release: fetch_and_release applied to the complement of @i. */
static __always_inline s64
arch_atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        s64 mask = ~i;

        return arch_atomic64_fetch_and_release(mask, v);
}
#define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot_release
#endif

#ifndef arch_atomic64_fetch_andnot_relaxed
/* fetch_andnot_relaxed: fetch_and_relaxed applied to the complement of @i. */
static __always_inline s64
arch_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
        s64 mask = ~i;

        return arch_atomic64_fetch_and_relaxed(mask, v);
}
#define arch_atomic64_fetch_andnot_relaxed arch_atomic64_fetch_andnot_relaxed
#endif

#else /* arch_atomic64_fetch_andnot_relaxed */

#ifndef arch_atomic64_fetch_andnot_acquire
/*
 * Acquire variant of fetch_andnot: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_fetch_andnot_acquire arch_atomic64_fetch_andnot_acquire
#endif

#ifndef arch_atomic64_fetch_andnot_release
/*
 * Release variant of fetch_andnot: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_fetch_andnot_relaxed(i, v);

        return val;
}
#define arch_atomic64_fetch_andnot_release arch_atomic64_fetch_andnot_release
#endif

#ifndef arch_atomic64_fetch_andnot
/*
 * Fully-ordered fetch_andnot: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_fetch_andnot arch_atomic64_fetch_andnot
#endif

#endif /* arch_atomic64_fetch_andnot_relaxed */

#ifndef arch_atomic64_fetch_or_relaxed
#define arch_atomic64_fetch_or_acquire arch_atomic64_fetch_or
#define arch_atomic64_fetch_or_release arch_atomic64_fetch_or
#define arch_atomic64_fetch_or_relaxed arch_atomic64_fetch_or
#else /* arch_atomic64_fetch_or_relaxed */

#ifndef arch_atomic64_fetch_or_acquire
/*
 * Acquire variant of fetch_or: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_fetch_or_acquire arch_atomic64_fetch_or_acquire
#endif

#ifndef arch_atomic64_fetch_or_release
/*
 * Release variant of fetch_or: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_fetch_or_relaxed(i, v);

        return val;
}
#define arch_atomic64_fetch_or_release arch_atomic64_fetch_or_release
#endif

#ifndef arch_atomic64_fetch_or
/*
 * Fully-ordered fetch_or: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
#endif

#endif /* arch_atomic64_fetch_or_relaxed */

#ifndef arch_atomic64_fetch_xor_relaxed
#define arch_atomic64_fetch_xor_acquire arch_atomic64_fetch_xor
#define arch_atomic64_fetch_xor_release arch_atomic64_fetch_xor
#define arch_atomic64_fetch_xor_relaxed arch_atomic64_fetch_xor
#else /* arch_atomic64_fetch_xor_relaxed */

#ifndef arch_atomic64_fetch_xor_acquire
/*
 * Acquire variant of fetch_xor: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
        s64 val;

        val = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_fetch_xor_acquire arch_atomic64_fetch_xor_acquire
#endif

#ifndef arch_atomic64_fetch_xor_release
/*
 * Release variant of fetch_xor: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_fetch_xor_relaxed(i, v);

        return val;
}
#define arch_atomic64_fetch_xor_release arch_atomic64_fetch_xor_release
#endif

#ifndef arch_atomic64_fetch_xor
/*
 * Fully-ordered fetch_xor: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
#endif

#endif /* arch_atomic64_fetch_xor_relaxed */

#ifndef arch_atomic64_xchg_relaxed
#define arch_atomic64_xchg_acquire arch_atomic64_xchg
#define arch_atomic64_xchg_release arch_atomic64_xchg
#define arch_atomic64_xchg_relaxed arch_atomic64_xchg
#else /* arch_atomic64_xchg_relaxed */

#ifndef arch_atomic64_xchg_acquire
/*
 * Acquire variant of xchg: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_xchg_acquire(atomic64_t *v, s64 i)
{
        s64 val;

        val = arch_atomic64_xchg_relaxed(v, i);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_xchg_acquire arch_atomic64_xchg_acquire
#endif

#ifndef arch_atomic64_xchg_release
/*
 * Release variant of xchg: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_xchg_release(atomic64_t *v, s64 i)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_xchg_relaxed(v, i);

        return val;
}
#define arch_atomic64_xchg_release arch_atomic64_xchg_release
#endif

#ifndef arch_atomic64_xchg
/*
 * Fully-ordered xchg: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_xchg(atomic64_t *v, s64 i)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_xchg_relaxed(v, i);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_xchg arch_atomic64_xchg
#endif

#endif /* arch_atomic64_xchg_relaxed */

#ifndef arch_atomic64_cmpxchg_relaxed
#define arch_atomic64_cmpxchg_acquire arch_atomic64_cmpxchg
#define arch_atomic64_cmpxchg_release arch_atomic64_cmpxchg
#define arch_atomic64_cmpxchg_relaxed arch_atomic64_cmpxchg
#else /* arch_atomic64_cmpxchg_relaxed */

#ifndef arch_atomic64_cmpxchg_acquire
/*
 * Acquire variant of cmpxchg: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline s64
arch_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
        s64 val;

        val = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return val;
}
#define arch_atomic64_cmpxchg_acquire arch_atomic64_cmpxchg_acquire
#endif

#ifndef arch_atomic64_cmpxchg_release
/*
 * Release variant of cmpxchg: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline s64
arch_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
        s64 val;

        __atomic_release_fence();
        val = arch_atomic64_cmpxchg_relaxed(v, old, new);

        return val;
}
#define arch_atomic64_cmpxchg_release arch_atomic64_cmpxchg_release
#endif

#ifndef arch_atomic64_cmpxchg
/*
 * Fully-ordered cmpxchg: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline s64
arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        s64 val;

        __atomic_pre_full_fence();
        val = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return val;
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
#endif

#endif /* arch_atomic64_cmpxchg_relaxed */

#ifndef arch_atomic64_try_cmpxchg_relaxed
#ifdef arch_atomic64_try_cmpxchg
#define arch_atomic64_try_cmpxchg_acquire arch_atomic64_try_cmpxchg
#define arch_atomic64_try_cmpxchg_release arch_atomic64_try_cmpxchg
#define arch_atomic64_try_cmpxchg_relaxed arch_atomic64_try_cmpxchg
#endif /* arch_atomic64_try_cmpxchg */

#ifndef arch_atomic64_try_cmpxchg
/*
 * try_cmpxchg built on arch_atomic64_cmpxchg().
 * Returns true when the value reported by cmpxchg equals *@old. Otherwise
 * the reported value is written back through @old and false is returned.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        const s64 expected = *old;
        s64 seen = arch_atomic64_cmpxchg(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Mismatch: hand the observed value back to the caller. */
        *old = seen;
        return false;
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
#endif

#ifndef arch_atomic64_try_cmpxchg_acquire
/*
 * try_cmpxchg_acquire built on arch_atomic64_cmpxchg_acquire().
 * Returns true when the value reported by cmpxchg equals *@old. Otherwise
 * the reported value is written back through @old and false is returned.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
        const s64 expected = *old;
        s64 seen = arch_atomic64_cmpxchg_acquire(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Mismatch: hand the observed value back to the caller. */
        *old = seen;
        return false;
}
#define arch_atomic64_try_cmpxchg_acquire arch_atomic64_try_cmpxchg_acquire
#endif

#ifndef arch_atomic64_try_cmpxchg_release
/*
 * try_cmpxchg_release built on arch_atomic64_cmpxchg_release().
 * Returns true when the value reported by cmpxchg equals *@old. Otherwise
 * the reported value is written back through @old and false is returned.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
        const s64 expected = *old;
        s64 seen = arch_atomic64_cmpxchg_release(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Mismatch: hand the observed value back to the caller. */
        *old = seen;
        return false;
}
#define arch_atomic64_try_cmpxchg_release arch_atomic64_try_cmpxchg_release
#endif

#ifndef arch_atomic64_try_cmpxchg_relaxed
/*
 * try_cmpxchg_relaxed built on arch_atomic64_cmpxchg_relaxed().
 * Returns true when the value reported by cmpxchg equals *@old. Otherwise
 * the reported value is written back through @old and false is returned.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
        const s64 expected = *old;
        s64 seen = arch_atomic64_cmpxchg_relaxed(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Mismatch: hand the observed value back to the caller. */
        *old = seen;
        return false;
}
#define arch_atomic64_try_cmpxchg_relaxed arch_atomic64_try_cmpxchg_relaxed
#endif

#else /* arch_atomic64_try_cmpxchg_relaxed */

#ifndef arch_atomic64_try_cmpxchg_acquire
/*
 * Acquire variant of try_cmpxchg: the arch's relaxed op runs first and is
 * followed by __atomic_acquire_fence(). Returns the relaxed op's result.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return swapped;
}
#define arch_atomic64_try_cmpxchg_acquire arch_atomic64_try_cmpxchg_acquire
#endif

#ifndef arch_atomic64_try_cmpxchg_release
/*
 * Release variant of try_cmpxchg: __atomic_release_fence() is issued first,
 * then the arch's relaxed op runs and its result is returned.
 */
static __always_inline bool
arch_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        __atomic_release_fence();
        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}
#define arch_atomic64_try_cmpxchg_release arch_atomic64_try_cmpxchg_release
#endif

#ifndef arch_atomic64_try_cmpxchg
/*
 * Fully-ordered try_cmpxchg: the arch's relaxed op is bracketed by
 * __atomic_pre_full_fence() and __atomic_post_full_fence().
 */
static __always_inline bool
arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        __atomic_pre_full_fence();
        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return swapped;
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
#endif

#endif /* arch_atomic64_try_cmpxchg_relaxed */

#ifndef arch_atomic64_sub_and_test
/**
 * arch_atomic64_sub_and_test - subtract from a 64-bit atomic and test for zero
 * @i: value to subtract
 * @v: pointer to the atomic64_t to modify
 *
 * Subtracts @i from @v via arch_atomic64_sub_return().
 * Returns true when the value that call reports is zero, false otherwise.
 */
static __always_inline bool
arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        return !arch_atomic64_sub_return(i, v);
}
#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
#endif

#ifndef arch_atomic64_dec_and_test
/**
 * arch_atomic64_dec_and_test - decrement a 64-bit atomic and test for zero
 * @v: pointer to the atomic64_t to modify
 *
 * Decrements @v via arch_atomic64_dec_return().
 * Returns true when the value that call reports is zero, false otherwise.
 */
static __always_inline bool
arch_atomic64_dec_and_test(atomic64_t *v)
{
        return !arch_atomic64_dec_return(v);
}
#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
#endif

#ifndef arch_atomic64_inc_and_test
/**
 * arch_atomic64_inc_and_test - increment a 64-bit atomic and test for zero
 * @v: pointer to the atomic64_t to modify
 *
 * Increments @v via arch_atomic64_inc_return().
 * Returns true when the value that call reports is zero, false otherwise.
 */
static __always_inline bool
arch_atomic64_inc_and_test(atomic64_t *v)
{
        return !arch_atomic64_inc_return(v);
}
#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
#endif

#ifndef arch_atomic64_add_negative
/**
 * arch_atomic64_add_negative - add to a 64-bit atomic and test the sign
 * @i: value to add
 * @v: pointer to the atomic64_t to modify
 *
 * Adds @i to @v via arch_atomic64_add_return().
 * Returns true when the value that call reports is below zero, false when
 * it is zero or greater.
 */
static __always_inline bool
arch_atomic64_add_negative(s64 i, atomic64_t *v)
{
        s64 sum = arch_atomic64_add_return(i, v);

        return sum < 0;
}
#define arch_atomic64_add_negative arch_atomic64_add_negative
#endif

#ifndef arch_atomic64_fetch_add_unless
/**
 * arch_atomic64_fetch_add_unless - add to a 64-bit atomic unless it holds a given value
 * @v: pointer to the atomic64_t to modify
 * @a: amount to add to @v
 * @u: value of @v for which the addition is skipped
 *
 * Reads @v and retries a compare-and-exchange of (value + @a) until it
 * succeeds or the observed value equals @u.
 * Returns the last value observed in @v before any addition.
 */
static __always_inline s64
arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
        s64 cur = arch_atomic64_read(v);

        while (likely(cur != u)) {
                /* A failed try_cmpxchg refreshes cur, so the test is redone. */
                if (arch_atomic64_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
}
#define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless
#endif

#ifndef arch_atomic64_add_unless
/**
 * arch_atomic64_add_unless - add to a 64-bit atomic unless it holds a given value
 * @v: pointer to the atomic64_t to modify
 * @a: amount to add to @v
 * @u: value of @v for which the addition is skipped
 *
 * Calls arch_atomic64_fetch_add_unless() and compares the value it
 * reports against @u.
 * Returns true if the addition was performed.
 */
static __always_inline bool
arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
        s64 seen = arch_atomic64_fetch_add_unless(v, a, u);

        return seen != u;
}
#define arch_atomic64_add_unless arch_atomic64_add_unless
#endif

#ifndef arch_atomic64_inc_not_zero
/**
 * arch_atomic64_inc_not_zero - add one to a 64-bit atomic unless it holds zero
 * @v: pointer to the atomic64_t to modify
 *
 * Delegates to arch_atomic64_add_unless() with an addend of 1 and an
 * excluded value of 0.
 * Returns true if the increment was performed.
 */
static __always_inline bool
arch_atomic64_inc_not_zero(atomic64_t *v)
{
        bool incremented = arch_atomic64_add_unless(v, 1, 0);

        return incremented;
}
#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
#endif

#ifndef arch_atomic64_inc_unless_negative
/*
 * Add one to @v unless the value observed is negative.
 * Returns false without storing anything when a negative value is seen,
 * true once the compare-and-exchange installing the new value succeeds.
 */
static __always_inline bool
arch_atomic64_inc_unless_negative(atomic64_t *v)
{
        s64 cur = arch_atomic64_read(v);

        for (;;) {
                if (unlikely(cur < 0))
                        return false;
                /* A failed try_cmpxchg refreshes cur; go around again. */
                if (arch_atomic64_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }
}
#define arch_atomic64_inc_unless_negative arch_atomic64_inc_unless_negative
#endif

#ifndef arch_atomic64_dec_unless_positive
/*
 * Subtract one from @v unless the value observed is greater than zero.
 * Returns false without storing anything when a positive value is seen,
 * true once the compare-and-exchange installing the new value succeeds.
 */
static __always_inline bool
arch_atomic64_dec_unless_positive(atomic64_t *v)
{
        s64 cur = arch_atomic64_read(v);

        for (;;) {
                if (unlikely(cur > 0))
                        return false;
                /* A failed try_cmpxchg refreshes cur; go around again. */
                if (arch_atomic64_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }
}
#define arch_atomic64_dec_unless_positive arch_atomic64_dec_unless_positive
#endif

#ifndef arch_atomic64_dec_if_positive
/*
 * Subtract one from @v, but only if the result would not be negative.
 * Returns the observed value minus one; a negative return means no store
 * was attempted for that observation.
 */
static __always_inline s64
arch_atomic64_dec_if_positive(atomic64_t *v)
{
        s64 cur = arch_atomic64_read(v);
        s64 next;

        for (;;) {
                next = cur - 1;
                if (unlikely(next < 0))
                        return next;
                /* A failed try_cmpxchg refreshes cur; go around again. */
                if (arch_atomic64_try_cmpxchg(v, &cur, next))
                        return next;
        }
}
#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
#endif

#endif /* _LINUX_ATOMIC_FALLBACK_H */
// 90cd26cfd69d2250303d654955a0cc12620fb91b



































































    1 





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGE_REF_H
#define _LINUX_PAGE_REF_H

#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/tracepoint-defs.h>

DECLARE_TRACEPOINT(page_ref_set);
DECLARE_TRACEPOINT(page_ref_mod);
DECLARE_TRACEPOINT(page_ref_mod_and_test);
DECLARE_TRACEPOINT(page_ref_mod_and_return);
DECLARE_TRACEPOINT(page_ref_mod_unless);
DECLARE_TRACEPOINT(page_ref_freeze);
DECLARE_TRACEPOINT(page_ref_unfreeze);

#ifdef CONFIG_DEBUG_PAGE_REF

/*
 * Ideally we would want to use the trace_<tracepoint>_enabled() helper
 * functions. But due to include header file issues, that is not
 * feasible. Instead we have to open code the static key functions.
 *
 * See trace_##name##_enabled(void) in include/linux/tracepoint.h
 *
 * page_ref_tracepoint_active(t) is the check each refcount helper in this
 * header performs before calling its out-of-line __page_ref_*() hook.
 * With CONFIG_DEBUG_PAGE_REF it forwards to tracepoint_enabled(t).
 */
#define page_ref_tracepoint_active(t) tracepoint_enabled(t)

extern void __page_ref_set(struct page *page, int v);
extern void __page_ref_mod(struct page *page, int v);
extern void __page_ref_mod_and_test(struct page *page, int v, int ret);
extern void __page_ref_mod_and_return(struct page *page, int v, int ret);
extern void __page_ref_mod_unless(struct page *page, int v, int u);
extern void __page_ref_freeze(struct page *page, int v, int ret);
extern void __page_ref_unfreeze(struct page *page, int v);

#else

/* Without CONFIG_DEBUG_PAGE_REF the tracepoint check is constant false. */
#define page_ref_tracepoint_active(t) false

/*
 * Empty stand-ins for the out-of-line tracing hooks declared in the
 * CONFIG_DEBUG_PAGE_REF branch.  They let the refcount helpers compile
 * unchanged; none of them does anything.
 */
static inline void __page_ref_set(struct page *page, int v)
{
}
static inline void __page_ref_mod(struct page *page, int v)
{
}
static inline void __page_ref_mod_and_test(struct page *page, int v, int ret)
{
}
static inline void __page_ref_mod_and_return(struct page *page, int v, int ret)
{
}
static inline void __page_ref_mod_unless(struct page *page, int v, int u)
{
}
static inline void __page_ref_freeze(struct page *page, int v, int ret)
{
}
static inline void __page_ref_unfreeze(struct page *page, int v)
{
}

#endif

/* Read the reference count stored in exactly this struct page. */
static inline int page_ref_count(struct page *page)
{
        int refs = atomic_read(&page->_refcount);

        return refs;
}

static inline int page_count(struct page *page)
{
        return atomic_read(&compound_head(page)->_refcount);
}

/*
 * Overwrite the reference count of @page with @v, then report the new
 * value to the page_ref_set tracing hook when that tracepoint is active.
 */
static inline void set_page_count(struct page *page, int v)
{
        atomic_set(&page->_refcount, v);

        if (!page_ref_tracepoint_active(page_ref_set))
                return;

        __page_ref_set(page, v);
}

/*
 * Give @page a reference count of one.  Meant for setting up the count
 * before the page is freed into the page allocator for the first time
 * (boot or memory hotplug).
 */
static inline void init_page_count(struct page *page)
{
        const int first_ref = 1;

        set_page_count(page, first_ref);
}

/* Add @nr references to @page and trace the change as +@nr when enabled. */
static inline void page_ref_add(struct page *page, int nr)
{
        atomic_add(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, nr);
}

/* Drop @nr references from @page and trace the change as -@nr when enabled. */
static inline void page_ref_sub(struct page *page, int nr)
{
        atomic_sub(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, -nr);
}

/*
 * Drop @nr references from @page and return what atomic_sub_return()
 * reported.  The change (-@nr) and that result go to the tracing hook
 * when the page_ref_mod_and_return tracepoint is active.
 */
static inline int page_ref_sub_return(struct page *page, int nr)
{
        const int result = atomic_sub_return(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod_and_return))
                return result;

        __page_ref_mod_and_return(page, -nr, result);
        return result;
}

/* Take one reference on @page and trace the change as +1 when enabled. */
static inline void page_ref_inc(struct page *page)
{
        atomic_inc(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, 1);
}

/* Drop one reference from @page and trace the change as -1 when enabled. */
static inline void page_ref_dec(struct page *page)
{
        atomic_dec(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, -1);
}

/*
 * Drop @nr references from @page and return the verdict of
 * atomic_sub_and_test().  The change (-@nr) and the verdict go to the
 * tracing hook when the page_ref_mod_and_test tracepoint is active.
 */
static inline int page_ref_sub_and_test(struct page *page, int nr)
{
        const int verdict = atomic_sub_and_test(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod_and_test))
                return verdict;

        __page_ref_mod_and_test(page, -nr, verdict);
        return verdict;
}

/*
 * Take one reference on @page and return what atomic_inc_return()
 * reported.  The change (+1) and that result go to the tracing hook when
 * the page_ref_mod_and_return tracepoint is active.
 */
static inline int page_ref_inc_return(struct page *page)
{
        const int result = atomic_inc_return(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod_and_return))
                return result;

        __page_ref_mod_and_return(page, 1, result);
        return result;
}

/*
 * Drop one reference from @page and return the verdict of
 * atomic_dec_and_test().  The change (-1) and the verdict go to the
 * tracing hook when the page_ref_mod_and_test tracepoint is active.
 */
static inline int page_ref_dec_and_test(struct page *page)
{
        const int verdict = atomic_dec_and_test(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod_and_test))
                return verdict;

        __page_ref_mod_and_test(page, -1, verdict);
        return verdict;
}

/*
 * Drop one reference from @page and return what atomic_dec_return()
 * reported.  The change (-1) and that result go to the tracing hook when
 * the page_ref_mod_and_return tracepoint is active.
 */
static inline int page_ref_dec_return(struct page *page)
{
        const int result = atomic_dec_return(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod_and_return))
                return result;

        __page_ref_mod_and_return(page, -1, result);
        return result;
}

/*
 * Add @nr references to @page via atomic_add_unless() with @u as the
 * excluded value, and return that call's result.  When the
 * page_ref_mod_unless tracepoint is active the hook receives @nr and the
 * result of the operation (not @u).
 */
static inline int page_ref_add_unless(struct page *page, int nr, int u)
{
        const int added = atomic_add_unless(&page->_refcount, nr, u);

        if (!page_ref_tracepoint_active(page_ref_mod_unless))
                return added;

        __page_ref_mod_unless(page, nr, added);
        return added;
}

/*
 * Try to swap the reference count of @page from @count to 0 with a single
 * atomic_cmpxchg().  Returns nonzero only when the value found was exactly
 * @count.  The attempt is traced with @count and the outcome when the
 * page_ref_freeze tracepoint is active.
 */
static inline int page_ref_freeze(struct page *page, int count)
{
        const int found = atomic_cmpxchg(&page->_refcount, count, 0);
        const int frozen = likely(found == count);

        if (!page_ref_tracepoint_active(page_ref_freeze))
                return frozen;

        __page_ref_freeze(page, count, frozen);
        return frozen;
}

/*
 * Store @count into the reference count of @page with release ordering.
 * The two VM_BUG_ON checks require page_count() to be 0 beforehand and
 * @count to be nonzero.  The store is traced when the page_ref_unfreeze
 * tracepoint is active.
 */
static inline void page_ref_unfreeze(struct page *page, int count)
{
        VM_BUG_ON_PAGE(page_count(page) != 0, page);
        VM_BUG_ON(count == 0);

        atomic_set_release(&page->_refcount, count);

        if (!page_ref_tracepoint_active(page_ref_unfreeze))
                return;

        __page_ref_unfreeze(page, count);
}

#endif







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FS_NOTIFY_FSNOTIFY_H_
#define __FS_NOTIFY_FSNOTIFY_H_

#include <linux/list.h>
#include <linux/fsnotify.h>
#include <linux/srcu.h>
#include <linux/types.h>

#include "../mount.h"

/*
 * Map a connector back to the inode whose i_fsnotify_marks member
 * conn->obj points at.  Nothing here checks that the connector really
 * belongs to an inode.
 */
static inline struct inode *fsnotify_conn_inode(
                                struct fsnotify_mark_connector *conn)
{
        struct inode *owner;

        owner = container_of(conn->obj, struct inode, i_fsnotify_marks);
        return owner;
}

/*
 * Map a connector back to the mount whose mnt_fsnotify_marks member
 * conn->obj points at.  Nothing here checks that the connector really
 * belongs to a mount.
 */
static inline struct mount *fsnotify_conn_mount(
                                struct fsnotify_mark_connector *conn)
{
        struct mount *owner;

        owner = container_of(conn->obj, struct mount, mnt_fsnotify_marks);
        return owner;
}

/*
 * Map a connector back to the super_block whose s_fsnotify_marks member
 * conn->obj points at.  Nothing here checks that the connector really
 * belongs to a super_block.
 */
static inline struct super_block *fsnotify_conn_sb(
                                struct fsnotify_mark_connector *conn)
{
        struct super_block *owner;

        owner = container_of(conn->obj, struct super_block, s_fsnotify_marks);
        return owner;
}

/* destroy all events sitting in this group's notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);

/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;

/* compare two groups for sorting of marks lists */
extern int fsnotify_compare_groups(struct fsnotify_group *a,
                                   struct fsnotify_group *b);

/* Destroy all marks attached to an object via connector */
extern void fsnotify_destroy_marks(fsnotify_connp_t *connp);
/* run the list of all marks associated with inode and destroy them */
static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
{
        fsnotify_destroy_marks(&inode->i_fsnotify_marks);
}
/* run the list of all marks associated with vfsmount and destroy them */
static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
        fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
}
/* run the list of all marks associated with sb and destroy them */
static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
{
        fsnotify_destroy_marks(&sb->s_fsnotify_marks);
}

/*
 * update the dentry->d_flags of all of inode's children to indicate if inode cares
 * about events that happen to its children.
 */
extern void __fsnotify_update_child_dentry_flags(struct inode *inode);

/* allocate and destroy an event holder to attach events to notification/access queues */
extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);

extern struct kmem_cache *fsnotify_mark_connector_cachep;

#endif        /* __FS_NOTIFY_FSNOTIFY_H_ */


















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_POLL_H
#define _LINUX_POLL_H


#include <linux/compiler.h>
#include <linux/ktime.h>
#include <linux/wait.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/uaccess.h>
#include <uapi/linux/poll.h>
#include <uapi/linux/eventpoll.h>

extern struct ctl_table epoll_table[]; /* for sysctl */
/* ~832 bytes of stack space used max in sys_select/sys_poll before allocating
   additional memory. */
/* Total on-stack budget, in bytes. */
#define MAX_STACK_ALLOC 832
/* Share of that budget set aside for the select/poll front ends. */
#define FRONTEND_STACK_ALLOC        256
#define SELECT_STACK_ALLOC        FRONTEND_STACK_ALLOC
#define POLL_STACK_ALLOC        FRONTEND_STACK_ALLOC
/* Remainder of the budget (832 - 256 = 576 bytes). */
#define WQUEUES_STACK_ALLOC        (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC)
/* Number of whole struct poll_table_entry objects that fit in the remainder. */
#define N_INLINE_POLL_ENTRIES        (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry))

#define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

struct poll_table_struct;

/* 
 * structures and helpers for f_op->poll implementations
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/*
 * Do not touch the structure directly, use the access functions
 * poll_does_not_wait() and poll_requested_events() instead.
 */
typedef struct poll_table_struct {
        /* Callback invoked by poll_wait(); NULL means "do not queue". */
        poll_queue_proc _qproc;
        /* Event mask returned by poll_requested_events(). */
        __poll_t _key;
} poll_table;

/*
 * Call the queueing callback stored in @p for @wait_address.
 * Does nothing when @p is NULL, when @p has no callback, or when
 * @wait_address is NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
        if (!p || !p->_qproc || !wait_address)
                return;

        p->_qproc(filp, wait_address, p);
}

/*
 * Tell whether poll is guaranteed not to wait.  That is the case when the
 * poll() of another file descriptor in the set already got an event, so
 * waiting is pointless.  In terms of the table: true when @p is NULL or
 * carries no queueing callback.
 */
static inline bool poll_does_not_wait(const poll_table *p)
{
        return !(p && p->_qproc);
}

/*
 * Report which events the application wants to poll for.  Drivers that
 * must implicitly start a DMA transfer on poll() can use this; typically
 * that should only happen when the application is actually polling for
 * POLLIN and/or POLLOUT.  A NULL @p yields a mask with every bit set.
 */
static inline __poll_t poll_requested_events(const poll_table *p)
{
        if (!p)
                return ~(__poll_t)0;

        return p->_key;
}

/* Prepare @pt: install @qproc as its callback and request every event. */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
        const __poll_t all_events = ~(__poll_t)0;

        pt->_key   = all_events;
        pt->_qproc = qproc;
}

static inline bool file_can_poll(struct file *file)
{
        return file->f_op->poll;
}

/*
 * Poll @file through its ->poll method.  Files without a ->poll method
 * report DEFAULT_POLLMASK instead.
 */
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
        if (likely(file->f_op->poll))
                return file->f_op->poll(file, pt);

        return DEFAULT_POLLMASK;
}

/*
 * One wait-queue registration record.  Its size determines
 * N_INLINE_POLL_ENTRIES.
 * NOTE(review): the field roles below are inferred from names and types
 * only; the users of this struct are not in this header.
 */
struct poll_table_entry {
        struct file *filp;                /* presumably the file being polled */
        __poll_t key;                        /* presumably the event mask of interest */
        wait_queue_entry_t wait;        /* wait-queue entry embedded in the record */
        wait_queue_head_t *wait_address;        /* presumably the queue @wait sits on */
};

/*
 * Structures and helpers for select/poll syscall
 *
 * struct poll_wqueues embeds a poll_table plus a fixed array of
 * N_INLINE_POLL_ENTRIES entries.  It is set up and torn down with
 * poll_initwait() and poll_freewait().
 * NOTE(review): how the remaining fields are used is not visible in this
 * header; see the implementation before relying on them.
 */
struct poll_wqueues {
        poll_table pt;
        struct poll_table_page *table;
        struct task_struct *polling_task;
        int triggered;
        int error;
        int inline_index;
        /* Entries stored inside the structure itself. */
        struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
extern u64 select_estimate_accuracy(struct timespec64 *tv);

#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)

extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                           fd_set __user *exp, struct timespec64 *end_time);

extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
                                   long nsec);

/*
 * __MAP(v, from, to) - move one flag of @v from bit value @from to @to.
 *
 * Masks @v with @from, then scales the result: multiplied by to/from when
 * @from is the smaller value, divided by from/to otherwise (this includes
 * from == to, which leaves the masked value unchanged).
 * NOTE(review): this relocates a flag only if @from and @to are single-bit
 * (power-of-two) values; the macro does not check that.
 *
 * Every argument is parenthesized so compound expressions such as
 * "a | b" or casts are evaluated as a unit.  Arguments are still expanded
 * more than once, so they must be free of side effects.
 */
#define __MAP(v, from, to) \
        ((from) < (to) ? ((v) & (from)) * ((to) / (from)) \
                       : ((v) & (from)) / ((from) / (to)))

/*
 * Convert a __poll_t mask to its 16-bit POLL* form.
 *
 * @val is first narrowed to __u16.  The local M(X) macro then uses __MAP
 * to move the EPOLL##X flag of that value to the POLL##X position, and
 * the results for the twelve listed flags are OR-ed together.
 */
static inline __u16 mangle_poll(__poll_t val)
{
        __u16 v = (__force __u16)val;
#define M(X) __MAP(v, (__force __u16)EPOLL##X, POLL##X)
        return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
                M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
                M(HUP) | M(RDHUP) | M(MSG);
#undef M
}

/*
 * Convert a 16-bit POLL* mask to its __poll_t form; the reverse direction
 * of mangle_poll().
 *
 * The local M(X) macro uses __MAP to move the POLL##X flag of @val to the
 * EPOLL##X position and casts the result to __poll_t.  The results for
 * the twelve listed flags are OR-ed together.
 */
static inline __poll_t demangle_poll(u16 val)
{
#define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X)
        return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
                M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
                M(HUP) | M(RDHUP) | M(MSG);
#undef M
}
#undef __MAP


#endif /* _LINUX_POLL_H */










    1 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM percpu

#if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PERCPU_H

#include <linux/tracepoint.h>

/*
 * percpu_alloc_percpu event: stores its seven arguments unchanged
 * (reserved, is_atomic, size, align, base_addr, off, ptr) and prints them
 * as "name=value" pairs.
 */
TRACE_EVENT(percpu_alloc_percpu,

        TP_PROTO(bool reserved, bool is_atomic, size_t size,
                 size_t align, void *base_addr, int off, void __percpu *ptr),

        TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr),

        TP_STRUCT__entry(
                __field(        bool,                        reserved        )
                __field(        bool,                        is_atomic        )
                __field(        size_t,                        size                )
                __field(        size_t,                        align                )
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
        ),

        TP_fast_assign(
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
        ),

        TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p",
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align,
                  __entry->base_addr, __entry->off, __entry->ptr)
);

/*
 * percpu_free_percpu event: stores base_addr, off and ptr unchanged and
 * prints them as "name=value" pairs.
 */
TRACE_EVENT(percpu_free_percpu,

        TP_PROTO(void *base_addr, int off, void __percpu *ptr),

        TP_ARGS(base_addr, off, ptr),

        TP_STRUCT__entry(
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
        ),

        TP_printk("base_addr=%p off=%d ptr=%p",
                __entry->base_addr, __entry->off, __entry->ptr)
);

/*
 * percpu_alloc_percpu_fail event: stores reserved, is_atomic, size and
 * align unchanged and prints them as "name=value" pairs.
 */
TRACE_EVENT(percpu_alloc_percpu_fail,

        TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align),

        TP_ARGS(reserved, is_atomic, size, align),

        TP_STRUCT__entry(
                __field(        bool,        reserved        )
                __field(        bool,        is_atomic        )
                __field(        size_t,        size                )
                __field(        size_t, align                )
        ),

        TP_fast_assign(
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
        ),

        TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu",
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align)
);

/* percpu_create_chunk event: stores and prints the single base_addr argument. */
TRACE_EVENT(percpu_create_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *, base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

/* percpu_destroy_chunk event: stores and prints the single base_addr argument. */
TRACE_EVENT(percpu_destroy_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *,        base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

#endif /* _TRACE_PERCPU_H */

#include <trace/define_trace.h>


































































































































































































































































































































































































































































































































    1 


































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * fscrypt.h: declarations for per-file encryption
 *
 * Filesystems that implement per-file encryption must include this header
 * file.
 *
 * Copyright (C) 2015, Google, Inc.
 *
 * Written by Michael Halcrow, 2015.
 * Modified by Jaegeuk Kim, 2015.
 */
#ifndef _LINUX_FSCRYPT_H
#define _LINUX_FSCRYPT_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <uapi/linux/fscrypt.h>

#define FS_CRYPTO_BLOCK_SIZE                16

union fscrypt_policy;
struct fscrypt_info;
struct seq_file;

/* A byte buffer paired with its length; FSTR_INIT() builds an initializer for it. */
struct fscrypt_str {
        unsigned char *name;        /* start of the buffer */
        u32 len;                /* length associated with @name */
};

/*
 * Filename bundle used by the fscrypt filename helpers.
 * NOTE(review): only the uses noted below are visible in this header;
 * confirm the other fields against fname.c.
 */
struct fscrypt_name {
        const struct qstr *usr_fname;
        /* Accessed through fname_name() and fname_len(). */
        struct fscrypt_str disk_name;
        u32 hash;
        u32 minor_hash;
        /* Its ->name buffer is released by fscrypt_free_filename(). */
        struct fscrypt_str crypto_buf;
        bool is_nokey_name;
};

/* Designated initializer for a struct fscrypt_str: .name = n, .len = l. */
#define FSTR_INIT(n, l)                { .name = n, .len = l }
/* Build a QSTR_INIT() from the name and len of the fscrypt_str pointed to by f. */
#define FSTR_TO_QSTR(f)                QSTR_INIT((f)->name, (f)->len)
/* Shorthand for the disk_name buffer and length of the fscrypt_name pointed to by p. */
#define fname_name(p)                ((p)->disk_name.name)
#define fname_len(p)                ((p)->disk_name.len)

/* Maximum value for the third parameter of fscrypt_operations.set_context(). */
#define FSCRYPT_SET_CONTEXT_MAX_SIZE        40

#ifdef CONFIG_FS_ENCRYPTION
/*
 * fscrypt superblock flags
 */
#define FS_CFLG_OWN_PAGES (1U << 1)

/*
 * crypto operations for filesystems
 *
 * Table of per-filesystem callbacks and settings; fscrypt_set_ops() stores
 * a pointer to it in the super_block.
 * NOTE(review): the callback contracts are not spelled out in this header;
 * the notes below state only what is visible here.
 */
struct fscrypt_operations {
        /* presumably FS_CFLG_* bits such as FS_CFLG_OWN_PAGES -- confirm */
        unsigned int flags;
        const char *key_prefix;
        int (*get_context)(struct inode *inode, void *ctx, size_t len);
        /* @len is at most FSCRYPT_SET_CONTEXT_MAX_SIZE. */
        int (*set_context)(struct inode *inode, const void *ctx, size_t len,
                           void *fs_data);
        const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb);
        bool (*empty_dir)(struct inode *inode);
        unsigned int max_namelen;
        bool (*has_stable_inodes)(struct super_block *sb);
        /* Results are returned through the two int pointers. */
        void (*get_ino_and_lblk_bits)(struct super_block *sb,
                                      int *ino_bits_ret, int *lblk_bits_ret);
        int (*get_num_devices)(struct super_block *sb);
        void (*get_devices)(struct super_block *sb,
                            struct request_queue **devs);
};

/* Fetch the inode's ->i_crypt_info pointer with acquire ordering. */
static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
{
        struct fscrypt_info *ci;

        /*
         * ->i_crypt_info may be published concurrently by another task via
         * the cmpxchg_release() in fscrypt_get_encryption_info(), which
         * executes a RELEASE barrier.  smp_load_acquire() is the matching
         * ACQUIRE, so the memory that task published can be read safely.
         */
        ci = smp_load_acquire(&inode->i_crypt_info);
        return ci;
}

/**
 * fscrypt_needs_contents_encryption() - check whether an inode needs
 *                                         contents encryption
 * @inode: the inode to check
 *
 * Return: %true iff the inode is an encrypted regular file and the kernel was
 * built with fscrypt support.
 *
 * If you need to know whether the encrypt bit is set even when the kernel was
 * built without fscrypt support, you must use IS_ENCRYPTED() directly instead.
 */
static inline bool fscrypt_needs_contents_encryption(const struct inode *inode)
{
        return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
}

/*
 * Clear DCACHE_NOKEY_NAME on @dentry.  This is needed when
 * d_splice_alias() moves a directory's no-key alias to its plaintext alias
 * because the encryption key was added.  Arbitrary moves of the flag need
 * no support: fscrypt does not allow no-key names to be the source or
 * target of a rename().
 */
static inline void fscrypt_handle_d_move(struct dentry *dentry)
{
        dentry->d_flags = dentry->d_flags & ~DCACHE_NOKEY_NAME;
}

/**
 * fscrypt_is_nokey_name() - test whether a dentry is a no-key name
 * @dentry: the dentry to check
 *
 * A no-key dentry is one created in an encrypted directory whose encryption
 * key had not been added yet.  Such dentries may be positive or negative.
 *
 * Filesystems must fail with ENOKEY any request to create a new filename in
 * an encrypted directory when the new filename's dentry is a no-key dentry.
 * That covers ->create(), ->mkdir(), ->mknod(), ->symlink(), ->rename() and
 * ->link().  (->rename() and ->link() are already taken care of by
 * fscrypt_prepare_rename() and fscrypt_prepare_link().)
 *
 * The check is needed because creating a filename requires the directory's
 * encryption key, and finding the key on the directory inode at the time of
 * the final filesystem operation does not prove it was available during the
 * earlier dentry lookup.  The key must already have been available at lookup
 * time: otherwise the lookup could not have checked whether the filename
 * already exists in the directory, and the new file's dentry would wrongly
 * carry the no-key flag and later be invalidated.
 *
 * Return: %true if the dentry is a no-key name
 */
static inline bool fscrypt_is_nokey_name(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_NOKEY_NAME) != 0;
}

/* crypto.c */
void fscrypt_enqueue_decrypt_work(struct work_struct *);

struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
                                              unsigned int len,
                                              unsigned int offs,
                                              gfp_t gfp_flags);
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
                                  unsigned int len, unsigned int offs,
                                  u64 lblk_num, gfp_t gfp_flags);

int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len,
                                     unsigned int offs);
int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
                                  unsigned int len, unsigned int offs,
                                  u64 lblk_num);

/* A page is treated as a bounce page when it has no ->mapping. */
static inline bool fscrypt_is_bounce_page(struct page *page)
{
        return !page->mapping;
}

/*
 * Return the page pointer that is stored in the private field of
 * @bounce_page.
 */
static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
{
        struct page *stored = (struct page *)page_private(bounce_page);

        return stored;
}

void fscrypt_free_bounce_page(struct page *bounce_page);

/* policy.c */
int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg);
int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg);
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child);
int fscrypt_set_context(struct inode *inode, void *fs_data);

/*
 * Holder for a dummy encryption policy.  The pointer is released with
 * kfree() and reset to NULL by fscrypt_free_dummy_policy().
 */
struct fscrypt_dummy_policy {
        const union fscrypt_policy *policy;
};

int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
                                struct fscrypt_dummy_policy *dummy_policy);
void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
                                        struct super_block *sb);
/* Free the policy held by @dummy_policy and leave the holder empty (NULL). */
static inline void
fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
{
        const union fscrypt_policy *stale = dummy_policy->policy;

        kfree(stale);
        dummy_policy->policy = NULL;
}

/* keyring.c */
void fscrypt_destroy_keyring(struct super_block *sb);
int fscrypt_ioctl_add_key(struct file *filp, void __user *arg);
int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg);
int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg);

/* keysetup.c */
int fscrypt_get_encryption_info(struct inode *inode);
int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
                              bool *encrypt_ret);
void fscrypt_put_encryption_info(struct inode *inode);
void fscrypt_free_inode(struct inode *inode);
int fscrypt_drop_inode(struct inode *inode);

/* fname.c */
int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname,
                           int lookup, struct fscrypt_name *fname);

static inline void fscrypt_free_filename(struct fscrypt_name *fname)
{
        kfree(fname->crypto_buf.name);
}

/*
 * Remaining fname.c declarations.  Inline stubs with the same signatures are
 * provided in the !CONFIG_FS_ENCRYPTION branch below.
 */
int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                               struct fscrypt_str *crypto_str);
void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str);
int fscrypt_fname_disk_to_usr(const struct inode *inode,
                              u32 hash, u32 minor_hash,
                              const struct fscrypt_str *iname,
                              struct fscrypt_str *oname);
bool fscrypt_match_name(const struct fscrypt_name *fname,
                        const u8 *de_name, u32 de_name_len);
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name);
int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags);

/*
 * bio.c
 *
 * Declarations only.  In the !CONFIG_FS_ENCRYPTION branch below,
 * fscrypt_decrypt_bio() is an empty stub and fscrypt_zeroout_range() returns
 * -EOPNOTSUPP.
 */
void fscrypt_decrypt_bio(struct bio *bio);
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
                          sector_t pblk, unsigned int len);

/*
 * hooks.c
 *
 * Declarations only.  The double-underscore functions are called by the
 * fscrypt_prepare_link(), fscrypt_prepare_rename(), fscrypt_prepare_lookup()
 * and fscrypt_encrypt_symlink() inline wrappers near the end of this header,
 * which first test IS_ENCRYPTED() on the inode(s) involved.
 */
int fscrypt_file_open(struct inode *inode, struct file *filp);
int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                           struct dentry *dentry);
int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry,
                             unsigned int flags);
int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
                             struct fscrypt_name *fname);
int fscrypt_prepare_setflags(struct inode *inode,
                             unsigned int oldflags, unsigned int flags);
int fscrypt_prepare_symlink(struct inode *dir, const char *target,
                            unsigned int len, unsigned int max_len,
                            struct fscrypt_str *disk_link);
int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
                              unsigned int len, struct fscrypt_str *disk_link);
const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
                                unsigned int max_size,
                                struct delayed_call *done);
int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat);
/* Record @s_cop as the fscrypt operations table of @sb (stored in sb->s_cop). */
static inline void fscrypt_set_ops(struct super_block *sb,
                                   const struct fscrypt_operations *s_cop)
{
        sb->s_cop = s_cop;
}
#else  /* !CONFIG_FS_ENCRYPTION */

/* Stub for builds without CONFIG_FS_ENCRYPTION: always returns NULL. */
static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
{
        return NULL;
}

/* Stub for builds without CONFIG_FS_ENCRYPTION: always returns false. */
static inline bool fscrypt_needs_contents_encryption(const struct inode *inode)
{
        return false;
}

/* Stub for builds without CONFIG_FS_ENCRYPTION: does nothing with @dentry. */
static inline void fscrypt_handle_d_move(struct dentry *dentry)
{
}

/* Stub for builds without CONFIG_FS_ENCRYPTION: always returns false. */
static inline bool fscrypt_is_nokey_name(const struct dentry *dentry)
{
        return false;
}

/* crypto.c */
/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: @work is ignored; nothing is
 * queued.
 */
static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work)
{
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: never produces a page and
 * always returns ERR_PTR(-EOPNOTSUPP), so callers must check the result with
 * IS_ERR() rather than against NULL.
 */
static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
                                                            unsigned int len,
                                                            unsigned int offs,
                                                            gfp_t gfp_flags)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: @page is left untouched and
 * -EOPNOTSUPP is always returned.
 */
static inline int fscrypt_encrypt_block_inplace(const struct inode *inode,
                                                struct page *page,
                                                unsigned int len,
                                                unsigned int offs, u64 lblk_num,
                                                gfp_t gfp_flags)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: @page is left untouched and
 * -EOPNOTSUPP is always returned.
 */
static inline int fscrypt_decrypt_pagecache_blocks(struct page *page,
                                                   unsigned int len,
                                                   unsigned int offs)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: @page is left untouched and
 * -EOPNOTSUPP is always returned.
 */
static inline int fscrypt_decrypt_block_inplace(const struct inode *inode,
                                                struct page *page,
                                                unsigned int len,
                                                unsigned int offs, u64 lblk_num)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: no page is ever reported as a
 * bounce page (always returns false).
 */
static inline bool fscrypt_is_bounce_page(struct page *page)
{
        return false;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION.  The fscrypt_is_bounce_page()
 * stub always returns false, so a call here is presumably unexpected: it
 * triggers a one-time warning and returns ERR_PTR(-EINVAL).
 */
static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
{
        WARN_ON_ONCE(1);
        return ERR_PTR(-EINVAL);
}

/* Stub for builds without CONFIG_FS_ENCRYPTION: nothing is freed. */
static inline void fscrypt_free_bounce_page(struct page *bounce_page)
{
}

/* policy.c */
/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: the ioctl is rejected with
 * -EOPNOTSUPP; @arg is never read.
 */
static inline int fscrypt_ioctl_set_policy(struct file *filp,
                                           const void __user *arg)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: the ioctl is rejected with
 * -EOPNOTSUPP; nothing is written to @arg.
 */
static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: the ioctl is rejected with
 * -EOPNOTSUPP; nothing is written to @arg.
 */
static inline int fscrypt_ioctl_get_policy_ex(struct file *filp,
                                              void __user *arg)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: the ioctl is rejected with
 * -EOPNOTSUPP; nothing is written to @arg.
 */
static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: neither inode is inspected and
 * 0 is always returned.
 */
static inline int fscrypt_has_permitted_context(struct inode *parent,
                                                struct inode *child)
{
        return 0;
}

/* Stub for builds without CONFIG_FS_ENCRYPTION: always returns -EOPNOTSUPP. */
static inline int fscrypt_set_context(struct inode *inode, void *fs_data)
{
        return -EOPNOTSUPP;
}

/*
 * Member-less definition used when CONFIG_FS_ENCRYPTION is disabled, so that
 * the stubs below which take a struct fscrypt_dummy_policy pointer have a
 * complete type to refer to.  (An empty struct is a GNU C extension.)
 */
struct fscrypt_dummy_policy {
};

/* Stub for builds without CONFIG_FS_ENCRYPTION: emits nothing to @seq. */
static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq,
                                                      char sep,
                                                      struct super_block *sb)
{
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: struct fscrypt_dummy_policy
 * has no members in this configuration, so there is nothing to free.
 */
static inline void
fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
{
}

/* keyring.c */
/* Stub for builds without CONFIG_FS_ENCRYPTION: does nothing with @sb. */
static inline void fscrypt_destroy_keyring(struct super_block *sb)
{
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: the ioctl is rejected with
 * -EOPNOTSUPP; @arg is never accessed.
 */
static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: the ioctl is rejected with
 * -EOPNOTSUPP; @arg is never accessed.
 */
static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: the ioctl is rejected with
 * -EOPNOTSUPP; @arg is never accessed.
 */
static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp,
                                                     void __user *arg)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: the ioctl is rejected with
 * -EOPNOTSUPP; @arg is never accessed.
 */
static inline int fscrypt_ioctl_get_key_status(struct file *filp,
                                               void __user *arg)
{
        return -EOPNOTSUPP;
}

/* keysetup.c */
/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: always returns -EOPNOTSUPP.
 * fscrypt_require_key() later in this header only calls it for inodes that
 * have IS_ENCRYPTED() set.
 */
static inline int fscrypt_get_encryption_info(struct inode *inode)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: fails with -EOPNOTSUPP when
 * @dir has IS_ENCRYPTED() set and returns 0 otherwise.  @inode and
 * @encrypt_ret are not touched.
 */
static inline int fscrypt_prepare_new_inode(struct inode *dir,
                                            struct inode *inode,
                                            bool *encrypt_ret)
{
        return IS_ENCRYPTED(dir) ? -EOPNOTSUPP : 0;
}

/* Stub for builds without CONFIG_FS_ENCRYPTION: nothing to release. */
static inline void fscrypt_put_encryption_info(struct inode *inode)
{
}

/* Stub for builds without CONFIG_FS_ENCRYPTION: does nothing with @inode. */
static inline void fscrypt_free_inode(struct inode *inode)
{
}

/* Stub for builds without CONFIG_FS_ENCRYPTION: always returns 0. */
static inline int fscrypt_drop_inode(struct inode *inode)
{
        return 0;
}

/* fname.c */
/*
 * Stub for builds without CONFIG_FS_ENCRYPTION.
 *
 * Fails with -EOPNOTSUPP when @dir has IS_ENCRYPTED() set.  Otherwise @fname
 * is zeroed and then pointed straight at @iname: usr_fname references @iname
 * and disk_name borrows its name pointer and length (no copy is made).
 * @lookup is not used.
 */
static inline int fscrypt_setup_filename(struct inode *dir,
                                         const struct qstr *iname,
                                         int lookup, struct fscrypt_name *fname)
{
        if (IS_ENCRYPTED(dir))
                return -EOPNOTSUPP;

        memset(fname, 0, sizeof(*fname));

        /* The on-disk name is the user-supplied name itself. */
        fname->disk_name.len = iname->len;
        fname->disk_name.name = (unsigned char *)iname->name;
        fname->usr_fname = iname;

        return 0;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: nothing to free.  The
 * fscrypt_setup_filename() stub above allocates nothing; it only borrows
 * pointers.
 */
static inline void fscrypt_free_filename(struct fscrypt_name *fname)
{
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: no buffer is allocated,
 * @crypto_str is left untouched and -EOPNOTSUPP is always returned.
 */
static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                                             struct fscrypt_str *crypto_str)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: nothing to free, since the
 * fscrypt_fname_alloc_buffer() stub above never allocates.
 */
static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
{
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: @oname is left untouched and
 * -EOPNOTSUPP is always returned.
 */
static inline int fscrypt_fname_disk_to_usr(const struct inode *inode,
                                            u32 hash, u32 minor_hash,
                                            const struct fscrypt_str *iname,
                                            struct fscrypt_str *oname)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: plain byte-wise comparison.
 *
 * Return: true only when @de_name_len equals @fname->disk_name.len and the
 * two names hold identical bytes over that length.
 */
static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
                                      const u8 *de_name, u32 de_name_len)
{
        /* Lengths must agree before the bytes are compared. */
        return de_name_len == fname->disk_name.len &&
               memcmp(de_name, fname->disk_name.name,
                      fname->disk_name.len) == 0;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: triggers a one-time warning
 * and returns 0.  Presumably never expected to be reached in this
 * configuration; confirm against callers.
 */
static inline u64 fscrypt_fname_siphash(const struct inode *dir,
                                        const struct qstr *name)
{
        WARN_ON_ONCE(1);
        return 0;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: always returns 1, regardless
 * of @dentry and @flags.
 */
static inline int fscrypt_d_revalidate(struct dentry *dentry,
                                       unsigned int flags)
{
        return 1;
}

/* bio.c */
/* Stub for builds without CONFIG_FS_ENCRYPTION: @bio is left untouched. */
static inline void fscrypt_decrypt_bio(struct bio *bio)
{
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: nothing is zeroed and
 * -EOPNOTSUPP is always returned.
 */
static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
                                        sector_t pblk, unsigned int len)
{
        return -EOPNOTSUPP;
}

/* hooks.c */

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: fails with -EOPNOTSUPP when
 * @inode has IS_ENCRYPTED() set and returns 0 otherwise.  @filp is not used.
 */
static inline int fscrypt_file_open(struct inode *inode, struct file *filp)
{
        return IS_ENCRYPTED(inode) ? -EOPNOTSUPP : 0;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: always returns -EOPNOTSUPP.
 * The fscrypt_prepare_link() wrapper only calls this when IS_ENCRYPTED(dir).
 */
static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                                         struct dentry *dentry)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: always returns -EOPNOTSUPP.
 * The fscrypt_prepare_rename() wrapper only calls this when @old_dir or
 * @new_dir has IS_ENCRYPTED() set.
 */
static inline int __fscrypt_prepare_rename(struct inode *old_dir,
                                           struct dentry *old_dentry,
                                           struct inode *new_dir,
                                           struct dentry *new_dentry,
                                           unsigned int flags)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: @fname is left untouched and
 * -EOPNOTSUPP is always returned.  The fscrypt_prepare_lookup() wrapper only
 * calls this when IS_ENCRYPTED(dir).
 */
static inline int __fscrypt_prepare_lookup(struct inode *dir,
                                           struct dentry *dentry,
                                           struct fscrypt_name *fname)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: no flag change is checked;
 * always returns 0.
 */
static inline int fscrypt_prepare_setflags(struct inode *inode,
                                           unsigned int oldflags,
                                           unsigned int flags)
{
        return 0;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION.
 *
 * Fails with -EOPNOTSUPP when @dir has IS_ENCRYPTED() set.  Otherwise
 * @disk_link is pointed directly at @target (no copy) with a length of
 * @len + 1 (presumably the extra byte is the terminating NUL; confirm).
 * Returns -ENAMETOOLONG if that length exceeds @max_len, else 0.  Note that
 * @disk_link has already been filled in when -ENAMETOOLONG is returned.
 */
static inline int fscrypt_prepare_symlink(struct inode *dir,
                                          const char *target,
                                          unsigned int len,
                                          unsigned int max_len,
                                          struct fscrypt_str *disk_link)
{
        if (IS_ENCRYPTED(dir))
                return -EOPNOTSUPP;

        disk_link->name = (unsigned char *)target;
        disk_link->len = len + 1;

        return disk_link->len > max_len ? -ENAMETOOLONG : 0;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: @disk_link is left untouched
 * and -EOPNOTSUPP is always returned.  The fscrypt_encrypt_symlink() wrapper
 * only calls this when IS_ENCRYPTED(inode).
 */
static inline int __fscrypt_encrypt_symlink(struct inode *inode,
                                            const char *target,
                                            unsigned int len,
                                            struct fscrypt_str *disk_link)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: always returns
 * ERR_PTR(-EOPNOTSUPP); @done is not set up.
 */
static inline const char *fscrypt_get_symlink(struct inode *inode,
                                              const void *caddr,
                                              unsigned int max_size,
                                              struct delayed_call *done)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: @stat is left untouched and
 * -EOPNOTSUPP is always returned.
 */
static inline int fscrypt_symlink_getattr(const struct path *path,
                                          struct kstat *stat)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION: @s_cop is ignored and @sb is
 * not modified.
 */
static inline void fscrypt_set_ops(struct super_block *sb,
                                   const struct fscrypt_operations *s_cop)
{
}

#endif        /* !CONFIG_FS_ENCRYPTION */

/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT

/*
 * Declarations only; available when CONFIG_FS_ENCRYPTION_INLINE_CRYPT is set.
 * The #else branch below supplies inline stubs with the same signatures.
 */
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode);

void fscrypt_set_bio_crypt_ctx(struct bio *bio,
                               const struct inode *inode, u64 first_lblk,
                               gfp_t gfp_mask);

void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio,
                                  const struct buffer_head *first_bh,
                                  gfp_t gfp_mask);

bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
                           u64 next_lblk);

bool fscrypt_mergeable_bio_bh(struct bio *bio,
                              const struct buffer_head *next_bh);

#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: always returns
 * false.
 */
static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
        return false;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: @bio is left
 * untouched.
 */
static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio,
                                             const struct inode *inode,
                                             u64 first_lblk, gfp_t gfp_mask)
{
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: @bio is left
 * untouched.
 */
static inline void
fscrypt_set_bio_crypt_ctx_bh(struct bio *bio,
                             const struct buffer_head *first_bh,
                             gfp_t gfp_mask)
{
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: always returns
 * true, without examining @bio, @inode or @next_lblk.
 */
static inline bool fscrypt_mergeable_bio(struct bio *bio,
                                         const struct inode *inode,
                                         u64 next_lblk)
{
        return true;
}

/*
 * Stub for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: always returns
 * true, without examining @bio or @next_bh.
 */
static inline bool fscrypt_mergeable_bio_bh(struct bio *bio,
                                            const struct buffer_head *next_bh)
{
        return true;
}
#endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/**
 * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline
 *                                        encryption
 * @inode: an inode. If encrypted, its key must be set up.
 *
 * Return: true if the inode requires file contents encryption and if the
 *           encryption should be done in the block layer via blk-crypto rather
 *           than in the filesystem layer.
 */
static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
        /* Only inodes that need contents encryption can use inline crypto. */
        if (!fscrypt_needs_contents_encryption(inode))
                return false;

        return __fscrypt_inode_uses_inline_crypto(inode);
}

/**
 * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer
 *                                          encryption
 * @inode: an inode. If encrypted, its key must be set up.
 *
 * Return: true if the inode requires file contents encryption and if the
 *           encryption should be done in the filesystem layer rather than in the
 *           block layer via blk-crypto.
 */
static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode)
{
        /* Nothing to encrypt in the fs layer if contents are not encrypted. */
        if (!fscrypt_needs_contents_encryption(inode))
                return false;

        /* Encrypted contents: fs-layer crypto is whatever is not inline. */
        return !__fscrypt_inode_uses_inline_crypto(inode);
}

/**
 * fscrypt_has_encryption_key() - check whether an inode has had its key set up
 * @inode: the inode to check
 *
 * Return: %true if the inode has had its encryption key set up, else %false.
 *
 * Usually this should be preceded by fscrypt_get_encryption_info() to try to
 * set up the key first.
 */
static inline bool fscrypt_has_encryption_key(const struct inode *inode)
{
        /* The key counts as set up once fscrypt_get_info() yields non-NULL. */
        const struct fscrypt_info *ci = fscrypt_get_info(inode);

        return ci != NULL;
}

/**
 * fscrypt_require_key() - require an inode's encryption key
 * @inode: the inode we need the key for
 *
 * If the inode is encrypted, set up its encryption key if not already done.
 * Then require that the key be present and return -ENOKEY otherwise.
 *
 * No locks are needed, and the key will live as long as the struct inode --- so
 * it won't go away from under you.
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 * if a problem occurred while setting up the encryption key.
 */
static inline int fscrypt_require_key(struct inode *inode)
{
        int err;

        /* Unencrypted inodes need no key. */
        if (!IS_ENCRYPTED(inode))
                return 0;

        err = fscrypt_get_encryption_info(inode);
        if (err)
                return err;

        /* Key setup did not fail, but the key may still be absent. */
        return fscrypt_has_encryption_key(inode) ? 0 : -ENOKEY;
}

/**
 * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted
 *                            directory
 * @old_dentry: an existing dentry for the inode being linked
 * @dir: the target directory
 * @dentry: negative dentry for the target filename
 *
 * A new link can only be added to an encrypted directory if the directory's
 * encryption key is available --- since otherwise we'd have no way to encrypt
 * the filename.  Therefore, we first set up the directory's encryption key (if
 * not already done) and return an error if it's unavailable.
 *
 * We also verify that the link will not violate the constraint that all files
 * in an encrypted directory tree use the same encryption policy.
 *
 * Return: 0 on success, -ENOKEY if the directory's encryption key is missing,
 * -EXDEV if the link would result in an inconsistent encryption policy, or
 * another -errno code.
 */
static inline int fscrypt_prepare_link(struct dentry *old_dentry,
                                       struct inode *dir,
                                       struct dentry *dentry)
{
        /* Linking into an unencrypted directory needs no fscrypt checks. */
        if (!IS_ENCRYPTED(dir))
                return 0;

        return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry);
}

/**
 * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted
 *                              directories
 * @old_dir: source directory
 * @old_dentry: dentry for source file
 * @new_dir: target directory
 * @new_dentry: dentry for target location (may be negative unless exchanging)
 * @flags: rename flags (we care at least about %RENAME_EXCHANGE)
 *
 * Prepare for ->rename() where the source and/or target directories may be
 * encrypted.  A new link can only be added to an encrypted directory if the
 * directory's encryption key is available --- since otherwise we'd have no way
 * to encrypt the filename.  A rename to an existing name, on the other hand,
 * *is* cryptographically possible without the key.  However, we take the more
 * conservative approach and just forbid all no-key renames.
 *
 * We also verify that the rename will not violate the constraint that all files
 * in an encrypted directory tree use the same encryption policy.
 *
 * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the
 * rename would cause inconsistent encryption policies, or another -errno code.
 */
static inline int fscrypt_prepare_rename(struct inode *old_dir,
                                         struct dentry *old_dentry,
                                         struct inode *new_dir,
                                         struct dentry *new_dentry,
                                         unsigned int flags)
{
        /* Neither directory is encrypted: nothing for fscrypt to check. */
        if (!IS_ENCRYPTED(old_dir) && !IS_ENCRYPTED(new_dir))
                return 0;

        return __fscrypt_prepare_rename(old_dir, old_dentry,
                                        new_dir, new_dentry, flags);
}

/**
 * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted
 *                              directory
 * @dir: directory being searched
 * @dentry: filename being looked up
 * @fname: (output) the name to use to search the on-disk directory
 *
 * Prepare for ->lookup() in a directory which may be encrypted by determining
 * the name that will actually be used to search the directory on-disk.  If the
 * directory's encryption key is available, then the lookup is assumed to be by
 * plaintext name; otherwise, it is assumed to be by no-key name.
 *
 * This also installs a custom ->d_revalidate() method which will invalidate the
 * dentry if it was created without the key and the key is later added.
 *
 * Return: 0 on success; -ENOENT if the directory's key is unavailable but the
 * filename isn't a valid no-key name, so a negative dentry should be created;
 * or another -errno code.
 */
static inline int fscrypt_prepare_lookup(struct inode *dir,
                                         struct dentry *dentry,
                                         struct fscrypt_name *fname)
{
        if (IS_ENCRYPTED(dir))
                return __fscrypt_prepare_lookup(dir, dentry, fname);

        memset(fname, 0, sizeof(*fname));
        fname->usr_fname = &dentry->d_name;
        fname->disk_name.name = (unsigned char *)dentry->d_name.name;
        fname->disk_name.len = dentry->d_name.len;
        return 0;
}

/**
 * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's
 *                               attributes
 * @dentry: dentry through which the inode is being changed
 * @attr: attributes to change
 *
 * Prepare for ->setattr() on a possibly-encrypted inode.  On an encrypted file,
 * most attribute changes are allowed even without the encryption key.  However,
 * without the encryption key we do have to forbid truncates.  This is needed
 * because the size being truncated to may not be a multiple of the filesystem
 * block size, and in that case we'd have to decrypt the final block, zero the
 * portion past i_size, and re-encrypt it.  (We *could* allow truncating to a
 * filesystem block boundary, but it's simpler to just forbid all truncates ---
 * and we already forbid all other contents modifications without the key.)
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 * if a problem occurred while setting up the encryption key.
 */
static inline int fscrypt_prepare_setattr(struct dentry *dentry,
                                          struct iattr *attr)
{
        /* Only a size change (ATTR_SIZE) requires the key. */
        if (!(attr->ia_valid & ATTR_SIZE))
                return 0;

        return fscrypt_require_key(d_inode(dentry));
}

/**
 * fscrypt_encrypt_symlink() - encrypt the symlink target if needed
 * @inode: symlink inode
 * @target: plaintext symlink target
 * @len: length of @target excluding null terminator
 * @disk_link: (in/out) the on-disk symlink target being prepared
 *
 * If the symlink target needs to be encrypted, then this function encrypts it
 * into @disk_link->name.  fscrypt_prepare_symlink() must have been called
 * previously to compute @disk_link->len.  If the filesystem did not allocate a
 * buffer for @disk_link->name after calling fscrypt_prepare_symlink(), then one
 * will be kmalloc()'ed and the filesystem will be responsible for freeing it.
 *
 * Return: 0 on success, -errno on failure
 */
static inline int fscrypt_encrypt_symlink(struct inode *inode,
                                          const char *target,
                                          unsigned int len,
                                          struct fscrypt_str *disk_link)
{
        /* Unencrypted symlink: @disk_link is left exactly as prepared. */
        if (!IS_ENCRYPTED(inode))
                return 0;

        return __fscrypt_encrypt_symlink(inode, target, len, disk_link);
}

/*
 * If *pagep refers to a bounce page, replace it with the pagecache page
 * returned by fscrypt_pagecache_page() and free the bounce page.  Otherwise
 * *pagep is left unchanged.
 */
static inline void fscrypt_finalize_bounce_page(struct page **pagep)
{
        struct page *bounce = *pagep;

        if (!fscrypt_is_bounce_page(bounce))
                return;

        /* Look up the pagecache page before the bounce page is freed. */
        *pagep = fscrypt_pagecache_page(bounce);
        fscrypt_free_bounce_page(bounce);
}

#endif        /* _LINUX_FSCRYPT_H */










































































































































    1 







    1 














    1 



























    1 










    1 








    1 








































    1 

    1 


    1 



    1 






































    1 



    1 






















































    1 













    1 

    1 








    1 













    1 








    1 












    1 













































    1 






    1 











    1 












    1 





































    1 











    1 



































    1 










    1 




    1 







    1 







    1 






    1 



    1 











    1 




    1 




    1 












    1 


    1 





























    1 



    1 


    1 

    1 


    1 
























































































    1 







    1 
    1 
    1 




    1 




















    1 
    1 














































































































































































































































































































































































































































































































    1 

    1 


















































    1 



    1 





















































































    1 









    1 


    1 


















    1 


    1 











































































    1 
    1 




























































































































    1 




    1 


    1 



























































































    1 




















































    1 
    1 













    1 










































































































































































    1 








































































    1 

    1 

    1 

    1 









    1 


    1 


    1 



    1 







    1 










    1 



    1 







    1 

    1 


    1 
    1 









    1 

    1 







    1 













    1 


    1 




    1 


    1 


    1 
    1 


















    1 










    1 
    1 




    1 


    1 
































    1 


    1 




















































































































































































































































































































































































































































































































































































































    1 






    1 




















    1 




    1 






    1 







    1 







































































































































































































































    1 















    1 
    1 


    1 


    1 
















































    1 

    1 
    1 










    1 





    1 



    1 

    1 









    1 



    1 
    1 



    1 





    1 
    1 
    1 


    1 




































































































    1 



    1 

    1 


    1 
    1 




    1 


    1 


















    1 






    1 


    1 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/ima.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>

#include "internal.h"
#include "mount.h"

/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *        inside the path - always follow.
 *        in the last component in creation/removal/renaming - never follow.
 *        if LOOKUP_FOLLOW passed - follow.
 *        if the pathname has trailing slashes - follow.
 *        otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */

/*
 * Number of bytes left for the pathname (terminator included) when a
 * struct filename sits at the start of a PATH_MAX-sized buffer and the
 * name is stored in place from ->iname onwards.
 */
#define EMBEDDED_NAME_MAX        (PATH_MAX - offsetof(struct filename, iname))

/**
 * getname_flags - copy a user-space pathname into a kernel struct filename
 * @filename:        user-space pointer to the pathname
 * @flags:        lookup flags; only %LOOKUP_EMPTY is examined here
 * @empty:        optional out-parameter; set to 1 when the copied name has
 *                length zero, left untouched otherwise
 *
 * If audit_reusename() returns a non-NULL filename for @filename, that is
 * returned as is.  Otherwise a new struct filename is set up with a
 * reference count of 1, recorded with audit_getname() and returned.
 *
 * On failure an ERR_PTR() is returned: -ENOMEM if an allocation fails, the
 * negative value from strncpy_from_user() if the copy fails, -ENAMETOOLONG
 * if the second copy fills all PATH_MAX bytes, or -ENOENT if the name is
 * empty and %LOOKUP_EMPTY is not set in @flags.
 */
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
        struct filename *result;
        char *kname;
        int len;

        result = audit_reusename(filename);
        if (result)
                return result;

        result = __getname();
        if (unlikely(!result))
                return ERR_PTR(-ENOMEM);

        /*
         * First, try to embed the struct filename inside the names_cache
         * allocation
         */
        kname = (char *)result->iname;
        result->name = kname;

        len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
        if (unlikely(len < 0)) {
                __putname(result);
                return ERR_PTR(len);
        }

        /*
         * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
         * separate struct filename so we can dedicate the entire
         * names_cache allocation for the pathname, and re-do the copy from
         * userland.
         */
        if (unlikely(len == EMBEDDED_NAME_MAX)) {
                const size_t size = offsetof(struct filename, iname[1]);
                /* From here on the original buffer is raw name storage. */
                kname = (char *)result;

                /*
                 * size is chosen this way to guarantee that
                 * result->iname[0] is within the same object and that
                 * kname can't be equal to result->iname, no matter what.
                 * putname() relies on that inequality to tell the two
                 * layouts apart.
                 */
                result = kzalloc(size, GFP_KERNEL);
                if (unlikely(!result)) {
                        __putname(kname);
                        return ERR_PTR(-ENOMEM);
                }
                result->name = kname;
                len = strncpy_from_user(kname, filename, PATH_MAX);
                /* Both allocations must be released on the error paths. */
                if (unlikely(len < 0)) {
                        __putname(kname);
                        kfree(result);
                        return ERR_PTR(len);
                }
                /* The whole buffer was filled: no room for the terminator. */
                if (unlikely(len == PATH_MAX)) {
                        __putname(kname);
                        kfree(result);
                        return ERR_PTR(-ENAMETOOLONG);
                }
        }

        /* Set before the putname() below, which checks and drops refcnt. */
        result->refcnt = 1;
        /* The empty path is special. */
        if (unlikely(!len)) {
                if (empty)
                        *empty = 1;
                if (!(flags & LOOKUP_EMPTY)) {
                        putname(result);
                        return ERR_PTR(-ENOENT);
                }
        }

        result->uptr = filename;
        result->aname = NULL;
        audit_getname(result);
        return result;
}

/*
 * getname - copy a user-space pathname into a kernel struct filename
 *
 * Shorthand for getname_flags() with no flags and no "empty" out-parameter;
 * since LOOKUP_EMPTY is therefore not set, an empty pathname yields
 * ERR_PTR(-ENOENT).
 */
struct filename *
getname(const char __user * filename)
{
        return getname_flags(filename, 0, NULL);
}

/*
 * getname_kernel - build a struct filename from a kernel-space string
 *
 * Short names are stored in place behind the struct filename inside the
 * __getname() buffer.  Names that do not fit there, but still fit in
 * PATH_MAX bytes with their terminator, get the whole buffer to themselves
 * and a small separately allocated struct filename pointing at it.
 *
 * Returns the new filename with a reference count of 1, or
 * ERR_PTR(-ENOMEM) / ERR_PTR(-ENAMETOOLONG) on failure.
 */
struct filename *
getname_kernel(const char * filename)
{
        struct filename *fname;
        int len = strlen(filename) + 1;        /* terminator included */

        fname = __getname();
        if (unlikely(!fname))
                return ERR_PTR(-ENOMEM);

        if (len <= EMBEDDED_NAME_MAX) {
                /* Fits behind the header: keep everything in one buffer. */
                fname->name = (char *)fname->iname;
        } else if (len > PATH_MAX) {
                __putname(fname);
                return ERR_PTR(-ENAMETOOLONG);
        } else {
                /*
                 * Hand the entire buffer over to the name and describe it
                 * with a minimal out-of-line header.
                 */
                struct filename *hdr;

                hdr = kmalloc(offsetof(struct filename, iname[1]), GFP_KERNEL);
                if (unlikely(!hdr)) {
                        __putname(fname);
                        return ERR_PTR(-ENOMEM);
                }
                hdr->name = (char *)fname;
                fname = hdr;
        }

        memcpy((char *)fname->name, filename, len);
        fname->uptr = NULL;
        fname->aname = NULL;
        fname->refcnt = 1;
        audit_getname(fname);

        return fname;
}

/*
 * putname - drop a reference on a struct filename
 *
 * Once the last reference is gone the storage is released.  A filename
 * whose ->name points at its own ->iname lives in a single __getname()
 * buffer; otherwise the name buffer and the struct filename are two
 * separate allocations and both are freed.
 */
void putname(struct filename *name)
{
        BUG_ON(name->refcnt <= 0);

        name->refcnt--;
        if (name->refcnt > 0)
                return;                /* still referenced elsewhere */

        if (name->name == name->iname) {
                /* Embedded layout: one buffer holds header and name. */
                __putname(name);
                return;
        }

        /* Split layout: free the name buffer, then the header. */
        __putname(name->name);
        kfree(name);
}

/*
 * Evaluate the inode's access ACL against @mask.
 *
 * Returns the posix_acl_permission() verdict when an ACL is available,
 * -EAGAIN when there is none (or ACL support is compiled out), the error
 * carried by get_acl() if it fails, and -ECHILD when called with
 * MAY_NOT_BLOCK and is_uncached_acl() is true for the cached value.
 */
static int check_acl(struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl *acl;
        int verdict;

        if (!(mask & MAY_NOT_BLOCK)) {
                /* Blocking is allowed: take a proper reference. */
                acl = get_acl(inode, ACL_TYPE_ACCESS);
                if (IS_ERR(acl))
                        return PTR_ERR(acl);
                if (!acl)
                        return -EAGAIN;

                verdict = posix_acl_permission(inode, acl, mask);
                posix_acl_release(acl);
                return verdict;
        }

        /* RCU mode: only the cached ACL may be looked at. */
        acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
        if (!acl)
                return -EAGAIN;
        /* no ->get_acl() calls in RCU mode... */
        if (is_uncached_acl(acl))
                return -ECHILD;
        return posix_acl_permission(inode, acl, mask);
#else
        return -EAGAIN;
#endif
}

/*
 * Basic UNIX owner/group/other permission check, with the POSIX ACL
 * consulted first for anyone who is not the owner.
 *
 * @mask is passed to check_acl() unmodified, so the ACL code still sees
 * MAY_NOT_BLOCK during an RCU walk; only the low three (rwx) bits take
 * part in the mode-bit comparison.
 */
static int acl_permission_check(struct inode *inode, int mask)
{
        unsigned int mode = inode->i_mode;

        if (likely(uid_eq(current_fsuid(), inode->i_uid))) {
                /* Owner: ACLs don't matter, use the "user" rwx bits. */
                mode >>= 6;
        } else {
                /* Non-owner: let the ACL, if there is one, decide. */
                if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
                        int error = check_acl(inode, mask);

                        /* -EAGAIN: no ACL verdict, fall back to mode bits */
                        if (error != -EAGAIN)
                                return error;
                }

                /*
                 * Group membership only has to be looked up when the
                 * group and other bits differ in something we were
                 * asked for.
                 */
                if ((mask & 7) & (mode ^ (mode >> 3))) {
                        if (in_group_p(inode->i_gid))
                                mode >>= 3;
                }
        }

        /* Refuse if a requested rwx bit is clear in the selected class. */
        return ((mask & 7) & ~mode) ? -EACCES : 0;
}

/**
 * generic_permission - permission check for Posix-like filesystems
 * @inode:        inode whose access rights are checked
 * @mask:        access being requested (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *                %MAY_NOT_BLOCK ...)
 *
 * Checks read/write/execute permission on a file.  The check is made
 * against the "fsuid", which allows arbitrary permissions to be set up for
 * filesystem access without touching the "normal" uids used elsewhere.
 *
 * If the mode bits / ACL deny access, the CAP_DAC_READ_SEARCH and
 * CAP_DAC_OVERRIDE capabilities are tried as overrides.
 *
 * generic_permission is rcu-walk aware.  It returns -ECHILD when an
 * rcu-walk request cannot be satisfied (e.g. it would need to block or is
 * too complex); the caller is then expected to retry in ref-walk mode.
 */
int generic_permission(struct inode *inode, int mask)
{
        int ret = acl_permission_check(inode, mask);

        /* Anything other than a plain denial is final. */
        if (ret != -EACCES)
                return ret;

        if (S_ISDIR(inode->i_mode)) {
                /*
                 * Directories: non-write access may be granted by
                 * CAP_DAC_READ_SEARCH, any access by CAP_DAC_OVERRIDE.
                 */
                if (!(mask & MAY_WRITE) &&
                    capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
                        return 0;
                if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
                        return 0;
                return -EACCES;
        }

        /* Non-directories: a pure read may use CAP_DAC_READ_SEARCH. */
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (mask == MAY_READ &&
            capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
                return 0;

        /*
         * CAP_DAC_OVERRIDE always covers read/write, but covers execute
         * only if at least one exec bit is set on the file.
         */
        if ((!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) &&
            capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
                return 0;

        return -EACCES;
}
EXPORT_SYMBOL(generic_permission);

/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has not special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
        /* Fast case: already known to have no ->permission() method. */
        if (likely(inode->i_opflags & IOP_FASTPERM))
                return generic_permission(inode, mask);

        /* The filesystem supplies its own check: defer to it. */
        if (likely(inode->i_op->permission))
                return inode->i_op->permission(inode, mask);

        /*
         * No method: remember that under i_lock so later calls take the
         * fast case.  This happens once per inode lifetime.
         */
        spin_lock(&inode->i_lock);
        inode->i_opflags |= IOP_FASTPERM;
        spin_unlock(&inode->i_lock);

        return generic_permission(inode, mask);
}

/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
        umode_t mode;

        /* Only write requests can be refused at the superblock level. */
        if (likely(!(mask & MAY_WRITE)))
                return 0;

        mode = inode->i_mode;
        if (!sb_rdonly(sb))
                return 0;

        /*
         * Read-only filesystem: refuse writes to regular files,
         * directories and symlinks; other inode types pass this check.
         */
        if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
                return -EROFS;

        return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
        int err = sb_permission(inode->i_sb, inode, mask);

        if (err)
                return err;

        if (unlikely(mask & MAY_WRITE)) {
                /* An immutable inode rejects every writer. */
                if (IS_IMMUTABLE(inode))
                        return -EPERM;

                /*
                 * If the vfs does not know the inode's real uid/gid,
                 * an mtime update would likely write wrong ownership
                 * back to disk, so refuse the write outright.
                 */
                if (HAS_UNMAPPED_ID(inode))
                        return -EACCES;
        }

        /* Inode check, then device cgroup, then LSM; first failure wins. */
        err = do_inode_permission(inode, mask);
        if (!err)
                err = devcgroup_inode_permission(inode, mask);
        if (!err)
                err = security_inode_permission(inode, mask);

        return err;
}
EXPORT_SYMBOL(inode_permission);

/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
void path_get(const struct path *path)
{
        /* Take one reference on each half of the path: mount, then dentry. */
        mntget(path->mnt);
        dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
void path_put(const struct path *path)
{
        /* Drop the dentry reference first, then the mount reference. */
        dput(path->dentry);
        mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);

/* Number of link-stack slots embedded directly in struct nameidata. */
#define EMBEDDED_LEVELS 2
/*
 * State of one path walk.  The currently active instance for a task is
 * reachable through current->nameidata; nested walks are chained via ->saved.
 */
struct nameidata {
        /* Current position of the walk (mount + dentry). */
        struct path        path;
        struct qstr        last;
        /* Root the walk is confined to; root.mnt == NULL means "not set yet". */
        struct path        root;
        struct inode        *inode; /* path.dentry.d_inode */
        /* LOOKUP_* flags, including LOOKUP_RCU while in rcu-walk mode. */
        unsigned int        flags;
        /*
         * seq:   d_seq sample that path.dentry is validated against.
         * m_seq: mount_lock sample used when legitimizing mounts.
         * r_seq: not referenced in this part of the file.
         */
        unsigned        seq, m_seq, r_seq;
        int                last_type;
        /* Number of entries currently in use on ->stack. */
        unsigned        depth;
        /* Running count, carried over between nested walks. */
        int                total_link_count;
        /* One entry per link currently being traversed. */
        struct saved {
                struct path link;
                /* Deferred cleanup, run via do_delayed_call() when dropped. */
                struct delayed_call done;
                const char *name;
                /* d_seq sample that link.dentry is validated against. */
                unsigned seq;
        } *stack, internal[EMBEDDED_LEVELS];
        /* ->stack points at ->internal until nd_alloc_stack() replaces it. */
        struct filename        *name;
        /* Enclosing walk of this task, or NULL for the outermost one. */
        struct nameidata *saved;
        /* d_seq sample of root.dentry taken by set_root() in rcu-walk mode. */
        unsigned        root_seq;
        int                dfd;
        /* Owner and mode of the parent directory, used by may_follow_link(). */
        kuid_t                dir_uid;
        umode_t                dir_mode;
} __randomize_layout;

/*
 * Initialise @p for a new walk of @name relative to @dfd and make it the
 * task's current nameidata, remembering the previously active one.
 */
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
        struct nameidata *outer = current->nameidata;

        /* Start on the embedded link stack with no position chosen yet. */
        p->stack = p->internal;
        p->path.mnt = NULL;
        p->path.dentry = NULL;

        p->dfd = dfd;
        p->name = name;

        /* A nested walk continues the link count of the enclosing one. */
        if (outer)
                p->total_link_count = outer->total_link_count;
        else
                p->total_link_count = 0;

        /* Push @p onto the per-task chain. */
        p->saved = outer;
        current->nameidata = p;
}

/*
 * Undo set_nameidata(): make the enclosing walk current again, hand the
 * accumulated link count back to it, and free a heap-allocated link stack.
 */
static void restore_nameidata(void)
{
        struct nameidata *cur = current->nameidata;
        struct nameidata *outer = cur->saved;

        current->nameidata = outer;
        if (outer)
                outer->total_link_count = cur->total_link_count;

        /* The embedded array is part of the nameidata itself; never free it. */
        if (cur->stack != cur->internal)
                kfree(cur->stack);
}

/*
 * Replace the embedded link stack with a MAXSYMLINKS-entry allocation,
 * copying over the embedded entries.  Returns false if allocation fails.
 */
static bool nd_alloc_stack(struct nameidata *nd)
{
        struct saved *bigger;

        /* GFP_ATOMIC while in rcu-walk mode, GFP_KERNEL otherwise. */
        bigger = kmalloc_array(MAXSYMLINKS, sizeof(*bigger),
                               nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
        if (likely(bigger)) {
                memcpy(bigger, nd->internal, sizeof(nd->internal));
                nd->stack = bigger;
                return true;
        }
        return false;
}

/**
 * path_connected - Verify that a dentry is below mnt.mnt_root
 * @mnt: mount the dentry is being reached through
 * @dentry: dentry to check
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 *
 * Returns: true if @dentry is still reachable from @mnt's root.
 */
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
        /*
         * When the mount's root is the superblock's root no further check
         * is made.  Otherwise (e.g. a bind mount of a subtree) the dentry
         * may have been moved out from under mnt_root, so verify ancestry.
         */
        return mnt->mnt_root == mnt->mnt_sb->s_root ||
               is_subdir(dentry, mnt->mnt_root);
}

/*
 * Run and clear the delayed call of every link on the stack, newest entry
 * first.  nd->depth itself is left untouched.
 */
static void drop_links(struct nameidata *nd)
{
        int i;

        for (i = nd->depth; i != 0; i--) {
                struct saved *entry = &nd->stack[i - 1];

                do_delayed_call(&entry->done);
                clear_delayed_call(&entry->done);
        }
}

/*
 * End a path walk in either mode: release the link stack, then either
 * leave rcu-walk mode or drop every reference held in ref-walk mode, and
 * finally reset the walk position.
 */
static void terminate_walk(struct nameidata *nd)
{
        drop_links(nd);

        if (nd->flags & LOOKUP_RCU) {
                /* rcu-walk: just leave the RCU section. */
                nd->flags &= ~LOOKUP_RCU;
                rcu_read_unlock();
        } else {
                int i;

                /* ref-walk: put the current path, each link, and the root. */
                path_put(&nd->path);
                for (i = 0; i < nd->depth; i++)
                        path_put(&nd->stack[i].link);
                if (nd->flags & LOOKUP_ROOT_GRABBED) {
                        path_put(&nd->root);
                        nd->flags &= ~LOOKUP_ROOT_GRABBED;
                }
        }

        nd->depth = 0;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
}

/*
 * Try to turn an RCU-sampled @path into a properly referenced one.
 * @seq is the d_seq sample for path->dentry, @mseq the mount_lock sample.
 * Returns true only if both references were obtained and the dentry's
 * sequence count is unchanged.
 *
 * path_put is needed afterwards regardless of success or failure; the
 * parts for which nothing must be dropped are set to NULL here.
 */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
        int res = __legitimize_mnt(path->mnt, mseq);
        if (unlikely(res)) {
                /*
                 * NOTE(review): a positive result presumably means no mount
                 * reference is held, a negative one that a reference must
                 * still be dropped -- confirm against __legitimize_mnt().
                 */
                if (res > 0)
                        path->mnt = NULL;
                /* The dentry was never touched, so nothing to put for it. */
                path->dentry = NULL;
                return false;
        }
        /* Mount is pinned; now try to pin the dentry unless it is dead. */
        if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
                path->dentry = NULL;
                return false;
        }
        /* Both pinned: succeed only if the dentry did not change meanwhile. */
        return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * __legitimize_path() using the mount_lock sample recorded in @nd
 * (nd->m_seq).  Same return value and path_put() requirement.
 */
static inline bool legitimize_path(struct nameidata *nd,
                            struct path *path, unsigned seq)
{
        return __legitimize_path(path, seq, nd->m_seq);
}

/*
 * Legitimize every link on the stack, oldest first.  Returns true if all
 * of them are now properly referenced.  On failure the delayed calls have
 * been run and nd->depth is trimmed to the entries that still need
 * path_put().
 */
static bool legitimize_links(struct nameidata *nd)
{
        int i;
        /* LOOKUP_CACHED walks give up here without taking any reference. */
        if (unlikely(nd->flags & LOOKUP_CACHED)) {
                drop_links(nd);
                nd->depth = 0;
                return false;
        }
        for (i = 0; i < nd->depth; i++) {
                struct saved *last = nd->stack + i;
                if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
                        drop_links(nd);
                        /*
                         * Entries 0..i went through legitimize_path() and so
                         * need path_put(); later ones were never touched.
                         */
                        nd->depth = i + 1;
                        return false;
                }
        }
        return true;
}

/*
 * Make nd->root usable in ref-walk mode.  Returns false if the walk has
 * to be restarted.
 */
static bool legitimize_root(struct nameidata *nd)
{
        if (!nd->root.mnt) {
                /*
                 * No root recorded.  That is fine for ordinary lookups, but
                 * a scoped lookup whose root was zeroed must restart from
                 * scratch: set_root() would pick the filesystem root, while
                 * the real root of such a lookup is nd->dfd.
                 */
                return !(nd->flags & LOOKUP_IS_SCOPED);
        }

        /* A root managed by the VFS user needs no reference from us. */
        if (nd->flags & LOOKUP_ROOT)
                return true;

        nd->flags |= LOOKUP_ROOT_GRABBED;
        return legitimize_path(nd, &nd->root, nd->root_seq);
}

/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
 * try_to_unlazy - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: true on success, false on failure
 *
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy() failure and
 * terminate_walk().
 */
static bool try_to_unlazy(struct nameidata *nd)
{
        /* Remembered only for the sanity check after a successful switch. */
        struct dentry *parent = nd->path.dentry;

        BUG_ON(!(nd->flags & LOOKUP_RCU));

        /* From here on the walk counts as ref-walk, even if we fail below. */
        nd->flags &= ~LOOKUP_RCU;
        if (unlikely(!legitimize_links(nd)))
                goto out1;
        if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
                goto out;
        if (unlikely(!legitimize_root(nd)))
                goto out;
        rcu_read_unlock();
        BUG_ON(nd->inode != parent->d_inode);
        return true;

out1:
        /* nd->path was never legitimized: nothing to put for it later. */
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        /* Anything still recorded in nd is left for terminate_walk(). */
        rcu_read_unlock();
        return false;
}

/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * @seq: seq number to check @dentry against
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
        BUG_ON(!(nd->flags & LOOKUP_RCU));

        /* From here on the walk counts as ref-walk, even if we fail below. */
        nd->flags &= ~LOOKUP_RCU;
        if (unlikely(!legitimize_links(nd)))
                goto out2;
        if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
                goto out2;
        if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
                goto out1;

        /*
         * We need to move both the parent and the dentry from the RCU domain
         * to be properly refcounted. And the sequence number in the dentry
         * validates *both* dentry counters, since we checked the sequence
         * number of the parent after we got the child sequence number. So we
         * know the parent must still be valid if the child sequence number is
         * still valid.
         */
        if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
                goto out;
        if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
                goto out_dput;
        /*
         * Sequence counts matched. Now make sure that the root is
         * still valid and get it if required.
         */
        if (unlikely(!legitimize_root(nd)))
                goto out_dput;
        rcu_read_unlock();
        return true;

out2:
        /* Mount not pinned: forget it so it is not put later. */
        nd->path.mnt = NULL;
out1:
        /* Parent dentry not pinned: forget it so it is not put later. */
        nd->path.dentry = NULL;
out:
        /* No reference on @dentry was taken on these paths. */
        rcu_read_unlock();
        return false;
out_dput:
        /* We hold a reference on @dentry; drop it after leaving RCU. */
        rcu_read_unlock();
        dput(dentry);
        return false;
}

/*
 * Call the dentry's ->d_revalidate() method if it is flagged as having
 * one; a dentry without the flag is reported as valid (1).
 */
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
{
        if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
                return 1;

        return dentry->d_op->d_revalidate(dentry, flags);
}

/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 */
static int complete_walk(struct nameidata *nd)
{
        struct dentry *dentry = nd->path.dentry;
        int status;

        if (nd->flags & LOOKUP_RCU) {
                /*
                 * Forget nd->root unless it is externally managed or the
                 * lookup is scoped; in those two cases it must be kept.
                 */
                if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
                        nd->root.mnt = NULL;
                nd->flags &= ~LOOKUP_CACHED;
                if (!try_to_unlazy(nd))
                        return -ECHILD;
        }

        /*
         * Belt and braces for scoped lookups.  The rest of namei is meant
         * to guarantee that a LOOKUP_IS_SCOPED walk never steps outside its
         * root, but a namei bug must not silently hand userspace a path
         * outside the requested root.  So verify once more that the result
         * is under nd->root.  Userspace may move the path out of the root
         * after this check; that is not a concern, since the resolved file
         * was inside the root at some point.
         */
        if (unlikely(nd->flags & LOOKUP_IS_SCOPED) &&
            !path_is_under(&nd->path, &nd->root))
                return -EXDEV;

        /* Weak revalidation is needed only after a jump, and only if asked. */
        if (likely(!(nd->flags & LOOKUP_JUMPED)) ||
            likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
                return 0;

        status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
        if (status > 0)
                return 0;

        /* Zero means "no longer valid": report it as a stale handle. */
        return status ? status : -ESTALE;
}

/*
 * Fill nd->root from the current task's fs root.  Returns 0 on success,
 * or -ENOTRECOVERABLE (with a warning) if called for a scoped lookup.
 */
static int set_root(struct nameidata *nd)
{
        struct fs_struct *fs = current->fs;

        /*
         * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
         * still have to ensure it doesn't happen because it will cause a breakout
         * from the dirfd.
         */
        if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
                return -ENOTRECOVERABLE;

        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;

                /*
                 * rcu-walk: copy fs->root without taking references and
                 * record the root dentry's d_seq; retry if fs->seq changed
                 * while we were reading.
                 */
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
                        nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        } else {
                /*
                 * ref-walk: fetch the root via get_fs_root() and flag it so
                 * that terminate_walk() puts it again.
                 */
                get_fs_root(fs, &nd->root);
                nd->flags |= LOOKUP_ROOT_GRABBED;
        }
        return 0;
}

/*
 * Move the walk to nd->root, setting the root first if necessary.
 * Returns 0 on success, -EXDEV if the lookup flags forbid the jump,
 * -ECHILD if the root dentry changed during rcu-walk, or the error
 * from set_root().
 */
static int nd_jump_root(struct nameidata *nd)
{
        /* LOOKUP_BENEATH lookups may never jump to the root. */
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return -EXDEV;
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                /* Absolute path arguments to path_init() are allowed. */
                if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
                        return -EXDEV;
        }
        if (!nd->root.mnt) {
                int error = set_root(nd);
                if (error)
                        return error;
        }
        if (nd->flags & LOOKUP_RCU) {
                struct dentry *d;
                /* rcu-walk: no references; validate the root's d_seq instead. */
                nd->path = nd->root;
                d = nd->path.dentry;
                nd->inode = d->d_inode;
                nd->seq = nd->root_seq;
                if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
                        return -ECHILD;
        } else {
                /* ref-walk: swap our reference from the old path to the root. */
                path_put(&nd->path);
                nd->path = nd->root;
                path_get(&nd->path);
                nd->inode = nd->path.dentry->d_inode;
        }
        nd->flags |= LOOKUP_JUMPED;
        return 0;
}

/*
 * Helper to directly jump to a known parsed path from ->get_link,
 * caller must have taken a reference to path beforehand.
 */
int nd_jump_link(struct path *path)
{
        int error = -ELOOP;
        /* Operates on the walk currently in progress for this task. */
        struct nameidata *nd = current->nameidata;

        /* -ELOOP: this kind of jump is disabled by LOOKUP_NO_MAGICLINKS. */
        if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
                goto err;

        error = -EXDEV;
        /* -EXDEV: the jump would leave the current mount. */
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                if (nd->path.mnt != path->mnt)
                        goto err;
        }
        /* Not currently safe for scoped-lookups. */
        if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
                goto err;

        /* Success: the caller's reference on @path is now owned by nd->path. */
        path_put(&nd->path);
        nd->path = *path;
        nd->inode = nd->path.dentry->d_inode;
        nd->flags |= LOOKUP_JUMPED;
        return 0;

err:
        /* Failure: the caller's reference on @path is dropped here. */
        path_put(path);
        return error;
}

/*
 * Pop the topmost link off the stack: run its delayed call and, in
 * ref-walk mode, drop the reference held on the link's path.
 */
static inline void put_link(struct nameidata *nd)
{
        /* Pre-decrement: depth counts entries, so the top is at depth - 1. */
        struct saved *last = nd->stack + --nd->depth;
        do_delayed_call(&last->done);
        /* In rcu-walk mode no reference is held on the link. */
        if (!(nd->flags & LOOKUP_RCU))
                path_put(&last->link);
}

/*
 * Link and create hardening switches consulted by may_follow_link(),
 * may_linkat() and may_create_in_sticky().  All start out as 0 (off).
 */
int sysctl_protected_symlinks __read_mostly;
int sysctl_protected_hardlinks __read_mostly;
int sysctl_protected_fifos __read_mostly;
int sysctl_protected_regular __read_mostly;

/**
 * may_follow_link - Check symlink following for unsafe situations
 * @nd: nameidata pathwalk data
 * @inode: inode of the symlink about to be followed
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error
 * (-ECHILD when the denial has to be redone outside rcu-walk mode).
 */
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
        /* Protection switched off: everything may be followed. */
        if (!sysctl_protected_symlinks)
                return 0;

        /*
         * Following is allowed when any of these holds:
         *  - the follower owns the symlink;
         *  - the parent directory is not both sticky and world-writable;
         *  - the parent directory's (valid) owner owns the symlink.
         */
        if (uid_eq(current_cred()->fsuid, inode->i_uid) ||
            (nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH) ||
            (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid)))
                return 0;

        /* Denied; in rcu-walk mode bail out so the check is redone later. */
        if (nd->flags & LOOKUP_RCU)
                return -ECHILD;

        audit_inode(nd->name, nd->stack[0].link.dentry, 0);
        audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
        return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
        const umode_t mode = inode->i_mode;

        /* Neither special files nor setuid files may get pinned. */
        if (!S_ISREG(mode) || (mode & S_ISUID))
                return false;

        /* Setgid together with group-exec must not get pinned either. */
        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
                return false;

        /* Safe only if the caller could both read and write the source. */
        return !inode_permission(inode, MAY_READ | MAY_WRITE);
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
 *
 * Returns 0 if successful, -ve on error.
 */
int may_linkat(struct path *link)
{
        struct inode *src = link->dentry->d_inode;

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!(uid_valid(src->i_uid) && gid_valid(src->i_gid)))
                return -EOVERFLOW;

        /* Protection switched off: no further restrictions. */
        if (!sysctl_protected_hardlinks)
                return 0;

        /*
         * Refuse only if the source is unsafe and the caller is neither
         * its owner nor capable (CAP_FOWNER); the owner can hardlink
         * all they like.
         */
        if (!safe_hardlink_source(src) && !inode_owner_or_capable(src)) {
                audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
                return -EPERM;
        }

        return 0;
}

/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *                          should be allowed, or not, on files that already
 *                          exist.
 * @dir_mode: mode bits of directory
 * @dir_uid: owner of directory
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
                                struct inode * const inode)
{
        const int is_fifo = S_ISFIFO(inode->i_mode);
        const int is_reg = S_ISREG(inode->i_mode);

        /* The protection for this file type is switched off. */
        if ((is_fifo && !sysctl_protected_fifos) ||
            (is_reg && !sysctl_protected_regular))
                return 0;

        /* Only sticky directories are of interest. */
        if (likely(!(dir_mode & S_ISVTX)))
                return 0;

        /* Files owned by the directory owner or by the caller are fine. */
        if (uid_eq(inode->i_uid, dir_uid) ||
            uid_eq(current_fsuid(), inode->i_uid))
                return 0;

        if (unlikely(!(dir_mode & 0002))) {
                /*
                 * Not world writable: group writable is enough to deny
                 * only when the sysctl for this file type is set to 2
                 * or more.
                 */
                if (!(dir_mode & 0020))
                        return 0;
                if (!((is_fifo && sysctl_protected_fifos >= 2) ||
                      (is_reg && sysctl_protected_regular >= 2)))
                        return 0;
        }

        audit_log_path_denied(AUDIT_ANOM_CREAT,
                              is_fifo ? "sticky_create_fifo" :
                                        "sticky_create_regular");
        return -EACCES;
}

/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
int follow_up(struct path *path)
{
        struct mount *mnt = real_mount(path->mnt);
        struct mount *parent;
        struct dentry *mountpoint;

        /* Hold mount_lock while reading the parent and its mountpoint. */
        read_seqlock_excl(&mount_lock);
        parent = mnt->mnt_parent;
        /* A mount that is its own parent has nothing above it. */
        if (parent == mnt) {
                read_sequnlock_excl(&mount_lock);
                return 0;
        }
        /* Take the new references before the lock is released. */
        mntget(&parent->mnt);
        mountpoint = dget(mnt->mnt_mountpoint);
        read_sequnlock_excl(&mount_lock);
        /* Swap @path over: drop each old reference, install the new one. */
        dput(path->dentry);
        path->dentry = mountpoint;
        mntput(path->mnt);
        path->mnt = &parent->mnt;
        return 1;
}
EXPORT_SYMBOL(follow_up);

/*
 * Walk up the mount tree from @m until reaching a mountpoint that is not
 * itself the root of its parent mount, stopping early at @root.
 * On success stores the mountpoint in @path and a d_seq sample for it in
 * @seqp and returns true.  No references are taken here.
 * Returns false if @root or the top of the mount tree was reached first.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
                                  struct path *path, unsigned *seqp)
{
        while (mnt_has_parent(m)) {
                struct dentry *mountpoint = m->mnt_mountpoint;

                m = m->mnt_parent;
                /* Never go above the caller's root. */
                if (unlikely(root->dentry == mountpoint &&
                             root->mnt == &m->mnt))
                        break;
                /*
                 * If the mountpoint is the parent mount's own root, keep
                 * climbing; otherwise this is the place to report.
                 */
                if (mountpoint != m->mnt.mnt_root) {
                        path->mnt = &m->mnt;
                        path->dentry = mountpoint;
                        *seqp = read_seqcount_begin(&mountpoint->d_seq);
                        return true;
                }
        }
        return false;
}

/*
 * Reference-taking wrapper around choose_mountpoint_rcu(): repeat the
 * lockless search until its result can be trusted.  Returns true with
 * @path legitimized via __legitimize_path(), or false if there is no
 * mountpoint to move to.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
                              struct path *path)
{
        bool found;

        rcu_read_lock();
        while (1) {
                unsigned seq, mseq = read_seqbegin(&mount_lock);

                found = choose_mountpoint_rcu(m, root, path, &seq);
                if (unlikely(!found)) {
                        /* "Nothing found" is final only if mounts were stable. */
                        if (!read_seqretry(&mount_lock, mseq))
                                break;
                } else {
                        if (likely(__legitimize_path(path, seq, mseq)))
                                break;
                        /*
                         * Legitimizing failed: path_put() is required even
                         * then, and is done outside the RCU section before
                         * retrying.
                         */
                        rcu_read_unlock();
                        path_put(path);
                        rcu_read_lock();
                }
        }
        rcu_read_unlock();
        return found;
}

/*
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
 */
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
        /* Lookup intents that do want the automount to be triggered. */
        const unsigned wants_mount = LOOKUP_PARENT | LOOKUP_DIRECTORY |
                                     LOOKUP_OPEN | LOOKUP_CREATE |
                                     LOOKUP_AUTOMOUNT;
        struct dentry *dentry = path->dentry;

        /*
         * A plain stat-like lookup of an existing object must not mount
         * anything -- unless a directory was named with a trailing '/'.
         * Mounting is wanted when the caller is going to open or create
         * something under the mountpoint, traverse through it, or open
         * the mounted directory itself.  Negative dentries are always
         * passed on: autofs may mark them as automount points, and they
         * need the daemon's attention to be instantiated before use.
         */
        if (dentry->d_inode && !(lookup_flags & wants_mount))
                return -EISDIR;

        /* When a counter is supplied, bound the number of automounts. */
        if (count && (*count)++ >= MAXSYMLINKS)
                return -ELOOP;

        return finish_automount(dentry->d_op->d_automount(path), path);
}

/*
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
 */
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
                             int *count, unsigned lookup_flags)
{
        /* Mount we started on; compared against path->mnt at the end. */
        struct vfsmount *mnt = path->mnt;
        /* True once path->mnt has been replaced by a mount from lookup_mnt(). */
        bool need_mntput = false;
        int ret = 0;

        while (flags & DCACHE_MANAGED_DENTRY) {
                /* Allow the filesystem to manage the transit without i_mutex
                 * being held. */
                if (flags & DCACHE_MANAGE_TRANSIT) {
                        ret = path->dentry->d_op->d_manage(path, false);
                        /* Re-read the flags: ->d_manage() may have changed them. */
                        flags = smp_load_acquire(&path->dentry->d_flags);
                        if (ret < 0)
                                break;
                }

                if (flags & DCACHE_MOUNTED) {        // something's mounted on it..
                        struct vfsmount *mounted = lookup_mnt(path);
                        if (mounted) {                // ... in our namespace
                                /* Step onto the root of the covering mount. */
                                dput(path->dentry);
                                if (need_mntput)
                                        mntput(path->mnt);
                                path->mnt = mounted;
                                path->dentry = dget(mounted->mnt_root);
                                // here we know it's positive
                                flags = path->dentry->d_flags;
                                need_mntput = true;
                                continue;
                        }
                }

                if (!(flags & DCACHE_NEED_AUTOMOUNT))
                        break;

                // uncovered automount point
                ret = follow_automount(path, count, lookup_flags);
                flags = smp_load_acquire(&path->dentry->d_flags);
                if (ret < 0)
                        break;
        }

        /* -EISDIR means "stop here and use this path"; it is not an error. */
        if (ret == -EISDIR)
                ret = 0;
        // possible if you race with several mount --move
        if (need_mntput && path->mnt == mnt)
                mntput(path->mnt);
        /* A negative dentry at the final position means there is no such entry. */
        if (!ret && unlikely(d_flags_negative(flags)))
                ret = -ENOENT;
        /* Tell the caller whether at least one mount was stepped onto. */
        *jumped = need_mntput;
        return ret;
}

/*
 * Inline front end of mount traversal: handle the common unmanaged-dentry
 * case here and leave everything else to __traverse_mounts().
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
                                  int *count, unsigned lookup_flags)
{
        unsigned flags = smp_load_acquire(&path->dentry->d_flags);

        /* slow path: the dentry is managed in some way */
        if (unlikely(flags & DCACHE_MANAGED_DENTRY))
                return __traverse_mounts(path, flags, jumped, count,
                                         lookup_flags);

        /* fast path: nothing to step onto, only report negative dentries */
        *jumped = false;
        if (unlikely(d_flags_negative(flags)))
                return -ENOENT;
        return 0;
}

/*
 * Step from @path onto the root of the mount that lookup_mnt() finds on
 * it, if any.  Returns 1 if @path was moved, 0 if nothing was found and
 * @path is unchanged.
 */
int follow_down_one(struct path *path)
{
        struct vfsmount *mounted;

        mounted = lookup_mnt(path);
        if (mounted) {
                /* Drop the old references and switch to the covering mount. */
                dput(path->dentry);
                mntput(path->mnt);
                path->mnt = mounted;
                path->dentry = dget(mounted->mnt_root);
                return 1;
        }
        return 0;
}
EXPORT_SYMBOL(follow_down_one);

/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
int follow_down(struct path *path)
{
        /* Remember the starting mount: traverse_mounts() may replace it. */
        struct vfsmount *mnt = path->mnt;
        bool jumped;
        /* No automount counter (NULL) and no lookup flags (0). */
        int ret = traverse_mounts(path, &jumped, NULL, 0);

        /* If @path moved to another mount, drop the reference on the old one. */
        if (path->mnt != mnt)
                mntput(mnt);
        return ret;
}
EXPORT_SYMBOL(follow_down);

/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 *
 * @nd:    walk state; nd->flags may gain LOOKUP_JUMPED, and nd->m_seq is
 *         the value mount_lock is rechecked against.
 * @path:  in/out - starting position; moved to the root of every mount
 *         found on the way.  It can be left modified even when false is
 *         returned.
 * @inode: out - set to ->d_inode of the new dentry after each crossing.
 * @seqp:  out - set to the ->d_seq sample of the new dentry after each
 *         crossing.
 *
 * Returns true if the walk may go on in RCU mode from *@path, false if the
 * caller has to deal with this dentry some other way.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                               struct inode **inode, unsigned *seqp)
{
        struct dentry *dentry = path->dentry;
        unsigned int flags = dentry->d_flags;

        /* common case: not managed at all, nothing to cross */
        if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
                return true;

        /* managed dentry while crossings are forbidden: give up on RCU mode */
        if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                return false;

        for (;;) {
                /*
                 * Don't forget we might have a non-mountpoint managed dentry
                 * that wants to block transit.
                 */
                if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
                        int res = dentry->d_op->d_manage(path, true);
                        /*
                         * Non-zero ends the loop: -EISDIR is reported as
                         * success (true), any other value as failure.
                         */
                        if (res)
                                return res == -EISDIR;
                        /* re-sample: flags were read before ->d_manage() ran */
                        flags = dentry->d_flags;
                }

                if (flags & DCACHE_MOUNTED) {
                        struct mount *mounted = __lookup_mnt(path->mnt, dentry);
                        if (mounted) {
                                /* move onto the root of what is mounted here */
                                path->mnt = &mounted->mnt;
                                dentry = path->dentry = mounted->mnt.mnt_root;
                                nd->flags |= LOOKUP_JUMPED;
                                *seqp = read_seqcount_begin(&dentry->d_seq);
                                *inode = dentry->d_inode;
                                /*
                                 * We don't need to re-check ->d_seq after this
                                 * ->d_inode read - there will be an RCU delay
                                 * between mount hash removal and ->mnt_root
                                 * becoming unpinned.
                                 */
                                flags = dentry->d_flags;
                                /* mount_lock changed since nd->m_seq: fail */
                                if (read_seqretry(&mount_lock, nd->m_seq))
                                        return false;
                                /* the new root may itself be managed - go again */
                                continue;
                        }
                        /*
                         * DCACHE_MOUNTED but nothing found: only trust that
                         * if mount_lock has not changed since nd->m_seq.
                         */
                        if (read_seqretry(&mount_lock, nd->m_seq))
                                return false;
                }
                /* top of the pile; fail if this dentry needs automounting */
                return !(flags & DCACHE_NEED_AUTOMOUNT);
        }
}

/*
 * Cross whatever sits on top of @dentry (found under nd->path.mnt) and
 * report where we ended up.
 *
 * @nd:     walk state.
 * @dentry: dentry that has just been looked up.
 * @path:   out - initialised to (nd->path.mnt, @dentry) and then advanced.
 * @inode:  in/out - in RCU mode the inode that goes with @dentry on entry;
 *          on success the inode of the final position.
 * @seqp:   in/out - in RCU mode the ->d_seq sample for @dentry on entry;
 *          zeroed when the non-RCU path succeeds.
 *
 * In RCU mode __follow_mount_rcu() is tried first; if it fails we leave RCU
 * mode with try_to_unlazy_next() and redo the job via traverse_mounts().
 *
 * Returns 0 on success or a negative errno: -ENOENT for a NULL *@inode in
 * RCU mode, -ECHILD when leaving RCU mode failed, -EXDEV when a mount was
 * crossed under LOOKUP_NO_XDEV, otherwise what traverse_mounts() returned.
 * When the non-RCU part fails, the references held in *@path are dropped
 * here (the vfsmount only if it differs from nd->path.mnt).
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
                          struct path *path, struct inode **inode,
                          unsigned int *seqp)
{
        bool jumped;
        int ret;

        path->mnt = nd->path.mnt;
        path->dentry = dentry;
        if (nd->flags & LOOKUP_RCU) {
                /* keep the original sample: __follow_mount_rcu() may overwrite *seqp */
                unsigned int seq = *seqp;
                if (unlikely(!*inode))
                        return -ENOENT;
                if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
                        return 0;
                if (!try_to_unlazy_next(nd, dentry, seq))
                        return -ECHILD;
                // *path might've been clobbered by __follow_mount_rcu()
                path->mnt = nd->path.mnt;
                path->dentry = dentry;
        }
        ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
        if (jumped) {
                /* a crossing overrides ret with -EXDEV when crossings are forbidden */
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        ret = -EXDEV;
                else
                        nd->flags |= LOOKUP_JUMPED;
        }
        if (unlikely(ret)) {
                dput(path->dentry);
                /* only drop the vfsmount if it is not the one nd->path holds */
                if (path->mnt != nd->path.mnt)
                        mntput(path->mnt);
        } else {
                *inode = d_backing_inode(path->dentry);
                *seqp = 0; /* out of RCU mode, so the value doesn't matter */
        }
        return ret;
}

/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct dentry *dentry = d_lookup(dir, name);
        if (dentry) {
                int error = d_revalidate(dentry, flags);
                if (unlikely(error <= 0)) {
                        if (!error)
                                d_invalidate(dentry);
                        dput(dentry);
                        return ERR_PTR(error);
                }
        }
        return dentry;
}

/*
 * The parent directory has its inode locked exclusive.  This is the one
 * and only case when ->lookup() gets called on non in-lookup dentries -
 * as a matter of fact, this only gets called when the directory is
 * guaranteed to have no in-lookup children at all.
 *
 * Tries the dcache first (lookup_dcache()); on a miss allocates a new
 * child of @base and asks the directory's ->lookup() about it.  Returns
 * the resulting dentry, or ERR_PTR(-ENOENT) for a dead directory,
 * ERR_PTR(-ENOMEM) if the allocation failed, or whatever non-NULL value
 * ->lookup() handed back.
 */
static struct dentry *__lookup_hash(const struct qstr *name,
                struct dentry *base, unsigned int flags)
{
        struct dentry *cached, *fresh, *res;
        struct inode *dir;

        cached = lookup_dcache(name, base, flags);
        dir = base->d_inode;
        if (cached)
                return cached;

        /* Don't create child dentry for a dead directory. */
        if (unlikely(IS_DEADDIR(dir)))
                return ERR_PTR(-ENOENT);

        fresh = d_alloc(base, name);
        if (unlikely(!fresh))
                return ERR_PTR(-ENOMEM);

        res = dir->i_op->lookup(dir, fresh, flags);
        if (likely(!res))
                return fresh;

        /* ->lookup() gave us something else to use; ours is not needed */
        dput(fresh);
        return res;
}

/*
 * Look nd->last up in the dcache under nd->path.dentry and revalidate
 * what was found, without calling into the filesystem's ->lookup().
 *
 * @nd:    walk state.
 * @inode: out - written in the RCU branch only (d_backing_inode() of the
 *         dentry found).
 * @seqp:  out - written in the RCU branch only (->d_seq sample of the
 *         dentry found).
 *
 * Returns the dentry if revalidation gave a positive result; NULL if
 * nothing was found, or if the dentry was invalidated because
 * d_revalidate() returned 0 (ERR_PTR(0)); ERR_PTR() of a negative errno
 * otherwise.  -ECHILD is returned when a seqcount check fails or RCU mode
 * could not be left.
 */
static struct dentry *lookup_fast(struct nameidata *nd,
                                  struct inode **inode,
                                  unsigned *seqp)
{
        struct dentry *dentry, *parent = nd->path.dentry;
        int status = 1;

        /*
         * Rename seqlock is not required here because in the off chance
         * of a false negative due to a concurrent rename, the caller is
         * going to fall back to non-racy lookup.
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
                if (unlikely(!dentry)) {
                        /* a miss: leave RCU mode, then report "not cached" */
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                        return NULL;
                }

                /*
                 * This sequence count validates that the inode matches
                 * the dentry name information from lookup.
                 */
                *inode = d_backing_inode(dentry);
                if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
                        return ERR_PTR(-ECHILD);

                /*
                 * This sequence count validates that the parent had no
                 * changes while we did the lookup of the dentry above.
                 *
                 * The memory barrier in read_seqcount_begin of child is
                 *  enough, we can use __read_seqcount_retry here.
                 */
                if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
                        return ERR_PTR(-ECHILD);

                *seqp = seq;
                status = d_revalidate(dentry, nd->flags);
                if (likely(status > 0))
                        return dentry;
                /*
                 * Anything else is handled outside RCU mode.  The dput()
                 * at the bottom suggests try_to_unlazy_next() leaves us
                 * holding a reference to dentry on success - TODO confirm.
                 */
                if (!try_to_unlazy_next(nd, dentry, seq))
                        return ERR_PTR(-ECHILD);
                if (unlikely(status == -ECHILD))
                        /* we'd been told to redo it in non-rcu mode */
                        status = d_revalidate(dentry, nd->flags);
        } else {
                dentry = __d_lookup(parent, &nd->last);
                if (unlikely(!dentry))
                        return NULL;
                status = d_revalidate(dentry, nd->flags);
        }
        if (unlikely(status <= 0)) {
                /* 0: invalidate it; either way drop it and pass status back */
                if (!status)
                        d_invalidate(dentry);
                dput(dentry);
                return ERR_PTR(status);
        }
        return dentry;
}

/* Fast lookup failed, do it the slow way */
static struct dentry *__lookup_slow(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct dentry *dentry, *old;
        struct inode *inode = dir->d_inode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        /* Don't go there if it's already dead */
        if (unlikely(IS_DEADDIR(inode)))
                return ERR_PTR(-ENOENT);
again:
        dentry = d_alloc_parallel(dir, name, &wq);
        if (IS_ERR(dentry))
                return dentry;
        if (unlikely(!d_in_lookup(dentry))) {
                int error = d_revalidate(dentry, flags);
                if (unlikely(error <= 0)) {
                        if (!error) {
                                d_invalidate(dentry);
                                dput(dentry);
                                goto again;
                        }
                        dput(dentry);
                        dentry = ERR_PTR(error);
                }
        } else {
                old = inode->i_op->lookup(inode, dentry, flags);
                d_lookup_done(dentry);
                if (unlikely(old)) {
                        dput(dentry);
                        dentry = old;
                }
        }
        return dentry;
}

/*
 * Wrapper around __lookup_slow() that holds the inode of @dir locked
 * shared for the duration of the call.  Returns what __lookup_slow()
 * returned.
 */
static struct dentry *lookup_slow(const struct qstr *name,
                                  struct dentry *dir,
                                  unsigned int flags)
{
        struct inode *dir_inode = dir->d_inode;
        struct dentry *dentry;

        inode_lock_shared(dir_inode);
        dentry = __lookup_slow(name, dir, flags);
        inode_unlock_shared(dir_inode);

        return dentry;
}

/*
 * Check MAY_EXEC permission on nd->inode.
 *
 * In RCU mode the check is first made with MAY_NOT_BLOCK.  Only if that
 * yields -ECHILD do we try to leave RCU mode and repeat the check without
 * MAY_NOT_BLOCK; if leaving RCU mode fails, -ECHILD is returned.
 */
static inline int may_lookup(struct nameidata *nd)
{
        int err;

        if (!(nd->flags & LOOKUP_RCU))
                return inode_permission(nd->inode, MAY_EXEC);

        err = inode_permission(nd->inode, MAY_EXEC | MAY_NOT_BLOCK);
        if (err != -ECHILD)
                return err;

        /* err is -ECHILD from here on */
        if (!try_to_unlazy(nd))
                return err;

        return inode_permission(nd->inode, MAY_EXEC);
}

/*
 * Account for one more symlink and make sure nd->stack has room for it.
 *
 * Returns 0 when a slot is available, -ELOOP once nd->total_link_count has
 * reached MAXSYMLINKS, -ECHILD if we had to leave RCU mode and either that
 * or grabbing @link failed, and -ENOMEM if nd_alloc_stack() failed.
 */
static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
{
        bool have_link;

        if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
                return -ELOOP;

        /*
         * Nothing to do unless the embedded array is exactly full and is
         * still the one in use.
         */
        if (likely(nd->depth != EMBEDDED_LEVELS) ||
            likely(nd->stack != nd->internal))
                return 0;

        if (likely(nd_alloc_stack(nd)))
                return 0;

        if (!(nd->flags & LOOKUP_RCU))
                return -ENOMEM;

        // we need to grab link before we do unlazy.  And we can't skip
        // unlazy even if we fail to grab the link - cleanup needs it
        have_link = legitimize_path(nd, link, seq);
        if (!try_to_unlazy(nd) || !have_link)
                return -ECHILD;

        /* second attempt, now outside RCU mode */
        return nd_alloc_stack(nd) ? 0 : -ENOMEM;
}

/*
 * Flag bits passed to walk_component(), step_into() and pick_link():
 *
 * WALK_TRAILING - a symlink found by step_into() is only followed if
 *                 LOOKUP_FOLLOW is set in nd->flags, and pick_link() calls
 *                 may_follow_link() before following it.
 * WALK_MORE     - walk_component() does not put_link() the link currently
 *                 on the stack; link_path_walk() passes this for every
 *                 component that is not the last one.
 * WALK_NOFOLLOW - step_into() never follows a symlink; handle_dots() uses
 *                 this when stepping to the result of "..".
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

/*
 * Push the symlink at @link onto nd->stack and fetch its body.
 *
 * @nd:    walk state.
 * @link:  location of the symlink; copied into the new stack entry.
 * @inode: inode of the symlink.
 * @seq:   ->d_seq sample for @link->dentry, stored in the stack entry.
 * @flags: WALK_* flags; WALK_TRAILING enables the may_follow_link() check.
 *
 * Returns the text that is left to walk (leading slashes of an absolute
 * link are consumed after jumping to the root), NULL if there is nothing
 * left to walk - in which case the stack entry has already been released
 * with put_link() - or ERR_PTR() on failure.
 *
 * If reserve_stack() fails, the reference in @link is dropped here unless
 * we are still in RCU mode.  Failures after the entry has been pushed
 * return without popping it; presumably the caller's cleanup deals with
 * that - TODO confirm.
 */
static const char *pick_link(struct nameidata *nd, struct path *link,
                     struct inode *inode, unsigned seq, int flags)
{
        struct saved *last;
        const char *res;
        int error = reserve_stack(nd, link, seq);

        if (unlikely(error)) {
                if (!(nd->flags & LOOKUP_RCU))
                        path_put(link);
                return ERR_PTR(error);
        }
        /* claim the next stack slot and record the link in it */
        last = nd->stack + nd->depth++;
        last->link = *link;
        clear_delayed_call(&last->done);
        last->seq = seq;

        if (flags & WALK_TRAILING) {
                error = may_follow_link(nd, inode);
                if (unlikely(error))
                        return ERR_PTR(error);
        }

        /* following symlinks forbidden by the lookup flags or by the mount */
        if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
                        unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
                return ERR_PTR(-ELOOP);

        if (!(nd->flags & LOOKUP_RCU)) {
                touch_atime(&last->link);
                cond_resched();
        } else if (atime_needs_update(&last->link, inode)) {
                /* atime is only touched after RCU mode has been left */
                if (!try_to_unlazy(nd))
                        return ERR_PTR(-ECHILD);
                touch_atime(&last->link);
        }

        error = security_inode_follow_link(link->dentry, inode,
                                           nd->flags & LOOKUP_RCU);
        if (unlikely(error))
                return ERR_PTR(error);

        /* use the body cached in the inode if there is one */
        res = READ_ONCE(inode->i_link);
        if (!res) {
                const char * (*get)(struct dentry *, struct inode *,
                                struct delayed_call *);
                get = inode->i_op->get_link;
                if (nd->flags & LOOKUP_RCU) {
                        /*
                         * RCU mode: call with a NULL dentry first; on
                         * -ECHILD leave RCU mode and call again with the
                         * real dentry.
                         */
                        res = get(NULL, inode, &last->done);
                        if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
                                res = get(link->dentry, inode, &last->done);
                } else {
                        res = get(link->dentry, inode, &last->done);
                }
                /* NULL from ->get_link(): no text to walk */
                if (!res)
                        goto all_done;
                if (IS_ERR(res))
                        return res;
        }
        if (*res == '/') {
                /* absolute link: restart from the root, skip the slashes */
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                while (unlikely(*++res == '/'))
                        ;
        }
        if (*res)
                return res;
all_done: // pure jump
        put_link(nd);
        return NULL;
}

/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
static const char *step_into(struct nameidata *nd, int flags,
                     struct dentry *dentry, struct inode *inode, unsigned seq)
{
        struct path path;
        int err = handle_mounts(nd, dentry, &path, &inode, &seq);

        if (err < 0)
                return ERR_PTR(err);
        if (likely(!d_is_symlink(path.dentry)) ||
           ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
           (flags & WALK_NOFOLLOW)) {
                /* not a symlink or should not follow */
                if (!(nd->flags & LOOKUP_RCU)) {
                        dput(nd->path.dentry);
                        if (nd->path.mnt != path.mnt)
                                mntput(nd->path.mnt);
                }
                nd->path = path;
                nd->inode = inode;
                nd->seq = seq;
                return NULL;
        }
        if (nd->flags & LOOKUP_RCU) {
                /* make sure that d_is_symlink above matches inode */
                if (read_seqcount_retry(&path.dentry->d_seq, seq))
                        return ERR_PTR(-ECHILD);
        } else {
                if (path.mnt == nd->path.mnt)
                        mntget(path.mnt);
        }
        return pick_link(nd, &path, inode, seq, flags);
}

/*
 * RCU-mode handling of "..": find the parent of the current position.
 *
 * @nd:     walk state; nd->path, nd->inode and nd->seq are moved to the
 *          mountpoint first if we are sitting on the root of a mount.
 * @inodep: out - ->d_inode of the parent.
 * @seqp:   out - ->d_seq sample of the parent.
 *
 * Returns the parent dentry, NULL if we are at nd->root (or no mountpoint
 * could be chosen) and should stay where we are, or ERR_PTR(-ECHILD) for
 * every failure, including the LOOKUP_NO_XDEV and LOOKUP_BENEATH cases.
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
                                        struct inode **inodep,
                                        unsigned *seqp)
{
        struct dentry *parent, *old;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                /* at the root of a mount: ".." has to start from the mountpoint */
                struct path path;
                unsigned seq;
                if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
                                           &nd->root, &path, &seq))
                        goto in_root;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-ECHILD);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                nd->seq = seq;
                if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
                        return ERR_PTR(-ECHILD);
                /* we know that mountpoint was pinned */
        }
        old = nd->path.dentry;
        parent = old->d_parent;
        *inodep = parent->d_inode;
        *seqp = read_seqcount_begin(&parent->d_seq);
        /* the child must not have changed while we were reading ->d_parent */
        if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
                return ERR_PTR(-ECHILD);
        if (unlikely(!path_connected(nd->path.mnt, parent)))
                return ERR_PTR(-ECHILD);
        return parent;
in_root:
        if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
                return ERR_PTR(-ECHILD);
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-ECHILD);
        return NULL;
}

static struct dentry *follow_dotdot(struct nameidata *nd,
                                 struct inode **inodep,
                                 unsigned *seqp)
{
        struct dentry *parent;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                struct path path;

                if (!choose_mountpoint(real_mount(nd->path.mnt),
                                       &nd->root, &path))
                        goto in_root;
                path_put(&nd->path);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-EXDEV);
        }
        /* rare case of legitimate dget_parent()... */
        parent = dget_parent(nd->path.dentry);
        if (unlikely(!path_connected(nd->path.mnt, parent))) {
                dput(parent);
                return ERR_PTR(-ENOENT);
        }
        *seqp = 0;
        *inodep = parent->d_inode;
        return parent;

in_root:
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-EXDEV);
        dget(nd->path.dentry);
        return NULL;
}

static const char *handle_dots(struct nameidata *nd, int type)
{
        if (type == LAST_DOTDOT) {
                const char *error = NULL;
                struct dentry *parent;
                struct inode *inode;
                unsigned seq;

                if (!nd->root.mnt) {
                        error = ERR_PTR(set_root(nd));
                        if (error)
                                return error;
                }
                if (nd->flags & LOOKUP_RCU)
                        parent = follow_dotdot_rcu(nd, &inode, &seq);
                else
                        parent = follow_dotdot(nd, &inode, &seq);
                if (IS_ERR(parent))
                        return ERR_CAST(parent);
                if (unlikely(!parent))
                        error = step_into(nd, WALK_NOFOLLOW,
                                         nd->path.dentry, nd->inode, nd->seq);
                else
                        error = step_into(nd, WALK_NOFOLLOW,
                                         parent, inode, seq);
                if (unlikely(error))
                        return error;

                if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
                        /*
                         * If there was a racing rename or mount along our
                         * path, then we can't be sure that ".." hasn't jumped
                         * above nd->root (and so userspace should retry or use
                         * some fallback).
                         */
                        smp_rmb();
                        if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
                                return ERR_PTR(-EAGAIN);
                        if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
                                return ERR_PTR(-EAGAIN);
                }
        }
        return NULL;
}

static const char *walk_component(struct nameidata *nd, int flags)
{
        struct dentry *dentry;
        struct inode *inode;
        unsigned seq;
        /*
         * "." and ".." are special - ".." especially so because it has
         * to be able to know about the current root directory and
         * parent relationships.
         */
        if (unlikely(nd->last_type != LAST_NORM)) {
                if (!(flags & WALK_MORE) && nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }
        dentry = lookup_fast(nd, &inode, &seq);
        if (IS_ERR(dentry))
                return ERR_CAST(dentry);
        if (unlikely(!dentry)) {
                dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
        }
        if (!(flags & WALK_MORE) && nd->depth)
                put_link(nd);
        return step_into(nd, flags, dentry, inode, seq);
}

/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>

#ifdef HASH_MIX

/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */

#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 *
 * Usage: HASH_MIX(x, y, a) folds the word a into the two state words x
 * and y, both of which are updated in place.  x and y are named several
 * times in the expansion and must be plain lvalues; a is evaluated once.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol64(x,12),\
        x += y,        y = rol64(y,45),\
        y *= 9                        )

/*
 * Reduce the two state words to a single 32-bit hash: x is multiplied by
 * GOLDEN_RATIO_64 and xored into y, the result is multiplied by
 * GOLDEN_RATIO_64 again, and the upper 32 bits are returned.  This has to
 * be fast, but latency matters less than in the mixing step since a fair
 * amount of work is done before the hash value gets used.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        unsigned long folded = (y ^ (x * GOLDEN_RATIO_64)) * GOLDEN_RATIO_64;

        return folded >> 32;
}

#else        /* 32-bit case */

/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * Usage: HASH_MIX(x, y, a) folds the word a into the two state words x
 * and y, both of which are updated in place.  x and y are named several
 * times in the expansion and must be plain lvalues; a is evaluated once.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol32(x, 7),\
        x += y,        y = rol32(y,20),\
        y *= 9                        )

/*
 * Reduce the two state words to a single 32-bit hash by running x through
 * __hash_32(), xoring the result into y and running that through
 * __hash_32() as well.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        /* Use arch-optimized multiply if one exists */
        unsigned long mixed = y ^ __hash_32(x);

        return __hash_32(mixed);
}

#endif

/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long a, x = 0, y = (unsigned long)salt;

        /* whole words go through HASH_MIX(), a partial last word is masked */
        while (len) {
                a = load_unaligned_zeropad(name);
                if (len < sizeof(unsigned long)) {
                        /* final word: keep only the len payload bytes */
                        x ^= a & bytemask_from_count(len);
                        break;
                }
                HASH_MIX(x, y, a);
                name += sizeof(unsigned long);
                len -= sizeof(unsigned long);
        }
        return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Return the "hash_len" (hash and length) of a null-terminated string.
 *
 * The string is read a word at a time with load_unaligned_zeropad().
 * Words without a zero byte are mixed in whole; the word containing the
 * terminator contributes only the bytes in front of it.
 */
u64 hashlen_string(const void *salt, const char *name)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long x = 0, y = (unsigned long)salt;
        unsigned long a, adata, mask, len = 0;

        for (;;) {
                a = load_unaligned_zeropad(name+len);
                if (has_zero(a, &adata, &constants))
                        break;
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
        }

        /* a holds the terminator: work out which of its bytes are payload */
        adata = prep_zero_mask(a, adata, &constants);
        mask = create_zero_mask(adata);
        x ^= a & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);

/*
 * Calculate the length and hash of the path component, and
 * return the "hash_len" as the result.
 *
 * @salt: folded into the initial state (its pointer value seeds y).
 * @name: start of the component; it ends at the first NUL or '/' byte.
 *
 * The name is read a word at a time with load_unaligned_zeropad().
 */
static inline u64 hash_name(const void *salt, const char *name)
{
        unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
        unsigned long adata, bdata, mask, len;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

        len = 0;
        /* enter the loop at the load: there is no previous word to mix yet */
        goto inside;

        do {
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
inside:
                a = load_unaligned_zeropad(name+len);
                /* b has a zero byte exactly where a has a '/' */
                b = a ^ REPEAT_BYTE('/');
                /*
                 * Bitwise |, not ||: both has_zero() calls have to run so
                 * that adata and bdata are both filled in for use below.
                 */
        } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

        /* locate the first NUL or '/' in the last word, mix in what precedes it */
        adata = prep_zero_mask(a, adata, &constants);
        bdata = prep_zero_mask(b, bdata, &constants);
        mask = create_zero_mask(adata | bdata);
        x ^= a & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}

#else        /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */

/*
 * Return the hash of a string of known length, feeding it to
 * partial_name_hash() one byte (taken as unsigned char) at a time.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        const unsigned char *bytes = (const unsigned char *)name;
        unsigned long hash = init_name_hash(salt);
        unsigned int i;

        for (i = 0; i < len; i++)
                hash = partial_name_hash(bytes[i], hash);

        return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Return the "hash_len" (hash and length) of a null-terminated string,
 * hashing it one byte at a time.  An empty string gives length 0 and the
 * hash of the untouched initial state.
 */
u64 hashlen_string(const void *salt, const char *name)
{
        unsigned long hash = init_name_hash(salt);
        unsigned long len;

        for (len = 0; name[len]; len++)
                hash = partial_name_hash((unsigned char)name[len], hash);

        return hashlen_create(end_name_hash(hash), len);
}
EXPORT_SYMBOL(hashlen_string);

/*
 * Hash one path component and return its "hash_len".
 *
 * We know there's a real path component here of at least one character,
 * so the first byte is hashed unconditionally; after that we stop at the
 * first NUL or '/'.
 */
static inline u64 hash_name(const void *salt, const char *name)
{
        unsigned long hash = init_name_hash(salt);
        unsigned long len = 0;
        unsigned char c = *name;

        for (;;) {
                hash = partial_name_hash(c, hash);
                c = name[++len];
                if (!c || c == '/')
                        break;
        }
        return hashlen_create(end_name_hash(hash), len);
}

#endif

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * @name: pathname (or symlink body) to walk; may be an ERR_PTR(), which
 *        is returned as an error.
 * @nd:   walk state.  Runs with LOOKUP_PARENT set in nd->flags.
 *
 * Every component except the last one of the outermost name is walked.
 * When that last component is reached, it is left in nd->last /
 * nd->last_type, nd->dir_uid and nd->dir_mode are taken from nd->inode,
 * LOOKUP_PARENT is cleared and 0 is returned.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
        int depth = 0; // depth <= nd->depth
        int err;

        nd->last_type = LAST_ROOT;
        nd->flags |= LOOKUP_PARENT;
        if (IS_ERR(name))
                return PTR_ERR(name);
        /* leading slashes carry no component */
        while (*name=='/')
                name++;
        /* nothing but slashes: last_type stays LAST_ROOT */
        if (!*name)
                return 0;

        /* At this point we know we have a real path component. */
        for(;;) {
                const char *link;
                u64 hash_len;
                int type;

                /* need search permission on the directory we are in */
                err = may_lookup(nd);
                if (err)
                        return err;

                hash_len = hash_name(nd->path.dentry, name);

                /* classify: "." / ".." / anything else */
                type = LAST_NORM;
                if (name[0] == '.') switch (hashlen_len(hash_len)) {
                        case 2:
                                if (name[1] == '.') {
                                        type = LAST_DOTDOT;
                                        nd->flags |= LOOKUP_JUMPED;
                                }
                                break;
                        case 1:
                                type = LAST_DOT;
                }
                if (likely(type == LAST_NORM)) {
                        struct dentry *parent = nd->path.dentry;
                        nd->flags &= ~LOOKUP_JUMPED;
                        /* the parent supplies its own hash: let it redo ours */
                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
                                struct qstr this = { { .hash_len = hash_len }, .name = name };
                                err = parent->d_op->d_hash(parent, &this);
                                if (err < 0)
                                        return err;
                                hash_len = this.hash_len;
                                name = this.name;
                        }
                }

                /* record the component for walk_component() and the caller */
                nd->last.hash_len = hash_len;
                nd->last.name = name;
                nd->last_type = type;

                name += hashlen_len(hash_len);
                if (!*name)
                        goto OK;
                /*
                 * If it wasn't NUL, we know it was '/'. Skip that
                 * slash, and continue until no more slashes.
                 */
                do {
                        name++;
                } while (unlikely(*name == '/'));
                if (unlikely(!*name)) {
OK:
                        /* pathname or trailing symlink, done */
                        if (!depth) {
                                nd->dir_uid = nd->inode->i_uid;
                                nd->dir_mode = nd->inode->i_mode;
                                nd->flags &= ~LOOKUP_PARENT;
                                return 0;
                        }
                        /* last component of nested symlink */
                        /* resume with the name saved when that link was entered */
                        name = nd->stack[--depth].name;
                        link = walk_component(nd, 0);
                } else {
                        /* not the last component */
                        link = walk_component(nd, WALK_MORE);
                }
                if (unlikely(link)) {
                        if (IS_ERR(link))
                                return PTR_ERR(link);
                        /* a symlink to follow */
                        /* save the rest of the current name, walk the link body */
                        nd->stack[depth++].name = name;
                        name = link;
                        continue;
                }
                /* more components follow, so this one must be searchable */
                if (unlikely(!d_can_lookup(nd->path.dentry))) {
                        if (nd->flags & LOOKUP_RCU) {
                                if (!try_to_unlazy(nd))
                                        return -ECHILD;
                        }
                        return -ENOTDIR;
                }
        }
}

/*
 * Prepare @nd for walking nd->name and choose the starting point of the walk.
 *
 * Returns the pathname string to walk, or an ERR_PTR() on failure.
 * Must be paired with terminate_walk(); the callers in this file call
 * terminate_walk() even when an ERR_PTR() was returned.
 */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
        int error;
        const char *s = nd->name->name;

        /* LOOKUP_CACHED requires RCU, ask caller to retry */
        if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
                return ERR_PTR(-EAGAIN);

        /* An empty pathname is never handled in RCU mode. */
        if (!*s)
                flags &= ~LOOKUP_RCU;
        if (flags & LOOKUP_RCU)
                rcu_read_lock();

        nd->flags = flags | LOOKUP_JUMPED;
        nd->depth = 0;

        /*
         * Snapshot the mount_lock and rename_lock sequence counts; the read
         * barrier orders these loads before the loads done below.
         */
        nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
        nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
        smp_rmb();

        /* Caller supplied nd->root: the walk starts there. */
        if (flags & LOOKUP_ROOT) {
                struct dentry *root = nd->root.dentry;
                struct inode *inode = root->d_inode;
                /* A non-empty name needs a starting point we can look up in. */
                if (*s && unlikely(!d_can_lookup(root)))
                        return ERR_PTR(-ENOTDIR);
                nd->path = nd->root;
                nd->inode = inode;
                if (flags & LOOKUP_RCU) {
                        /* RCU mode: record the sequence count, take no reference. */
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                        nd->root_seq = nd->seq;
                } else {
                        path_get(&nd->path);
                }
                return s;
        }

        nd->root.mnt = NULL;

        /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
        if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                return s;
        }

        /* Relative pathname -- get the starting-point it is relative to. */
        if (nd->dfd == AT_FDCWD) {
                if (flags & LOOKUP_RCU) {
                        struct fs_struct *fs = current->fs;
                        unsigned seq;

                        /*
                         * Re-read until fs->seq did not change underneath us,
                         * so pwd, its inode and its d_seq belong together.
                         */
                        do {
                                seq = read_seqcount_begin(&fs->seq);
                                nd->path = fs->pwd;
                                nd->inode = nd->path.dentry->d_inode;
                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                        } while (read_seqcount_retry(&fs->seq, seq));
                } else {
                        get_fs_pwd(current->fs, &nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
        } else {
                /* Caller must check execute permissions on the starting path component */
                struct fd f = fdget_raw(nd->dfd);
                struct dentry *dentry;

                if (!f.file)
                        return ERR_PTR(-EBADF);

                dentry = f.file->f_path.dentry;

                /* Only an empty name may start from something that is not a directory. */
                if (*s && unlikely(!d_can_lookup(dentry))) {
                        fdput(f);
                        return ERR_PTR(-ENOTDIR);
                }

                nd->path = f.file->f_path;
                if (flags & LOOKUP_RCU) {
                        nd->inode = nd->path.dentry->d_inode;
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                } else {
                        /* Take our own reference before the fd reference is dropped. */
                        path_get(&nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
                fdput(f);
        }

        /* For scoped-lookups we need to set the root to the dirfd as well. */
        if (flags & LOOKUP_IS_SCOPED) {
                nd->root = nd->path;
                if (flags & LOOKUP_RCU) {
                        nd->root_seq = nd->seq;
                } else {
                        /* Extra reference for nd->root, flagged in nd->flags. */
                        path_get(&nd->root);
                        nd->flags |= LOOKUP_ROOT_GRABBED;
                }
        }
        return s;
}

/*
 * Walk the final component recorded in @nd.  When a normal last component is
 * followed by more characters in the pathname, LOOKUP_FOLLOW and
 * LOOKUP_DIRECTORY are forced on first.
 */
static inline const char *lookup_last(struct nameidata *nd)
{
        if (nd->last_type == LAST_NORM) {
                /* the character right after the component; NUL means nothing follows */
                if (nd->last.name[nd->last.len] != '\0')
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        }

        return walk_component(nd, WALK_TRAILING);
}

/*
 * Run step_into() with WALK_NOFOLLOW on the dentry @nd currently points at.
 * Outside RCU mode an extra dentry reference is taken first.
 * Returns PTR_ERR() of whatever step_into() returned.
 */
static int handle_lookup_down(struct nameidata *nd)
{
        const char *res;

        if (!(nd->flags & LOOKUP_RCU))
                dget(nd->path.dentry);

        res = step_into(nd, WALK_NOFOLLOW,
                        nd->path.dentry, nd->inode, nd->seq);
        return PTR_ERR(res);
}

/*
 * Look up nd->name in full.
 *
 * Returns 0 on success, in which case nd->path has been copied to *@path and
 * cleared in @nd; returns a negative error otherwise.  terminate_walk() is
 * called on @nd in both cases.
 */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
        const char *s = path_init(nd, flags);
        int err;

        /* LOOKUP_DOWN: process the starting point itself before walking. */
        if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
                err = handle_lookup_down(nd);
                if (unlikely(err < 0))
                        s = ERR_PTR(err);
        }

        /*
         * Walk up to the last component, then the last component itself.
         * A non-NULL return from lookup_last() is a new pathname to walk.
         * NOTE(review): s may be an ERR_PTR() here; presumably
         * link_path_walk() returns that error -- its start is not visible.
         */
        while (!(err = link_path_walk(s, nd)) &&
               (s = lookup_last(nd)) != NULL)
                ;
        if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
                err = handle_lookup_down(nd);
                nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
        }
        if (!err)
                err = complete_walk(nd);

        /* LOOKUP_DIRECTORY: the result must be something we can look up in. */
        if (!err && nd->flags & LOOKUP_DIRECTORY)
                if (!d_can_lookup(nd->path.dentry))
                        err = -ENOTDIR;
        if (!err) {
                /* Hand the path to the caller so terminate_walk() leaves it alone. */
                *path = nd->path;
                nd->path.mnt = NULL;
                nd->path.dentry = NULL;
        }
        terminate_walk(nd);
        return err;
}

/*
 * Look up @name relative to @dfd (or to @root when one is given) and store
 * the result in *@path.
 *
 * The lookup is first tried with LOOKUP_RCU, repeated without it on -ECHILD
 * and repeated with LOOKUP_REVAL on -ESTALE.  @name is consumed with
 * putname() unless it is an ERR_PTR(), in which case its error is returned.
 * Returns 0 on success or a negative error.
 */
int filename_lookup(int dfd, struct filename *name, unsigned flags,
                    struct path *path, struct path *root)
{
        struct nameidata nd;
        int err;

        if (IS_ERR(name))
                return PTR_ERR(name);

        if (unlikely(root)) {
                flags |= LOOKUP_ROOT;
                nd.root = *root;
        }
        set_nameidata(&nd, dfd, name);

        err = path_lookupat(&nd, flags | LOOKUP_RCU, path);
        if (unlikely(err == -ECHILD))
                err = path_lookupat(&nd, flags, path);
        if (unlikely(err == -ESTALE))
                err = path_lookupat(&nd, flags | LOOKUP_REVAL, path);

        if (likely(!err)) {
                unsigned int aflags = 0;

                if (flags & LOOKUP_MOUNTPOINT)
                        aflags = AUDIT_INODE_NOEVAL;
                audit_inode(name, path->dentry, aflags);
        }

        restore_nameidata();
        putname(name);
        return err;
}

/*
 * Walk nd->name up to, but not including, its last component.
 *
 * Returns 0 on success, with nd->path copied to *@parent and cleared in @nd;
 * returns a negative error otherwise.  terminate_walk() runs in both cases.
 */
static int path_parentat(struct nameidata *nd, unsigned flags,
                                struct path *parent)
{
        const char *start = path_init(nd, flags);
        int err;

        err = link_path_walk(start, nd);
        if (err)
                goto out;

        err = complete_walk(nd);
        if (err)
                goto out;

        /* hand the path over to the caller */
        *parent = nd->path;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        terminate_walk(nd);
        return err;
}

/*
 * Resolve the parent directory of @name and report the last component.
 *
 * On success returns @name (still owned by the caller), with *@parent,
 * *@last and *@type filled in.  On failure @name has been released with
 * putname() and an ERR_PTR() is returned.  An ERR_PTR() passed in as @name
 * is returned unchanged.
 */
static struct filename *filename_parentat(int dfd, struct filename *name,
                                unsigned int flags, struct path *parent,
                                struct qstr *last, int *type)
{
        struct nameidata nd;
        int err;

        if (IS_ERR(name))
                return name;

        set_nameidata(&nd, dfd, name);

        /* RCU first, then plain, then with revalidation forced */
        err = path_parentat(&nd, flags | LOOKUP_RCU, parent);
        if (unlikely(err == -ECHILD))
                err = path_parentat(&nd, flags, parent);
        if (unlikely(err == -ESTALE))
                err = path_parentat(&nd, flags | LOOKUP_REVAL, parent);

        if (unlikely(err)) {
                putname(name);
                name = ERR_PTR(err);
        } else {
                *last = nd.last;
                *type = nd.last_type;
                audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
        }

        restore_nameidata();
        return name;
}

/*
 * Look up @name and return the dentry of its last component with the parent
 * directory's inode locked (I_MUTEX_PARENT) and the parent stored in *@path.
 *
 * On failure an ERR_PTR() is returned, the parent is not locked and the
 * reference in *@path has been dropped.  A last component that is not
 * LAST_NORM yields -EINVAL.
 */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
        struct filename *filename;
        struct dentry *res;
        struct qstr last;
        int type;

        filename = getname_kernel(name);
        filename = filename_parentat(AT_FDCWD, filename, 0, path,
                                     &last, &type);
        if (IS_ERR(filename))
                return ERR_CAST(filename);

        if (unlikely(type != LAST_NORM)) {
                path_put(path);
                res = ERR_PTR(-EINVAL);
                goto out_putname;
        }

        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        res = __lookup_hash(&last, path->dentry, 0);
        if (IS_ERR(res)) {
                /* undo the lock and the parent reference on failure */
                inode_unlock(path->dentry->d_inode);
                path_put(path);
        }

out_putname:
        putname(filename);
        return res;
}

/*
 * Look up the kernel-space pathname @name relative to the current working
 * directory (AT_FDCWD) and store the result in *@path.
 * Returns whatever filename_lookup() returns.
 */
int kern_path(const char *name, unsigned int flags, struct path *path)
{
        struct filename *fname = getname_kernel(name);

        return filename_lookup(AT_FDCWD, fname, flags, path, NULL);
}
EXPORT_SYMBOL(kern_path);

/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
 * @path: pointer to struct path to fill
 *
 * Return: the result of filename_lookup(): 0 on success with @path filled
 * in, a negative error otherwise.
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct path *path)
{
        struct path root = { .mnt = mnt, .dentry = dentry };
        struct filename *fname = getname_kernel(name);

        /* with a root given, the dfd argument of filename_lookup() is ignored */
        return filename_lookup(AT_FDCWD, fname, flags, path, &root);
}
EXPORT_SYMBOL(vfs_path_lookup);

/*
 * Shared front end of the lookup_one_len() family: fill @this from
 * @name/@len, validate the component and check MAY_EXEC on @base.
 *
 * Rejected with -EACCES: an empty name, "." and "..", and any name that
 * contains '/' or a NUL byte within @len.  Returns 0 when the lookup may
 * proceed, otherwise a negative error (possibly from ->d_hash() or
 * inode_permission()).
 */
static int lookup_one_len_common(const char *name, struct dentry *base,
                                 int len, struct qstr *this)
{
        const unsigned char *p = (const unsigned char *)name;
        int left = len;

        this->name = name;
        this->len = len;
        this->hash = full_name_hash(base, name, len);

        if (len == 0)
                return -EACCES;

        /* "." and ".." are not acceptable components */
        if (unlikely(name[0] == '.')) {
                if (len < 2)
                        return -EACCES;
                if (len == 2 && name[1] == '.')
                        return -EACCES;
        }

        /* a single component must not contain '/' or an embedded NUL */
        while (left-- != 0) {
                unsigned int c = *p++;

                if (c == '\0' || c == '/')
                        return -EACCES;
        }

        /*
         * See if the low-level filesystem might want
         * to use its own hash..
         */
        if (base->d_flags & DCACHE_OP_HASH) {
                int rc = base->d_op->d_hash(base, this);

                if (rc < 0)
                        return rc;
        }

        return inode_permission(base->d_inode, MAY_EXEC);
}

/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct qstr this;
        int err;

        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_len_common(name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        return lookup_dcache(&this, base, 0);
}
EXPORT_SYMBOL(try_lookup_one_len);

/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct dentry *dentry;
        struct qstr this;
        int err;

        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_len_common(name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        dentry = lookup_dcache(&this, base, 0);
        return dentry ? dentry : __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_len);

/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        struct qstr this;
        int err;
        struct dentry *ret;

        err = lookup_one_len_common(name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        ret = lookup_dcache(&this, base, 0);
        if (!ret)
                ret = lookup_slow(&this, base, 0);
        return ret;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

/*
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        struct dentry *found = lookup_one_len_unlocked(name, base, len);

        /* errors are passed through unchanged */
        if (IS_ERR(found))
                return found;

        /* positive dentry: hand it to the caller */
        if (!d_flags_negative(smp_load_acquire(&found->d_flags)))
                return found;

        /* negative dentry: drop it and report -ENOENT instead */
        dput(found);
        return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL(lookup_positive_unlocked);

#ifdef CONFIG_UNIX98_PTYS
/*
 * Redirect @path to whatever is mounted on the "pts" entry that lives in
 * the same directory as the dentry @path currently points at.
 *
 * Returns 0 on success and -ENOENT otherwise.  Note that @path is modified
 * on one of the failure paths as well: once the parent is known to be
 * connected, path->dentry is switched to the parent before "pts" is looked
 * up, and it stays that way if the lookup fails.
 */
int path_pts(struct path *path)
{
        /* Find something mounted on "pts" in the same directory as
         * the input path.
         */
        struct dentry *parent = dget_parent(path->dentry);
        struct dentry *child;
        struct qstr this = QSTR_INIT("pts", 3);

        /* Parent not connected on this mount: leave @path untouched. */
        if (unlikely(!path_connected(path->mnt, parent))) {
                dput(parent);
                return -ENOENT;
        }
        /* @path takes over the parent reference in place of the old dentry. */
        dput(path->dentry);
        path->dentry = parent;
        child = d_hash_and_lookup(parent, &this);
        if (IS_ERR_OR_NULL(child))
                return -ENOENT;

        /* Switch @path to the child and release the parent. */
        path->dentry = child;
        dput(parent);
        follow_down(path);
        return 0;
}
#endif

/*
 * Look up the user-space pathname @name relative to @dfd and store the
 * result in *@path.  @empty is passed through to getname_flags().
 * Returns whatever filename_lookup() returns.
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
{
        struct filename *fname = getname_flags(name, flags, empty);

        return filename_lookup(dfd, fname, flags, path, NULL);
}
EXPORT_SYMBOL(user_path_at_empty);

/*
 * Returns 0 when the caller's fsuid owns @inode or @dir, or when the caller
 * has CAP_FOWNER with respect to @inode; returns non-zero otherwise.
 */
int __check_sticky(struct inode *dir, struct inode *inode)
{
        kuid_t fsuid = current_fsuid();

        /* owner of the victim or of the directory */
        if (uid_eq(inode->i_uid, fsuid) || uid_eq(dir->i_uid, fsuid))
                return 0;

        return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
}
EXPORT_SYMBOL(__check_sticky);

/*
 *        Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *        a. be owner of dir, or
 *        b. be owner of victim, or
 *        c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 *
 * Returns 0 if the removal is allowed, a negative error otherwise.  The
 * checks run in the order written below, which decides which error is
 * reported when several conditions apply.
 */
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
{
        struct inode *inode = d_backing_inode(victim);
        int error;

        /* Nothing to remove. */
        if (d_is_negative(victim))
                return -ENOENT;
        BUG_ON(!inode);

        /* The victim must be a child of the directory we were given. */
        BUG_ON(victim->d_parent->d_inode != dir);

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
                return -EOVERFLOW;

        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
        if (IS_APPEND(dir))
                return -EPERM;

        /* Sticky-directory rule and per-inode restrictions on the victim. */
        if (check_sticky(dir, inode) || IS_APPEND(inode) ||
            IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
                return -EPERM;
        /* The victim's type must match what the caller asked to remove. */
        if (isdir) {
                if (!d_is_dir(victim))
                        return -ENOTDIR;
                if (IS_ROOT(victim))
                        return -EBUSY;
        } else if (d_is_dir(victim))
                return -EISDIR;
        if (IS_DEADDIR(dir))
                return -ENOENT;
        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
                return -EBUSY;
        return 0;
}

/*        Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
 */
static inline int may_create(struct inode *dir, struct dentry *child)
{
        struct user_namespace *fs_userns;

        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);

        /* the name is already taken */
        if (child->d_inode)
                return -EEXIST;

        if (IS_DEADDIR(dir))
                return -ENOENT;

        /* the caller's fsuid and fsgid must both map into the fs's user namespace */
        fs_userns = dir->i_sb->s_user_ns;
        if (!kuid_has_mapping(fs_userns, current_fsuid()))
                return -EOVERFLOW;
        if (!kgid_has_mapping(fs_userns, current_fsgid()))
                return -EOVERFLOW;

        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}

/*
 * Lock the two directories involved in a rename.
 *
 * p1 and p2 should be directories on the same fs.
 *
 * If p1 == p2 only that inode is locked and NULL is returned; the per-sb
 * s_vfs_rename_mutex is not taken in that case.  Otherwise
 * s_vfs_rename_mutex is taken first and both inodes are locked, in an order
 * chosen from the result of d_ancestor().  The return value is the result
 * of the d_ancestor() call that decided the order (NULL when neither call
 * found anything).  Undo with unlock_rename().
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
        struct dentry *p;

        if (p1 == p2) {
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                return NULL;
        }

        mutex_lock(&p1->d_sb->s_vfs_rename_mutex);

        /*
         * NOTE(review): d_ancestor() is not visible here; presumably a
         * non-NULL result means its first argument is an ancestor of the
         * second.  In that case p2 is locked before p1.
         */
        p = d_ancestor(p2, p1);
        if (p) {
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
                return p;
        }

        /* Otherwise p1 is locked first, whatever this d_ancestor() returns. */
        p = d_ancestor(p1, p2);
        inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
        inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
        return p;
}
EXPORT_SYMBOL(lock_rename);

/*
 * Release what lock_rename() took: the inode lock(s) and, when two distinct
 * directories were involved, the per-sb s_vfs_rename_mutex.
 */
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
        inode_unlock(p1->d_inode);

        /* same directory: only one lock was taken, and no rename mutex */
        if (p1 == p2)
                return;

        inode_unlock(p2->d_inode);
        mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
}
EXPORT_SYMBOL(unlock_rename);

/**
 * mode_strip_umask - handle vfs umask stripping
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode to be created in @dir
 *
 * Umask stripping depends on whether or not the filesystem supports POSIX
 * ACLs. If the filesystem doesn't support it umask stripping is done directly
 * in here. If the filesystem does support POSIX ACLs umask stripping is
 * deferred until the filesystem calls posix_acl_create().
 *
 * Returns: mode
 */
static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
{
        /* with POSIX ACLs the mode is returned untouched here */
        if (IS_POSIXACL(dir))
                return mode;

        return mode & ~current_umask();
}

/**
 * vfs_prepare_mode - prepare the mode to be used for a new inode
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode
 * @mask_perms:        allowed permission by the vfs
 * @type:        type of file to be created
 *
 * This helper consolidates and enforces vfs restrictions on the @mode of a new
 * object to be created.
 *
 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
 * the kernel documentation for mode_strip_umask()). Moving umask stripping
 * after setgid stripping allows the same ordering for both non-POSIX ACL and
 * POSIX ACL supporting filesystems.
 *
 * Note that it's currently valid for @type to be 0 if a directory is created.
 * Filesystems raise that flag individually and we need to check whether each
 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
 * non-zero type.
 *
 * Returns: mode to be passed to the filesystem
 */
static inline umode_t vfs_prepare_mode(const struct inode *dir, umode_t mode,
                                       umode_t mask_perms, umode_t type)
{
        /* setgid stripping first, umask stripping second */
        umode_t prepared = mode_strip_umask(dir, mode_strip_sgid(dir, mode));

        /*
         * Keep only the permission bits the vfs allows, then set the file
         * type before the mode is handed to the filesystem.
         */
        prepared &= (mask_perms & ~S_IFMT);
        prepared |= (type & S_IFMT);

        return prepared;
}

/*
 * Create a regular file for @dentry in directory @dir.
 *
 * Checks may_create(), requires a ->create() inode operation, prepares the
 * mode with vfs_prepare_mode() and asks the security hook before calling the
 * filesystem.  fsnotify_create() is called on success.
 * Returns 0 on success or a negative error.
 */
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                bool want_excl)
{
        int err;

        err = may_create(dir, dentry);
        if (err)
                return err;

        if (!dir->i_op->create)
                return -EACCES;        /* shouldn't it be ENOSYS? */

        /* regular file, permission bits limited to S_IALLUGO */
        mode = vfs_prepare_mode(dir, mode, S_IALLUGO, S_IFREG);

        err = security_inode_create(dir, dentry, mode);
        if (err)
                return err;

        err = dir->i_op->create(dir, dentry, mode, want_excl);
        if (err == 0)
                fsnotify_create(dir, dentry);

        return err;
}
EXPORT_SYMBOL(vfs_create);

/*
 * Create a regular-file object for @dentry in its parent directory, using
 * the caller-supplied callback @f (called with @dentry, the final mode and
 * @arg) instead of the directory's ->create().
 *
 * The mode is reduced to S_IALLUGO and marked S_IFREG.  fsnotify_create()
 * is called when @f succeeds.  Returns 0 on success or a negative error.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
                int (*f)(struct dentry *, umode_t, void *),
                void *arg)
{
        struct inode *dir = dentry->d_parent->d_inode;
        int err;

        err = may_create(dir, dentry);
        if (err)
                return err;

        mode = (mode & S_IALLUGO) | S_IFREG;

        err = security_inode_create(dir, dentry, mode);
        if (err)
                return err;

        err = f(dentry, mode, arg);
        if (err == 0)
                fsnotify_create(dir, dentry);

        return err;
}
EXPORT_SYMBOL(vfs_mkobj);

/*
 * Returns true unless the mount (MNT_NODEV) or its superblock (SB_I_NODEV)
 * is flagged as no-device.
 */
bool may_open_dev(const struct path *path)
{
        if (path->mnt->mnt_flags & MNT_NODEV)
                return false;
        if (path->mnt->mnt_sb->s_iflags & SB_I_NODEV)
                return false;
        return true;
}

/*
 * Check whether the object at @path may be opened with access mode
 * @acc_mode (MAY_* bits) and open flags @flag (O_* bits).
 *
 * Returns 0 if the open is allowed, a negative error otherwise.
 */
static int may_open(const struct path *path, int acc_mode, int flag)
{
        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
        int error;

        if (!inode)
                return -ENOENT;

        /* Restrictions that depend on the type of the object. */
        switch (inode->i_mode & S_IFMT) {
        case S_IFLNK:
                return -ELOOP;
        case S_IFDIR:
                if (acc_mode & MAY_WRITE)
                        return -EISDIR;
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
                break;
        case S_IFBLK:
        case S_IFCHR:
                if (!may_open_dev(path))
                        return -EACCES;
                fallthrough;
        case S_IFIFO:
        case S_IFSOCK:
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
                /*
                 * O_TRUNC is dropped from the local copy of the flags for
                 * devices, FIFOs and sockets, so the append-only check below
                 * does not see it.
                 */
                flag &= ~O_TRUNC;
                break;
        case S_IFREG:
                if ((acc_mode & MAY_EXEC) && path_noexec(path))
                        return -EACCES;
                break;
        }

        error = inode_permission(inode, MAY_OPEN | acc_mode);
        if (error)
                return error;

        /*
         * An append-only file must be opened in append mode for writing.
         */
        if (IS_APPEND(inode)) {
                if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
                        return -EPERM;
                if (flag & O_TRUNC)
                        return -EPERM;
        }

        /* O_NOATIME can only be set by the owner or superuser */
        if (flag & O_NOATIME && !inode_owner_or_capable(inode))
                return -EPERM;

        return 0;
}

/*
 * Truncate the file behind @filp to length 0.
 *
 * Write access to the inode is held for the duration of the call.
 * Returns 0 on success or a negative error.
 */
static int handle_truncate(struct file *filp)
{
        const struct path *path = &filp->f_path;
        struct inode *inode = path->dentry->d_inode;
        int err;

        err = get_write_access(inode);
        if (err)
                return err;

        /*
         * Refuse to truncate files with mandatory locks held on them.
         */
        err = locks_verify_locked(filp);
        if (err)
                goto out_put;

        err = security_path_truncate(path);
        if (err)
                goto out_put;

        err = do_truncate(path->dentry, 0,
                          ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
                          filp);
out_put:
        put_write_access(inode);
        return err;
}

/*
 * Map open flags for use by the namei code: an access mode whose O_ACCMODE
 * bits are all set (value 3) is reduced by one; every other value is
 * returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
        return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}

/*
 * Permission checks for creating @dentry with @mode in directory @dir on
 * behalf of an O_CREAT open.
 *
 * Returns 0 if creation is allowed, a negative error otherwise.
 */
static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
{
        struct dentry *parent = dir->dentry;
        struct user_namespace *fs_userns;
        int err;

        err = security_path_mknod(dir, dentry, mode, 0);
        if (err)
                return err;

        /* the caller's fsuid and fsgid must both map into the fs's user namespace */
        fs_userns = parent->d_sb->s_user_ns;
        if (!kuid_has_mapping(fs_userns, current_fsuid()))
                return -EOVERFLOW;
        if (!kgid_has_mapping(fs_userns, current_fsgid()))
                return -EOVERFLOW;

        err = inode_permission(parent->d_inode, MAY_WRITE | MAY_EXEC);
        if (err)
                return err;

        return security_inode_create(parent->d_inode, dentry, mode);
}

/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry, using the directory's ->atomic_open() method.
 *
 * Returns the dentry to continue with on success, or an ERR_PTR() on
 * failure.  The returned dentry may differ from @dentry; in that case the
 * reference on @dentry has been dropped.  On failure no dentry reference is
 * left with the caller.
 *
 * If the filesystem opened the file, FMODE_OPENED is set in file->f_mode.
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set and the caller will need to perform the open themselves.  In that
 * case the returned dentry is never negative: a negative result is turned
 * into -ENOENT here.
 */
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
                                  struct file *file,
                                  int open_flag, umode_t mode)
{
        /* Poison value used to detect whether file->f_path.dentry was set. */
        struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
        struct inode *dir =  nd->path.dentry->d_inode;
        int error;

        if (nd->flags & LOOKUP_DIRECTORY)
                open_flag |= O_DIRECTORY;

        file->f_path.dentry = DENTRY_NOT_SET;
        file->f_path.mnt = nd->path.mnt;
        error = dir->i_op->atomic_open(dir, dentry, file,
                                       open_to_namei_flags(open_flag), mode);
        d_lookup_done(dentry);
        if (!error) {
                if (file->f_mode & FMODE_OPENED) {
                        /* Opened: continue with the dentry the file refers to. */
                        if (unlikely(dentry != file->f_path.dentry)) {
                                dput(dentry);
                                dentry = dget(file->f_path.dentry);
                        }
                } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
                        /* Neither opened nor f_path.dentry updated. */
                        error = -EIO;
                } else {
                        /* Not opened: a non-NULL f_path.dentry replaces ours. */
                        if (file->f_path.dentry) {
                                dput(dentry);
                                dentry = file->f_path.dentry;
                        }
                        if (unlikely(d_is_negative(dentry)))
                                error = -ENOENT;
                }
        }
        if (error) {
                dput(dentry);
                dentry = ERR_PTR(error);
        }
        return dentry;
}

/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns the dentry of the last component on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case the returned dentry might be negative if O_CREAT
 * hadn't been specified.
 *
 * An ERR_PTR() is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
                                  const struct open_flags *op,
                                  bool got_write)
{
        struct dentry *dir = nd->path.dentry;
        struct inode *dir_inode = dir->d_inode;
        int open_flag = op->open_flag;
        struct dentry *dentry;
        int error, create_error = 0;
        umode_t mode = op->mode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        if (unlikely(IS_DEADDIR(dir_inode)))
                return ERR_PTR(-ENOENT);

        file->f_mode &= ~FMODE_CREATED;
        dentry = d_lookup(dir, &nd->last);
        /*
         * Find or allocate the dentry.  An existing dentry that is not in
         * lookup is revalidated: > 0 means usable, < 0 is an error, and 0
         * means it is invalidated and the loop retries with a fresh one.
         */
        for (;;) {
                if (!dentry) {
                        dentry = d_alloc_parallel(dir, &nd->last, &wq);
                        if (IS_ERR(dentry))
                                return dentry;
                }
                if (d_in_lookup(dentry))
                        break;

                error = d_revalidate(dentry, nd->flags);
                if (likely(error > 0))
                        break;
                if (error)
                        goto out_dput;
                d_invalidate(dentry);
                dput(dentry);
                dentry = NULL;
        }
        if (dentry->d_inode) {
                /* Cached positive dentry: will open in f_op->open */
                return dentry;
        }

        /*
         * Checking write permission is tricky, because we don't know if we are
         * going to actually need it: O_CREAT opens should work as long as the
         * file exists.  But checking existence breaks atomicity.  The trick is
         * to check access and if not granted clear O_CREAT from the flags.
         *
         * Another problem is returning the "right" error value (e.g. for an
         * O_EXCL open we want to return EEXIST not EROFS).
         */
        if (unlikely(!got_write))
                open_flag &= ~O_TRUNC;
        if (open_flag & O_CREAT) {
                if (open_flag & O_EXCL)
                        open_flag &= ~O_TRUNC;
                mode = vfs_prepare_mode(dir->d_inode, mode, mode, mode);
                if (likely(got_write))
                        create_error = may_o_create(&nd->path, dentry, mode);
                else
                        create_error = -EROFS;
        }
        /* Creation not permitted: carry on as a plain lookup, keep the error. */
        if (create_error)
                open_flag &= ~O_CREAT;
        if (dir_inode->i_op->atomic_open) {
                dentry = atomic_open(nd, dentry, file, open_flag, mode);
                /* Report the create error rather than a bare -ENOENT. */
                if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
                        dentry = ERR_PTR(create_error);
                return dentry;
        }

        /* No ->atomic_open(): finish a pending lookup via ->lookup(). */
        if (d_in_lookup(dentry)) {
                struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
                                                             nd->flags);
                d_lookup_done(dentry);
                if (unlikely(res)) {
                        if (IS_ERR(res)) {
                                error = PTR_ERR(res);
                                goto out_dput;
                        }
                        /* ->lookup() returned a different dentry; use that one. */
                        dput(dentry);
                        dentry = res;
                }
        }

        /* Negative dentry, just create the file */
        if (!dentry->d_inode && (open_flag & O_CREAT)) {
                file->f_mode |= FMODE_CREATED;
                audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
                if (!dir_inode->i_op->create) {
                        error = -EACCES;
                        goto out_dput;
                }
                error = dir_inode->i_op->create(dir_inode, dentry, mode,
                                                open_flag & O_EXCL);
                if (error)
                        goto out_dput;
        }
        /* Creation was wanted but refused, and the file does not exist. */
        if (unlikely(create_error) && !dentry->d_inode) {
                error = create_error;
                goto out_dput;
        }
        return dentry;

out_dput:
        dput(dentry);
        return ERR_PTR(error);
}

/*
 * Resolve the last path component for an open.
 *
 * op->intent is ORed into nd->flags first.  Then:
 *  - a last component that is not LAST_NORM is handed to handle_dots();
 *  - without O_CREAT, lookup_fast() is tried and a dentry found there goes
 *    straight to step_into();
 *  - otherwise lookup_open() is called with the parent directory inode
 *    locked (exclusively for O_CREAT, shared if not).
 *
 * Returns an ERR_PTR() on failure, or NULL / a string as produced by
 * handle_dots() or step_into().  path_openat() passes a non-NULL result
 * back to link_path_walk() and proceeds to do_open() once NULL is returned.
 */
static const char *open_last_lookups(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct dentry *dir = nd->path.dentry;
        int open_flag = op->open_flag;
        bool got_write = false;
        unsigned seq;
        struct inode *inode;
        struct dentry *dentry;
        const char *res;

        nd->flags |= op->intent;

        /* last component is not a normal name: let handle_dots() take it */
        if (nd->last_type != LAST_NORM) {
                if (nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }

        if (!(open_flag & O_CREAT)) {
                /*
                 * A non-NUL byte right after the last component (the same
                 * test the create side below calls "trailing slashes")
                 * forces LOOKUP_FOLLOW and LOOKUP_DIRECTORY.
                 */
                if (nd->last.name[nd->last.len])
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
                /* we _can_ be in RCU mode here */
                dentry = lookup_fast(nd, &inode, &seq);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
                if (likely(dentry))
                        goto finish_lookup;

                /* a NULL result from lookup_fast() must have left RCU mode */
                BUG_ON(nd->flags & LOOKUP_RCU);
        } else {
                /* create side of things */
                if (nd->flags & LOOKUP_RCU) {
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                }
                audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
                /* trailing slashes? */
                if (unlikely(nd->last.name[nd->last.len]))
                        return ERR_PTR(-EISDIR);
        }

        if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
                /* got_write records whether mnt_want_write() returned 0 */
                got_write = !mnt_want_write(nd->path.mnt);
                /*
                 * do _not_ fail yet - we might not need that or fail with
                 * a different error; let lookup_open() decide; we'll be
                 * dropping this one anyway.
                 */
        }
        /* exclusive directory lock when creating, shared lock otherwise */
        if (open_flag & O_CREAT)
                inode_lock(dir->d_inode);
        else
                inode_lock_shared(dir->d_inode);
        dentry = lookup_open(nd, file, op, got_write);
        /* notify about the creation while the directory is still locked */
        if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
                fsnotify_create(dir->d_inode, dentry);
        if (open_flag & O_CREAT)
                inode_unlock(dir->d_inode);
        else
                inode_unlock_shared(dir->d_inode);

        if (got_write)
                mnt_drop_write(nd->path.mnt);

        if (IS_ERR(dentry))
                return ERR_CAST(dentry);

        /* already opened or freshly created: make nd->path point at it */
        if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
                dput(nd->path.dentry);
                nd->path.dentry = dentry;
                return NULL;
        }

        /*
         * NOTE(review): inode and seq are only assigned by lookup_fast();
         * when arriving here from lookup_open() they are passed to
         * step_into() unassigned - presumably step_into() ignores them
         * outside RCU mode; confirm.
         */
finish_lookup:
        if (nd->depth)
                put_link(nd);
        res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
        /* a non-NULL result clears the open/create/excl intent flags */
        if (unlikely(res))
                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
        return res;
}

/*
 * Handle the last step of open()
 *
 * Called by path_openat() after open_last_lookups() has returned NULL.
 * Unless the file is already opened or created, the walk is completed
 * first.  The O_CREAT-specific checks, may_open(), vfs_open() (skipped when
 * FMODE_OPENED is already set), ima_file_check() and - for O_TRUNC on an
 * existing regular file - handle_truncate() follow in that order.
 *
 * Returns 0 on success or a negative errno.  A positive value from any of
 * the helpers triggers WARN_ON() and is replaced by -EINVAL.
 */
static int do_open(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        int open_flag = op->open_flag;
        bool do_truncate;
        int acc_mode;
        int error;

        if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
                error = complete_walk(nd);
                if (error)
                        return error;
        }
        /* the audit record is only emitted here for files we did not create */
        if (!(file->f_mode & FMODE_CREATED))
                audit_inode(nd->name, nd->path.dentry, 0);
        if (open_flag & O_CREAT) {
                /* O_EXCL demands that this very call created the file */
                if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
                        return -EEXIST;
                if (d_is_dir(nd->path.dentry))
                        return -EISDIR;
                error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
                                             d_backing_inode(nd->path.dentry));
                if (unlikely(error))
                        return error;
        }
        if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
                return -ENOTDIR;

        do_truncate = false;
        acc_mode = op->acc_mode;
        if (file->f_mode & FMODE_CREATED) {
                /* Don't check for write permission, don't truncate */
                open_flag &= ~O_TRUNC;
                acc_mode = 0;
        } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
                /* write access is held until the mnt_drop_write() below */
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        return error;
                do_truncate = true;
        }
        error = may_open(&nd->path, acc_mode, open_flag);
        if (!error && !(file->f_mode & FMODE_OPENED))
                error = vfs_open(&nd->path, file);
        /* note: IMA gets the original op->acc_mode, not the local acc_mode */
        if (!error)
                error = ima_file_check(file, op->acc_mode);
        if (!error && do_truncate)
                error = handle_truncate(file);
        /* positive values are not valid results; report and normalise */
        if (unlikely(error > 0)) {
                WARN_ON(1);
                error = -EINVAL;
        }
        /* pairs with the mnt_want_write() taken for truncation above */
        if (do_truncate)
                mnt_drop_write(nd->path.mnt);
        return error;
}

/*
 * Create an unnamed temporary file in the directory @dentry through the
 * directory's ->tmpfile() method.
 *
 * The directory must pass inode_permission() for MAY_WRITE | MAY_EXEC.
 * Unless O_EXCL is set in @open_flag, the new inode is marked I_LINKABLE.
 *
 * Returns the new child dentry, or an ERR_PTR(): the permission error,
 * -EOPNOTSUPP without ->tmpfile(), -ENOMEM if no dentry could be
 * allocated, the error from ->tmpfile(), or -ENOENT if the child ended up
 * without an inode.
 */
struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
{
        struct inode *dir = dentry->d_inode;
        struct dentry *child = NULL;
        struct inode *inode;
        int error;

        /* we want directory to be writable */
        error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
        if (error)
                goto out_err;

        if (!dir->i_op->tmpfile) {
                error = -EOPNOTSUPP;
                goto out_err;
        }

        child = d_alloc(dentry, &slash_name);
        if (unlikely(!child)) {
                error = -ENOMEM;
                goto out_err;
        }

        mode = vfs_prepare_mode(dir, mode, mode, mode);
        error = dir->i_op->tmpfile(dir, child, mode);
        if (error)
                goto out_err;

        inode = child->d_inode;
        if (unlikely(!inode)) {
                error = -ENOENT;
                goto out_err;
        }

        /* without O_EXCL the file may be given a name later */
        if (!(open_flag & O_EXCL)) {
                spin_lock(&inode->i_lock);
                inode->i_state |= I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
        ima_post_create_tmpfile(inode);
        return child;

out_err:
        /* child is still NULL on the early failures */
        dput(child);
        return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_tmpfile);

static int do_tmpfile(struct nameidata *nd, unsigned flags,
                const struct open_flags *op,
                struct file *file)
{
        struct dentry *child;
        struct path path;
        int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
        if (unlikely(error))
                return error;
        error = mnt_want_write(path.mnt);
        if (unlikely(error))
                goto out;
        child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
        error = PTR_ERR(child);
        if (IS_ERR(child))
                goto out2;
        dput(path.dentry);
        path.dentry = child;
        audit_inode(nd->name, child, 0);
        /* Don't check for other permissions, the inode was just created */
        error = may_open(&path, 0, op->open_flag);
        if (error)
                goto out2;
        file->f_path.mnt = path.mnt;
        error = finish_open(file, child, NULL);
out2:
        mnt_drop_write(path.mnt);
out:
        path_put(&path);
        return error;
}

static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
        struct path path;
        int error = path_lookupat(nd, flags, &path);
        if (!error) {
                audit_inode(nd->name, path.dentry, 0);
                error = vfs_open(&path, file);
                path_put(&path);
        }
        return error;
}

static struct file *path_openat(struct nameidata *nd,
                        const struct open_flags *op, unsigned flags)
{
        struct file *file;
        int error;

        file = alloc_empty_file(op->open_flag, current_cred());
        if (IS_ERR(file))
                return file;

        if (unlikely(file->f_flags & __O_TMPFILE)) {
                error = do_tmpfile(nd, flags, op, file);
        } else if (unlikely(file->f_flags & O_PATH)) {
                error = do_o_path(nd, flags, file);
        } else {
                const char *s = path_init(nd, flags);
                while (!(error = link_path_walk(s, nd)) &&
                       (s = open_last_lookups(nd, file, op)) != NULL)
                        ;
                if (!error)
                        error = do_open(nd, file, op);
                terminate_walk(nd);
        }
        if (likely(!error)) {
                if (likely(file->f_mode & FMODE_OPENED))
                        return file;
                WARN_ON(1);
                error = -EINVAL;
        }
        fput(file);
        if (error == -EOPENSTALE) {
                if (flags & LOOKUP_RCU)
                        error = -ECHILD;
                else
                        error = -ESTALE;
        }
        return ERR_PTR(error);
}

/*
 * Open @pathname relative to @dfd as described by @op.
 *
 * Up to three path_openat() attempts are made: first with LOOKUP_RCU, then
 * with the plain lookup flags if that returned -ECHILD, and finally with
 * LOOKUP_REVAL if the result is -ESTALE.
 *
 * Returns the opened file or an ERR_PTR().
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
                const struct open_flags *op)
{
        const int lookup = op->lookup_flags;
        struct nameidata nd;
        struct file *file;

        set_nameidata(&nd, dfd, pathname);

        file = path_openat(&nd, op, lookup | LOOKUP_RCU);
        if (unlikely(file == ERR_PTR(-ECHILD)))
                file = path_openat(&nd, op, lookup);
        if (unlikely(file == ERR_PTR(-ESTALE)))
                file = path_openat(&nd, op, lookup | LOOKUP_REVAL);

        restore_nameidata();
        return file;
}

/*
 * Open @name with @mnt/@dentry installed as the lookup root (LOOKUP_ROOT).
 *
 * Fails with -ELOOP when @dentry is a symlink and the intent includes
 * LOOKUP_OPEN.  Otherwise uses the same three-step path_openat() sequence
 * as do_filp_open(): LOOKUP_RCU first, plain flags on -ECHILD, and
 * LOOKUP_REVAL on -ESTALE.
 *
 * Returns the opened file or an ERR_PTR().
 */
struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
                const char *name, const struct open_flags *op)
{
        const int lookup = op->lookup_flags | LOOKUP_ROOT;
        struct filename *filename;
        struct nameidata nd;
        struct file *file;

        if (d_is_symlink(dentry) && (op->intent & LOOKUP_OPEN))
                return ERR_PTR(-ELOOP);

        filename = getname_kernel(name);
        if (IS_ERR(filename))
                return ERR_CAST(filename);

        /* the root is filled in before set_nameidata() is called */
        nd.root.mnt = mnt;
        nd.root.dentry = dentry;
        set_nameidata(&nd, -1, filename);

        file = path_openat(&nd, op, lookup | LOOKUP_RCU);
        if (unlikely(file == ERR_PTR(-ECHILD)))
                file = path_openat(&nd, op, lookup);
        if (unlikely(file == ERR_PTR(-ESTALE)))
                file = path_openat(&nd, op, lookup | LOOKUP_REVAL);

        restore_nameidata();
        putname(filename);
        return file;
}

/*
 * Look up the parent of @name and a dentry for its last component, for a
 * caller that is about to create that component.
 *
 * On success the returned dentry is negative, @path refers to the parent,
 * the parent's inode is locked (I_MUTEX_PARENT) and write access to
 * path->mnt is held; done_path_create() releases all of these.  On failure
 * an ERR_PTR() is returned and everything taken here has been released.
 *
 * Once filename_parentat() has succeeded, @name is dropped with putname()
 * on every return path.
 * NOTE(review): when filename_parentat() itself fails, its ERR_PTR result
 * is returned as is - presumably it has already disposed of @name; confirm.
 */
static struct dentry *filename_create(int dfd, struct filename *name,
                                struct path *path, unsigned int lookup_flags)
{
        /* result for a last component that is not LAST_NORM (see below) */
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct qstr last;
        int type;
        int err2;
        int error;
        /* remembered now because lookup_flags is masked just below */
        bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);

        /*
         * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
         * other flags passed in are ignored!
         */
        lookup_flags &= LOOKUP_REVAL;

        name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
        if (IS_ERR(name))
                return ERR_CAST(name);

        /*
         * Yucky last component or no last component at all?
         * (foo/., foo/.., /////)
         */
        if (unlikely(type != LAST_NORM))
                goto out;

        /* don't fail immediately if it's r/o, at least try to report other errors */
        err2 = mnt_want_write(path->mnt);
        /*
         * Do the final lookup.
         */
        lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path->dentry, lookup_flags);
        if (IS_ERR(dentry))
                goto unlock;

        error = -EEXIST;
        if (d_is_positive(dentry))
                goto fail;

        /*
         * Special case - lookup gave negative, but... we had foo/bar/
         * From the vfs_mknod() POV we just have a negative dentry -
         * all is fine. Let's be bastards - you had / on the end, you've
         * been asking for (non-existent) directory. -ENOENT for you.
         */
        if (unlikely(!is_dir && last.name[last.len])) {
                error = -ENOENT;
                goto fail;
        }
        /* only now report the deferred mnt_want_write() failure */
        if (unlikely(err2)) {
                error = err2;
                goto fail;
        }
        /* success: lock, write access and path stay with the caller */
        putname(name);
        return dentry;
fail:
        dput(dentry);
        dentry = ERR_PTR(error);
unlock:
        inode_unlock(path->dentry->d_inode);
        /* write access was only obtained if mnt_want_write() returned 0 */
        if (!err2)
                mnt_drop_write(path->mnt);
out:
        path_put(path);
        putname(name);
        return dentry;
}

/*
 * filename_create() for a kernel-space @pathname: the name is wrapped with
 * getname_kernel() and everything else is passed through unchanged.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name = getname_kernel(pathname);

        return filename_create(dfd, name, path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);

/*
 * Release what a successful filename_create() (kern_path_create() /
 * user_path_create()) left held: the reference on @dentry, the lock on the
 * parent directory inode, write access to path->mnt and the reference on
 * @path itself.
 */
void done_path_create(struct path *path, struct dentry *dentry)
{
        dput(dentry);
        inode_unlock(path->dentry->d_inode);
        mnt_drop_write(path->mnt);
        path_put(path);
}
EXPORT_SYMBOL(done_path_create);

/*
 * filename_create() for a user-space @pathname: the name is obtained with
 * getname() and everything else is passed through unchanged.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name = getname(pathname);

        return filename_create(dfd, name, path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);

/*
 * Create the node @dentry in directory @dir through ->mknod().
 *
 * Character and block devices require CAP_MKNOD, with one exception: a
 * character device whose number is WHITEOUT_DEV.  The device cgroup and
 * the security hook are consulted before the filesystem is called.
 *
 * Returns 0 on success (fsnotify_create() has been called) or a negative
 * errno; -EPERM covers both the missing capability and a missing ->mknod().
 */
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        int error;

        error = may_create(dir, dentry);
        if (error)
                return error;

        if (S_ISCHR(mode) || S_ISBLK(mode)) {
                bool whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;

                if (!whiteout && !capable(CAP_MKNOD))
                        return -EPERM;
        }

        if (!dir->i_op->mknod)
                return -EPERM;

        mode = vfs_prepare_mode(dir, mode, mode, mode);

        error = devcgroup_inode_mknod(mode, dev);
        if (error)
                return error;

        error = security_inode_mknod(dir, dentry, mode, dev);
        if (error)
                return error;

        error = dir->i_op->mknod(dir, dentry, mode, dev);
        if (error)
                return error;

        fsnotify_create(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_mknod);

/*
 * Check the file type bits of @mode for mknod().
 *
 * Returns 0 for regular files, character/block devices, FIFOs, sockets and
 * for a zero type (which translates to S_IFREG), -EPERM for directories and
 * -EINVAL for any other type.
 */
static int may_mknod(umode_t mode)
{
        const unsigned int type = mode & S_IFMT;

        if (type == S_IFDIR)
                return -EPERM;

        if (type == 0 ||        /* zero mode translates to S_IFREG */
            type == S_IFREG || type == S_IFCHR || type == S_IFBLK ||
            type == S_IFIFO || type == S_IFSOCK)
                return 0;

        return -EINVAL;
}

/*
 * Common implementation of mknod(2) and mknodat(2).
 *
 * After may_mknod() has validated the type bits of @mode, the target is
 * looked up with user_path_create() and created with vfs_create() (regular
 * file / zero type) or vfs_mknod() (devices, FIFOs, sockets).  The whole
 * sequence is repeated with LOOKUP_REVAL added whenever retry_estale()
 * asks for it.
 *
 * Returns 0 on success or a negative errno.
 */
static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
                unsigned int dev)
{
        unsigned int lookup_flags = 0;
        struct dentry *dentry;
        struct inode *dir;
        struct path path;
        int error;

        error = may_mknod(mode);
        if (error)
                return error;

        for (;;) {
                dentry = user_path_create(dfd, filename, &path, lookup_flags);
                if (IS_ERR(dentry))
                        return PTR_ERR(dentry);

                dir = path.dentry->d_inode;
                error = security_path_mknod(&path, dentry,
                                mode_strip_umask(dir, mode), dev);
                if (!error) {
                        switch (mode & S_IFMT) {
                        case 0:
                        case S_IFREG:
                                error = vfs_create(dir, dentry, mode, true);
                                if (!error)
                                        ima_post_path_mknod(dentry);
                                break;
                        case S_IFCHR:
                        case S_IFBLK:
                                error = vfs_mknod(dir, dentry, mode,
                                                new_decode_dev(dev));
                                break;
                        case S_IFIFO:
                        case S_IFSOCK:
                                error = vfs_mknod(dir, dentry, mode, 0);
                                break;
                        }
                }

                done_path_create(&path, dentry);
                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

/* mknodat system call: all arguments are forwarded to do_mknodat(). */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
                unsigned int, dev)
{
        return do_mknodat(dfd, filename, mode, dev);
}

/* mknod system call: do_mknodat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
        return do_mknodat(AT_FDCWD, filename, mode, dev);
}

/*
 * Create the directory @dentry in @dir through ->mkdir().
 *
 * Returns 0 on success (fsnotify_mkdir() has been called) or a negative
 * errno: the may_create() or security hook error, -EPERM without
 * ->mkdir(), -EMLINK when the superblock has a link limit and @dir has
 * reached it, or the filesystem's own error.
 */
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        unsigned max_links;
        int error;

        error = may_create(dir, dentry);
        if (error)
                return error;

        if (!dir->i_op->mkdir)
                return -EPERM;

        mode = vfs_prepare_mode(dir, mode, S_IRWXUGO | S_ISVTX, 0);
        error = security_inode_mkdir(dir, dentry, mode);
        if (error)
                return error;

        /* a zero s_max_links means no limit is enforced here */
        max_links = dir->i_sb->s_max_links;
        if (max_links && dir->i_nlink >= max_links)
                return -EMLINK;

        error = dir->i_op->mkdir(dir, dentry, mode);
        if (error)
                return error;

        fsnotify_mkdir(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_mkdir);

/*
 * Common implementation of mkdir(2) and mkdirat(2).
 *
 * The target is looked up with user_path_create() (LOOKUP_DIRECTORY),
 * checked by the security hook and created with vfs_mkdir().  The sequence
 * is repeated with LOOKUP_REVAL added whenever retry_estale() asks for it.
 *
 * Returns 0 on success or a negative errno.
 */
static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
{
        unsigned int lookup_flags = LOOKUP_DIRECTORY;
        struct dentry *dentry;
        struct inode *dir;
        struct path path;
        int error;

        for (;;) {
                dentry = user_path_create(dfd, pathname, &path, lookup_flags);
                if (IS_ERR(dentry))
                        return PTR_ERR(dentry);

                dir = path.dentry->d_inode;
                error = security_path_mkdir(&path, dentry,
                                mode_strip_umask(dir, mode));
                if (!error)
                        error = vfs_mkdir(dir, dentry, mode);

                done_path_create(&path, dentry);
                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

/* mkdirat system call: all arguments are forwarded to do_mkdirat(). */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
        return do_mkdirat(dfd, pathname, mode);
}

/* mkdir system call: do_mkdirat() with AT_FDCWD as the directory fd. */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
        return do_mkdirat(AT_FDCWD, pathname, mode);
}

/*
 * Remove the directory @dentry from its parent directory inode @dir.
 *
 * Returns 0 on success or a negative errno: the may_delete() error, -EPERM
 * if the filesystem has no ->rmdir(), -EBUSY if @dentry is a local
 * mountpoint, or the error from the security hook or from ->rmdir().
 */
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
        int error = may_delete(dir, dentry, 1);

        if (error)
                return error;

        if (!dir->i_op->rmdir)
                return -EPERM;

        /* extra reference for the duration of the call; dropped at "out" */
        dget(dentry);
        /* the victim's own inode is locked around ->rmdir() */
        inode_lock(dentry->d_inode);

        error = -EBUSY;
        if (is_local_mountpoint(dentry))
                goto out;

        error = security_inode_rmdir(dir, dentry);
        if (error)
                goto out;

        error = dir->i_op->rmdir(dir, dentry);
        if (error)
                goto out;

        /* removal succeeded: prune children, mark the inode dead, drop mounts */
        shrink_dcache_parent(dentry);
        dentry->d_inode->i_flags |= S_DEAD;
        dont_mount(dentry);
        detach_mounts(dentry);

out:
        inode_unlock(dentry->d_inode);
        dput(dentry);
        /* only on success, and only after the victim's inode is unlocked */
        if (!error)
                d_delete_notify(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_rmdir);

/*
 * Remove the directory named by @name, resolved relative to @dfd.
 *
 * The parent is looked up with filename_parentat(); "..", "." and the root
 * as last component are rejected with -ENOTEMPTY, -EINVAL and -EBUSY.  The
 * victim is then looked up and removed with vfs_rmdir() while write access
 * to the mount and the parent's inode lock (I_MUTEX_PARENT) are held.  The
 * operation is repeated with LOOKUP_REVAL added whenever retry_estale()
 * asks for it.
 *
 * Returns 0 on success or a negative errno.  @name is released with
 * putname() before returning on every path after a successful
 * filename_parentat().
 * NOTE(review): on filename_parentat() failure its error is returned
 * without putname() - presumably the helper disposed of @name; confirm.
 */
long do_rmdir(int dfd, struct filename *name)
{
        int error = 0;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        unsigned int lookup_flags = 0;
retry:
        name = filename_parentat(dfd, name, lookup_flags,
                                &path, &last, &type);
        if (IS_ERR(name))
                return PTR_ERR(name);

        /* last components that can never be removed */
        switch (type) {
        case LAST_DOTDOT:
                error = -ENOTEMPTY;
                goto exit1;
        case LAST_DOT:
                error = -EINVAL;
                goto exit1;
        case LAST_ROOT:
                error = -EBUSY;
                goto exit1;
        }

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit1;

        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit2;
        /* negative dentry: nothing to remove */
        if (!dentry->d_inode) {
                error = -ENOENT;
                goto exit3;
        }
        error = security_path_rmdir(&path, dentry);
        if (error)
                goto exit3;
        error = vfs_rmdir(path.dentry->d_inode, dentry);
        /* the labels below unwind in reverse order of acquisition */
exit3:
        dput(dentry);
exit2:
        inode_unlock(path.dentry->d_inode);
        mnt_drop_write(path.mnt);
exit1:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        putname(name);
        return error;
}

/*
 * rmdir system call: do_rmdir() relative to AT_FDCWD, with the user path
 * obtained through getname().
 */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
        return do_rmdir(AT_FDCWD, getname(pathname));
}

/**
 * vfs_unlink - unlink a filesystem object
 * @dir:        parent directory
 * @dentry:        victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * Returns 0 on success or a negative errno; besides the delegation case
 * above this is the may_delete() error, -EPERM if the filesystem has no
 * ->unlink(), -EBUSY if @dentry is a local mountpoint, or the error from
 * the security hook or from ->unlink().
 */
int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
{
        struct inode *target = dentry->d_inode;
        int error = may_delete(dir, dentry, 0);

        if (error)
                return error;

        if (!dir->i_op->unlink)
                return -EPERM;

        /* the victim's inode is locked around the actual unlink */
        inode_lock(target);
        if (is_local_mountpoint(dentry))
                error = -EBUSY;
        else {
                error = security_inode_unlink(dir, dentry);
                if (!error) {
                        /* a failed delegation break skips ->unlink() entirely */
                        error = try_break_deleg(target, delegated_inode);
                        if (error)
                                goto out;
                        error = dir->i_op->unlink(dir, dentry);
                        if (!error) {
                                dont_mount(dentry);
                                detach_mounts(dentry);
                        }
                }
        }
out:
        inode_unlock(target);

        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
        if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
                fsnotify_unlink(dir, dentry);
        } else if (!error) {
                fsnotify_link_count(target);
                d_delete_notify(dir, dentry);
        }

        return error;
}
EXPORT_SYMBOL(vfs_unlink);

/*
 * Make sure that the actual truncation of the file will occur outside its
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 *
 * Unlinks the object named by @name, resolved relative to @dfd, and
 * returns 0 or a negative errno.  A last component that is not LAST_NORM
 * yields -EISDIR.  The lookup and unlink are repeated after a delegation
 * has been broken successfully (retry_deleg), and the whole operation is
 * repeated with LOOKUP_REVAL added whenever retry_estale() asks for it.
 * @name is released with putname() on every path after a successful
 * filename_parentat().
 */
long do_unlinkat(int dfd, struct filename *name)
{
        int error;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        struct inode *inode = NULL;
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0;
retry:
        name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (IS_ERR(name))
                return PTR_ERR(name);

        error = -EISDIR;
        if (type != LAST_NORM)
                goto exit1;

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit1;
retry_deleg:
        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                /* Why not before? Because we want correct error value */
                if (last.name[last.len])
                        goto slashes;
                inode = dentry->d_inode;
                if (d_is_negative(dentry))
                        goto slashes;
                /*
                 * Extra inode reference, dropped by the iput() below once
                 * the directory has been unlocked (see the comment above
                 * the function).
                 */
                ihold(inode);
                error = security_path_unlink(&path, dentry);
                if (error)
                        goto exit2;
                error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
                /* exit2 is also reached from the "slashes" code at the end */
exit2:
                dput(dentry);
        }
        inode_unlock(path.dentry->d_inode);
        if (inode)
                iput(inode);        /* truncate the inode here */
        /* reset so that a retry_deleg pass starts without a stale pointer */
        inode = NULL;
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(path.mnt);
exit1:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                inode = NULL;
                goto retry;
        }
        putname(name);
        return error;

        /*
         * Reached for a name with trailing characters after the last
         * component or for a negative dentry: pick the error from what the
         * dentry turned out to be, then rejoin the common exit at exit2.
         */
slashes:
        if (d_is_negative(dentry))
                error = -ENOENT;
        else if (d_is_dir(dentry))
                error = -EISDIR;
        else
                error = -ENOTDIR;
        goto exit2;
}

/*
 * unlinkat system call.  AT_REMOVEDIR is the only flag accepted (anything
 * else yields -EINVAL); it selects do_rmdir() instead of do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
        if (flag & ~AT_REMOVEDIR)
                return -EINVAL;

        return (flag & AT_REMOVEDIR) ? do_rmdir(dfd, getname(pathname))
                                     : do_unlinkat(dfd, getname(pathname));
}

/*
 * unlink system call: do_unlinkat() relative to AT_FDCWD, with the user
 * path obtained through getname().
 */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
        return do_unlinkat(AT_FDCWD, getname(pathname));
}

int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
{
        int error = may_create(dir, dentry);

        if (error)
                return error;

        if (!dir->i_op->symlink)
                return -EPERM;

        error = security_inode_symlink(dir, dentry, oldname);
        if (error)
                return error;

        error = dir->i_op->symlink(dir, dentry, oldname);
        if (!error)
                fsnotify_create(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_symlink);

/*
 * Common implementation of symlink(2) and symlinkat(2).
 *
 * @oldname (the link contents) is fetched once with getname(); the new
 * name is looked up with user_path_create() and the link is created with
 * vfs_symlink().  Lookup and creation are repeated with LOOKUP_REVAL added
 * whenever retry_estale() asks for it.
 *
 * Returns 0 on success or a negative errno.
 */
static long do_symlinkat(const char __user *oldname, int newdfd,
                  const char __user *newname)
{
        unsigned int lookup_flags = 0;
        struct filename *from;
        struct dentry *dentry;
        struct path path;
        int error;

        from = getname(oldname);
        if (IS_ERR(from))
                return PTR_ERR(from);

retry:
        dentry = user_path_create(newdfd, newname, &path, lookup_flags);
        if (IS_ERR(dentry)) {
                error = PTR_ERR(dentry);
                goto out_putname;
        }

        error = security_path_symlink(&path, dentry, from->name);
        if (!error)
                error = vfs_symlink(path.dentry->d_inode, dentry, from->name);

        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }

out_putname:
        putname(from);
        return error;
}

/* symlinkat system call: all arguments are forwarded to do_symlinkat(). */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_symlinkat(oldname, newdfd, newname);
}

/* symlink system call: do_symlinkat() with AT_FDCWD for the new name. */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
        return do_symlinkat(oldname, AT_FDCWD, newname);
}

/**
 * vfs_link - create a new link
 * @old_dentry:        object to be linked
 * @dir:        new parent
 * @new_dentry:        where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * Returns 0 on success or a negative errno: -ENOENT for a negative
 * @old_dentry or an unlinked inode that is not I_LINKABLE, -EXDEV across
 * superblocks, -EPERM for append-only/immutable inodes, unmapped ids,
 * directories or a missing ->link(), -EMLINK at the superblock's link
 * limit, or an error from may_create(), the security hook, the delegation
 * break or ->link() itself.
 */
int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
{
        struct inode *inode = old_dentry->d_inode;
        unsigned max_links = dir->i_sb->s_max_links;
        int error;

        if (!inode)
                return -ENOENT;

        error = may_create(dir, new_dentry);
        if (error)
                return error;

        /* source and target directory must be on the same superblock */
        if (dir->i_sb != inode->i_sb)
                return -EXDEV;

        /*
         * A link to an append-only or immutable file cannot be created.
         */
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return -EPERM;
        /*
         * Updating the link count will likely cause i_uid and i_gid to
         * be written back improperly if their true value is unknown to
         * the vfs.
         */
        if (HAS_UNMAPPED_ID(inode))
                return -EPERM;
        if (!dir->i_op->link)
                return -EPERM;
        if (S_ISDIR(inode->i_mode))
                return -EPERM;

        error = security_inode_link(old_dentry, dir, new_dentry);
        if (error)
                return error;

        /* the link count checks and ->link() run under the inode's lock */
        inode_lock(inode);
        /* Make sure we don't allow creating hardlink to an unlinked file */
        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
                error =  -ENOENT;
        else if (max_links && inode->i_nlink >= max_links)
                error = -EMLINK;
        else {
                error = try_break_deleg(inode, delegated_inode);
                if (!error)
                        error = dir->i_op->link(old_dentry, dir, new_dentry);
        }

        /* once a link exists, I_LINKABLE is cleared (under i_lock) */
        if (!error && (inode->i_state & I_LINKABLE)) {
                spin_lock(&inode->i_lock);
                inode->i_state &= ~I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
        inode_unlock(inode);
        if (!error)
                fsnotify_link(dir, inode, new_dentry);
        return error;
}
EXPORT_SYMBOL(vfs_link);

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 *
 * do_linkat() is the common implementation behind the link(2) and
 * linkat(2) entry points.  It resolves @oldname relative to @olddfd,
 * creates the dentry for @newname relative to @newdfd and passes both to
 * vfs_link().  Only AT_SYMLINK_FOLLOW and AT_EMPTY_PATH are accepted in
 * @flags; any other bit yields -EINVAL.  Returns 0 on success or a
 * negative errno.
 */
static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
              const char __user *newname, int flags)
{
        struct dentry *new_dentry;
        struct path old_path, new_path;
        struct inode *delegated_inode = NULL;
        int how = 0;
        int error;

        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
                return -EINVAL;
        /*
         * To use null names we require CAP_DAC_READ_SEARCH
         * This ensures that not everyone will be able to create
         * hardlinks using the passed file descriptor.
         */
        if (flags & AT_EMPTY_PATH) {
                if (!capable(CAP_DAC_READ_SEARCH))
                        return -ENOENT;
                how = LOOKUP_EMPTY;
        }

        if (flags & AT_SYMLINK_FOLLOW)
                how |= LOOKUP_FOLLOW;
        /*
         * Re-entered after a successful delegation-break wait, or when
         * retry_estale() asks for another pass with LOOKUP_REVAL set.
         */
retry:
        error = user_path_at(olddfd, oldname, how, &old_path);
        if (error)
                return error;

        /* Only the LOOKUP_REVAL bit of 'how' is forwarded to the create side. */
        new_dentry = user_path_create(newdfd, newname, &new_path,
                                        (how & LOOKUP_REVAL));
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto out;

        /* Source and destination must be on the same mount. */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto out_dput;
        error = may_linkat(&old_path);
        if (unlikely(error))
                goto out_dput;
        error = security_path_link(old_path.dentry, &new_path, new_dentry);
        if (error)
                goto out_dput;
        error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
out_dput:
        done_path_create(&new_path, new_dentry);
        /*
         * vfs_link() handed back an inode through delegated_inode: wait for
         * the delegation break and, if the wait succeeds, redo the whole
         * operation from the lookup of the old path.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error) {
                        path_put(&old_path);
                        goto retry;
                }
        }
        if (retry_estale(error, how)) {
                path_put(&old_path);
                how |= LOOKUP_REVAL;
                goto retry;
        }
out:
        path_put(&old_path);

        return error;
}

/* linkat(2) entry point: forwards every argument unchanged to do_linkat(). */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, int, flags)
{
        int ret = do_linkat(olddfd, oldname, newdfd, newname, flags);

        return ret;
}

/*
 * link(2) entry point: do_linkat() with both names taken relative to the
 * current working directory (AT_FDCWD) and no flags.
 */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
        int ret = do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);

        return ret;
}

/**
 * vfs_rename - rename a filesystem object
 * @old_dir:        parent of source
 * @old_dentry:        source
 * @new_dir:        parent of destination
 * @new_dentry:        destination
 * @delegated_inode: returns an inode needing a delegation break
 * @flags:        rename flags
 *
 * The caller must hold multiple mutexes (see lock_rename()).
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
 *
 *        a) we can get into loop creation.
 *        b) race potential - two innocent renames can create a loop together.
 *           That's where 4.4BSD screws up. Current fix: serialization on
 *           sb->s_vfs_rename_mutex. We might be more accurate, but that's another
 *           story.
 *        c) we may have to lock up to _four_ objects - parents and victim (if it exists),
 *           and source (if it's a non-directory or a subdirectory that moves to
 *           different parent).
 *           And that - after we got ->i_mutex on parents (until then we don't know
 *           whether the target exists).  Solution: try to be smart with locking
 *           order for inodes.  We rely on the fact that tree topology may change
 *           only under ->s_vfs_rename_mutex _and_ that parent of the object we
 *           move will be locked.  Thus we can rank directories by the tree
 *           (ancestors first) and rank all non-directories after them.
 *           That works since everybody except rename does "lock parent, lookup,
 *           lock child" and rename is under ->s_vfs_rename_mutex.
 *           HOWEVER, it relies on the assumption that any object with ->lookup()
 *           has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *           we'd better make sure that there's no link(2) for them.
 *        d) conversion from fhandle to dentry may come in the wrong moment - when
 *           we are removing the target. Solution: we will have to grab ->i_mutex
 *           in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *           ->i_mutex on parents, which works but leads to some truly excessive
 *           locking].
 *
 * Return: 0 on success (including the case where source and target are the
 * same inode), otherwise a negative errno.
 */
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
               struct inode *new_dir, struct dentry *new_dentry,
               struct inode **delegated_inode, unsigned int flags)
{
        int error;
        bool is_dir = d_is_dir(old_dentry);
        struct inode *source = old_dentry->d_inode;
        struct inode *target = new_dentry->d_inode;
        bool new_is_dir = false;
        unsigned max_links = new_dir->i_sb->s_max_links;
        struct name_snapshot old_name;
        bool lock_old_subdir, lock_new_subdir;

        /* Source and destination are the same inode: nothing to do. */
        if (source == target)
                return 0;

        error = may_delete(old_dir, old_dentry, is_dir);
        if (error)
                return error;

        /*
         * No existing target: check that we may create in new_dir.
         * Existing target: check that we may delete it.  For a plain rename
         * the check is done with the source's type, for an exchange with
         * the target's own type.
         */
        if (!target) {
                error = may_create(new_dir, new_dentry);
        } else {
                new_is_dir = d_is_dir(new_dentry);

                if (!(flags & RENAME_EXCHANGE))
                        error = may_delete(new_dir, new_dentry, is_dir);
                else
                        error = may_delete(new_dir, new_dentry, new_is_dir);
        }
        if (error)
                return error;

        if (!old_dir->i_op->rename)
                return -EPERM;

        /*
         * If we are going to change the parent - check write permissions,
         * we'll need to flip '..'.
         */
        if (new_dir != old_dir) {
                if (is_dir) {
                        error = inode_permission(source, MAY_WRITE);
                        if (error)
                                return error;
                }
                if ((flags & RENAME_EXCHANGE) && new_is_dir) {
                        error = inode_permission(target, MAY_WRITE);
                        if (error)
                                return error;
                }
        }

        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
                                      flags);
        if (error)
                return error;

        /*
         * Snapshot the source name for the fsnotify_move() call at the end,
         * and take an extra reference on new_dentry that is dropped by the
         * dput() after the out label.
         */
        take_dentry_name_snapshot(&old_name, old_dentry);
        dget(new_dentry);
        /*
         * Lock children.
         * The source subdirectory needs to be locked on cross-directory
         * rename or cross-directory exchange since its parent changes.
         * The target subdirectory needs to be locked on cross-directory
         * exchange due to parent change and on any rename due to becoming
         * a victim.
         * Non-directories need locking in all cases (for NFS reasons);
         * they get locked after any subdirectories (in inode address order).
         *
         * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
         * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
         */
        lock_old_subdir = new_dir != old_dir;
        lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
        if (is_dir) {
                if (lock_old_subdir)
                        inode_lock_nested(source, I_MUTEX_CHILD);
                if (target && (!new_is_dir || lock_new_subdir))
                        inode_lock(target);
        } else if (new_is_dir) {
                if (lock_new_subdir)
                        inode_lock_nested(target, I_MUTEX_CHILD);
                inode_lock(source);
        } else {
                lock_two_nondirectories(source, target);
        }

        error = -EBUSY;
        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
                goto out;

        /*
         * Link-count limit: only checked on cross-directory moves where a
         * directory arrives in a parent without replacing a directory there.
         */
        if (max_links && new_dir != old_dir) {
                error = -EMLINK;
                if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
                        goto out;
                if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
                    old_dir->i_nlink >= max_links)
                        goto out;
        }
        /* Delegations are only broken on non-directories. */
        if (!is_dir) {
                error = try_break_deleg(source, delegated_inode);
                if (error)
                        goto out;
        }
        if (target && !new_is_dir) {
                error = try_break_deleg(target, delegated_inode);
                if (error)
                        goto out;
        }
        error = old_dir->i_op->rename(old_dir, old_dentry,
                                       new_dir, new_dentry, flags);
        if (error)
                goto out;

        /* A plain rename over an existing target: the target is the victim. */
        if (!(flags & RENAME_EXCHANGE) && target) {
                if (is_dir) {
                        shrink_dcache_parent(new_dentry);
                        target->i_flags |= S_DEAD;
                }
                dont_mount(new_dentry);
                detach_mounts(new_dentry);
        }
        /* Update the dcache here unless the filesystem declares it does so itself. */
        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
                if (!(flags & RENAME_EXCHANGE))
                        d_move(old_dentry, new_dentry);
                else
                        d_exchange(old_dentry, new_dentry);
        }
out:
        /* Drop the child locks; the conditions mirror the locking above. */
        if (!is_dir || lock_old_subdir)
                inode_unlock(source);
        if (target && (!new_is_dir || lock_new_subdir))
                inode_unlock(target);
        dput(new_dentry);
        if (!error) {
                fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
                              !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
                if (flags & RENAME_EXCHANGE) {
                        fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
                                      new_is_dir, NULL, new_dentry);
                }
        }
        release_dentry_name_snapshot(&old_name);

        return error;
}
EXPORT_SYMBOL(vfs_rename);

/*
 * do_renameat2 - common implementation of the rename(2), renameat(2) and
 * renameat2(2) entry points.
 * @olddfd:        directory fd the source name is resolved against
 * @from:        source name
 * @newdfd:        directory fd the destination name is resolved against
 * @to:                destination name
 * @flags:        RENAME_NOREPLACE, RENAME_EXCHANGE and/or RENAME_WHITEOUT
 *
 * Unknown flag bits, or RENAME_EXCHANGE combined with RENAME_NOREPLACE or
 * RENAME_WHITEOUT, give -EINVAL.  On the way out @from and @to are released
 * with putname() unless they have become ERR_PTR values.
 *
 * Returns 0 on success or a negative errno.
 */
int do_renameat2(int olddfd, struct filename *from, int newdfd,
                 struct filename *to, unsigned int flags)
{
        struct dentry *old_dentry, *new_dentry;
        struct dentry *trap;
        struct path old_path, new_path;
        struct qstr old_last, new_last;
        int old_type, new_type;
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
        bool should_retry = false;
        int error = -EINVAL;

        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                goto put_both;

        if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
            (flags & RENAME_EXCHANGE))
                goto put_both;

        /* An exchange does not look the target up as a rename target. */
        if (flags & RENAME_EXCHANGE)
                target_flags = 0;

        /* Re-entered with LOOKUP_REVAL set when retry_estale() asks for it. */
retry:
        from = filename_parentat(olddfd, from, lookup_flags, &old_path,
                                        &old_last, &old_type);
        if (IS_ERR(from)) {
                error = PTR_ERR(from);
                goto put_new;
        }

        to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
                                &new_type);
        if (IS_ERR(to)) {
                error = PTR_ERR(to);
                goto exit1;
        }

        /* Both parents must be on the same mount. */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto exit2;

        /*
         * Both last components must be of type LAST_NORM; otherwise fail
         * with -EBUSY, or with -EEXIST for the destination when
         * RENAME_NOREPLACE was requested.
         */
        error = -EBUSY;
        if (old_type != LAST_NORM)
                goto exit2;

        if (flags & RENAME_NOREPLACE)
                error = -EEXIST;
        if (new_type != LAST_NORM)
                goto exit2;

        error = mnt_want_write(old_path.mnt);
        if (error)
                goto exit2;

        /* Re-entered after a delegation break; the parent lookups are kept. */
retry_deleg:
        trap = lock_rename(new_path.dentry, old_path.dentry);

        old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
        /* source must exist */
        error = -ENOENT;
        if (d_is_negative(old_dentry))
                goto exit4;
        new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto exit4;
        error = -EEXIST;
        if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
                goto exit5;
        if (flags & RENAME_EXCHANGE) {
                /* an exchange needs an existing target */
                error = -ENOENT;
                if (d_is_negative(new_dentry))
                        goto exit5;

                /*
                 * A character following the last component (the trailing
                 * slash case) is only acceptable when the target is a
                 * directory.
                 */
                if (!d_is_dir(new_dentry)) {
                        error = -ENOTDIR;
                        if (new_last.name[new_last.len])
                                goto exit5;
                }
        }
        /* unless the source is a directory trailing slashes give -ENOTDIR */
        if (!d_is_dir(old_dentry)) {
                error = -ENOTDIR;
                if (old_last.name[old_last.len])
                        goto exit5;
                if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
                        goto exit5;
        }
        /* source should not be ancestor of target */
        error = -EINVAL;
        if (old_dentry == trap)
                goto exit5;
        /* target should not be an ancestor of source */
        if (!(flags & RENAME_EXCHANGE))
                error = -ENOTEMPTY;
        if (new_dentry == trap)
                goto exit5;

        error = security_path_rename(&old_path, old_dentry,
                                     &new_path, new_dentry, flags);
        if (error)
                goto exit5;
        error = vfs_rename(old_path.dentry->d_inode, old_dentry,
                           new_path.dentry->d_inode, new_dentry,
                           &delegated_inode, flags);
exit5:
        dput(new_dentry);
exit4:
        dput(old_dentry);
exit3:
        unlock_rename(new_path.dentry, old_path.dentry);
        /*
         * vfs_rename() handed back an inode through delegated_inode.  The
         * rename locks have just been dropped, so wait for the delegation
         * break here and, if the wait succeeds, redo from the locking step.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(old_path.mnt);
exit2:
        /* Decide on the retry now; it is taken after both paths are put. */
        if (retry_estale(error, lookup_flags))
                should_retry = true;
        path_put(&new_path);
exit1:
        path_put(&old_path);
        if (should_retry) {
                should_retry = false;
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
put_both:
        if (!IS_ERR(from))
                putname(from);
put_new:
        if (!IS_ERR(to))
                putname(to);
        return error;
}

/*
 * renameat2(2) entry point: converts both user-space names with getname()
 * and passes them, together with the caller's flags, to do_renameat2().
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, unsigned int, flags)
{
        int ret = do_renameat2(olddfd, getname(oldname),
                               newdfd, getname(newname), flags);

        return ret;
}

/* renameat(2) entry point: do_renameat2() with no rename flags. */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        int ret = do_renameat2(olddfd, getname(oldname),
                               newdfd, getname(newname), 0);

        return ret;
}

/*
 * rename(2) entry point: do_renameat2() with both names taken relative to
 * the current working directory (AT_FDCWD) and no rename flags.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
        int ret = do_renameat2(AT_FDCWD, getname(oldname),
                               AT_FDCWD, getname(newname), 0);

        return ret;
}

/*
 * Copy a symlink body to user space.
 *
 * If @link is an ERR_PTR its error code is returned unchanged.  Otherwise
 * at most @buflen bytes of @link are copied to @buffer; the terminating NUL
 * is not copied.  Returns the number of bytes copied, or -EFAULT when
 * copy_to_user() reports a failure.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
        int nbytes;

        if (IS_ERR(link))
                return PTR_ERR(link);

        /* Truncate to the size of the user buffer. */
        nbytes = strlen(link);
        if (nbytes > (unsigned) buflen)
                nbytes = buflen;

        return copy_to_user(buffer, link, nbytes) ? -EFAULT : nbytes;
}

/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry on which to get symbolic link
 * @buffer: user memory pointer
 * @buflen: size of buffer
 *
 * Neither atime nor the security hook is handled here; both are left to
 * the caller.
 *
 * Return: whatever the inode's own ->readlink() returns when it has one,
 * -EINVAL when @dentry is not a symlink, the error from ->get_link(), or
 * otherwise the result of readlink_copy().
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        DEFINE_DELAYED_CALL(cleanup);
        const char *body;
        int copied;

        /*
         * Slow path, taken until IOP_DEFAULT_READLINK has been set on this
         * inode: use the inode's own ->readlink() if there is one, otherwise
         * set the flag for symlinks.
         */
        if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
                if (unlikely(inode->i_op->readlink))
                        return inode->i_op->readlink(dentry, buffer, buflen);

                if (!d_is_symlink(dentry))
                        return -EINVAL;

                spin_lock(&inode->i_lock);
                inode->i_opflags |= IOP_DEFAULT_READLINK;
                spin_unlock(&inode->i_lock);
        }

        /* Use the body stored in the inode if present, else ask ->get_link(). */
        body = READ_ONCE(inode->i_link);
        if (!body) {
                body = inode->i_op->get_link(dentry, inode, &cleanup);
                if (IS_ERR(body))
                        return PTR_ERR(body);
        }

        copied = readlink_copy(buffer, buflen, body);
        do_delayed_call(&cleanup);

        return copied;
}
EXPORT_SYMBOL(vfs_readlink);

/**
 * vfs_get_link - get symlink body
 * @dentry: dentry on which to get symbolic link
 * @done: caller needs to free returned data with this
 *
 * Runs the security hook and then i_op->get_link() on the inode behind
 * @dentry.  atime is not touched; that is up to the caller if necessary.
 *
 * Does not work on "special" symlinks like /proc/$$/fd/N
 *
 * Return: the value of ->get_link(), or an ERR_PTR: -EINVAL when @dentry is
 * not a symlink, or the error reported by security_inode_readlink().
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
        struct inode *inode = d_inode(dentry);
        const char *denied;

        if (!d_is_symlink(dentry))
                return ERR_PTR(-EINVAL);

        /* A zero return from the hook becomes a NULL pointer: permitted. */
        denied = ERR_PTR(security_inode_readlink(dentry));
        if (denied)
                return denied;

        return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

/*
 * Get the link contents into the page cache and return a pointer to them.
 *
 * With a dentry, page 0 of the inode's mapping is read in.  Without one,
 * only a page that is already cached and up to date is used; otherwise
 * ERR_PTR(-ECHILD) is returned.  On success @callback is set up to run
 * page_put_link() on the page.
 */
const char *page_get_link(struct dentry *dentry, struct inode *inode,
                          struct delayed_call *callback)
{
        struct address_space *mapping = inode->i_mapping;
        struct page *pg;
        char *body;

        if (dentry) {
                pg = read_mapping_page(mapping, 0, NULL);
                if (IS_ERR(pg))
                        return (char*)pg;
        } else {
                pg = find_get_page(mapping, 0);
                if (!pg)
                        return ERR_PTR(-ECHILD);
                if (!PageUptodate(pg)) {
                        put_page(pg);
                        return ERR_PTR(-ECHILD);
                }
        }

        set_delayed_call(callback, page_put_link, pg);
        BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);

        body = page_address(pg);
        nd_terminate_link(body, inode->i_size, PAGE_SIZE - 1);

        return body;
}

EXPORT_SYMBOL(page_get_link);

/*
 * Delayed-call handler installed by page_get_link(): drops the reference
 * on the page passed as @arg.
 */
void page_put_link(void *arg)
{
        struct page *pg = arg;

        put_page(pg);
}
EXPORT_SYMBOL(page_put_link);

/*
 * Read a page-cache backed symlink into a user buffer: fetch the body with
 * page_get_link(), copy it out with readlink_copy(), then run the delayed
 * call.  Returns the result of readlink_copy().
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        DEFINE_DELAYED_CALL(cleanup);
        const char *body;
        int copied;

        body = page_get_link(dentry, d_inode(dentry), &cleanup);
        copied = readlink_copy(buffer, buflen, body);
        do_delayed_call(&cleanup);

        return copied;
}
EXPORT_SYMBOL(page_readlink);

/*
 * Write the first @len - 1 bytes of @symname at offset 0 of @inode's
 * mapping and mark the inode dirty.
 *
 * A non-zero @nofs makes pagecache_write_begin() get AOP_FLAG_NOFS.
 * Returns 0 on success, or the error from pagecache_write_begin() or
 * pagecache_write_end().  A short write is retried from the start.
 *
 * NOTE(review): presumably @len counts the terminating NUL, which is why
 * one byte fewer is written - confirm against callers.
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
{
        struct address_space *mapping = inode->i_mapping;
        unsigned int flags = nofs ? AOP_FLAG_NOFS : 0;
        const int body_len = len - 1;
        struct page *page;
        void *fsdata = NULL;
        int err;

        do {
                err = pagecache_write_begin(NULL, mapping, 0, body_len,
                                            flags, &page, &fsdata);
                if (err)
                        return err;

                memcpy(page_address(page), symname, body_len);

                err = pagecache_write_end(NULL, mapping, 0, body_len,
                                          body_len, page, fsdata);
                if (err < 0)
                        return err;
        } while (err < body_len);

        mark_inode_dirty(inode);
        return 0;
}
EXPORT_SYMBOL(__page_symlink);

/*
 * __page_symlink() with the nofs argument derived from the mapping: it is
 * set when mapping_gfp_constraint() on __GFP_FS returns zero.
 */
int page_symlink(struct inode *inode, const char *symname, int len)
{
        int nofs = !mapping_gfp_constraint(inode->i_mapping, __GFP_FS);

        return __page_symlink(inode, symname, len, nofs);
}
EXPORT_SYMBOL(page_symlink);

/*
 * Exported inode operations table whose only member set here is
 * ->get_link, pointing at page_get_link().
 */
const struct inode_operations page_symlink_inode_operations = {
        .get_link        = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);





































    1 


    1 


    1 










    1 
    1 



    1 


    1 





    1 










    1 








    1 






















    1 
    1 
    1 




























    1 
    1 



    1 

    1 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
// SPDX-License-Identifier: GPL-2.0-or-later
/* bit search implementation
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * Copyright (C) 2008 IBM Corporation
 * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
 * (Inspired by David Howell's find_next_bit implementation)
 *
 * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
 * size and improve performance, 2015.
 */

#include <linux/bitops.h>
#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/minmax.h>

#if !defined(find_next_bit) || !defined(find_next_zero_bit) ||                        \
        !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) ||        \
        !defined(find_next_and_bit)
/*
 * Shared worker behind find_next_bit, find_next_zero_bit and
 * find_next_and_bit (and their _le variants).  The callers differ in:
 *  - "invert": XORed into every fetched word before it is searched for
 *    one bits (0 searches for set bits, ~0 for clear bits);
 *  - "addr2": optional second bitmap, ANDed with "addr1" when non-NULL;
 *  - "le": when non-zero, the first-word mask and the word finally
 *    examined are byte-swapped with swab().
 * Returns the index of the bit found, or nbits if there is none.
 */
static unsigned long _find_next_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long nbits,
                unsigned long start, unsigned long invert, unsigned long le)
{
        unsigned long word, first_mask;

        if (unlikely(start >= nbits))
                return nbits;

        /* In the first word, ignore the bits below 'start'. */
        first_mask = BITMAP_FIRST_WORD_MASK(start);
        if (le)
                first_mask = swab(first_mask);

        word = addr1[start / BITS_PER_LONG];
        if (addr2)
                word &= addr2[start / BITS_PER_LONG];
        word = (word ^ invert) & first_mask;

        /* Walk whole words until one has a candidate bit. */
        for (start = round_down(start, BITS_PER_LONG); !word; ) {
                start += BITS_PER_LONG;
                if (start >= nbits)
                        return nbits;

                word = addr1[start / BITS_PER_LONG];
                if (addr2)
                        word &= addr2[start / BITS_PER_LONG];
                word ^= invert;
        }

        if (le)
                word = swab(word);

        /* A hit past the end of the bitmap in the last word is clamped. */
        return min(start + __ffs(word), nbits);
}
#endif

#ifndef find_next_bit
/*
 * Find the next set bit in a memory region, starting the search at bit
 * @offset.  Returns @size when no set bit is found.
 */
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
                            unsigned long offset)
{
        const unsigned long no_invert = 0UL;

        return _find_next_bit(addr, NULL, size, offset, no_invert, 0);
}
EXPORT_SYMBOL(find_next_bit);
#endif

#ifndef find_next_zero_bit
/*
 * Find the next cleared bit in a memory region, starting the search at bit
 * @offset.  Every word is inverted before the search, so the shared helper
 * looks for one bits.  Returns @size when no cleared bit is found.
 */
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
                                 unsigned long offset)
{
        const unsigned long invert_all = ~0UL;

        return _find_next_bit(addr, NULL, size, offset, invert_all, 0);
}
EXPORT_SYMBOL(find_next_zero_bit);
#endif

#if !defined(find_next_and_bit)
/*
 * Find the next bit that is set in both @addr1 and @addr2, starting the
 * search at bit @offset.  Returns @size when there is no such bit.
 */
unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        const unsigned long no_invert = 0UL;

        return _find_next_bit(addr1, addr2, size, offset, no_invert, 0);
}
EXPORT_SYMBOL(find_next_and_bit);
#endif

#ifndef find_first_bit
/*
 * Find the first set bit in a memory region of @size bits.
 * Returns the bit index, or @size when no bit is set.
 */
unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long base;
        unsigned long widx = 0;

        while ((base = widx * BITS_PER_LONG) < size) {
                /* Clamp: the last word may have bits set beyond 'size'. */
                if (addr[widx])
                        return min(base + __ffs(addr[widx]), size);
                widx++;
        }

        return size;
}
EXPORT_SYMBOL(find_first_bit);
#endif

#ifndef find_first_zero_bit
/*
 * Find the first cleared bit in a memory region of @size bits.
 * Returns the bit index, or @size when every bit is set.
 */
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long base;
        unsigned long widx = 0;

        while ((base = widx * BITS_PER_LONG) < size) {
                /* Clamp: the last word may have clear bits beyond 'size'. */
                if (addr[widx] != ~0UL)
                        return min(base + ffz(addr[widx]), size);
                widx++;
        }

        return size;
}
EXPORT_SYMBOL(find_first_zero_bit);
#endif

#ifndef find_last_bit
/*
 * Find the last (highest-numbered) set bit in a memory region of @size
 * bits.  Returns the bit index, or @size when no bit is set or @size is 0.
 */
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long keep, widx;

        if (!size)
                return size;

        /* Only the topmost word is partially masked; the rest are used whole. */
        keep = BITMAP_LAST_WORD_MASK(size);
        widx = (size - 1) / BITS_PER_LONG;

        for (;;) {
                unsigned long word = addr[widx] & keep;

                if (word)
                        return widx * BITS_PER_LONG + __fls(word);
                if (!widx)
                        break;

                widx--;
                keep = ~0ul;
        }

        return size;
}
EXPORT_SYMBOL(find_last_bit);
#endif

#ifdef __BIG_ENDIAN

#ifndef find_next_zero_bit_le
/*
 * Little-endian variant of find_next_zero_bit, built only for __BIG_ENDIAN:
 * same search, with the "le" argument of the shared helper set.
 */
unsigned long find_next_zero_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        const unsigned long *words = addr;

        return _find_next_bit(words, NULL, size, offset, ~0UL, 1);
}
EXPORT_SYMBOL(find_next_zero_bit_le);
#endif

#ifndef find_next_bit_le
/*
 * Little-endian variant of find_next_bit, built only for __BIG_ENDIAN:
 * same search, with the "le" argument of the shared helper set.
 */
unsigned long find_next_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        const unsigned long *words = addr;

        return _find_next_bit(words, NULL, size, offset, 0UL, 1);
}
EXPORT_SYMBOL(find_next_bit_le);
#endif

#endif /* __BIG_ENDIAN */

/*
 * Locate the next set bit at or after @offset, round its position down to
 * a multiple of 8 and store the value bitmap_get_value8() reads at that
 * position in *@clump.  Returns that rounded position, or @size (leaving
 * *@clump untouched) when no set bit is found.
 */
unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
                               unsigned long size, unsigned long offset)
{
        unsigned long pos = find_next_bit(addr, size, offset);

        if (pos == size)
                return size;

        pos = round_down(pos, 8);
        *clump = bitmap_get_value8(addr, pos);

        return pos;
}
EXPORT_SYMBOL(find_next_clump8);
























    1 















































































































































































































































































    1 






































    1 








































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM io_uring

#if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IO_URING_H

#include <linux/tracepoint.h>

struct io_wq_work;

/**
 * io_uring_create - called after a new io_uring context was prepared
 *
 * @fd:                corresponding file descriptor
 * @ctx:        pointer to a ring context structure
 * @sq_entries:        actual SQ size
 * @cq_entries:        actual CQ size
 * @flags:        SQ ring flags, provided to io_uring_setup(2)
 *
 * Traces io_uring creation and records the context pointer, which can be
 * used later to correlate other events with this ring.
 *
 * NOTE(review): sq_entries, cq_entries and flags are stored as u32 but are
 * formatted with %d in TP_printk, so a value above INT_MAX would be shown as
 * a negative number.
 */
TRACE_EVENT(io_uring_create,

        TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags),

        TP_ARGS(fd, ctx, sq_entries, cq_entries, flags),

        TP_STRUCT__entry (
                __field(  int,                fd                        )
                __field(  void *,        ctx                        )
                __field(  u32,                sq_entries        )
                __field(  u32,                cq_entries        )
                __field(  u32,                flags                )
        ),

        TP_fast_assign(
                __entry->fd                        = fd;
                __entry->ctx                = ctx;
                __entry->sq_entries        = sq_entries;
                __entry->cq_entries        = cq_entries;
                __entry->flags                = flags;
        ),

        TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d",
                          __entry->ctx, __entry->fd, __entry->sq_entries,
                          __entry->cq_entries, __entry->flags)
);

/**
 * io_uring_register - called after a buffer/file/eventfd was successfully
 *                     registered for a ring
 *
 * @ctx:        pointer to a ring context structure
 * @opcode:        describes which operation to perform
 * @nr_files:        number of registered files (printed as nr_user_files)
 * @nr_bufs:        number of registered buffers (printed as nr_user_bufs)
 * @eventfd:        whether an eventfd is registered or not
 * @ret:        return code
 *
 * Traces fixed files/buffers/eventfds, which can be registered to avoid the
 * overhead of getting references to them for every operation. This event,
 * together with io_uring_file_get, can provide a full picture of how much
 * overhead one can reduce via fixing.
 */
TRACE_EVENT(io_uring_register,

        TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files,
                         unsigned nr_bufs, bool eventfd, long ret),

        TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret),

        TP_STRUCT__entry (
                __field(  void *,        ctx                        )
                __field(  unsigned,        opcode                )
                __field(  unsigned,        nr_files        )
                __field(  unsigned,        nr_bufs                )
                __field(  bool,                eventfd                )
                __field(  long,                ret                        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->opcode                = opcode;
                __entry->nr_files        = nr_files;
                __entry->nr_bufs        = nr_bufs;
                __entry->eventfd        = eventfd;
                __entry->ret                = ret;
        ),

        TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, "
                          "eventfd %d, ret %ld",
                          __entry->ctx, __entry->opcode, __entry->nr_files,
                          __entry->nr_bufs, __entry->eventfd, __entry->ret)
);

/**
 * io_uring_file_get - called before getting references to an SQE file
 *
 * @ctx:        pointer to a ring context structure
 * @fd:                SQE file descriptor
 *
 * Traces how often an SQE file reference is obtained, which can help to
 * figure out whether it makes sense to use fixed files, or to check that
 * fixed files are used correctly.
 */
TRACE_EVENT(io_uring_file_get,

        TP_PROTO(void *ctx, int fd),

        TP_ARGS(ctx, fd),

        TP_STRUCT__entry (
                __field(  void *,        ctx        )
                __field(  int,                fd        )
        ),

        TP_fast_assign(
                __entry->ctx        = ctx;
                __entry->fd                = fd;
        ),

        TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd)
);

/**
 * io_uring_queue_async_work - called before submitting a new async work
 *
 * @ctx:        pointer to a ring context structure
 * @rw:                type of workqueue; a non-zero value is printed as "hashed",
 *                zero as "normal"
 * @req:        pointer to a submitted request
 * @work:        pointer to a submitted io_wq_work
 * @flags:        flags value recorded with the event (presumably the request
 *                flags - confirm at the call site)
 *
 * Allows to trace asynchronous work submission.
 */
TRACE_EVENT(io_uring_queue_async_work,

        TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work,
                         unsigned int flags),

        TP_ARGS(ctx, rw, req, work, flags),

        TP_STRUCT__entry (
                __field(  void *,                        ctx        )
                __field(  int,                                rw        )
                __field(  void *,                        req        )
                __field(  struct io_wq_work *,                work        )
                __field(  unsigned int,                        flags        )
        ),

        TP_fast_assign(
                __entry->ctx        = ctx;
                __entry->rw        = rw;
                __entry->req        = req;
                __entry->work        = work;
                __entry->flags        = flags;
        ),

        TP_printk("ring %p, request %p, flags %d, %s queue, work %p",
                          __entry->ctx, __entry->req, __entry->flags,
                          __entry->rw ? "hashed" : "normal", __entry->work)
);

/**
 * io_uring_defer - called when an io_uring request is deferred
 *
 * @ctx:        pointer to a ring context structure
 * @req:        pointer to a deferred request
 * @user_data:        user data associated with the request (stored in the entry
 *                field named "data" and printed in decimal)
 *
 * Tracks deferred requests, to give an insight into which requests are not
 * started immediately.
 */
TRACE_EVENT(io_uring_defer,

        TP_PROTO(void *ctx, void *req, unsigned long long user_data),

        TP_ARGS(ctx, req, user_data),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  unsigned long long, data        )
        ),

        TP_fast_assign(
                __entry->ctx        = ctx;
                __entry->req        = req;
                __entry->data        = user_data;
        ),

        TP_printk("ring %p, request %p user_data %llu", __entry->ctx,
                        __entry->req, __entry->data)
);

/**
 * io_uring_link - called before the io_uring request is added into the
 *                 link_list of another request
 *
 * @ctx:                pointer to a ring context structure
 * @req:                pointer to a linked request
 * @target_req:                pointer to a previous request, that would contain @req
 *
 * Tracks linked requests, to help understand the dependencies between
 * requests and how they influence the execution flow.
 */
TRACE_EVENT(io_uring_link,

        TP_PROTO(void *ctx, void *req, void *target_req),

        TP_ARGS(ctx, req, target_req),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  void *,        target_req        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->req                = req;
                __entry->target_req        = target_req;
        ),

        TP_printk("ring %p, request %p linked after %p",
                          __entry->ctx, __entry->req, __entry->target_req)
);

/**
 * io_uring_cqring_wait - called before starting to wait for an available CQE
 *
 * @ctx:                pointer to a ring context structure
 * @min_events:        minimal number of events to wait for
 *
 * Tracks waiting for a CQE, so that we can e.g. troubleshoot situations
 * where an application waits for an event that never comes.
 */
TRACE_EVENT(io_uring_cqring_wait,

        TP_PROTO(void *ctx, int min_events),

        TP_ARGS(ctx, min_events),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  int,                min_events        )
        ),

        TP_fast_assign(
                __entry->ctx        = ctx;
                __entry->min_events        = min_events;
        ),

        TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events)
);

/**
 * io_uring_fail_link - called before failing a linked request
 *
 * @req:        request whose links were cancelled
 * @link:        cancelled link
 *
 * Tracks cancellation of linked requests, to show not only that some work
 * was cancelled, but also which request was the reason.
 */
TRACE_EVENT(io_uring_fail_link,

        TP_PROTO(void *req, void *link),

        TP_ARGS(req, link),

        TP_STRUCT__entry (
                __field(  void *,        req        )
                __field(  void *,        link        )
        ),

        TP_fast_assign(
                __entry->req        = req;
                __entry->link        = link;
        ),

        TP_printk("request %p, link %p", __entry->req, __entry->link)
);

/**
 * io_uring_complete - called when completing an SQE
 *
 * @ctx:                pointer to a ring context structure
 * @user_data:                user data associated with the request
 * @res:                result of the request
 * @cflags:                completion flags
 *
 * Records the ring, the request's user data, its result and the completion
 * flags; user_data and cflags are printed in hexadecimal.
 */
TRACE_EVENT(io_uring_complete,

        TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags),

        TP_ARGS(ctx, user_data, res, cflags),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  u64,                user_data        )
                __field(  int,                res                )
                __field(  unsigned,        cflags                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->user_data        = user_data;
                __entry->res                = res;
                __entry->cflags                = cflags;
        ),

        TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x",
                          __entry->ctx, (unsigned long long)__entry->user_data,
                          __entry->res, __entry->cflags)
);

/**
 * io_uring_submit_sqe - called before submitting one SQE
 *
 * @ctx:                pointer to a ring context structure
 * @req:                pointer to a submitted request
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 * @flags:                request flags
 * @force_nonblock:        whether a context blocking or not
 * @sq_thread:                true if sq_thread has submitted this SQE
 *
 * Tracks SQE submission, to help understand what the source of it was: the
 * SQ thread or an io_uring_enter call.
 */
TRACE_EVENT(io_uring_submit_sqe,

        TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, u32 flags,
                 bool force_nonblock, bool sq_thread),

        TP_ARGS(ctx, req, opcode, user_data, flags, force_nonblock, sq_thread),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
                __field(  u32,                flags                )
                __field(  bool,                force_nonblock        )
                __field(  bool,                sq_thread        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->req                = req;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
                __entry->flags                = flags;
                __entry->force_nonblock        = force_nonblock;
                __entry->sq_thread        = sq_thread;
        ),

        TP_printk("ring %p, req %p, op %d, data 0x%llx, flags %u, "
                  "non block %d, sq_thread %d", __entry->ctx, __entry->req,
                  __entry->opcode, (unsigned long long)__entry->user_data,
                  __entry->flags, __entry->force_nonblock, __entry->sq_thread)
);

/*
 * io_uring_poll_arm - called after arming a poll wait if successful
 *
 * @ctx:                pointer to a ring context structure
 * @req:                pointer to the armed request
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 * @mask:                request poll events mask
 * @events:                registered events of interest
 *
 * Tracks which requests are waiting on a poll and which events they are
 * interested in; mask and events are printed in hexadecimal.
 */
TRACE_EVENT(io_uring_poll_arm,

        TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data,
                 int mask, int events),

        TP_ARGS(ctx, req, opcode, user_data, mask, events),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
                __field(  int,                mask                )
                __field(  int,                events                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->req                = req;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
                __entry->mask                = mask;
                __entry->events                = events;
        ),

        TP_printk("ring %p, req %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
                  __entry->ctx, __entry->req, __entry->opcode,
                  (unsigned long long) __entry->user_data,
                  __entry->mask, __entry->events)
);

/*
 * io_uring_poll_wake - poll wakeup trace event
 *
 * @ctx:                pointer to a ring context structure
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 * @mask:                poll events mask, printed in hexadecimal
 *
 * NOTE(review): the call site is not visible in this header; the name
 * suggests this fires when a poll wait is woken up - confirm against the
 * caller.
 */
TRACE_EVENT(io_uring_poll_wake,

        TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),

        TP_ARGS(ctx, opcode, user_data, mask),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
                __field(  int,                mask                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
                __entry->mask                = mask;
        ),

        TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x",
                          __entry->ctx, __entry->opcode,
                          (unsigned long long) __entry->user_data,
                          __entry->mask)
);

/*
 * io_uring_task_add - task work addition trace event
 *
 * @ctx:                pointer to a ring context structure
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 * @mask:                poll events mask, printed in hexadecimal without a
 *                        "0x" prefix
 *
 * NOTE(review): the call site is not visible in this header; the name
 * suggests this fires when task work is queued for a request - confirm
 * against the caller.
 */
TRACE_EVENT(io_uring_task_add,

        TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask),

        TP_ARGS(ctx, opcode, user_data, mask),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
                __field(  int,                mask                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
                __entry->mask                = mask;
        ),

        TP_printk("ring %p, op %d, data 0x%llx, mask %x",
                          __entry->ctx, __entry->opcode,
                          (unsigned long long) __entry->user_data,
                          __entry->mask)
);

/*
 * io_uring_task_run - called when task_work_run() executes the poll events
 *                     notification callbacks
 *
 * @ctx:                pointer to a ring context structure
 * @req:                pointer to the armed request
 * @opcode:                opcode of request
 * @user_data:                user data associated with the request
 *
 * Tracks when notified poll events are processed.
 */
TRACE_EVENT(io_uring_task_run,

        TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data),

        TP_ARGS(ctx, req, opcode, user_data),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  u8,                opcode                )
                __field(  u64,                user_data        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->req                = req;
                __entry->opcode                = opcode;
                __entry->user_data        = user_data;
        ),

        TP_printk("ring %p, req %p, op %d, data 0x%llx",
                  __entry->ctx, __entry->req, __entry->opcode,
                  (unsigned long long) __entry->user_data)
);

#endif /* _TRACE_IO_URING_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 




    1 





    1 




















    1 

    1 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

/*
 * Per-lookup state handed down through the follow_*_mask() page table walkers.
 */
struct follow_page_context {
        /* Cached dev_pagemap, so ZONE_DEVICE lookups need not be repeated. */
        struct dev_pagemap *pgmap;
        /* Output: set by the walk according to the size of the page found. */
        unsigned int page_mask;
};

/*
 * Add @refs to the compound pincount of @page. The assertions below require
 * @page to be a head page for which hpage_pincount_available() is true.
 */
static void hpage_pincount_add(struct page *page, int refs)
{
        VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
        VM_BUG_ON_PAGE(page != compound_head(page), page);

        atomic_add(refs, compound_pincount_ptr(page));
}

/*
 * Subtract @refs from the compound pincount of @page. The assertions below
 * require @page to be a head page for which hpage_pincount_available() is true.
 */
static void hpage_pincount_sub(struct page *page, int refs)
{
        VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
        VM_BUG_ON_PAGE(page != compound_head(page), page);

        atomic_sub(refs, compound_pincount_ptr(page));
}

/*
 * Drop @refs references on @page. The net effect is the same as calling
 * put_page() @refs times.
 */
static void put_page_refs(struct page *page, int refs)
{
#ifdef CONFIG_DEBUG_VM
        /* Debug builds warn and bail out rather than underflow the refcount. */
        if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
                return;
#endif

        /*
         * Looping over put_page() would be needlessly slow: subtract all but
         * one reference in a single step and let put_page() drop the last one.
         */
        if (refs >= 2)
                page_ref_sub(page, refs - 1);

        put_page(page);
}

/*
 * Take @refs references on the compound head of @page.
 *
 * Return: the head page with its refcount raised by @refs, or NULL if the
 * references could not be taken or @page no longer belongs to that head.
 */
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
        struct page *hd = compound_head(page);

        /* A negative refcount is a bug; a failed speculative get is not. */
        if (WARN_ON_ONCE(page_ref_count(hd) < 0) ||
            unlikely(!page_cache_add_speculative(hd, refs)))
                return NULL;

        /*
         * The reference on the head is now stable. However, the compound page
         * may have been split in the window between the compound_head()
         * lookup and the refcount increment, which would leave us holding
         * references on a page unrelated to the one we were asked about.
         * Look the head up again and back out if it changed.
         */
        if (unlikely(compound_head(page) != hd)) {
                put_page_refs(hd, refs);
                return NULL;
        }

        return hd;
}

/*
 * try_grab_compound_head() - attempt to elevate a page's refcount, by a
 * flags-dependent amount.
 *
 * "grab" names in this file mean: look at @flags to decide whether to apply
 * FOLL_GET or FOLL_PIN behavior when incrementing the page's refcount.
 *
 * Callers are expected to pass exactly one of FOLL_GET or FOLL_PIN (that is
 * true throughout the get_user_pages*() and pin_user_pages*() APIs). Cases:
 *
 *    FOLL_GET: the head page's refcount is incremented by @refs.
 *    FOLL_PIN: if hpage_pincount_available(), the refcount and the compound
 *              pincount are each incremented by @refs; otherwise the refcount
 *              is incremented by @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: head page (with refcount appropriately incremented) on success, or
 * NULL on failure. Passing neither FOLL_GET nor FOLL_PIN counts as failure
 * and, being a likely caller bug, also triggers a one-time warning.
 */
static __maybe_unused struct page *try_grab_compound_head(struct page *page,
                                                          int refs,
                                                          unsigned int flags)
{
        struct page *head;

        if (flags & FOLL_GET)
                return try_get_compound_head(page, refs);

        if (!(flags & FOLL_PIN)) {
                WARN_ON_ONCE(1);
                return NULL;
        }

        /*
         * FOLL_LONGTERM + FOLL_PIN cannot be honoured for CMA pages in the
         * gup fast path: fail so that the caller falls back to the slow path.
         */
        if (unlikely(flags & FOLL_LONGTERM) && is_migrate_cma_page(page))
                return NULL;

        /*
         * CAUTION: compound_head() must not be used on the page before this
         * point, the result would not be stable.
         */
        head = try_get_compound_head(page, refs);
        if (!head)
                return NULL;

        /*
         * Compound pages of order > 1 (what hpage_pincount_available() checks
         * for) get an exact pin count via hpage_pincount_add/_sub(). The
         * normal refcount was already raised above, so the page really is
         * pinned either way; pages without a pincount field get the rest of
         * the GUP_PIN_COUNTING_BIAS added to their refcount instead.
         */
        if (hpage_pincount_available(head))
                hpage_pincount_add(head, refs);
        else
                page_ref_add(head, refs * (GUP_PIN_COUNTING_BIAS - 1));

        mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_ACQUIRED, refs);

        return head;
}

/*
 * Release @refs grabs on head page @page, undoing FOLL_PIN accounting when
 * @flags contains FOLL_PIN and otherwise dropping plain references.
 */
static void put_compound_head(struct page *page, int refs, unsigned int flags)
{
        if (!(flags & FOLL_PIN)) {
                put_page_refs(page, refs);
                return;
        }

        mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, refs);

        if (hpage_pincount_available(page)) {
                /* Exact pincount in use: one plain reference per pin. */
                hpage_pincount_sub(page, refs);
                put_page_refs(page, refs);
        } else {
                /* Each pin is worth GUP_PIN_COUNTING_BIAS references. */
                put_page_refs(page, refs * GUP_PIN_COUNTING_BIAS);
        }
}

/**
 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 *
 * Depending on @flags, this may do nothing at all.
 *
 * "grab" names in this file mean: look at @flags to decide whether to apply
 * FOLL_GET or FOLL_PIN behavior when incrementing the page's refcount.
 *
 * @page:    pointer to page to be grabbed
 * @flags:   gup flags: these are the FOLL_* flag values.
 *
 * FOLL_GET and FOLL_PIN may each be set, or neither, but never both at the
 * same time (a one-time warning is emitted if they are). Cases:
 *
 *    FOLL_GET: page's refcount will be incremented by 1.
 *    FOLL_PIN: if hpage_pincount_available() on the head page, its refcount
 *              and compound pincount are each incremented by 1; otherwise its
 *              refcount is incremented by GUP_PIN_COUNTING_BIAS.
 *
 * Return: true for success, or if no action was required (neither FOLL_PIN
 * nor FOLL_GET set). False for failure: FOLL_GET or FOLL_PIN was set, but the
 * page could not be grabbed.
 */
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
        struct page *head;

        WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));

        if (flags & FOLL_GET)
                return try_get_page(page);

        if (!(flags & FOLL_PIN))
                return true;

        head = compound_head(page);

        if (WARN_ON_ONCE(page_ref_count(head) <= 0))
                return false;

        /*
         * As in try_grab_compound_head(): even when the exact pincount is
         * used, the normal refcount is also raised at least once so that the
         * page really is pinned.
         */
        if (hpage_pincount_available(head)) {
                hpage_pincount_add(head, 1);
                page_ref_add(head, 1);
        } else {
                page_ref_add(head, GUP_PIN_COUNTING_BIAS);
        }

        mod_node_page_state(page_pgdat(head), NR_FOLL_PIN_ACQUIRED, 1);

        return true;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * A page pinned through pin_user_pages*() must be released through
 * unpin_user_page() or one of the unpin_user_pages*() routines, so that such
 * pages can be tracked separately and handled uniquely. In particular,
 * interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
        struct page *head = compound_head(page);

        /* Drop exactly one FOLL_PIN grab from the head page. */
        put_compound_head(head, 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * When @make_dirty is true, each page in @pages (or its head page, if it is
 * part of a compound page) that is not already dirty is marked dirty before
 * being released with unpin_user_page(). When @make_dirty is false the whole
 * array is simply handed to unpin_user_pages().
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If set_page_dirty() is required
 * instead, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty)
{
        unsigned long i;

        /*
         * TODO: this can be optimized for huge pages: if a series of pages is
         * physically contiguous and part of the same compound page, then a
         * single operation to the head page should suffice.
         */

        if (!make_dirty) {
                unpin_user_pages(pages, npages);
                return;
        }

        for (i = 0; i < npages; i++) {
                struct page *head = compound_head(pages[i]);

                /*
                 * The PageDirty check may race with
                 * clear_page_dirty_for_io(); that is fine either way:
                 *
                 * 1) The page looks dirty here, so set_page_dirty_lock() is
                 * skipped. This can happen when clear_page_dirty_for_io()
                 * has called page_mkclean() and then set_page_dirty(). The
                 * page is about to be written back, which is what dirtying
                 * it was meant to achieve: clear_page_dirty_for_io() goes
                 * on to TestClearPageDirty() and writes the page back.
                 *
                 * 2) The page looks clean here, so it is marked dirty. It
                 * stays dirty despite being written back and is written
                 * back once more in the next writeback cycle. Harmless.
                 */
                if (!PageDirty(head))
                        set_page_dirty_lock(head);

                unpin_user_page(head);
        }
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * Every page in the @pages array is released with unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
        unsigned long i = 0;

        /*
         * If this WARN_ON() fires, then the system *might* be leaking pages (by
         * leaving them pinned), but probably not. More likely, gup/pup returned
         * a hard -ERRNO error to the caller, who erroneously passed it here.
         */
        if (WARN_ON(IS_ERR_VALUE(npages)))
                return;

        /*
         * TODO: this can be optimized for huge pages: if a series of pages is
         * physically contiguous and part of the same compound page, then a
         * single operation to the head page should suffice.
         */
        while (i < npages) {
                unpin_user_page(pages[i]);
                i++;
        }
}
EXPORT_SYMBOL(unpin_user_pages);

#ifdef CONFIG_MMU
/*
 * Result to hand back when the walk finds no page table entry for the address:
 * NULL normally, or ERR_PTR(-EFAULT) for the core dump case described below.
 */
static struct page *no_page_table(struct vm_area_struct *vma,
                unsigned int flags)
{
        if (!(flags & FOLL_DUMP))
                return NULL;

        /*
         * When core dumping an enormous anonymous area that nobody has
         * touched so far, we don't want to allocate unnecessary pages or page
         * tables. Returning an error instead of NULL skips handle_mm_fault(),
         * and get_dump_page() then returns NULL to leave a hole in the dump.
         * This is only valid where the hole would surely be zero-filled had
         * handle_mm_fault() actually handled it.
         */
        if (vma_is_anonymous(vma) || !vma->vm_ops->fault)
                return ERR_PTR(-EFAULT);

        return NULL;
}

/*
 * Handle a PTE that has no struct page behind it.
 *
 * Return: -EFAULT if FOLL_GET was requested (there is no page to take a
 * reference on), otherwise -EEXIST after optionally marking the PTE
 * young/dirty for FOLL_TOUCH.
 */
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
                pte_t *pte, unsigned int flags)
{
        pte_t entry;

        /* No page to get reference */
        if (flags & FOLL_GET)
                return -EFAULT;

        /* Proper page table entry exists, but no corresponding struct page */
        if (!(flags & FOLL_TOUCH))
                return -EEXIST;

        entry = *pte;
        if (flags & FOLL_WRITE)
                entry = pte_mkdirty(entry);
        entry = pte_mkyoung(entry);

        /* Only write the PTE back when the young/dirty update changed it. */
        if (!pte_same(*pte, entry)) {
                set_pte_at(vma->vm_mm, address, pte, entry);
                update_mmu_cache(vma, address, pte);
        }

        /* Proper page table entry exists, but no corresponding struct page */
        return -EEXIST;
}

/*
 * Decide whether a write lookup may follow @pte. A writable pte always
 * qualifies. FOLL_FORCE can write to even unwritable pte's, but only
 * after we've gone through a COW cycle (FOLL_COW) and they are dirty.
 */
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
        if (pte_write(pte))
                return true;

        if (!(flags & FOLL_FORCE) || !(flags & FOLL_COW))
                return false;

        return pte_dirty(pte);
}

/*
 * follow_page_pte() - resolve @address to a page at the PTE level.
 *
 * Maps and locks the PTE below @pmd, checks it against @flags, and grabs the
 * page through try_grab_page() when FOLL_GET or FOLL_PIN is set. hugetlb vmas
 * are handed to follow_huge_pmd_pte() instead. *@pgmap is updated when a
 * devmap PTE is followed.
 *
 * Return: the page on success; otherwise NULL or an ERR_PTR() value.
 */
static struct page *follow_page_pte(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, unsigned int flags,
                struct dev_pagemap **pgmap)
{
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
        spinlock_t *ptl;
        pte_t *ptep, pte;
        int ret;

        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
        if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
                         (FOLL_PIN | FOLL_GET)))
                return ERR_PTR(-EINVAL);

        /*
         * Considering PTE level hugetlb, like continuous-PTE hugetlb on
         * ARM64 architecture.
         */
        if (is_vm_hugetlb_page(vma)) {
                page = follow_huge_pmd_pte(vma, address, flags);
                if (page)
                        return page;
                return no_page_table(vma, flags);
        }

retry:
        if (unlikely(pmd_bad(*pmd)))
                return no_page_table(vma, flags);

        /* From here until pte_unmap_unlock() the PTE lock is held. */
        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        pte = *ptep;
        if (!pte_present(pte)) {
                swp_entry_t entry;
                /*
                 * KSM's break_ksm() relies upon recognizing a ksm page
                 * even while it is being migrated, so for that case we
                 * need migration_entry_wait().
                 */
                if (likely(!(flags & FOLL_MIGRATION)))
                        goto no_page;
                if (pte_none(pte))
                        goto no_page;
                entry = pte_to_swp_entry(pte);
                if (!is_migration_entry(entry))
                        goto no_page;
                /* Drop the lock before waiting, then redo the lookup. */
                pte_unmap_unlock(ptep, ptl);
                migration_entry_wait(mm, pmd, address);
                goto retry;
        }
        if ((flags & FOLL_NUMA) && pte_protnone(pte))
                goto no_page;
        /* Write access requested, but this PTE does not allow following it. */
        if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
                pte_unmap_unlock(ptep, ptl);
                return NULL;
        }

        page = vm_normal_page(vma, address, pte);
        if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
                /*
                 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
                 * case since they are only valid while holding the pgmap
                 * reference.
                 */
                *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
                if (*pgmap)
                        page = pte_page(pte);
                else
                        goto no_page;
        } else if (unlikely(!page)) {
                if (flags & FOLL_DUMP) {
                        /* Avoid special (like zero) pages in core dumps */
                        page = ERR_PTR(-EFAULT);
                        goto out;
                }

                if (is_zero_pfn(pte_pfn(pte))) {
                        page = pte_page(pte);
                } else {
                        /* No struct page: report follow_pfn_pte()'s errno. */
                        ret = follow_pfn_pte(vma, address, ptep, flags);
                        page = ERR_PTR(ret);
                        goto out;
                }
        }

        if (flags & FOLL_SPLIT && PageTransCompound(page)) {
                /*
                 * Take a reference before dropping the PTE lock, split the
                 * compound page under the page lock, then redo the lookup.
                 */
                get_page(page);
                pte_unmap_unlock(ptep, ptl);
                lock_page(page);
                ret = split_huge_page(page);
                unlock_page(page);
                put_page(page);
                if (ret)
                        return ERR_PTR(ret);
                goto retry;
        }

        /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
        if (unlikely(!try_grab_page(page, flags))) {
                page = ERR_PTR(-ENOMEM);
                goto out;
        }
        /*
         * We need to make the page accessible if and only if we are going
         * to access its content (the FOLL_PIN case).  Please see
         * Documentation/core-api/pin_user_pages.rst for details.
         */
        if (flags & FOLL_PIN) {
                ret = arch_make_page_accessible(page);
                if (ret) {
                        /* Undo the pin taken by try_grab_page() above. */
                        unpin_user_page(page);
                        page = ERR_PTR(ret);
                        goto out;
                }
        }
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
                    !pte_dirty(pte) && !PageDirty(page))
                        set_page_dirty(page);
                /*
                 * pte_mkyoung() would be more correct here, but atomic care
                 * is needed to avoid losing the dirty bit: it is easier to use
                 * mark_page_accessed().
                 */
                mark_page_accessed(page);
        }
        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                /* Do not mlock pte-mapped THP */
                if (PageTransCompound(page))
                        goto out;

                /*
                 * The preliminary mapping check is mainly to avoid the
                 * pointless overhead of lock_page on the ZERO_PAGE
                 * which might bounce very badly if there is contention.
                 *
                 * If the page is already locked, we don't need to
                 * handle it now - vmscan will handle it later if and
                 * when it attempts to reclaim the page.
                 */
                if (page->mapping && trylock_page(page)) {
                        lru_add_drain();  /* push cached pages to LRU */
                        /*
                         * Because we lock page here, and migration is
                         * blocked by the pte's page reference, and we
                         * know the page is still mapped, we don't even
                         * need to check for file-cache page truncation.
                         */
                        mlock_vma_page(page);
                        unlock_page(page);
                }
        }
out:
        /* Common exit: @page is either the result page or an ERR_PTR(). */
        pte_unmap_unlock(ptep, ptl);
        return page;
no_page:
        /*
         * A non-empty PTE that cannot be used yields NULL; an empty one is
         * left to no_page_table() to classify.
         */
        pte_unmap_unlock(ptep, ptl);
        if (!pte_none(pte))
                return NULL;
        return no_page_table(vma, flags);
}

/*
 * follow_pmd_mask() - resolve @address at the PMD level.
 *
 * Handles hugetlb PMDs, hugepd entries, migration entries (with
 * FOLL_MIGRATION), devmap PMDs and transparent huge PMDs; anything else is
 * passed down to follow_page_pte(). When a transparent huge PMD is followed,
 * @ctx->page_mask is set to HPAGE_PMD_NR - 1.
 *
 * Return: the page on success; otherwise NULL or an ERR_PTR() value.
 */
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                                    unsigned long address, pud_t *pudp,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        pmd_t *pmd, pmdval;
        spinlock_t *ptl;
        struct page *page;
        struct mm_struct *mm = vma->vm_mm;

        pmd = pmd_offset(pudp, address);
        /*
         * The READ_ONCE() will stabilize the pmdval in a register or
         * on the stack so that it will stop changing under the code.
         */
        pmdval = READ_ONCE(*pmd);
        if (pmd_none(pmdval))
                return no_page_table(vma, flags);
        if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
                page = follow_huge_pmd_pte(vma, address, flags);
                if (page)
                        return page;
                return no_page_table(vma, flags);
        }
        if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
                page = follow_huge_pd(vma, address,
                                      __hugepd(pmd_val(pmdval)), flags,
                                      PMD_SHIFT);
                if (page)
                        return page;
                return no_page_table(vma, flags);
        }
retry:
        if (!pmd_present(pmdval)) {
                if (likely(!(flags & FOLL_MIGRATION)))
                        return no_page_table(vma, flags);
                VM_BUG_ON(thp_migration_supported() &&
                                  !is_pmd_migration_entry(pmdval));
                if (is_pmd_migration_entry(pmdval))
                        pmd_migration_entry_wait(mm, pmd);
                /* Re-read the PMD after waiting and evaluate it again. */
                pmdval = READ_ONCE(*pmd);
                /*
                 * MADV_DONTNEED may convert the pmd to null because
                 * mmap_lock is held in read mode
                 */
                if (pmd_none(pmdval))
                        return no_page_table(vma, flags);
                goto retry;
        }
        if (pmd_devmap(pmdval)) {
                ptl = pmd_lock(mm, pmd);
                page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
                spin_unlock(ptl);
                /* A NULL result falls through to the checks below. */
                if (page)
                        return page;
        }
        if (likely(!pmd_trans_huge(pmdval)))
                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

        if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
                return no_page_table(vma, flags);

retry_locked:
        /*
         * Re-check the PMD under its lock: the checks above used a lockless
         * snapshot, so none/non-present/non-huge are all tested again here.
         */
        ptl = pmd_lock(mm, pmd);
        if (unlikely(pmd_none(*pmd))) {
                spin_unlock(ptl);
                return no_page_table(vma, flags);
        }
        if (unlikely(!pmd_present(*pmd))) {
                spin_unlock(ptl);
                if (likely(!(flags & FOLL_MIGRATION)))
                        return no_page_table(vma, flags);
                pmd_migration_entry_wait(mm, pmd);
                goto retry_locked;
        }
        if (unlikely(!pmd_trans_huge(*pmd))) {
                spin_unlock(ptl);
                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
        if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
                int ret;
                page = pmd_page(*pmd);
                if (is_huge_zero_page(page)) {
                        /* Huge zero page: only the PMD mapping is split. */
                        spin_unlock(ptl);
                        ret = 0;
                        split_huge_pmd(vma, pmd, address);
                        if (pmd_trans_unstable(pmd))
                                ret = -EBUSY;
                } else if (flags & FOLL_SPLIT) {
                        /*
                         * Take a reference before dropping the PMD lock, then
                         * split the compound page itself under the page lock.
                         */
                        if (unlikely(!try_get_page(page))) {
                                spin_unlock(ptl);
                                return ERR_PTR(-ENOMEM);
                        }
                        spin_unlock(ptl);
                        lock_page(page);
                        ret = split_huge_page(page);
                        unlock_page(page);
                        put_page(page);
                        if (pmd_none(*pmd))
                                return no_page_table(vma, flags);
                } else {  /* flags & FOLL_SPLIT_PMD */
                        spin_unlock(ptl);
                        split_huge_pmd(vma, pmd, address);
                        ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
                }

                /* On success, continue the lookup at the PTE level. */
                return ret ? ERR_PTR(ret) :
                        follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
        page = follow_trans_huge_pmd(vma, address, pmd, flags);
        spin_unlock(ptl);
        ctx->page_mask = HPAGE_PMD_NR - 1;
        return page;
}

/*
 * follow_pud_mask() - resolve @address at the PUD level.
 *
 * Handles hugetlb PUDs, hugepd entries and devmap PUDs here; everything else
 * is passed down to follow_pmd_mask().
 */
static struct page *follow_pud_mask(struct vm_area_struct *vma,
                                    unsigned long address, p4d_t *p4dp,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        struct mm_struct *mm = vma->vm_mm;
        pud_t *pud = pud_offset(p4dp, address);
        struct page *page;
        spinlock_t *ptl;

        if (pud_none(*pud))
                return no_page_table(vma, flags);

        if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
                page = follow_huge_pud(mm, address, pud, flags);
                return page ? page : no_page_table(vma, flags);
        }

        if (is_hugepd(__hugepd(pud_val(*pud)))) {
                page = follow_huge_pd(vma, address,
                                      __hugepd(pud_val(*pud)), flags,
                                      PUD_SHIFT);
                return page ? page : no_page_table(vma, flags);
        }

        if (pud_devmap(*pud)) {
                ptl = pud_lock(mm, pud);
                page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
                spin_unlock(ptl);
                /* A NULL result falls through to the regular walk below. */
                if (page)
                        return page;
        }

        if (unlikely(pud_bad(*pud)))
                return no_page_table(vma, flags);

        return follow_pmd_mask(vma, address, pud, flags, ctx);
}

/*
 * follow_p4d_mask() - resolve @address at the P4D level.
 *
 * Handles hugepd entries here; everything else is passed down to
 * follow_pud_mask().
 */
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
                                    unsigned long address, pgd_t *pgdp,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        p4d_t *p4d = p4d_offset(pgdp, address);
        struct page *page;

        if (p4d_none(*p4d))
                return no_page_table(vma, flags);

        BUILD_BUG_ON(p4d_huge(*p4d));

        if (unlikely(p4d_bad(*p4d)))
                return no_page_table(vma, flags);

        if (!is_hugepd(__hugepd(p4d_val(*p4d))))
                return follow_pud_mask(vma, address, p4d, flags, ctx);

        page = follow_huge_pd(vma, address,
                              __hugepd(p4d_val(*p4d)), flags,
                              P4D_SHIFT);
        return page ? page : no_page_table(vma, flags);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * For pages from ZONE_DEVICE memory, @ctx->pgmap caches the device's
 * dev_pagemap metadata so that expensive lookups are not repeated.
 *
 * On output, @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
                              unsigned long address, unsigned int flags,
                              struct follow_page_context *ctx)
{
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
        pgd_t *pgd;

        ctx->page_mask = 0;

        /* make this handle hugepd */
        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
        if (!IS_ERR(page)) {
                /* This path returns the page without grabbing it. */
                WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
                return page;
        }

        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                return no_page_table(vma, flags);

        if (pgd_huge(*pgd)) {
                page = follow_huge_pgd(mm, address, pgd, flags);
                return page ? page : no_page_table(vma, flags);
        }

        if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
                page = follow_huge_pd(vma, address,
                                      __hugepd(pgd_val(*pgd)), flags,
                                      PGDIR_SHIFT);
                return page ? page : no_page_table(vma, flags);
        }

        return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                         unsigned int foll_flags)
{
        struct follow_page_context ctx = { NULL };
        struct page *result = follow_page_mask(vma, address, foll_flags, &ctx);

        /* Release any dev_pagemap the lookup cached in the context. */
        if (ctx.pgmap)
                put_dev_pagemap(ctx.pgmap);

        return result;
}

/*
 * Resolve @address inside the gate area by walking the page tables by hand.
 *
 * On success returns 0 and stores the gate vma in *@vma; if @page is
 * non-NULL, the page is also grabbed via try_grab_page() and stored in *@page.
 * Returns -EFAULT when FOLL_WRITE is requested or the walk finds no usable
 * entry, and -ENOMEM when try_grab_page() fails.
 */
static int get_gate_page(struct mm_struct *mm, unsigned long address,
                unsigned int gup_flags, struct vm_area_struct **vma,
                struct page **page)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        /* Default result for every "goto unmap" that does not override it. */
        int ret = -EFAULT;

        /* user gate pages are read-only */
        if (gup_flags & FOLL_WRITE)
                return -EFAULT;
        /* Addresses above TASK_SIZE are looked up in the kernel page tables. */
        if (address > TASK_SIZE)
                pgd = pgd_offset_k(address);
        else
                pgd = pgd_offset_gate(mm, address);
        /* Nothing is mapped yet, so failures up to the pmd can return directly. */
        if (pgd_none(*pgd))
                return -EFAULT;
        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d))
                return -EFAULT;
        pud = pud_offset(p4d, address);
        if (pud_none(*pud))
                return -EFAULT;
        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return -EFAULT;
        VM_BUG_ON(pmd_trans_huge(*pmd));
        /* From here on every exit must go through pte_unmap(). */
        pte = pte_offset_map(pmd, address);
        if (pte_none(*pte))
                goto unmap;
        *vma = get_gate_vma(mm);
        /* Caller only wants the vma: report success without grabbing a page. */
        if (!page)
                goto out;
        *page = vm_normal_page(*vma, address, *pte);
        if (!*page) {
                /*
                 * No normal page: only the zero page is acceptable, and not
                 * even that when FOLL_DUMP is set.
                 */
                if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
                        goto unmap;
                *page = pte_page(*pte);
        }
        if (unlikely(!try_grab_page(*page, gup_flags))) {
                ret = -ENOMEM;
                goto unmap;
        }
out:
        ret = 0;
unmap:
        pte_unmap(pte);
        return ret;
}

/*
 * Fault in the page at @address by calling handle_mm_fault().
 *
 * The caller must hold mmap_lock.  When @locked is non-NULL and *@flags
 * lacks FOLL_NOWAIT, the lock may be dropped; in that case *@locked is
 * cleared and -EBUSY is returned.
 */
static int faultin_page(struct vm_area_struct *vma,
                unsigned long address, unsigned int *flags, int *locked)
{
        const unsigned int gup_flags = *flags;
        unsigned int fault_flags = 0;
        vm_fault_t ret;

        /* mlock all present pages, but do not fault in new pages */
        if ((gup_flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
                return -ENOENT;

        /* Translate the FOLL_* request into FAULT_FLAG_* bits. */
        if (gup_flags & FOLL_WRITE)
                fault_flags |= FAULT_FLAG_WRITE;
        if (gup_flags & FOLL_REMOTE)
                fault_flags |= FAULT_FLAG_REMOTE;
        if (locked)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
        if (gup_flags & FOLL_NOWAIT)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
        /* FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED may be set together. */
        if (gup_flags & FOLL_TRIED)
                fault_flags |= FAULT_FLAG_TRIED;

        ret = handle_mm_fault(vma, address, fault_flags, NULL);

        if (ret & VM_FAULT_ERROR) {
                int err = vm_fault_to_errno(ret, gup_flags);

                /* An error bit that maps to no errno is a bug. */
                if (!err)
                        BUG();
                return err;
        }

        if (ret & VM_FAULT_RETRY) {
                /* Outside the NOWAIT case, tell the caller the lock is gone. */
                if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
                        *locked = 0;
                return -EBUSY;
        }

        /*
         * VM_FAULT_WRITE means do_wp_page has broken COW where needed, even
         * if maybe_mkwrite chose not to set pte_write, so later lookups can
         * be treated as reads.  Only do that when looping for pte_write is
         * futile (the vma is not VM_WRITE): otherwise userspace may also
         * want to write to the page, and a read fault here could leave it
         * readonly and get it reCOWed by that userspace write.
         */
        if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
                *flags |= FOLL_COW;
        return 0;
}

/*
 * Decide whether a gup request with @gup_flags may operate on @vma.
 * Returns 0 when permitted and -EFAULT otherwise.
 */
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
        vm_flags_t vm_flags = vma->vm_flags;
        int write = (gup_flags & FOLL_WRITE);
        int foreign = (gup_flags & FOLL_REMOTE);

        if (vm_flags & (VM_IO | VM_PFNMAP))
                return -EFAULT;

        if ((gup_flags & FOLL_ANON) && !vma_is_anonymous(vma))
                return -EFAULT;

        if (write && !(vm_flags & VM_WRITE)) {
                /*
                 * Writing to a non-writable vma needs FOLL_FORCE, and even
                 * then only a COW mapping qualifies.  The write,force case
                 * used to be allowed to COW in a VM_MAYWRITE VM_SHARED
                 * !VM_WRITE vma so ptrace could set a breakpoint in a
                 * read-only mapping of an executable without corrupting the
                 * file (yet only when that file had been opened for
                 * writing!).  Anon pages in shared mappings are surprising:
                 * now just reject it.
                 */
                if (!(gup_flags & FOLL_FORCE) || !is_cow_mapping(vm_flags))
                        return -EFAULT;
        } else if (!write && !(vm_flags & VM_READ)) {
                /*
                 * Reading from a non-readable vma needs FOLL_FORCE plus
                 * VM_MAYREAD.  (Is there actually any vma we can reach here
                 * which does not have VM_MAYREAD set?)
                 */
                if (!(gup_flags & FOLL_FORCE) || !(vm_flags & VM_MAYREAD))
                        return -EFAULT;
        }

        /*
         * gups are always data accesses, not instruction
         * fetches, so execute=false here
         */
        return arch_vma_access_permitted(vma, write, false, foreign) ?
                0 : -EFAULT;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @mm:                mm_struct of target mm
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying pin behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long. Or NULL, if caller
 *                only intends to ensure the pages are faulted in.
 * @vmas:        array of pointers to vmas corresponding to each page.
 *                Or NULL if the caller does not require them.
 * @locked:     whether we're still with the mmap_lock held
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 * -- 0 return value is possible when the fault would need to be retried.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * @vmas are valid only as long as mmap_lock is held.
 *
 * Must be called with mmap_lock held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
 * released by an up_read().  That can happen if @gup_flags does not
 * have FOLL_NOWAIT.
 *
 * A caller using such a combination of @locked and @gup_flags
 * must therefore hold the mmap_lock for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages,
                struct vm_area_struct **vmas, int *locked)
{
        /* i counts the pages handled so far; ret holds the last status. */
        long ret = 0, i = 0;
        struct vm_area_struct *vma = NULL;
        struct follow_page_context ctx = { NULL };

        if (!nr_pages)
                return 0;

        start = untagged_addr(start);

        /* A pages[] array must be supplied iff FOLL_GET or FOLL_PIN is set. */
        VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

        /*
         * If FOLL_FORCE is set then do not force a full fault as the hinting
         * fault information is unrelated to the reference behaviour of a task
         * using the address space
         */
        if (!(gup_flags & FOLL_FORCE))
                gup_flags |= FOLL_NUMA;

        do {
                struct page *page;
                /*
                 * Per-iteration copy: faultin_page() may modify it (it is
                 * passed by pointer) without affecting later pages.
                 */
                unsigned int foll_flags = gup_flags;
                unsigned int page_increm;

                /* first iteration or cross vma bound */
                if (!vma || start >= vma->vm_end) {
                        vma = find_extend_vma(mm, start);
                        /* No vma, but the address may lie in the gate area. */
                        if (!vma && in_gate_area(mm, start)) {
                                ret = get_gate_page(mm, start & PAGE_MASK,
                                                gup_flags, &vma,
                                                pages ? &pages[i] : NULL);
                                if (ret)
                                        goto out;
                                ctx.page_mask = 0;
                                goto next_page;
                        }

                        if (!vma || check_vma_flags(vma, gup_flags)) {
                                ret = -EFAULT;
                                goto out;
                        }
                        if (is_vm_hugetlb_page(vma)) {
                                /*
                                 * follow_hugetlb_page() is handed pointers to
                                 * start and nr_pages (presumably updating them
                                 * itself) and its return value becomes the new
                                 * i; the bookkeeping at next_page is bypassed
                                 * by the continue below.
                                 */
                                i = follow_hugetlb_page(mm, vma, pages, vmas,
                                                &start, &nr_pages, i,
                                                gup_flags, locked);
                                if (locked && *locked == 0) {
                                        /*
                                         * We've got a VM_FAULT_RETRY
                                         * and we've lost mmap_lock.
                                         * We must stop here.
                                         */
                                        BUG_ON(gup_flags & FOLL_NOWAIT);
                                        BUG_ON(ret != 0);
                                        goto out;
                                }
                                continue;
                        }
                }
retry:
                /*
                 * If we have a pending SIGKILL, don't keep faulting pages and
                 * potentially allocating memory.
                 */
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        goto out;
                }
                cond_resched();

                page = follow_page_mask(vma, start, foll_flags, &ctx);
                if (!page) {
                        /* Nothing mapped: fault it in, then act on the result. */
                        ret = faultin_page(vma, start, &foll_flags, locked);
                        switch (ret) {
                        case 0:
                                /* Fault handled; look the page up again. */
                                goto retry;
                        case -EBUSY:
                                /* Stop here, but do not report an error. */
                                ret = 0;
                                fallthrough;
                        case -EFAULT:
                        case -ENOMEM:
                        case -EHWPOISON:
                                goto out;
                        case -ENOENT:
                                /* Skip this page and move on to the next. */
                                goto next_page;
                        }
                        /* Any other return value from faultin_page() is a bug. */
                        BUG();
                } else if (PTR_ERR(page) == -EEXIST) {
                        /*
                         * Proper page table entry exists, but no corresponding
                         * struct page.
                         */
                        goto next_page;
                } else if (IS_ERR(page)) {
                        ret = PTR_ERR(page);
                        goto out;
                }
                if (pages) {
                        pages[i] = page;
                        flush_anon_page(vma, page, start);
                        flush_dcache_page(page);
                        /* Filling pages[] entry by entry: advance one page. */
                        ctx.page_mask = 0;
                }
next_page:
                if (vmas) {
                        vmas[i] = vma;
                        /* Likewise, vmas[] is filled one entry per page. */
                        ctx.page_mask = 0;
                }
                /*
                 * Step by one page when page_mask is 0; otherwise step to the
                 * end of the page_mask-sized region containing start, capped
                 * at the number of pages still requested.
                 */
                page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
                if (page_increm > nr_pages)
                        page_increm = nr_pages;
                i += page_increm;
                start += page_increm * PAGE_SIZE;
                nr_pages -= page_increm;
        } while (nr_pages);
out:
        if (ctx.pgmap)
                put_dev_pagemap(ctx.pgmap);
        /* Report progress if any was made; otherwise the status in ret. */
        return i ? i : ret;
}

/*
 * Check whether a fault described by @fault_flags is allowed on @vma:
 * the vma must carry VM_WRITE for a write fault (VM_READ otherwise) and
 * the architecture must permit the access.
 */
static bool vma_permits_fault(struct vm_area_struct *vma,
                              unsigned int fault_flags)
{
        const bool is_write = (fault_flags & FAULT_FLAG_WRITE) != 0;
        const bool is_foreign = (fault_flags & FAULT_FLAG_REMOTE) != 0;
        const vm_flags_t required = is_write ? VM_WRITE : VM_READ;

        /*
         * Beyond the read/write bits, the architecture might have another
         * hardware protection mechanism that can deny access.  gup always
         * represents data access, not instruction fetches, hence
         * execute=false.
         */
        return (vma->vm_flags & required) &&
               arch_vma_access_permitted(vma, is_write, false, is_foreign);
}

/**
 * fixup_user_fault() - manually resolve a user page fault
 * @mm:                mm_struct of target mm
 * @address:        user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 * @unlocked:        did we unlock the mmap_lock while retrying, maybe NULL if caller
 *                does not allow retry. If NULL, the caller must guarantee
 *                that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_lock, so it does not
 * have the same semantics wrt the @mm->mmap_lock as filemap_fault().
 */
int fixup_user_fault(struct mm_struct *mm,
                     unsigned long address, unsigned int fault_flags,
                     bool *unlocked)
{
        struct vm_area_struct *vma;
        vm_fault_t ret;

        address = untagged_addr(address);

        /* Retry is only enabled when the caller can be told about it. */
        if (unlocked)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

retry:
        /* The vma is looked up afresh on every pass through this label. */
        vma = find_extend_vma(mm, address);
        if (!vma || address < vma->vm_start)
                return -EFAULT;

        if (!vma_permits_fault(vma, fault_flags))
                return -EFAULT;

        if ((fault_flags & FAULT_FLAG_KILLABLE) &&
            fatal_signal_pending(current))
                return -EINTR;

        ret = handle_mm_fault(vma, address, fault_flags, NULL);
        if (ret & VM_FAULT_ERROR) {
                int err = vm_fault_to_errno(ret, 0);

                /* An error bit that maps to no errno is a bug. */
                if (err)
                        return err;
                BUG();
        }

        if (ret & VM_FAULT_RETRY) {
                /*
                 * Take mmap_lock again before retrying, and record for the
                 * caller that it was dropped in between.  @unlocked is
                 * dereferenced unconditionally here: callers passing NULL
                 * must not set FAULT_FLAG_ALLOW_RETRY.
                 */
                mmap_read_lock(mm);
                *unlocked = true;
                fault_flags |= FAULT_FLAG_TRIED;
                goto retry;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);

/*
 * Wrapper around __get_user_pages() that, when @locked is non-NULL, keeps
 * going after the mmap_lock was dropped for a VM_FAULT_RETRY: it retakes
 * the lock and retries the faulting page with FOLL_TRIED before resuming.
 *
 * Please note that this function, unlike __get_user_pages will not
 * return 0 for nr_pages > 0 without FOLL_NOWAIT
 */
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
                                                unsigned long start,
                                                unsigned long nr_pages,
                                                struct page **pages,
                                                struct vm_area_struct **vmas,
                                                int *locked,
                                                unsigned int flags)
{
        /* ret: result of the latest call; pages_done: running total/result. */
        long ret, pages_done;
        /* Set once the mmap_lock has been released at least once. */
        bool lock_dropped;

        if (locked) {
                /* if VM_FAULT_RETRY can be returned, vmas become invalid */
                BUG_ON(vmas);
                /* check caller initialized locked */
                BUG_ON(*locked != 1);
        }

        if (flags & FOLL_PIN)
                atomic_set(&mm->has_pinned, 1);

        /*
         * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
         * is to set FOLL_GET if the caller wants pages[] filled in (but has
         * carelessly failed to specify FOLL_GET), so keep doing that, but only
         * for FOLL_GET, not for the newer FOLL_PIN.
         *
         * FOLL_PIN always expects pages to be non-null, but no need to assert
         * that here, as any failures will be obvious enough.
         */
        if (pages && !(flags & FOLL_PIN))
                flags |= FOLL_GET;

        pages_done = 0;
        lock_dropped = false;
        for (;;) {
                ret = __get_user_pages(mm, start, nr_pages, flags, pages,
                                       vmas, locked);
                if (!locked)
                        /* VM_FAULT_RETRY couldn't trigger, bypass */
                        return ret;

                /* VM_FAULT_RETRY cannot return errors */
                if (!*locked) {
                        BUG_ON(ret < 0);
                        BUG_ON(ret >= nr_pages);
                }

                /* Account for whatever this call managed to process. */
                if (ret > 0) {
                        nr_pages -= ret;
                        pages_done += ret;
                        if (!nr_pages)
                                break;
                }
                if (*locked) {
                        /*
                         * VM_FAULT_RETRY didn't trigger or it was a
                         * FOLL_NOWAIT.
                         */
                        /* With no progress at all, pass ret through as is. */
                        if (!pages_done)
                                pages_done = ret;
                        break;
                }
                /*
                 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
                 * For the prefault case (!pages) we only update counts.
                 */
                if (likely(pages))
                        pages += ret;
                start += ret << PAGE_SHIFT;
                lock_dropped = true;

retry:
                /*
                 * Repeat on the address that fired VM_FAULT_RETRY
                 * with both FAULT_FLAG_ALLOW_RETRY and
                 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
                 * by fatal signals, so we need to check it before we
                 * start trying again otherwise it can loop forever.
                 */

                if (fatal_signal_pending(current)) {
                        if (!pages_done)
                                pages_done = -EINTR;
                        break;
                }

                /* The lock was dropped; take it again before retrying. */
                ret = mmap_read_lock_killable(mm);
                if (ret) {
                        BUG_ON(ret > 0);
                        if (!pages_done)
                                pages_done = ret;
                        break;
                }

                *locked = 1;
                /* Retry just the single page that triggered the retry. */
                ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
                                       pages, NULL, locked);
                if (!*locked) {
                        /* Continue to retry until we succeeded */
                        BUG_ON(ret != 0);
                        goto retry;
                }
                if (ret != 1) {
                        BUG_ON(ret > 1);
                        if (!pages_done)
                                pages_done = ret;
                        break;
                }
                /* The retried page succeeded; step past it and loop again. */
                nr_pages--;
                pages_done++;
                if (!nr_pages)
                        break;
                if (likely(pages))
                        pages++;
                start += PAGE_SIZE;
        }
        if (lock_dropped && *locked) {
                /*
                 * We must let the caller know we temporarily dropped the lock
                 * and so the critical section protected by it was lost.
                 */
                mmap_read_unlock(mm);
                *locked = 0;
        }
        return pages_done;
}

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @locked: whether the mmap_lock is still held
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * Return either number of pages pinned in the vma, or a negative error
 * code on error.
 *
 * vma->vm_mm->mmap_lock must be held.
 *
 * If @locked is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @locked is non-NULL, it must be held for read only and may be
 * released.  If it's released, *@locked will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *locked)
{
        struct mm_struct *mm = vma->vm_mm;
        /* FOLL_POPULATE is added below unless the vma is lock-on-fault. */
        int gup_flags = FOLL_TOUCH | FOLL_MLOCK;

        /* The range must be page aligned and lie entirely within the vma. */
        VM_BUG_ON(start & ~PAGE_MASK);
        VM_BUG_ON(end   & ~PAGE_MASK);
        VM_BUG_ON_VMA(start < vma->vm_start, vma);
        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
        mmap_assert_locked(mm);

        if (!(vma->vm_flags & VM_LOCKONFAULT))
                gup_flags |= FOLL_POPULATE;

        /*
         * Writable private mappings are touched with a write fault so that
         * COW gets broken.  Shared mappings don't COW, and we would not want
         * to dirty them for nothing.
         */
        if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
                gup_flags |= FOLL_WRITE;

        /*
         * mlock should succeed for regions with any permissions other than
         * PROT_NONE.
         */
        if (vma_is_accessible(vma))
                gup_flags |= FOLL_FORCE;

        /*
         * The address was checked to be within a VMA, so this call will
         * not result in a stack expansion that recurses back here.
         */
        return __get_user_pages(mm, start, (end - start) / PAGE_SIZE,
                                gup_flags, NULL, NULL, locked);
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_lock must not be held.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
        struct mm_struct *mm = current->mm;
        /* [nstart; nend) is the sub-range handled by the current iteration. */
        unsigned long end, nstart, nend;
        struct vm_area_struct *vma = NULL;
        /* Non-zero while this function holds mmap_lock for reading. */
        int locked = 0;
        long ret = 0;

        end = start + len;

        for (nstart = start; nstart < end; nstart = nend) {
                /*
                 * We want to fault in pages for [nstart; end) address range.
                 * Find first corresponding VMA.
                 */
                if (!locked) {
                        /*
                         * First pass, or the lock was dropped during the
                         * previous populate call: take it and look the vma
                         * up again.
                         */
                        locked = 1;
                        mmap_read_lock(mm);
                        vma = find_vma(mm, nstart);
                } else if (nstart >= vma->vm_end)
                        vma = vma->vm_next;
                if (!vma || vma->vm_start >= end)
                        break;
                /*
                 * Set [nstart; nend) to intersection of desired address
                 * range with the first VMA. Also, skip undesirable VMA types.
                 */
                nend = min(end, vma->vm_end);
                if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                        continue;
                if (nstart < vma->vm_start)
                        nstart = vma->vm_start;
                /*
                 * Now fault in a range of pages. populate_vma_page_range()
                 * double checks the vma flags, so that it won't mlock pages
                 * if the vma was already munlocked.
                 */
                ret = populate_vma_page_range(vma, nstart, nend, &locked);
                if (ret < 0) {
                        if (ignore_errors) {
                                ret = 0;
                                continue;        /* continue at next VMA */
                        }
                        break;
                }
                /*
                 * ret is the number of pages handled; resume right after
                 * them, which may be before the end of this vma.
                 */
                nend = nstart + ret * PAGE_SIZE;
                ret = 0;
        }
        if (locked)
                mmap_read_unlock(mm);
        return ret;        /* 0 or negative error code */
}
#else /* CONFIG_MMU */
/*
 * !CONFIG_MMU variant: there is no page table walk or faulting here; each
 * page is derived directly from its address with virt_to_page().
 *
 * Returns the number of pages processed, or -EFAULT if the very first page
 * could not be handled.  @locked is not used by this variant.
 */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
                unsigned long nr_pages, struct page **pages,
                struct vm_area_struct **vmas, int *locked,
                unsigned int foll_flags)
{
        struct vm_area_struct *vma;
        unsigned long vm_flags;
        /*
         * Use long rather than int: the counter is compared against the
         * unsigned long nr_pages and is the function's long return value,
         * so an int could overflow for very large requests.
         */
        long i;

        /* calculate required read or write permissions.
         * If FOLL_FORCE is set, we only require the "MAY" flags.
         */
        vm_flags  = (foll_flags & FOLL_WRITE) ?
                        (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
        vm_flags &= (foll_flags & FOLL_FORCE) ?
                        (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

        for (i = 0; i < nr_pages; i++) {
                vma = find_vma(mm, start);
                if (!vma)
                        goto finish_or_fault;

                /* protect what we can, including chardevs */
                if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
                    !(vm_flags & vma->vm_flags))
                        goto finish_or_fault;

                if (pages) {
                        pages[i] = virt_to_page(start);
                        /* Take a reference only if a page was obtained. */
                        if (pages[i])
                                get_page(pages[i]);
                }
                if (vmas)
                        vmas[i] = vma;
                /* Advance to the start of the next page. */
                start = (start + PAGE_SIZE) & PAGE_MASK;
        }

        return i;

finish_or_fault:
        /* Partial progress is reported as a count; none at all as -EFAULT. */
        return i ? : -EFAULT;
}
#endif /* !CONFIG_MMU */

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save diskspace.
 *
 * Called without mmap_lock (takes and releases the mmap_lock by itself).
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
        struct mm_struct *mm = current->mm;
        struct page *dump_page;
        int still_locked = 1;
        int nr;

        /* Give up if interrupted while waiting for the mmap_lock. */
        if (mmap_read_lock_killable(mm) != 0)
                return NULL;

        nr = __get_user_pages_locked(mm, addr, 1, &dump_page, NULL,
                                     &still_locked,
                                     FOLL_FORCE | FOLL_DUMP | FOLL_GET);

        /* The helper clears the flag when it has already dropped the lock. */
        if (still_locked)
                mmap_read_unlock(mm);

        /* Anything other than exactly one page counts as failure. */
        if (nr != 1)
                return NULL;
        return dump_page;
}
#endif /* CONFIG_ELF_CORE */

#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
/*
 * Return true if any of the first @nr_pages entries of @vmas satisfies
 * vma_is_fsdax().  Runs of identical consecutive entries are tested once.
 */
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
        struct vm_area_struct *last_seen = NULL;
        long idx = 0;

        while (idx < nr_pages) {
                struct vm_area_struct *cur = vmas[idx++];

                /* Only test an entry that differs from the previous one. */
                if (cur != last_seen) {
                        last_seen = cur;
                        if (vma_is_fsdax(cur))
                                return true;
                }
        }
        return false;
}

#ifdef CONFIG_CMA
/*
 * Scan the @nr_pages pages just obtained in @pages and try to move every
 * page that is_migrate_cma_page() reports as CMA out of the way.
 *
 * Each pass isolates the CMA pages onto a local list.  If the list is
 * non-empty, the references on all of @pages are dropped, the list is
 * migrated, and the range starting at @start is pinned again with
 * @gup_flags; then the scan restarts.
 *
 * Returns the number of pages currently held in @pages once a pass finds
 * nothing to migrate and no isolation failure, or a value <= 0 when
 * migration or re-pinning fails (a positive migrate_pages() result is
 * converted to -ENOMEM).
 *
 * NOTE(review): when a pass records isolation errors but the list stays
 * empty, the code jumps straight back to check_again without dropping the
 * references or waiting - confirm this retry cannot spin indefinitely.
 */
static long check_and_migrate_cma_pages(struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long nr_pages,
                                        struct page **pages,
                                        struct vm_area_struct **vmas,
                                        unsigned int gup_flags)
{
        unsigned long i, isolation_error_count;
        bool drain_allow;
        LIST_HEAD(cma_page_list);
        long ret = nr_pages;
        struct page *prev_head, *head;
        struct migration_target_control mtc = {
                .nid = NUMA_NO_NODE,
                .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
        };

check_again:
        /* Per-pass state: reset on every rescan. */
        prev_head = NULL;
        isolation_error_count = 0;
        drain_allow = true;
        for (i = 0; i < nr_pages; i++) {
                /* Subpages of one compound page are handled once, via the head. */
                head = compound_head(pages[i]);
                if (head == prev_head)
                        continue;
                prev_head = head;
                /*
                 * If we get a page from the CMA zone, since we are going to
                 * be pinning these entries, we might as well move them out
                 * of the CMA zone if possible.
                 */
                if (is_migrate_cma_page(head)) {
                        if (PageHuge(head)) {
                                /* A non-zero result is counted as an isolation error. */
                                if (isolate_hugetlb(head, &cma_page_list))
                                        isolation_error_count++;
                        } else {
                                /* Drain the LRU caches at most once per pass. */
                                if (!PageLRU(head) && drain_allow) {
                                        lru_add_drain_all();
                                        drain_allow = false;
                                }

                                if (isolate_lru_page(head)) {
                                        isolation_error_count++;
                                        continue;
                                }
                                list_add_tail(&head->lru, &cma_page_list);
                                /* Account the isolated page(s) as anon or file. */
                                mod_node_page_state(page_pgdat(head),
                                                    NR_ISOLATED_ANON +
                                                    page_is_file_lru(head),
                                                    thp_nr_pages(head));
                        }
                }
        }

        /*
         * If list is empty, and no isolation errors, means that all pages are
         * in the correct zone.
         */
        if (list_empty(&cma_page_list) && !isolation_error_count)
                return ret;

        if (!list_empty(&cma_page_list)) {
                /*
                 * Drop the references taken by the preceding get_user_pages
                 * call, using the release that matches how they were taken.
                 */
                if (gup_flags & FOLL_PIN)
                        unpin_user_pages(pages, nr_pages);
                else
                        for (i = 0; i < nr_pages; i++)
                                put_page(pages[i]);

                ret = migrate_pages(&cma_page_list, alloc_migration_target,
                                    NULL, (unsigned long)&mtc, MIGRATE_SYNC,
                                    MR_CONTIG_RANGE);
                if (ret) {
                        /* Return whatever is still on the list before bailing out. */
                        if (!list_empty(&cma_page_list))
                                putback_movable_pages(&cma_page_list);
                        return ret > 0 ? -ENOMEM : ret;
                }

                /* We unpinned pages before migration, pin them again */
                ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
                                              NULL, gup_flags);
                if (ret <= 0)
                        return ret;
                /* The re-pin may cover fewer pages; rescan only those. */
                nr_pages = ret;
        }

        /*
         * check again because pages were unpinned, and we also might have
         * had isolation errors and need more pages to migrate.
         */
        goto check_again;
}
#else
/*
 * !CONFIG_CMA variant: there is nothing to migrate, so every one of the
 * @nr_pages pages is reported back unchanged.
 */
static long check_and_migrate_cma_pages(struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long nr_pages,
                                        struct page **pages,
                                        struct vm_area_struct **vmas,
                                        unsigned int gup_flags)
{
        return nr_pages;
}
#endif /* CONFIG_CMA */

/*
 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
 * allows us to process the FOLL_LONGTERM flag.
 */
static long __gup_longterm_locked(struct mm_struct *mm,
                                  unsigned long start,
                                  unsigned long nr_pages,
                                  struct page **pages,
                                  struct vm_area_struct **vmas,
                                  unsigned int gup_flags)
{
        struct vm_area_struct **vmas_tmp = vmas;
        unsigned long flags = 0;
        long rc, i;

        if (gup_flags & FOLL_LONGTERM) {
                if (!pages)
                        return -EINVAL;

                if (!vmas_tmp) {
                        vmas_tmp = kcalloc(nr_pages,
                                           sizeof(struct vm_area_struct *),
                                           GFP_KERNEL);
                        if (!vmas_tmp)
                                return -ENOMEM;
                }
                flags = memalloc_nocma_save();
        }

        rc = __get_user_pages_locked(mm, start, nr_pages, pages,
                                     vmas_tmp, NULL, gup_flags);

        if (gup_flags & FOLL_LONGTERM) {
                if (rc < 0)
                        goto out;

                if (check_dax_vmas(vmas_tmp, rc)) {
                        if (gup_flags & FOLL_PIN)
                                unpin_user_pages(pages, rc);
                        else
                                for (i = 0; i < rc; i++)
                                        put_page(pages[i]);
                        rc = -EOPNOTSUPP;
                        goto out;
                }

                rc = check_and_migrate_cma_pages(mm, start, rc, pages,
                                                 vmas_tmp, gup_flags);
out:
                memalloc_nocma_restore(flags);
        }

        if (vmas_tmp != vmas)
                kfree(vmas_tmp);
        return rc;
}
#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
/*
 * Variant built when neither CONFIG_FS_DAX nor CONFIG_CMA is enabled: no
 * extra FOLL_LONGTERM processing is done here, the request is forwarded
 * unchanged to __get_user_pages_locked() with a NULL locked pointer.
 */
static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
                                                  unsigned long start,
                                                  unsigned long nr_pages,
                                                  struct page **pages,
                                                  struct vm_area_struct **vmas,
                                                  unsigned int flags)
{
        return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
                                       NULL, flags);
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */

/*
 * Validate flags handed to the get_user_pages*() entry points.  Returns
 * false (after a one-time warning) if the caller passed FOLL_PIN or
 * FOLL_LONGTERM, true otherwise.
 */
static bool is_valid_gup_flags(unsigned int gup_flags)
{
        /*
         * FOLL_PIN is reserved for the pin_user_pages*() APIs, which set it
         * internally; a caller must never pass it directly.
         */
        if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
                return false;

        /*
         * FOLL_LONGTERM is a more restrictive, special case of FOLL_PIN and
         * requires it, so it is rejected here as well.
         */
        return !WARN_ON_ONCE(gup_flags & FOLL_LONGTERM);
}

#ifdef CONFIG_MMU
/*
 * Common implementation behind get_user_pages_remote(): always adds
 * FOLL_TOUCH and FOLL_REMOTE to @gup_flags before dispatching.
 */
static long __get_user_pages_remote(struct mm_struct *mm,
                                    unsigned long start, unsigned long nr_pages,
                                    unsigned int gup_flags, struct page **pages,
                                    struct vm_area_struct **vmas, int *locked)
{
        const unsigned int remote_flags = gup_flags | FOLL_TOUCH | FOLL_REMOTE;

        if (!(gup_flags & FOLL_LONGTERM))
                return __get_user_pages_locked(mm, start, nr_pages, pages,
                                               vmas, locked, remote_flags);

        /*
         * Parts of the FOLL_LONGTERM behaviour (the FS DAX check on the
         * vmas) cannot be combined with FAULT_FLAG_ALLOW_RETRY.  That only
         * matters when a locked pointer is supplied, and some callers do
         * request FOLL_LONGTERM without one, so allow exactly that case.
         */
        if (WARN_ON_ONCE(locked))
                return -EINVAL;

        /*
         * The vmas are checked even when our vmas argument is NULL, and
         * -EOPNOTSUPP comes back if DAX is not allowed here.
         */
        return __gup_longterm_locked(mm, start, nr_pages, pages, vmas,
                                     remote_flags);
}

/**
 * get_user_pages_remote() - pin user pages in memory
 * @mm:                mm_struct of target mm
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying lookup behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long. Or NULL, if caller
 *                only intends to ensure the pages are faulted in.
 * @vmas:        array of pointers to vmas corresponding to each page.
 *                Or NULL if the caller does not require them.
 * @locked:        pointer to lock flag indicating whether lock is held and
 *                subsequently whether VM_FAULT_RETRY functionality can be
 *                utilised. Lock must initially be held.
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * @vmas are valid only as long as mmap_lock is held.
 *
 * Must be called with mmap_lock held for read or write.
 *
 * get_user_pages_remote walks a process's page tables and takes a reference
 * to each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages_remote returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
 * be called after the page is finished with, and before put_page is called.
 *
 * get_user_pages_remote is typically used for fewer-copy IO operations,
 * to get a handle on the memory by some means other than accesses
 * via the user virtual addresses. The pages may be submitted for
 * DMA to devices or accessed via their kernel linear mapping (via the
 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 *
 * get_user_pages_remote should be phased out in favor of
 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 * should use get_user_pages_remote because it cannot pass
 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
 */
long get_user_pages_remote(struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages,
                struct vm_area_struct **vmas, int *locked)
{
        long nr_pinned = -EINVAL;

        /* FOLL_PIN and FOLL_LONGTERM may not be passed through this API. */
        if (is_valid_gup_flags(gup_flags))
                nr_pinned = __get_user_pages_remote(mm, start, nr_pages,
                                                    gup_flags, pages, vmas,
                                                    locked);

        return nr_pinned;
}
EXPORT_SYMBOL(get_user_pages_remote);

#else /* CONFIG_MMU */
/*
 * !CONFIG_MMU variant: no lookup is performed and 0 (no pages) is returned
 * for every request.
 */
long get_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           struct vm_area_struct **vmas, int *locked)
{
        return 0;
}

/*
 * !CONFIG_MMU variant of the internal helper: always returns 0 (no pages)
 * without touching any of its arguments.
 */
static long __get_user_pages_remote(struct mm_struct *mm,
                                    unsigned long start, unsigned long nr_pages,
                                    unsigned int gup_flags, struct page **pages,
                                    struct vm_area_struct **vmas, int *locked)
{
        return 0;
}
#endif /* !CONFIG_MMU */

/**
 * get_user_pages() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying lookup behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long. Or NULL, if caller
 *              only intends to ensure the pages are faulted in.
 * @vmas:       array of pointers to vmas corresponding to each page.
 *              Or NULL if the caller does not require them.
 *
 * This is the same as get_user_pages_remote(), just with a less-flexible
 * calling convention where we assume that the mm being operated on belongs to
 * the current task, and doesn't allow passing of a locked parameter.  We also
 * obviously don't pass FOLL_REMOTE in here.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages,
                struct vm_area_struct **vmas)
{
        /*
         * Reject FOLL_PIN / FOLL_LONGTERM from callers; otherwise operate on
         * the current task's mm with FOLL_TOUCH added.
         */
        return is_valid_gup_flags(gup_flags) ?
                __gup_longterm_locked(current->mm, start, nr_pages,
                                      pages, vmas, gup_flags | FOLL_TOUCH) :
                -EINVAL;
}
EXPORT_SYMBOL(get_user_pages);

/**
 * get_user_pages_locked() is suitable to replace the form:
 *
 *      mmap_read_lock(mm);
 *      do_something()
 *      get_user_pages(..., pages, NULL);
 *      mmap_read_unlock(mm);
 *
 *  to:
 *
 *      int locked = 1;
 *      mmap_read_lock(mm);
 *      do_something()
 *      get_user_pages_locked(..., pages, &locked);
 *      if (locked)
 *          mmap_read_unlock(mm);
 *
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying lookup behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long. Or NULL, if caller
 *              only intends to ensure the pages are faulted in.
 * @locked:     pointer to lock flag indicating whether lock is held and
 *              subsequently whether VM_FAULT_RETRY functionality can be
 *              utilised. Lock must initially be held.
 *
 * We can leverage the VM_FAULT_RETRY functionality in the page fault
 * paths better by using either get_user_pages_locked() or
 * get_user_pages_unlocked().
 *
 */
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked)
{
        /*
         * Two flag checks, evaluated in this order:
         *
         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
         * vmas.  As there are no users of this flag in this call we simply
         * disallow this option for now.
         *
         * FOLL_PIN must only be set internally by the pin_user_pages*()
         * APIs, never directly by the caller.
         */
        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM) ||
            WARN_ON_ONCE(gup_flags & FOLL_PIN))
                return -EINVAL;

        return __get_user_pages_locked(current->mm, start, nr_pages,
                                       pages, NULL, locked,
                                       gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);

/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *      mmap_read_lock(mm);
 *      get_user_pages(..., pages, NULL);
 *      mmap_read_unlock(mm);
 *
 *  with:
 *
 *      get_user_pages_unlocked(..., pages, gup_flags);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                             struct page **pages, unsigned int gup_flags)
{
        struct mm_struct *mm;
        int still_locked = 1;
        long nr_pinned;

        /*
         * FIXME: Current FOLL_LONGTERM behavior is incompatible with
         * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
         * vmas.  As there are no users of this flag in this call we simply
         * disallow this option for now.
         */
        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
                return -EINVAL;

        mm = current->mm;
        mmap_read_lock(mm);
        nr_pinned = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
                                            &still_locked,
                                            gup_flags | FOLL_TOUCH);
        /* Unlock only if the helper did not already drop mmap_lock. */
        if (still_locked)
                mmap_read_unlock(mm);

        return nr_pinned;
}
EXPORT_SYMBOL(get_user_pages_unlocked);

/*
 * Fast GUP
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up page table containing pages
 * belonging to more than one mm_user, then rcu_sched a callback to free those
 * pages. Disabling interrupts will allow the fast_gup walker to both block
 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
 * (which is a relatively rare event). The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
 *  free pages containing page tables or TLB flushing requires IPI broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_FAST_GUP
#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH

/*
 * WARNING: only to be used in the get_user_pages_fast() implementation.
 *
 * With get_user_pages_fast(), we walk down the pagetables without taking any
 * locks.  For this we would like to load the pointers atomically, but sometimes
 * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
 * we do have is the guarantee that a PTE will only either go from not present
 * to present, or present to not present or both -- it will not switch to a
 * completely different present page without a TLB flush in between; something
 * that we are blocking by holding interrupts off.
 *
 * Setting ptes from not present to present goes:
 *
 *   ptep->pte_high = h;
 *   smp_wmb();
 *   ptep->pte_low = l;
 *
 * And present to not present goes:
 *
 *   ptep->pte_low = 0;
 *   smp_wmb();
 *   ptep->pte_high = 0;
 *
 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
 * We load pte_high *after* loading pte_low, which ensures we don't see an older
 * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
 * picked up a changed pte high. We might have gotten rubbish values from
 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
 * operates on present ptes we're safe.
 */
static inline pte_t gup_get_pte(pte_t *ptep)
{
        pte_t pte;

        do {
                /* Read the low word first, then the high word. */
                pte.pte_low = ptep->pte_low;
                smp_rmb();
                pte.pte_high = ptep->pte_high;
                smp_rmb();
                /* Retry if the low word changed while the high word was read. */
        } while (unlikely(pte.pte_low != ptep->pte_low));

        return pte;
}
#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
/*
 * We require that the PTE can be read atomically.
 */
static inline pte_t gup_get_pte(pte_t *ptep)
{
        /* Single read via ptep_get(); no low/high retry loop in this variant. */
        return ptep_get(ptep);
}
#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */

/*
 * Roll back the entries recorded in @pages from index *@nr - 1 down to
 * @nr_start: clear the referenced bit and drop the reference on each page
 * (unpin when FOLL_PIN is set in @flags, put_page() otherwise).  On return
 * *@nr equals @nr_start.
 */
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
                                            unsigned int flags,
                                            struct page **pages)
{
        while (*nr != nr_start) {
                struct page *victim;

                /* Walk backwards, shrinking the count as we go. */
                (*nr)--;
                victim = pages[*nr];

                ClearPageReferenced(victim);
                if (flags & FOLL_PIN) {
                        unpin_user_page(victim);
                } else {
                        put_page(victim);
                }
        }
}

#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
/*
 * Fast-gup relies on pte change detection to avoid concurrent pgtable
 * operations.
 *
 * To pin the page, fast-gup needs to do below in order:
 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
 *
 * For the rest of pgtable operations where pgtable updates can be racy
 * with fast-gup, we need to do (1) clear pte, then (2) check whether page
 * is pinned.
 *
 * Above will work for all pte-level operations, including THP split.
 *
 * For THP collapse, it's a bit more complicated because fast-gup may be
 * walking a pgtable page that is being freed (pte is still valid but pmd
 * can be cleared already).  To avoid race in such condition, we need to
 * also check pmd here to make sure pmd doesn't change (corresponds to
 * pmdp_collapse_flush() in the THP collapse code path).
 *
 * Returns 1 when every pte in [@addr, @end) was grabbed and stored in
 * @pages (advancing *@nr by one per page), and 0 as soon as one pte cannot
 * be handled here.  On a 0 return the pages recorded so far stay referenced
 * and counted in *@nr, except on the dev_pagemap lookup failure path, which
 * rolls back to the count seen on entry.
 */
static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
                         unsigned long end, unsigned int flags,
                         struct page **pages, int *nr)
{
        struct dev_pagemap *pgmap = NULL;
        int nr_start = *nr, ret = 0;
        pte_t *ptep, *ptem;

        ptem = ptep = pte_offset_map(&pmd, addr);
        do {
                pte_t pte = gup_get_pte(ptep);
                struct page *head, *page;

                /*
                 * Similar to the PMD case below, NUMA hinting must take slow
                 * path using the pte_protnone check.
                 */
                if (pte_protnone(pte))
                        goto pte_unmap;

                if (!pte_access_permitted(pte, flags & FOLL_WRITE))
                        goto pte_unmap;

                if (pte_devmap(pte)) {
                        /* Device-mapped pages are never taken for FOLL_LONGTERM. */
                        if (unlikely(flags & FOLL_LONGTERM))
                                goto pte_unmap;

                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
                        if (unlikely(!pgmap)) {
                                undo_dev_pagemap(nr, nr_start, flags, pages);
                                goto pte_unmap;
                        }
                } else if (pte_special(pte))
                        goto pte_unmap;

                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);

                head = try_grab_compound_head(page, 1, flags);
                if (!head)
                        goto pte_unmap;

                /*
                 * Re-check after taking the reference: if either the pmd or
                 * the pte changed underneath us, drop the reference again.
                 */
                if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
                    unlikely(pte_val(pte) != pte_val(*ptep))) {
                        put_compound_head(head, 1, flags);
                        goto pte_unmap;
                }

                VM_BUG_ON_PAGE(compound_head(page) != head, page);

                /*
                 * We need to make the page accessible if and only if we are
                 * going to access its content (the FOLL_PIN case).  Please
                 * see Documentation/core-api/pin_user_pages.rst for
                 * details.
                 *
                 * The error code is deliberately not stored in ret: this
                 * function reports success as 1 and failure as 0, so
                 * returning a non-zero errno from here would look like
                 * success to a caller that tests for zero.
                 */
                if (flags & FOLL_PIN) {
                        if (arch_make_page_accessible(page)) {
                                unpin_user_page(page);
                                goto pte_unmap;
                        }
                }
                SetPageReferenced(page);
                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);

        ret = 1;

pte_unmap:
        if (pgmap)
                put_dev_pagemap(pgmap);
        pte_unmap(ptem);
        return ret;
}
#else

/*
 * If we can't determine whether or not a pte is special, then fail immediately
 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
 * to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
 * useful to have gup_huge_pmd even if we can't operate on ptes.
 */
static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
                         unsigned long end, unsigned int flags,
                         struct page **pages, int *nr)
{
        /* Without pte_special support no pte is handled here: always fail. */
        return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * Grab a reference on each device page backing [@addr, @end), starting at
 * @pfn, and record the pages in @pages (advancing *@nr by one per page).
 *
 * Returns 1 on success.  Returns 0 if the dev_pagemap lookup or the page
 * grab fails for any page; in that case every page recorded by this call is
 * released again via undo_dev_pagemap() and *@nr is back at its entry value.
 *
 * The dev_pagemap reference held at exit is dropped on the failure paths as
 * well as on success, so a failed try_grab_page() no longer leaks it.
 */
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
                             unsigned long end, unsigned int flags,
                             struct page **pages, int *nr)
{
        int nr_start = *nr;
        struct dev_pagemap *pgmap = NULL;
        int ret = 1;

        do {
                struct page *page = pfn_to_page(pfn);

                pgmap = get_dev_pagemap(pfn, pgmap);
                if (unlikely(!pgmap)) {
                        undo_dev_pagemap(nr, nr_start, flags, pages);
                        ret = 0;
                        break;
                }
                SetPageReferenced(page);
                pages[*nr] = page;
                if (unlikely(!try_grab_page(page, flags))) {
                        /* pgmap is still held here; released below. */
                        undo_dev_pagemap(nr, nr_start, flags, pages);
                        ret = 0;
                        break;
                }
                (*nr)++;
                pfn++;
        } while (addr += PAGE_SIZE, addr != end);

        if (pgmap)
                put_dev_pagemap(pgmap);
        return ret;
}

/*
 * Grab the device pages mapped by the huge pmd @orig for [@addr, @end).
 * Returns 1 on success, 0 on failure; if the pmd changed while the pages
 * were being grabbed, everything recorded by this call is undone.
 */
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                                 unsigned long end, unsigned int flags,
                                 struct page **pages, int *nr)
{
        const int first = *nr;
        /* pfn of the subpage that corresponds to @addr within the pmd. */
        const unsigned long pfn = pmd_pfn(orig) +
                                  ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        int ok = __gup_device_huge(pfn, addr, end, flags, pages, nr);

        if (ok && unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
                undo_dev_pagemap(nr, first, flags, pages);
                ok = 0;
        }

        return ok;
}

/*
 * Grab the device pages mapped by the huge pud @orig for [@addr, @end).
 * Returns 1 on success, 0 on failure; if the pud changed while the pages
 * were being grabbed, everything recorded by this call is undone.
 */
static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                                 unsigned long end, unsigned int flags,
                                 struct page **pages, int *nr)
{
        const int first = *nr;
        /* pfn of the subpage that corresponds to @addr within the pud. */
        const unsigned long pfn = pud_pfn(orig) +
                                  ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        int ok = __gup_device_huge(pfn, addr, end, flags, pages, nr);

        if (ok && unlikely(pud_val(orig) != pud_val(*pudp))) {
                undo_dev_pagemap(nr, first, flags, pages);
                ok = 0;
        }

        return ok;
}
#else
/*
 * Placeholder used when CONFIG_ARCH_HAS_PTE_DEVMAP and
 * CONFIG_TRANSPARENT_HUGEPAGE are not both enabled.
 * NOTE(review): BUILD_BUG() presumably fails the build if a call to this
 * function is not optimised away - confirm against the macro definition.
 */
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                                 unsigned long end, unsigned int flags,
                                 struct page **pages, int *nr)
{
        BUILD_BUG();
        return 0;
}

/*
 * Placeholder used when CONFIG_ARCH_HAS_PTE_DEVMAP and
 * CONFIG_TRANSPARENT_HUGEPAGE are not both enabled.
 * NOTE(review): BUILD_BUG() presumably fails the build if a call to this
 * function is not optimised away - confirm against the macro definition.
 */
static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
                                 unsigned long end, unsigned int flags,
                                 struct page **pages, int *nr)
{
        BUILD_BUG();
        return 0;
}
#endif

/*
 * Store consecutive struct page pointers, starting at @page, into @pages -
 * one per PAGE_SIZE step from @addr until @end is reached.  Returns the
 * number of entries written.  No references are taken here.
 */
static int record_subpages(struct page *page, unsigned long addr,
                           unsigned long end, struct page **pages)
{
        int count = 0;

        while (addr != end) {
                pages[count] = page;
                count++;
                page++;
                addr += PAGE_SIZE;
        }

        return count;
}

#ifdef CONFIG_ARCH_HAS_HUGEPD
/*
 * Return the end of the current step when walking [@addr, @end) in units of
 * @sz: the next @sz-aligned boundary above @addr, or @end if that comes
 * first.  Both sides of the comparison are decremented so that a value of 0
 * (an address that wrapped past the top of the address space) compares as
 * the largest value instead of the smallest.
 */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                      unsigned long sz)
{
        const unsigned long next_boundary = (addr + sz) & ~(sz - 1);

        if (next_boundary - 1 < end - 1)
                return next_boundary;

        return end;
}

/*
 * Grab the subpages of the huge page mapped by *@ptep (of size @sz) that
 * fall inside [@addr, @end), with @end clamped to the end of this huge
 * page.  On success the subpages are in @pages, *@nr is advanced and 1 is
 * returned.  Returns 0 if access is not permitted, the reference cannot be
 * taken, or the pte changed after the reference was taken.
 */
static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                       unsigned long end, unsigned int flags,
                       struct page **pages, int *nr)
{
        const unsigned long limit = (addr + sz) & ~(sz - 1);
        struct page *base, *grabbed;
        pte_t entry;
        int nr_refs;

        if (limit < end)
                end = limit;

        entry = huge_ptep_get(ptep);
        if (!pte_access_permitted(entry, flags & FOLL_WRITE))
                return 0;

        /* hugepages are never "special" */
        VM_BUG_ON(!pfn_valid(pte_pfn(entry)));

        base = pte_page(entry);
        /* Fill in the subpage pointers first; references are taken next. */
        nr_refs = record_subpages(base + ((addr & (sz - 1)) >> PAGE_SHIFT),
                                  addr, end, pages + *nr);

        grabbed = try_grab_compound_head(base, nr_refs, flags);
        if (!grabbed)
                return 0;

        /* The pte changed under us: give the references back. */
        if (unlikely(pte_val(entry) != pte_val(*ptep))) {
                put_compound_head(grabbed, nr_refs, flags);
                return 0;
        }

        *nr += nr_refs;
        SetPageReferenced(grabbed);
        return 1;
}

/*
 * Walk the huge ptes of @hugepd covering [@addr, @end) and grab each one
 * via gup_hugepte().  Returns 1 if all of them succeed, 0 at the first
 * failure.
 */
static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
                unsigned int pdshift, unsigned long end, unsigned int flags,
                struct page **pages, int *nr)
{
        const unsigned long sz = 1UL << hugepd_shift(hugepd);
        pte_t *ptep = hugepte_offset(hugepd, addr, pdshift);
        unsigned long next;

        for (;;) {
                next = hugepte_addr_end(addr, end, sz);
                if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
                        return 0;

                ptep++;
                addr = next;
                if (addr == end)
                        break;
        }

        return 1;
}
#else
/* !CONFIG_ARCH_HAS_HUGEPD variant: nothing is grabbed, always returns 0. */
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
                unsigned int pdshift, unsigned long end, unsigned int flags,
                struct page **pages, int *nr)
{
        return 0;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD */

/*
 * Handle a huge PMD entry during the lockless walk.
 *
 * @orig is the value the caller sampled from @pmdp. Devmap entries are
 * delegated to __gup_device_huge_pmd(), except that FOLL_LONGTERM requests
 * are refused for them. Otherwise the subpages in [addr, end) are recorded
 * at pages + *nr and references are requested on the compound head.
 *
 * Returns 1 on success (with *nr advanced), 0 on failure.
 */
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                        unsigned long end, unsigned int flags,
                        struct page **pages, int *nr)
{
        struct page *head, *page;
        int refs;

        if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
                return 0;

        if (pmd_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
                return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
                                             pages, nr);
        }

        /* Subpage that maps addr: offset within the PMD in base pages. */
        page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);

        head = try_grab_compound_head(pmd_page(orig), refs, flags);
        if (!head)
                return 0;

        /* The entry changed after it was sampled: undo and fail. */
        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
                put_compound_head(head, refs, flags);
                return 0;
        }

        *nr += refs;
        SetPageReferenced(head);
        return 1;
}

/*
 * Handle a huge PUD entry during the lockless walk.
 *
 * @orig is the value the caller sampled from @pudp. Devmap entries are
 * delegated to __gup_device_huge_pud(), except that FOLL_LONGTERM requests
 * are refused for them. Otherwise the subpages in [addr, end) are recorded
 * at pages + *nr and references are requested on the compound head.
 *
 * Returns 1 on success (with *nr advanced), 0 on failure.
 */
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                        unsigned long end, unsigned int flags,
                        struct page **pages, int *nr)
{
        struct page *head, *page;
        int refs;

        if (!pud_access_permitted(orig, flags & FOLL_WRITE))
                return 0;

        if (pud_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
                return __gup_device_huge_pud(orig, pudp, addr, end, flags,
                                             pages, nr);
        }

        /* Subpage that maps addr: offset within the PUD in base pages. */
        page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);

        head = try_grab_compound_head(pud_page(orig), refs, flags);
        if (!head)
                return 0;

        /* The entry changed after it was sampled: undo and fail. */
        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
                put_compound_head(head, refs, flags);
                return 0;
        }

        *nr += refs;
        SetPageReferenced(head);
        return 1;
}

/*
 * Handle a huge PGD entry during the lockless walk.
 *
 * @orig is the value the caller sampled from @pgdp. The subpages in
 * [addr, end) are recorded at pages + *nr and references are requested on
 * the compound head.
 *
 * Returns 1 on success (with *nr advanced), 0 on failure.
 */
static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                        unsigned long end, unsigned int flags,
                        struct page **pages, int *nr)
{
        int refs;
        struct page *head, *page;

        if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
                return 0;

        /* Build-time check: pgd_devmap() must evaluate to false here. */
        BUILD_BUG_ON(pgd_devmap(orig));

        /* Subpage that maps addr: offset within the PGD in base pages. */
        page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);

        head = try_grab_compound_head(pgd_page(orig), refs, flags);
        if (!head)
                return 0;

        /* The entry changed after it was sampled: undo and fail. */
        if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
                put_compound_head(head, refs, flags);
                return 0;
        }

        *nr += refs;
        SetPageReferenced(head);
        return 1;
}

/*
 * Walk the PMD entries below @pud that cover [addr, end).
 *
 * Each entry is read once with READ_ONCE() and dispatched on its kind:
 * huge/devmap PMDs go to gup_huge_pmd(), hugepd entries to gup_huge_pd(),
 * and everything else to gup_pte_range().
 *
 * Returns 1 if the whole range was handled, 0 at the first entry that is
 * not present or that a helper could not handle.
 */
static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
                unsigned int flags, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp;

        pmdp = pmd_offset_lockless(pudp, pud, addr);
        do {
                pmd_t pmd = READ_ONCE(*pmdp);

                next = pmd_addr_end(addr, end);
                if (!pmd_present(pmd))
                        return 0;

                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
                             pmd_devmap(pmd))) {
                        /*
                         * NUMA hinting faults need to be handled in the GUP
                         * slowpath for accounting purposes and so that they
                         * can be serialised against THP migration.
                         */
                        if (pmd_protnone(pmd))
                                return 0;

                        if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
                                pages, nr))
                                return 0;

                } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
                        /*
                         * Some architectures use a different format for
                         * hugetlbfs pmds than for THP pmds.
                         */
                        if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
                                         PMD_SHIFT, next, flags, pages, nr))
                                return 0;
                } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

/*
 * Walk the PUD entries below @p4d that cover [addr, end).
 *
 * Each entry is read once with READ_ONCE() and dispatched on its kind:
 * huge/devmap PUDs go to gup_huge_pud(), hugepd entries to gup_huge_pd(),
 * and everything else descends into gup_pmd_range().
 *
 * Returns 1 if the whole range was handled, 0 at the first entry that is
 * not present or that a helper could not handle.
 */
static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
                         unsigned int flags, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        pudp = pud_offset_lockless(p4dp, p4d, addr);
        do {
                pud_t pud = READ_ONCE(*pudp);

                next = pud_addr_end(addr, end);
                if (unlikely(!pud_present(pud)))
                        return 0;
                if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
                        if (!gup_huge_pud(pud, pudp, addr, next, flags,
                                          pages, nr))
                                return 0;
                } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
                        if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
                                         PUD_SHIFT, next, flags, pages, nr))
                                return 0;
                } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);

        return 1;
}

/*
 * Walk the P4D entries below @pgd that cover [addr, end).
 *
 * Each entry is read once with READ_ONCE(). hugepd entries go to
 * gup_huge_pd(); everything else descends into gup_pud_range().
 *
 * Returns 1 if the whole range was handled, 0 at the first empty entry or
 * the first entry a helper could not handle.
 */
static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
                         unsigned int flags, struct page **pages, int *nr)
{
        unsigned long next;
        p4d_t *p4dp;

        p4dp = p4d_offset_lockless(pgdp, pgd, addr);
        do {
                p4d_t p4d = READ_ONCE(*p4dp);

                next = p4d_addr_end(addr, end);
                if (p4d_none(p4d))
                        return 0;
                /* Build-time check: p4d_huge() must evaluate to false here. */
                BUILD_BUG_ON(p4d_huge(p4d));
                if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
                        if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
                                         P4D_SHIFT, next, flags, pages, nr))
                                return 0;
                } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
                        return 0;
        } while (p4dp++, addr = next, addr != end);

        return 1;
}

/*
 * Top level of the lockless walk: visit the PGD entries of current->mm that
 * cover [addr, end).
 *
 * Nothing is returned. Progress is reported only through *nr, which the
 * lower levels advance as pages are recorded. The walk simply stops at the
 * first empty entry or the first entry a helper could not handle, so *nr
 * may end up smaller than the number of pages in the range.
 */
static void gup_pgd_range(unsigned long addr, unsigned long end,
                unsigned int flags, struct page **pages, int *nr)
{
        unsigned long next;
        pgd_t *pgdp;

        pgdp = pgd_offset(current->mm, addr);
        do {
                pgd_t pgd = READ_ONCE(*pgdp);

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        return;
                if (unlikely(pgd_huge(pgd))) {
                        if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
                                          pages, nr))
                                return;
                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
                                         PGDIR_SHIFT, next, flags, pages, nr))
                                return;
                } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
                        return;
        } while (pgdp++, addr = next, addr != end);
}
#else
/*
 * Stub for configurations without CONFIG_HAVE_FAST_GUP: does nothing, so
 * *nr is left untouched and no pages are recorded.
 */
static inline void gup_pgd_range(unsigned long addr, unsigned long end,
                unsigned int flags, struct page **pages, int *nr)
{
}
#endif /* CONFIG_HAVE_FAST_GUP */

#ifndef gup_fast_permitted
/*
 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
 * we need to fall back to the slow version:
 */
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
        /* Default used when no gup_fast_permitted macro exists: allow all. */
        return true;
}
#endif

/*
 * Slow-path helper for callers that do not hold mmap_lock.
 *
 * Requests without FOLL_LONGTERM are forwarded to get_user_pages_unlocked().
 * FOLL_LONGTERM requests take the mmap read lock of current->mm themselves
 * and go through __gup_longterm_locked() instead.
 *
 * Returns whatever the selected helper returned, converted to int.
 */
static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
                                   unsigned int gup_flags, struct page **pages)
{
        int ret;

        if (!(gup_flags & FOLL_LONGTERM))
                return get_user_pages_unlocked(start, nr_pages,
                                               pages, gup_flags);

        /*
         * FIXME: FOLL_LONGTERM does not work with
         * get_user_pages_unlocked() (see comments in that function), so the
         * lock is taken here and the locked variant is used.
         */
        mmap_read_lock(current->mm);
        ret = __gup_longterm_locked(current->mm, start, nr_pages,
                                    pages, NULL, gup_flags);
        mmap_read_unlock(current->mm);

        return ret;
}

/*
 * Fast path: walk the page tables of current->mm for [start, end) with
 * interrupts disabled and record the pages found in @pages.
 *
 * Returns the number of pages recorded. 0 is returned when fast GUP is not
 * built in (CONFIG_HAVE_FAST_GUP), when gup_fast_permitted() refuses the
 * range, or, for FOLL_PIN, when the write_protect_seq check fails before
 * or after the walk.
 */
static unsigned long lockless_pages_from_mm(unsigned long start,
                                            unsigned long end,
                                            unsigned int gup_flags,
                                            struct page **pages)
{
        unsigned long flags;
        int nr_pinned = 0;
        unsigned seq;

        if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
            !gup_fast_permitted(start, end))
                return 0;

        /*
         * seq is only sampled, and only read again below, for FOLL_PIN.
         * An odd value bails out immediately; presumably it means a
         * write-protect section is currently in progress.
         */
        if (gup_flags & FOLL_PIN) {
                seq = raw_read_seqcount(&current->mm->write_protect_seq);
                if (seq & 1)
                        return 0;
        }

        /*
         * Disable interrupts. The nested form is used, in order to allow full,
         * general purpose use of this routine.
         *
         * With interrupts disabled, we block page table pages from being freed
         * from under us. See struct mmu_table_batch comments in
         * include/asm-generic/tlb.h for more details.
         *
         * We do not adopt an rcu_read_lock() here as we also want to block IPIs
         * that come from THPs splitting.
         */
        local_irq_save(flags);
        gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
        local_irq_restore(flags);

        /*
         * When pinning pages for DMA there could be a concurrent write protect
         * from fork() via copy_page_range(), in this case always fail fast GUP.
         */
        if (gup_flags & FOLL_PIN) {
                if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
                        /* Drop everything the walk pinned and report nothing. */
                        unpin_user_pages(pages, nr_pinned);
                        return 0;
                }
        }
        return nr_pinned;
}

/*
 * Common implementation behind the get_user_pages_fast*() and
 * pin_user_pages_fast*() entry points.
 *
 * The lockless walk is tried first. Unless FOLL_FAST_ONLY is set, whatever
 * it could not provide is then requested from the slow path.
 *
 * Returns the number of pages now referenced through @pages, which may be
 * fewer than @nr_pages. Returns 0 for an empty request or when
 * start + length overflows, -EINVAL for unsupported flags, -EFAULT when the
 * range fails access_ok(), and the slow path's -errno only if nothing at
 * all was pinned.
 */
static int internal_get_user_pages_fast(unsigned long start,
                                        unsigned long nr_pages,
                                        unsigned int gup_flags,
                                        struct page **pages)
{
        unsigned long len, end;
        unsigned long nr_pinned;
        int ret;

        if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
                                       FOLL_FORCE | FOLL_PIN | FOLL_GET |
                                       FOLL_FAST_ONLY)))
                return -EINVAL;

        /*
         * An empty request pins nothing. It must not reach the lockless
         * walk: with nr_pages == 0 we get end == start, which
         * check_add_overflow() does not reject, and the page table walkers
         * are do/while loops that only stop on "addr != end", so they would
         * not stay within the (empty) range.
         */
        if (!nr_pages)
                return 0;

        if (gup_flags & FOLL_PIN)
                atomic_set(&current->mm->has_pinned, 1);

        /* Only callers that may fall back to the slow path take mmap_lock. */
        if (!(gup_flags & FOLL_FAST_ONLY))
                might_lock_read(&current->mm->mmap_lock);

        start = untagged_addr(start) & PAGE_MASK;
        len = nr_pages << PAGE_SHIFT;
        if (check_add_overflow(start, len, &end))
                return 0;
        if (unlikely(!access_ok((void __user *)start, len)))
                return -EFAULT;

        nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
        if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
                return nr_pinned;

        /* Slow path: try to get the remaining pages with get_user_pages */
        start += nr_pinned << PAGE_SHIFT;
        pages += nr_pinned;
        ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
                                      pages);
        if (ret < 0) {
                /*
                 * The caller has to unpin the pages we already pinned so
                 * returning -errno is not an option
                 */
                if (nr_pinned)
                        return nr_pinned;
                return ret;
        }
        return ret + nr_pinned;
}

/**
 * get_user_pages_fast_only() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
 * the regular GUP.
 * Note a difference with get_user_pages_fast: this always returns the
 * number of pages pinned, 0 if no pages were pinned.
 *
 * If the architecture does not support this function, simply return with no
 * pages pinned.
 *
 * Careful, careful! COW breaking can go either way, so a non-write
 * access can get ambiguous page results. If you call this function without
 * 'write' set, you'd better be sure that you're ok with that ambiguity.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages)
{
        int ret;

        /*
         * Inside mm/gup.c every gup-fast variant carries FOLL_GET, since the
         * request always means "take a +1 page refcount". FOLL_FAST_ONLY
         * gives this routine its documented behaviour: the regular ("slow")
         * GUP is never used as a fallback.
         */
        ret = internal_get_user_pages_fast(start, nr_pages,
                                           gup_flags | FOLL_GET | FOLL_FAST_ONLY,
                                           pages);

        /*
         * The common core routine may return -errno, but this API is
         * specified to return only the number of pages pinned, so any
         * negative result is reported as 0.
         */
        return ret < 0 ? 0 : ret;
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_lock.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number requested.
 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
 * -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages)
{
        /* Flag combinations refused by is_valid_gup_flags() are an error. */
        if (!is_valid_gup_flags(gup_flags))
                return -EINVAL;

        /*
         * FOLL_GET is forced on here whether or not the caller passed it:
         * inside mm/gup.c a gup-fast request always means "take a +1 page
         * refcount".
         */
        return internal_get_user_pages_fast(start, nr_pages,
                                            gup_flags | FOLL_GET, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);

/**
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 *
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
 * get_user_pages_fast() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for further details.
 */
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages)
{
        /* A request cannot be both FOLL_GET and FOLL_PIN. */
        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
                return -EINVAL;

        return internal_get_user_pages_fast(start, nr_pages,
                                            gup_flags | FOLL_PIN, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);

/*
 * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
 * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
 *
 * The API rules are the same, too: no negative values may be returned.
 */
int pin_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages)
{
        int ret;

        /*
         * A request cannot be both FOLL_GET and FOLL_PIN. The API rules
         * require 0 here rather than -errno.
         */
        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
                return 0;

        /*
         * FOLL_FAST_ONLY gives this routine its documented behaviour: the
         * regular ("slow") GUP is never used as a fallback.
         */
        ret = internal_get_user_pages_fast(start, nr_pages,
                                           gup_flags | FOLL_PIN | FOLL_FAST_ONLY,
                                           pages);

        /*
         * internal_get_user_pages_fast() may return -errno, but negative
         * values must not escape from this routine.
         */
        return ret < 0 ? 0 : ret;
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);

/**
 * pin_user_pages_remote() - pin pages of a remote process
 *
 * @mm:                mm_struct of target mm
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying lookup behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long. Or NULL, if caller
 *                only intends to ensure the pages are faulted in.
 * @vmas:        array of pointers to vmas corresponding to each page.
 *                Or NULL if the caller does not require them.
 * @locked:        pointer to lock flag indicating whether lock is held and
 *                subsequently whether VM_FAULT_RETRY functionality can be
 *                utilised. Lock must initially be held.
 *
 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 * get_user_pages_remote() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 */
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           struct vm_area_struct **vmas, int *locked)
{
        /* A request cannot be both FOLL_GET and FOLL_PIN. */
        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
                return -EINVAL;

        return __get_user_pages_remote(mm, start, nr_pages,
                                       gup_flags | FOLL_PIN,
                                       pages, vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);

/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying lookup behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long. Or NULL, if caller
 *                only intends to ensure the pages are faulted in.
 * @vmas:        array of pointers to vmas corresponding to each page.
 *                Or NULL if the caller does not require them.
 *
 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 * FOLL_PIN is set.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages,
                    struct vm_area_struct **vmas)
{
        /* A request cannot be both FOLL_GET and FOLL_PIN. */
        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
                return -EINVAL;

        return __gup_longterm_locked(current->mm, start, nr_pages,
                                     pages, vmas, gup_flags | FOLL_PIN);
}
EXPORT_SYMBOL(pin_user_pages);

/*
 * pin_user_pages_unlocked() is the FOLL_PIN variant of
 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
 * FOLL_PIN and rejects FOLL_GET.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                             struct page **pages, unsigned int gup_flags)
{
        /* A request cannot be both FOLL_GET and FOLL_PIN. */
        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
                return -EINVAL;

        return get_user_pages_unlocked(start, nr_pages, pages,
                                       gup_flags | FOLL_PIN);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);

/*
 * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
 * Behavior is the same, except that this one sets FOLL_PIN and rejects
 * FOLL_GET.
 */
long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked)
{
        /*
         * FIXME: FOLL_LONGTERM as it behaves today cannot be combined with
         * FAULT_FLAG_ALLOW_RETRY, because of the FS DAX check requirement on
         * vmas. Nobody passes this flag to this call, so it is simply
         * refused for now.
         */
        if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
                return -EINVAL;

        /* A request cannot be both FOLL_GET and FOLL_PIN. */
        if (WARN_ON_ONCE(gup_flags & FOLL_GET))
                return -EINVAL;

        /* Pin the pages and additionally ask for FOLL_TOUCH. */
        gup_flags |= FOLL_PIN | FOLL_TOUCH;
        return __get_user_pages_locked(current->mm, start, nr_pages,
                                       pages, NULL, locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_locked);













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 














    1 





    1 
















































































































































































































































    1 





    1 























    1 










    1 





    1 























    1 


























































































    1 










    1 



    1 

































































    1 


























































    1 







    1 







    1 




























































    1 


    1 










    1 


















    1 



    1 













    1 
    1 

    1 











    1 



    1 
    1 
    1 

    1 


































































































































    1 
































































































































































































    1 
    1 
    1 
    1 






    1 


    1 

    1 


    1 



    1 







    1 




    1 
    1 









    1 
    1 


    1 



    1 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 

    1 



    1 
    1 

    1 








    1 





    1 

















    1 




    1 




    1 







    1 





    1 








    1 

    1 



    1 
    1 




    1 


















    1 


    1 
    1 









    1 




    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






























































































































































































































































    1 






    1 











































































































    1 


    1 
















































































































































































































    1 






    1 
    1 



    1 
    1 

































    1 


    1 
    1 





















    1 




































    1 




    1 


    1 


    1 













    1 
    1 




    1 







    1 





































    1 




    1 










    1 




















    1 






























    1 



























    1 












    1 














    1 


    1 


    1 















    1 




































































































































































































































































































































































































































    1 




    1 








    1 
    1 














    1 
    1 






    1 


    1 





















































































































































































































































































































































































































































































































































































































    1 











    1 






















    1 













    1 







    1 





















    1 












    1 




















































    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 









    1 



    1 
    1 









    1 

    1 












































































































































    1 


    1 






    1 









    1 




    1 
    1 


















































































    1 








    1 










    1 



    1 




    1 








    1 














    1 



    1 
























































































































































































    1 



























    1 








    1 










    1 










    1 

    1 













    1 

















    1 























    1 





    1 




































































    1 



    1 








    1 






























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *        git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store to or load from
 * data shared between the kernel and application. This is done both for
 * ordering purposes and to ensure that once a value is loaded from data
 * that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/tracehook.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "../fs/internal.h"
#include "io-wq.h"

/*
 * Ring size limits: IORING_MAX_CQ_ENTRIES is defined as twice
 * IORING_MAX_ENTRIES.
 * NOTE(review): IORING_MAX_ENTRIES is presumably the SQ-side limit, and
 * IORING_SQPOLL_CAP_ENTRIES_VALUE presumably a per-iteration submission cap
 * for the SQ poll thread -- confirm against their users.
 */
#define IORING_MAX_ENTRIES        32768
#define IORING_MAX_CQ_ENTRIES        (2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8

/*
 * Upper bounds only; no minimum is defined here.
 * IORING_MAX_RESTRICTIONS is the sum of the three *_LAST enumerators.
 */
#define IORING_MAX_FIXED_FILES        (1U << 15)
#define IORING_MAX_RESTRICTIONS        (IORING_RESTRICTION_LAST + \
                                 IORING_REGISTER_LAST + IORING_OP_LAST)

/*
 * Resource tag table geometry: a table has 2^(PAGE_SHIFT - 3) slots, and
 * IO_RSRC_TAG_TABLE_MASK selects the slot index within a table.
 * NOTE(review): with 8-byte (u64) tags this makes one table exactly one
 * page -- presumably the reason for the "- 3"; confirm against the
 * allocation code.
 */
#define IO_RSRC_TAG_TABLE_SHIFT        (PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX        (1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK        (IO_RSRC_TAG_TABLE_MAX - 1)

/* Upper bound on registered buffers (2^14). */
#define IORING_MAX_REG_BUFFERS        (1U << 14)

/*
 * SQE_VALID_FLAGS: OR of the IOSQE_* flag bits listed below.
 * IO_REQ_CLEAN_FLAGS: OR of the REQ_F_* bits listed below.
 * NOTE(review): presumably "flags accepted in an SQE" and "request flags
 * that require cleanup work" respectively -- confirm against their users.
 */
#define SQE_VALID_FLAGS        (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|        \
                                IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
                                IOSQE_BUFFER_SELECT)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
                                REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)

/*
 * 1024.
 * NOTE(review): presumably the batch size for cached per-task context
 * references -- confirm against its users.
 */
#define IO_TCTX_REFS_CACHE_NR        (1U << 10)

/*
 * Head/tail index pair for a single ring.
 *
 * Both members carry ____cacheline_aligned_in_smp, so on SMP builds head
 * and tail each start on their own cache line.
 * NOTE(review): presumably to keep the producer-owned and consumer-owned
 * index from sharing a cache line -- confirm intent.
 */
struct io_uring {
        u32 head ____cacheline_aligned_in_smp;
        u32 tail ____cacheline_aligned_in_smp;
};

/*
 * Shared ring state: SQ/CQ head and tail indices, ring masks and sizes,
 * status/flag words, and the array of completion queue entries.
 *
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
        /*
         * Head and tail offsets into the ring; the offsets need to be
         * masked to get valid indices.
         *
         * The kernel controls head of the sq ring and the tail of the cq ring,
         * and the application controls tail of the sq ring and the head of the
         * cq ring.
         */
        struct io_uring                sq, cq;
        /*
         * Bitmasks to apply to head and tail offsets (constant, equals
         * ring_entries - 1)
         */
        u32                        sq_ring_mask, cq_ring_mask;
        /* Ring sizes (constant, power of 2) */
        u32                        sq_ring_entries, cq_ring_entries;
        /*
         * Number of invalid entries dropped by the kernel due to
         * invalid index stored in array
         *
         * Written by the kernel, shouldn't be modified by the
         * application (i.e. get number of "new events" by comparing to
         * cached value).
         *
         * After a new SQ head value was read by the application this
         * counter includes all submissions that were dropped reaching
         * the new SQ head (and possibly more).
         */
        u32                        sq_dropped;
        /*
         * Runtime SQ flags
         *
         * Written by the kernel, shouldn't be modified by the
         * application.
         *
         * The application needs a full memory barrier before checking
         * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
         */
        u32                        sq_flags;
        /*
         * Runtime CQ flags
         *
         * Written by the application, shouldn't be modified by the
         * kernel.
         */
        u32                        cq_flags;
        /*
         * Number of completion events lost because the queue was full;
         * this should be avoided by the application by making sure
         * there are not more requests pending than there is space in
         * the completion queue.
         *
         * Written by the kernel, shouldn't be modified by the
         * application (i.e. get number of "new events" by comparing to
         * cached value).
         *
         * As completion events come in out of order this counter is not
         * ordered with any other data.
         */
        u32                        cq_overflow;
        /*
         * Ring buffer of completion events.
         *
         * The kernel writes completion events fresh every time they are
         * produced, so the application is allowed to modify pending
         * entries.
         *
         * Flexible array member: it must remain the last member of the
         * struct, and it starts on a cache line boundary on SMP builds.
         */
        struct io_uring_cqe        cqes[] ____cacheline_aligned_in_smp;
};

/*
 * Issue-time flag bits, one bit per flag.
 * NOTE(review): presumably OR-ed into the 'issue_flags' arguments taken by
 * helpers in this file -- confirm against the callers.
 */
enum io_uring_cmd_flags {
        IO_URING_F_NONBLOCK                = 1 << 0,
        IO_URING_F_COMPLETE_DEFER        = 1 << 1,
};

/*
 * Descriptor for one mapped user buffer: a start/end address pair
 * (ubuf, ubuf_end) followed by a trailing bio_vec array.
 * NOTE(review): nr_bvecs is presumably the number of valid bvec[] entries and
 * acct_pages the number of pages accounted for this buffer -- confirm against
 * the buffer registration code.
 */
struct io_mapped_ubuf {
        u64                ubuf;
        u64                ubuf_end;
        unsigned int        nr_bvecs;
        unsigned long        acct_pages;
        /* flexible array member, must remain the last field */
        struct bio_vec        bvec[];
};

/* Forward declaration: types below refer to io_ring_ctx before it is defined. */
struct io_ring_ctx;

/*
 * A completion queue entry bundled with a list link.
 * NOTE(review): presumably queued on io_ring_ctx::cq_overflow_list -- confirm.
 */
struct io_overflow_cqe {
        struct io_uring_cqe cqe;
        struct list_head list;
};

/* One slot of a fixed file table (element type of io_file_table::files). */
struct io_fixed_file {
        /* file * with additional FFS_* flags */
        unsigned long file_ptr;
};

/*
 * A list-linkable record of one resource together with its tag. This is the
 * type handed to rsrc_put_fn callbacks.
 */
struct io_rsrc_put {
        struct list_head list;
        u64 tag;
        /* the same pointer, viewed generically or as a concrete resource type */
        union {
                void *rsrc;
                struct file *file;
                struct io_mapped_ubuf *buf;
        };
};

/*
 * Fixed file table: a pointer to io_fixed_file slots.
 * NOTE(review): the number of slots is not stored here; presumably tracked by
 * io_ring_ctx::nr_user_files -- confirm.
 */
struct io_file_table {
        struct io_fixed_file *files;
};

/*
 * A percpu-refcounted node carrying a list of resources (rsrc_list) and a
 * back pointer to the io_rsrc_data it belongs to.
 * NOTE(review): 'node' is presumably the link on io_ring_ctx::rsrc_ref_list
 * and 'llist' the link on io_ring_ctx::rsrc_put_llist -- confirm.
 */
struct io_rsrc_node {
        struct percpu_ref                refs;
        struct list_head                node;
        struct list_head                rsrc_list;
        struct io_rsrc_data                *rsrc_data;
        struct llist_node                llist;
        bool                                done;
};

/*
 * Function type (not pointer type) taking a ring context and one io_rsrc_put
 * entry; io_rsrc_data::do_put stores a pointer to such a function.
 */
typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

/*
 * Bookkeeping for one class of registered resources of a ring. io_ring_ctx
 * keeps two instances: file_data and buf_data.
 * NOTE(review): 'tags' looks like a two-level table of per-resource tags and
 * 'nr' its element count -- confirm against the allocation code.
 */
struct io_rsrc_data {
        struct io_ring_ctx                *ctx;

        u64                                **tags;
        unsigned int                        nr;
        /* callback invoked on an io_rsrc_put entry, see rsrc_put_fn */
        rsrc_put_fn                        *do_put;
        atomic_t                        refs;
        struct completion                done;
        bool                                quiesce;
};

/*
 * One buffer record: address, length and a 16-bit buffer id, linkable on a
 * list. Referenced from io_sr_msg::kbuf and io_kiocb::kbuf.
 */
struct io_buffer {
        struct list_head list;
        __u64 addr;
        __u32 len;
        __u16 bid;
};

/*
 * Restriction set embedded in io_ring_ctx::restrictions: one bitmap sized by
 * IORING_REGISTER_LAST, one sized by IORING_OP_LAST, and two SQE flag bytes
 * (allowed / required). 'registered' records whether a set was registered.
 */
struct io_restriction {
        DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
        DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
        u8 sqe_flags_allowed;
        u8 sqe_flags_required;
        bool registered;
};

/*
 * SQ thread control values.
 * NOTE(review): presumably bit numbers within io_sq_data::state -- confirm.
 */
enum {
        IO_SQ_THREAD_SHOULD_STOP = 0,
        IO_SQ_THREAD_SHOULD_PARK = 1,
};

/*
 * Refcounted state for an SQ thread; the ring contexts using it are linked
 * on ctx_list. io_ring_ctx::sq_data points here "if using sq thread polling".
 * NOTE(review): 'state' presumably holds the IO_SQ_THREAD_SHOULD_* bits and
 * 'exited' is presumably completed when the thread terminates -- confirm.
 */
struct io_sq_data {
        refcount_t                refs;
        atomic_t                park_pending;
        struct mutex                lock;

        /* ctx's that are using this sqd */
        struct list_head        ctx_list;

        struct task_struct        *thread;
        struct wait_queue_head        wait;

        unsigned                sq_thread_idle;
        int                        sq_cpu;
        pid_t                        task_pid;
        pid_t                        task_tgid;

        unsigned long                state;
        struct completion        exited;
};

/* capacity of io_submit_state::compl_reqs[] */
#define IO_COMPL_BATCH                        32
/* capacity of io_submit_state::reqs[] */
#define IO_REQ_CACHE_SIZE                32
/* NOTE(review): presumably the number of requests allocated per refill -- confirm */
#define IO_REQ_ALLOC_BATCH                8

/*
 * Head and last request of a chain of requests; embedded in io_submit_state
 * as 'link'.
 */
struct io_submit_link {
        struct io_kiocb                *head;
        struct io_kiocb                *last;
};

/*
 * Submission-side scratch state, embedded in io_ring_ctx as 'submit_state':
 * block plug, current link chain, a request allocation cache and a
 * completion batch.
 */
struct io_submit_state {
        struct blk_plug                plug;
        struct io_submit_link        link;

        /*
         * io_kiocb alloc cache
         */
        void                        *reqs[IO_REQ_CACHE_SIZE];
        /* NOTE(review): presumably the number of valid entries in reqs[] -- confirm */
        unsigned int                free_reqs;

        bool                        plug_started;

        /*
         * Batch completion logic
         */
        struct io_kiocb                *compl_reqs[IO_COMPL_BATCH];
        /* NOTE(review): presumably the number of valid entries in compl_reqs[] -- confirm */
        unsigned int                compl_nr;
        /* inline/task_work completion list, under ->uring_lock */
        struct list_head        free_list;

        unsigned int                ios_left;
};

/*
 * Per-ring context. Fields are grouped into anonymous structs, several of
 * them cacheline aligned on SMP, so that data used together stays together;
 * keep the grouping and ordering intact when adding members.
 */
struct io_ring_ctx {
        /* const or read-mostly hot data */
        struct {
                struct percpu_ref        refs;

                struct io_rings                *rings;
                unsigned int                flags;
                unsigned int                compat: 1;
                unsigned int                drain_next: 1;
                unsigned int                eventfd_async: 1;
                unsigned int                restricted: 1;
                unsigned int                off_timeout_used: 1;
                unsigned int                drain_active: 1;
        } ____cacheline_aligned_in_smp;

        /* submission data */
        struct {
                struct mutex                uring_lock;

                /*
                 * Ring buffer of indices into array of io_uring_sqe, which is
                 * mmapped by the application using the IORING_OFF_SQES offset.
                 *
                 * This indirection could e.g. be used to assign fixed
                 * io_uring_sqe entries to operations and only submit them to
                 * the queue when needed.
                 *
                 * The kernel modifies neither the indices array nor the entries
                 * array.
                 */
                u32                        *sq_array;
                struct io_uring_sqe        *sq_sqes;
                unsigned                cached_sq_head;
                unsigned                sq_entries;
                struct list_head        defer_list;

                /*
                 * Fixed resources fast path, should be accessed only under
                 * uring_lock, and updated through io_uring_register(2)
                 */
                struct io_rsrc_node        *rsrc_node;
                struct io_file_table        file_table;
                unsigned                nr_user_files;
                unsigned                nr_user_bufs;
                struct io_mapped_ubuf        **user_bufs;

                struct io_submit_state        submit_state;
                struct list_head        timeout_list;
                struct list_head        ltimeout_list;
                struct list_head        cq_overflow_list;
                struct xarray                io_buffers;
                struct xarray                personalities;
                u32                        pers_next;
                unsigned                sq_thread_idle;
        } ____cacheline_aligned_in_smp;

        /* IRQ completion list, under ->completion_lock */
        struct list_head        locked_free_list;
        unsigned int                locked_free_nr;

        const struct cred        *sq_creds;        /* cred used for __io_sq_thread() */
        struct io_sq_data        *sq_data;        /* if using sq thread polling */

        struct wait_queue_head        sqo_sq_wait;
        struct list_head        sqd_list;

        unsigned long                check_cq_overflow;

        /* completion queue state */
        struct {
                unsigned                cached_cq_tail;
                unsigned                cq_entries;
                struct eventfd_ctx        *cq_ev_fd;
                struct wait_queue_head        poll_wait;
                struct wait_queue_head        cq_wait;
                unsigned                cq_extra;
                atomic_t                cq_timeouts;
                unsigned                cq_last_tm_flush;
        } ____cacheline_aligned_in_smp;

        struct {
                spinlock_t                completion_lock;

                spinlock_t                timeout_lock;

                /*
                 * ->iopoll_list is protected by the ctx->uring_lock for
                 * io_uring instances that don't use IORING_SETUP_SQPOLL.
                 * For SQPOLL, only the single threaded io_sq_thread() will
                 * manipulate the list, hence no extra locking is needed there.
                 */
                struct list_head        iopoll_list;
                struct hlist_head        *cancel_hash;
                unsigned                cancel_hash_bits;
                bool                        poll_multi_queue;
        } ____cacheline_aligned_in_smp;

        struct io_restriction                restrictions;

        /* slow path rsrc auxiliary data, used by update/register */
        struct {
                struct io_rsrc_node                *rsrc_backup_node;
                struct io_mapped_ubuf                *dummy_ubuf;
                struct io_rsrc_data                *file_data;
                struct io_rsrc_data                *buf_data;

                struct delayed_work                rsrc_put_work;
                struct llist_head                rsrc_put_llist;
                struct list_head                rsrc_ref_list;
                spinlock_t                        rsrc_ref_lock;
        };

        /* Keep this last, we don't need it for the fast path */
        struct {
                /* hashed buffered write serialization */
                struct io_wq_hash                *hash_map;

                /* Only used for accounting purposes */
                struct user_struct                *user;
                struct mm_struct                *mm_account;

                /* ctx exit and cancelation */
                struct llist_head                fallback_llist;
                struct delayed_work                fallback_work;
                struct work_struct                exit_work;
                struct list_head                tctx_list;
                struct completion                ref_comp;
                u32                                iowq_limits[2];
                bool                                iowq_limits_set;
        };
};

/*
 * io_uring state kept per task (NOTE(review): inferred from the name and the
 * task_work members -- confirm where it is attached to the task).
 * The second group (task_lock .. task_running) is the task_work queue state.
 */
struct io_uring_task {
        /* submission side */
        int                        cached_refs;
        struct xarray                xa;
        struct wait_queue_head        wait;
        const struct io_ring_ctx *last;
        struct io_wq                *io_wq;
        struct percpu_counter        inflight;
        atomic_t                inflight_tracked;
        atomic_t                in_idle;

        spinlock_t                task_lock;
        struct io_wq_work_list        task_list;
        struct callback_head        task_work;
        bool                        task_running;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */

/*
 * Poll state: the file, the wait queue head being waited on, the requested
 * event mask and the wait queue entry itself. Used as the 'poll' member of
 * the io_kiocb union and embedded in struct async_poll.
 */
struct io_poll_iocb {
        struct file                        *file;
        struct wait_queue_head                *head;
        __poll_t                        events;
        int                                retries;
        struct wait_queue_entry                wait;
};

/*
 * Payload for the 'poll_update' member of the io_kiocb union: the user_data
 * to match and its replacement, a new event mask, and two booleans selecting
 * which of the two to update. 'file' must stay first.
 */
struct io_poll_update {
        struct file                        *file;
        u64                                old_user_data;
        u64                                new_user_data;
        __poll_t                        events;
        bool                                update_events;
        bool                                update_user_data;
};

/*
 * Payload for the 'close' member of the io_kiocb union: a descriptor number
 * and a fixed-file slot value. 'file' must stay first.
 */
struct io_close {
        struct file                        *file;
        int                                fd;
        u32                                file_slot;
};

/*
 * Timer state for a timeout request: back pointer to the request, the
 * hrtimer, its timespec and mode. This is the async data type (see
 * io_op_def::async_size) of IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT.
 */
struct io_timeout_data {
        struct io_kiocb                        *req;
        struct hrtimer                        timer;
        struct timespec64                ts;
        enum hrtimer_mode                mode;
        u32                                flags;
};

/*
 * Payload for the 'accept' member of the io_kiocb union: user pointers for
 * the peer address and its length, flags, a fixed-file slot value and a
 * 'nofile' limit. 'file' must stay first.
 */
struct io_accept {
        struct file                        *file;
        struct sockaddr __user                *addr;
        int __user                        *addr_len;
        int                                flags;
        u32                                file_slot;
        unsigned long                        nofile;
};

/*
 * Payload for the 'sync' member of the io_kiocb union: a length/offset range
 * plus flags and mode. 'file' must stay first.
 */
struct io_sync {
        struct file                        *file;
        loff_t                                len;
        loff_t                                off;
        int                                flags;
        int                                mode;
};

/*
 * Payload for the 'cancel' member of the io_kiocb union.
 * NOTE(review): 'addr' presumably carries the user_data of the request to
 * cancel -- confirm in the prep handler. 'file' must stay first.
 */
struct io_cancel {
        struct file                        *file;
        u64                                addr;
};

/*
 * Payload for the 'timeout' member of the io_kiocb union; 'list' links the
 * request on a timeout list. 'file' must stay first.
 */
struct io_timeout {
        struct file                        *file;
        u32                                off;
        u32                                target_seq;
        struct list_head                list;
        /* head of the link, used by linked timeouts only */
        struct io_kiocb                        *head;
        /* for linked completions */
        struct io_kiocb                        *prev;
};

/*
 * Payload for the 'timeout_rem' member of the io_kiocb union; the trailing
 * fields are only meaningful for timeout updates. 'file' must stay first.
 */
struct io_timeout_rem {
        struct file                        *file;
        u64                                addr;

        /* timeout update */
        struct timespec64                ts;
        u32                                flags;
        bool                                ltimeout;
};

/*
 * Payload for the 'rw' member of the io_kiocb union: the embedded kiocb plus
 * a 64-bit address and length.
 */
struct io_rw {
        /* NOTE: kiocb has the file as the first member, so don't do it here */
        struct kiocb                        kiocb;
        u64                                addr;
        u64                                len;
};

/*
 * Payload for the 'connect' member of the io_kiocb union: user pointer to
 * the address and its length. 'file' must stay first.
 */
struct io_connect {
        struct file                        *file;
        struct sockaddr __user                *addr;
        int                                addr_len;
};

/*
 * Payload for the 'sr_msg' member of the io_kiocb union (send/receive).
 * 'file' must stay first.
 */
struct io_sr_msg {
        struct file                        *file;
        /* user pointer: a (compat) msghdr or a plain buffer */
        union {
                struct compat_msghdr __user        *umsg_compat;
                struct user_msghdr __user        *umsg;
                void __user                        *buf;
        };
        int                                msg_flags;
        int                                bgid;
        size_t                                len;
        /* NOTE(review): presumably bytes already transferred on a partial send/recv -- confirm */
        size_t                                done_io;
        struct io_buffer                *kbuf;
        void __user                        *msg_control;
};

/*
 * Payload for the 'open' member of the io_kiocb union: directory fd,
 * filename, open_how parameters, a fixed-file slot value and a 'nofile'
 * limit. 'file' must stay first.
 */
struct io_open {
        struct file                        *file;
        int                                dfd;
        u32                                file_slot;
        struct filename                        *filename;
        struct open_how                        how;
        unsigned long                        nofile;
};

/*
 * Payload for the 'rsrc_update' member of the io_kiocb union: a 64-bit
 * argument, its element count and an offset. 'file' must stay first.
 */
struct io_rsrc_update {
        struct file                        *file;
        u64                                arg;
        u32                                nr_args;
        u32                                offset;
};

/*
 * Payload for the 'fadvise' member of the io_kiocb union: offset, length and
 * advice value. 'file' must stay first.
 */
struct io_fadvise {
        struct file                        *file;
        u64                                offset;
        u32                                len;
        u32                                advice;
};

/*
 * Payload for the 'madvise' member of the io_kiocb union: address, length
 * and advice value. 'file' must stay first.
 */
struct io_madvise {
        struct file                        *file;
        u64                                addr;
        u32                                len;
        u32                                advice;
};

/*
 * Payload for the 'epoll' member of the io_kiocb union: epoll fd, operation,
 * target fd and the epoll_event. 'file' must stay first.
 */
struct io_epoll {
        struct file                        *file;
        int                                epfd;
        int                                op;
        int                                fd;
        struct epoll_event                event;
};

/*
 * Payload for the 'splice' member of the io_kiocb union. The leading file
 * pointer is named 'file_out' here; the input side is kept as a descriptor
 * number (splice_fd_in) rather than a file pointer.
 */
struct io_splice {
        struct file                        *file_out;
        loff_t                                off_out;
        loff_t                                off_in;
        u64                                len;
        int                                splice_fd_in;
        unsigned int                        flags;
};

/*
 * Payload for the 'pbuf' member of the io_kiocb union: address, length,
 * buffer group id, number of buffers and starting buffer id. 'file' must
 * stay first.
 */
struct io_provide_buf {
        struct file                        *file;
        __u64                                addr;
        __u32                                len;
        __u32                                bgid;
        __u16                                nbufs;
        __u16                                bid;
};

/*
 * Payload for the 'statx' member of the io_kiocb union: directory fd, mask,
 * flags, and user pointers to the path and the result buffer. 'file' must
 * stay first.
 */
struct io_statx {
        struct file                        *file;
        int                                dfd;
        unsigned int                        mask;
        unsigned int                        flags;
        const char __user                *filename;
        struct statx __user                *buffer;
};

/*
 * Payload for the 'shutdown' member of the io_kiocb union. 'file' must stay
 * first.
 */
struct io_shutdown {
        struct file                        *file;
        int                                how;
};

/*
 * Payload for the 'rename' member of the io_kiocb union: old/new directory
 * fds, old/new paths and flags. 'file' must stay first.
 */
struct io_rename {
        struct file                        *file;
        int                                old_dfd;
        int                                new_dfd;
        struct filename                        *oldpath;
        struct filename                        *newpath;
        int                                flags;
};

/*
 * Payload for the 'unlink' member of the io_kiocb union: directory fd, flags
 * and filename. 'file' must stay first.
 */
struct io_unlink {
        struct file                        *file;
        int                                dfd;
        int                                flags;
        struct filename                        *filename;
};

/*
 * Payload for the 'mkdir' member of the io_kiocb union: directory fd, mode
 * and filename. 'file' must stay first.
 */
struct io_mkdir {
        struct file                        *file;
        int                                dfd;
        umode_t                                mode;
        struct filename                        *filename;
};

/*
 * Payload for the 'symlink' member of the io_kiocb union: new directory fd
 * and old/new paths. 'file' must stay first.
 */
struct io_symlink {
        struct file                        *file;
        int                                new_dfd;
        struct filename                        *oldpath;
        struct filename                        *newpath;
};

/*
 * Payload for the 'hardlink' member of the io_kiocb union: old/new directory
 * fds, old/new paths and flags. 'file' must stay first.
 */
struct io_hardlink {
        struct file                        *file;
        int                                old_dfd;
        int                                new_dfd;
        struct filename                        *oldpath;
        struct filename                        *newpath;
        int                                flags;
};

/*
 * Payload for the 'compl' member of the io_kiocb union. It overlays the
 * per-opcode data, so io_kiocb notes it may be used only after that data has
 * been cleaned. 'file' must stay first.
 */
struct io_completion {
        struct file                        *file;
        u32                                cflags;
};

/* Async data type of IORING_OP_CONNECT (see io_op_def::async_size). */
struct io_async_connect {
        struct sockaddr_storage                address;
};

/*
 * Async data type of IORING_OP_SENDMSG and IORING_OP_RECVMSG (see
 * io_op_def::async_size): an inline iovec array, an optional allocated one,
 * the msghdr and address storage.
 */
struct io_async_msghdr {
        struct iovec                        fast_iov[UIO_FASTIOV];
        /* points to an allocated iov, if NULL we use fast_iov instead */
        struct iovec                        *free_iov;
        struct sockaddr __user                *uaddr;
        struct msghdr                        msg;
        struct sockaddr_storage                addr;
};

/*
 * Async data type of the read/write opcodes (READV, WRITEV, READ_FIXED,
 * WRITE_FIXED, READ, WRITE; see io_op_def::async_size): iovec storage, the
 * iterator and a saved iterator state, and a byte counter.
 * NOTE(review): free_iovec presumably mirrors io_async_msghdr::free_iov
 * (allocated iovec, NULL when fast_iov is used) -- confirm.
 */
struct io_async_rw {
        struct iovec                        fast_iov[UIO_FASTIOV];
        const struct iovec                *free_iovec;
        struct iov_iter                        iter;
        struct iov_iter_state                iter_state;
        size_t                                bytes_done;
        struct wait_page_queue                wpq;
};

/*
 * Bit numbers for request flags. The first group reuses the IOSQE_*_BIT
 * numbers of the user-visible SQE flags; internal flags start at bit 8.
 * The matching REQ_F_* masks are built from these with BIT().
 */
enum {
        REQ_F_FIXED_FILE_BIT        = IOSQE_FIXED_FILE_BIT,
        REQ_F_IO_DRAIN_BIT        = IOSQE_IO_DRAIN_BIT,
        REQ_F_LINK_BIT                = IOSQE_IO_LINK_BIT,
        REQ_F_HARDLINK_BIT        = IOSQE_IO_HARDLINK_BIT,
        REQ_F_FORCE_ASYNC_BIT        = IOSQE_ASYNC_BIT,
        REQ_F_BUFFER_SELECT_BIT        = IOSQE_BUFFER_SELECT_BIT,

        /* first byte is taken by user flags, shift it to not overlap */
        REQ_F_FAIL_BIT                = 8,
        REQ_F_INFLIGHT_BIT,
        REQ_F_CUR_POS_BIT,
        REQ_F_NOWAIT_BIT,
        REQ_F_LINK_TIMEOUT_BIT,
        REQ_F_NEED_CLEANUP_BIT,
        REQ_F_POLLED_BIT,
        REQ_F_BUFFER_SELECTED_BIT,
        REQ_F_COMPLETE_INLINE_BIT,
        REQ_F_REISSUE_BIT,
        REQ_F_CREDS_BIT,
        REQ_F_REFCOUNT_BIT,
        REQ_F_ARM_LTIMEOUT_BIT,
        REQ_F_PARTIAL_IO_BIT,
        /* keep async read/write and isreg together and in order */
        REQ_F_NOWAIT_READ_BIT,
        REQ_F_NOWAIT_WRITE_BIT,
        REQ_F_ISREG_BIT,

        /* not a real bit, just to check we're not overflowing the space */
        __REQ_F_LAST_BIT,
};

/*
 * Request flag masks, one per REQ_F_*_BIT bit number.
 * NOTE(review): presumably stored in io_kiocb::flags -- confirm.
 */
enum {
        /* ctx owns file */
        REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
        /* drain existing IO first */
        REQ_F_IO_DRAIN                = BIT(REQ_F_IO_DRAIN_BIT),
        /* linked sqes */
        REQ_F_LINK                = BIT(REQ_F_LINK_BIT),
        /* doesn't sever on completion < 0 */
        REQ_F_HARDLINK                = BIT(REQ_F_HARDLINK_BIT),
        /* IOSQE_ASYNC */
        REQ_F_FORCE_ASYNC        = BIT(REQ_F_FORCE_ASYNC_BIT),
        /* IOSQE_BUFFER_SELECT */
        REQ_F_BUFFER_SELECT        = BIT(REQ_F_BUFFER_SELECT_BIT),

        /* fail rest of links */
        REQ_F_FAIL                = BIT(REQ_F_FAIL_BIT),
        /* on inflight list, should be cancelled and waited on exit reliably */
        REQ_F_INFLIGHT                = BIT(REQ_F_INFLIGHT_BIT),
        /* read/write uses file position */
        REQ_F_CUR_POS                = BIT(REQ_F_CUR_POS_BIT),
        /* must not punt to workers */
        REQ_F_NOWAIT                = BIT(REQ_F_NOWAIT_BIT),
        /* has or had linked timeout */
        REQ_F_LINK_TIMEOUT        = BIT(REQ_F_LINK_TIMEOUT_BIT),
        /* needs cleanup */
        REQ_F_NEED_CLEANUP        = BIT(REQ_F_NEED_CLEANUP_BIT),
        /* already went through poll handler */
        REQ_F_POLLED                = BIT(REQ_F_POLLED_BIT),
        /* buffer already selected */
        REQ_F_BUFFER_SELECTED        = BIT(REQ_F_BUFFER_SELECTED_BIT),
        /* completion is deferred through io_comp_state */
        REQ_F_COMPLETE_INLINE        = BIT(REQ_F_COMPLETE_INLINE_BIT),
        /* caller should reissue async */
        REQ_F_REISSUE                = BIT(REQ_F_REISSUE_BIT),
        /* supports async reads */
        REQ_F_NOWAIT_READ        = BIT(REQ_F_NOWAIT_READ_BIT),
        /* supports async writes */
        REQ_F_NOWAIT_WRITE        = BIT(REQ_F_NOWAIT_WRITE_BIT),
        /* regular file */
        REQ_F_ISREG                = BIT(REQ_F_ISREG_BIT),
        /* has creds assigned */
        REQ_F_CREDS                = BIT(REQ_F_CREDS_BIT),
        /* skip refcounting if not set */
        REQ_F_REFCOUNT                = BIT(REQ_F_REFCOUNT_BIT),
        /* there is a linked timeout that has to be armed */
        REQ_F_ARM_LTIMEOUT        = BIT(REQ_F_ARM_LTIMEOUT_BIT),
        /* request has already done partial IO */
        REQ_F_PARTIAL_IO        = BIT(REQ_F_PARTIAL_IO_BIT),
};

/*
 * Poll state attached to a request through io_kiocb::apoll: one embedded
 * poll entry plus a pointer to an optional second one.
 */
struct async_poll {
        struct io_poll_iocb        poll;
        struct io_poll_iocb        *double_poll;
};

/*
 * Task-work callback for a request, stored in io_task_work::func.
 * NOTE(review): '*locked' presumably reports/updates whether ctx->uring_lock
 * is held, as io_tw_lock() does -- confirm.
 */
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);

/*
 * Task-work linkage embedded in every request (io_kiocb::io_task_work): a
 * queue link and the callback to run.
 */
struct io_task_work {
        /* only one of the two links is in use at a time; they share storage */
        union {
                struct io_wq_work_node        node;
                struct llist_node        fallback_node;
        };
        io_req_tw_func_t                func;
};

/*
 * Resource type selectors.
 * NOTE(review): presumably the values accepted by the 'type' argument of
 * __io_register_rsrc_update() -- confirm.
 */
enum {
        IORING_RSRC_FILE                = 0,
        IORING_RSRC_BUFFER                = 1,
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
        /* per-opcode payload; every member starts with the file pointer */
        union {
                struct file                *file;
                struct io_rw                rw;
                struct io_poll_iocb        poll;
                struct io_poll_update        poll_update;
                struct io_accept        accept;
                struct io_sync                sync;
                struct io_cancel        cancel;
                struct io_timeout        timeout;
                struct io_timeout_rem        timeout_rem;
                struct io_connect        connect;
                struct io_sr_msg        sr_msg;
                struct io_open                open;
                struct io_close                close;
                struct io_rsrc_update        rsrc_update;
                struct io_fadvise        fadvise;
                struct io_madvise        madvise;
                struct io_epoll                epoll;
                struct io_splice        splice;
                struct io_provide_buf        pbuf;
                struct io_statx                statx;
                struct io_shutdown        shutdown;
                struct io_rename        rename;
                struct io_unlink        unlink;
                struct io_mkdir                mkdir;
                struct io_symlink        symlink;
                struct io_hardlink        hardlink;
                /* use only after cleaning per-op data, see io_clean_op() */
                struct io_completion        compl;
        };

        /* opcode allocated if it needs to store data for async defer */
        void                                *async_data;
        u8                                opcode;
        /* polled IO has completed */
        u8                                iopoll_completed;

        u16                                buf_index;
        u32                                result;

        struct io_ring_ctx                *ctx;
        unsigned int                        flags;
        atomic_t                        refs;
        struct task_struct                *task;
        u64                                user_data;

        struct io_kiocb                        *link;
        struct percpu_ref                *fixed_rsrc_refs;

        /* used with ctx->iopoll_list with reads/writes */
        struct list_head                inflight_entry;
        struct io_task_work                io_task_work;
        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
        struct hlist_node                hash_node;
        struct async_poll                *apoll;
        struct io_wq_work                work;
        const struct cred                *creds;

        /* store used ubuf, so we can prevent reloading */
        struct io_mapped_ubuf                *imu;
        /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
        struct io_buffer                *kbuf;
        atomic_t                        poll_refs;
};

/*
 * Associates a task with a ring context.
 * NOTE(review): 'ctx_node' is presumably the link on io_ring_ctx::tctx_list
 * -- confirm.
 */
struct io_tctx_node {
        struct list_head        ctx_node;
        struct task_struct        *task;
        struct io_ring_ctx        *ctx;
};

/*
 * A request paired with a 32-bit sequence number, linkable on a list.
 * NOTE(review): presumably queued on io_ring_ctx::defer_list -- confirm.
 */
struct io_defer_entry {
        struct list_head        list;
        struct io_kiocb                *req;
        u32                        seq;
};

/*
 * Static properties of one opcode; io_op_defs[] holds one of these per
 * IORING_OP_* value. All properties are single-bit flags except async_size.
 */
struct io_op_def {
        /* needs req->file assigned */
        unsigned                needs_file : 1;
        /* hash wq insertion if file is a regular file */
        unsigned                hash_reg_file : 1;
        /* unbound wq insertion if file is a non-regular file */
        unsigned                unbound_nonreg_file : 1;
        /* opcode is not supported by this kernel */
        unsigned                not_supported : 1;
        /* set if opcode supports polled "wait" */
        unsigned                pollin : 1;
        unsigned                pollout : 1;
        /* op supports buffer selection */
        unsigned                buffer_select : 1;
        /* do prep async if is going to be punted */
        unsigned                needs_async_setup : 1;
        /* should block plug */
        unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short                async_size;
};

static const struct io_op_def io_op_defs[] = {
        [IORING_OP_NOP] = {},
        [IORING_OP_READV] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .buffer_select                = 1,
                .needs_async_setup        = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_WRITEV] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .needs_async_setup        = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_FSYNC] = {
                .needs_file                = 1,
        },
        [IORING_OP_READ_FIXED] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_WRITE_FIXED] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_POLL_ADD] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
        },
        [IORING_OP_POLL_REMOVE] = {},
        [IORING_OP_SYNC_FILE_RANGE] = {
                .needs_file                = 1,
        },
        [IORING_OP_SENDMSG] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .needs_async_setup        = 1,
                .async_size                = sizeof(struct io_async_msghdr),
        },
        [IORING_OP_RECVMSG] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .buffer_select                = 1,
                .needs_async_setup        = 1,
                .async_size                = sizeof(struct io_async_msghdr),
        },
        [IORING_OP_TIMEOUT] = {
                .async_size                = sizeof(struct io_timeout_data),
        },
        [IORING_OP_TIMEOUT_REMOVE] = {
                /* used by timeout updates' prep() */
        },
        [IORING_OP_ACCEPT] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
        },
        [IORING_OP_ASYNC_CANCEL] = {},
        [IORING_OP_LINK_TIMEOUT] = {
                .async_size                = sizeof(struct io_timeout_data),
        },
        [IORING_OP_CONNECT] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .needs_async_setup        = 1,
                .async_size                = sizeof(struct io_async_connect),
        },
        [IORING_OP_FALLOCATE] = {
                .needs_file                = 1,
        },
        [IORING_OP_OPENAT] = {},
        [IORING_OP_CLOSE] = {},
        [IORING_OP_FILES_UPDATE] = {},
        [IORING_OP_STATX] = {},
        [IORING_OP_READ] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .buffer_select                = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_WRITE] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
                .plug                        = 1,
                .async_size                = sizeof(struct io_async_rw),
        },
        [IORING_OP_FADVISE] = {
                .needs_file                = 1,
        },
        [IORING_OP_MADVISE] = {},
        [IORING_OP_SEND] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollout                = 1,
        },
        [IORING_OP_RECV] = {
                .needs_file                = 1,
                .unbound_nonreg_file        = 1,
                .pollin                        = 1,
                .buffer_select                = 1,
        },
        [IORING_OP_OPENAT2] = {
        },
        [IORING_OP_EPOLL_CTL] = {
                .unbound_nonreg_file        = 1,
        },
        [IORING_OP_SPLICE] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
        },
        [IORING_OP_PROVIDE_BUFFERS] = {},
        [IORING_OP_REMOVE_BUFFERS] = {},
        [IORING_OP_TEE] = {
                .needs_file                = 1,
                .hash_reg_file                = 1,
                .unbound_nonreg_file        = 1,
        },
        [IORING_OP_SHUTDOWN] = {
                .needs_file                = 1,
        },
        [IORING_OP_RENAMEAT] = {},
        [IORING_OP_UNLINKAT] = {},
};

/*
 * Requests with any of these flags set (a linked timeout that is armed or
 * still to be armed, or a failed request) should undergo io_disarm_next().
 */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)

/*
 * Forward declarations of static helpers that are referenced before the
 * point where they are defined.
 */
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

/* CQE posting */
static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);

/* request lifetime and teardown */
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
                                     struct io_uring_rsrc_update2 *up,
                                     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx,
                                struct io_kiocb *req, int fd, bool fixed,
                                unsigned int issue_flags);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
                                 unsigned int issue_flags, u32 slot_index);
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);

static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);

/* slab cache that io_alloc_req() allocates struct io_kiocb from */
static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;

/*
 * Ensure ->uring_lock is held: take the mutex unless *locked says it is
 * held already, and record the acquisition in *locked.
 */
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
        if (*locked)
                return;

        mutex_lock(&ctx->uring_lock);
        *locked = true;
}

/* Walk a request chain starting at @head, following ->link until NULL. */
#define io_for_each_link(pos, head) \
        for ((pos) = (head); (pos); (pos) = (pos)->link)

/*
 * Borrowed from the mm implementation of page reference checking, see
 * commit f958d7b528b1 for details.
 *
 * Adding 127 to the count as an unsigned value and comparing against 127
 * is true when the count is zero or within the last 127 values before the
 * unsigned range wraps.
 */
#define req_ref_zero_or_close_to_overflow(req)        \
        ((unsigned int) atomic_read(&(req)->refs) + 127u <= 127u)

/*
 * Take a reference unless the count already dropped to zero. Warns once
 * if the request never had refcounting enabled (REQ_F_REFCOUNT unset).
 * Returns the result of atomic_inc_not_zero().
 */
static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
{
        const bool refcounted = req->flags & REQ_F_REFCOUNT;

        WARN_ON_ONCE(!refcounted);
        return atomic_inc_not_zero(&req->refs);
}

/*
 * Drop one reference. Returns true when the caller dropped the last one;
 * a request without REQ_F_REFCOUNT is always reported as the final put.
 */
static inline bool req_ref_put_and_test(struct io_kiocb *req)
{
        if (unlikely(req->flags & REQ_F_REFCOUNT)) {
                WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
                return atomic_dec_and_test(&req->refs);
        }

        /* refcounting was never enabled for this request */
        return true;
}

static inline void req_ref_get(struct io_kiocb *req)
{
        WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
        atomic_inc(&req->refs);
}

/*
 * Enable refcounting on @req with an initial count of @nr. Does nothing
 * if REQ_F_REFCOUNT is already set, so an existing count is preserved.
 */
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
        if (req->flags & REQ_F_REFCOUNT)
                return;

        req->flags |= REQ_F_REFCOUNT;
        atomic_set(&req->refs, nr);
}

/* Enable refcounting on @req, starting out with a single reference. */
static inline void io_req_set_refcount(struct io_kiocb *req)
{
        const int initial_refs = 1;

        __io_req_set_refcount(req, initial_refs);
}

/*
 * Pin the ring's current rsrc node for @req: remember the node's percpu
 * ref in req->fixed_rsrc_refs and take a reference on it. A request that
 * already has fixed_rsrc_refs set is left alone.
 */
static inline void io_req_set_rsrc_node(struct io_kiocb *req)
{
        if (req->fixed_rsrc_refs)
                return;

        req->fixed_rsrc_refs = &req->ctx->rsrc_node->refs;
        percpu_ref_get(req->fixed_rsrc_refs);
}

/*
 * Resurrect @ref. If a temporary reference can still be taken, it is held
 * across the resurrect and dropped afterwards; otherwise the ref already
 * hit zero and we wait on @compl before resurrecting.
 */
static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
{
        if (percpu_ref_tryget(ref)) {
                percpu_ref_resurrect(ref);
                percpu_ref_put(ref);
                return;
        }

        /* already at zero, wait for ->release() */
        wait_for_completion(compl);
        percpu_ref_resurrect(ref);
}

/*
 * Decide whether the link chain starting at @head matches a cancellation
 * request: a non-NULL @task must own @head, and then either @cancel_all is
 * set or some request in the chain carries REQ_F_INFLIGHT.
 */
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
                          bool cancel_all)
        __must_hold(&req->ctx->timeout_lock)
{
        struct io_kiocb *req;
        bool inflight = false;

        if (task && head->task != task)
                return false;
        if (cancel_all)
                return true;

        io_for_each_link(req, head) {
                if (req->flags & REQ_F_INFLIGHT) {
                        inflight = true;
                        break;
                }
        }
        return inflight;
}

static bool io_match_linked(struct io_kiocb *head)
{
        struct io_kiocb *req;

        io_for_each_link(req, head) {
                if (req->flags & REQ_F_INFLIGHT)
                        return true;
        }
        return false;
}

/*
 * Same matching rules as io_match_task(), but safe against racing linked
 * timeouts: when @head has REQ_F_LINK_TIMEOUT set, the chain is walked
 * under ->timeout_lock. The caller must not hold timeout_lock.
 */
static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
                               bool cancel_all)
{
        struct io_ring_ctx *ctx;
        bool matched;

        if (task && head->task != task)
                return false;
        if (cancel_all)
                return true;

        /* no linked timeout armed, the chain can be walked locklessly */
        if (!(head->flags & REQ_F_LINK_TIMEOUT))
                return io_match_linked(head);

        /* protect against races with linked timeouts */
        ctx = head->ctx;
        spin_lock_irq(&ctx->timeout_lock);
        matched = io_match_linked(head);
        spin_unlock_irq(&ctx->timeout_lock);

        return matched;
}

/* Mark @req as failed by setting REQ_F_FAIL in its flags. */
static inline void req_set_fail(struct io_kiocb *req)
{
        req->flags = req->flags | REQ_F_FAIL;
}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
        req_set_fail(req);
        req->result = res;
}

/*
 * Release callback registered for ctx->refs by io_ring_ctx_alloc():
 * signals ctx->ref_comp.
 */
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
        complete(&container_of(ref, struct io_ring_ctx, refs)->ref_comp);
}

/* Returns true when the timeout request has a zero ->timeout.off. */
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
        return req->timeout.off == 0;
}

/*
 * Delayed-work handler for ctx->fallback_work: detaches everything queued
 * on ctx->fallback_llist and runs each request's task-work callback. A ctx
 * reference is held while the callbacks run. If a callback took
 * ->uring_lock (reported through @locked), pending completions are flushed
 * and the lock is released here.
 */
static void io_fallback_req_func(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
                                                fallback_work.work);
        struct io_kiocb *req, *next;
        struct llist_node *batch;
        bool locked = false;

        batch = llist_del_all(&ctx->fallback_llist);

        percpu_ref_get(&ctx->refs);
        llist_for_each_entry_safe(req, next, batch, io_task_work.fallback_node)
                req->io_task_work.func(req, &locked);

        if (locked) {
                if (ctx->submit_state.compl_nr)
                        io_submit_flush_completions(ctx);
                mutex_unlock(&ctx->uring_lock);
        }
        percpu_ref_put(&ctx->refs);
}

static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
        struct io_ring_ctx *ctx;
        int hash_bits;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return NULL;

        /*
         * Use 5 bits less than the max cq entries, that should give us around
         * 32 entries per hash list if totally full and uniformly spread.
         */
        hash_bits = ilog2(p->cq_entries);
        hash_bits -= 5;
        if (hash_bits <= 0)
                hash_bits = 1;
        ctx->cancel_hash_bits = hash_bits;
        ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
                                        GFP_KERNEL);
        if (!ctx->cancel_hash)
                goto err;
        __hash_init(ctx->cancel_hash, 1U << hash_bits);

        ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
        if (!ctx->dummy_ubuf)
                goto err;
        /* set invalid range, so io_import_fixed() fails meeting it */
        ctx->dummy_ubuf->ubuf = -1UL;

        if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
                            PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
                goto err;

        ctx->flags = p->flags;
        init_waitqueue_head(&ctx->sqo_sq_wait);
        INIT_LIST_HEAD(&ctx->sqd_list);
        init_waitqueue_head(&ctx->poll_wait);
        INIT_LIST_HEAD(&ctx->cq_overflow_list);
        init_completion(&ctx->ref_comp);
        xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
        xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
        mutex_init(&ctx->uring_lock);
        init_waitqueue_head(&ctx->cq_wait);
        spin_lock_init(&ctx->completion_lock);
        spin_lock_init(&ctx->timeout_lock);
        INIT_LIST_HEAD(&ctx->iopoll_list);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
        INIT_LIST_HEAD(&ctx->ltimeout_list);
        spin_lock_init(&ctx->rsrc_ref_lock);
        INIT_LIST_HEAD(&ctx->rsrc_ref_list);
        INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
        init_llist_head(&ctx->rsrc_put_llist);
        INIT_LIST_HEAD(&ctx->tctx_list);
        INIT_LIST_HEAD(&ctx->submit_state.free_list);
        INIT_LIST_HEAD(&ctx->locked_free_list);
        INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
        return ctx;
err:
        kfree(ctx->dummy_ubuf);
        kfree(ctx->cancel_hash);
        kfree(ctx);
        return NULL;
}

/*
 * Account one dropped completion: bump the cq_overflow counter in the
 * rings structure and decrement ctx->cq_extra.
 */
static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
        struct io_rings *rings = ctx->rings;

        WRITE_ONCE(rings->cq_overflow, READ_ONCE(rings->cq_overflow) + 1);
        ctx->cq_extra -= 1;
}

/*
 * Returns true if a drain request (REQ_F_IO_DRAIN) must keep waiting, i.e.
 * @seq adjusted by ctx->cq_extra has not yet caught up with the cached CQ
 * tail. Requests without the drain flag never need deferring.
 */
static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
        struct io_ring_ctx *ctx;

        if (likely(!(req->flags & REQ_F_IO_DRAIN)))
                return false;

        ctx = req->ctx;
        return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
}

/*
 * Low-order flag bits and the mask that strips them. FFS_ISREG is only a
 * real bit when CONFIG_64BIT is set; otherwise it is defined as zero.
 */
#define FFS_ASYNC_READ                0x1UL
#define FFS_ASYNC_WRITE                0x2UL
#ifndef CONFIG_64BIT
#define FFS_ISREG                0x0UL
#else
#define FFS_ISREG                0x4UL
#endif
#define FFS_MASK                (~(FFS_ASYNC_READ | FFS_ASYNC_WRITE | FFS_ISREG))

static inline bool io_req_ffs_set(struct io_kiocb *req)
{
        return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
}

/*
 * Flag @req as inflight and count it in the owning task's
 * inflight_tracked counter. A request is only counted once.
 */
static void io_req_track_inflight(struct io_kiocb *req)
{
        if (req->flags & REQ_F_INFLIGHT)
                return;

        req->flags |= REQ_F_INFLIGHT;
        atomic_inc(&req->task->io_uring->inflight_tracked);
}

/*
 * Slow path of io_prep_linked_timeout(): switch @req from "timeout needs
 * arming" (REQ_F_ARM_LTIMEOUT) to "timeout linked" (REQ_F_LINK_TIMEOUT),
 * enable refcounting on both requests and return the linked timeout.
 * Warns once and returns NULL if @req has no link.
 */
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
        struct io_kiocb *link = req->link;

        if (WARN_ON_ONCE(!link))
                return NULL;

        req->flags = (req->flags & ~REQ_F_ARM_LTIMEOUT) | REQ_F_LINK_TIMEOUT;

        /* linked timeouts should have two refs once prep'ed */
        io_req_set_refcount(req);
        __io_req_set_refcount(link, 2);
        return link;
}

/*
 * Returns the prepared linked timeout of @req if one is waiting to be
 * armed (REQ_F_ARM_LTIMEOUT), NULL otherwise.
 */
static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
        if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
                return __io_prep_linked_timeout(req);

        return NULL;
}

/*
 * Initialise req->work before the request is handed to io-wq:
 *  - grab the current credentials unless the request already has some,
 *  - reset the work item, marking it concurrent for REQ_F_FORCE_ASYNC,
 *  - hash work on regular files by inode when the opcode asks for it or
 *    the ring uses IOPOLL,
 *  - mark work as unbound when the opcode allows it and the request has
 *    no file or the file is not a block device.
 */
static void io_prep_async_work(struct io_kiocb *req)
{
        const struct io_op_def *def = &io_op_defs[req->opcode];
        struct io_ring_ctx *ctx = req->ctx;

        if (!(req->flags & REQ_F_CREDS)) {
                req->flags |= REQ_F_CREDS;
                req->creds = get_current_cred();
        }

        req->work.list.next = NULL;
        req->work.flags = (req->flags & REQ_F_FORCE_ASYNC) ?
                                IO_WQ_WORK_CONCURRENT : 0;

        if (req->flags & REQ_F_ISREG) {
                if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
                        io_wq_hash_work(&req->work, file_inode(req->file));
                return;
        }

        /* block devices never get IO_WQ_WORK_UNBOUND */
        if (req->file && S_ISBLK(file_inode(req->file)->i_mode))
                return;

        if (def->unbound_nonreg_file)
                req->work.flags |= IO_WQ_WORK_UNBOUND;
}

/*
 * Run io_prep_async_work() on every request of the chain headed by @req.
 * When @req has REQ_F_LINK_TIMEOUT set, the chain is walked under
 * ->timeout_lock.
 */
static void io_prep_async_link(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx;
        struct io_kiocb *cur;

        if (!(req->flags & REQ_F_LINK_TIMEOUT)) {
                for (cur = req; cur; cur = cur->link)
                        io_prep_async_work(cur);
                return;
        }

        ctx = req->ctx;
        spin_lock_irq(&ctx->timeout_lock);
        for (cur = req; cur; cur = cur->link)
                io_prep_async_work(cur);
        spin_unlock_irq(&ctx->timeout_lock);
}

/*
 * Punt @req (and its link chain) to the io-wq of the submitting task.
 * A pending linked timeout is prepared first and queued only after the
 * work item has been enqueued. @locked is deliberately ignored.
 */
static void io_queue_async_work(struct io_kiocb *req, bool *locked)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_uring_task *tctx;
        struct io_kiocb *ltimeout;

        ltimeout = io_prep_linked_timeout(req);
        tctx = req->task->io_uring;

        /* must not take the lock, NULL it as a precaution */
        locked = NULL;

        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);

        /* init ->work of the whole link before punting */
        io_prep_async_link(req);

        /*
         * This is not expected to trigger. Should a bug ever let a request
         * be queued from outside its thread group, flag the work as
         * canceled so io-wq takes the normal cancellation route instead of
         * running it (or spawning a new worker for it).
         */
        if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
                req->work.flags |= IO_WQ_WORK_CANCEL;

        trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
                                        &req->work, req->flags);
        io_wq_enqueue(tctx->io_wq, &req->work);

        if (ltimeout)
                io_queue_linked_timeout(ltimeout);
}

/*
 * Try to cancel the timer of timeout request @req and, if that did not
 * return -1, complete the request with @status: bump cq_timeouts, unlink
 * it from the timeout list, post its CQE and drop a reference. A non-zero
 * @status also marks the request as failed.
 */
static void io_kill_timeout(struct io_kiocb *req, int status)
        __must_hold(&req->ctx->completion_lock)
        __must_hold(&req->ctx->timeout_lock)
{
        struct io_timeout_data *io = req->async_data;
        struct io_ring_ctx *ctx = req->ctx;

        /* -1: the timer could not be cancelled, leave the request alone */
        if (hrtimer_try_to_cancel(&io->timer) == -1)
                return;

        if (status)
                req_set_fail(req);
        atomic_set(&ctx->cq_timeouts, atomic_read(&ctx->cq_timeouts) + 1);
        list_del_init(&req->timeout.list);
        io_fill_cqe_req(req, status, 0);
        io_put_req_deferred(req);
}

/*
 * Queue deferred requests from the head of ctx->defer_list for as long as
 * they no longer need deferring; stops at the first one that still does.
 * Must be called with ->completion_lock held.
 */
static void io_queue_deferred(struct io_ring_ctx *ctx)
{
        struct io_defer_entry *de;

        lockdep_assert_held(&ctx->completion_lock);

        for (;;) {
                if (list_empty(&ctx->defer_list))
                        return;

                de = list_first_entry(&ctx->defer_list, struct io_defer_entry,
                                      list);
                if (req_need_defer(de->req, de->seq))
                        return;

                list_del_init(&de->list);
                io_req_task_queue(de->req);
                kfree(de);
        }
}

/*
 * Complete every sequence-based timeout whose target sequence has been
 * reached. The walk over ctx->timeout_list stops at the first timeout
 * without a sequence or the first one that is not due yet.
 */
static void io_flush_timeouts(struct io_ring_ctx *ctx)
        __must_hold(&ctx->completion_lock)
{
        u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
        struct io_kiocb *req, *next;

        spin_lock_irq(&ctx->timeout_lock);
        list_for_each_entry_safe(req, next, &ctx->timeout_list, timeout.list) {
                u32 needed, got;

                if (io_is_timeout_noseq(req))
                        break;

                /*
                 * seq wraps around over time, so both values are taken
                 * relative to the sequence of the previous flush. As long
                 * as fewer than 2^31 events happened since then, neither
                 * subtraction wraps and a plain comparison tells whether
                 * the target lies in [last_seq, current_seq].
                 */
                needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
                got = seq - ctx->cq_last_tm_flush;
                if (needed > got)
                        break;

                io_kill_timeout(req, 0);
        }
        ctx->cq_last_tm_flush = seq;
        spin_unlock_irq(&ctx->timeout_lock);
}

/*
 * Slow path of io_commit_cqring(): flush sequence timeouts and/or queue
 * deferred (drain) requests, depending on which feature is in use.
 */
static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
        if (ctx->off_timeout_used) {
                io_flush_timeouts(ctx);
        }
        if (ctx->drain_active) {
                io_queue_deferred(ctx);
        }
}

static inline bool io_commit_needs_flush(struct io_ring_ctx *ctx)
{
        return ctx->off_timeout_used || ctx->drain_active;
}

static inline void __io_commit_cqring(struct io_ring_ctx *ctx)
{
        /* order cqe stores with ring update */
        smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}

static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
        if (unlikely(io_commit_needs_flush(ctx)))
                __io_commit_cqring_flush(ctx);
        __io_commit_cqring(ctx);
}

/*
 * Returns true when the distance between the ring's sq.tail and our
 * cached SQ head equals the number of SQ entries.
 */
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
        return READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head ==
                ctx->sq_entries;
}

static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
        return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

/*
 * Reserve the next CQ ring slot and advance the cached tail.
 * Returns NULL, without advancing, when the ring already holds
 * cq_entries unconsumed events.
 */
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
        unsigned slot;

        /*
         * Stores to the cq entry have to be ordered after the read of
         * head. The control dependency suffices because the entry is
         * filled with WRITE_ONCE.
         */
        if (__io_cqring_events(ctx) == ctx->cq_entries)
                return NULL;

        slot = ctx->cached_cq_tail++ & (ctx->cq_entries - 1);
        return &ctx->rings->cqes[slot];
}

static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
        if (likely(!ctx->cq_ev_fd))
                return false;
        if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
                return false;
        return !ctx->eventfd_async || io_wq_current_is_worker();
}

/*
 * This should only get called when at least one event has been posted.
 * Some applications rely on the eventfd notification count only changing
 * IFF a new CQE has been added to the CQ ring. There's no dependency on
 * a 1:1 relationship between how many times this function is called (and
 * hence the eventfd count) and the number of CQEs posted to the CQ ring.
 */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
        void *key = poll_to_key(EPOLL_URING_WAKE | EPOLLIN);

        /*
         * Waking everyone looks heavy-handed, but io_wake_function() and
         * io_should_wake() terminate the loop and make sure only as many
         * waiters as needed are actually woken.
         */
        if (wq_has_sleeper(&ctx->cq_wait))
                __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, key);
        if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        if (io_should_trigger_evfd(ctx))
                eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
        if (waitqueue_active(&ctx->poll_wait))
                __wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0, key);
}

/*
 * Notification path variant named for IOPOLL completions: cq_wait is only
 * woken on SQPOLL rings, the eventfd and poll_wait are handled as in
 * io_cqring_ev_posted().
 */
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
        void *key = poll_to_key(EPOLL_URING_WAKE | EPOLLIN);

        /* see waitqueue_active() comment */
        smp_mb();

        if ((ctx->flags & IORING_SETUP_SQPOLL) &&
            waitqueue_active(&ctx->cq_wait))
                __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, key);
        if (io_should_trigger_evfd(ctx))
                eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
        if (waitqueue_active(&ctx->poll_wait))
                __wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0, key);
}

/*
 * Move backlogged completions from ctx->cq_overflow_list into the CQ ring.
 * Without @force the flush stops as soon as the ring is full; with @force
 * entries that do not fit are dropped and accounted as overflowed.
 *
 * Returns true if there are no backlogged entries after the flush.
 */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
        bool posted = false;
        bool all_flushed;

        if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
                return false;

        spin_lock(&ctx->completion_lock);
        for (;;) {
                struct io_overflow_cqe *ocqe;
                struct io_uring_cqe *cqe;

                if (list_empty(&ctx->cq_overflow_list))
                        break;

                cqe = io_get_cqe(ctx);
                if (!cqe && !force)
                        break;

                ocqe = list_first_entry(&ctx->cq_overflow_list,
                                        struct io_overflow_cqe, list);
                if (!cqe)
                        io_account_cq_overflow(ctx);
                else
                        memcpy(cqe, &ocqe->cqe, sizeof(*cqe));

                posted = true;
                list_del(&ocqe->list);
                kfree(ocqe);
        }

        all_flushed = list_empty(&ctx->cq_overflow_list);
        if (all_flushed) {
                /* backlog is gone, stop advertising the overflow state */
                clear_bit(0, &ctx->check_cq_overflow);
                WRITE_ONCE(ctx->rings->sq_flags,
                           ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
        }

        if (posted)
                io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);

        if (posted)
                io_cqring_ev_posted(ctx);
        return all_flushed;
}

/*
 * Flush the CQ overflow backlog if the check_cq_overflow bit says there
 * is one. Returns true when nothing is backlogged afterwards (including
 * the case where there was nothing to flush).
 */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
        bool flushed;

        if (!test_bit(0, &ctx->check_cq_overflow))
                return true;

        /* iopoll syncs against uring_lock, not completion_lock */
        if (ctx->flags & IORING_SETUP_IOPOLL)
                mutex_lock(&ctx->uring_lock);
        flushed = __io_cqring_overflow_flush(ctx, false);
        if (ctx->flags & IORING_SETUP_IOPOLL)
                mutex_unlock(&ctx->uring_lock);

        return flushed;
}

/*
 * Return @nr task references. Must be called somewhat shortly after
 * putting a request.
 *
 * For the current task the references just go back into the per-task
 * cache. For any other task they are dropped for real: the inflight
 * counter is decreased, an idling task is woken and the task_struct
 * references are put.
 */
static inline void io_put_task(struct task_struct *task, int nr)
{
        struct io_uring_task *tctx = task->io_uring;

        if (likely(task == current)) {
                tctx->cached_refs += nr;
                return;
        }

        percpu_counter_sub(&tctx->inflight, nr);
        if (unlikely(atomic_read(&tctx->in_idle)))
                wake_up(&tctx->wait);
        put_task_struct_many(task, nr);
}

/*
 * Top the current task's cached reference count back up to
 * IO_TCTX_REFS_CACHE_NR, accounting the added references both in the
 * inflight counter and in the task_struct usage count.
 */
static void io_task_refs_refill(struct io_uring_task *tctx)
{
        unsigned int refill = IO_TCTX_REFS_CACHE_NR - tctx->cached_refs;

        percpu_counter_add(&tctx->inflight, refill);
        refcount_add(refill, &current->usage);
        tctx->cached_refs += refill;
}

/*
 * Consume @nr references from the current task's cache, refilling the
 * cache when it goes negative.
 */
static inline void io_get_task_refs(int nr)
{
        struct io_uring_task *tctx = current->io_uring;

        tctx->cached_refs -= nr;
        if (likely(tctx->cached_refs >= 0))
                return;

        io_task_refs_refill(tctx);
}

/*
 * Give back every reference sitting in @task's cache: reset the cache to
 * zero, subtract the amount from the inflight counter and put the
 * matching task_struct references. Does nothing for an empty cache.
 */
static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
        struct io_uring_task *tctx = task->io_uring;
        unsigned int refs = tctx->cached_refs;

        if (!refs)
                return;

        tctx->cached_refs = 0;
        percpu_counter_sub(&tctx->inflight, refs);
        put_task_struct_many(task, refs);
}

/*
 * Stash a completion that did not fit into the CQ ring on
 * ctx->cq_overflow_list. The first backlogged entry also sets the
 * check_cq_overflow bit and IORING_SQ_CQ_OVERFLOW in sq_flags.
 *
 * Returns true if the completion was queued, false if the backlog entry
 * could not be allocated and the completion was dropped (and accounted
 * as an overflow).
 */
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
                                     s32 res, u32 cflags)
{
        struct io_overflow_cqe *ocqe;

        ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
        if (!ocqe) {
                /*
                 * Whether we are in ring overflow flush mode, in task
                 * cancel mode, or simply failed to allocate an overflow
                 * entry: the completion has to be dropped on the floor.
                 */
                io_account_cq_overflow(ctx);
                return false;
        }

        if (list_empty(&ctx->cq_overflow_list)) {
                set_bit(0, &ctx->check_cq_overflow);
                WRITE_ONCE(ctx->rings->sq_flags,
                           ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
        }

        ocqe->cqe.user_data = user_data;
        ocqe->cqe.res = res;
        ocqe->cqe.flags = cflags;
        list_add_tail(&ocqe->list, &ctx->cq_overflow_list);

        return true;
}

/*
 * Post one completion. The CQE is written straight into the ring when a
 * slot is free; otherwise it is handed to io_cqring_event_overflow().
 * Returns true unless the completion had to be dropped.
 */
static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
                                 s32 res, u32 cflags)
{
        struct io_uring_cqe *cqe;

        trace_io_uring_complete(ctx, user_data, res, cflags);

        cqe = io_get_cqe(ctx);
        /*
         * No cq entry available means userspace overflowed the
         * submission (by quite a lot); take the overflow path.
         */
        if (unlikely(!cqe))
                return io_cqring_event_overflow(ctx, user_data, res, cflags);

        WRITE_ONCE(cqe->user_data, user_data);
        WRITE_ONCE(cqe->res, res);
        WRITE_ONCE(cqe->flags, cflags);
        return true;
}

/* Post a CQE for @req using its ring and user_data; the result is ignored. */
static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
        struct io_ring_ctx *ctx = req->ctx;

        __io_fill_cqe(ctx, req->user_data, res, cflags);
}

/*
 * Post a CQE for the given @user_data, counting it in ctx->cq_extra
 * first. Returns what __io_fill_cqe() returns.
 */
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
                                     s32 res, u32 cflags)
{
        ctx->cq_extra += 1;

        return __io_fill_cqe(ctx, user_data, res, cflags);
}

/*
 * Complete @req with result @res and CQE flags @cflags: post the CQE and
 * drop one request reference, all under ->completion_lock, then commit
 * the CQ ring and notify waiters after unlocking.
 *
 * If this was the last reference the request is torn down here and put on
 * ctx->locked_free_list for reuse.
 */
static void io_req_complete_post(struct io_kiocb *req, s32 res,
                                 u32 cflags)
{
        struct io_ring_ctx *ctx = req->ctx;

        spin_lock(&ctx->completion_lock);
        __io_fill_cqe(ctx, req->user_data, res, cflags);
        /*
         * If we're the last reference to this request, add to our locked
         * free_list cache.
         */
        if (req_ref_put_and_test(req)) {
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
                        /* cancel/fail what hangs off this request first */
                        if (req->flags & IO_DISARM_MASK)
                                io_disarm_next(req);
                        /* whatever is still linked gets queued to run next */
                        if (req->link) {
                                io_req_task_queue(req->link);
                                req->link = NULL;
                        }
                }
                io_dismantle_req(req);
                io_put_task(req->task, 1);
                list_add(&req->inflight_entry, &ctx->locked_free_list);
                ctx->locked_free_nr++;
        } else {
                /*
                 * Request is still referenced elsewhere: try to take a
                 * temporary ctx ref for the notification below. req is only
                 * used as a "have a ctx ref to drop" marker from here on.
                 */
                if (!percpu_ref_tryget(&ctx->refs))
                        req = NULL;
        }
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);

        if (req) {
                io_cqring_ev_posted(ctx);
                /*
                 * Drops either the temporary ref taken above or, in the
                 * final-put case, a ctx ref in the same way __io_free_req()
                 * does after recycling a request.
                 */
                percpu_ref_put(&ctx->refs);
        }
}

/* True if @req has any of the IO_REQ_CLEAN_FLAGS set. */
static inline bool io_req_needs_clean(struct io_kiocb *req)
{
        return (req->flags & IO_REQ_CLEAN_FLAGS) != 0;
}

/*
 * Deferred completion: clean the request if needed, then record @res and
 * @cflags in it and flag it REQ_F_COMPLETE_INLINE instead of posting a
 * CQE right away.
 */
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
                                         u32 cflags)
{
        /* cleanup must run before the completion data is stored */
        if (io_req_needs_clean(req)) {
                io_clean_op(req);
        }

        req->result = res;
        req->compl.cflags = cflags;
        req->flags = req->flags | REQ_F_COMPLETE_INLINE;
}

/*
 * Complete @req either in deferred fashion (IO_URING_F_COMPLETE_DEFER set
 * in @issue_flags) or by posting the CQE immediately.
 */
static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
                                     s32 res, u32 cflags)
{
        if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
                io_req_complete_state(req, res, cflags);
                return;
        }

        io_req_complete_post(req, res, cflags);
}

/*
 * Complete @req with @res and no CQE flags. No issue flags are passed, so
 * this always takes the immediate io_req_complete_post() path.
 */
static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
        io_req_complete_post(req, res, 0);
}

/* Mark @req as failed (REQ_F_FAIL) and post its completion with @res. */
static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
        req->flags |= REQ_F_FAIL;
        io_req_complete_post(req, res, 0);
}

/*
 * Fail a request that is not going to be submitted, completing it with
 * the result already stored in req->result.
 */
static void io_req_complete_fail_submit(struct io_kiocb *req)
{
        /*
         * Nothing gets submitted, so the whole chain has to fail: turn a
         * hardlink into a normal link. An extra REQ_F_LINK is tolerated.
         */
        req->flags = (req->flags & ~REQ_F_HARDLINK) | REQ_F_LINK;

        io_req_complete_failed(req, req->result);
}

/*
 * Fields set here are initialised once, when a request is first
 * allocated, rather than on every allocation; they are expected to stay
 * valid across reuse.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
        /* not necessary, but safer to zero */
        req->result = 0;

        req->async_data = NULL;
        req->link = NULL;
        req->ctx = ctx;
}

/*
 * Move every request from ctx->locked_free_list over to @state's
 * free_list and reset the locked counter, under ->completion_lock.
 */
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
                                        struct io_submit_state *state)
{
        struct list_head *locked_list = &ctx->locked_free_list;

        spin_lock(&ctx->completion_lock);
        list_splice_init(locked_list, &state->free_list);
        ctx->locked_free_nr = 0;
        spin_unlock(&ctx->completion_lock);
}

/*
 * Refill the submission-side request array (state->reqs) from the free
 * lists. Returns true IFF there are requests in the cache afterwards.
 */
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
        struct io_submit_state *state = &ctx->submit_state;
        struct io_kiocb *req;
        int nr;

        /*
         * Once the IRQ side locked cache holds more than a batch's worth
         * of requests, take the lock and pull them into the submission
         * side cache.
         */
        if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
                io_flush_cached_locked_reqs(ctx, state);

        for (nr = state->free_reqs; !list_empty(&state->free_list); ) {
                req = list_first_entry(&state->free_list, struct io_kiocb,
                                       inflight_entry);

                list_del(&req->inflight_entry);
                state->reqs[nr] = req;
                /* stop once the array is full */
                if (++nr == ARRAY_SIZE(state->reqs))
                        break;
        }

        state->free_reqs = nr;
        return nr != 0;
}

/*
 * Requests can be recycled into the request caches before the opcode
 * handlers and io_issue_sqe() have finished with them, e.g. on the inline
 * completion path. For that reason io_alloc_req() must only be called
 * under ->uring_lock, and callers have to take care not to pick up a
 * request that is still being worked on.
 *
 * Returns a request from the cache, allocating a new batch when the cache
 * is empty, or NULL if even a single request cannot be allocated.
 */
static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_state *state = &ctx->submit_state;
        gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
        int ret, i;

        BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);

        if (unlikely(!state->free_reqs && !io_flush_cached_reqs(ctx))) {
                ret = kmem_cache_alloc_bulk(req_cachep, gfp,
                                            IO_REQ_ALLOC_BATCH, state->reqs);

                /*
                 * The bulk allocation either delivers the whole batch or
                 * nothing. If it fails, fall back to a single allocation
                 * to be on the safe side.
                 */
                if (unlikely(ret <= 0)) {
                        state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
                        if (!state->reqs[0])
                                return NULL;
                        ret = 1;
                }

                for (i = 0; i < ret; i++)
                        io_preinit_req(state->reqs[i], ctx);
                state->free_reqs = ret;
        }

        /* hand out the request at the top of the array */
        return state->reqs[--state->free_reqs];
}

/* fput() wrapper that tolerates a NULL @file. */
static inline void io_put_file(struct file *file)
{
        if (!file)
                return;

        fput(file);
}

/*
 * Release what @req holds before it is recycled: opcode specific state,
 * the file reference (non-fixed files only), the rsrc node reference and
 * any async data.
 */
static void io_dismantle_req(struct io_kiocb *req)
{
        /* sampled before io_clean_op() gets a chance to touch the flags */
        const unsigned int flags = req->flags;

        if (io_req_needs_clean(req))
                io_clean_op(req);

        if (!(flags & REQ_F_FIXED_FILE))
                io_put_file(req->file);

        if (req->fixed_rsrc_refs)
                percpu_ref_put(req->fixed_rsrc_refs);

        if (!req->async_data)
                return;

        kfree(req->async_data);
        req->async_data = NULL;
}

/*
 * Free @req: dismantle it, return its task reference, park it on
 * ctx->locked_free_list for reuse and drop a ctx reference.
 */
static void __io_free_req(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct task_struct *task;

        io_dismantle_req(req);
        task = req->task;
        io_put_task(task, 1);

        spin_lock(&ctx->completion_lock);
        list_add(&req->inflight_entry, &ctx->locked_free_list);
        ctx->locked_free_nr += 1;
        spin_unlock(&ctx->completion_lock);

        percpu_ref_put(&ctx->refs);
}

/*
 * Unlink the request directly following @req from the chain. @req must
 * have a link; the removed request ends up with a NULL ->link.
 */
static inline void io_remove_next_linked(struct io_kiocb *req)
{
        struct io_kiocb *removed = req->link;
        struct io_kiocb *rest = removed->link;

        req->link = rest;
        removed->link = NULL;
}

/*
 * If the request linked after @req is an IORING_OP_LINK_TIMEOUT, detach it
 * and try to cancel its hrtimer. When hrtimer_try_to_cancel() does not return
 * -1, the timeout is removed from its list, completed with -ECANCELED and its
 * reference is put in deferred fashion.
 *
 * Returns true if a CQE was filled for the timeout, false otherwise.
 */
static bool io_kill_linked_timeout(struct io_kiocb *req)
        __must_hold(&req->ctx->completion_lock)
        __must_hold(&req->ctx->timeout_lock)
{
        struct io_kiocb *link = req->link;
        struct io_timeout_data *io;

        if (!link || link->opcode != IORING_OP_LINK_TIMEOUT)
                return false;

        io = link->async_data;
        io_remove_next_linked(req);
        link->timeout.head = NULL;

        /* -1 from hrtimer_try_to_cancel(): leave the timeout alone */
        if (hrtimer_try_to_cancel(&io->timer) == -1)
                return false;

        list_del(&link->timeout.list);
        io_fill_cqe_req(link, -ECANCELED, 0);
        io_put_req_deferred(link);
        return true;
}

/*
 * Detach the whole chain hanging off @req and complete every request in it.
 * A link already flagged REQ_F_FAIL is completed with its own ->result, all
 * others with -ECANCELED. Each link's reference is put in deferred fashion.
 */
static void io_fail_links(struct io_kiocb *req)
        __must_hold(&req->ctx->completion_lock)
{
        struct io_kiocb *cur = req->link;

        req->link = NULL;
        while (cur != NULL) {
                struct io_kiocb *following;
                long res = -ECANCELED;

                if (cur->flags & REQ_F_FAIL)
                        res = cur->result;

                /* remember the successor before the link pointer is cleared */
                following = cur->link;
                cur->link = NULL;

                trace_io_uring_fail_link(req, cur);
                io_fill_cqe_req(cur, res, 0);
                io_put_req_deferred(cur);

                cur = following;
        }
}

/*
 * Deal with whatever follows @req in its link chain when @req completes:
 *  - REQ_F_ARM_LTIMEOUT: the flag is cleared and a directly linked
 *    IORING_OP_LINK_TIMEOUT is removed and completed with -ECANCELED;
 *  - otherwise, REQ_F_LINK_TIMEOUT: the linked timeout is killed under
 *    timeout_lock;
 *  - a failed request that is not a hard link has the rest of its chain
 *    failed.
 *
 * Returns true if any CQE was posted (or, for the failure case, if there was
 * a chain to fail), so that the caller knows to commit the CQ ring.
 */
static bool io_disarm_next(struct io_kiocb *req)
        __must_hold(&req->ctx->completion_lock)
{
        bool posted = false;

        if (req->flags & REQ_F_ARM_LTIMEOUT) {
                struct io_kiocb *tmo = req->link;

                req->flags &= ~REQ_F_ARM_LTIMEOUT;
                if (tmo != NULL && tmo->opcode == IORING_OP_LINK_TIMEOUT) {
                        io_remove_next_linked(req);
                        io_fill_cqe_req(tmo, -ECANCELED, 0);
                        io_put_req_deferred(tmo);
                        posted = true;
                }
        } else if (req->flags & REQ_F_LINK_TIMEOUT) {
                spin_lock_irq(&req->ctx->timeout_lock);
                posted = io_kill_linked_timeout(req);
                spin_unlock_irq(&req->ctx->timeout_lock);
        }

        /* only a failed, non-hardlinked request takes its chain down with it */
        if (likely(!(req->flags & REQ_F_FAIL)) ||
            (req->flags & REQ_F_HARDLINK))
                return posted;

        if (req->link != NULL)
                posted = true;
        io_fail_links(req);
        return posted;
}

/*
 * Detach and return the request linked after @req (NULL if there is none).
 *
 * When any IO_DISARM_MASK flag is set, io_disarm_next() is first run under
 * completion_lock; it may cancel a linked timeout or fail the remainder of
 * the chain. If that posted completions, the CQ ring is committed while the
 * lock is still held and waiters are notified after it is released.
 */
static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
{
        struct io_kiocb *next;

        if (req->flags & IO_DISARM_MASK) {
                struct io_ring_ctx *ctx = req->ctx;
                bool posted;

                spin_lock(&ctx->completion_lock);
                posted = io_disarm_next(req);
                if (posted)
                        io_commit_cqring(ctx);
                spin_unlock(&ctx->completion_lock);

                if (posted)
                        io_cqring_ev_posted(ctx);
        }

        next = req->link;
        req->link = NULL;
        return next;
}

/*
 * Fast-path wrapper: requests carrying neither REQ_F_LINK nor REQ_F_HARDLINK
 * yield NULL straight away; everything else goes through
 * __io_req_find_next().
 */
static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
        if (unlikely(req->flags & (REQ_F_LINK | REQ_F_HARDLINK)))
                return __io_req_find_next(req);
        return NULL;
}

/*
 * Finish a batch of work done on behalf of @ctx (NULL is a no-op).
 *
 * If *@locked is set, pending batched completions are flushed, uring_lock is
 * released and *@locked is reset to false. In all non-NULL cases one
 * reference on ctx->refs is dropped.
 */
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
        if (ctx == NULL)
                return;

        if (!*locked)
                goto drop_ref;

        if (ctx->submit_state.compl_nr != 0)
                io_submit_flush_completions(ctx);
        mutex_unlock(&ctx->uring_lock);
        *locked = false;

drop_ref:
        percpu_ref_put(&ctx->refs);
}

/*
 * task_work callback embedded in struct io_uring_task.
 *
 * Repeatedly detaches the whole tctx->task_list and invokes the
 * io_task_work.func handler of every queued request. Consecutive requests of
 * the same ring share one ctx reference and, when mutex_trylock() succeeds,
 * one hold of uring_lock. The loop ends once the list is found empty, at
 * which point task_running is cleared under task_lock.
 */
static void tctx_task_work(struct callback_head *cb)
{
        bool locked = false;
        struct io_ring_ctx *ctx = NULL;
        struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
                                                  task_work);

        while (1) {
                struct io_wq_work_node *node;

                /* list looks empty: flush completions batched under uring_lock */
                if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
                        io_submit_flush_completions(ctx);

                /* take the entire pending list in one go, leaving it empty */
                spin_lock_irq(&tctx->task_lock);
                node = tctx->task_list.first;
                INIT_WQ_LIST(&tctx->task_list);
                if (!node)
                        tctx->task_running = false;
                spin_unlock_irq(&tctx->task_lock);
                if (!node)
                        break;

                do {
                        /* fetch the successor before the handler runs */
                        struct io_wq_work_node *next = node->next;
                        struct io_kiocb *req = container_of(node, struct io_kiocb,
                                                            io_task_work.node);

                        /* switching rings: finish the previous one, pin the new one */
                        if (req->ctx != ctx) {
                                ctx_flush_and_put(ctx, &locked);
                                ctx = req->ctx;
                                /* if not contended, grab and improve batching */
                                locked = mutex_trylock(&ctx->uring_lock);
                                percpu_ref_get(&ctx->refs);
                        }
                        req->io_task_work.func(req, &locked);
                        node = next;
                        /* release the lock and ctx reference before rescheduling */
                        if (unlikely(need_resched())) {
                                ctx_flush_and_put(ctx, &locked);
                                ctx = NULL;
                                cond_resched();
                        }
                } while (node);
        }

        ctx_flush_and_put(ctx, &locked);

        /* relaxed read is enough as only the task itself sets ->in_idle */
        if (unlikely(atomic_read(&tctx->in_idle)))
                io_uring_drop_tctx_refs(current);
}

/*
 * Queue @req on its task's io_uring task_list so that req->io_task_work.func
 * runs from task_work context.
 *
 * Only the caller that flips tctx->task_running from false to true goes on to
 * call task_work_add(). If task_work_add() fails, task_running is reset and
 * every request on the pending list is moved to its ring's fallback_llist,
 * with fallback_work scheduled for each list that llist_add() reports as
 * previously empty.
 */
static void io_req_task_work_add(struct io_kiocb *req)
{
        struct task_struct *tsk = req->task;
        struct io_uring_task *tctx = tsk->io_uring;
        enum task_work_notify_mode notify;
        struct io_wq_work_node *node;
        unsigned long flags;
        bool running;

        WARN_ON_ONCE(!tctx);

        /* irqsave variant: restores the caller's interrupt state on unlock */
        spin_lock_irqsave(&tctx->task_lock, flags);
        wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
        running = tctx->task_running;
        if (!running)
                tctx->task_running = true;
        spin_unlock_irqrestore(&tctx->task_lock, flags);

        /* task_work already pending, we're done */
        if (running)
                return;

        /*
         * SQPOLL kernel thread doesn't need notification, just a wakeup. For
         * all other cases, use TWA_SIGNAL unconditionally to ensure we're
         * processing task_work. There's no reliable way to tell if TWA_RESUME
         * will do the job.
         */
        notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
        if (!task_work_add(tsk, &tctx->task_work, notify)) {
                wake_up_process(tsk);
                return;
        }

        /* task_work_add() failed: undo task_running and take the pending list */
        spin_lock_irqsave(&tctx->task_lock, flags);
        tctx->task_running = false;
        node = tctx->task_list.first;
        INIT_WQ_LIST(&tctx->task_list);
        spin_unlock_irqrestore(&tctx->task_lock, flags);

        /* hand every detached request to its ring's fallback work */
        while (node) {
                req = container_of(node, struct io_kiocb, io_task_work.node);
                node = node->next;
                if (llist_add(&req->io_task_work.fallback_node,
                              &req->ctx->fallback_llist))
                        schedule_delayed_work(&req->ctx->fallback_work, 1);
        }
}

/*
 * Task-work handler that fails @req with the error previously stored in
 * req->result.
 */
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
        /* not needed for normal modes, but SQPOLL depends on it */
        io_tw_lock(req->ctx, locked);

        io_req_complete_failed(req, req->result);
}

/*
 * Task-work handler that issues @req via __io_queue_sqe(), unless the owning
 * task is exiting, in which case the request is failed with -EFAULT.
 */
static void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
        io_tw_lock(req->ctx, locked);

        /* req->task == current here, checking PF_EXITING is safe */
        if (unlikely(req->task->flags & PF_EXITING)) {
                io_req_complete_failed(req, -EFAULT);
                return;
        }

        __io_queue_sqe(req);
}

/*
 * Record @ret in req->result and queue @req for task_work with
 * io_req_task_cancel() as the handler, which fails the request with that
 * result.
 */
static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
        req->result = ret;
        req->io_task_work.func = io_req_task_cancel;
        io_req_task_work_add(req);
}

/* Queue @req for task_work with io_req_task_submit() as the handler. */
static void io_req_task_queue(struct io_kiocb *req)
{
        req->io_task_work.func = io_req_task_submit;
        io_req_task_work_add(req);
}

/* Queue @req for task_work with io_queue_async_work() as the handler. */
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
        req->io_task_work.func = io_queue_async_work;
        io_req_task_work_add(req);
}

/*
 * Detach the request linked after @req, if any, and queue it for submission
 * from task_work.
 */
static inline void io_queue_next(struct io_kiocb *req)
{
        struct io_kiocb *next = io_req_find_next(req);

        if (!next)
                return;
        io_req_task_queue(next);
}

/* Queue the next linked request (if any), then free @req itself. */
static void io_free_req(struct io_kiocb *req)
{
        io_queue_next(req);
        __io_free_req(req);
}

/*
 * Task-work handler form of io_free_req(); @locked is part of the handler
 * signature and is not used here.
 */
static void io_free_req_work(struct io_kiocb *req, bool *locked)
{
        io_free_req(req);
}

/*
 * Accumulator used to drop task and ring-context references in bulk while
 * freeing a run of requests.
 */
struct req_batch {
        /* task the pending task_refs belong to; NULL until the first request */
        struct task_struct        *task;
        /* task references accumulated for @task and not yet put */
        int                        task_refs;
        /* ring context references accumulated and not yet put */
        int                        ctx_refs;
};

/* Reset @rb to the empty state: no task recorded and both counters at zero. */
static inline void io_init_req_batch(struct req_batch *rb)
{
        rb->task = NULL;
        rb->task_refs = 0;
        rb->ctx_refs = 0;
}

/*
 * Flush what @rb accumulated: put the batched references on @ctx->refs in one
 * call and put the batched references of the last recorded task. Either step
 * is skipped when there is nothing to put.
 */
static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
                                     struct req_batch *rb)
{
        if (rb->ctx_refs)
                percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
        if (rb->task)
                io_put_task(rb->task, rb->task_refs);
}

static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
                              struct io_submit_state *state)
{
        io_queue_next(req);
        io_dismantle_req(req);

        if (req->task != rb->task) {
                if (rb->task)
                        io_put_task(rb->task, rb->task_refs);
                rb->task = req->task;
                rb->task_refs = 0;
        }
        rb->task_refs++;
        rb->ctx_refs++;

        if (state->free_reqs != ARRAY_SIZE(state->reqs))
                state->reqs[state->free_reqs++] = req;
        else
                list_add(&req->inflight_entry, &state->free_list);
}

/*
 * Post the completions batched in ctx->submit_state.compl_reqs[].
 *
 * All CQEs are filled and committed under completion_lock, waiters are then
 * notified, and finally one reference is dropped on each request; requests
 * whose last reference this was are freed through the batch helpers.
 * compl_nr is reset to zero at the end.
 */
static void io_submit_flush_completions(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_state *state = &ctx->submit_state;
        const int total = state->compl_nr;
        struct req_batch rb;
        int idx;

        spin_lock(&ctx->completion_lock);
        for (idx = 0; idx < total; idx++) {
                struct io_kiocb *done = state->compl_reqs[idx];

                __io_fill_cqe(ctx, done->user_data, done->result,
                              done->compl.cflags);
        }
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
        io_cqring_ev_posted(ctx);

        io_init_req_batch(&rb);
        for (idx = 0; idx < total; idx++) {
                struct io_kiocb *done = state->compl_reqs[idx];

                if (!req_ref_put_and_test(done))
                        continue;
                io_req_free_batch(&rb, done, state);
        }
        io_req_free_batch_finish(ctx, &rb);

        state->compl_nr = 0;
}

/*
 * Put one reference on @req. If that was the final reference, the request is
 * freed and the next request in its link chain (possibly NULL) is returned;
 * otherwise NULL is returned.
 */
static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
{
        struct io_kiocb *next;

        if (!req_ref_put_and_test(req))
                return NULL;

        next = io_req_find_next(req);
        __io_free_req(req);
        return next;
}

/* Put one reference on @req and free it if that was the last one. */
static inline void io_put_req(struct io_kiocb *req)
{
        if (!req_ref_put_and_test(req))
                return;
        io_free_req(req);
}

/*
 * Put one reference on @req; if it was the last one, the actual free is
 * deferred to task_work (io_free_req_work) instead of being done inline.
 */
static inline void io_put_req_deferred(struct io_kiocb *req)
{
        if (!req_ref_put_and_test(req))
                return;

        req->io_task_work.func = io_free_req_work;
        io_req_task_work_add(req);
}

/*
 * Return the value of __io_cqring_events() for @ctx, issuing a read memory
 * barrier first.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
        /* See comment at the top of this file */
        smp_rmb();
        return __io_cqring_events(ctx);
}

/*
 * Distance between the SQ ring tail (read with acquire semantics) and the
 * ring context's cached SQ head.
 */
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
        /* make sure SQ entry isn't read before tail */
        return smp_load_acquire(&ctx->rings->sq.tail) - ctx->cached_sq_head;
}

/*
 * Release the selected buffer @kbuf of @req: clear REQ_F_BUFFER_SELECTED and
 * free the io_buffer.
 *
 * Returns the CQE flags for it, i.e. the buffer id shifted by
 * IORING_CQE_BUFFER_SHIFT together with IORING_CQE_F_BUFFER.
 */
static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
{
        /* the id must be read before the buffer is freed */
        unsigned int cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;

        req->flags &= ~REQ_F_BUFFER_SELECTED;
        kfree(kbuf);
        return cflags | IORING_CQE_F_BUFFER;
}

/*
 * Read/write flavour of io_put_kbuf(): when REQ_F_BUFFER_SELECTED is set,
 * req->rw.addr holds the io_buffer pointer to release. Returns the resulting
 * CQE flags, or 0 when no buffer was selected.
 */
static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
        if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) {
                struct io_buffer *kbuf;

                kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
                return io_put_kbuf(req, kbuf);
        }
        return 0;
}

/*
 * Run pending task work for the current task.
 *
 * Returns true when TIF_NOTIFY_SIGNAL was set or current->task_works was
 * non-empty (and tracehook_notify_signal() was therefore called), false
 * otherwise.
 */
static inline bool io_run_task_work(void)
{
        /*
         * PF_IO_WORKER never returns to userspace, so check here if we have
         * notify work that needs processing.
         */
        if ((current->flags & PF_IO_WORKER) &&
            test_thread_flag(TIF_NOTIFY_RESUME)) {
                __set_current_state(TASK_RUNNING);
                tracehook_notify_resume(NULL);
        }

        if (!test_thread_flag(TIF_NOTIFY_SIGNAL) && !current->task_works)
                return false;

        __set_current_state(TASK_RUNNING);
        tracehook_notify_signal();
        return true;
}

/*
 * Find and free completed poll iocbs
 *
 * Every request on @done is unlinked, has its selected buffer (if any)
 * released, and gets a CQE written directly into the ring; when no CQE slot
 * is available the completion goes through io_cqring_event_overflow() under
 * completion_lock instead. *@nr_events is incremented once per request.
 * Requests whose last reference is dropped here are freed in a batch.
 */
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
                               struct list_head *done)
{
        struct req_batch rb;
        struct io_kiocb *req;

        /* order with ->result store in io_complete_rw_iopoll() */
        smp_rmb();

        io_init_req_batch(&rb);
        while (!list_empty(done)) {
                struct io_uring_cqe *cqe;
                unsigned cflags;

                req = list_first_entry(done, struct io_kiocb, inflight_entry);
                list_del(&req->inflight_entry);
                cflags = io_put_rw_kbuf(req);
                (*nr_events)++;

                cqe = io_get_cqe(ctx);
                if (cqe) {
                        WRITE_ONCE(cqe->user_data, req->user_data);
                        WRITE_ONCE(cqe->res, req->result);
                        WRITE_ONCE(cqe->flags, cflags);
                } else {
                        /* no CQE slot available: take the overflow path */
                        spin_lock(&ctx->completion_lock);
                        io_cqring_event_overflow(ctx, req->user_data,
                                                        req->result, cflags);
                        spin_unlock(&ctx->completion_lock);
                }

                if (req_ref_put_and_test(req))
                        io_req_free_batch(&rb, req, &ctx->submit_state);
        }

        /* the flush part of the commit is the only piece done under the lock */
        if (io_commit_needs_flush(ctx)) {
                spin_lock(&ctx->completion_lock);
                __io_commit_cqring_flush(ctx);
                spin_unlock(&ctx->completion_lock);
        }
        __io_commit_cqring(ctx);
        io_cqring_ev_posted_iopoll(ctx);
        io_req_free_batch_finish(ctx, &rb);
}

/*
 * Make one pass over ctx->iopoll_list: collect requests that have already
 * completed, poll the first one that has not, and complete whatever was
 * collected.
 *
 * @nr_events is advanced by io_iopoll_complete() for each reaped request;
 * @min is only used to decide whether ->iopoll() may spin.
 *
 * Returns 0, or the negative value returned by the file's ->iopoll().
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                        long min)
{
        struct io_kiocb *req, *tmp;
        LIST_HEAD(done);
        bool spin;

        /*
         * Only spin for completions if we don't have multiple devices hanging
         * off our complete list, and we're under the requested amount.
         */
        spin = !ctx->poll_multi_queue && *nr_events < min;

        list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
                struct kiocb *kiocb = &req->rw.kiocb;
                int ret;

                /*
                 * Move completed and retryable entries to our local lists.
                 * If we find a request that requires polling, break out
                 * and complete those lists first, if we have entries there.
                 */
                if (READ_ONCE(req->iopoll_completed)) {
                        list_move_tail(&req->inflight_entry, &done);
                        continue;
                }
                if (!list_empty(&done))
                        break;

                /* 'done' is empty here, so the error return drops nothing */
                ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
                if (unlikely(ret < 0))
                        return ret;
                else if (ret)
                        spin = false;

                /* iopoll may have completed current req */
                if (READ_ONCE(req->iopoll_completed))
                        list_move_tail(&req->inflight_entry, &done);
        }

        if (!list_empty(&done))
                io_iopoll_complete(ctx, nr_events, &done);

        return 0;
}

/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 *
 * No-op for rings without IORING_SETUP_IOPOLL. Otherwise the iopoll list is
 * polled under uring_lock until it is empty or a pass reaps nothing.
 */
static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
        if ((ctx->flags & IORING_SETUP_IOPOLL) == 0)
                return;

        mutex_lock(&ctx->uring_lock);
        while (!list_empty(&ctx->iopoll_list)) {
                unsigned int reaped = 0;

                io_do_iopoll(ctx, &reaped, 0);

                /* let it sleep and repeat later if can't complete a request */
                if (!reaped)
                        break;

                if (!need_resched())
                        continue;

                /*
                 * Ensure we allow local-to-the-cpu processing to take place,
                 * in this case we need to ensure that we reap all events.
                 * Also let task_work, etc. to progress by releasing the mutex
                 */
                mutex_unlock(&ctx->uring_lock);
                cond_resched();
                mutex_lock(&ctx->uring_lock);
        }
        mutex_unlock(&ctx->uring_lock);
}

/*
 * Poll for completions on @ctx under uring_lock until at least @min events
 * were reaped, an error occurs, a signal is pending, or a reschedule is
 * needed.
 *
 * Returns 0, -EINTR when a signal is pending for the current task, or the
 * error returned by io_do_iopoll().
 */
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
        unsigned int nr_events = 0;
        int ret = 0;

        /*
         * We disallow the app entering submit/complete with polling, but we
         * still need to lock the ring to prevent racing with polled issue
         * that got punted to a workqueue.
         */
        mutex_lock(&ctx->uring_lock);
        /*
         * Don't enter poll loop if we already have events pending.
         * If we do, we can potentially be spinning for commands that
         * already triggered a CQE (eg in error).
         */
        if (test_bit(0, &ctx->check_cq_overflow))
                __io_cqring_overflow_flush(ctx, false);
        if (io_cqring_events(ctx))
                goto out;
        do {
                /*
                 * If a submit got punted to a workqueue, we can have the
                 * application entering polling for a command before it gets
                 * issued. That app will hold the uring_lock for the duration
                 * of the poll right here, so we need to take a breather every
                 * now and then to ensure that the issue has a chance to add
                 * the poll to the issued list. Otherwise we can spin here
                 * forever, while the workqueue is stuck trying to acquire the
                 * very same mutex.
                 */
                if (list_empty(&ctx->iopoll_list)) {
                        /* snapshot to detect CQEs posted while unlocked */
                        u32 tail = ctx->cached_cq_tail;

                        mutex_unlock(&ctx->uring_lock);
                        io_run_task_work();
                        mutex_lock(&ctx->uring_lock);

                        /* some requests don't go through iopoll_list */
                        if (tail != ctx->cached_cq_tail ||
                            list_empty(&ctx->iopoll_list))
                                break;
                }
                ret = io_do_iopoll(ctx, &nr_events, min);

                /* a pending signal overrides whatever io_do_iopoll() returned */
                if (task_sigpending(current)) {
                        ret = -EINTR;
                        goto out;
                }
        } while (!ret && nr_events < min && !need_resched());
out:
        mutex_unlock(&ctx->uring_lock);
        return ret;
}

/*
 * End write accounting on the superblock of @req's file. Only requests
 * flagged REQ_F_ISREG are affected; anything else is a no-op.
 */
static void kiocb_end_write(struct io_kiocb *req)
{
        struct super_block *sb;

        if (!(req->flags & REQ_F_ISREG))
                return;

        /*
         * Tell lockdep we inherited freeze protection from submission
         * thread.
         */
        sb = file_inode(req->file)->i_sb;
        __sb_writers_acquired(sb, SB_FREEZE_WRITE);
        sb_end_write(sb);
}

#ifdef CONFIG_BLOCK
/*
 * Get @req ready to be issued again. With async data already present the
 * saved iterator state is restored; without it, async preparation is run and
 * its success decides the outcome.
 *
 * Returns true when the request may be resubmitted.
 */
static bool io_resubmit_prep(struct io_kiocb *req)
{
        struct io_async_rw *rw = req->async_data;

        if (rw) {
                iov_iter_restore(&rw->iter, &rw->iter_state);
                return true;
        }
        return !io_req_prep_async(req);
}

/*
 * Decide whether a read/write that came back with a retryable error may be
 * reissued. Only block devices and regular files qualify, and a number of
 * contexts are excluded (see the individual checks).
 */
static bool io_rw_should_reissue(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        const umode_t mode = file_inode(req->file)->i_mode;

        if (!(S_ISBLK(mode) || S_ISREG(mode)))
                return false;

        if (req->flags & REQ_F_NOWAIT)
                return false;

        /* io-wq workers only reissue on IOPOLL rings */
        if (io_wq_current_is_worker() && !(ctx->flags & IORING_SETUP_IOPOLL))
                return false;

        /*
         * If ref is dying, we might be running poll reap from the exit work.
         * Don't attempt to reissue from that path, just let it fail with
         * -EAGAIN.
         */
        if (percpu_ref_is_dying(&ctx->refs))
                return false;

        /*
         * Play it safe and assume not safe to re-import and reissue if we're
         * not in the original thread group (or in task context).
         */
        return same_thread_group(req->task, current) && in_task();
}
#else
/* !CONFIG_BLOCK stub: resubmission is never prepared. */
static bool io_resubmit_prep(struct io_kiocb *req)
{
        return false;
}
/* !CONFIG_BLOCK stub: requests are never reissued. */
static bool io_rw_should_reissue(struct io_kiocb *req)
{
        return false;
}
#endif

/*
 * Trigger the notifications after having done some IO, and finish the write
 * accounting, if any.
 *
 * Writes (IOCB_WRITE set in the kiocb flags) end write accounting and raise
 * a modify notification; everything else raises an access notification.
 */
static void io_req_io_end(struct io_kiocb *req)
{
        if (!(req->rw.kiocb.ki_flags & IOCB_WRITE)) {
                fsnotify_access(req->file);
                return;
        }

        kiocb_end_write(req);
        fsnotify_modify(req->file);
}

/*
 * Common handling of a read/write result @res.
 *
 * A result equal to req->result needs nothing further. A differing -EAGAIN
 * or -EOPNOTSUPP that io_rw_should_reissue() accepts marks the request
 * REQ_F_REISSUE; any other differing result marks the request failed and is
 * stored in req->result.
 *
 * Returns true if the request was flagged for reissue, false otherwise.
 */
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
        if (res == req->result)
                return false;

        if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
            io_rw_should_reissue(req)) {
                /*
                 * Reissue will start accounting again, finish the
                 * current cycle.
                 */
                io_req_io_end(req);
                req->flags |= REQ_F_REISSUE;
                return true;
        }

        req_set_fail(req);
        req->result = res;
        return false;
}

/*
 * Fold bytes transferred by earlier attempts into @res. With no async data,
 * or no bytes done so far, @res is returned unchanged. Otherwise an error
 * result is replaced by the byte count and a non-negative result has the
 * byte count added to it.
 */
static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
{
        struct io_async_rw *io = req->async_data;

        /* nothing done previously: the result stands as is */
        if (!io || io->bytes_done <= 0)
                return res;

        if (res < 0)
                res = io->bytes_done;
        else
                res += io->bytes_done;
        return res;
}

/*
 * Task-work handler that completes @req with req->result.
 *
 * Without *@locked the completion is posted right away. With it, the request
 * is added to the submit state's completion batch, which is flushed as soon
 * as it is full.
 */
static void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
        const unsigned int cflags = io_put_rw_kbuf(req);
        const int res = req->result;
        struct io_submit_state *state;
        struct io_ring_ctx *ctx;

        if (!*locked) {
                io_req_complete_post(req, res, cflags);
                return;
        }

        ctx = req->ctx;
        state = &ctx->submit_state;

        io_req_complete_state(req, res, cflags);
        state->compl_reqs[state->compl_nr++] = req;
        if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
                io_submit_flush_completions(ctx);
}

/*
 * Task-work handler for read/write completion: finish the IO accounting and
 * notifications first, then complete the request.
 */
static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
{
        io_req_io_end(req);
        io_req_task_complete(req, locked);
}

/*
 * ->ki_complete handler installed by io_prep_rw() for rings without
 * IORING_SETUP_IOPOLL. @res2 is not used.
 *
 * Unless the request gets flagged for reissue, the result (adjusted for
 * previously completed bytes) is stored in req->result and completion is
 * deferred to task_work via io_req_rw_complete().
 */
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

        /* true means REQ_F_REISSUE was set; nothing to complete here */
        if (__io_complete_rw_common(req, res))
                return;
        req->result = io_fixup_rw_res(req, res);
        req->io_task_work.func = io_req_rw_complete;
        io_req_task_work_add(req);
}

/*
 * ->ki_complete handler installed by io_prep_rw() for IORING_SETUP_IOPOLL
 * rings. @res2 is not used.
 *
 * Write accounting is ended for writes. An unexpected -EAGAIN that may be
 * reissued only sets REQ_F_REISSUE; otherwise the result is published and
 * the request is marked iopoll_completed.
 */
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

        if (kiocb->ki_flags & IOCB_WRITE)
                kiocb_end_write(req);

        if (unlikely(res != req->result) && res == -EAGAIN &&
            io_rw_should_reissue(req)) {
                req->flags |= REQ_F_REISSUE;
                return;
        }

        WRITE_ONCE(req->result, res);
        /* order with io_iopoll_complete() checking ->result */
        smp_wmb();
        WRITE_ONCE(req->iopoll_completed, 1);
}

/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from a io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 *
 * When called from an io-wq worker, uring_lock is taken for the duration of
 * the list update and the SQPOLL thread is woken if it is sleeping.
 */
static void io_iopoll_req_issued(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        const bool in_async = io_wq_current_is_worker();

        /* workqueue context doesn't hold uring_lock, grab it now */
        if (unlikely(in_async))
                mutex_lock(&ctx->uring_lock);

        /*
         * Track whether we have multiple files in our lists. This will impact
         * how we do polling eventually, not spinning if we're on potentially
         * different devices.
         */
        if (list_empty(&ctx->iopoll_list)) {
                ctx->poll_multi_queue = false;
        } else if (!ctx->poll_multi_queue) {
                struct io_kiocb *list_req;
                unsigned int queue_num0, queue_num1;

                list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
                                                inflight_entry);

                if (list_req->file != req->file) {
                        ctx->poll_multi_queue = true;
                } else {
                        /* same file: compare the queue numbers in the cookies */
                        queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
                        queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
                        if (queue_num0 != queue_num1)
                                ctx->poll_multi_queue = true;
                }
        }

        /*
         * For fast devices, IO may have already completed. If it has, add
         * it to the front so we find it first.
         */
        if (READ_ONCE(req->iopoll_completed))
                list_add(&req->inflight_entry, &ctx->iopoll_list);
        else
                list_add_tail(&req->inflight_entry, &ctx->iopoll_list);

        if (unlikely(in_async)) {
                /*
                 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
                 * in sq thread task context or in io worker task context. If
                 * current task context is sq thread, we don't need to check
                 * whether should wake up sq thread.
                 */
                if ((ctx->flags & IORING_SETUP_SQPOLL) &&
                    wq_has_sleeper(&ctx->sq_data->wait))
                        wake_up(&ctx->sq_data->wait);

                mutex_unlock(&ctx->uring_lock);
        }
}

static bool io_bdev_nowait(struct block_device *bdev)
{
        return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
}

/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 *
 * Returns true when @file can be used for a non-blocking attempt in
 * direction @rw (READ or WRITE).
 */
static bool __io_file_supports_nowait(struct file *file, int rw)
{
        const umode_t mode = file_inode(file)->i_mode;

        /* block devices: up to the underlying queue */
        if (S_ISBLK(mode))
                return IS_ENABLED(CONFIG_BLOCK) &&
                       io_bdev_nowait(I_BDEV(file->f_mapping->host));

        if (S_ISSOCK(mode))
                return true;

        /* regular files: backing device decides; io_uring fds are excluded */
        if (S_ISREG(mode))
                return IS_ENABLED(CONFIG_BLOCK) &&
                       io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
                       file->f_op != &io_uring_fops;

        /* any ->read/write should understand O_NONBLOCK */
        if (file->f_flags & O_NONBLOCK)
                return true;

        if (!(file->f_mode & FMODE_NOWAIT))
                return false;

        /* FMODE_NOWAIT only helps if the matching iter method exists */
        return (rw == READ) ? (file->f_op->read_iter != NULL)
                            : (file->f_op->write_iter != NULL);
}

/*
 * Per-request nowait check: the REQ_F_NOWAIT_READ / REQ_F_NOWAIT_WRITE flag
 * matching @rw short-circuits to true; otherwise the file itself is
 * inspected.
 */
static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
{
        const bool flagged =
                (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) ||
                (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE));

        if (flagged)
                return true;
        return __io_file_supports_nowait(req->file, rw);
}

/*
 * Common SQE preparation for read/write requests.
 *
 * Fills in the kiocb (position, hint, flags, priority, completion handler)
 * from @sqe and the request's file, and records the buffer index, address and
 * length in @req. For the fixed-buffer opcodes the registered buffer is
 * looked up and a resource node reference is taken.
 *
 * @rw is READ or WRITE and is only used for the nowait capability check.
 *
 * Returns 0 on success or a negative error:
 *  - whatever kiocb_set_rw_flags() or ioprio_check_cap() report;
 *  - -EOPNOTSUPP on an IOPOLL ring if the file is not opened for direct IO
 *    or has no ->iopoll();
 *  - -EINVAL if IOCB_HIPRI is requested on a non-IOPOLL ring;
 *  - -EFAULT if a fixed-buffer opcode names an out-of-range buffer index.
 */
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                      int rw)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct file *file = req->file;
        unsigned ioprio;
        int ret;

        if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
                req->flags |= REQ_F_ISREG;

        kiocb->ki_pos = READ_ONCE(sqe->off);
        kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
        kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
        if (unlikely(ret))
                return ret;

        /*
         * If the file is marked O_NONBLOCK, still allow retry for it if it
         * supports async. Otherwise it's impossible to use O_NONBLOCK files
         * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
         */
        if ((kiocb->ki_flags & IOCB_NOWAIT) ||
            ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
                req->flags |= REQ_F_NOWAIT;

        /* an explicit priority must pass the capability check */
        ioprio = READ_ONCE(sqe->ioprio);
        if (ioprio) {
                ret = ioprio_check_cap(ioprio);
                if (ret)
                        return ret;

                kiocb->ki_ioprio = ioprio;
        } else
                kiocb->ki_ioprio = get_current_ioprio();

        if (ctx->flags & IORING_SETUP_IOPOLL) {
                if (!(kiocb->ki_flags & IOCB_DIRECT) ||
                    !kiocb->ki_filp->f_op->iopoll)
                        return -EOPNOTSUPP;

                kiocb->ki_flags |= IOCB_HIPRI;
                kiocb->ki_complete = io_complete_rw_iopoll;
                req->iopoll_completed = 0;
        } else {
                if (kiocb->ki_flags & IOCB_HIPRI)
                        return -EINVAL;
                kiocb->ki_complete = io_complete_rw;
        }

        /* used for fixed read/write too - just read unconditionally */
        req->buf_index = READ_ONCE(sqe->buf_index);
        req->imu = NULL;

        if (req->opcode == IORING_OP_READ_FIXED ||
            req->opcode == IORING_OP_WRITE_FIXED) {
                u16 index;

                if (unlikely(req->buf_index >= ctx->nr_user_bufs))
                        return -EFAULT;
                /* clamp the index under speculation before using it */
                index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
                req->imu = ctx->user_bufs[index];
                io_req_set_rsrc_node(req);
        }

        req->rw.addr = READ_ONCE(sqe->addr);
        req->rw.len = READ_ONCE(sqe->len);
        return 0;
}

/*
 * Hand the submission result @ret to the kiocb's completion handler, unless
 * the submission returned -EIOCBQUEUED, in which case nothing is completed
 * here.
 */
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
        if (ret == -EIOCBQUEUED)
                return;

        /*
         * Restarting the syscall is not an option: sqes submitted before
         * this one may already be in progress. Report every restart code
         * as a plain -EINTR failure of this IO instead.
         */
        if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
            ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
                ret = -EINTR;

        kiocb->ki_complete(kiocb, ret, 0);
}

/*
 * Resolve the file position to use for @req and return a pointer to it.
 *
 * A position other than -1 is used as is. With -1, a non-stream file picks
 * up the file's current f_pos and the request is tagged REQ_F_CUR_POS; a
 * stream file gets ki_pos zeroed and NULL is returned.
 */
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
        struct kiocb *kiocb = &req->rw.kiocb;

        if (kiocb->ki_pos == -1) {
                if (req->file->f_mode & FMODE_STREAM) {
                        kiocb->ki_pos = 0;
                        return NULL;
                }

                req->flags |= REQ_F_CUR_POS;
                kiocb->ki_pos = req->file->f_pos;
        }

        return &kiocb->ki_pos;
}

/*
 * Finish a read/write attempt that returned @ret.
 *
 * If the request was tagged REQ_F_CUR_POS, the kiocb position is written
 * back to the file first. A non-negative result on a kiocb whose completion
 * handler is io_complete_rw is completed directly here; every other case is
 * forwarded to io_rw_done(). Finally, if REQ_F_REISSUE is set at that point,
 * the flag is cleared and the request is either queued for reissue or
 * completed as failed.
 */
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
                       unsigned int issue_flags)
{
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

        if (req->flags & REQ_F_CUR_POS)
                req->file->f_pos = kiocb->ki_pos;
        if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) {
                /* only complete here when the common helper returns false */
                if (!__io_complete_rw_common(req, ret)) {
                        /*
                         * Safe to call io_end from here as we're inline
                         * from the submission path.
                         */
                        io_req_io_end(req);
                        __io_req_complete(req, issue_flags,
                                          io_fixup_rw_res(req, ret),
                                          io_put_rw_kbuf(req));
                }
        } else {
                io_rw_done(kiocb, ret);
        }

        if (req->flags & REQ_F_REISSUE) {
                req->flags &= ~REQ_F_REISSUE;
                if (io_resubmit_prep(req)) {
                        io_req_task_queue_reissue(req);
                } else {
                        /* resubmit prep failed: complete the request as failed */
                        unsigned int cflags = io_put_rw_kbuf(req);
                        struct io_ring_ctx *ctx = req->ctx;

                        ret = io_fixup_rw_res(req, ret);
                        req_set_fail(req);
                        /*
                         * Without IO_URING_F_NONBLOCK the uring_lock is taken
                         * around the completion; with it, the completion is
                         * posted without taking the lock here.
                         */
                        if (!(issue_flags & IO_URING_F_NONBLOCK)) {
                                mutex_lock(&ctx->uring_lock);
                                __io_req_complete(req, issue_flags, ret, cflags);
                                mutex_unlock(&ctx->uring_lock);
                        } else {
                                __io_req_complete(req, issue_flags, ret, cflags);
                        }
                }
        }
}

/*
 * Set up @iter as a bvec iterator over the part of the mapped buffer @imu
 * described by req->rw.addr and req->rw.len.
 *
 * Returns 0 on success, or -EFAULT if addr + len overflows or the range
 * does not lie within [imu->ubuf, imu->ubuf_end].
 */
static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
                             struct io_mapped_ubuf *imu)
{
        size_t len = req->rw.len;
        u64 buf_end, buf_addr = req->rw.addr;
        size_t offset;

        if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
                return -EFAULT;
        /* not inside the mapped region */
        if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
                return -EFAULT;

        /*
         * May not be a start of buffer, set size appropriately
         * and advance us to the beginning.
         */
        offset = buf_addr - imu->ubuf;
        iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

        if (offset) {
                /*
                 * Don't use iov_iter_advance() here, as it's really slow for
                 * using the latter parts of a big fixed buffer - it iterates
                 * over each segment manually. We can cheat a bit here, because
                 * we know that:
                 *
                 * 1) it's a BVEC iter, we set it up
                 * 2) all bvecs are PAGE_SIZE in size, except potentially the
                 *    first and last bvec
                 *
                 * So just find our index, and adjust the iterator afterwards.
                 * If the offset is within the first bvec (or the whole first
                 * bvec), just use iov_iter_advance(). This makes it easier
                 * since we can just skip the first segment, which may not
                 * be PAGE_SIZE aligned.
                 */
                const struct bio_vec *bvec = imu->bvec;

                if (offset < bvec->bv_len) {
                        iov_iter_advance(iter, offset);
                } else {
                        unsigned long seg_skip;

                        /* skip first vec */
                        offset -= bvec->bv_len;
                        /* the first vec, plus every full page before the offset */
                        seg_skip = 1 + (offset >> PAGE_SHIFT);

                        iter->bvec = bvec + seg_skip;
                        iter->nr_segs -= seg_skip;
                        /* drop the skipped bytes from the remaining count */
                        iter->count -= bvec->bv_len + offset;
                        /* remainder is the offset within the new first segment */
                        iter->iov_offset = offset & ~PAGE_MASK;
                }
        }

        return 0;
}

/*
 * Import the mapped buffer attached to @req (req->imu) into @iter. Warns
 * once and returns -EFAULT if no buffer is attached.
 */
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
{
        if (!WARN_ON_ONCE(!req->imu))
                return __io_import_fixed(req, rw, iter, req->imu);

        return -EFAULT;
}

/* Drop ctx->uring_lock, but only when @needs_lock says this path took it. */
static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
{
        if (!needs_lock)
                return;

        mutex_unlock(&ctx->uring_lock);
}

/*
 * Take ctx->uring_lock when @needs_lock is set.
 *
 * Regular inline submissions already hold the uring_lock because it is
 * grabbed from the system call, and the same goes for the SQPOLL offload.
 * The one exception is a detached request issued from an async worker
 * thread, which is the case that has to take the lock here.
 */
static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
{
        if (!needs_lock)
                return;

        mutex_lock(&ctx->uring_lock);
}

/*
 * Pick a buffer from buffer group @bgid for @req.
 *
 * If the request already has REQ_F_BUFFER_SELECTED set, @kbuf is returned
 * untouched. Otherwise the group is looked up under the uring_lock (taken
 * here when @needs_lock is set): the last list entry is removed and used,
 * or, when the list is empty, the head itself is used and the group is
 * erased. *@len is clamped to the chosen buffer's length. Returns
 * ERR_PTR(-ENOBUFS) when the group does not exist.
 */
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
                                          int bgid, struct io_buffer *kbuf,
                                          bool needs_lock)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer *group;

        if (req->flags & REQ_F_BUFFER_SELECTED)
                return kbuf;

        io_ring_submit_lock(ctx, needs_lock);

        lockdep_assert_held(&ctx->uring_lock);

        group = xa_load(&ctx->io_buffers, bgid);
        if (!group) {
                kbuf = ERR_PTR(-ENOBUFS);
                goto out_unlock;
        }

        if (list_empty(&group->list)) {
                /* head is the only buffer left: hand it out, drop the group */
                kbuf = group;
                xa_erase(&ctx->io_buffers, bgid);
        } else {
                kbuf = list_last_entry(&group->list, struct io_buffer, list);
                list_del(&kbuf->list);
        }

        if (*len > kbuf->len)
                *len = kbuf->len;

out_unlock:
        io_ring_submit_unlock(ctx, needs_lock);

        return kbuf;
}

/*
 * Select a provided buffer for a read/write request.
 *
 * req->rw.addr is reinterpreted as an io_buffer pointer and req->buf_index
 * is used as the buffer group ID for io_buffer_select(). On success the
 * chosen buffer is stored back into req->rw.addr, REQ_F_BUFFER_SELECTED is
 * set and the buffer's user address is returned; on failure the error
 * pointer is passed straight through.
 */
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
                                        bool needs_lock)
{
        u16 group_id = req->buf_index;
        struct io_buffer *chosen;

        chosen = io_buffer_select(req, len, group_id,
                        (struct io_buffer *) (unsigned long) req->rw.addr,
                        needs_lock);
        if (IS_ERR(chosen))
                return chosen;

        req->flags |= REQ_F_BUFFER_SELECTED;
        req->rw.addr = (u64) (unsigned long) chosen;
        return u64_to_user_ptr(chosen->addr);
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of the single-iovec buffer select: read iov_len from the
 * user's compat_iovec at req->rw.addr, select a buffer of at most that
 * length and fill iov[0] with the result.
 *
 * Returns 0 on success, -EFAULT if the user iovec cannot be read, -EINVAL
 * for a negative length, or the error from buffer selection.
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
                                bool needs_lock)
{
        struct compat_iovec __user *uvec = u64_to_user_ptr(req->rw.addr);
        compat_ssize_t user_len;
        void __user *ubuf;
        ssize_t len;

        if (!access_ok(uvec, sizeof(*uvec)) ||
            __get_user(user_len, &uvec->iov_len))
                return -EFAULT;
        if (user_len < 0)
                return -EINVAL;

        len = user_len;
        ubuf = io_rw_buffer_select(req, &len, needs_lock);
        if (IS_ERR(ubuf))
                return PTR_ERR(ubuf);

        iov->iov_len = (compat_size_t) len;
        iov->iov_base = ubuf;
        return 0;
}
#endif

/*
 * Native flavour of the single-iovec buffer select: copy the user's iovec
 * at req->rw.addr into iov[0], select a buffer of at most iov_len bytes and
 * overwrite iov[0] with the selected buffer's address and length.
 *
 * Returns 0 on success, -EFAULT if the copy from user space fails, -EINVAL
 * if the length is negative when read as ssize_t, or the error from buffer
 * selection.
 */
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
                                      bool needs_lock)
{
        struct iovec __user *uvec = u64_to_user_ptr(req->rw.addr);
        void __user *ubuf;
        ssize_t len;

        if (copy_from_user(iov, uvec, sizeof(*uvec)))
                return -EFAULT;

        len = iov->iov_len;
        if (len < 0)
                return -EINVAL;

        ubuf = io_rw_buffer_select(req, &len, needs_lock);
        if (IS_ERR(ubuf))
                return PTR_ERR(ubuf);

        iov->iov_base = ubuf;
        iov->iov_len = len;
        return 0;
}

/*
 * Fill iov[0] from a provided buffer for a vectored request.
 *
 * When a buffer was already selected (REQ_F_BUFFER_SELECTED), it is taken
 * from req->rw.addr and reused. Otherwise exactly one iovec must have been
 * passed (req->rw.len == 1, else -EINVAL), and the compat or native import
 * helper does the selection.
 */
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
                                    bool needs_lock)
{
        struct io_buffer *kbuf;

        if (!(req->flags & REQ_F_BUFFER_SELECTED)) {
                if (req->rw.len != 1)
                        return -EINVAL;

#ifdef CONFIG_COMPAT
                if (req->ctx->compat)
                        return io_compat_import(req, iov, needs_lock);
#endif

                return __io_iov_buffer_select(req, iov, needs_lock);
        }

        kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
        iov->iov_base = u64_to_user_ptr(kbuf->addr);
        iov->iov_len = kbuf->len;
        return 0;
}

/*
 * Build @iter for a read/write request according to its opcode.
 *
 * - READ_FIXED / WRITE_FIXED: import from the mapped buffer.
 * - READ / WRITE: a single user range, optionally from a selected buffer
 *   (req->rw.len is then updated to the possibly clamped length).
 * - anything else: vectored; either one iovec from a selected buffer, or
 *   the user's iovec array through __import_iovec().
 *
 * *@iovec is set to NULL on every path except the __import_iovec() one,
 * where __import_iovec() itself is handed @iovec. A non-zero buf_index
 * without REQ_F_BUFFER_SELECT on a non-fixed opcode yields -EINVAL.
 */
static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
                           struct iov_iter *iter, bool needs_lock)
{
        void __user *ubuf = u64_to_user_ptr(req->rw.addr);
        size_t nbytes = req->rw.len;
        const u8 op = req->opcode;
        ssize_t ret;

        if (op == IORING_OP_READ_FIXED || op == IORING_OP_WRITE_FIXED) {
                *iovec = NULL;
                return io_import_fixed(req, rw, iter);
        }

        /* buffer index only valid with fixed read/write, or buffer select  */
        if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
                return -EINVAL;

        if (op != IORING_OP_READ && op != IORING_OP_WRITE) {
                /* vectored request */
                if (!(req->flags & REQ_F_BUFFER_SELECT))
                        return __import_iovec(rw, ubuf, nbytes, UIO_FASTIOV,
                                              iovec, iter, req->ctx->compat);

                ret = io_iov_buffer_select(req, *iovec, needs_lock);
                if (!ret)
                        iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
                *iovec = NULL;
                return ret;
        }

        /* non-vectored request: one contiguous user range */
        if (req->flags & REQ_F_BUFFER_SELECT) {
                ubuf = io_rw_buffer_select(req, &nbytes, needs_lock);
                if (IS_ERR(ubuf))
                        return PTR_ERR(ubuf);
                req->rw.len = nbytes;
        }

        ret = import_single_range(rw, ubuf, nbytes, *iovec, iter);
        *iovec = NULL;
        return ret;
}

/* Position pointer for @kiocb: NULL for FMODE_STREAM files, else &ki_pos. */
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
        if (kiocb->ki_filp->f_mode & FMODE_STREAM)
                return NULL;

        return &kiocb->ki_pos;
}

/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 *
 * Returns the number of bytes transferred, or, if nothing was transferred
 * before the first failure, the negative error from ->read()/->write().
 * Returns -EOPNOTSUPP for IOCB_HIPRI and -EAGAIN for IOCB_NOWAIT kiocbs.
 */
static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
{
        struct kiocb *kiocb = &req->rw.kiocb;
        struct file *file = req->file;
        ssize_t ret = 0;
        loff_t *ppos;

        /*
         * Don't support polled IO through this interface, and we can't
         * support non-blocking either. For the latter, this just causes
         * the kiocb to be handled from an async context.
         */
        if (kiocb->ki_flags & IOCB_HIPRI)
                return -EOPNOTSUPP;
        if (kiocb->ki_flags & IOCB_NOWAIT)
                return -EAGAIN;

        ppos = io_kiocb_ppos(kiocb);

        while (iov_iter_count(iter)) {
                struct iovec iovec;
                ssize_t nr;

                /*
                 * For a bvec iter the user range is taken from the request
                 * itself (rw.addr/rw.len) rather than from the iterator.
                 */
                if (!iov_iter_is_bvec(iter)) {
                        iovec = iov_iter_iovec(iter);
                } else {
                        iovec.iov_base = u64_to_user_ptr(req->rw.addr);
                        iovec.iov_len = req->rw.len;
                }

                if (rw == READ) {
                        nr = file->f_op->read(file, iovec.iov_base,
                                              iovec.iov_len, ppos);
                } else {
                        nr = file->f_op->write(file, iovec.iov_base,
                                               iovec.iov_len, ppos);
                }

                if (nr < 0) {
                        /* only report the error if no bytes were moved yet */
                        if (!ret)
                                ret = nr;
                        break;
                }
                ret += nr;
                /* bvec progress is tracked in the request, not the iterator */
                if (!iov_iter_is_bvec(iter)) {
                        iov_iter_advance(iter, nr);
                } else {
                        req->rw.addr += nr;
                        req->rw.len -= nr;
                        if (!req->rw.len)
                                break;
                }
                /* short transfer: stop and return what we have */
                if (nr != iovec.iov_len)
                        break;
        }

        return ret;
}

/*
 * Copy the caller's iterator state into the request's async data
 * (req->async_data) so it can be used after the caller's copy is gone.
 *
 * @iovec:    separately allocated iovec array, or NULL; stored as the
 *            array to free later
 * @fast_iov: the caller's inline iovec array that @iter may point into
 * @iter:     iterator to copy
 */
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
                          const struct iovec *fast_iov, struct iov_iter *iter)
{
        struct io_async_rw *rw = req->async_data;

        memcpy(&rw->iter, iter, sizeof(*iter));
        rw->free_iovec = iovec;
        rw->bytes_done = 0;
        /* can only be fixed buffers, no need to do anything */
        if (iov_iter_is_bvec(iter))
                return;
        if (!iovec) {
                unsigned iov_off = 0;

                /*
                 * No allocated iovec: re-point the copied iterator at the
                 * async data's own fast_iov array, keeping the same segment
                 * offset the source iterator had into the caller's array.
                 */
                rw->iter.iov = rw->fast_iov;
                if (iter->iov != fast_iov) {
                        iov_off = iter->iov - fast_iov;
                        rw->iter.iov += iov_off;
                }
                /* copy the remaining segments unless they already live there */
                if (rw->fast_iov != fast_iov)
                        memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
                               sizeof(struct iovec) * iter->nr_segs);
        } else {
                /* an allocated iovec is now recorded in free_iovec; flag cleanup */
                req->flags |= REQ_F_NEED_CLEANUP;
        }
}

/*
 * Allocate req->async_data, sized by the opcode's async_size entry in
 * io_op_defs (warns once if that size is zero).
 *
 * Returns 0 on success and 1 if the allocation failed.
 */
static inline int io_alloc_async_data(struct io_kiocb *req)
{
        WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
        req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
        if (!req->async_data)
                return 1;

        return 0;
}

/*
 * Make sure @req has async read/write data with the iterator mapped into it.
 *
 * Nothing is done unless @force is set or the opcode is marked
 * needs_async_setup, and nothing is done if async data already exists.
 * If the allocation fails, @iovec is freed and -ENOMEM is returned;
 * otherwise returns 0.
 */
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
                             const struct iovec *fast_iov,
                             struct iov_iter *iter, bool force)
{
        struct io_async_rw *arw;

        if (!force && !io_op_defs[req->opcode].needs_async_setup)
                return 0;
        if (req->async_data)
                return 0;

        if (io_alloc_async_data(req)) {
                kfree(iovec);
                return -ENOMEM;
        }

        io_req_map_rw(req, iovec, fast_iov, iter);
        arw = req->async_data;
        /* the iter has been copied and mapped, so snapshot its state now */
        iov_iter_save_state(&arw->iter, &arw->iter_state);
        return 0;
}

/*
 * Import the request's buffers directly into its existing async data.
 *
 * The iterator is built in place using the async data's fast_iov array. If
 * the import left an iovec pointer behind, it is recorded for freeing and
 * REQ_F_NEED_CLEANUP is set. The iterator state is saved afterwards.
 * Returns 0 on success or the negative import error.
 */
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
        struct io_async_rw *arw = req->async_data;
        struct iovec *vecs = arw->fast_iov;
        int ret;

        arw->free_iovec = NULL;
        arw->bytes_done = 0;

        ret = io_import_iovec(rw, req, &vecs, &arw->iter, false);
        if (unlikely(ret < 0))
                return ret;

        if (vecs) {
                req->flags |= REQ_F_NEED_CLEANUP;
                arw->free_iovec = vecs;
        }
        iov_iter_save_state(&arw->iter, &arw->iter_state);
        return 0;
}

static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(!(req->file->f_mode & FMODE_READ)))
                return -EBADF;
        return io_prep_rw(req, sqe, READ);
}

/*
 * Waitqueue callback, registered through lock_page_async() when the IO was
 * first attempted with our waitqueue armed in the iocb. It runs when the
 * page is unlocked, which is normally when the page IO has completed and the
 * page is uptodate. It queues a task_work based retry of the operation that
 * tries to copy the data again. Should that retry fail because the page was
 * NOT uptodate, a thread based blocking retry follows - the unexpected slow
 * path.
 *
 * Returns 0 if the wakeup is not for our page, 1 once the retry is queued.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
                             int sync, void *arg)
{
        struct wait_page_queue *wpq =
                container_of(wait, struct wait_page_queue, wait);
        struct wait_page_key *key = arg;
        struct io_kiocb *req = wait->private;

        if (!wake_page_match(wpq, key))
                return 0;

        /* disarm: clear the waitq flag and unhook us before queueing the retry */
        req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
        list_del_init(&wait->entry);
        io_req_task_queue(req);
        return 1;
}

/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or it
 * will register a callback when the page is unlocked at IO completion. Through
 * that callback, io_uring uses task_work to setup a retry of the operation.
 * That retry will attempt the buffered read again. The retry will generally
 * succeed, or in rare cases where it fails, we then fall back to using the
 * async worker threads for a blocking retry.
 */
static bool io_rw_should_retry(struct io_kiocb *req)
{
        struct io_async_rw *rw = req->async_data;
        struct wait_page_queue *wait = &rw->wpq;
        struct kiocb *kiocb = &req->rw.kiocb;

        /* never retry for NOWAIT, we just complete with -EAGAIN */
        if (req->flags & REQ_F_NOWAIT)
                return false;

        /* Only for buffered IO */
        if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
                return false;

        /*
         * just use poll if we can, and don't attempt if the fs doesn't
         * support callback based unlocks
         */
        if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
                return false;

        wait->wait.func = io_async_buf_func;
        wait->wait.private = req;
        wait->wait.flags = 0;
        INIT_LIST_HEAD(&wait->wait.entry);
        kiocb->ki_flags |= IOCB_WAITQ;
        kiocb->ki_flags &= ~IOCB_NOWAIT;
        kiocb->ki_waitq = wait;
        return true;
}

static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
        if (req->file->f_op->read_iter)
                return call_read_iter(req->file, &req->rw.kiocb, iter);
        else if (req->file->f_op->read)
                return loop_rw_iter(READ, req, iter);
        else
                return -EINVAL;
}

static bool need_read_all(struct io_kiocb *req)
{
        return req->flags & REQ_F_ISREG ||
                S_ISBLK(file_inode(req->file)->i_mode);
}

/*
 * Issue a read request.
 *
 * Uses the iterator saved in req->async_data when present (a previous
 * attempt), otherwise imports the buffers into on-stack state. On a short
 * or -EAGAIN result that qualifies for retry, the iterator is moved into
 * async data and the read is retried with the page waitqueue armed.
 *
 * Returns 0 once the request has been completed via kiocb_done() or the IO
 * was queued (-EIOCBQUEUED), -EAGAIN when the attempt must be redone by the
 * caller (the in-code comments describe this as an async punt), or another
 * negative error from import, rw_verify_area() or async setup.
 */
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
        struct iov_iter_state __state, *state;
        ssize_t ret, ret2;
        loff_t *ppos;

        if (rw) {
                iter = &rw->iter;
                state = &rw->iter_state;
                /*
                 * We come here from an earlier attempt, restore our state to
                 * match in case it doesn't. It's cheap enough that we don't
                 * need to make this conditional.
                 */
                iov_iter_restore(iter, state);
                iovec = NULL;
        } else {
                ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
                if (ret < 0)
                        return ret;
                state = &__state;
                iov_iter_save_state(iter, state);
        }
        /* req->result holds the number of bytes this attempt asks for */
        req->result = iov_iter_count(iter);

        /* Ensure we clear previously set non-block flag */
        if (!force_nonblock)
                kiocb->ki_flags &= ~IOCB_NOWAIT;
        else
                kiocb->ki_flags |= IOCB_NOWAIT;

        /* If the file doesn't support async, just async punt */
        if (force_nonblock && !io_file_supports_nowait(req, READ)) {
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
                return ret ?: -EAGAIN;
        }

        ppos = io_kiocb_update_pos(req);

        ret = rw_verify_area(READ, req->file, ppos, req->result);
        if (unlikely(ret)) {
                kfree(iovec);
                return ret;
        }

        ret = io_iter_do_read(req, iter);

        if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
                req->flags &= ~REQ_F_REISSUE;
                /* IOPOLL retry should happen for io-wq threads */
                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
                        goto done;
                /* no retry on NONBLOCK nor RWF_NOWAIT */
                if (req->flags & REQ_F_NOWAIT)
                        goto done;
                /* nothing was read; fall through to the retry setup below */
                ret = 0;
        } else if (ret == -EIOCBQUEUED) {
                goto out_free;
        } else if (ret <= 0 || ret == req->result || !force_nonblock ||
                   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
                /* read all, failed, already did sync or don't want to retry */
                goto done;
        }

        /*
         * Don't depend on the iter state matching what was consumed, or being
         * untouched in case of error. Restore it and we'll advance it
         * manually if we need to.
         */
        iov_iter_restore(iter, state);

        ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
        if (ret2)
                return ret2;

        /* clear the local pointer so the exit path below does not free it */
        iovec = NULL;
        rw = req->async_data;
        /*
         * Now use our persistent iterator and state, if we aren't already.
         * We've restored and mapped the iter to match.
         */
        if (iter != &rw->iter) {
                iter = &rw->iter;
                state = &rw->iter_state;
        }

        do {
                /*
                 * We end up here because of a partial read, either from
                 * above or inside this loop. Advance the iter by the bytes
                 * that were consumed.
                 */
                iov_iter_advance(iter, ret);
                if (!iov_iter_count(iter))
                        break;
                rw->bytes_done += ret;
                iov_iter_save_state(iter, state);

                /* if we can retry, do so with the callbacks armed */
                if (!io_rw_should_retry(req)) {
                        kiocb->ki_flags &= ~IOCB_WAITQ;
                        return -EAGAIN;
                }

                req->result = iov_iter_count(iter);
                /*
                 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
                 * we get -EIOCBQUEUED, then we'll get a notification when the
                 * desired page gets unlocked. We can also get a partial read
                 * here, and if we do, then just retry at the new offset.
                 */
                ret = io_iter_do_read(req, iter);
                if (ret == -EIOCBQUEUED)
                        return 0;
                /* we got some bytes, but not all. retry. */
                kiocb->ki_flags &= ~IOCB_WAITQ;
                iov_iter_restore(iter, state);
        } while (ret > 0);
done:
        kiocb_done(kiocb, ret, issue_flags);
out_free:
        /* it's faster to check here than to delegate to kfree */
        if (iovec)
                kfree(iovec);
        return 0;
}

static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
                return -EBADF;
        return io_prep_rw(req, sqe, WRITE);
}

/*
 * Issue a write request.
 *
 * Uses the iterator saved in req->async_data when present, otherwise imports
 * the buffers into on-stack state. If the write cannot proceed in
 * non-blocking mode, the iterator is restored and copied into async data
 * (copy_iov) and -EAGAIN is returned.
 *
 * Returns 0 after handing the result to kiocb_done(), -EAGAIN when the
 * attempt must be redone by the caller, or a negative error from import,
 * rw_verify_area() or async setup.
 */
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
        struct iov_iter_state __state, *state;
        ssize_t ret, ret2;
        loff_t *ppos;

        if (rw) {
                /* earlier attempt: reuse the saved iterator and rewind it */
                iter = &rw->iter;
                state = &rw->iter_state;
                iov_iter_restore(iter, state);
                iovec = NULL;
        } else {
                ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
                if (ret < 0)
                        return ret;
                state = &__state;
                iov_iter_save_state(iter, state);
        }
        /* req->result holds the number of bytes this attempt asks for */
        req->result = iov_iter_count(iter);

        /* Ensure we clear previously set non-block flag */
        if (!force_nonblock)
                kiocb->ki_flags &= ~IOCB_NOWAIT;
        else
                kiocb->ki_flags |= IOCB_NOWAIT;

        /* If the file doesn't support async, just async punt */
        if (force_nonblock && !io_file_supports_nowait(req, WRITE))
                goto copy_iov;

        /* file path doesn't support NOWAIT for non-direct_IO */
        if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
            (req->flags & REQ_F_ISREG))
                goto copy_iov;

        ppos = io_kiocb_update_pos(req);

        ret = rw_verify_area(WRITE, req->file, ppos, req->result);
        if (unlikely(ret))
                goto out_free;

        /*
         * Open-code file_start_write here to grab freeze protection,
         * which will be released by another thread in
         * io_complete_rw().  Fool lockdep by telling it the lock got
         * released so that it doesn't complain about the held lock when
         * we return to userspace.
         */
        if (req->flags & REQ_F_ISREG) {
                sb_start_write(file_inode(req->file)->i_sb);
                __sb_writers_release(file_inode(req->file)->i_sb,
                                        SB_FREEZE_WRITE);
        }
        kiocb->ki_flags |= IOCB_WRITE;

        if (req->file->f_op->write_iter)
                ret2 = call_write_iter(req->file, kiocb, iter);
        else if (req->file->f_op->write)
                ret2 = loop_rw_iter(WRITE, req, iter);
        else
                ret2 = -EINVAL;

        /* a reissue request overrides the write result with -EAGAIN */
        if (req->flags & REQ_F_REISSUE) {
                req->flags &= ~REQ_F_REISSUE;
                ret2 = -EAGAIN;
        }

        /*
         * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
         * retry them without IOCB_NOWAIT.
         */
        if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
                ret2 = -EAGAIN;
        /* no retry on NONBLOCK nor RWF_NOWAIT */
        if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
                goto done;
        if (!force_nonblock || ret2 != -EAGAIN) {
                /* IOPOLL retry should happen for io-wq threads */
                if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
                        goto copy_iov;
done:
                /* ret still holds 0 from rw_verify_area() on this path */
                kiocb_done(kiocb, ret2, issue_flags);
        } else {
copy_iov:
                /* rewind the iterator and stash it for the retry */
                iov_iter_restore(iter, state);
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                if (!ret) {
                        /* IOCB_WRITE is only set once the write was started above */
                        if (kiocb->ki_flags & IOCB_WRITE)
                                kiocb_end_write(req);
                        return -EAGAIN;
                }
                return ret;
        }
out_free:
        /* it's reportedly faster than delegating the null check to kfree() */
        if (iovec)
                kfree(iovec);
        return ret;
}

static int io_renameat_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
{
        struct io_rename *ren = &req->rename;
        const char __user *oldf, *newf;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                return -EINVAL;
        if (unlikely(req->flags & REQ_F_FIXED_FILE))
                return -EBADF;

        ren->old_dfd = READ_ONCE(sqe->fd);
        oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
        newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        ren->new_dfd = READ_ONCE(sqe->len);
        ren->flags = READ_ONCE(sqe->rename_flags);

        ren->oldpath = getname(oldf);
        if (IS_ERR(ren->oldpath))
                return PTR_ERR(ren->oldpath);

        ren->newpath = getname(newf);
        if (IS_ERR(ren->newpath)) {
                putname(ren->oldpath);
                return PTR_ERR(ren->newpath);
        }

        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
}

/*
 * Execute a prepared renameat request. Returns -EAGAIN when issued with
 * IO_URING_F_NONBLOCK; otherwise performs the rename, clears
 * REQ_F_NEED_CLEANUP, marks the request failed on a negative result,
 * completes it with that result and returns 0.
 */
static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
{
        int ret;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        ret = do_renameat2(req->rename.old_dfd, req->rename.oldpath,
                           req->rename.new_dfd, req->rename.newpath,
                           req->rename.flags);

        req->flags &= ~REQ_F_NEED_CLEANUP;
        if (ret < 0)
                req_set_fail(req);
        io_req_complete(req, ret);
        return 0;
}

static int io_unlinkat_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
{
        struct io_unlink *un = &req->unlink;
        const char __user *fname;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
            sqe->splice_fd_in)
                return -EINVAL;
        if (unlikely(req->flags & REQ_F_FIXED_FILE))
                return -EBADF;

        un->dfd = READ_ONCE(sqe->fd);

        un->flags = READ_ONCE(sqe->unlink_flags);
        if (un->flags & ~AT_REMOVEDIR)
                return -EINVAL;

        fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
        un->filename = getname(fname);
        if (IS_ERR(un->filename))
                return PTR_ERR(un->filename);

        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
}

/*
 * Execute a prepared unlinkat request. Returns -EAGAIN when issued with
 * IO_URING_F_NONBLOCK; otherwise removes a directory (AT_REMOVEDIR) or
 * unlinks the name, clears REQ_F_NEED_CLEANUP, marks the request failed on
 * a negative result, completes it with that result and returns 0.
 */
static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_unlink *unl = &req->unlink;
        int ret;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        ret = (unl->flags & AT_REMOVEDIR) ?
                do_rmdir(unl->dfd, unl->filename) :
                do_unlinkat(unl->dfd, unl->filename);

        req->flags &= ~REQ_F_NEED_CLEANUP;
        if (ret < 0)
                req_set_fail(req);
        io_req_complete(req, ret);
        return 0;
}

/*
 * Prepare a shutdown request: the "how" argument is taken from sqe->len.
 * Rejects IOPOLL rings and any non-zero unused sqe field with -EINVAL.
 * Without CONFIG_NET this always returns -EOPNOTSUPP.
 */
static int io_shutdown_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
{
#if !defined(CONFIG_NET)
        return -EOPNOTSUPP;
#else
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
                     sqe->buf_index || sqe->splice_fd_in))
                return -EINVAL;

        req->shutdown.how = READ_ONCE(sqe->len);
        return 0;
#endif
}

/*
 * Issue handler for shutdown requests: resolves the socket behind req->file
 * and calls __sys_shutdown_sock() with the stored "how" value.
 *
 * Returns -EAGAIN for a non-blocking issue, the error reported by
 * sock_from_file() if the file is not a socket, -EOPNOTSUPP without
 * CONFIG_NET, and 0 once the request has been completed.
 */
static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_NET)
        struct socket *socket;
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        socket = sock_from_file(req->file, &res);
        if (unlikely(!socket))
                return res;

        res = __sys_shutdown_sock(socket, req->shutdown.how);
        if (res < 0)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Prep shared by splice and tee: records the length, flags and input fd
 * from the SQE in req->splice.
 *
 * Returns -EINVAL on an IOPOLL ring or when flags outside
 * SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL are set, otherwise 0.
 */
static int __io_splice_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
{
        const unsigned int allowed = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
        struct io_splice *splice = &req->splice;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        splice->len = READ_ONCE(sqe->len);
        splice->flags = READ_ONCE(sqe->splice_flags);
        if (unlikely(splice->flags & ~allowed))
                return -EINVAL;

        splice->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
        return 0;
}

/*
 * Prepare a tee request. Both offset fields of the SQE must be zero
 * (-EINVAL otherwise); the remaining setup is done by __io_splice_prep().
 */
static int io_tee_prep(struct io_kiocb *req,
                       const struct io_uring_sqe *sqe)
{
        if (READ_ONCE(sqe->splice_off_in))
                return -EINVAL;
        if (READ_ONCE(sqe->off))
                return -EINVAL;

        return __io_splice_prep(req, sqe);
}

/*
 * Issue handler for tee requests: looks up the input file, runs do_tee()
 * towards the output file when the length is non-zero, and completes the
 * request with the result. The request is marked failed whenever the result
 * differs from the requested length.
 *
 * Returns -EAGAIN when issued with IO_URING_F_NONBLOCK, otherwise 0.
 */
static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_splice *splice = &req->splice;
        unsigned int fd_in_fixed = splice->flags & SPLICE_F_FD_IN_FIXED;
        unsigned int tee_flags = splice->flags & ~SPLICE_F_FD_IN_FIXED;
        struct file *dst = splice->file_out;
        struct file *src;
        long res = 0;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        src = io_file_get(req->ctx, req, splice->splice_fd_in, fd_in_fixed,
                          issue_flags);
        if (src) {
                if (splice->len)
                        res = do_tee(src, dst, splice->len, tee_flags);

                /* only a non-fixed input file is put back here */
                if (!fd_in_fixed)
                        io_put_file(src);
        } else {
                res = -EBADF;
        }

        if (res != splice->len)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
}

/*
 * Prepare a splice request: stores the input and output offsets from the
 * SQE, then defers to __io_splice_prep() for the common fields.
 */
static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        req->splice.off_in = READ_ONCE(sqe->splice_off_in);
        req->splice.off_out = READ_ONCE(sqe->off);

        return __io_splice_prep(req, sqe);
}

/*
 * Issue handler for splice requests: looks up the input file, runs
 * do_splice() when the length is non-zero, and completes the request with
 * the result. An offset of -1 is passed to do_splice() as a NULL offset
 * pointer. The request is marked failed whenever the result differs from
 * the requested length.
 *
 * Returns -EAGAIN when issued with IO_URING_F_NONBLOCK, otherwise 0.
 */
static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_splice *splice = &req->splice;
        unsigned int fd_in_fixed = splice->flags & SPLICE_F_FD_IN_FIXED;
        unsigned int splice_flags = splice->flags & ~SPLICE_F_FD_IN_FIXED;
        struct file *dst = splice->file_out;
        loff_t *in_pos = NULL, *out_pos = NULL;
        struct file *src;
        long res = 0;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        src = io_file_get(req->ctx, req, splice->splice_fd_in, fd_in_fixed,
                          issue_flags);
        if (src) {
                if (splice->off_in != -1)
                        in_pos = &splice->off_in;
                if (splice->off_out != -1)
                        out_pos = &splice->off_out;

                if (splice->len)
                        res = do_splice(src, in_pos, dst, out_pos,
                                        splice->len, splice_flags);

                /* only a non-fixed input file is put back here */
                if (!fd_in_fixed)
                        io_put_file(src);
        } else {
                res = -EBADF;
        }

        if (res != splice->len)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
}

/*
 * IORING_OP_NOP does no work of its own: it only posts a completion event
 * with a zero result. It is refused with -EINVAL on rings that were set up
 * with IORING_SETUP_IOPOLL.
 */
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        __io_req_complete(req, issue_flags, 0, 0);
        return 0;
}

/*
 * Prepare an fsync request: stores the fsync flags, offset and length from
 * the SQE in req->sync.
 *
 * Returns -EINVAL on an IOPOLL ring, for non-zero unused SQE fields, or for
 * any flag other than IORING_FSYNC_DATASYNC; otherwise 0.
 */
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely((req->ctx->flags & IORING_SETUP_IOPOLL) ||
                     sqe->addr || sqe->ioprio || sqe->buf_index ||
                     sqe->splice_fd_in))
                return -EINVAL;

        req->sync.flags = READ_ONCE(sqe->fsync_flags);
        if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
                return -EINVAL;

        req->sync.off = READ_ONCE(sqe->off);
        req->sync.len = READ_ONCE(sqe->len);
        return 0;
}

/*
 * Issue handler for fsync requests: calls vfs_fsync_range() over
 * [off, off + len), using LLONG_MAX as the end when off + len is not
 * positive, and completes the request with the result.
 *
 * Returns -EAGAIN when issued with IO_URING_F_NONBLOCK, otherwise 0.
 */
static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
{
        loff_t range_end;
        int res;

        /* fsync always requires a blocking context */
        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        range_end = req->sync.off + req->sync.len;
        if (range_end <= 0)
                range_end = LLONG_MAX;

        res = vfs_fsync_range(req->file, req->sync.off, range_end,
                              req->sync.flags & IORING_FSYNC_DATASYNC);
        if (res < 0)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
}

/*
 * Prepare a fallocate request. The SQE carries the offset in ->off, the
 * length in ->addr and the mode in ->len; all three are stored in req->sync.
 *
 * Returns -EINVAL for non-zero unused SQE fields or an IOPOLL ring,
 * otherwise 0.
 */
static int io_fallocate_prep(struct io_kiocb *req,
                             const struct io_uring_sqe *sqe)
{
        if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
            sqe->splice_fd_in ||
            unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        req->sync.off = READ_ONCE(sqe->off);
        req->sync.len = READ_ONCE(sqe->addr);
        req->sync.mode = READ_ONCE(sqe->len);
        return 0;
}

/*
 * Issue handler for fallocate requests: calls vfs_fallocate() with the
 * stored mode, offset and length. On success fsnotify_modify() is called
 * for the file; on failure the request is marked failed.
 *
 * Returns -EAGAIN when issued with IO_URING_F_NONBLOCK, otherwise 0.
 */
static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
{
        int res;

        /* fallocate always requires a blocking context */
        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        res = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
                            req->sync.len);
        if (res >= 0)
                fsnotify_modify(req->file);
        else
                req_set_fail(req);

        io_req_complete(req, res);
        return 0;
}

static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        const char __user *fname;
        int ret;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(sqe->ioprio || sqe->buf_index))
                return -EINVAL;
        if (unlikely(req->flags & REQ_F_FIXED_FILE))
                return -EBADF;

        /* open.how should be already initialised */
        if (!(req->open.how.flags & O_PATH) && force_o_largefile())
                req->open.how.flags |= O_LARGEFILE;

        req->open.dfd = READ_ONCE(sqe->fd);
        fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
        req->open.filename = getname(fname);
        if (IS_ERR(req->open.filename)) {
                ret = PTR_ERR(req->open.filename);
                req->open.filename = NULL;
                return ret;
        }

        req->open.file_slot = READ_ONCE(sqe->file_index);
        if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
                return -EINVAL;

        req->open.nofile = rlimit(RLIMIT_NOFILE);
        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
}

/*
 * Prepare an openat request: builds req->open.how from the SQE's open
 * flags and mode (the mode is carried in sqe->len), then runs the common
 * __io_openat_prep().
 */
static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        u64 open_mode, open_flags;

        open_mode = READ_ONCE(sqe->len);
        open_flags = READ_ONCE(sqe->open_flags);
        req->open.how = build_open_how(open_flags, open_mode);

        return __io_openat_prep(req, sqe);
}

static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct open_how __user *how;
        size_t len;
        int ret;

        how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        len = READ_ONCE(sqe->len);
        if (len < OPEN_HOW_SIZE_VER0)
                return -EINVAL;

        ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
                                        len);
        if (ret)
                return ret;

        return __io_openat_prep(req, sqe);
}

/*
 * Issue handler for open requests (io_openat() forwards here as well).
 *
 * Builds the open flags from req->open.how, reserves a regular fd unless a
 * fixed-file slot was requested, opens the file with do_filp_open(), and
 * installs it either in the fd table or in the fixed-file slot
 * (file_slot - 1). The request completes with the fd / install result, or
 * with a negative error.
 *
 * Returns -EAGAIN when the open has to be retried later, otherwise 0 after
 * the request has been completed. The filename is released and
 * REQ_F_NEED_CLEANUP cleared only on the completion path, not when -EAGAIN
 * is returned.
 */
static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
        struct open_flags op;
        struct file *file;
        bool resolve_nonblock, nonblock_set;
        bool fixed = !!req->open.file_slot;
        int ret;

        ret = build_open_flags(&req->open.how, &op);
        if (ret)
                goto err;
        /* remember what the application itself asked for before we add to it */
        nonblock_set = op.open_flag & O_NONBLOCK;
        resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
        if (issue_flags & IO_URING_F_NONBLOCK) {
                /*
                 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
                 * it'll always -EAGAIN. Note that we test for __O_TMPFILE
                 * because O_TMPFILE includes O_DIRECTORY, which isn't a flag
                 * we need to force async for.
                 */
                if (req->open.how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
                        return -EAGAIN;
                op.lookup_flags |= LOOKUP_CACHED;
                op.open_flag |= O_NONBLOCK;
        }

        /* a regular fd is only reserved when no fixed-file slot is used */
        if (!fixed) {
                ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
                if (ret < 0)
                        goto err;
        }

        file = do_filp_open(req->open.dfd, req->open.filename, &op);
        if (IS_ERR(file)) {
                /*
                 * We could hang on to this 'fd' on retrying, but seems like
                 * marginal gain for something that is now known to be a slower
                 * path. So just put it, and we'll get a new one when we retry.
                 */
                if (!fixed)
                        put_unused_fd(ret);

                ret = PTR_ERR(file);
                /* only retry if RESOLVE_CACHED wasn't already set by application */
                if (ret == -EAGAIN &&
                    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
                        return -EAGAIN;
                goto err;
        }

        /* drop the O_NONBLOCK added above unless the application set it too */
        if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
                file->f_flags &= ~O_NONBLOCK;
        fsnotify_open(file);

        if (!fixed)
                fd_install(ret, file);
        else
                ret = io_install_fixed_file(req, file, issue_flags,
                                            req->open.file_slot - 1);
err:
        /* reached on success too: release the name and post the result */
        putname(req->open.filename);
        req->flags &= ~REQ_F_NEED_CLEANUP;
        if (ret < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Issue handler for openat requests. It simply forwards to io_openat2(),
 * which works from req->open.how, and returns whatever that returns.
 */
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
        return io_openat2(req, issue_flags);
}

/*
 * Prepare a remove-buffers request. The number of buffers to remove is
 * carried in sqe->fd and must be in 1..USHRT_MAX; the buffer group id comes
 * from sqe->buf_group. req->pbuf is zeroed before those two are stored.
 *
 * Returns -EINVAL for non-zero unused SQE fields or an out-of-range count,
 * otherwise 0.
 */
static int io_remove_buffers_prep(struct io_kiocb *req,
                                  const struct io_uring_sqe *sqe)
{
        struct io_provide_buf *pb = &req->pbuf;
        u64 count;

        if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
            sqe->splice_fd_in)
                return -EINVAL;

        count = READ_ONCE(sqe->fd);
        if (count == 0 || count > USHRT_MAX)
                return -EINVAL;

        memset(pb, 0, sizeof(*pb));
        pb->nbufs = count;
        pb->bgid = READ_ONCE(sqe->buf_group);
        return 0;
}

/*
 * Free up to @nbufs buffers from the group whose head is @buf.
 *
 * Entries linked on the head's list are freed first. If the list runs empty
 * before @nbufs buffers were freed, the head itself is freed too and the
 * group is erased from ctx->io_buffers under @bgid.
 *
 * Returns the number of buffers freed (0 when @nbufs is 0).
 */
static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
                               int bgid, unsigned nbufs)
{
        unsigned removed = 0;

        /* shouldn't happen */
        if (!nbufs)
                return 0;

        /* the head kbuf is the list itself */
        while (!list_empty(&buf->list)) {
                struct io_buffer *victim;

                victim = list_first_entry(&buf->list, struct io_buffer, list);
                list_del(&victim->list);
                kfree(victim);

                removed++;
                if (removed == nbufs)
                        return removed;
                cond_resched();
        }

        /* list drained: the head goes as well, and with it the group */
        kfree(buf);
        xa_erase(&ctx->io_buffers, bgid);

        return removed + 1;
}

/*
 * Issue handler for remove-buffers requests: looks up the buffer group
 * p->bgid in ctx->io_buffers and removes up to p->nbufs buffers from it.
 * The request completes with the number removed, or -ENOENT when the group
 * does not exist.
 *
 * io_ring_submit_lock() is asked to lock only when the issue is not
 * non-blocking. Always returns 0.
 */
static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        const bool needs_lock = !(issue_flags & IO_URING_F_NONBLOCK);
        struct io_provide_buf *pb = &req->pbuf;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer *group;
        int res = -ENOENT;

        io_ring_submit_lock(ctx, needs_lock);

        lockdep_assert_held(&ctx->uring_lock);

        group = xa_load(&ctx->io_buffers, pb->bgid);
        if (group)
                res = __io_remove_buffers(ctx, group, pb->bgid, pb->nbufs);
        if (res < 0)
                req_set_fail(req);

        /* complete before unlock, IOPOLL may need the lock */
        __io_req_complete(req, issue_flags, res, 0);
        io_ring_submit_unlock(ctx, needs_lock);
        return 0;
}

/*
 * Prepare a provide-buffers request.
 *
 * SQE layout as read here: ->fd is the number of buffers (1..USHRT_MAX),
 * ->addr the start of the user memory, ->len the size of each buffer,
 * ->buf_group the group id and ->off the first buffer id (<= USHRT_MAX).
 *
 * Returns 0 on success; -EINVAL for non-zero unused SQE fields; -E2BIG for
 * an out-of-range buffer count or buffer id; -EOVERFLOW when len * nbufs or
 * addr + len * nbufs overflows an unsigned long; -EFAULT when access_ok()
 * rejects the resulting user range.
 */
static int io_provide_buffers_prep(struct io_kiocb *req,
                                   const struct io_uring_sqe *sqe)
{
        unsigned long size, tmp_check;
        struct io_provide_buf *p = &req->pbuf;
        u64 tmp;

        if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > USHRT_MAX)
                return -E2BIG;
        p->nbufs = tmp;
        p->addr = READ_ONCE(sqe->addr);
        p->len = READ_ONCE(sqe->len);

        /* size = len * nbufs; tmp_check only exists to detect overflow */
        if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
                                &size))
                return -EOVERFLOW;
        if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
                return -EOVERFLOW;

        /* 'size' already holds the checked product; no need to recompute it */
        if (!access_ok(u64_to_user_ptr(p->addr), size))
                return -EFAULT;

        p->bgid = READ_ONCE(sqe->buf_group);
        tmp = READ_ONCE(sqe->off);
        if (tmp > USHRT_MAX)
                return -E2BIG;
        p->bid = tmp;
        return 0;
}

/*
 * Allocate up to pbuf->nbufs buffer descriptors and link them into the
 * group at *head. When *head is NULL the first descriptor becomes the list
 * head. Addresses advance by pbuf->len per buffer, buffer ids by one, and
 * each stored length is capped at MAX_RW_COUNT.
 *
 * Allocation failure stops the loop early. Returns the number of buffers
 * added, or -ENOMEM if none could be added.
 */
static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
{
        u64 next_addr = pbuf->addr;
        int next_bid = pbuf->bid;
        int added = 0;

        while (added < pbuf->nbufs) {
                struct io_buffer *buf;

                buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
                if (!buf)
                        break;

                buf->addr = next_addr;
                buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                buf->bid = next_bid;
                next_addr += pbuf->len;
                next_bid++;

                if (*head) {
                        list_add_tail(&buf->list, &(*head)->list);
                } else {
                        INIT_LIST_HEAD(&buf->list);
                        *head = buf;
                }

                added++;
                cond_resched();
        }

        return added ? added : -ENOMEM;
}

/*
 * Issue handler for provide-buffers requests: adds the described buffers to
 * group p->bgid, creating the group in ctx->io_buffers when it did not
 * exist yet. If inserting a new group fails, the buffers just added are
 * removed again.
 *
 * The request completes with the io_add_buffers() result for an existing
 * group, and with the xa_insert() result for a newly created one. Always
 * returns 0.
 */
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        const bool needs_lock = !(issue_flags & IO_URING_F_NONBLOCK);
        struct io_provide_buf *pb = &req->pbuf;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer *existing, *head;
        int res;

        io_ring_submit_lock(ctx, needs_lock);

        lockdep_assert_held(&ctx->uring_lock);

        existing = xa_load(&ctx->io_buffers, pb->bgid);
        head = existing;

        res = io_add_buffers(pb, &head);
        if (res >= 0 && !existing) {
                /* brand new group: publish its head under the group id */
                res = xa_insert(&ctx->io_buffers, pb->bgid, head,
                                GFP_KERNEL_ACCOUNT);
                if (res < 0)
                        __io_remove_buffers(ctx, head, pb->bgid, -1U);
        }
        if (res < 0)
                req_set_fail(req);

        /* complete before unlock, IOPOLL may need the lock */
        __io_req_complete(req, issue_flags, res, 0);
        io_ring_submit_unlock(ctx, needs_lock);
        return 0;
}

/*
 * Prepare an epoll_ctl request. The SQE carries the epoll fd in ->fd, the
 * operation in ->len and the target fd in ->off. When the operation takes
 * an event (ep_op_has_event()), it is copied in from the user pointer in
 * ->addr.
 *
 * Returns -EINVAL for non-zero unused SQE fields or an IOPOLL ring, -EFAULT
 * if the event cannot be copied, -EOPNOTSUPP without CONFIG_EPOLL, and 0
 * otherwise.
 */
static int io_epoll_ctl_prep(struct io_kiocb *req,
                             const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_EPOLL)
        struct io_epoll *ep = &req->epoll;
        struct epoll_event __user *uevent;

        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in ||
            unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        ep->epfd = READ_ONCE(sqe->fd);
        ep->op = READ_ONCE(sqe->len);
        ep->fd = READ_ONCE(sqe->off);

        if (!ep_op_has_event(ep->op))
                return 0;

        uevent = u64_to_user_ptr(READ_ONCE(sqe->addr));
        if (copy_from_user(&ep->event, uevent, sizeof(*uevent)))
                return -EFAULT;

        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Issue handler for epoll_ctl requests: calls do_epoll_ctl() with the
 * stored arguments, passing along whether this is a non-blocking issue.
 *
 * Returns -EAGAIN when a non-blocking attempt got -EAGAIN, -EOPNOTSUPP
 * without CONFIG_EPOLL, and 0 once the request has been completed.
 */
static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_EPOLL)
        const bool nonblock = issue_flags & IO_URING_F_NONBLOCK;
        struct io_epoll *ep = &req->epoll;
        int res;

        res = do_epoll_ctl(ep->epfd, ep->op, ep->fd, &ep->event, nonblock);
        if (res == -EAGAIN && nonblock)
                return -EAGAIN;

        if (res < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, res, 0);
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Prepare a madvise request: stores the address, length and advice from
 * the SQE in req->madvise.
 *
 * Returns -EINVAL for non-zero unused SQE fields or an IOPOLL ring,
 * -EOPNOTSUPP unless both CONFIG_ADVISE_SYSCALLS and CONFIG_MMU are set,
 * and 0 otherwise.
 */
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
        struct io_madvise *ma = &req->madvise;

        if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in ||
            unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        ma->addr = READ_ONCE(sqe->addr);
        ma->len = READ_ONCE(sqe->len);
        ma->advice = READ_ONCE(sqe->fadvise_advice);
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Issue handler for madvise requests: calls do_madvise() on current->mm
 * with the stored address, length and advice.
 *
 * Returns -EAGAIN when issued with IO_URING_F_NONBLOCK, -EOPNOTSUPP unless
 * both CONFIG_ADVISE_SYSCALLS and CONFIG_MMU are set, and 0 once the
 * request has been completed.
 */
static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
        struct io_madvise *advise = &req->madvise;
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        res = do_madvise(current->mm, advise->addr, advise->len,
                         advise->advice);
        if (res < 0)
                req_set_fail(req);
        io_req_complete(req, res);
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
                return -EINVAL;
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;

        req->fadvise.offset = READ_ONCE(sqe->off);
        req->fadvise.len = READ_ONCE(sqe->len);
        req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
        return 0;
}

/*
 * Issue handler for fadvise requests: calls vfs_fadvise() with the stored
 * offset, length and advice.
 *
 * On a non-blocking issue only POSIX_FADV_NORMAL, POSIX_FADV_RANDOM and
 * POSIX_FADV_SEQUENTIAL are attempted; every other advice value returns
 * -EAGAIN. Returns 0 once the request has been completed.
 */
static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_fadvise *advise = &req->fadvise;
        int res;

        if ((issue_flags & IO_URING_F_NONBLOCK) &&
            advise->advice != POSIX_FADV_NORMAL &&
            advise->advice != POSIX_FADV_RANDOM &&
            advise->advice != POSIX_FADV_SEQUENTIAL)
                return -EAGAIN;

        res = vfs_fadvise(req->file, advise->offset, advise->len,
                          advise->advice);
        if (res < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, res, 0);
        return 0;
}

static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                return -EINVAL;
        if (req->flags & REQ_F_FIXED_FILE)
                return -EBADF;

        req->statx.dfd = READ_ONCE(sqe->fd);
        req->statx.mask = READ_ONCE(sqe->len);
        req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
        req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        req->statx.flags = READ_ONCE(sqe->statx_flags);

        return 0;
}

/*
 * Issue handler for statx requests: calls do_statx() with the stored
 * arguments and completes the request with its result.
 *
 * Returns -EAGAIN when issued with IO_URING_F_NONBLOCK, otherwise 0.
 */
static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_statx *sx = &req->statx;
        int res;

        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        res = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
        if (res < 0)
                req_set_fail(req);

        io_req_complete(req, res);
        return 0;
}

static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
            sqe->rw_flags || sqe->buf_index)
                return -EINVAL;
        if (req->flags & REQ_F_FIXED_FILE)
                return -EBADF;

        req->close.fd = READ_ONCE(sqe->fd);
        req->close.file_slot = READ_ONCE(sqe->file_index);
        if (req->close.file_slot && req->close.fd)
                return -EINVAL;

        return 0;
}

/*
 * Issue handler for close requests.
 *
 * With a non-zero file_slot the work is delegated to io_close_fixed().
 * Otherwise close->fd is looked up in current->files under file_lock:
 * the request fails with -EBADF when the fd is out of range, unused, or
 * refers to an io_uring file. If the file has a ->flush method and this is
 * a non-blocking issue, -EAGAIN is returned so the close can be redone
 * later. Otherwise the fd is detached with __close_fd_get_file() and the
 * file closed with filp_close().
 *
 * Returns -EAGAIN as described above, otherwise 0 after the request has
 * been completed with the close result.
 */
static int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
        struct files_struct *files = current->files;
        struct io_close *close = &req->close;
        struct fdtable *fdt;
        struct file *file = NULL;
        int ret = -EBADF;

        if (req->close.file_slot) {
                ret = io_close_fixed(req, issue_flags);
                goto err;
        }

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (close->fd >= fdt->max_fds) {
                spin_unlock(&files->file_lock);
                goto err;
        }
        file = fdt->fd[close->fd];
        /* an empty slot or an io_uring file is reported as -EBADF */
        if (!file || file->f_op == &io_uring_fops) {
                spin_unlock(&files->file_lock);
                /* clear it so the fput() at 'err' is skipped */
                file = NULL;
                goto err;
        }

        /* if the file has a flush method, be safe and punt to async */
        if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
                spin_unlock(&files->file_lock);
                return -EAGAIN;
        }

        ret = __close_fd_get_file(close->fd, &file);
        spin_unlock(&files->file_lock);
        if (ret < 0) {
                /* -ENOENT is reported to the application as -EBADF */
                if (ret == -ENOENT)
                        ret = -EBADF;
                goto err;
        }

        /* No ->flush() or already async, safely close from here */
        ret = filp_close(file, current->files);
err:
        if (ret < 0)
                req_set_fail(req);
        if (file)
                fput(file);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Prepare a sync_file_range request: stores the offset, length and flags
 * from the SQE in req->sync.
 *
 * Returns -EINVAL on an IOPOLL ring or for non-zero unused SQE fields,
 * otherwise 0.
 */
static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely((req->ctx->flags & IORING_SETUP_IOPOLL) ||
                     sqe->addr || sqe->ioprio || sqe->buf_index ||
                     sqe->splice_fd_in))
                return -EINVAL;

        req->sync.off = READ_ONCE(sqe->off);
        req->sync.len = READ_ONCE(sqe->len);
        req->sync.flags = READ_ONCE(sqe->sync_range_flags);
        return 0;
}

/*
 * Issue handler for sync_file_range requests: calls sync_file_range() with
 * the stored offset, length and flags and completes the request with the
 * result.
 *
 * Returns -EAGAIN when issued with IO_URING_F_NONBLOCK, otherwise 0.
 */
static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
{
        int res;

        /* sync_file_range always requires a blocking context */
        if (issue_flags & IO_URING_F_NONBLOCK)
                return -EAGAIN;

        res = sync_file_range(req->file, req->sync.off, req->sync.len,
                              req->sync.flags);
        if (res < 0)
                req_set_fail(req);

        io_req_complete(req, res);
        return 0;
}

#if defined(CONFIG_NET)
/*
 * Tell whether a short transfer on @sock should be retried: true only when
 * MSG_WAITALL is set in @flags and the socket type is SOCK_STREAM or
 * SOCK_SEQPACKET.
 */
static bool io_net_retry(struct socket *sock, int flags)
{
        return (flags & MSG_WAITALL) &&
               (sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET);
}

/*
 * Save the message header state in @kmsg into the request's async data so
 * the operation can be issued again later.
 *
 * If the request already has async data nothing is copied. Otherwise async
 * data is allocated and @kmsg is copied into it; pointers that referred to
 * storage inside @kmsg (the address buffer and, when no separate iovec
 * array was allocated, the inline fast_iov) are redirected to the copy.
 *
 * Returns -EAGAIN in both of those cases, or -ENOMEM when the allocation
 * fails, in which case kmsg->free_iov is freed.
 */
static int io_setup_async_msg(struct io_kiocb *req,
                              struct io_async_msghdr *kmsg)
{
        struct io_async_msghdr *async_msg = req->async_data;

        /* state was already saved on an earlier attempt */
        if (async_msg)
                return -EAGAIN;
        if (io_alloc_async_data(req)) {
                kfree(kmsg->free_iov);
                return -ENOMEM;
        }
        async_msg = req->async_data;
        req->flags |= REQ_F_NEED_CLEANUP;
        memcpy(async_msg, kmsg, sizeof(*kmsg));
        /* msg_name was copied verbatim; point it at the copy's own address */
        if (async_msg->msg.msg_name)
                async_msg->msg.msg_name = &async_msg->addr;
        /* if we're using fast_iov, point the iterator at the copy's array */
        if (!kmsg->free_iov) {
                /* keep the same position within the inline array */
                size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
                async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
        }

        return -EAGAIN;
}

/*
 * Copy the user's message header for a sendmsg request into @iomsg using
 * sendmsg_copy_msghdr(). The name buffer and the iovec array start out
 * pointing at storage inside @iomsg. The resulting msg_control value is
 * also saved in req->sr_msg, whatever the copy returned.
 *
 * Returns the sendmsg_copy_msghdr() result.
 */
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
                               struct io_async_msghdr *iomsg)
{
        struct io_sr_msg *sr = &req->sr_msg;
        int err;

        iomsg->free_iov = iomsg->fast_iov;
        iomsg->msg.msg_name = &iomsg->addr;
        err = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags,
                                  &iomsg->free_iov);

        /* save msg_control as sys_sendmsg() overwrites it */
        sr->msg_control = iomsg->msg.msg_control;
        return err;
}

/*
 * Copy the sendmsg header into the request's async data. On success the
 * request is flagged REQ_F_NEED_CLEANUP. Returns the
 * io_sendmsg_copy_hdr() result.
 */
static int io_sendmsg_prep_async(struct io_kiocb *req)
{
        int err = io_sendmsg_copy_hdr(req, req->async_data);

        if (err)
                return err;

        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
}

/*
 * Prepare a send/sendmsg request: stores the user pointer (sqe->addr), the
 * length and the message flags in req->sr_msg. MSG_NOSIGNAL is always
 * added; MSG_DONTWAIT in the user's flags marks the request REQ_F_NOWAIT;
 * on a compat ring MSG_CMSG_COMPAT is added as well. The count of bytes
 * already transferred (done_io) starts at zero.
 *
 * Returns -EINVAL on an IOPOLL ring or when addr2, file_index or ioprio is
 * non-zero, otherwise 0.
 */
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_sr_msg *sr = &req->sr_msg;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        /* one check covers every SQE field that must be zero */
        if (unlikely(sqe->addr2 || sqe->file_index || sqe->ioprio))
                return -EINVAL;

        sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
        sr->len = READ_ONCE(sqe->len);
        sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
        if (sr->msg_flags & MSG_DONTWAIT)
                req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
        if (req->ctx->compat)
                sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
        sr->done_io = 0;
        return 0;
}

/*
 * Issue handler for sendmsg requests.
 *
 * Uses the message header saved in the request's async data when there is
 * one, otherwise copies it from userspace into an on-stack header. The
 * message is then sent with __sys_sendmsg_sock().
 *
 * With MSG_WAITALL the whole iterator length is expected; anything below
 * that (or any negative result) goes through the retry/failure handling:
 *  - -EAGAIN on a non-blocking issue: save the header and retry later;
 *  - a positive short send on a socket io_net_retry() accepts: account the
 *    bytes in sr->done_io, mark REQ_F_PARTIAL_IO, save the header and retry;
 *  - anything else: mark the request failed.
 *
 * Returns the sock_from_file() or header-copy error before anything was
 * sent, the io_setup_async_msg() result when a retry is set up, and 0 once
 * the request has been completed. The completion value includes the bytes
 * sent on earlier attempts.
 */
static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_async_msghdr iomsg, *kmsg;
        struct io_sr_msg *sr = &req->sr_msg;
        struct socket *sock;
        unsigned flags;
        int min_ret = 0;
        int ret;

        sock = sock_from_file(req->file, &ret);
        if (unlikely(!sock))
                return ret;

        kmsg = req->async_data;
        if (!kmsg) {
                ret = io_sendmsg_copy_hdr(req, &iomsg);
                if (ret)
                        return ret;
                kmsg = &iomsg;
        } else {
                /* restore the msg_control value saved when the header was copied */
                kmsg->msg.msg_control = sr->msg_control;
        }

        flags = req->sr_msg.msg_flags;
        if (issue_flags & IO_URING_F_NONBLOCK)
                flags |= MSG_DONTWAIT;
        /* with MSG_WAITALL anything short of the full length counts as short */
        if (flags & MSG_WAITALL)
                min_ret = iov_iter_count(&kmsg->msg.msg_iter);

        ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

        if (ret < min_ret) {
                if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
                        return io_setup_async_msg(req, kmsg);
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                if (ret > 0 && io_net_retry(sock, flags)) {
                        sr->done_io += ret;
                        req->flags |= REQ_F_PARTIAL_IO;
                        return io_setup_async_msg(req, kmsg);
                }
                req_set_fail(req);
        }
        /* fast path, check for non-NULL to avoid function call */
        if (kmsg->free_iov)
                kfree(kmsg->free_iov);
        req->flags &= ~REQ_F_NEED_CLEANUP;
        /* report bytes from earlier attempts, even if this attempt failed */
        if (ret >= 0)
                ret += sr->done_io;
        else if (sr->done_io)
                ret = sr->done_io;
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Issue handler for send requests: sends sr->len bytes from the user
 * buffer sr->buf on the request's socket with sock_sendmsg(), using a
 * message with no name and no control data.
 *
 * With MSG_WAITALL the whole length is expected; anything below that (or
 * any negative result) goes through the retry/failure handling:
 *  - -EAGAIN on a non-blocking issue: return -EAGAIN;
 *  - a positive short send on a socket io_net_retry() accepts: advance
 *    sr->buf, shrink sr->len, account the bytes in sr->done_io, mark
 *    REQ_F_PARTIAL_IO and return -EAGAIN;
 *  - anything else: mark the request failed.
 *
 * Returns the sock_from_file() or import_single_range() error before
 * anything was sent, -EAGAIN as described above, and 0 once the request has
 * been completed. The completion value includes the bytes sent on earlier
 * attempts.
 */
static int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_sr_msg *sr = &req->sr_msg;
        struct msghdr msg;
        struct iovec iov;
        struct socket *sock;
        unsigned flags;
        int min_ret = 0;
        int ret;

        sock = sock_from_file(req->file, &ret);
        if (unlikely(!sock))
                return ret;

        ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
        if (unlikely(ret))
                return ret;

        msg.msg_name = NULL;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_namelen = 0;

        flags = req->sr_msg.msg_flags;
        if (issue_flags & IO_URING_F_NONBLOCK)
                flags |= MSG_DONTWAIT;
        /* with MSG_WAITALL anything short of the full length counts as short */
        if (flags & MSG_WAITALL)
                min_ret = iov_iter_count(&msg.msg_iter);

        msg.msg_flags = flags;
        ret = sock_sendmsg(sock, &msg);
        if (ret < min_ret) {
                if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
                        return -EAGAIN;
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                if (ret > 0 && io_net_retry(sock, flags)) {
                        /* step past what was sent so the retry resumes there */
                        sr->len -= ret;
                        sr->buf += ret;
                        sr->done_io += ret;
                        req->flags |= REQ_F_PARTIAL_IO;
                        return -EAGAIN;
                }
                req_set_fail(req);
        }
        /* report bytes from earlier attempts, even if this attempt failed */
        if (ret >= 0)
                ret += sr->done_io;
        else if (sr->done_io)
                ret = sr->done_io;
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Copy the user's message header for a recvmsg request into @iomsg
 * (native, non-compat layout).
 *
 * With REQ_F_BUFFER_SELECT at most one iovec is accepted: it is copied into
 * fast_iov[0], its length becomes sr->len, and free_iov is set to NULL.
 * Otherwise the iovec array is imported with __import_iovec(), starting
 * from the inline fast_iov storage.
 *
 * Returns 0 on success, -EINVAL for more than one iovec with buffer
 * selection, -EFAULT if the iovec cannot be copied, or the error from
 * __copy_msghdr_from_user() / __import_iovec().
 */
static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
                                 struct io_async_msghdr *iomsg)
{
        struct io_sr_msg *sr = &req->sr_msg;
        struct iovec __user *uiov;
        size_t iov_len;
        int ret;

        ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
                                        &iomsg->uaddr, &uiov, &iov_len);
        if (ret)
                return ret;

        if (req->flags & REQ_F_BUFFER_SELECT) {
                if (iov_len > 1)
                        return -EINVAL;
                if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
                        return -EFAULT;
                sr->len = iomsg->fast_iov[0].iov_len;
                iomsg->free_iov = NULL;
        } else {
                iomsg->free_iov = iomsg->fast_iov;
                ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
                                     &iomsg->free_iov, &iomsg->msg.msg_iter,
                                     false);
                /* a positive result is not an error; report plain success */
                if (ret > 0)
                        ret = 0;
        }

        return ret;
}

#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of __io_recvmsg_copy_hdr(): copies the user's compat
 * message header for a recvmsg request into @iomsg.
 *
 * With REQ_F_BUFFER_SELECT at most one iovec is accepted and only its
 * length is fetched: it must not be negative and becomes sr->len, and
 * free_iov is set to NULL. Otherwise the iovec array is imported with
 * __import_iovec(), starting from the inline fast_iov storage.
 *
 * Returns 0 on success, -EINVAL for more than one iovec or a negative
 * length with buffer selection, -EFAULT if the length cannot be read, or
 * the error from __get_compat_msghdr() / __import_iovec().
 */
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
                                        struct io_async_msghdr *iomsg)
{
        struct io_sr_msg *sr = &req->sr_msg;
        struct compat_iovec __user *uiov;
        compat_uptr_t ptr;
        compat_size_t len;
        int ret;

        ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
                                  &ptr, &len);
        if (ret)
                return ret;

        uiov = compat_ptr(ptr);
        if (req->flags & REQ_F_BUFFER_SELECT) {
                compat_ssize_t clen;

                if (len > 1)
                        return -EINVAL;
                if (!access_ok(uiov, sizeof(*uiov)))
                        return -EFAULT;
                if (__get_user(clen, &uiov->iov_len))
                        return -EFAULT;
                if (clen < 0)
                        return -EINVAL;
                sr->len = clen;
                iomsg->free_iov = NULL;
        } else {
                iomsg->free_iov = iomsg->fast_iov;
                /* NOTE(review): the trailing 'true' is presumably the compat flag - confirm */
                ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
                                   UIO_FASTIOV, &iomsg->free_iov,
                                   &iomsg->msg.msg_iter, true);
                if (ret < 0)
                        return ret;
        }

        return 0;
}
#endif

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
                               struct io_async_msghdr *iomsg)
{
        iomsg->msg.msg_name = &iomsg->addr;

#ifdef CONFIG_COMPAT
        if (req->ctx->compat)
                return __io_compat_recvmsg_copy_hdr(req, iomsg);
#endif

        return __io_recvmsg_copy_hdr(req, iomsg);
}

/*
 * Obtain a buffer for a receive request through io_buffer_select(). On
 * success the buffer is remembered in the request and the request is flagged
 * REQ_F_BUFFER_SELECTED; an ERR_PTR() result is passed through untouched.
 */
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
                                               bool needs_lock)
{
        struct io_sr_msg *sr = &req->sr_msg;
        struct io_buffer *selected;

        selected = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf,
                                    needs_lock);
        if (!IS_ERR(selected)) {
                sr->kbuf = selected;
                req->flags |= REQ_F_BUFFER_SELECTED;
        }
        return selected;
}

/*
 * Hand the buffer recorded in the request's sr_msg to io_put_kbuf() and
 * return that helper's result.
 */
static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
{
        struct io_buffer *kbuf = req->sr_msg.kbuf;

        return io_put_kbuf(req, kbuf);
}

/*
 * Import the recvmsg header into the request's async data. On success the
 * request is flagged REQ_F_NEED_CLEANUP.
 *
 * Returns 0 on success or the error from io_recvmsg_copy_hdr().
 */
static int io_recvmsg_prep_async(struct io_kiocb *req)
{
        struct io_async_msghdr *async_msg = req->async_data;
        int err = io_recvmsg_copy_hdr(req, async_msg);

        if (err)
                return err;

        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
}

/*
 * Prepare a receive request from its SQE.
 *
 * Rejects IOPOLL rings and SQEs that set addr2, file_index or ioprio, then
 * snapshots the user message pointer, length, buffer group and message flags
 * into the request. MSG_DONTWAIT marks the request REQ_F_NOWAIT; compat rings
 * additionally get MSG_CMSG_COMPAT.
 *
 * Returns 0 on success or -EINVAL for an unsupported ring/SQE.
 */
static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_sr_msg *sr = &req->sr_msg;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        /*
         * One check covers all rejected SQE fields; a narrower duplicate of
         * this test used to precede it and could never change the outcome.
         */
        if (unlikely(sqe->addr2 || sqe->file_index || sqe->ioprio))
                return -EINVAL;

        sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
        sr->len = READ_ONCE(sqe->len);
        sr->bgid = READ_ONCE(sqe->buf_group);
        sr->msg_flags = READ_ONCE(sqe->msg_flags);
        if (sr->msg_flags & MSG_DONTWAIT)
                req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
        if (req->ctx->compat)
                sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
        /* no bytes transferred yet for this request */
        sr->done_io = 0;
        return 0;
}

/*
 * Issue a recvmsg request.
 *
 * Uses the message header stored in req->async_data when present, otherwise
 * imports it into an on-stack copy. For buffer-select requests a buffer is
 * picked and installed as the single iovec. A short or failed receive is
 * either retried through io_setup_async_msg() or completes the request as
 * failed.
 *
 * Returns 0 once the request has been completed, the result of
 * io_setup_async_msg() when a retry is set up, or a negative errno when the
 * request could not be started (no socket, header import or buffer selection
 * failure).
 */
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_async_msghdr iomsg, *kmsg;
        struct io_sr_msg *sr = &req->sr_msg;
        struct socket *sock;
        struct io_buffer *kbuf;
        unsigned flags;
        int min_ret = 0;
        int ret, cflags = 0;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        sock = sock_from_file(req->file, &ret);
        if (unlikely(!sock))
                return ret;

        /* prefer the header prepared earlier; fall back to a stack copy */
        kmsg = req->async_data;
        if (!kmsg) {
                ret = io_recvmsg_copy_hdr(req, &iomsg);
                if (ret)
                        return ret;
                kmsg = &iomsg;
        }

        if (req->flags & REQ_F_BUFFER_SELECT) {
                kbuf = io_recv_buffer_select(req, !force_nonblock);
                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);
                /* receive into the selected buffer via a one-entry iovec */
                kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
                kmsg->fast_iov[0].iov_len = req->sr_msg.len;
                iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
                                1, req->sr_msg.len);
        }

        flags = req->sr_msg.msg_flags;
        if (force_nonblock)
                flags |= MSG_DONTWAIT;
        /* MSG_WAITALL without control data: anything short of the full count is "short" */
        if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
                min_ret = iov_iter_count(&kmsg->msg.msg_iter);

        ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
                                        kmsg->uaddr, flags);
        if (ret < min_ret) {
                if (ret == -EAGAIN && force_nonblock)
                        return io_setup_async_msg(req, kmsg);
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                if (ret > 0 && io_net_retry(sock, flags)) {
                        /* partial receive: drop control data, account bytes, retry */
                        kmsg->msg.msg_controllen = 0;
                        kmsg->msg.msg_control = NULL;
                        sr->done_io += ret;
                        req->flags |= REQ_F_PARTIAL_IO;
                        return io_setup_async_msg(req, kmsg);
                }
                req_set_fail(req);
        } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
                /* MSG_WAITALL was requested but the message was truncated */
                req_set_fail(req);
        }

        if (req->flags & REQ_F_BUFFER_SELECTED)
                cflags = io_put_recv_kbuf(req);
        /* fast path, check for non-NULL to avoid function call */
        if (kmsg->free_iov)
                kfree(kmsg->free_iov);
        req->flags &= ~REQ_F_NEED_CLEANUP;
        /* fold bytes from earlier partial attempts into the final result */
        if (ret >= 0)
                ret += sr->done_io;
        else if (sr->done_io)
                ret = sr->done_io;
        __io_req_complete(req, issue_flags, ret, cflags);
        return 0;
}

/*
 * Issue a recv request into a single user buffer.
 *
 * The target buffer is either the one recorded in the request or, for
 * buffer-select requests, one picked via io_recv_buffer_select(). A short
 * receive that io_net_retry() allows is retried by returning -EAGAIN after
 * advancing the buffer bookkeeping.
 *
 * Returns 0 once the request has been completed, -EAGAIN to ask for a retry,
 * or a negative errno when no socket or buffer could be obtained.
 */
static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_buffer *kbuf;
        struct io_sr_msg *sr = &req->sr_msg;
        struct msghdr msg;
        void __user *buf = sr->buf;
        struct socket *sock;
        struct iovec iov;
        unsigned flags;
        int min_ret = 0;
        int ret, cflags = 0;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        sock = sock_from_file(req->file, &ret);
        if (unlikely(!sock))
                return ret;

        if (req->flags & REQ_F_BUFFER_SELECT) {
                kbuf = io_recv_buffer_select(req, !force_nonblock);
                if (IS_ERR(kbuf))
                        return PTR_ERR(kbuf);
                buf = u64_to_user_ptr(kbuf->addr);
        }

        ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
        if (unlikely(ret))
                goto out_free;

        /* plain recv: no address, no control data */
        msg.msg_name = NULL;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_namelen = 0;
        msg.msg_iocb = NULL;
        msg.msg_flags = 0;

        flags = req->sr_msg.msg_flags;
        if (force_nonblock)
                flags |= MSG_DONTWAIT;
        /* MSG_WAITALL: anything short of the full count is treated as "short" */
        if (flags & MSG_WAITALL)
                min_ret = iov_iter_count(&msg.msg_iter);

        ret = sock_recvmsg(sock, &msg, flags);
        if (ret < min_ret) {
                if (ret == -EAGAIN && force_nonblock)
                        return -EAGAIN;
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                if (ret > 0 && io_net_retry(sock, flags)) {
                        /* partial receive: advance past received bytes and retry */
                        sr->len -= ret;
                        sr->buf += ret;
                        sr->done_io += ret;
                        req->flags |= REQ_F_PARTIAL_IO;
                        return -EAGAIN;
                }
                req_set_fail(req);
        } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
                /*
                 * The import-failure path jumps to this label so that it
                 * shares the req_set_fail() and the completion code below.
                 */
out_free:
                req_set_fail(req);
        }
        if (req->flags & REQ_F_BUFFER_SELECTED)
                cflags = io_put_recv_kbuf(req);
        /* fold bytes from earlier partial attempts into the final result */
        if (ret >= 0)
                ret += sr->done_io;
        else if (sr->done_io)
                ret = sr->done_io;
        __io_req_complete(req, issue_flags, ret, cflags);
        return 0;
}

/*
 * Prepare an accept request from its SQE: capture the user address pointers,
 * accept flags, the RLIMIT_NOFILE limit and the optional fixed-file slot.
 *
 * Returns 0 on success, or -EINVAL for IOPOLL rings, unsupported SQE fields,
 * unknown flags, or SOCK_CLOEXEC combined with a fixed-file slot.
 */
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_accept *accept = &req->accept;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index)
                return -EINVAL;

        accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
        accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        accept->flags = READ_ONCE(sqe->accept_flags);
        accept->nofile = rlimit(RLIMIT_NOFILE);
        accept->file_slot = READ_ONCE(sqe->file_index);

        if ((accept->file_slot && (accept->flags & SOCK_CLOEXEC)) ||
            (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)))
                return -EINVAL;

        /* translate SOCK_NONBLOCK where it differs from O_NONBLOCK */
        if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) {
                accept->flags &= ~SOCK_NONBLOCK;
                accept->flags |= O_NONBLOCK;
        }
        return 0;
}

/*
 * Issue an accept request.
 *
 * Without a fixed-file slot an unused fd is reserved up front and the
 * accepted file is installed there; with a slot the file goes into the fixed
 * file table instead.
 *
 * Returns 0 once the request has been completed, -EAGAIN to ask for a retry
 * of a nonblocking attempt, or a negative errno if no fd could be reserved.
 */
static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_accept *accept = &req->accept;
        const bool nonblock = issue_flags & IO_URING_F_NONBLOCK;
        const bool use_slot = accept->file_slot != 0;
        unsigned int file_flags = 0;
        struct file *file;
        int ret, fd;

        if (nonblock)
                file_flags = O_NONBLOCK;

        if (!use_slot) {
                fd = __get_unused_fd_flags(accept->flags, accept->nofile);
                if (unlikely(fd < 0))
                        return fd;
        }

        file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
                         accept->flags);
        if (!IS_ERR(file)) {
                if (use_slot) {
                        ret = io_install_fixed_file(req, file, issue_flags,
                                                    accept->file_slot - 1);
                } else {
                        fd_install(fd, file);
                        ret = fd;
                }
        } else {
                ret = PTR_ERR(file);
                if (!use_slot)
                        put_unused_fd(fd);
                /* safe to retry */
                req->flags |= REQ_F_PARTIAL_IO;
                if (nonblock && ret == -EAGAIN)
                        return -EAGAIN;
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
                req_set_fail(req);
        }

        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Copy the user-supplied socket address of a connect request into the
 * request's async data. Returns the result of move_addr_to_kernel().
 */
static int io_connect_prep_async(struct io_kiocb *req)
{
        struct io_async_connect *async_conn = req->async_data;

        return move_addr_to_kernel(req->connect.addr, req->connect.addr_len,
                                   &async_conn->address);
}

/*
 * Prepare a connect request from its SQE: record the user address pointer
 * and its length.
 *
 * Returns 0 on success, or -EINVAL for IOPOLL rings and SQEs that set fields
 * connect does not use.
 */
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index)
                return -EINVAL;
        if (sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        req->connect.addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
        req->connect.addr_len = READ_ONCE(sqe->addr2);
        return 0;
}

static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_async_connect __io, *io;
        unsigned file_flags;
        int ret;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        if (req->async_data) {
                io = req->async_data;
        } else {
                ret = move_addr_to_kernel(req->connect.addr,
                                                req->connect.addr_len,
                                                &__io.address);
                if (ret)
                        goto out;
                io = &__io;
        }

        file_flags = force_nonblock ? O_NONBLOCK : 0;

        ret = __sys_connect_file(req->file, &io->address,
                                        req->connect.addr_len, file_flags);
        if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
                if (req->async_data)
                        return -EAGAIN;
                if (io_alloc_async_data(req)) {
                        ret = -ENOMEM;
                        goto out;
                }
                memcpy(req->async_data, &__io, sizeof(__io));
                return -EAGAIN;
        }
        if (ret == -ERESTARTSYS)
                ret = -EINTR;
out:
        if (ret < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}
#else /* !CONFIG_NET */
/*
 * Stubs used when networking support is not built in: every network opcode
 * handler simply returns -EOPNOTSUPP.
 *
 * IO_NETOP_FN(op) defines the issue handler io_<op>().
 */
#define IO_NETOP_FN(op)                                                        \
static int io_##op(struct io_kiocb *req, unsigned int issue_flags)        \
{                                                                        \
        return -EOPNOTSUPP;                                                \
}

/* IO_NETOP_PREP(op) defines io_<op>() plus the prep handler io_<op>_prep(). */
#define IO_NETOP_PREP(op)                                                \
IO_NETOP_FN(op)                                                                \
static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
{                                                                        \
        return -EOPNOTSUPP;                                                \
}                                                                        \

/*
 * IO_NETOP_PREP_ASYNC(op) defines everything IO_NETOP_PREP() does plus the
 * async prep handler io_<op>_prep_async().
 */
#define IO_NETOP_PREP_ASYNC(op)                                                \
IO_NETOP_PREP(op)                                                        \
static int io_##op##_prep_async(struct io_kiocb *req)                        \
{                                                                        \
        return -EOPNOTSUPP;                                                \
}

/* Instantiate the stubs each opcode needs. */
IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */

/*
 * Poll table handed to vfs_poll() while arming a poll request. The queue
 * callbacks recover this structure from the embedded poll_table_struct via
 * container_of().
 */
struct io_poll_table {
        /* embedded table passed to vfs_poll() */
        struct poll_table_struct pt;
        /* request being armed */
        struct io_kiocb *req;
        /* number of waitqueue entries queued so far */
        int nr_entries;
        /* negative errno recorded by the queue callback, 0 if none */
        int error;
};

/*
 * Layout of req->poll_refs: bit 31 marks the request as cancelled, bit 30
 * asks the task_work loop to re-check for events, and the low 30 bits hold
 * the reference count.
 */
#define IO_POLL_CANCEL_FLAG        BIT(31)
#define IO_POLL_RETRY_FLAG        BIT(30)
#define IO_POLL_REF_MASK        GENMASK(29, 0)

/*
 * We usually have 1-2 refs taken, 128 is more than enough and we want to
 * maximise the margin between this amount and the moment when it overflows.
 * Once the count reaches this bias, ownership attempts take the slow path.
 */
#define IO_POLL_REF_BIAS       128

/*
 * Slow path of io_poll_get_ownership(), taken when the reference count is
 * already high.
 *
 * Returns true if ownership of the request was acquired.
 */
static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
{
        int old;

        /*
         * The refs are elevated, so the chance of winning ownership is slim.
         * Rather than bumping the count further, raise the retry flag so the
         * current owner's loop notices that something may have changed.
         */
        old = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
        if (!(old & IO_POLL_REF_MASK))
                return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);

        return false;
}

/*
 * A request is free when the reference part of ->poll_refs (see
 * IO_POLL_REF_MASK) is zero; whoever bumps it from zero becomes the owner.
 * Modifying a request without owning it is not allowed, which prevents races
 * when enqueueing task_work and between poll arming and wakeups.
 *
 * Returns true if the caller acquired ownership.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
        if (likely(atomic_read(&req->poll_refs) < IO_POLL_REF_BIAS))
                return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);

        return io_poll_get_ownership_slowpath(req);
}

static void io_poll_mark_cancelled(struct io_kiocb *req)
{
        atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}

/*
 * Return the second poll entry of a request, if any. A pure poll request
 * keeps it in ->async_data; poll-driven retry keeps it in the async_poll
 * structure.
 */
static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
{
        if (req->opcode != IORING_OP_POLL_ADD)
                return req->apoll->double_poll;

        return req->async_data;
}

/*
 * Return the primary poll entry of a request: the one embedded in the
 * request for a pure poll, otherwise the one inside the async_poll structure.
 */
static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
{
        if (req->opcode != IORING_OP_POLL_ADD)
                return &req->apoll->poll;

        return &req->poll;
}

/*
 * Add the request to the ring's cancel hash, in the bucket selected by
 * hashing its user_data.
 */
static void io_poll_req_insert(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        unsigned int bucket = hash_long(req->user_data, ctx->cancel_hash_bits);

        hlist_add_head(&req->hash_node, &ctx->cancel_hash[bucket]);
}

/*
 * Initialise a poll entry: no waitqueue head yet, the requested events plus
 * the always-reported ones, and a wait entry that invokes @wake_func.
 */
static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
                              wait_queue_func_t wake_func)
{
/* events that are always masked in, whether or not the caller asked */
#define IO_POLL_UNMASK        (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
        init_waitqueue_func_entry(&poll->wait, wake_func);
        INIT_LIST_HEAD(&poll->wait.entry);
        poll->events = events | IO_POLL_UNMASK;
        poll->head = NULL;
}

/*
 * Detach one poll entry from its waitqueue, if it is still attached. The
 * head pointer is loaded with acquire semantics; when it is already NULL
 * there is nothing to do and the waitqueue lock is not taken.
 */
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
        struct wait_queue_head *wq = smp_load_acquire(&poll->head);

        if (!wq)
                return;

        spin_lock_irq(&wq->lock);
        list_del_init(&poll->wait.entry);
        poll->head = NULL;
        spin_unlock_irq(&wq->lock);
}

/* Detach both the primary and, if present, the secondary poll entry. */
static void io_poll_remove_entries(struct io_kiocb *req)
{
        struct io_poll_iocb *primary = io_poll_get_single(req);
        struct io_poll_iocb *secondary = io_poll_get_double(req);

        /*
         * wake_up_pollfree() waits for us as long as we hold the waitqueue
         * lock and the waitqueue is non-empty, but acquiring that lock in the
         * first place can race with the waitqueue being freed.
         *
         * The approach is the same as eventpoll's: every user of
         * wake_up_pollfree() RCU-delays the actual free, so once we are
         * inside rcu_read_lock() and observe a non-NULL queue pointer, the
         * queue can be locked without its memory disappearing under us.
         *
         * rcu_read_lock() stays held for as long as the queue lock is held,
         * because removing our entry may leave the queue empty, at which
         * point only RCU keeps the queue memory alive.
         */
        rcu_read_lock();
        io_poll_remove_entry(primary);
        if (secondary)
                io_poll_remove_entry(secondary);
        rcu_read_unlock();
}

/*
 * All poll tw should go through this. Checks for poll events, manages
 * references, does rewait, etc.
 *
 * Returns a negative error on failure, >0 when no action is required (either
 * a spurious wakeup or a multishot CQE was served), and 0 when it is done
 * with the request, in which case the mask is stored in req->result.
 */
static int io_poll_check_events(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_poll_iocb *poll = io_poll_get_single(req);
        int v;

        /* req->task == current here, checking PF_EXITING is safe */
        if (unlikely(req->task->flags & PF_EXITING))
                io_poll_mark_cancelled(req);

        do {
                v = atomic_read(&req->poll_refs);

                /* tw handler should be the owner, and so have some references */
                if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
                        return 0;
                if (v & IO_POLL_CANCEL_FLAG)
                        return -ECANCELED;
                /*
                 * cqe.res contains only the events of the first wake up
                 * and all others are lost. Redo vfs_poll() to get the
                 * up-to-date state.
                 */
                if ((v & IO_POLL_REF_MASK) != 1)
                        req->result = 0;
                if (v & IO_POLL_RETRY_FLAG) {
                        req->result = 0;
                        /*
                         * We won't find new events that came in between
                         * vfs_poll and the ref put unless we clear the
                         * flag in advance.
                         */
                        atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
                        v &= ~IO_POLL_RETRY_FLAG;
                }

                /* no cached mask: ask the file for its current state */
                if (!req->result) {
                        struct poll_table_struct pt = { ._key = poll->events };

                        req->result = vfs_poll(req->file, &pt) & poll->events;
                }

                /* multishot, just fill a CQE and proceed */
                if (req->result && !(poll->events & EPOLLONESHOT)) {
                        __poll_t mask = mangle_poll(req->result & poll->events);
                        bool filled;

                        spin_lock(&ctx->completion_lock);
                        filled = io_fill_cqe_aux(ctx, req->user_data, mask,
                                                 IORING_CQE_F_MORE);
                        io_commit_cqring(ctx);
                        spin_unlock(&ctx->completion_lock);
                        /* the CQE could not be filled: give up on the request */
                        if (unlikely(!filled))
                                return -ECANCELED;
                        io_cqring_ev_posted(ctx);
                } else if (req->result) {
                        /* oneshot with events pending: done, mask is in req->result */
                        return 0;
                }

                /* force the next iteration to vfs_poll() */
                req->result = 0;

                /*
                 * Release all references, retry if someone tried to restart
                 * task_work while we were executing it.
                 */
        } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
                                        IO_POLL_REF_MASK);

        return 1;
}

/*
 * task_work handler for pure poll requests. Does nothing if
 * io_poll_check_events() reports that no action is needed; otherwise stores
 * the final result (event mask or error), detaches the request from its
 * waitqueues and the cancel hash, and posts the completion.
 */
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
        struct io_ring_ctx *ctx = req->ctx;
        int status = io_poll_check_events(req);

        if (status > 0)
                return;

        if (status) {
                req->result = status;
                req_set_fail(req);
        } else {
                req->result = mangle_poll(req->result & req->poll.events);
        }

        io_poll_remove_entries(req);

        spin_lock(&ctx->completion_lock);
        hash_del(&req->hash_node);
        spin_unlock(&ctx->completion_lock);

        io_req_complete_post(req, req->result, 0);
}

/*
 * task_work handler for poll-driven retry. Does nothing if
 * io_poll_check_events() reports that no action is needed; otherwise
 * detaches the request from its waitqueues and the cancel hash and then
 * either resubmits it or completes it with the error.
 */
static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
{
        struct io_ring_ctx *ctx = req->ctx;
        int status = io_poll_check_events(req);

        if (status > 0)
                return;

        io_tw_lock(ctx, locked);
        io_poll_remove_entries(req);

        spin_lock(&ctx->completion_lock);
        hash_del(&req->hash_node);
        spin_unlock(&ctx->completion_lock);

        if (status)
                io_req_complete_failed(req, status);
        else
                io_req_task_submit(req, locked);
}

static void __io_poll_execute(struct io_kiocb *req, int mask)
{
        req->result = mask;
        if (req->opcode == IORING_OP_POLL_ADD)
                req->io_task_work.func = io_poll_task_func;
        else
                req->io_task_work.func = io_apoll_task_func;

        trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
        io_req_task_work_add(req);
}

/* Queue poll task_work with result @res, but only if we win ownership. */
static inline void io_poll_execute(struct io_kiocb *req, int res)
{
        if (!io_poll_get_ownership(req))
                return;

        __io_poll_execute(req, res);
}

/*
 * Cancel a poll request: set the cancel flag first, then try to queue
 * task_work so the flag gets noticed.
 */
static void io_poll_cancel_req(struct io_kiocb *req)
{
        io_poll_mark_cancelled(req);
        /* kick tw, which should complete the request */
        io_poll_execute(req, 0);
}

/*
 * Waitqueue callback for poll entries.
 *
 * Handles POLLFREE by cancelling the request and detaching the entry from
 * the dying waitqueue. Otherwise, if the wakeup matches the events of
 * interest and ownership can be taken, queues task_work to process it.
 *
 * Returns 0 when the wakeup carries a mask that matches none of the polled
 * events, 1 otherwise.
 */
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
                        void *key)
{
        struct io_kiocb *req = wait->private;
        struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
                                                 wait);
        __poll_t mask = key_to_poll(key);

        if (unlikely(mask & POLLFREE)) {
                io_poll_mark_cancelled(req);
                /* we have to kick tw in case it's not already */
                io_poll_execute(req, 0);

                /*
                 * If the waitqueue is being freed early but someone already
                 * holds ownership over the request, we have to tear down the
                 * request as best we can. That means immediately removing the
                 * request from its waitqueue and preventing all further
                 * accesses to the waitqueue via the request.
                 */
                list_del_init(&poll->wait.entry);

                /*
                 * Careful: this *must* be the last step, since as soon
                 * as poll->head is NULL'ed out, the request can be
                 * completed and freed, because io_poll_remove_entry()
                 * will no longer need to take the waitqueue lock.
                 */
                smp_store_release(&poll->head, NULL);
                return 1;
        }

        /* for instances that support it check for an event match first */
        if (mask && !(mask & poll->events))
                return 0;

        if (io_poll_get_ownership(req)) {
                /*
                 * If we trigger a multishot poll off our own wakeup path,
                 * disable multishot as there is a circular dependency between
                 * CQ posting and triggering the event.
                 */
                if (mask & EPOLL_URING_WAKE)
                        poll->events |= EPOLLONESHOT;

                __io_poll_execute(req, mask);
        }
        return 1;
}

/*
 * Common queue callback: attach a poll entry for @pt->req to waitqueue
 * @head.
 *
 * The first waitqueue uses @poll itself. If the file hands us a second,
 * different waitqueue, a new entry is allocated and stored in *@poll_ptr. A
 * third distinct waitqueue is rejected with -EINVAL in @pt->error, and an
 * allocation failure is reported as -ENOMEM.
 */
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                            struct wait_queue_head *head,
                            struct io_poll_iocb **poll_ptr)
{
        struct io_poll_iocb *entry = poll;

        /*
         * Some files use several waitqueues for poll handling (e.g. one for
         * read and one for write); those need a separate io_poll_iocb.
         */
        if (unlikely(pt->nr_entries)) {
                struct io_poll_iocb *second = *poll_ptr;

                /* same waitqueue head queued twice: nothing to do */
                if (poll->head == head)
                        return;

                /* a second entry already exists: a third one is an error */
                if (second) {
                        if (second->head != head)
                                pt->error = -EINVAL;
                        return;
                }

                entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
                if (!entry) {
                        pt->error = -ENOMEM;
                        return;
                }
                io_init_poll_iocb(entry, poll->events, poll->wait.func);
                *poll_ptr = entry;
        }

        pt->nr_entries++;
        entry->head = head;
        entry->wait.private = pt->req;

        if (entry->events & EPOLLEXCLUSIVE)
                add_wait_queue_exclusive(head, &entry->wait);
        else
                add_wait_queue(head, &entry->wait);
}

static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
                               struct poll_table_struct *p)
{
        struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);

        __io_queue_proc(&pt->req->poll, pt, head,
                        (struct io_poll_iocb **) &pt->req->async_data);
}

/*
 * Arm @poll for @req with the events in @mask.
 *
 * Initialises the poll entry and the poll table, takes ownership of the
 * request, and calls vfs_poll() so that the queue callback in @ipt attaches
 * the request to the file's waitqueue(s).
 *
 * Returns the ready event mask when a oneshot request already has events
 * pending (the request is then not left armed), and 0 otherwise. On a 0
 * return, @ipt->error is non-zero if arming failed.
 */
static int __io_arm_poll_handler(struct io_kiocb *req,
                                 struct io_poll_iocb *poll,
                                 struct io_poll_table *ipt, __poll_t mask)
{
        struct io_ring_ctx *ctx = req->ctx;

        INIT_HLIST_NODE(&req->hash_node);
        io_init_poll_iocb(poll, mask, io_poll_wake);
        poll->file = req->file;
        poll->wait.private = req;

        ipt->pt._key = mask;
        ipt->req = req;
        ipt->error = 0;
        ipt->nr_entries = 0;

        /*
         * Take the ownership to delay any tw execution up until we're done
         * with poll arming. see io_poll_get_ownership().
         */
        atomic_set(&req->poll_refs, 1);
        mask = vfs_poll(req->file, &ipt->pt) & poll->events;

        /* oneshot and already ready: undo the arming and report the events */
        if (mask && (poll->events & EPOLLONESHOT)) {
                io_poll_remove_entries(req);
                /* no one else has access to the req, forget about the ref */
                return mask;
        }
        /* nothing ready, and queueing failed or no waitqueue was registered */
        if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
                io_poll_remove_entries(req);
                if (!ipt->error)
                        ipt->error = -EINVAL;
                return 0;
        }

        spin_lock(&ctx->completion_lock);
        io_poll_req_insert(req);
        spin_unlock(&ctx->completion_lock);

        if (mask) {
                /* can't multishot if failed, just queue the event we've got */
                if (unlikely(ipt->error || !ipt->nr_entries)) {
                        poll->events |= EPOLLONESHOT;
                        ipt->error = 0;
                }
                __io_poll_execute(req, mask);
                return 0;
        }

        /*
         * Try to release ownership. If we see a change of state, e.g.
         * poll was woken up, queue up a tw, it'll deal with it.
         */
        if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
                __io_poll_execute(req, 0);
        return 0;
}

static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
                               struct poll_table_struct *p)
{
        struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
        struct async_poll *apoll = pt->req->apoll;

        __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}

/* Return values of io_arm_poll_handler(). */
enum {
        /* the poll handler was armed */
        IO_APOLL_OK,
        /* polling is not possible or arming failed */
        IO_APOLL_ABORTED,
        /* events were already pending when arming was attempted */
        IO_APOLL_READY
};

/*
 * We can't reliably detect loops where a poll trigger is repeatedly followed
 * by the issue failing again. Rather than failing such requests immediately,
 * allow a certain number of retries before giving up. Given that this
 * condition should _rarely_ trigger even once, a larger value is fine.
 */
#define APOLL_MAX_RETRY                128

/*
 * Arm poll-driven retry for a request.
 *
 * Builds the event mask from the opcode definition, allocates the async_poll
 * structure on first use (or reuses it, consuming one retry), and arms the
 * poll handler.
 *
 * Returns IO_APOLL_OK when armed, IO_APOLL_READY when events were already
 * pending, and IO_APOLL_ABORTED when the file can't be polled, the opcode
 * has no poll direction, the retries are exhausted, allocation failed, or
 * arming reported an error.
 */
static int io_arm_poll_handler(struct io_kiocb *req)
{
        const struct io_op_def *def = &io_op_defs[req->opcode];
        struct io_ring_ctx *ctx = req->ctx;
        __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
        struct io_poll_table ipt;
        struct async_poll *apoll;
        int ret;

        if (!req->file || !file_can_poll(req->file) ||
            (!def->pollin && !def->pollout))
                return IO_APOLL_ABORTED;

        if (!def->pollin) {
                mask |= POLLOUT | POLLWRNORM;
        } else {
                mask |= POLLIN | POLLRDNORM;

                /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
                if (req->opcode == IORING_OP_RECVMSG &&
                    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
                        mask &= ~POLLIN;
        }

        if (!(req->flags & REQ_F_POLLED)) {
                /* first arming: allocate and start with a full retry budget */
                apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
                if (unlikely(!apoll))
                        return IO_APOLL_ABORTED;
                apoll->poll.retries = APOLL_MAX_RETRY;
                apoll->double_poll = NULL;
        } else {
                /* re-arming: drop the old second entry and spend one retry */
                apoll = req->apoll;
                kfree(apoll->double_poll);
                apoll->double_poll = NULL;
                if (unlikely(!--apoll->poll.retries))
                        return IO_APOLL_ABORTED;
        }

        req->apoll = apoll;
        req->flags |= REQ_F_POLLED;
        ipt.pt._qproc = io_async_queue_proc;

        ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
        if (ret)
                return IO_APOLL_READY;
        if (ipt.error)
                return IO_APOLL_ABORTED;

        trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
                                mask, apoll->poll.events);
        return IO_APOLL_OK;
}

/*
 * Walk every bucket of the cancel hash and cancel each poll request that
 * io_match_task_safe() accepts for @tsk / @cancel_all.
 *
 * Returns true if at least one request was found and cancelled.
 */
static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
                               bool cancel_all)
{
        struct hlist_node *next;
        struct io_kiocb *req;
        bool killed = false;
        int bucket;

        spin_lock(&ctx->completion_lock);
        for (bucket = 0; bucket < (1U << ctx->cancel_hash_bits); bucket++) {
                struct hlist_head *head = &ctx->cancel_hash[bucket];

                hlist_for_each_entry_safe(req, next, head, hash_node) {
                        if (!io_match_task_safe(req, tsk, cancel_all))
                                continue;

                        hlist_del_init(&req->hash_node);
                        io_poll_cancel_req(req);
                        killed = true;
                }
        }
        spin_unlock(&ctx->completion_lock);

        return killed;
}

/*
 * Look up a hashed request by user_data in ctx->cancel_hash.
 *
 * @sqe_addr:  user_data value to match; also selects the hash bucket.
 * @poll_only: when true, only IORING_OP_POLL_ADD requests are eligible.
 *
 * Returns the first matching request in the bucket, or NULL if none.
 */
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
                                     bool poll_only)
        __must_hold(&ctx->completion_lock)
{
        struct hlist_head *bucket;
        struct io_kiocb *cur;

        bucket = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
        hlist_for_each_entry(cur, bucket, hash_node) {
                if (cur->user_data == sqe_addr &&
                    (!poll_only || cur->opcode == IORING_OP_POLL_ADD))
                        return cur;
        }
        return NULL;
}

/*
 * Try to take ownership of @req via io_poll_get_ownership(). On success,
 * call io_poll_remove_entries() on it and remove it from the hash.
 *
 * Returns true if ownership was obtained, false otherwise (in which case
 * @req is left untouched).
 */
static bool io_poll_disarm(struct io_kiocb *req)
        __must_hold(&ctx->completion_lock)
{
        bool owned = io_poll_get_ownership(req);

        if (owned) {
                io_poll_remove_entries(req);
                hash_del(&req->hash_node);
        }
        return owned;
}

/*
 * Find the hashed request matching @sqe_addr (see io_poll_find()) and
 * cancel it with io_poll_cancel_req().
 *
 * Returns 0 if a request was found and cancelled, -ENOENT otherwise.
 */
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
                          bool poll_only)
        __must_hold(&ctx->completion_lock)
{
        struct io_kiocb *victim = io_poll_find(ctx, sqe_addr, poll_only);

        if (victim) {
                io_poll_cancel_req(victim);
                return 0;
        }
        return -ENOENT;
}

/*
 * Decode the poll event mask from sqe->poll32_events.
 *
 * On big-endian builds the two 16-bit halves are swapped first. Unless
 * IORING_POLL_ADD_MULTI is set in @flags, EPOLLONESHOT is forced on. The
 * result is the demangled event mask plus the EPOLLEXCLUSIVE/EPOLLONESHOT
 * bits taken from the raw value.
 */
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
                                     unsigned int flags)
{
        u32 mask = READ_ONCE(sqe->poll32_events);

#ifdef __BIG_ENDIAN
        mask = swahw32(mask);
#endif
        if (!(flags & IORING_POLL_ADD_MULTI))
                mask |= EPOLLONESHOT;

        return demangle_poll(mask) | (mask & (EPOLLEXCLUSIVE|EPOLLONESHOT));
}

/*
 * Validate an IORING_OP_POLL_REMOVE SQE and decode it into req->poll_update.
 *
 * sqe->len carries the update flags, sqe->addr the user_data of the target
 * request, sqe->off the replacement user_data and sqe->poll32_events the
 * replacement event mask.
 *
 * Returns 0 on success, -EINVAL for an IOPOLL ring, unsupported SQE fields,
 * unknown flags, IORING_POLL_ADD_MULTI on its own, or a new user_data /
 * event mask supplied without the matching update flag.
 */
static int io_poll_update_prep(struct io_kiocb *req,
                               const struct io_uring_sqe *sqe)
{
        const u32 allowed = IORING_POLL_UPDATE_EVENTS |
                            IORING_POLL_UPDATE_USER_DATA |
                            IORING_POLL_ADD_MULTI;
        struct io_poll_update *upd = &req->poll_update;
        u32 flags;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL) ||
            sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                return -EINVAL;

        flags = READ_ONCE(sqe->len);
        /* multishot alone is meaningless without something to update */
        if ((flags & ~allowed) || flags == IORING_POLL_ADD_MULTI)
                return -EINVAL;

        upd->old_user_data = READ_ONCE(sqe->addr);
        upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
        upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
        upd->new_user_data = READ_ONCE(sqe->off);

        /* a replacement user_data requires the corresponding flag */
        if (upd->new_user_data && !upd->update_user_data)
                return -EINVAL;

        if (upd->update_events) {
                upd->events = io_poll_parse_events(sqe, flags);
                return 0;
        }
        /* likewise, an event mask requires IORING_POLL_UPDATE_EVENTS */
        return sqe->poll32_events ? -EINVAL : 0;
}

/*
 * Validate an IORING_OP_POLL_ADD SQE and store the parsed event mask in
 * req->poll.events. sqe->len carries the flags; only IORING_POLL_ADD_MULTI
 * is accepted.
 *
 * Returns 0 on success, -EINVAL for an IOPOLL ring, unsupported SQE fields
 * or unknown flags.
 */
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        u32 flags;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL) ||
            sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
                return -EINVAL;

        flags = READ_ONCE(sqe->len);
        if (flags & ~IORING_POLL_ADD_MULTI)
                return -EINVAL;

        io_req_set_refcount(req);
        req->poll.events = io_poll_parse_events(sqe, flags);
        return 0;
}

/*
 * Arm the poll handler for @req using io_poll_queue_proc.
 *
 * If __io_arm_poll_handler() returns non-zero, the request is completed
 * with that value. If it returns zero but the poll table recorded an error,
 * the request is marked failed and completed with that error. Otherwise
 * the request is left pending.
 *
 * Always returns 0.
 */
static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_poll_iocb *poll = &req->poll;
        struct io_poll_table ipt;
        int res;

        ipt.pt._qproc = io_poll_queue_proc;

        res = __io_arm_poll_handler(req, poll, &ipt, poll->events);
        if (!res) {
                res = ipt.error;
                if (res)
                        req_set_fail(req);
        }
        if (res)
                __io_req_complete(req, issue_flags, res, 0);
        return 0;
}

/*
 * Issue handler for IORING_OP_POLL_REMOVE: locate the IORING_OP_POLL_ADD
 * request whose user_data equals poll_update.old_user_data, disarm it, and
 * then either re-arm it with updated events/user_data or cancel it.
 *
 * The update request @req is always completed here with 0, -ENOENT (no
 * matching request) or -EALREADY (io_poll_disarm() failed to take
 * ownership of the target). Always returns 0.
 */
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *preq;
        int ret2, ret = 0;

        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));

        /* lookup and disarm happen under completion_lock */
        spin_lock(&ctx->completion_lock);
        preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
        if (!preq || !io_poll_disarm(preq)) {
                spin_unlock(&ctx->completion_lock);
                ret = preq ? -EALREADY : -ENOENT;
                goto out;
        }
        spin_unlock(&ctx->completion_lock);

        if (req->poll_update.update_events || req->poll_update.update_user_data) {
                /* replace only the low 16 event bits, keep the behavior flags */
                if (req->poll_update.update_events) {
                        preq->poll.events &= ~0xffff;
                        preq->poll.events |= req->poll_update.events & 0xffff;
                        preq->poll.events |= IO_POLL_UNMASK;
                }
                if (req->poll_update.update_user_data)
                        preq->user_data = req->poll_update.new_user_data;

                /* re-arm the target with its updated state */
                ret2 = io_poll_add(preq, issue_flags);
                /* successfully updated, don't complete poll request */
                if (!ret2)
                        goto out;
        }
        /* plain removal (or failed re-arm): cancel the target request */
        req_set_fail(preq);
        io_req_complete(preq, -ECANCELED);
out:
        if (ret < 0)
                req_set_fail(req);
        /* complete update request, we're done with it */
        io_req_complete(req, ret);
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        return 0;
}

/*
 * Task-work callback installed by io_timeout_fn(): mark the request failed
 * and post its completion with -ETIME. @locked is not used.
 */
static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
{
        const int res = -ETIME;

        req_set_fail(req);
        io_req_complete_post(req, res, 0);
}

/*
 * hrtimer callback for timeout requests.
 *
 * Under ctx->timeout_lock, unlink the request from its timeout list and
 * bump ctx->cq_timeouts by one. Completion is then handed to task work via
 * io_req_task_timeout(). The timer is never restarted.
 */
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
        struct io_timeout_data *data;
        struct io_ring_ctx *ctx;
        struct io_kiocb *req;
        unsigned long irqflags;
        int fired;

        data = container_of(timer, struct io_timeout_data, timer);
        req = data->req;
        ctx = req->ctx;

        spin_lock_irqsave(&ctx->timeout_lock, irqflags);
        list_del_init(&req->timeout.list);
        /* read-then-set, not an atomic increment, as in the original */
        fired = atomic_read(&ctx->cq_timeouts) + 1;
        atomic_set(&ctx->cq_timeouts, fired);
        spin_unlock_irqrestore(&ctx->timeout_lock, irqflags);

        req->io_task_work.func = io_req_task_timeout;
        io_req_task_work_add(req);
        return HRTIMER_NORESTART;
}

/*
 * Find the request on ctx->timeout_list whose user_data equals @user_data,
 * cancel its hrtimer and unlink it from the list.
 *
 * Returns the request on success, ERR_PTR(-ENOENT) if no entry matches, or
 * ERR_PTR(-EALREADY) if hrtimer_try_to_cancel() returns -1 (the entry is
 * then left on the list).
 */
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
                                           __u64 user_data)
        __must_hold(&ctx->timeout_lock)
{
        struct io_kiocb *pos, *req = NULL;
        struct io_timeout_data *io;

        list_for_each_entry(pos, &ctx->timeout_list, timeout.list) {
                if (pos->user_data == user_data) {
                        req = pos;
                        break;
                }
        }
        if (!req)
                return ERR_PTR(-ENOENT);

        io = req->async_data;
        if (hrtimer_try_to_cancel(&io->timer) == -1)
                return ERR_PTR(-EALREADY);

        list_del_init(&req->timeout.list);
        return req;
}

/*
 * Cancel the pending timeout identified by @user_data: extract it with
 * io_timeout_extract(), mark it failed, fill a CQE with -ECANCELED and
 * drop the request via io_put_req_deferred().
 *
 * Returns 0 on success or the negative errno from io_timeout_extract().
 */
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
        __must_hold(&ctx->completion_lock)
        __must_hold(&ctx->timeout_lock)
{
        struct io_kiocb *req = io_timeout_extract(ctx, user_data);

        if (!IS_ERR(req)) {
                req_set_fail(req);
                io_fill_cqe_req(req, -ECANCELED, 0);
                io_put_req_deferred(req);
                return 0;
        }
        return PTR_ERR(req);
}

static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
{
        switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
        case IORING_TIMEOUT_BOOTTIME:
                return CLOCK_BOOTTIME;
        case IORING_TIMEOUT_REALTIME:
                return CLOCK_REALTIME;
        default:
                /* can't happen, vetted at prep time */
                WARN_ON_ONCE(1);
                fallthrough;
        case 0:
                return CLOCK_MONOTONIC;
        }
}

/*
 * Re-arm the linked timeout on ctx->ltimeout_list whose user_data equals
 * @user_data with a new expiry @ts and hrtimer @mode. The request stays on
 * the list; only its timer is cancelled, re-initialised and restarted with
 * io_link_timeout_fn as the callback.
 *
 * Returns 0 on success, -ENOENT if no entry matches, or -EALREADY if
 * hrtimer_try_to_cancel() returns -1.
 */
static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
                                    struct timespec64 *ts, enum hrtimer_mode mode)
        __must_hold(&ctx->timeout_lock)
{
        struct io_kiocb *pos, *req = NULL;
        struct io_timeout_data *io;

        list_for_each_entry(pos, &ctx->ltimeout_list, timeout.list) {
                if (pos->user_data == user_data) {
                        req = pos;
                        break;
                }
        }
        if (!req)
                return -ENOENT;

        io = req->async_data;
        if (hrtimer_try_to_cancel(&io->timer) == -1)
                return -EALREADY;

        hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
        io->timer.function = io_link_timeout_fn;
        hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
        return 0;
}

/*
 * Re-arm a regular timeout with a new expiry @ts and hrtimer @mode.
 *
 * The request is extracted with io_timeout_extract(), its sequence offset
 * is cleared (timeout.off = 0), it is appended to the tail of
 * ctx->timeout_list and its timer is restarted with io_timeout_fn.
 *
 * Returns 0 on success or the negative errno from io_timeout_extract().
 */
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
                             struct timespec64 *ts, enum hrtimer_mode mode)
        __must_hold(&ctx->timeout_lock)
{
        struct io_timeout_data *data;
        struct hrtimer *timer;
        struct io_kiocb *req;

        req = io_timeout_extract(ctx, user_data);
        if (IS_ERR(req))
                return PTR_ERR(req);

        /* noseq */
        req->timeout.off = 0;
        data = req->async_data;
        timer = &data->timer;

        list_add_tail(&req->timeout.list, &ctx->timeout_list);
        hrtimer_init(timer, io_timeout_get_clock(data), mode);
        timer->function = io_timeout_fn;
        hrtimer_start(timer, timespec64_to_ktime(*ts), mode);
        return 0;
}

/*
 * Validate an IORING_OP_TIMEOUT_REMOVE SQE and decode it into
 * req->timeout_rem.
 *
 * sqe->addr is the user_data of the timeout to act on. If any bit of
 * IORING_TIMEOUT_UPDATE_MASK is set in sqe->timeout_flags, the request is
 * an update and the new timespec is copied from the user pointer in
 * sqe->addr2; otherwise it is a plain removal and no flags are allowed.
 *
 * Returns 0 on success, -EINVAL for an IOPOLL ring, unsupported request
 * flags / SQE fields / timeout flags, or a timespec with a negative
 * tv_sec or tv_nsec, and -EFAULT if the timespec cannot be copied from
 * user space.
 */
static int io_timeout_remove_prep(struct io_kiocb *req,
                                  const struct io_uring_sqe *sqe)
{
        struct io_timeout_rem *tr = &req->timeout_rem;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
                return -EINVAL;

        tr->ltimeout = false;
        tr->addr = READ_ONCE(sqe->addr);
        tr->flags = READ_ONCE(sqe->timeout_flags);
        if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
                /* at most one clock source may be selected */
                if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
                        return -EINVAL;
                if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
                        tr->ltimeout = true;
                if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
                        return -EINVAL;
                if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
                        return -EFAULT;
                /*
                 * The timespec comes straight from user space and is later
                 * fed to timespec64_to_ktime(); reject negative fields here
                 * so that conversion never sees them.
                 */
                if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
                        return -EINVAL;
        } else if (tr->flags) {
                /* timeout removal doesn't support flags */
                return -EINVAL;
        }

        return 0;
}

/*
 * Pick the hrtimer mode for a timeout: absolute if IORING_TIMEOUT_ABS is
 * set in @flags, relative otherwise.
 */
static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
{
        if (flags & IORING_TIMEOUT_ABS)
                return HRTIMER_MODE_ABS;
        return HRTIMER_MODE_REL;
}

/*
 * Remove or update an existing timeout command.
 *
 * With IORING_TIMEOUT_UPDATE set in the prepared flags, the target timeout
 * (linked or regular, per timeout_rem.ltimeout) is re-armed under
 * ctx->timeout_lock. Without it, the target is cancelled with both
 * ctx->completion_lock and ctx->timeout_lock held.
 *
 * The result is posted as this request's completion; always returns 0.
 */
static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_timeout_rem *tr = &req->timeout_rem;
        struct io_ring_ctx *ctx = req->ctx;
        int ret;

        if (tr->flags & IORING_TIMEOUT_UPDATE) {
                enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);

                spin_lock_irq(&ctx->timeout_lock);
                ret = tr->ltimeout ?
                        io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode) :
                        io_timeout_update(ctx, tr->addr, &tr->ts, mode);
                spin_unlock_irq(&ctx->timeout_lock);
        } else {
                /* completion_lock is taken outside timeout_lock */
                spin_lock(&ctx->completion_lock);
                spin_lock_irq(&ctx->timeout_lock);
                ret = io_timeout_cancel(ctx, tr->addr);
                spin_unlock_irq(&ctx->timeout_lock);
                spin_unlock(&ctx->completion_lock);
        }

        if (ret < 0)
                req_set_fail(req);
        io_req_complete_post(req, ret, 0);
        return 0;
}

/*
 * Validate and prepare an IORING_OP_TIMEOUT or IORING_OP_LINK_TIMEOUT
 * request.
 *
 * @req:             request being prepared
 * @sqe:             submission entry; sqe->addr points at the user
 *                   timespec, sqe->off is the completion-count offset,
 *                   sqe->len must be 1
 * @is_timeout_link: true for IORING_OP_LINK_TIMEOUT; such a request must
 *                   not carry an offset and must follow a request in the
 *                   link currently being built
 *
 * Allocates async data if needed, copies the timespec from user space and
 * initialises the hrtimer. For a linked timeout, the previous request in
 * the link is recorded as its head and flagged REQ_F_ARM_LTIMEOUT.
 *
 * Returns 0 on success; -EINVAL for an IOPOLL ring, bad SQE fields or
 * flags, more than one clock, a negative timespec, or an invalid link
 * position; -ENOMEM if async data cannot be allocated; -EFAULT if the
 * timespec cannot be copied from user space.
 */
static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                           bool is_timeout_link)
{
        struct io_timeout_data *data;
        unsigned flags;
        u32 off = READ_ONCE(sqe->off);

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
            sqe->splice_fd_in)
                return -EINVAL;
        if (off && is_timeout_link)
                return -EINVAL;
        flags = READ_ONCE(sqe->timeout_flags);
        if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
                return -EINVAL;
        /* more than one clock specified is invalid, obviously */
        if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
                return -EINVAL;

        /* initialised once, before any of the failure returns below */
        INIT_LIST_HEAD(&req->timeout.list);
        req->timeout.off = off;
        if (unlikely(off && !req->ctx->off_timeout_used))
                req->ctx->off_timeout_used = true;

        if (!req->async_data && io_alloc_async_data(req))
                return -ENOMEM;

        data = req->async_data;
        data->req = req;
        data->flags = flags;

        if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
                return -EFAULT;
        /*
         * The timespec comes straight from user space and is later fed to
         * timespec64_to_ktime(); reject negative fields here so that
         * conversion never sees them.
         */
        if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
                return -EINVAL;

        data->mode = io_translate_timeout_mode(flags);
        hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);

        if (is_timeout_link) {
                struct io_submit_link *link = &req->ctx->submit_state.link;

                /* a linked timeout needs a preceding request to guard ... */
                if (!link->head)
                        return -EINVAL;
                /* ... and that request must not itself be a linked timeout */
                if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
                        return -EINVAL;
                req->timeout.head = link->last;
                link->last->flags |= REQ_F_ARM_LTIMEOUT;
        }
        return 0;
}

/*
 * Issue handler for IORING_OP_TIMEOUT: insert @req into ctx->timeout_list
 * and start its hrtimer with io_timeout_fn as the callback, all under
 * ctx->timeout_lock.
 *
 * Requests for which io_is_timeout_noseq() is true are added after the
 * current last entry. Others get a target sequence of
 * (cached_cq_tail - cq_timeouts) + off and are inserted by scanning the
 * list backwards. Always returns 0.
 */
static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_timeout_data *data = req->async_data;
        struct list_head *entry;
        u32 tail, off = req->timeout.off;

        spin_lock_irq(&ctx->timeout_lock);

        /*
         * sqe->off holds how many events that need to occur for this
         * timeout event to be satisfied. If it isn't set, then this is
         * a pure timeout request, sequence isn't used.
         */
        if (io_is_timeout_noseq(req)) {
                entry = ctx->timeout_list.prev;
                goto add;
        }

        /* u32 arithmetic: tail and target_seq wrap together */
        tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
        req->timeout.target_seq = tail + off;

        /* Update the last seq here in case io_flush_timeouts() hasn't.
         * This is safe because ->completion_lock is held, and submissions
         * and completions are never mixed in the same ->completion_lock section.
         *
         * NOTE(review): the lock taken in this function is ->timeout_lock,
         * not ->completion_lock; confirm which lock the statement above
         * relies on.
         */
        ctx->cq_last_tm_flush = tail;

        /*
         * Insertion sort, ensuring the first entry in the list is always
         * the one we need first.
         */
        list_for_each_prev(entry, &ctx->timeout_list) {
                struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
                                                  timeout.list);

                /* entries without a sequence are skipped when comparing */
                if (io_is_timeout_noseq(nxt))
                        continue;
                /* nxt.seq is behind @tail, otherwise would've been completed */
                if (off >= nxt->timeout.target_seq - tail)
                        break;
        }
add:
        /* list_add() places @req directly after @entry */
        list_add(&req->timeout.list, entry);
        data->timer.function = io_timeout_fn;
        hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
        spin_unlock_irq(&ctx->timeout_lock);
        return 0;
}

/* Match key handed to io_cancel_cb() through its opaque data pointer. */
struct io_cancel_data {
        /* ring the request must belong to (compared with req->ctx) */
        struct io_ring_ctx *ctx;
        /* user_data the request must carry (compared with req->user_data) */
        u64 user_data;
};

static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
        struct io_cancel_data *cd = data;

        return req->ctx == cd->ctx && req->user_data == cd->user_data;
}

/*
 * Try to cancel, in @tctx's io-wq, the work item belonging to @ctx whose
 * user_data equals @user_data.
 *
 * Returns 0 if the work was cancelled, -EALREADY if it is already running,
 * -ENOENT if it was not found or if @tctx has no io-wq.
 */
static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
                               struct io_ring_ctx *ctx)
{
        struct io_cancel_data match = { .ctx = ctx, .user_data = user_data, };
        enum io_wq_cancel cancel_ret;

        if (!tctx || !tctx->io_wq)
                return -ENOENT;

        cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &match, false);
        switch (cancel_ret) {
        case IO_WQ_CANCEL_RUNNING:
                return -EALREADY;
        case IO_WQ_CANCEL_NOTFOUND:
                return -ENOENT;
        case IO_WQ_CANCEL_OK:
                break;
        }

        return 0;
}

/*
 * Try to cancel the request identified by @sqe_addr on @req's ring, in
 * this order: the io-wq of req->task, then pending timeouts, then hashed
 * poll requests. The search stops at the first stage that returns anything
 * other than -ENOENT.
 *
 * Returns that stage's result; -ENOENT if no stage found a match.
 */
static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
{
        struct io_ring_ctx *ctx = req->ctx;
        int ret;

        WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);

        ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
        if (ret != -ENOENT)
                return ret;

        /* completion_lock covers both the timeout and the poll attempt */
        spin_lock(&ctx->completion_lock);

        spin_lock_irq(&ctx->timeout_lock);
        ret = io_timeout_cancel(ctx, sqe_addr);
        spin_unlock_irq(&ctx->timeout_lock);

        if (ret == -ENOENT)
                ret = io_poll_cancel(ctx, sqe_addr, false);

        spin_unlock(&ctx->completion_lock);
        return ret;
}

/*
 * Validate an IORING_OP_ASYNC_CANCEL SQE and record the target user_data
 * (sqe->addr) in req->cancel.addr.
 *
 * Returns 0 on success, -EINVAL for an IOPOLL ring, for fixed-file or
 * buffer-select requests, or when any unsupported SQE field is non-zero.
 */
static int io_async_cancel_prep(struct io_kiocb *req,
                                const struct io_uring_sqe *sqe)
{
        const unsigned int bad_req_flags = REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT;

        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(req->flags & bad_req_flags))
                return -EINVAL;
        if (sqe->ioprio || sqe->off || sqe->len ||
            sqe->cancel_flags || sqe->splice_fd_in)
                return -EINVAL;

        req->cancel.addr = READ_ONCE(sqe->addr);
        return 0;
}

static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        u64 sqe_addr = req->cancel.addr;
        struct io_tctx_node *node;
        int ret;

        ret = io_try_cancel_userdata(req, sqe_addr);
        if (ret != -ENOENT)
                goto done;

        /* slow path, try all io-wq's */
        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        ret = -ENOENT;
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                struct io_uring_task *tctx = node->task->io_uring;

                ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
                if (ret != -ENOENT)
                        break;
        }
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
done:
        if (ret < 0)
                req_set_fail(req);
        io_req_complete_post(req, ret, 0);
        return 0;
}

/*
 * Validate an IORING_OP_FILES_UPDATE SQE and decode it into
 * req->rsrc_update: sqe->off is the offset, sqe->len the number of
 * entries (must be non-zero) and sqe->addr the argument address.
 *
 * Returns 0 on success, -EINVAL for fixed-file or buffer-select requests,
 * unsupported SQE fields, or a zero entry count.
 */
static int io_rsrc_update_prep(struct io_kiocb *req,
                                const struct io_uring_sqe *sqe)
{
        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)) ||
            sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        req->rsrc_update.offset = READ_ONCE(sqe->off);
        req->rsrc_update.nr_args = READ_ONCE(sqe->len);
        if (req->rsrc_update.nr_args == 0)
                return -EINVAL;

        req->rsrc_update.arg = READ_ONCE(sqe->addr);
        return 0;
}

static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_uring_rsrc_update2 up;
        int ret;

        up.offset = req->rsrc_update.offset;
        up.data = req->rsrc_update.arg;
        up.nr = 0;
        up.tags = 0;
        up.resv = 0;
        up.resv2 = 0;

        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
                                        &up, req->rsrc_update.nr_args);
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));

        if (ret < 0)
                req_set_fail(req);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}

/*
 * Dispatch SQE preparation to the helper for req->opcode.
 *
 * Returns the helper's result. IORING_OP_NOP needs no preparation and
 * returns 0. An opcode with no case here is logged once and rejected with
 * -EINVAL.
 */
static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        switch (req->opcode) {
        case IORING_OP_NOP:
                return 0;
        /* all read variants share one prep helper, likewise for writes */
        case IORING_OP_READV:
        case IORING_OP_READ_FIXED:
        case IORING_OP_READ:
                return io_read_prep(req, sqe);
        case IORING_OP_WRITEV:
        case IORING_OP_WRITE_FIXED:
        case IORING_OP_WRITE:
                return io_write_prep(req, sqe);
        case IORING_OP_POLL_ADD:
                return io_poll_add_prep(req, sqe);
        /* POLL_REMOVE is prepared by the poll update helper */
        case IORING_OP_POLL_REMOVE:
                return io_poll_update_prep(req, sqe);
        case IORING_OP_FSYNC:
                return io_fsync_prep(req, sqe);
        case IORING_OP_SYNC_FILE_RANGE:
                return io_sfr_prep(req, sqe);
        /* SEND shares the sendmsg prep, RECV shares the recvmsg prep */
        case IORING_OP_SENDMSG:
        case IORING_OP_SEND:
                return io_sendmsg_prep(req, sqe);
        case IORING_OP_RECVMSG:
        case IORING_OP_RECV:
                return io_recvmsg_prep(req, sqe);
        case IORING_OP_CONNECT:
                return io_connect_prep(req, sqe);
        /* TIMEOUT and LINK_TIMEOUT differ only in the is_timeout_link flag */
        case IORING_OP_TIMEOUT:
                return io_timeout_prep(req, sqe, false);
        case IORING_OP_TIMEOUT_REMOVE:
                return io_timeout_remove_prep(req, sqe);
        case IORING_OP_ASYNC_CANCEL:
                return io_async_cancel_prep(req, sqe);
        case IORING_OP_LINK_TIMEOUT:
                return io_timeout_prep(req, sqe, true);
        case IORING_OP_ACCEPT:
                return io_accept_prep(req, sqe);
        case IORING_OP_FALLOCATE:
                return io_fallocate_prep(req, sqe);
        case IORING_OP_OPENAT:
                return io_openat_prep(req, sqe);
        case IORING_OP_CLOSE:
                return io_close_prep(req, sqe);
        case IORING_OP_FILES_UPDATE:
                return io_rsrc_update_prep(req, sqe);
        case IORING_OP_STATX:
                return io_statx_prep(req, sqe);
        case IORING_OP_FADVISE:
                return io_fadvise_prep(req, sqe);
        case IORING_OP_MADVISE:
                return io_madvise_prep(req, sqe);
        case IORING_OP_OPENAT2:
                return io_openat2_prep(req, sqe);
        case IORING_OP_EPOLL_CTL:
                return io_epoll_ctl_prep(req, sqe);
        case IORING_OP_SPLICE:
                return io_splice_prep(req, sqe);
        case IORING_OP_PROVIDE_BUFFERS:
                return io_provide_buffers_prep(req, sqe);
        case IORING_OP_REMOVE_BUFFERS:
                return io_remove_buffers_prep(req, sqe);
        case IORING_OP_TEE:
                return io_tee_prep(req, sqe);
        case IORING_OP_SHUTDOWN:
                return io_shutdown_prep(req, sqe);
        case IORING_OP_RENAMEAT:
                return io_renameat_prep(req, sqe);
        case IORING_OP_UNLINKAT:
                return io_unlinkat_prep(req, sqe);
        }

        /* no case matched: warn once and reject */
        printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
                        req->opcode);
        return -EINVAL;
}

/*
 * Set up async data for opcodes whose io_op_defs entry has
 * needs_async_setup set, then run the opcode's async prep helper.
 *
 * Returns 0 if the opcode needs no async setup, the helper's result
 * otherwise; -EFAULT if async data is unexpectedly already present or the
 * opcode has no async prep helper here; -EAGAIN if io_alloc_async_data()
 * fails.
 */
static int io_req_prep_async(struct io_kiocb *req)
{
        int ret;

        if (!io_op_defs[req->opcode].needs_async_setup)
                return 0;
        if (WARN_ON_ONCE(req->async_data))
                return -EFAULT;
        if (io_alloc_async_data(req))
                return -EAGAIN;

        switch (req->opcode) {
        case IORING_OP_READV:
                ret = io_rw_prep_async(req, READ);
                break;
        case IORING_OP_WRITEV:
                ret = io_rw_prep_async(req, WRITE);
                break;
        case IORING_OP_SENDMSG:
                ret = io_sendmsg_prep_async(req);
                break;
        case IORING_OP_RECVMSG:
                ret = io_recvmsg_prep_async(req);
                break;
        case IORING_OP_CONNECT:
                ret = io_connect_prep_async(req);
                break;
        default:
                printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
                            req->opcode);
                ret = -EFAULT;
                break;
        }
        return ret;
}

/*
 * Compute the sequence number for @req: ctx->cached_sq_head minus one for
 * every request in the link chain starting at @req (u32, wraps).
 */
static u32 io_get_sequence(struct io_kiocb *req)
{
        struct io_kiocb *pos;
        u32 seq = req->ctx->cached_sq_head;

        /* need original cached_sq_head, but it was increased for each req */
        io_for_each_link(pos, req)
                seq--;

        return seq;
}

/*
 * Decide whether @req has to wait behind earlier requests (drain handling)
 * and, if so, take it over.
 *
 * Returns true when the request was handled here: completed with failure,
 * queued to async work, or appended to ctx->defer_list. Returns false when
 * it does not need to be deferred.
 */
static bool io_drain_req(struct io_kiocb *req)
{
        struct io_kiocb *pos;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_defer_entry *de;
        int ret;
        u32 seq;

        /* already marked failed: complete it and stop */
        if (req->flags & REQ_F_FAIL) {
                io_req_complete_fail_submit(req);
                return true;
        }

        /*
         * If we need to drain a request in the middle of a link, drain the
         * head request and the next request/link after the current link.
         * Considering sequential execution of links, IOSQE_IO_DRAIN will be
         * maintained for every request of our link.
         */
        if (ctx->drain_next) {
                req->flags |= REQ_F_IO_DRAIN;
                ctx->drain_next = false;
        }
        /* not interested in head, start from the first linked */
        io_for_each_link(pos, req->link) {
                if (pos->flags & REQ_F_IO_DRAIN) {
                        ctx->drain_next = true;
                        req->flags |= REQ_F_IO_DRAIN;
                        break;
                }
        }

        /* Still need defer if there is pending req in defer list. */
        spin_lock(&ctx->completion_lock);
        if (likely(list_empty_careful(&ctx->defer_list) &&
                !(req->flags & REQ_F_IO_DRAIN))) {
                spin_unlock(&ctx->completion_lock);
                /* nothing deferred and no drain flag: leave drain mode */
                ctx->drain_active = false;
                return false;
        }
        spin_unlock(&ctx->completion_lock);

        seq = io_get_sequence(req);
        /* Still a chance to pass the sequence check */
        if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
                return false;

        ret = io_req_prep_async(req);
        if (ret)
                goto fail;
        io_prep_async_link(req);
        de = kmalloc(sizeof(*de), GFP_KERNEL);
        if (!de) {
                ret = -ENOMEM;
                /* also the target of the io_req_prep_async() failure above */
fail:
                io_req_complete_failed(req, ret);
                return true;
        }

        /* re-check under the lock; the condition may have changed meanwhile */
        spin_lock(&ctx->completion_lock);
        if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
                spin_unlock(&ctx->completion_lock);
                kfree(de);
                io_queue_async_work(req, NULL);
                return true;
        }

        trace_io_uring_defer(ctx, req, req->user_data);
        de->req = req;
        de->seq = seq;
        list_add_tail(&de->list, &ctx->defer_list);
        spin_unlock(&ctx->completion_lock);
        return true;
}

/*
 * Release the per-request resources indicated by req->flags, then clear
 * IO_REQ_CLEAN_FLAGS from req->flags.
 */
static void io_clean_op(struct io_kiocb *req)
{
        /* free the selected buffer stored by read / receive opcodes */
        if (req->flags & REQ_F_BUFFER_SELECTED) {
                switch (req->opcode) {
                case IORING_OP_READV:
                case IORING_OP_READ_FIXED:
                case IORING_OP_READ:
                        kfree((void *)(unsigned long)req->rw.addr);
                        break;
                case IORING_OP_RECVMSG:
                case IORING_OP_RECV:
                        kfree(req->sr_msg.kbuf);
                        break;
                }
        }

        /* opcode-specific cleanup of state hanging off the request */
        if (req->flags & REQ_F_NEED_CLEANUP) {
                switch (req->opcode) {
                case IORING_OP_READV:
                case IORING_OP_READ_FIXED:
                case IORING_OP_READ:
                case IORING_OP_WRITEV:
                case IORING_OP_WRITE_FIXED:
                case IORING_OP_WRITE: {
                        struct io_async_rw *io = req->async_data;

                        kfree(io->free_iovec);
                        break;
                        }
                case IORING_OP_RECVMSG:
                case IORING_OP_SENDMSG: {
                        struct io_async_msghdr *io = req->async_data;

                        kfree(io->free_iov);
                        break;
                        }
                case IORING_OP_OPENAT:
                case IORING_OP_OPENAT2:
                        if (req->open.filename)
                                putname(req->open.filename);
                        break;
                case IORING_OP_RENAMEAT:
                        putname(req->rename.oldpath);
                        putname(req->rename.newpath);
                        break;
                case IORING_OP_UNLINKAT:
                        putname(req->unlink.filename);
                        break;
                }
        }
        /* free async poll state, including the optional double_poll entry */
        if ((req->flags & REQ_F_POLLED) && req->apoll) {
                kfree(req->apoll->double_poll);
                kfree(req->apoll);
                req->apoll = NULL;
        }
        /* drop this request from the owning task's inflight_tracked count */
        if (req->flags & REQ_F_INFLIGHT) {
                struct io_uring_task *tctx = req->task->io_uring;

                atomic_dec(&tctx->inflight_tracked);
        }
        /* drop the credentials reference held by the request */
        if (req->flags & REQ_F_CREDS)
                put_cred(req->creds);

        req->flags &= ~IO_REQ_CLEAN_FLAGS;
}

/*
 * Dispatch @req to the issue handler for req->opcode.
 *
 * If the request carries its own credentials (REQ_F_CREDS) and they differ
 * from the current ones, they are installed around the handler call and
 * reverted afterwards.
 *
 * Returns the handler's result if it is non-zero (-EINVAL for an opcode
 * with no case here). On a zero result, a request with a file on an
 * IORING_SETUP_IOPOLL ring is passed to io_iopoll_req_issued(), and 0 is
 * returned.
 */
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        const struct cred *creds = NULL;
        int ret;

        /* creds stays NULL unless an override was actually installed */
        if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
                creds = override_creds(req->creds);

        switch (req->opcode) {
        case IORING_OP_NOP:
                ret = io_nop(req, issue_flags);
                break;
        case IORING_OP_READV:
        case IORING_OP_READ_FIXED:
        case IORING_OP_READ:
                ret = io_read(req, issue_flags);
                break;
        case IORING_OP_WRITEV:
        case IORING_OP_WRITE_FIXED:
        case IORING_OP_WRITE:
                ret = io_write(req, issue_flags);
                break;
        case IORING_OP_FSYNC:
                ret = io_fsync(req, issue_flags);
                break;
        case IORING_OP_POLL_ADD:
                ret = io_poll_add(req, issue_flags);
                break;
        /* POLL_REMOVE is served by the poll update handler */
        case IORING_OP_POLL_REMOVE:
                ret = io_poll_update(req, issue_flags);
                break;
        case IORING_OP_SYNC_FILE_RANGE:
                ret = io_sync_file_range(req, issue_flags);
                break;
        case IORING_OP_SENDMSG:
                ret = io_sendmsg(req, issue_flags);
                break;
        case IORING_OP_SEND:
                ret = io_send(req, issue_flags);
                break;
        case IORING_OP_RECVMSG:
                ret = io_recvmsg(req, issue_flags);
                break;
        case IORING_OP_RECV:
                ret = io_recv(req, issue_flags);
                break;
        case IORING_OP_TIMEOUT:
                ret = io_timeout(req, issue_flags);
                break;
        case IORING_OP_TIMEOUT_REMOVE:
                ret = io_timeout_remove(req, issue_flags);
                break;
        case IORING_OP_ACCEPT:
                ret = io_accept(req, issue_flags);
                break;
        case IORING_OP_CONNECT:
                ret = io_connect(req, issue_flags);
                break;
        case IORING_OP_ASYNC_CANCEL:
                ret = io_async_cancel(req, issue_flags);
                break;
        case IORING_OP_FALLOCATE:
                ret = io_fallocate(req, issue_flags);
                break;
        case IORING_OP_OPENAT:
                ret = io_openat(req, issue_flags);
                break;
        case IORING_OP_CLOSE:
                ret = io_close(req, issue_flags);
                break;
        case IORING_OP_FILES_UPDATE:
                ret = io_files_update(req, issue_flags);
                break;
        case IORING_OP_STATX:
                ret = io_statx(req, issue_flags);
                break;
        case IORING_OP_FADVISE:
                ret = io_fadvise(req, issue_flags);
                break;
        case IORING_OP_MADVISE:
                ret = io_madvise(req, issue_flags);
                break;
        case IORING_OP_OPENAT2:
                ret = io_openat2(req, issue_flags);
                break;
        case IORING_OP_EPOLL_CTL:
                ret = io_epoll_ctl(req, issue_flags);
                break;
        case IORING_OP_SPLICE:
                ret = io_splice(req, issue_flags);
                break;
        case IORING_OP_PROVIDE_BUFFERS:
                ret = io_provide_buffers(req, issue_flags);
                break;
        case IORING_OP_REMOVE_BUFFERS:
                ret = io_remove_buffers(req, issue_flags);
                break;
        case IORING_OP_TEE:
                ret = io_tee(req, issue_flags);
                break;
        case IORING_OP_SHUTDOWN:
                ret = io_shutdown(req, issue_flags);
                break;
        case IORING_OP_RENAMEAT:
                ret = io_renameat(req, issue_flags);
                break;
        case IORING_OP_UNLINKAT:
                ret = io_unlinkat(req, issue_flags);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        /* restore the previous credentials before any return */
        if (creds)
                revert_creds(creds);
        if (ret)
                return ret;
        /* If the op doesn't have a file, we're not polling for it */
        if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
                io_iopoll_req_issued(req);

        return 0;
}

/*
 * Hand the request that embeds @work to io_put_req_find_next() and return
 * the work item of the request it reports back, or NULL if it reports none.
 */
static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{
        struct io_kiocb *next;

        next = io_put_req_find_next(container_of(work, struct io_kiocb, work));
        if (!next)
                return NULL;
        return &next->work;
}

/*
 * io-wq work handler: issue the request that embeds @work.
 *
 * Takes an extra request reference, arms a linked timeout if one was
 * prepared, and then issues the request with no issue flags (i.e. without
 * IO_URING_F_NONBLOCK). If the work item carries IO_WQ_WORK_CANCEL the
 * request is not issued and fails with -ECANCELED instead. On
 * IORING_SETUP_IOPOLL rings an -EAGAIN result is retried in a loop until the
 * issue returns something else, the worker is stopped, or the request has
 * REQ_F_NOWAIT set. Any non-zero final result is reported through
 * io_req_task_queue_fail().
 */
static void io_wq_submit_work(struct io_wq_work *work)
{
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
        struct io_kiocb *timeout;
        int ret = 0;

        /* one will be dropped by ->io_free_work() after returning to io-wq */
        if (!(req->flags & REQ_F_REFCOUNT))
                __io_req_set_refcount(req, 2);
        else
                req_ref_get(req);

        timeout = io_prep_linked_timeout(req);
        if (timeout)
                io_queue_linked_timeout(timeout);

        /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
        if (work->flags & IO_WQ_WORK_CANCEL)
                ret = -ECANCELED;

        if (!ret) {
                do {
                        ret = io_issue_sqe(req, 0);
                        /*
                         * We can get EAGAIN for polled IO even though we're
                         * forcing a sync submission from here, since we can't
                         * wait for request slots on the block side.
                         */
                        if (ret != -EAGAIN || !(req->ctx->flags & IORING_SETUP_IOPOLL))
                                break;
                        /* give up retrying once the worker has been told to stop */
                        if (io_wq_worker_stopped())
                                break;
                        /*
                         * If REQ_F_NOWAIT is set, then don't wait or retry with
                         * poll. -EAGAIN is final for that case.
                         */
                        if (req->flags & REQ_F_NOWAIT)
                                break;

                        /* yield between retries so the loop does not hog the CPU */
                        cond_resched();
                } while (1);
        }

        /* avoid locking problems by failing it from a clean context */
        if (ret)
                io_req_task_queue_fail(req, ret);
}

/* Return the address of entry @i in @table's files array (no bounds check). */
static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
                                                       unsigned i)
{
        return table->files + i;
}

/*
 * Return the file pointer stored in fixed-file slot @index of @ctx, with the
 * bits outside FFS_MASK stripped from the slot word.
 */
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
                                              int index)
{
        unsigned long file_ptr;

        file_ptr = io_fixed_file_slot(&ctx->file_table, index)->file_ptr;
        return (struct file *) (file_ptr & FFS_MASK);
}

/*
 * Store @file in @file_slot as a single word: the pointer value OR-ed with
 * FFS_ASYNC_READ / FFS_ASYNC_WRITE (when __io_file_supports_nowait() says so
 * for READ / WRITE) and FFS_ISREG (when the inode is a regular file).
 */
static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
{
        unsigned long ffs_bits = 0;

        if (__io_file_supports_nowait(file, READ))
                ffs_bits |= FFS_ASYNC_READ;
        if (__io_file_supports_nowait(file, WRITE))
                ffs_bits |= FFS_ASYNC_WRITE;
        if (S_ISREG(file_inode(file)->i_mode))
                ffs_bits |= FFS_ISREG;

        file_slot->file_ptr = (unsigned long) file | ffs_bits;
}

/*
 * Resolve registered-file index @fd of @ctx to its struct file.
 *
 * The slot word read from ctx->file_table carries the file pointer in the
 * FFS_MASK bits and flag bits outside of it; those flag bits are shifted by
 * REQ_F_NOWAIT_READ_BIT and OR-ed into req->flags. Returns NULL when @fd is
 * not below ctx->nr_user_files, or when the slot holds no pointer.
 *
 * The lookup is bracketed by io_ring_submit_lock()/io_ring_submit_unlock(),
 * whose second argument is true only when IO_URING_F_NONBLOCK is absent from
 * @issue_flags.
 */
static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
                                             struct io_kiocb *req, int fd,
                                             unsigned int issue_flags)
{
        struct file *file = NULL;
        unsigned long file_ptr;

        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));

        /* the unsigned cast makes negative indices fail the range check too */
        if (unlikely((unsigned int)fd >= ctx->nr_user_files))
                goto out;
        /* clamp the index so it cannot be used out of range speculatively */
        fd = array_index_nospec(fd, ctx->nr_user_files);
        file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
        file = (struct file *) (file_ptr & FFS_MASK);
        /* keep only the flag bits stored alongside the pointer */
        file_ptr &= ~FFS_MASK;
        /* mask in overlapping REQ_F and FFS bits */
        req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
        io_req_set_rsrc_node(req);
out:
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        return file;
}

static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
                                       struct io_kiocb *req, int fd)
{
        struct file *file = fget(fd);

        trace_io_uring_file_get(ctx, fd);

        /* we don't allow fixed io_uring files */
        if (file && unlikely(file->f_op == &io_uring_fops))
                io_req_track_inflight(req);
        return file;
}

/*
 * Resolve @fd for @req: through the registered-file table when @fixed is
 * set, through the normal descriptor lookup otherwise. @issue_flags is only
 * consumed by the fixed-file path.
 */
static inline struct file *io_file_get(struct io_ring_ctx *ctx,
                                       struct io_kiocb *req, int fd, bool fixed,
                                       unsigned int issue_flags)
{
        if (!fixed)
                return io_file_get_normal(ctx, req, fd);

        return io_file_get_fixed(ctx, req, fd, issue_flags);
}

/*
 * Task-work half of a fired linked timeout.
 *
 * With no target request recorded in req->timeout.prev the timeout simply
 * completes with -ETIME. Otherwise, unless the owning task has PF_EXITING
 * set, cancellation of the target is attempted by user_data; the timeout
 * then completes with the cancel result (-ETIME if that result was 0,
 * -ENOENT if no attempt was made) and the reference held on the target is
 * dropped.
 */
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
        struct io_kiocb *prev = req->timeout.prev;
        int ret;

        if (!prev) {
                io_req_complete_post(req, -ETIME, 0);
                return;
        }

        ret = -ENOENT;
        if (!(req->task->flags & PF_EXITING))
                ret = io_try_cancel_userdata(req, prev->user_data);
        if (!ret)
                ret = -ETIME;

        io_req_complete_post(req, ret, 0);
        io_put_req(prev);
}

/*
 * hrtimer callback for a linked timeout request.
 *
 * Under ctx->timeout_lock it detaches the timeout from the request it was
 * attached to (req->timeout.head), tries to take a reference on that
 * request, unlinks the timeout from the list it was queued on and records
 * the target (or NULL) in req->timeout.prev. The remaining work is deferred
 * to task work via io_req_task_link_timeout(). The timer is never re-armed.
 */
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
        struct io_timeout_data *data = container_of(timer,
                                                struct io_timeout_data, timer);
        struct io_kiocb *prev, *req = data->req;
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;

        spin_lock_irqsave(&ctx->timeout_lock, flags);
        prev = req->timeout.head;
        req->timeout.head = NULL;

        /*
         * We don't expect the list to be empty, that will only happen if we
         * race with the completion of the linked work.
         */
        if (prev) {
                io_remove_next_linked(prev);
                /* target is already on its way out if the ref cannot be taken */
                if (!req_ref_inc_not_zero(prev))
                        prev = NULL;
        }
        list_del(&req->timeout.list);
        /* hand the (possibly NULL) target over to the task-work handler */
        req->timeout.prev = prev;
        spin_unlock_irqrestore(&ctx->timeout_lock, flags);

        req->io_task_work.func = io_req_task_link_timeout;
        io_req_task_work_add(req);
        return HRTIMER_NORESTART;
}

/*
 * Arm the linked timeout @req.
 *
 * With ctx->timeout_lock held: if the timeout is still attached to its
 * target (req->timeout.head is set), install io_link_timeout_fn() as the
 * timer callback, start the hrtimer with the timespec and mode stored in
 * the request's async data, and add the request to ctx->ltimeout_list.
 * In every case one reference on @req is dropped before returning.
 */
static void io_queue_linked_timeout(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;

        spin_lock_irq(&ctx->timeout_lock);
        /*
         * If the back reference is NULL, then our linked request finished
         * before we got a chance to setup the timer
         */
        if (req->timeout.head) {
                struct io_timeout_data *data = req->async_data;

                data->timer.function = io_link_timeout_fn;
                hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
                                data->mode);
                list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
        }
        spin_unlock_irq(&ctx->timeout_lock);
        /* drop submission reference */
        io_put_req(req);
}

/*
 * Issue @req inline with IO_URING_F_NONBLOCK | IO_URING_F_COMPLETE_DEFER.
 *
 * Depending on what io_issue_sqe() returns:
 *  - 0: a request flagged REQ_F_COMPLETE_INLINE is appended to the submit
 *    state's compl_reqs[] batch, which is flushed once the array is full;
 *    otherwise a linked timeout, if one was prepared, is armed.
 *  - -EAGAIN and REQ_F_NOWAIT not set: try to arm a poll handler.
 *    IO_APOLL_READY arms the linked timeout and retries the issue at once,
 *    IO_APOLL_ABORTED hands the request to async work; for any other result
 *    only the linked timeout (if any) is armed here.
 *  - any other error: the request is failed with that error.
 */
static void __io_queue_sqe(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
{
        struct io_kiocb *linked_timeout;
        int ret;

issue_sqe:
        ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);

        /*
         * We async punt it if the file wasn't marked NOWAIT, or if the file
         * doesn't support non-blocking read/write attempts
         */
        if (likely(!ret)) {
                if (req->flags & REQ_F_COMPLETE_INLINE) {
                        struct io_ring_ctx *ctx = req->ctx;
                        struct io_submit_state *state = &ctx->submit_state;

                        /* batch the completion; flush when the batch is full */
                        state->compl_reqs[state->compl_nr++] = req;
                        if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
                                io_submit_flush_completions(ctx);
                        return;
                }

                linked_timeout = io_prep_linked_timeout(req);
                if (linked_timeout)
                        io_queue_linked_timeout(linked_timeout);
        } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
                linked_timeout = io_prep_linked_timeout(req);

                switch (io_arm_poll_handler(req)) {
                case IO_APOLL_READY:
                        /* arm the timeout before looping back for another attempt */
                        if (linked_timeout)
                                io_queue_linked_timeout(linked_timeout);
                        goto issue_sqe;
                case IO_APOLL_ABORTED:
                        /*
                         * Queued up for async execution, worker will release
                         * submit reference when the iocb is actually submitted.
                         */
                        io_queue_async_work(req, NULL);
                        break;
                }

                if (linked_timeout)
                        io_queue_linked_timeout(linked_timeout);
        } else {
                io_req_complete_failed(req, ret);
        }
}

/*
 * Queue @req for execution.
 *
 * If draining is active on the ring and io_drain_req() takes the request,
 * nothing else is done. Requests with neither REQ_F_FORCE_ASYNC nor
 * REQ_F_FAIL are issued inline. Requests marked REQ_F_FAIL are completed as
 * failed submissions. The remaining case (REQ_F_FORCE_ASYNC) is prepared for
 * async execution and handed to async work, or failed if that preparation
 * returns an error.
 */
static inline void io_queue_sqe(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
{
        int ret;

        if (unlikely(req->ctx->drain_active) && io_drain_req(req))
                return;

        if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
                __io_queue_sqe(req);
                return;
        }

        if (req->flags & REQ_F_FAIL) {
                io_req_complete_fail_submit(req);
                return;
        }

        ret = io_req_prep_async(req);
        if (unlikely(ret))
                io_req_complete_failed(req, ret);
        else
                io_queue_async_work(req, NULL);
}

/*
 * Validate an SQE against the ring's registered restrictions.
 *
 * An unrestricted ring accepts everything. On a restricted ring the opcode
 * must be set in the allowed-opcode bitmap, every required flag must be
 * present in @sqe_flags, and @sqe_flags must not contain anything outside
 * the allowed and required sets.
 *
 * Returns 'true' if the SQE is allowed, 'false' otherwise.
 */
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
                                        struct io_kiocb *req,
                                        unsigned int sqe_flags)
{
        if (likely(!ctx->restricted))
                return true;

        return test_bit(req->opcode, ctx->restrictions.sqe_op) &&
               (sqe_flags & ctx->restrictions.sqe_flags_required) ==
                        ctx->restrictions.sqe_flags_required &&
               !(sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
                               ctx->restrictions.sqe_flags_required));
}

/*
 * Initialise @req from the userspace-mapped @sqe.
 *
 * Copies opcode, flags and user_data out of the SQE with READ_ONCE(),
 * validates flags and opcode, applies ring restrictions, resolves an
 * optional personality (taking a reference on its creds), may start a block
 * plug, and looks up the file when the opcode needs one.
 *
 * Returns 0 on success, or:
 *   -EINVAL      unknown SQE flag bits, opcode >= IORING_OP_LAST, or a
 *                personality id with no registered creds
 *   -EACCES      rejected by io_check_restriction()
 *   -EOPNOTSUPP  IOSQE_BUFFER_SELECT on an opcode without buffer_select
 *   -EBADF       the file lookup returned NULL
 *
 * Note that state->ios_left is only decremented when the end of the function
 * is reached (success or -EBADF), not on the early error returns.
 */
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
                       const struct io_uring_sqe *sqe)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_state *state;
        unsigned int sqe_flags;
        int personality, ret = 0;

        /* req is partially pre-initialised, see io_preinit_req() */
        req->opcode = READ_ONCE(sqe->opcode);
        /* same numerical values with corresponding REQ_F_*, safe to copy */
        req->flags = sqe_flags = READ_ONCE(sqe->flags);
        req->user_data = READ_ONCE(sqe->user_data);
        req->file = NULL;
        req->fixed_rsrc_refs = NULL;
        req->task = current;

        /* enforce forwards compatibility on users */
        if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
                return -EINVAL;
        if (unlikely(req->opcode >= IORING_OP_LAST))
                return -EINVAL;
        if (!io_check_restriction(ctx, req, sqe_flags))
                return -EACCES;

        if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
            !io_op_defs[req->opcode].buffer_select)
                return -EOPNOTSUPP;
        /* a drain request switches the ring into drain-active mode */
        if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
                ctx->drain_active = true;

        /* a non-zero personality selects creds registered on this ring */
        personality = READ_ONCE(sqe->personality);
        if (personality) {
                req->creds = xa_load(&ctx->personalities, personality);
                if (!req->creds)
                        return -EINVAL;
                get_cred(req->creds);
                req->flags |= REQ_F_CREDS;
        }
        state = &ctx->submit_state;

        /*
         * Plug now if we have more than 1 IO left after this, and the target
         * is potentially a read/write to block based storage.
         */
        if (!state->plug_started && state->ios_left > 1 &&
            io_op_defs[req->opcode].plug) {
                blk_start_plug(&state->plug);
                state->plug_started = true;
        }

        if (io_op_defs[req->opcode].needs_file) {
                req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
                                        (sqe_flags & IOSQE_FIXED_FILE),
                                        IO_URING_F_NONBLOCK);
                if (unlikely(!req->file))
                        ret = -EBADF;
        }

        state->ios_left--;
        return ret;
}

/*
 * Initialise and prepare one request from @sqe, then either queue it or add
 * it to the link chain being assembled in ctx->submit_state.link.
 *
 * When init or prep fails:
 *  - inside an existing chain, the chain head is marked failed with
 *    -ECANCELED (unless it is already failed), the request itself is marked
 *    failed with the error, and it is still appended to the chain;
 *  - for a request that would start a new chain, the request is marked
 *    failed and still becomes the head;
 *  - for a plain request, it is completed as failed and the error is
 *    returned.
 *
 * Returns 0 in every case except the last one above.
 */
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
                         const struct io_uring_sqe *sqe)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_link *link = &ctx->submit_state.link;
        int ret;

        ret = io_init_req(ctx, req, sqe);
        if (unlikely(ret)) {
fail_req:
                /* fail even hard links since we don't submit */
                if (link->head) {
                        /*
                         * we can judge a link req is failed or cancelled by if
                         * REQ_F_FAIL is set, but the head is an exception since
                         * it may be set REQ_F_FAIL because of other req's failure
                         * so let's leverage req->result to distinguish if a head
                         * is set REQ_F_FAIL because of its failure or other req's
                         * failure so that we can set the correct ret code for it.
                         * init result here to avoid affecting the normal path.
                         */
                        if (!(link->head->flags & REQ_F_FAIL))
                                req_fail_link_node(link->head, -ECANCELED);
                } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
                        /*
                         * the current req is a normal req, we should return
                         * error and thus break the submission loop.
                         */
                        io_req_complete_failed(req, ret);
                        return ret;
                }
                req_fail_link_node(req, ret);
        } else {
                ret = io_req_prep(req, sqe);
                /* prep failures share the init failure handling above */
                if (unlikely(ret))
                        goto fail_req;
        }

        /* don't need @sqe from now on */
        trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
                                  req->flags, true,
                                  ctx->flags & IORING_SETUP_SQPOLL);

        /*
         * If we already have a head request, queue this one for async
         * submittal once the head completes. If we don't have a head but
         * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
         * submitted sync once the chain is complete. If none of those
         * conditions are true (normal request), then just queue it.
         */
        if (link->head) {
                struct io_kiocb *head = link->head;

                if (!(req->flags & REQ_F_FAIL)) {
                        ret = io_req_prep_async(req);
                        if (unlikely(ret)) {
                                req_fail_link_node(req, ret);
                                if (!(head->flags & REQ_F_FAIL))
                                        req_fail_link_node(head, -ECANCELED);
                        }
                }
                trace_io_uring_link(ctx, req, head);
                /* append to the tail of the chain */
                link->last->link = req;
                link->last = req;

                /* last request of a link, enqueue the link */
                if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
                        link->head = NULL;
                        io_queue_sqe(head);
                }
        } else {
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
                        link->head = req;
                        link->last = req;
                } else {
                        io_queue_sqe(req);
                }
        }

        return 0;
}

/*
 * Finish a submission batch: queue a link chain that was left open, flush
 * any completions still sitting in the batch array, and finish the block
 * plug if one was started.
 */
static void io_submit_state_end(struct io_submit_state *state,
                                struct io_ring_ctx *ctx)
{
        struct io_kiocb *pending_head = state->link.head;

        if (pending_head)
                io_queue_sqe(pending_head);

        if (state->compl_nr)
                io_submit_flush_completions(ctx);

        if (state->plug_started)
                blk_finish_plug(&state->plug);
}

/*
 * Reset the per-batch submission state before handling up to @max_ios
 * requests: no plug started, no link chain open, @max_ios IOs remaining.
 */
static void io_submit_state_start(struct io_submit_state *state,
                                  unsigned int max_ios)
{
        /* clearing the head is enough; link.last is set when a chain starts */
        state->link.head = NULL;
        state->ios_left = max_ios;
        state->plug_started = false;
}

/* Publish the kernel's cached SQ head to the ring shared with userspace. */
static void io_commit_sqring(struct io_ring_ctx *ctx)
{
        /*
         * Release semantics: all of our loads from the SQEs must be done
         * before the new head becomes visible, because from then on the
         * application is free to overwrite those entries.
         */
        smp_store_release(&ctx->rings->sq.head, ctx->cached_sq_head);
}

/*
 * Consume the next SQ array slot and return the SQE it points at, or NULL
 * if the slot holds an out-of-range index (in which case the entry is
 * dropped and accounted in sq_dropped).
 *
 * The returned pointer refers to memory that userspace has mapped, so the
 * contents can change at any time. Any field that is validated and then
 * used must be fetched with READ_ONCE() so that it is not re-loaded later.
 */
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
        /*
         * The cached sq head (or cq tail) serves two purposes:
         *
         * 1) allows us to batch the cost of updating the user visible
         *    head updates.
         * 2) allows the kernel side to track the head on its own, even
         *    though the application is the one updating it.
         */
        unsigned slot = ctx->cached_sq_head++ & (ctx->sq_entries - 1);
        unsigned sqe_idx = READ_ONCE(ctx->sq_array[slot]);

        if (unlikely(sqe_idx >= ctx->sq_entries)) {
                /* drop invalid entries */
                ctx->cq_extra--;
                WRITE_ONCE(ctx->rings->sq_dropped,
                           READ_ONCE(ctx->rings->sq_dropped) + 1);
                return NULL;
        }

        return &ctx->sq_sqes[sqe_idx];
}

/*
 * Submit up to @nr requests from the SQ ring.
 *
 * @nr is first clamped to the ring size and to the number of entries
 * currently available. References on ctx->refs and task references are
 * taken for the whole batch up front; whatever ends up unused is handed
 * back afterwards. The loop stops early when a request cannot be allocated,
 * when an SQ slot is invalid, or when io_submit_sqe() returns non-zero.
 *
 * Returns the number of SQEs consumed, or -EAGAIN if the ctx references
 * could not be taken or the very first request allocation failed.
 */
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
        __must_hold(&ctx->uring_lock)
{
        int submitted = 0;

        /* make sure SQ entry isn't read before tail */
        nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
        if (!percpu_ref_tryget_many(&ctx->refs, nr))
                return -EAGAIN;
        io_get_task_refs(nr);

        io_submit_state_start(&ctx->submit_state, nr);
        while (submitted < nr) {
                const struct io_uring_sqe *sqe;
                struct io_kiocb *req;

                req = io_alloc_req(ctx);
                if (unlikely(!req)) {
                        if (!submitted)
                                submitted = -EAGAIN;
                        break;
                }
                sqe = io_get_sqe(ctx);
                if (unlikely(!sqe)) {
                        /* no usable SQE: put the unused request back on the free list */
                        list_add(&req->inflight_entry, &ctx->submit_state.free_list);
                        break;
                }
                /* will complete beyond this point, count as submitted */
                submitted++;
                if (io_submit_sqe(ctx, req, sqe))
                        break;
        }

        if (unlikely(submitted != nr)) {
                /* -EAGAIN means nothing was consumed, so all refs are unused */
                int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
                int unused = nr - ref_used;

                current->io_uring->cached_refs += unused;
                percpu_ref_put_many(&ctx->refs, unused);
        }

        io_submit_state_end(&ctx->submit_state, ctx);
        /* Commit SQ ring head once we've consumed and submitted all SQEs */
        io_commit_sqring(ctx);

        return submitted;
}

/* True when any bit is set in the SQ data's state word. */
static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
{
        return READ_ONCE(sqd->state) != 0;
}

static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
        /* Tell userspace we may need a wakeup call */
        spin_lock(&ctx->completion_lock);
        WRITE_ONCE(ctx->rings->sq_flags,
                   ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
        spin_unlock(&ctx->completion_lock);
}

static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
{
        spin_lock(&ctx->completion_lock);
        WRITE_ONCE(ctx->rings->sq_flags,
                   ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
        spin_unlock(&ctx->completion_lock);
}

/*
 * Run one SQPOLL pass over @ctx: reap polled completions if the iopoll list
 * is non-empty and submit pending SQEs.
 *
 * @cap_entries limits the submission batch to IORING_SQPOLL_CAP_ENTRIES_VALUE.
 * The work is done with ctx->sq_creds in effect (overridden only when they
 * differ from the current creds) and with ctx->uring_lock held. Nothing is
 * submitted while the ctx refs are dying or the ring has
 * IORING_SETUP_R_DISABLED set.
 *
 * Returns whatever io_submit_sqes() returned, or 0 if it was not called.
 */
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
        unsigned int to_submit;
        int ret = 0;

        to_submit = io_sqring_entries(ctx);
        /* if we're handling multiple rings, cap submit size for fairness */
        if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
                to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;

        if (!list_empty(&ctx->iopoll_list) || to_submit) {
                unsigned nr_events = 0;
                const struct cred *creds = NULL;

                if (ctx->sq_creds != current_cred())
                        creds = override_creds(ctx->sq_creds);

                mutex_lock(&ctx->uring_lock);
                if (!list_empty(&ctx->iopoll_list))
                        io_do_iopoll(ctx, &nr_events, 0);

                /*
                 * Don't submit if refs are dying, good for io_uring_register(),
                 * but also it is relied upon by io_ring_exit_work()
                 */
                if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
                    !(ctx->flags & IORING_SETUP_R_DISABLED))
                        ret = io_submit_sqes(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);

                /* wake anyone sleeping on sqo_sq_wait after an attempt to drain the SQ */
                if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
                        wake_up(&ctx->sqo_sq_wait);
                if (creds)
                        revert_creds(creds);
        }

        return ret;
}

static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
        struct io_ring_ctx *ctx;
        unsigned sq_thread_idle = 0;

        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
        sqd->sq_thread_idle = sq_thread_idle;
}

/*
 * Handle a park request or a pending signal for the SQPOLL thread.
 *
 * Expects sqd->lock to be held on entry; when there is a park request or a
 * pending signal the lock is dropped, a pending signal is fetched with
 * get_signal(), the thread yields, and the lock is taken again.
 *
 * Returns true when the thread should exit its loop: either get_signal()
 * returned true or IO_SQ_THREAD_SHOULD_STOP is set.
 */
static bool io_sqd_handle_event(struct io_sq_data *sqd)
{
        bool did_sig = false;
        struct ksignal ksig;

        if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
            signal_pending(current)) {
                /* release the lock so whoever asked us to park can take it */
                mutex_unlock(&sqd->lock);
                if (signal_pending(current))
                        did_sig = get_signal(&ksig);
                cond_resched();
                mutex_lock(&sqd->lock);
        }
        return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
}

/*
 * Main function of the SQPOLL kernel thread; @data is the struct io_sq_data
 * shared by the rings this thread serves.
 *
 * After naming itself and setting its CPU affinity, the thread loops with
 * sqd->lock held: it handles park/stop/signal events, runs one
 * __io_sq_thread() pass per attached ring, and runs task work. While any of
 * that found work, or the idle period has not expired, it keeps spinning.
 * Otherwise it sets the NEED_WAKEUP flag on the rings and, if nothing new
 * showed up in the meantime, sleeps with sqd->lock released.
 *
 * On exit it cancels outstanding requests, clears sqd->thread, leaves
 * NEED_WAKEUP set on every ring, signals sqd->exited and terminates.
 */
static int io_sq_thread(void *data)
{
        struct io_sq_data *sqd = data;
        struct io_ring_ctx *ctx;
        unsigned long timeout = 0;
        char buf[TASK_COMM_LEN];
        DEFINE_WAIT(wait);

        snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
        set_task_comm(current, buf);

        /* sq_cpu == -1 means no specific CPU was requested */
        if (sqd->sq_cpu != -1)
                set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
        else
                set_cpus_allowed_ptr(current, cpu_online_mask);
        current->flags |= PF_NO_SETAFFINITY;

        mutex_lock(&sqd->lock);
        while (1) {
                bool cap_entries, sqt_spin = false;

                if (io_sqd_events_pending(sqd) || signal_pending(current)) {
                        if (io_sqd_handle_event(sqd))
                                break;
                        /* restart the idle period after handling an event */
                        timeout = jiffies + sqd->sq_thread_idle;
                }

                /* cap per-ring batches only when more than one ring is attached */
                cap_entries = !list_is_singular(&sqd->ctx_list);
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                        int ret = __io_sq_thread(ctx, cap_entries);

                        if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
                                sqt_spin = true;
                }
                if (io_run_task_work())
                        sqt_spin = true;

                /* keep polling while work was found or the idle period is running */
                if (sqt_spin || !time_after(jiffies, timeout)) {
                        cond_resched();
                        if (sqt_spin)
                                timeout = jiffies + sqd->sq_thread_idle;
                        continue;
                }

                prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
                if (!io_sqd_events_pending(sqd) && !current->task_works) {
                        bool needs_sched = true;

                        /*
                         * Advertise NEED_WAKEUP first, then re-check each ring
                         * for pending polled IO or new SQEs before sleeping.
                         */
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                                io_ring_set_wakeup_flag(ctx);

                                if ((ctx->flags & IORING_SETUP_IOPOLL) &&
                                    !list_empty_careful(&ctx->iopoll_list)) {
                                        needs_sched = false;
                                        break;
                                }
                                if (io_sqring_entries(ctx)) {
                                        needs_sched = false;
                                        break;
                                }
                        }

                        if (needs_sched) {
                                /* sleep without holding sqd->lock */
                                mutex_unlock(&sqd->lock);
                                schedule();
                                mutex_lock(&sqd->lock);
                        }
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_clear_wakeup_flag(ctx);
                }

                finish_wait(&sqd->wait, &wait);
                timeout = jiffies + sqd->sq_thread_idle;
        }

        /* thread is going away: cancel requests and leave NEED_WAKEUP set */
        io_uring_cancel_generic(true, sqd);
        sqd->thread = NULL;
        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                io_ring_set_wakeup_flag(ctx);
        io_run_task_work();
        mutex_unlock(&sqd->lock);

        complete(&sqd->exited);
        do_exit(0);
}

/*
 * Per-waiter state used by io_cqring_wait(); the embedded wait queue entry
 * is what gets queued on ctx->cq_wait.
 */
struct io_wait_queue {
        /* wait queue entry; its wake callback is io_wake_function() */
        struct wait_queue_entry wq;
        /* ring being waited on */
        struct io_ring_ctx *ctx;
        /* CQ head at the start of the wait plus the requested min_events */
        unsigned cq_tail;
        /* snapshot of ctx->cq_timeouts taken when the wait started */
        unsigned nr_timeouts;
};

static inline bool io_should_wake(struct io_wait_queue *iowq)
{
        struct io_ring_ctx *ctx = iowq->ctx;
        int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;

        /*
         * Wake up if we have enough events, or if a timeout occurred since we
         * started waiting. For timeouts, we always want to return to userspace,
         * regardless of event count.
         */
        return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}

/*
 * Wake callback for CQ waiters. The waiter is woken (and removed from the
 * wait queue) when io_should_wake() says so or when bit 0 of
 * check_cq_overflow is set; otherwise -1 is returned.
 */
static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
                            int wake_flags, void *key)
{
        struct io_wait_queue *iowq;

        iowq = container_of(curr, struct io_wait_queue, wq);

        /*
         * Overflowed CQEs cannot be flushed safely from here, so just make
         * sure the task wakes up; its next invocation will do the flush.
         */
        if (!io_should_wake(iowq) && !test_bit(0, &iowq->ctx->check_cq_overflow))
                return -1;

        return autoremove_wake_function(curr, mode, wake_flags, key);
}

/*
 * Run pending task work, then check for signals.
 *
 * Returns 1 if task work was run, 0 if there was none and no signal is
 * pending, -ERESTARTSYS if a signal is pending and TIF_NOTIFY_SIGNAL is set,
 * and -EINTR for any other pending signal.
 */
static int io_run_task_work_sig(void)
{
        if (io_run_task_work())
                return 1;

        if (signal_pending(current))
                return test_thread_flag(TIF_NOTIFY_SIGNAL) ? -ERESTARTSYS : -EINTR;

        return 0;
}

/*
 * True when the current task has io_uring task state and its inflight
 * counter reads as positive.
 */
static bool current_pending_io(void)
{
        struct io_uring_task *tctx = current->io_uring;

        return tctx && percpu_counter_read_positive(&tctx->inflight);
}

/*
 * One sleep attempt for a CQ waiter.
 *
 * Return value:
 *   > 0     the caller should retry (task work ran, an overflow flush is
 *           needed, or the sleep ended before the timeout expired)
 *   0       the wake condition is already satisfied
 *   -ETIME  schedule_hrtimeout() reported that the timeout expired
 *   other   negative error from io_run_task_work_sig() (signal pending)
 */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
                                          struct io_wait_queue *iowq,
                                          ktime_t *timeout)
{
        int ret;

        /* make sure we run task_work before checking for signals */
        ret = io_run_task_work_sig();
        if (ret || io_should_wake(iowq))
                return ret;
        /* let the caller flush overflows, retry */
        if (test_bit(0, &ctx->check_cq_overflow))
                return 1;

        /*
         * Mark us as being in io_wait if we have pending requests, so cpufreq
         * can take into account that the task is waiting for IO - turns out
         * to be important for low QD IO.
         */
        if (current_pending_io())
                current->in_iowait = 1;
        ret = 1;
        /* @timeout is an absolute expiry time (HRTIMER_MODE_ABS) */
        if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
                ret = -ETIME;
        current->in_iowait = 0;
        return ret;
}

/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 *
 * @min_events: number of completions to wait for
 * @sig/@sigsz: optional user signal mask to install for the duration of the
 *              wait (the compat variant is used for compat syscalls)
 * @uts:        optional user timespec; it is added to the current time to
 *              form the absolute deadline
 *
 * Returns 0 right away if enough events are already present. Otherwise
 * returns -EFAULT if @uts cannot be copied, the error from installing the
 * signal mask, or the outcome of the wait loop: -EBUSY if overflowed CQEs
 * could not be flushed, -ETIME on timeout, or a signal-related error. The
 * loop result is turned into 0 whenever the CQ ring is non-empty at the end.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                          const sigset_t __user *sig, size_t sigsz,
                          struct __kernel_timespec __user *uts)
{
        struct io_wait_queue iowq;
        struct io_rings *rings = ctx->rings;
        ktime_t timeout = KTIME_MAX;
        int ret;

        /* flush overflow and run task work until enough events or no more work */
        do {
                io_cqring_overflow_flush(ctx);
                if (io_cqring_events(ctx) >= min_events)
                        return 0;
                if (!io_run_task_work())
                        break;
        } while (1);

        if (uts) {
                struct timespec64 ts;

                if (get_timespec64(&ts, uts))
                        return -EFAULT;
                /* convert the relative timeout into an absolute deadline */
                timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
        }

        if (sig) {
#ifdef CONFIG_COMPAT
                if (in_compat_syscall())
                        ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
                                                      sigsz);
                else
#endif
                        ret = set_user_sigmask(sig, sigsz);

                if (ret)
                        return ret;
        }

        init_waitqueue_func_entry(&iowq.wq, io_wake_function);
        iowq.wq.private = current;
        INIT_LIST_HEAD(&iowq.wq.entry);
        iowq.ctx = ctx;
        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
        /* target tail: the current CQ head plus the number of events wanted */
        iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;

        trace_io_uring_cqring_wait(ctx, min_events);
        do {
                /* if we can't even flush overflow, don't wait for more */
                if (!io_cqring_overflow_flush(ctx)) {
                        ret = -EBUSY;
                        break;
                }
                prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
                                                TASK_INTERRUPTIBLE);
                ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
                finish_wait(&ctx->cq_wait, &iowq.wq);
                cond_resched();
        } while (ret > 0);

        restore_saved_sigmask_unless(ret == -EINTR);

        /* report success if there is anything to reap, whatever ended the wait */
        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}

/*
 * Free a two-level table as built by io_alloc_page_table(): every chunk
 * first, then the array of chunk pointers. @size is the total byte size the
 * table was created for and determines the number of chunks.
 */
static void io_free_page_table(void **table, size_t size)
{
        unsigned nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
        unsigned i = 0;

        while (i < nr_tables)
                kfree(table[i++]);
        kfree(table);
}

/*
 * Allocate a zeroed two-level table able to hold @size bytes, split into
 * chunks of at most PAGE_SIZE each (the last chunk may be shorter).
 *
 * Returns the array of chunk pointers, or NULL if any allocation fails.
 */
static void **io_alloc_page_table(size_t size)
{
        unsigned int chunks = DIV_ROUND_UP(size, PAGE_SIZE);
        size_t remaining = size;
        unsigned int idx;
        void **table;

        table = kcalloc(chunks, sizeof(*table), GFP_KERNEL_ACCOUNT);
        if (!table)
                return NULL;

        for (idx = 0; idx < chunks; idx++) {
                unsigned int chunk_len = min_t(size_t, remaining, PAGE_SIZE);

                table[idx] = kzalloc(chunk_len, GFP_KERNEL_ACCOUNT);
                if (!table[idx])
                        goto err_free;
                remaining -= chunk_len;
        }
        return table;

err_free:
        /* slots not filled yet are still NULL from kcalloc() */
        io_free_page_table(table, size);
        return NULL;
}

/*
 * Tear down a resource node: release the state of its percpu reference
 * counter, then free the node itself.
 */
static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
        percpu_ref_exit(&ref_node->refs);
        kfree(ref_node);
}

/*
 * Release callback of an io_rsrc_node's percpu ref (it is the callback
 * passed to percpu_ref_init() in io_rsrc_node_alloc()).
 *
 * Marks the node as done and then, under ctx->rsrc_ref_lock, moves every
 * done node found at the head of ctx->rsrc_ref_list onto
 * ctx->rsrc_put_llist. The walk stops at the first node that is not done,
 * so nodes are handed over strictly in list order. If one of the
 * llist_add() calls reports that the llist was empty before, the delayed
 * put work is (re)scheduled.
 */
static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
        struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
        struct io_ring_ctx *ctx = node->rsrc_data->ctx;
        unsigned long flags;
        bool first_add = false;
        /* default: run the put work HZ jiffies (one second) from now */
        unsigned long delay = HZ;

        spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
        node->done = true;

        /* if we are mid-quiesce then do not delay */
        if (node->rsrc_data->quiesce)
                delay = 0;

        /* from here on 'node' is reused as the list cursor */
        while (!list_empty(&ctx->rsrc_ref_list)) {
                node = list_first_entry(&ctx->rsrc_ref_list,
                                            struct io_rsrc_node, node);
                /* recycle ref nodes in order */
                if (!node->done)
                        break;
                list_del(&node->node);
                /* llist_add() returns true if the llist was empty before */
                first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
        }
        spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

        if (first_add)
                mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

/*
 * Allocate a fresh resource node with an initialised percpu ref (released
 * through io_rsrc_node_ref_zero()), empty list heads and ->done cleared.
 * @ctx is not used by this function.
 *
 * Returns the node, or NULL if either allocation step fails.
 */
static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
        struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
        int err;

        if (!node)
                return NULL;

        err = percpu_ref_init(&node->refs, io_rsrc_node_ref_zero, 0,
                              GFP_KERNEL);
        if (err) {
                kfree(node);
                return NULL;
        }

        node->done = false;
        INIT_LIST_HEAD(&node->rsrc_list);
        INIT_LIST_HEAD(&node->node);
        return node;
}

/*
 * Make ctx->rsrc_backup_node the current ctx->rsrc_node when there is no
 * current node.
 *
 * If @data_to_kill is non-NULL the current node is retired first: it is
 * bound to @data_to_kill, appended to ctx->rsrc_ref_list under
 * ctx->rsrc_ref_lock, a reference is taken on @data_to_kill->refs and the
 * node's percpu ref is killed. The backup node then takes its place.
 *
 * A backup node is expected to exist already (io_rsrc_node_switch_start()
 * allocates one); a missing one only triggers a one-time warning.
 */
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
                                struct io_rsrc_data *data_to_kill)
{
        WARN_ON_ONCE(!ctx->rsrc_backup_node);
        WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

        if (data_to_kill) {
                struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

                rsrc_node->rsrc_data = data_to_kill;
                spin_lock_irq(&ctx->rsrc_ref_lock);
                list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
                spin_unlock_irq(&ctx->rsrc_ref_lock);

                /* this reference is dropped again in __io_rsrc_put_work() */
                atomic_inc(&data_to_kill->refs);
                percpu_ref_kill(&rsrc_node->refs);
                ctx->rsrc_node = NULL;
        }

        /* promote the backup node; the backup slot becomes empty */
        if (!ctx->rsrc_node) {
                ctx->rsrc_node = ctx->rsrc_backup_node;
                ctx->rsrc_backup_node = NULL;
        }
}

/*
 * Ensure ctx->rsrc_backup_node is populated, allocating a node if the
 * slot is empty.
 *
 * Returns 0 when a backup node is available, -ENOMEM if allocation failed.
 */
static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
        if (!ctx->rsrc_backup_node) {
                ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
                if (!ctx->rsrc_backup_node)
                        return -ENOMEM;
        }
        return 0;
}

/*
 * Wait until @data is no longer referenced by any resource node.
 *
 * Entered with ctx->uring_lock held. The lock is released while waiting
 * and is held again on every return path. data->quiesce is set for the
 * duration of the call so that a second, concurrent quiesce of the same
 * data bails out with -ENXIO.
 *
 * Each round retires the current node against @data, drops the initial
 * reference on data->refs and, unless that already hit zero, waits
 * (interruptibly) for data->done. If the wait does not end in a quiesced
 * state, the initial reference is restored, pending put work is flushed,
 * the completion is re-armed and task work is run before trying again.
 *
 * Returns 0 once quiesced, -ENXIO if a quiesce is already running, the
 * error from io_rsrc_node_switch_start(), or the negative value returned
 * by io_run_task_work_sig().
 */
static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
{
        int ret;

        /* As we may drop ->uring_lock, other task may have started quiesce */
        if (data->quiesce)
                return -ENXIO;

        data->quiesce = true;
        do {
                /* make sure a backup node exists before switching */
                ret = io_rsrc_node_switch_start(ctx);
                if (ret)
                        break;
                io_rsrc_node_switch(ctx, data);

                /* kill initial ref, already quiesced if zero */
                if (atomic_dec_and_test(&data->refs))
                        break;
                mutex_unlock(&ctx->uring_lock);
                flush_delayed_work(&ctx->rsrc_put_work);
                ret = wait_for_completion_interruptible(&data->done);
                if (!ret) {
                        mutex_lock(&ctx->uring_lock);
                        if (atomic_read(&data->refs) > 0) {
                                /*
                                 * it has been revived by another thread while
                                 * we were unlocked
                                 */
                                mutex_unlock(&ctx->uring_lock);
                        } else {
                                /* quiesced; leave with ->uring_lock held */
                                break;
                        }
                }

                /* not quiesced: put the initial reference back and retry */
                atomic_inc(&data->refs);
                /* wait for all works potentially completing data->done */
                flush_delayed_work(&ctx->rsrc_put_work);
                reinit_completion(&data->done);

                /* a negative result here ends the loop and is returned */
                ret = io_run_task_work_sig();
                mutex_lock(&ctx->uring_lock);
        } while (ret >= 0);
        data->quiesce = false;

        return ret;
}

/*
 * Return the address of the tag belonging to resource index @idx. Tags
 * live in a two-level table: the high bits of @idx select the chunk, the
 * low bits (IO_RSRC_TAG_TABLE_MASK) select the entry inside it.
 */
static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
{
        unsigned int chunk_no = idx >> IO_RSRC_TAG_TABLE_SHIFT;
        unsigned int slot = idx & IO_RSRC_TAG_TABLE_MASK;
        u64 *chunk = data->tags[chunk_no];

        return chunk + slot;
}

/*
 * Free an io_rsrc_data together with its tag table (if one is attached).
 * @data itself must not be NULL; it is dereferenced unconditionally.
 */
static void io_rsrc_data_free(struct io_rsrc_data *data)
{
        if (data->tags) {
                size_t tag_bytes = data->nr * sizeof(data->tags[0][0]);

                io_free_page_table((void **)data->tags, tag_bytes);
        }
        kfree(data);
}

/*
 * Allocate and initialise an io_rsrc_data for @nr resources.
 *
 * @ctx:    ring the data belongs to
 * @do_put: callback stored in ->do_put
 * @utags:  optional user array of @nr u64 tags; when NULL all tags stay 0
 * @nr:     number of resources / tag slots
 * @pdata:  receives the new object on success only
 *
 * The object starts with one reference in ->refs.
 *
 * Returns 0, -ENOMEM on allocation failure, or -EFAULT if a tag could not
 * be copied from user memory.
 */
static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
                              u64 __user *utags, unsigned nr,
                              struct io_rsrc_data **pdata)
{
        struct io_rsrc_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
        unsigned int idx;

        if (!data)
                return -ENOMEM;

        data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
        if (!data->tags) {
                kfree(data);
                return -ENOMEM;
        }

        data->ctx = ctx;
        data->do_put = do_put;
        data->nr = nr;

        /* import the user supplied tags, if any */
        for (idx = 0; utags && idx < nr; idx++) {
                u64 *slot = io_get_tag_slot(data, idx);

                if (copy_from_user(slot, &utags[idx], sizeof(*slot))) {
                        io_rsrc_data_free(data);
                        return -EFAULT;
                }
        }

        init_completion(&data->done);
        atomic_set(&data->refs, 1);
        *pdata = data;
        return 0;
}

static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
        table->files = kvcalloc(nr_files, sizeof(table->files[0]),
                                GFP_KERNEL_ACCOUNT);
        return !!table->files;
}

/* Free the slot array of @table and reset the pointer to NULL. */
static void io_free_file_tables(struct io_file_table *table)
{
        void *slots = table->files;

        kvfree(slots);
        table->files = NULL;
}

/*
 * Drop the reference on every file still present in the fixed-file table,
 * then free the table and its io_rsrc_data and reset the ring's file
 * registration state.
 */
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
        int idx = 0;

        while (idx < ctx->nr_user_files) {
                struct file *file = io_file_from_index(ctx, idx);

                /* empty slots yield NULL and are skipped */
                if (file)
                        fput(file);
                idx++;
        }

        io_free_file_tables(&ctx->file_table);
        io_rsrc_data_free(ctx->file_data);
        ctx->file_data = NULL;
        ctx->nr_user_files = 0;
}

/*
 * Unregister the fixed-file table of @ctx.
 *
 * Returns -ENXIO if no table is registered, the error from
 * io_rsrc_ref_quiesce() if quiescing fails (the table then stays
 * registered), or 0 after the table has been torn down.
 */
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
        unsigned int saved_nr = ctx->nr_user_files;
        int ret;

        if (!ctx->file_data)
                return -ENXIO;

        /*
         * The quiesce helper may release ->uring_lock. Advertise an empty
         * table for that window so new requests cannot pick up a slot,
         * and put the real count back afterwards.
         */
        ctx->nr_user_files = 0;
        ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
        ctx->nr_user_files = saved_nr;
        if (ret)
                return ret;

        __io_sqe_files_unregister(ctx);
        return 0;
}

/*
 * Counterpart of io_sq_thread_park(): withdraw one park request and release
 * sqd->lock, which the caller holds (see the __releases annotation).
 *
 * Must not be called by the SQ thread itself (warns once if it is).
 */
static void io_sq_thread_unpark(struct io_sq_data *sqd)
        __releases(&sqd->lock)
{
        WARN_ON_ONCE(sqd->thread == current);

        /*
         * Do the dance but not conditional clear_bit() because it'd race with
         * other threads incrementing park_pending and setting the bit.
         */
        clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        /* other park requests are still pending: keep the bit set */
        if (atomic_dec_return(&sqd->park_pending))
                set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        mutex_unlock(&sqd->lock);
}

/*
 * Request the SQ thread to park: bump park_pending, set
 * IO_SQ_THREAD_SHOULD_PARK, take sqd->lock and wake the thread if one
 * exists. Returns with sqd->lock held (see the __acquires annotation);
 * io_sq_thread_unpark() releases it again.
 *
 * Must not be called by the SQ thread itself (warns once if it is).
 */
static void io_sq_thread_park(struct io_sq_data *sqd)
        __acquires(&sqd->lock)
{
        WARN_ON_ONCE(sqd->thread == current);

        atomic_inc(&sqd->park_pending);
        set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        mutex_lock(&sqd->lock);
        if (sqd->thread)
                wake_up_process(sqd->thread);
}

/*
 * Ask the SQ thread to stop and wait until sqd->exited is completed.
 *
 * Sets IO_SQ_THREAD_SHOULD_STOP, wakes the thread (if any) under sqd->lock
 * and then blocks on the ->exited completion. Warns once if called by the
 * SQ thread itself or if the stop bit was already set.
 */
static void io_sq_thread_stop(struct io_sq_data *sqd)
{
        WARN_ON_ONCE(sqd->thread == current);
        WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));

        set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
        mutex_lock(&sqd->lock);
        if (sqd->thread)
                wake_up_process(sqd->thread);
        mutex_unlock(&sqd->lock);
        /* uninterruptible wait for ->exited to be completed */
        wait_for_completion(&sqd->exited);
}

/*
 * Drop one reference on @sqd. The final put stops the SQ thread and frees
 * the structure; no park request may be pending at that point (warns once
 * otherwise).
 */
static void io_put_sq_data(struct io_sq_data *sqd)
{
        if (!refcount_dec_and_test(&sqd->refs))
                return;

        WARN_ON_ONCE(atomic_read(&sqd->park_pending));

        io_sq_thread_stop(sqd);
        kfree(sqd);
}

/*
 * Detach @ctx from its io_sq_data, if it has one: with the SQ thread
 * parked, unlink the ring from the sqd's context list and refresh the
 * sqd's idle setting, then drop the ring's reference on the sqd.
 */
static void io_sq_thread_finish(struct io_ring_ctx *ctx)
{
        struct io_sq_data *sqd = ctx->sq_data;

        if (!sqd)
                return;

        io_sq_thread_park(sqd);
        list_del_init(&ctx->sqd_list);
        io_sqd_update_thread_idle(sqd);
        io_sq_thread_unpark(sqd);

        io_put_sq_data(sqd);
        ctx->sq_data = NULL;
}

static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
{
        struct io_ring_ctx *ctx_attach;
        struct io_sq_data *sqd;
        struct fd f;

        f = fdget(p->wq_fd);
        if (!f.file)
                return ERR_PTR(-ENXIO);
        if (f.file->f_op != &io_uring_fops) {
                fdput(f);
                return ERR_PTR(-EINVAL);
        }

        ctx_attach = f.file->private_data;
        sqd = ctx_attach->sq_data;
        if (!sqd) {
                fdput(f);
                return ERR_PTR(-EINVAL);
        }
        if (sqd->task_tgid != current->tgid) {
                fdput(f);
                return ERR_PTR(-EPERM);
        }

        refcount_inc(&sqd->refs);
        fdput(f);
        return sqd;
}

/*
 * Obtain the io_sq_data for a new SQPOLL ring.
 *
 * With IORING_SETUP_ATTACH_WQ an existing sqd is attached to when possible
 * and *@attached is set to true. An attach failure is returned as is,
 * except for -EPERM, which falls back to creating a private sqd. In every
 * other case a new sqd holding one reference is allocated and *@attached
 * is false.
 *
 * Returns the sqd or an ERR_PTR.
 */
static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
                                         bool *attached)
{
        struct io_sq_data *sqd;

        *attached = false;
        if (p->flags & IORING_SETUP_ATTACH_WQ) {
                sqd = io_attach_sq_data(p);
                *attached = !IS_ERR(sqd);
                /* only -EPERM falls back to setting up a new sqd/task */
                if (*attached || PTR_ERR(sqd) != -EPERM)
                        return sqd;
        }

        sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
        if (!sqd)
                return ERR_PTR(-ENOMEM);

        refcount_set(&sqd->refs, 1);
        atomic_set(&sqd->park_pending, 0);
        mutex_init(&sqd->lock);
        INIT_LIST_HEAD(&sqd->ctx_list);
        init_completion(&sqd->exited);
        init_waitqueue_head(&sqd->wait);
        return sqd;
}

/*
 * Put callback for fixed files: drop the reference on the file carried by
 * @prsrc. @ctx is unused.
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        fput(prsrc->file);
}

/*
 * Process one retired resource node.
 *
 * For every io_rsrc_put entry queued on the node: post a CQE for the
 * entry's tag if the tag is non-zero, invoke the owning data's ->do_put()
 * callback and free the entry. Afterwards the node is destroyed and one
 * reference on the owning io_rsrc_data is dropped; if that was the last
 * one, ->done is completed.
 */
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
        struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
        struct io_ring_ctx *ctx = rsrc_data->ctx;
        struct io_rsrc_put *prsrc, *tmp;

        /* _safe variant: entries are unlinked and freed while iterating */
        list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
                list_del(&prsrc->list);

                if (prsrc->tag) {
                        /* io_ring_submit_lock() is told to lock only for IOPOLL rings */
                        bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;

                        io_ring_submit_lock(ctx, lock_ring);
                        spin_lock(&ctx->completion_lock);
                        /* CQE for the tag; the remaining two arguments are 0 */
                        io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
                        io_commit_cqring(ctx);
                        spin_unlock(&ctx->completion_lock);
                        io_cqring_ev_posted(ctx);
                        io_ring_submit_unlock(ctx, lock_ring);
                }

                rsrc_data->do_put(ctx, prsrc);
                kfree(prsrc);
        }

        /* ref_node is freed here; rsrc_data was cached above for the final put */
        io_rsrc_node_destroy(ref_node);
        if (atomic_dec_and_test(&rsrc_data->refs))
                complete(&rsrc_data->done);
}

/*
 * Work handler for ctx->rsrc_put_work: atomically take every node queued
 * on ctx->rsrc_put_llist and process them one by one.
 */
static void io_rsrc_put_work(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
                                               rsrc_put_work.work);
        struct llist_node *pos, *next;

        for (pos = llist_del_all(&ctx->rsrc_put_llist); pos; pos = next) {
                /* fetch the successor first: processing frees the node */
                next = pos->next;
                __io_rsrc_put_work(llist_entry(pos, struct io_rsrc_node, llist));
        }
}

/*
 * Register a fixed-file table with @nr_args slots.
 *
 * @arg:  user array of @nr_args __s32 file descriptors; an entry of -1
 *        leaves the slot empty (sparse table)
 * @tags: optional user array of u64 tags, one per slot; may be NULL
 *
 * Returns 0 on success, or:
 *   -EBUSY   a file table is already registered
 *   -EINVAL  @nr_args is 0, or an empty (-1) slot carries a non-zero tag
 *   -EMFILE  @nr_args exceeds IORING_MAX_FIXED_FILES or RLIMIT_NOFILE
 *   -ENOMEM  an allocation failed
 *   -EFAULT  a user array could not be read
 *   -EBADF   an fd is not open, or refers to an io_uring instance
 *
 * On failure every file reference taken so far is dropped and the table
 * and its io_rsrc_data are freed again.
 */
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                                 unsigned nr_args, u64 __user *tags)
{
        __s32 __user *fds = (__s32 __user *) arg;
        struct file *file;
        int fd, ret;
        unsigned i;

        if (ctx->file_data)
                return -EBUSY;
        if (!nr_args)
                return -EINVAL;
        if (nr_args > IORING_MAX_FIXED_FILES)
                return -EMFILE;
        if (nr_args > rlimit(RLIMIT_NOFILE))
                return -EMFILE;
        /* make sure a backup node exists for io_rsrc_node_switch() below */
        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                return ret;
        ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
                                 &ctx->file_data);
        if (ret)
                return ret;

        ret = -ENOMEM;
        if (!io_alloc_file_tables(&ctx->file_table, nr_args))
                goto out_free;

        /*
         * ctx->nr_user_files advances together with i, so on error it is
         * the number of slots that out_fput has to walk.
         */
        for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
                if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
                        ret = -EFAULT;
                        goto out_fput;
                }
                /* allow sparse sets */
                if (fd == -1) {
                        /* an empty slot must not carry a tag */
                        ret = -EINVAL;
                        if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
                                goto out_fput;
                        continue;
                }

                file = fget(fd);
                ret = -EBADF;
                if (unlikely(!file))
                        goto out_fput;

                /*
                 * Don't allow io_uring instances to be registered. If UNIX
                 * isn't enabled, then this causes a reference cycle and this
                 * instance can never get freed. If UNIX is enabled we'll
                 * handle it just fine, but there's still no point in allowing
                 * a ring fd as it doesn't support regular read/write anyway.
                 */
                if (file->f_op == &io_uring_fops) {
                        /* ret is still -EBADF from above */
                        fput(file);
                        goto out_fput;
                }
                io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
        }

        /* no data to kill: this only installs the backup node if needed */
        io_rsrc_node_switch(ctx, NULL);
        return 0;
out_fput:
        /* drop the references of the files installed so far */
        for (i = 0; i < ctx->nr_user_files; i++) {
                file = io_file_from_index(ctx, i);
                if (file)
                        fput(file);
        }
        io_free_file_tables(&ctx->file_table);
        ctx->nr_user_files = 0;
out_free:
        io_rsrc_data_free(ctx->file_data);
        ctx->file_data = NULL;
        return ret;
}

/*
 * Queue resource @rsrc, currently occupying index @idx of @data, for
 * deferred release on @node. The tag stored for @idx moves into the queued
 * entry and the tag slot is reset to 0.
 *
 * Returns 0, or -ENOMEM if the entry cannot be allocated (in which case
 * the tag slot is left untouched).
 */
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
                                 struct io_rsrc_node *node, void *rsrc)
{
        struct io_rsrc_put *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        u64 *slot;

        if (!entry)
                return -ENOMEM;

        slot = io_get_tag_slot(data, idx);
        entry->rsrc = rsrc;
        entry->tag = *slot;
        *slot = 0;
        list_add(&entry->list, &node->rsrc_list);
        return 0;
}

/*
 * Install @file into fixed-file slot @slot_index of the ring owning @req.
 *
 * If the slot is occupied, the old file is queued for removal on the
 * current rsrc node and the node is switched before returning. The tag of
 * the slot is reset to 0.
 *
 * io_ring_submit_lock()/io_ring_submit_unlock() are called with their
 * second argument true when IO_URING_F_NONBLOCK is NOT set in
 * @issue_flags.
 *
 * The caller's reference on @file is consumed: on success it is stored in
 * the table, on failure it is dropped with fput().
 *
 * Returns 0, or:
 *   -EBADF   @file is an io_uring instance
 *   -ENXIO   no file table is registered
 *   -EINVAL  @slot_index is out of range
 *   the error of io_rsrc_node_switch_start() / io_queue_rsrc_removal()
 */
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
                                 unsigned int issue_flags, u32 slot_index)
{
        struct io_ring_ctx *ctx = req->ctx;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
        bool needs_switch = false;
        struct io_fixed_file *file_slot;
        int ret = -EBADF;

        io_ring_submit_lock(ctx, !force_nonblock);
        if (file->f_op == &io_uring_fops)
                goto err;
        ret = -ENXIO;
        if (!ctx->file_data)
                goto err;
        ret = -EINVAL;
        if (slot_index >= ctx->nr_user_files)
                goto err;

        /* clamp the index under speculation before using it */
        slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
        file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);

        if (file_slot->file_ptr) {
                struct file *old_file;

                ret = io_rsrc_node_switch_start(ctx);
                if (ret)
                        goto err;

                /* mask file_ptr with FFS_MASK to get the file pointer */
                old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
                ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
                                            ctx->rsrc_node, old_file);
                if (ret)
                        goto err;
                file_slot->file_ptr = 0;
                needs_switch = true;
        }

        *io_get_tag_slot(ctx->file_data, slot_index) = 0;
        io_fixed_file_set(file_slot, file);
        ret = 0;
err:
        /* only set once a removal has been queued on the current node */
        if (needs_switch)
                io_rsrc_node_switch(ctx, ctx->file_data);
        io_ring_submit_unlock(ctx, !force_nonblock);
        if (ret)
                fput(file);
        return ret;
}

/*
 * Remove the file held in the fixed-file slot named by
 * req->close.file_slot. The stored value is the slot index plus one, so
 * one is subtracted here to get the table index.
 *
 * The file is queued for removal on the current rsrc node, the slot is
 * cleared and the node is switched.
 *
 * Returns 0, or:
 *   -ENXIO   no file table is registered
 *   -EINVAL  the slot index is out of range
 *   -EBADF   the slot is empty
 *   the error of io_rsrc_node_switch_start() / io_queue_rsrc_removal()
 */
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
        /* a file_slot of 0 wraps to UINT_MAX here and fails the range check */
        unsigned int offset = req->close.file_slot - 1;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_fixed_file *file_slot;
        struct file *file;
        int ret;

        io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        ret = -ENXIO;
        if (unlikely(!ctx->file_data))
                goto out;
        ret = -EINVAL;
        if (offset >= ctx->nr_user_files)
                goto out;
        /* a backup node is needed for the switch at the end */
        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                goto out;

        offset = array_index_nospec(offset, ctx->nr_user_files);
        file_slot = io_fixed_file_slot(&ctx->file_table, offset);
        ret = -EBADF;
        if (!file_slot->file_ptr)
                goto out;

        /* mask file_ptr with FFS_MASK to get the file pointer */
        file = (struct file *)(file_slot->file_ptr & FFS_MASK);
        ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
        if (ret)
                goto out;

        file_slot->file_ptr = 0;
        io_rsrc_node_switch(ctx, ctx->file_data);
        ret = 0;
out:
        io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
        return ret;
}

/*
 * Update up to @nr_args fixed-file slots, starting at slot up->offset.
 *
 * For each entry an fd (from up->data) and, if up->tags is set, a tag are
 * read from user memory:
 *   IORING_REGISTER_FILES_SKIP  leave the slot untouched
 *   -1                          clear the slot
 *   any other value             replace the slot with that file and tag
 * A non-zero tag combined with SKIP or -1 is rejected with -EINVAL.
 * A file previously held by a touched slot is queued for removal on the
 * current rsrc node; if that happened at least once the node is switched
 * before returning.
 *
 * Returns the number of entries handled before the loop stopped if that
 * number is non-zero; otherwise the error that stopped it (0 when
 * @nr_args is 0). -ENXIO is returned if no table is registered and -EINVAL
 * if the requested range does not fit the table.
 *
 * NOTE(review): this function does not call io_rsrc_node_switch_start();
 * presumably the caller has already made sure a backup node exists before
 * io_rsrc_node_switch() runs here - confirm.
 */
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                 struct io_uring_rsrc_update2 *up,
                                 unsigned nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        __s32 __user *fds = u64_to_user_ptr(up->data);
        struct io_rsrc_data *data = ctx->file_data;
        struct io_fixed_file *file_slot;
        struct file *file;
        int fd, i, err = 0;
        unsigned int done;
        bool needs_switch = false;

        if (!ctx->file_data)
                return -ENXIO;
        /*
         * NOTE(review): if up->offset is 32 bits wide this sum can wrap;
         * presumably callers reject overflowing ranges - confirm.
         */
        if (up->offset + nr_args > ctx->nr_user_files)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                u64 tag = 0;

                if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
                    copy_from_user(&fd, &fds[done], sizeof(fd))) {
                        err = -EFAULT;
                        break;
                }
                /* a tag only makes sense together with a real file */
                if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
                        err = -EINVAL;
                        break;
                }
                if (fd == IORING_REGISTER_FILES_SKIP)
                        continue;

                i = array_index_nospec(up->offset + done, ctx->nr_user_files);
                file_slot = io_fixed_file_slot(&ctx->file_table, i);

                /* slot in use: hand the old file to the current node */
                if (file_slot->file_ptr) {
                        file = (struct file *)(file_slot->file_ptr & FFS_MASK);
                        err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
                        if (err)
                                break;
                        file_slot->file_ptr = 0;
                        needs_switch = true;
                }
                if (fd != -1) {
                        file = fget(fd);
                        if (!file) {
                                err = -EBADF;
                                break;
                        }
                        /*
                         * Don't allow io_uring instances to be registered. If
                         * UNIX isn't enabled, then this causes a reference
                         * cycle and this instance can never get freed. If UNIX
                         * is enabled we'll handle it just fine, but there's
                         * still no point in allowing a ring fd as it doesn't
                         * support regular read/write anyway.
                         */
                        if (file->f_op == &io_uring_fops) {
                                fput(file);
                                err = -EBADF;
                                break;
                        }
                        *io_get_tag_slot(data, i) = tag;
                        io_fixed_file_set(file_slot, file);
                }
        }

        if (needs_switch)
                io_rsrc_node_switch(ctx, data);
        /* partial progress wins over the error that ended the loop */
        return done ? done : err;
}

/*
 * Create the io-wq instance for @task on behalf of ring @ctx.
 *
 * The ring's io_wq_hash is created on first use, under ->uring_lock, and
 * passed to io_wq_create() along with the task and the work callbacks.
 * The concurrency bound is the smaller of the ring's SQ size and four
 * times the number of online CPUs.
 *
 * Returns what io_wq_create() returns, or ERR_PTR(-ENOMEM) if the hash
 * could not be allocated.
 */
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
                                        struct task_struct *task)
{
        struct io_wq_hash *hash;
        struct io_wq_data data;
        unsigned int concurrency;

        mutex_lock(&ctx->uring_lock);
        hash = ctx->hash_map;
        if (!hash) {
                hash = kzalloc(sizeof(*hash), GFP_KERNEL);
                if (hash) {
                        refcount_set(&hash->refs, 1);
                        init_waitqueue_head(&hash->wait);
                        ctx->hash_map = hash;
                }
        }
        mutex_unlock(&ctx->uring_lock);

        if (!hash)
                return ERR_PTR(-ENOMEM);

        data.hash = hash;
        data.task = task;
        data.free_work = io_wq_free_work;
        data.do_work = io_wq_submit_work;

        /* queue depth or 4 * CPUs, whichever is smaller */
        concurrency = min(ctx->sq_entries, 4 * num_online_cpus());

        return io_wq_create(concurrency, &data);
}

/*
 * Allocate and initialise the io_uring task context of @task, including
 * its inflight counter and io-wq, and publish it in task->io_uring.
 *
 * Returns 0 on success. On failure everything set up so far is undone and
 * a negative errno is returned (-ENOMEM, or the error of
 * percpu_counter_init() / io_init_wq_offload()).
 */
static int io_uring_alloc_task_context(struct task_struct *task,
                                       struct io_ring_ctx *ctx)
{
        struct io_uring_task *tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
        int ret;

        if (unlikely(!tctx))
                return -ENOMEM;

        ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
        if (unlikely(ret))
                goto err_free;

        tctx->io_wq = io_init_wq_offload(ctx, task);
        if (IS_ERR(tctx->io_wq)) {
                ret = PTR_ERR(tctx->io_wq);
                goto err_counter;
        }

        xa_init(&tctx->xa);
        init_waitqueue_head(&tctx->wait);
        atomic_set(&tctx->in_idle, 0);
        atomic_set(&tctx->inflight_tracked, 0);
        task->io_uring = tctx;
        spin_lock_init(&tctx->task_lock);
        INIT_WQ_LIST(&tctx->task_list);
        init_task_work(&tctx->task_work, tctx_task_work);
        return 0;

err_counter:
        percpu_counter_destroy(&tctx->inflight);
err_free:
        kfree(tctx);
        return ret;
}

/*
 * Free the io_uring task context of @tsk and clear tsk->io_uring.
 *
 * tsk->io_uring must be non-NULL; it is dereferenced unconditionally.
 * Warns once for each of: a non-empty ->xa, an ->io_wq that is still set,
 * or a non-zero ->cached_refs.
 */
void __io_uring_free(struct task_struct *tsk)
{
        struct io_uring_task *tctx = tsk->io_uring;

        WARN_ON_ONCE(!xa_empty(&tctx->xa));
        WARN_ON_ONCE(tctx->io_wq);
        WARN_ON_ONCE(tctx->cached_refs);

        percpu_counter_destroy(&tctx->inflight);
        kfree(tctx);
        tsk->io_uring = NULL;
}

/*
 * Set up submission-side offload for a ring that is being created.
 *
 * Without IORING_SETUP_SQPOLL this only validates the request: with
 * IORING_SETUP_ATTACH_WQ, p->wq_fd must refer to an io_uring instance
 * (-ENXIO / -EINVAL otherwise), and IORING_SETUP_SQ_AFF is rejected with
 * -EINVAL.
 *
 * With IORING_SETUP_SQPOLL an io_sq_data is obtained (shared when attaching
 * succeeded, freshly allocated otherwise) and the ring is linked to it.
 * When not attached, the SQ poll thread is created for it, optionally
 * bound to p->sq_thread_cpu, and given its own io_uring task context.
 *
 * Returns 0 or a negative errno. The error labels undo the sqd linkage
 * through io_sq_thread_finish().
 */
static int io_sq_offload_create(struct io_ring_ctx *ctx,
                                struct io_uring_params *p)
{
        int ret;

        /* Retain compatibility with failing for an invalid attach attempt */
        if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
                                IORING_SETUP_ATTACH_WQ) {
                struct fd f;

                f = fdget(p->wq_fd);
                if (!f.file)
                        return -ENXIO;
                if (f.file->f_op != &io_uring_fops) {
                        fdput(f);
                        return -EINVAL;
                }
                fdput(f);
        }
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                struct task_struct *tsk;
                struct io_sq_data *sqd;
                bool attached;

                sqd = io_get_sq_data(p, &attached);
                if (IS_ERR(sqd)) {
                        ret = PTR_ERR(sqd);
                        goto err;
                }

                ctx->sq_creds = get_current_cred();
                ctx->sq_data = sqd;
                /* an idle time of 0 selects the default of HZ jiffies */
                ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
                if (!ctx->sq_thread_idle)
                        ctx->sq_thread_idle = HZ;

                /* link the ring into the sqd while the SQ thread is parked */
                io_sq_thread_park(sqd);
                list_add(&ctx->sqd_list, &sqd->ctx_list);
                io_sqd_update_thread_idle(sqd);
                /* don't attach to a dying SQPOLL thread, would be racy */
                ret = (attached && !sqd->thread) ? -ENXIO : 0;
                io_sq_thread_unpark(sqd);

                if (ret < 0)
                        goto err;
                /* attached to an existing sqd: its thread already exists */
                if (attached)
                        return 0;

                if (p->flags & IORING_SETUP_SQ_AFF) {
                        int cpu = p->sq_thread_cpu;

                        ret = -EINVAL;
                        if (cpu >= nr_cpu_ids || !cpu_online(cpu))
                                goto err_sqpoll;
                        sqd->sq_cpu = cpu;
                } else {
                        /* no CPU requested */
                        sqd->sq_cpu = -1;
                }

                sqd->task_pid = current->pid;
                sqd->task_tgid = current->tgid;
                tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
                if (IS_ERR(tsk)) {
                        ret = PTR_ERR(tsk);
                        goto err_sqpoll;
                }

                sqd->thread = tsk;
                ret = io_uring_alloc_task_context(tsk, ctx);
                /* the new thread is woken even if the allocation failed */
                wake_up_new_task(tsk);
                if (ret)
                        goto err;
        } else if (p->flags & IORING_SETUP_SQ_AFF) {
                /* Can't have SQ_AFF without SQPOLL */
                ret = -EINVAL;
                goto err;
        }

        return 0;
err_sqpoll:
        /*
         * No SQ thread is running on this path, so complete ->exited here:
         * io_sq_thread_finish() may end up in io_sq_thread_stop(), which
         * waits for that completion.
         */
        complete(&ctx->sq_data->exited);
err:
        io_sq_thread_finish(ctx);
        return ret;
}

/* Subtract @nr_pages from @user's locked_vm counter. */
static inline void __io_unaccount_mem(struct user_struct *user,
                                      unsigned long nr_pages)
{
        atomic_long_sub(nr_pages, &user->locked_vm);
}

/*
 * Charge @nr_pages to @user's locked_vm counter, refusing to go above the
 * RLIMIT_MEMLOCK limit (converted to pages). The update is a lockless
 * compare-and-exchange that is retried if the counter changed in between.
 *
 * Returns 0 on success, -ENOMEM if the limit would be exceeded.
 */
static inline int __io_account_mem(struct user_struct *user,
                                   unsigned long nr_pages)
{
        /* Don't allow more pages than we can safely lock */
        const unsigned long page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        unsigned long old, want;

        for (;;) {
                old = atomic_long_read(&user->locked_vm);
                want = old + nr_pages;
                if (want > page_limit)
                        return -ENOMEM;
                if (atomic_long_cmpxchg(&user->locked_vm, old, want) == old)
                        return 0;
        }
}

/*
 * Undo io_account_mem() for @nr_pages: subtract them from the user's
 * locked_vm (if the ring has a user) and from the mm's pinned_vm (if the
 * ring has an mm to account against).
 */
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        struct user_struct *user = ctx->user;

        if (user)
                __io_unaccount_mem(user, nr_pages);

        if (ctx->mm_account)
                atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

/*
 * Account @nr_pages against the ring's user (subject to the memlock limit
 * enforced by __io_account_mem()) and add them to the mm's pinned_vm.
 *
 * Returns 0, or the error from __io_account_mem(); in the error case
 * pinned_vm is not touched.
 */
static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        if (ctx->user) {
                int ret = __io_account_mem(ctx->user, nr_pages);

                if (ret)
                        return ret;
        }

        if (ctx->mm_account)
                atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

        return 0;
}

/*
 * Drop a reference on the compound page backing @ptr and free the page
 * when that was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
        if (ptr) {
                struct page *head = virt_to_head_page(ptr);

                if (put_page_testzero(head))
                        free_compound_page(head);
        }
}

/*
 * Allocate zeroed pages large enough for @size bytes as one compound page
 * (__GFP_COMP), with allocation-failure warnings suppressed. Returns the
 * address, or NULL on failure.
 */
static void *io_mem_alloc(size_t size)
{
        const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN |
                            __GFP_COMP;
        unsigned long addr = __get_free_pages(flags, get_order(size));

        return (void *) addr;
}

/*
 * Compute the number of bytes needed for the shared rings structure: the
 * io_rings header with @cq_entries CQEs, followed by the SQ index array
 * of @sq_entries u32 values. On SMP the SQ array starts on a cache line
 * boundary.
 *
 * @sq_offset, if non-NULL, receives the byte offset of the SQ array.
 *
 * Returns the total size, or SIZE_MAX if any step overflows.
 */
static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
                                size_t *sq_offset)
{
        struct io_rings *rings;
        size_t total, sq_bytes;

        /* header plus trailing CQE array; saturates to SIZE_MAX */
        total = struct_size(rings, cqes, cq_entries);
        if (total == SIZE_MAX)
                return SIZE_MAX;

#ifdef CONFIG_SMP
        total = ALIGN(total, SMP_CACHE_BYTES);
        /* rounding up wrapped around to zero */
        if (!total)
                return SIZE_MAX;
#endif

        if (sq_offset)
                *sq_offset = total;

        sq_bytes = array_size(sizeof(u32), sq_entries);
        if (sq_bytes == SIZE_MAX ||
            check_add_overflow(total, sq_bytes, &total))
                return SIZE_MAX;

        return total;
}

/*
 * Release the mapped buffer stored in *@slot and set the slot to NULL.
 *
 * For a real buffer every pinned page is unpinned, accounted pages are
 * unaccounted and the descriptor is freed. The ring's shared dummy buffer
 * (ctx->dummy_ubuf) is left alone; only the slot is cleared.
 */
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
        struct io_mapped_ubuf *imu = *slot;

        if (imu != ctx->dummy_ubuf) {
                unsigned int idx = 0;

                while (idx < imu->nr_bvecs) {
                        unpin_user_page(imu->bvec[idx].bv_page);
                        idx++;
                }
                if (imu->acct_pages)
                        io_unaccount_mem(ctx, imu->acct_pages);
                kvfree(imu);
        }
        *slot = NULL;
}

/*
 * Put callback for registered buffers: unmap the buffer carried by @prsrc.
 * io_buffer_unmap() already clears the slot it is given; the pointer is
 * set to NULL once more afterwards.
 */
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        io_buffer_unmap(ctx, &prsrc->buf);
        prsrc->buf = NULL;
}

/*
 * Unmap every registered buffer, free the buffer array and its
 * io_rsrc_data, and reset the ring's buffer registration state.
 */
static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
        unsigned int idx = 0;

        while (idx < ctx->nr_user_bufs) {
                io_buffer_unmap(ctx, &ctx->user_bufs[idx]);
                idx++;
        }

        kfree(ctx->user_bufs);
        io_rsrc_data_free(ctx->buf_data);
        ctx->user_bufs = NULL;
        ctx->buf_data = NULL;
        ctx->nr_user_bufs = 0;
}

/*
 * Unregister the fixed buffers of @ctx.
 *
 * Returns -ENXIO if no buffers are registered, the error from
 * io_rsrc_ref_quiesce() if quiescing fails (the buffers then stay
 * registered), or 0 after the buffers have been torn down.
 */
static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
        unsigned int saved_nr = ctx->nr_user_bufs;
        int ret;

        if (!ctx->buf_data)
                return -ENXIO;

        /*
         * The quiesce helper may release ->uring_lock. Advertise an empty
         * table for that window so new requests cannot pick up a buffer,
         * and put the real count back afterwards.
         */
        ctx->nr_user_bufs = 0;
        ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
        ctx->nr_user_bufs = saved_nr;
        if (ret)
                return ret;

        __io_sqe_buffers_unregister(ctx);
        return 0;
}

/*
 * Fetch element @index of the user-space iovec array at @arg into @dst.
 *
 * When the ring is in compat mode (and CONFIG_COMPAT is enabled) the array is
 * read as struct compat_iovec and widened into @dst. Returns 0 on success or
 * -EFAULT if the user memory cannot be read.
 */
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
                       void __user *arg, unsigned index)
{
        struct iovec __user *src = (struct iovec __user *) arg;

#ifdef CONFIG_COMPAT
        if (ctx->compat) {
                struct compat_iovec __user *csrc;
                struct compat_iovec tmp;

                csrc = (struct compat_iovec __user *) arg;
                if (copy_from_user(&tmp, &csrc[index], sizeof(tmp)))
                        return -EFAULT;

                dst->iov_len = tmp.iov_len;
                dst->iov_base = u64_to_user_ptr((u64)tmp.iov_base);
                return 0;
        }
#endif
        return copy_from_user(dst, &src[index], sizeof(*dst)) ? -EFAULT : 0;
}

/*
 * Report whether compound head page @hpage has been seen before, either among
 * the first @nr_pages entries of @pages (the buffer currently being
 * registered) or in any buffer already present in ctx->user_bufs.
 *
 * This is a linear scan, which is acceptable because it only runs at
 * registration time; the caller additionally caches the last compound head it
 * saw, so the full search is needed only when that cache misses.
 *
 * The point of the check is to avoid charging the same huge page twice while
 * still accounting its full size rather than just the constituent pages.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
                                  int nr_pages, struct page *hpage)
{
        int i, j;

        /* pages of the buffer that is being registered right now */
        for (i = 0; i < nr_pages; i++) {
                struct page *pg = pages[i];

                if (PageCompound(pg) && compound_head(pg) == hpage)
                        return true;
        }

        /* pages of buffers registered earlier */
        for (i = 0; i < ctx->nr_user_bufs; i++) {
                struct io_mapped_ubuf *imu = ctx->user_bufs[i];

                for (j = 0; j < imu->nr_bvecs; j++) {
                        struct page *pg = imu->bvec[j].bv_page;

                        if (PageCompound(pg) && compound_head(pg) == hpage)
                                return true;
                }
        }

        return false;
}

/*
 * Work out how many pages must be charged for the pinned @pages and charge
 * them through io_account_mem().
 *
 * A non-compound page counts as one page. A compound page is charged once,
 * for the full size of its head page, unless the head equals *@last_hpage or
 * headpage_already_acct() reports it as seen. *@last_hpage is updated to the
 * most recent compound head encountered.
 *
 * The total is stored in imu->acct_pages; it is reset to 0 if accounting
 * fails. Returns 0 on success (including "nothing to charge") or the error
 * from io_account_mem().
 */
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
                                 int nr_pages, struct io_mapped_ubuf *imu,
                                 struct page **last_hpage)
{
        int idx, err;

        imu->acct_pages = 0;
        for (idx = 0; idx < nr_pages; idx++) {
                struct page *head;

                if (!PageCompound(pages[idx])) {
                        imu->acct_pages++;
                        continue;
                }

                head = compound_head(pages[idx]);
                if (head == *last_hpage)
                        continue;
                *last_hpage = head;

                /* only pages before idx belong to the "already seen" set */
                if (!headpage_already_acct(ctx, pages, idx, head))
                        imu->acct_pages += page_size(head) >> PAGE_SHIFT;
        }

        if (!imu->acct_pages)
                return 0;

        err = io_account_mem(ctx, imu->acct_pages);
        if (err)
                imu->acct_pages = 0;
        return err;
}

/*
 * Pin the user memory described by @iov and build an io_mapped_ubuf for it.
 *
 * @ctx:        ring the buffer is registered with
 * @iov:        user address range to register
 * @pimu:       where the resulting mapping is stored
 * @last_hpage: compound-head cache handed to io_buffer_account_pin()
 *
 * An iovec with a NULL base stores ctx->dummy_ubuf in *@pimu and succeeds
 * without pinning anything. Otherwise *@pimu is NULL on failure and points to
 * the freshly allocated mapping on success.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, the error from
 * pin_user_pages() (or -EFAULT for a partial pin), -EINVAL / -EOPNOTSUPP for
 * rejected file-backed memory, or the error from io_buffer_account_pin().
 * Pages pinned before a failure are unpinned again.
 */
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
                                  struct io_mapped_ubuf **pimu,
                                  struct page **last_hpage)
{
        struct io_mapped_ubuf *imu = NULL;
        struct vm_area_struct **vmas = NULL;
        struct page **pages = NULL;
        unsigned long off, start, end, ubuf;
        size_t size;
        int ret, pret, nr_pages, i;

        if (!iov->iov_base) {
                *pimu = ctx->dummy_ubuf;
                return 0;
        }

        /* number of pages touched by [iov_base, iov_base + iov_len) */
        ubuf = (unsigned long) iov->iov_base;
        end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        start = ubuf >> PAGE_SHIFT;
        nr_pages = end - start;

        *pimu = NULL;
        ret = -ENOMEM;

        pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
        if (!pages)
                goto done;

        vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
                              GFP_KERNEL);
        if (!vmas)
                goto done;

        imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
        if (!imu)
                goto done;

        ret = 0;
        mmap_read_lock(current->mm);
        pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
                              pages, vmas);
        if (pret == nr_pages) {
                struct file *file = vmas[0]->vm_file;

                /*
                 * don't support file backed memory: all pages must share the
                 * first vma's file, and if there is one it must be shmem or
                 * hugetlbfs
                 */
                for (i = 0; i < nr_pages; i++) {
                        if (vmas[i]->vm_file != file) {
                                ret = -EINVAL;
                                break;
                        }
                        if (!file)
                                continue;
                        if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
                                ret = -EOPNOTSUPP;
                                break;
                        }
                }
        } else {
                /* short pin without an error code is reported as -EFAULT */
                ret = pret < 0 ? pret : -EFAULT;
        }
        mmap_read_unlock(current->mm);
        if (ret) {
                /*
                 * if we did partial map, or found file backed vmas,
                 * release any pages we did get
                 */
                if (pret > 0)
                        unpin_user_pages(pages, pret);
                goto done;
        }

        ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
        if (ret) {
                unpin_user_pages(pages, pret);
                goto done;
        }

        /*
         * One bvec per page; only the first one starts at a non-zero offset
         * within its page.
         */
        off = ubuf & ~PAGE_MASK;
        size = iov->iov_len;
        for (i = 0; i < nr_pages; i++) {
                size_t vec_len;

                vec_len = min_t(size_t, size, PAGE_SIZE - off);
                imu->bvec[i].bv_page = pages[i];
                imu->bvec[i].bv_len = vec_len;
                imu->bvec[i].bv_offset = off;
                off = 0;
                size -= vec_len;
        }
        /* store original address for later verification */
        imu->ubuf = ubuf;
        imu->ubuf_end = ubuf + iov->iov_len;
        imu->nr_bvecs = nr_pages;
        *pimu = imu;
        ret = 0;
done:
        /* imu is handed to the caller on success; scratch arrays always go */
        if (ret)
                kvfree(imu);
        kvfree(pages);
        kvfree(vmas);
        return ret;
}

/*
 * Allocate a zeroed ctx->user_bufs array with room for @nr_args buffer
 * pointers. Returns 0 on success, -ENOMEM if the allocation fails.
 */
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
        ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
        if (!ctx->user_bufs)
                return -ENOMEM;
        return 0;
}

/*
 * Sanity-check a user iovec before it is registered as a fixed buffer.
 *
 * Returns:
 *   0          for a NULL base with zero length, or a non-empty range of at
 *              most SZ_1G whose page-rounded end does not overflow
 *   -EFAULT    for a NULL base with non-zero length, a zero length, or a
 *              length above SZ_1G
 *   -EOVERFLOW if base + length + (PAGE_SIZE - 1) wraps
 */
static int io_buffer_validate(struct iovec *iov)
{
        unsigned long base = (unsigned long)iov->iov_base;
        unsigned long padded_len, end;

        /*
         * Don't impose further limits on the size and buffer
         * constraints here, we'll -EINVAL later when IO is
         * submitted if they are wrong.
         */
        if (!iov->iov_base) {
                if (iov->iov_len)
                        return -EFAULT;
                return 0;
        }

        /* empty buffers are rejected; 1G is an arbitrary but needed cap */
        if (!iov->iov_len || iov->iov_len > SZ_1G)
                return -EFAULT;

        padded_len = iov->iov_len + (PAGE_SIZE - 1);
        if (check_add_overflow(base, padded_len, &end))
                return -EOVERFLOW;

        return 0;
}

/*
 * Register @nr_args user buffers, described by the iovec array at @arg, as
 * the fixed buffer table of @ctx. @tags is forwarded to io_rsrc_data_alloc().
 *
 * Returns -EBUSY if a table already exists, -EINVAL for a zero or too large
 * @nr_args or for a tagged NULL-base entry, or whatever error one of the
 * setup steps reports. If any buffer fails, everything registered so far is
 * torn down again.
 */
static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
                                   unsigned int nr_args, u64 __user *tags)
{
        struct page *last_hpage = NULL;
        struct io_rsrc_data *data;
        struct iovec iov;
        int i, ret;

        if (ctx->user_bufs)
                return -EBUSY;
        if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
                return -EINVAL;

        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                return ret;

        ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
        if (ret)
                return ret;

        ret = io_buffers_map_alloc(ctx, nr_args);
        if (ret) {
                io_rsrc_data_free(data);
                return ret;
        }

        /*
         * ->nr_user_bufs only advances for slots that were fully set up: the
         * break below skips the increment for the failing slot.
         */
        for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
                ret = io_copy_iov(ctx, &iov, arg, i);
                if (!ret)
                        ret = io_buffer_validate(&iov);
                /* a placeholder (NULL base) entry must not carry a tag */
                if (!ret && !iov.iov_base && *io_get_tag_slot(data, i))
                        ret = -EINVAL;
                if (!ret)
                        ret = io_sqe_buffer_register(ctx, &iov,
                                                     &ctx->user_bufs[i],
                                                     &last_hpage);
                if (ret)
                        break;
        }

        WARN_ON_ONCE(ctx->buf_data);

        /* publish the data first so the error path below can free it */
        ctx->buf_data = data;
        if (ret) {
                __io_sqe_buffers_unregister(ctx);
                return ret;
        }

        io_rsrc_node_switch(ctx, NULL);
        return 0;
}

/*
 * Replace @nr_args entries of the registered buffer table, starting at
 * up->offset, with the iovecs found at up->data (and optional tags at
 * up->tags).
 *
 * Each new buffer is registered first; the slot's previous buffer, unless it
 * is the dummy placeholder, is then queued for removal on the current rsrc
 * node. Processing stops at the first failure.
 *
 * Returns the number of entries updated if at least one succeeded, otherwise
 * the error of the first failing step (-ENXIO without a buffer table,
 * -EINVAL for an out-of-range update or a tagged NULL-base entry, -EFAULT for
 * unreadable user memory, ...).
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
                                   struct io_uring_rsrc_update2 *up,
                                   unsigned int nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
        struct page *last_hpage = NULL;
        bool needs_switch = false;
        __u32 done;
        /*
         * Start from 0 so that an empty update (nr_args == 0) returns a
         * defined value instead of reading an uninitialised variable.
         */
        int i, err = 0;

        if (!ctx->buf_data)
                return -ENXIO;
        if (up->offset + nr_args > ctx->nr_user_bufs)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                struct io_mapped_ubuf *imu;
                int offset = up->offset + done;
                u64 tag = 0;

                err = io_copy_iov(ctx, &iov, iovs, done);
                if (err)
                        break;
                if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
                        err = -EFAULT;
                        break;
                }
                err = io_buffer_validate(&iov);
                if (err)
                        break;
                /* a placeholder (NULL base) entry must not carry a tag */
                if (!iov.iov_base && tag) {
                        err = -EINVAL;
                        break;
                }
                err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
                if (err)
                        break;

                /* clamp the user-controlled index against speculation */
                i = array_index_nospec(offset, ctx->nr_user_bufs);
                if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
                        err = io_queue_rsrc_removal(ctx->buf_data, i,
                                                    ctx->rsrc_node, ctx->user_bufs[i]);
                        if (unlikely(err)) {
                                /* the new mapping never became visible */
                                io_buffer_unmap(ctx, &imu);
                                break;
                        }
                        ctx->user_bufs[i] = NULL;
                        needs_switch = true;
                }

                ctx->user_bufs[i] = imu;
                /*
                 * Use the sanitised index for the tag table as well, so both
                 * user-indexed accesses get the same speculation barrier.
                 */
                *io_get_tag_slot(ctx->buf_data, i) = tag;
        }

        /* old buffers were queued on the current node; move to a new one */
        if (needs_switch)
                io_rsrc_node_switch(ctx, ctx->buf_data);
        return done ? done : err;
}

/*
 * Attach the eventfd whose descriptor number is stored at user address @arg
 * to @ctx as ->cq_ev_fd.
 *
 * Returns 0 on success, -EBUSY if an eventfd is already attached, -EFAULT if
 * @arg cannot be read, or the error encoded by eventfd_ctx_fdget().
 */
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
        __s32 __user *fds = arg;
        int fd, ret;

        if (ctx->cq_ev_fd)
                return -EBUSY;

        if (copy_from_user(&fd, fds, sizeof(*fds)))
                return -EFAULT;

        ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
        if (!IS_ERR(ctx->cq_ev_fd))
                return 0;

        /* do not leave an error pointer behind in the ctx */
        ret = PTR_ERR(ctx->cq_ev_fd);
        ctx->cq_ev_fd = NULL;
        return ret;
}

/*
 * Detach the eventfd from @ctx, dropping the reference held on it.
 * Returns 0 on success or -ENXIO if none was attached.
 */
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
        if (!ctx->cq_ev_fd)
                return -ENXIO;

        eventfd_ctx_put(ctx->cq_ev_fd);
        ctx->cq_ev_fd = NULL;
        return 0;
}

static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
        struct io_buffer *buf;
        unsigned long index;

        xa_for_each(&ctx->io_buffers, index, buf)
                __io_remove_buffers(ctx, buf, index, -1U);
}

static void io_req_cache_free(struct list_head *list)
{
        struct io_kiocb *req, *nxt;

        list_for_each_entry_safe(req, nxt, list, inflight_entry) {
                list_del(&req->inflight_entry);
                kmem_cache_free(req_cachep, req);
        }
}

/*
 * Release all requests cached in the submit state of @ctx, under
 * ->uring_lock: the bulk-allocated array first, then the free list after the
 * locked cache has been flushed into it.
 */
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
        struct io_submit_state *st = &ctx->submit_state;

        mutex_lock(&ctx->uring_lock);

        if (st->free_reqs) {
                kmem_cache_free_bulk(req_cachep, st->free_reqs, st->reqs);
                st->free_reqs = 0;
        }

        io_flush_cached_locked_reqs(ctx, st);
        io_req_cache_free(&st->free_list);

        mutex_unlock(&ctx->uring_lock);
}

/*
 * Drop one reference on @data and, unless that was the last one, block until
 * data->done is completed. A NULL @data is ignored.
 */
static void io_wait_rsrc_data(struct io_rsrc_data *data)
{
        if (!data)
                return;
        if (atomic_dec_and_test(&data->refs))
                return;
        wait_for_completion(&data->done);
}

/*
 * Final teardown of an io_uring context: stop the SQ thread, release all
 * registered resources, free the rings and caches, and finally free @ctx
 * itself. Called from io_ring_exit_work() once it has finished waiting.
 */
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
        io_sq_thread_finish(ctx);

        /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
        io_wait_rsrc_data(ctx->buf_data);
        io_wait_rsrc_data(ctx->file_data);

        /* registered buffers/files and the overflow list go under the lock */
        mutex_lock(&ctx->uring_lock);
        if (ctx->buf_data)
                __io_sqe_buffers_unregister(ctx);
        if (ctx->file_data)
                __io_sqe_files_unregister(ctx);
        if (ctx->rings)
                __io_cqring_overflow_flush(ctx, true);
        mutex_unlock(&ctx->uring_lock);
        io_eventfd_unregister(ctx);
        io_destroy_buffers(ctx);
        if (ctx->sq_creds)
                put_cred(ctx->sq_creds);

        /* there are no registered resources left, nobody uses it */
        if (ctx->rsrc_node)
                io_rsrc_node_destroy(ctx->rsrc_node);
        if (ctx->rsrc_backup_node)
                io_rsrc_node_destroy(ctx->rsrc_backup_node);
        flush_delayed_work(&ctx->rsrc_put_work);

        /* everything should have been drained by now */
        WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
        WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

        WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

        if (ctx->mm_account) {
                mmdrop(ctx->mm_account);
                ctx->mm_account = NULL;
        }

        io_mem_free(ctx->rings);
        io_mem_free(ctx->sq_sqes);

        percpu_ref_exit(&ctx->refs);
        free_uid(ctx->user);
        io_req_caches_free(ctx);
        if (ctx->hash_map)
                io_wq_put_hash(ctx->hash_map);
        kfree(ctx->cancel_hash);
        kfree(ctx->dummy_ubuf);
        kfree(ctx);
}

/*
 * poll() handler for an io_uring file.
 *
 * Reports EPOLLOUT | EPOLLWRNORM while the SQ ring is not full, and
 * EPOLLIN | EPOLLRDNORM when CQ events are available or the CQ overflow bit
 * is set.
 */
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
        struct io_ring_ctx *ctx = file->private_data;
        __poll_t mask = 0;

        poll_wait(file, &ctx->poll_wait, wait);
        /*
         * synchronizes with barrier from wq_has_sleeper call in
         * io_commit_cqring
         */
        smp_rmb();
        if (!io_sqring_full(ctx))
                mask |= EPOLLOUT | EPOLLWRNORM;

        /*
         * Don't flush cqring overflow list here, just do a simple check.
         * Otherwise there could possibly be an ABBA deadlock:
         *      CPU0                    CPU1
         *      ----                    ----
         * lock(&ctx->uring_lock);
         *                              lock(&ep->mtx);
         *                              lock(&ctx->uring_lock);
         * lock(&ep->mtx);
         *
         * Users may get EPOLLIN while seeing nothing in the cqring; this
         * pushes them to do the flush.
         */
        if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
                mask |= EPOLLIN | EPOLLRDNORM;

        return mask;
}

/*
 * Remove personality @id from ctx->personalities and drop the credentials
 * reference it held. Returns 0 on success, -EINVAL if @id was not present.
 */
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
        const struct cred *creds = xa_erase(&ctx->personalities, id);

        if (!creds)
                return -EINVAL;

        put_cred(creds);
        return 0;
}

/*
 * Request used by io_ring_exit_work() to ask a task to drop its tctx node for
 * a dying ring: @task_work is queued on the task and runs io_tctx_exit_cb(),
 * which signals @completion when done. @ctx is the ring being torn down.
 */
struct io_tctx_exit {
        struct callback_head                task_work;
        struct completion                completion;
        struct io_ring_ctx                *ctx;
};

/*
 * task_work callback queued by io_ring_exit_work(): runs in the context of a
 * task that still has a node for the dying ring, removes that node when it
 * is safe to do so, and then signals the waiter.
 */
static void io_tctx_exit_cb(struct callback_head *cb)
{
        struct io_tctx_exit *exit_req = container_of(cb, struct io_tctx_exit,
                                                     task_work);
        struct io_uring_task *tctx = current->io_uring;

        /*
         * tctx can be NULL if the queueing of this task_work raced with
         * work cancelation off the exec path.
         */
        if (tctx) {
                /*
                 * When @in_idle, we're in cancellation and it's racy to
                 * remove the node. It'll be removed by the end of
                 * cancellation, so just leave it alone here.
                 */
                if (!atomic_read(&tctx->in_idle))
                        io_uring_del_tctx_node((unsigned long)exit_req->ctx);
        }
        complete(&exit_req->completion);
}

/*
 * io-wq cancel matcher: true when the request embedding @work belongs to the
 * ring passed in @data.
 */
static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
        struct io_kiocb *req;

        req = container_of(work, struct io_kiocb, work);
        return data == req->ctx;
}

/*
 * Deferred ring teardown, queued by io_ring_ctx_wait_and_kill().
 *
 * Phase 1 repeatedly cancels outstanding requests until ctx->ref_comp
 * completes. Phase 2 asks every task still on ctx->tctx_list to drop its
 * node via task_work. Finally the context is freed.
 */
static void io_ring_exit_work(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
        /* after 5 minutes we warn and slow the retry rate down */
        unsigned long timeout = jiffies + HZ * 60 * 5;
        unsigned long interval = HZ / 20;
        struct io_tctx_exit exit;
        struct io_tctx_node *node;
        int ret;

        /*
         * If we're doing polled IO and end up having requests being
         * submitted async (out-of-line), then completions can come in while
         * we're waiting for refs to drop. We need to reap these manually,
         * as nobody else will be looking for them.
         */
        do {
                io_uring_try_cancel_requests(ctx, NULL, true);
                if (ctx->sq_data) {
                        struct io_sq_data *sqd = ctx->sq_data;
                        struct task_struct *tsk;

                        /* cancel this ring's work on the SQ thread's io-wq */
                        io_sq_thread_park(sqd);
                        tsk = sqd->thread;
                        if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
                                io_wq_cancel_cb(tsk->io_uring->io_wq,
                                                io_cancel_ctx_cb, ctx, true);
                        io_sq_thread_unpark(sqd);
                }

                if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
                        /* there is little hope left, don't run it too often */
                        interval = HZ * 60;
                }
                /*
                 * This is really an uninterruptible wait, as it has to be
                 * complete. But it's also run from a kworker, which doesn't
                 * take signals, so it's fine to make it interruptible. This
                 * avoids scenarios where we knowingly can wait much longer
                 * on completions, for example if someone does a SIGSTOP on
                 * a task that needs to finish task_work to make this loop
                 * complete. That's a synthetic situation that should not
                 * cause a stuck task backtrace, and hence a potential panic
                 * on stuck tasks if that is enabled.
                 */
        } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));

        init_completion(&exit.completion);
        init_task_work(&exit.task_work, io_tctx_exit_cb);
        exit.ctx = ctx;
        /*
         * Some may use context even when all refs and requests have been put,
         * and they are free to do so while still holding uring_lock or
         * completion_lock, see io_req_task_submit(). Apart from other work,
         * this lock/unlock section also waits them to finish.
         */
        mutex_lock(&ctx->uring_lock);
        while (!list_empty(&ctx->tctx_list)) {
                WARN_ON_ONCE(time_after(jiffies, timeout));

                node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
                                        ctx_node);
                /* don't spin on a single task if cancellation failed */
                list_rotate_left(&ctx->tctx_list);
                ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
                if (WARN_ON_ONCE(ret))
                        continue;
                wake_up_process(node->task);

                /* the callback takes uring_lock itself, so drop it to wait */
                mutex_unlock(&ctx->uring_lock);
                /*
                 * See comment above for
                 * wait_for_completion_interruptible_timeout() on why this
                 * wait is marked as interruptible.
                 */
                wait_for_completion_interruptible(&exit.completion);
                mutex_lock(&ctx->uring_lock);
        }
        mutex_unlock(&ctx->uring_lock);
        /* lock/unlock pair: wait out any current completion_lock holder */
        spin_lock(&ctx->completion_lock);
        spin_unlock(&ctx->completion_lock);

        io_ring_ctx_free(ctx);
}

/*
 * Cancel every pending timeout of @ctx that io_match_task() attributes to
 * @tsk / @cancel_all, completing each with -ECANCELED.
 *
 * Returns true if at least one timeout was found and killed.
 */
static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
                             bool cancel_all)
{
        struct io_kiocb *req, *next;
        bool killed = false;

        spin_lock(&ctx->completion_lock);
        spin_lock_irq(&ctx->timeout_lock);
        list_for_each_entry_safe(req, next, &ctx->timeout_list, timeout.list) {
                if (!io_match_task(req, tsk, cancel_all))
                        continue;
                io_kill_timeout(req, -ECANCELED);
                killed = true;
        }
        spin_unlock_irq(&ctx->timeout_lock);

        /* commit under completion_lock, notify waiters after dropping it */
        if (killed)
                io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
        if (killed)
                io_cqring_ev_posted(ctx);

        return killed;
}

/*
 * Start tearing down @ctx: kill its percpu ref, flush CQ overflow, drop all
 * registered personalities, cancel timeouts and poll requests, reap polled
 * completions, and hand the remainder of the teardown to io_ring_exit_work()
 * on system_unbound_wq.
 */
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
        unsigned long index;
        /*
         * Only used as the xa_for_each() cursor. Declared with the real
         * credentials type (as in io_unregister_personality()) instead of
         * the misspelt, otherwise undeclared "struct creds".
         */
        const struct cred *creds;

        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
        if (ctx->rings)
                __io_cqring_overflow_flush(ctx, true);
        xa_for_each(&ctx->personalities, index, creds)
                io_unregister_personality(ctx, index);
        mutex_unlock(&ctx->uring_lock);

        io_kill_timeouts(ctx, NULL, true);
        io_poll_remove_all(ctx, NULL, true);

        /* if we failed setting up the ctx, we might not have any rings */
        io_iopoll_try_reap_events(ctx);

        /* drop cached put refs after potentially doing completions */
        if (current->io_uring)
                io_uring_drop_tctx_refs(current);

        INIT_WORK(&ctx->exit_work, io_ring_exit_work);
        /*
         * Use system_unbound_wq to avoid spawning tons of event kworkers
         * if we're exiting a ton of rings at the same time. It just adds
         * noise and overhead, there's no discernable change in runtime
         * over using system_wq.
         */
        queue_work(system_unbound_wq, &ctx->exit_work);
}

/*
 * ->release() handler of the io_uring file: detach the context from @file
 * and kick off its teardown. Always returns 0.
 */
static int io_uring_release(struct inode *inode, struct file *file)
{
        struct io_ring_ctx *ring = file->private_data;

        file->private_data = NULL;
        io_ring_ctx_wait_and_kill(ring);

        return 0;
}

/*
 * Match criteria handed to io_cancel_task_cb(): @task and @all are forwarded
 * unchanged to io_match_task_safe().
 */
struct io_task_cancel {
        struct task_struct *task;
        bool all;
};

/*
 * io-wq cancel matcher: @data is a struct io_task_cancel; the request
 * embedding @work matches when io_match_task_safe() says so for the task and
 * "all" flag it carries.
 */
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
        struct io_task_cancel *match = data;
        struct io_kiocb *req;

        req = container_of(work, struct io_kiocb, work);
        return io_match_task_safe(req, match->task, match->all);
}

/*
 * Cancel deferred requests of @ctx that match @task / @cancel_all.
 *
 * The defer list is searched from the tail for the last matching entry;
 * everything from the head up to and including that entry is cut off and
 * failed with -ECANCELED. Returns true if anything was cancelled.
 */
static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
                                  struct task_struct *task, bool cancel_all)
{
        struct io_defer_entry *de;
        bool found;
        LIST_HEAD(cut);

        spin_lock(&ctx->completion_lock);
        list_for_each_entry_reverse(de, &ctx->defer_list, list) {
                if (!io_match_task_safe(de->req, task, cancel_all))
                        continue;
                list_cut_position(&cut, &ctx->defer_list, &de->list);
                break;
        }
        spin_unlock(&ctx->completion_lock);

        /* complete the cut-off entries outside of completion_lock */
        found = !list_empty(&cut);
        while (!list_empty(&cut)) {
                de = list_first_entry(&cut, struct io_defer_entry, list);
                list_del_init(&de->list);
                io_req_complete_failed(de->req, -ECANCELED);
                kfree(de);
        }
        return found;
}

/*
 * Ask the io-wq of every task attached to @ctx to cancel work belonging to
 * this ring. Returns true if any io-wq reported something other than
 * IO_WQ_CANCEL_NOTFOUND.
 */
static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
        struct io_tctx_node *node;
        bool found = false;

        mutex_lock(&ctx->uring_lock);
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                struct io_uring_task *tctx = node->task->io_uring;
                enum io_wq_cancel cret;

                /*
                 * io_wq will stay alive while we hold uring_lock, because it's
                 * killed after ctx nodes, which requires to take the lock.
                 */
                if (tctx && tctx->io_wq) {
                        cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb,
                                               ctx, true);
                        if (cret != IO_WQ_CANCEL_NOTFOUND)
                                found = true;
                }
        }
        mutex_unlock(&ctx->uring_lock);

        return found;
}

/*
 * Keep cancelling requests of @ctx until one full pass finds nothing left to
 * cancel.
 *
 * @task:       restrict cancellation to this task's requests; NULL means the
 *              whole ring (io-wq work is then matched by ctx, not by task)
 * @cancel_all: forwarded to the individual matchers
 *
 * Each pass covers io-wq work, polled IO, deferred requests, poll requests
 * and timeouts, and runs pending task_work when @task is set.
 */
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         bool cancel_all)
{
        struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
        struct io_uring_task *tctx = task ? task->io_uring : NULL;

        while (1) {
                enum io_wq_cancel cret;
                /* set when this pass cancelled or reaped anything */
                bool ret = false;

                if (!task) {
                        ret |= io_uring_try_cancel_iowq(ctx);
                } else if (tctx && tctx->io_wq) {
                        /*
                         * Cancels requests of all rings, not only @ctx, but
                         * it's fine as the task is in exit/exec.
                         */
                        cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
                                               &cancel, true);
                        ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
                }

                /* SQPOLL thread does its own polling */
                if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
                    (ctx->sq_data && ctx->sq_data->thread == current)) {
                        while (!list_empty_careful(&ctx->iopoll_list)) {
                                io_iopoll_try_reap_events(ctx);
                                ret = true;
                                cond_resched();
                        }
                }

                ret |= io_cancel_defer_files(ctx, task, cancel_all);
                ret |= io_poll_remove_all(ctx, task, cancel_all);
                ret |= io_kill_timeouts(ctx, task, cancel_all);
                if (task)
                        ret |= io_run_task_work();
                /* a quiet pass means there is nothing more to cancel */
                if (!ret)
                        break;
                cond_resched();
        }
}

/*
 * Slow path of io_uring_add_tctx_node(): make sure the current task has an
 * io_uring task context and a node linking it to @ctx, then cache @ctx as the
 * task's most recently used ring.
 *
 * Returns 0 on success, -ENOMEM if the node cannot be allocated, or the
 * error from task-context allocation, io_wq_max_workers() or xa_store().
 */
static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *node;
        int ret;

        if (unlikely(!tctx)) {
                ret = io_uring_alloc_task_context(current, ctx);
                if (unlikely(ret))
                        return ret;

                tctx = current->io_uring;
                /* apply the ring's io-wq worker limits to the new io-wq */
                if (ctx->iowq_limits_set) {
                        unsigned int limits[2];

                        limits[0] = ctx->iowq_limits[0];
                        limits[1] = ctx->iowq_limits[1];

                        ret = io_wq_max_workers(tctx->io_wq, limits);
                        if (ret)
                                return ret;
                }
        }

        /* already linked to this ring: only the cache needs refreshing */
        if (xa_load(&tctx->xa, (unsigned long)ctx))
                goto out_cache;

        node = kmalloc(sizeof(*node), GFP_KERNEL);
        if (!node)
                return -ENOMEM;
        node->ctx = ctx;
        node->task = current;

        ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
                                node, GFP_KERNEL));
        if (ret) {
                kfree(node);
                return ret;
        }

        mutex_lock(&ctx->uring_lock);
        list_add(&node->ctx_node, &ctx->tctx_list);
        mutex_unlock(&ctx->uring_lock);

out_cache:
        tctx->last = ctx;
        return 0;
}

/*
 * Record that the current task has used @ctx; this is what cancelation later
 * relies on. The common case, where @ctx is already the task's cached
 * ->last ring, returns 0 without further work.
 */
static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
        struct io_uring_task *tctx = current->io_uring;

        if (unlikely(!tctx || tctx->last != ctx))
                return __io_uring_add_tctx_node(ctx);
        return 0;
}

/*
 * Remove this io_uring_file -> task mapping.
 */
static void io_uring_del_tctx_node(unsigned long index)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *node;

        if (!tctx)
                return;
        node = xa_erase(&tctx->xa, index);
        if (!node)
                return;

        WARN_ON_ONCE(current != node->task);
        WARN_ON_ONCE(list_empty(&node->ctx_node));

        mutex_lock(&node->ctx->uring_lock);
        list_del(&node->ctx_node);
        mutex_unlock(&node->ctx->uring_lock);

        if (tctx->last == node->ctx)
                tctx->last = NULL;
        kfree(node);
}

static void io_uring_clean_tctx(struct io_uring_task *tctx)
{
        struct io_wq *wq = tctx->io_wq;
        struct io_tctx_node *node;
        unsigned long index;

        xa_for_each(&tctx->xa, index, node) {
                io_uring_del_tctx_node(index);
                cond_resched();
        }
        if (wq) {
                /*
                 * Must be after io_uring_del_task_file() (removes nodes under
                 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
                 */
                io_wq_put_and_exit(wq);
                tctx->io_wq = NULL;
        }
}

/*
 * Number of in-flight requests of @tctx: the ->inflight_tracked atomic when
 * @tracked is set, otherwise the summed ->inflight per-cpu counter.
 */
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
        s64 count;

        if (tracked)
                count = atomic_read(&tctx->inflight_tracked);
        else
                count = percpu_counter_sum(&tctx->inflight);
        return count;
}

/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
 *
 * Loops until tctx_inflight() reports no in-flight requests, sleeping on
 * tctx->wait between rounds when a round made no progress. With @cancel_all
 * set, the task's io_uring context is freed at the end.
 */
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx;
        s64 inflight;
        DEFINE_WAIT(wait);

        WARN_ON_ONCE(sqd && sqd->thread != current);

        /* task never used io_uring, nothing to cancel */
        if (!current->io_uring)
                return;
        if (tctx->io_wq)
                io_wq_exit_start(tctx->io_wq);

        /* flag cancellation in progress; see io_tctx_exit_cb() */
        atomic_inc(&tctx->in_idle);
        do {
                io_uring_drop_tctx_refs(current);
                /* read completions before cancelations */
                inflight = tctx_inflight(tctx, !cancel_all);
                if (!inflight)
                        break;

                if (!sqd) {
                        struct io_tctx_node *node;
                        unsigned long index;

                        xa_for_each(&tctx->xa, index, node) {
                                /* sqpoll task will cancel all its requests */
                                if (node->ctx->sq_data)
                                        continue;
                                io_uring_try_cancel_requests(node->ctx, current,
                                                             cancel_all);
                        }
                } else {
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_uring_try_cancel_requests(ctx, current,
                                                             cancel_all);
                }

                prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
                io_run_task_work();
                io_uring_drop_tctx_refs(current);

                /*
                 * If we've seen completions, retry without waiting. This
                 * avoids a race where a completion comes in before we did
                 * prepare_to_wait().
                 */
                if (inflight == tctx_inflight(tctx, !cancel_all))
                        schedule();
                finish_wait(&tctx->wait, &wait);
        } while (1);

        io_uring_clean_tctx(tctx);
        if (cancel_all) {
                /*
                 * We shouldn't run task_works after cancel, so just leave
                 * ->in_idle set for normal exit.
                 */
                atomic_dec(&tctx->in_idle);
                /* for exec all current's requests should be gone, kill tctx */
                __io_uring_free(current);
        }
}

/*
 * Cancel io_uring requests on behalf of the calling task. This is a thin
 * wrapper around io_uring_cancel_generic() that passes no SQPOLL data
 * (sqd == NULL); @cancel_all is forwarded unchanged.
 */
void __io_uring_cancel(bool cancel_all)
{
        io_uring_cancel_generic(cancel_all, NULL);
}

/*
 * Translate an mmap page offset into the kernel address of the ring memory
 * it refers to.
 *
 * The SQ and CQ ring offsets both resolve to ctx->rings, the SQE offset
 * resolves to ctx->sq_sqes. Any other offset, or a request larger than the
 * (compound) page backing the region, yields ERR_PTR(-EINVAL).
 */
static void *io_uring_validate_mmap_request(struct file *file,
                                            loff_t pgoff, size_t sz)
{
        struct io_ring_ctx *ctx = file->private_data;
        loff_t byte_off = pgoff << PAGE_SHIFT;
        void *base;

        switch (byte_off) {
        case IORING_OFF_SQES:
                base = ctx->sq_sqes;
                break;
        case IORING_OFF_SQ_RING:
        case IORING_OFF_CQ_RING:
                base = ctx->rings;
                break;
        default:
                return ERR_PTR(-EINVAL);
        }

        /* the mapping may not extend past the backing allocation */
        if (sz > page_size(virt_to_head_page(base)))
                return ERR_PTR(-EINVAL);

        return base;
}

#ifdef CONFIG_MMU

/*
 * mmap handler (MMU build): validate the requested region and map the
 * physical pages behind it into the caller's VMA.
 */
static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
        size_t len = vma->vm_end - vma->vm_start;
        unsigned long first_pfn;
        void *kaddr;

        kaddr = io_uring_validate_mmap_request(file, vma->vm_pgoff, len);
        if (IS_ERR(kaddr))
                return PTR_ERR(kaddr);

        first_pfn = virt_to_phys(kaddr) >> PAGE_SHIFT;

        return remap_pfn_range(vma, vma->vm_start, first_pfn, len,
                               vma->vm_page_prot);
}

#else /* !CONFIG_MMU */

/*
 * mmap handler (!MMU build): only accept mappings that carry VM_SHARED or
 * VM_MAYSHARE; everything else is rejected with -EINVAL.
 */
static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
                return 0;

        return -EINVAL;
}

/*
 * !MMU only: report the mapping capabilities of an io_uring file, namely
 * direct mapping with read and write access.
 */
static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
        return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

/*
 * !MMU only: the "unmapped area" for a ring mapping is simply the kernel
 * address of the ring memory selected by @pgoff. A failed validation is
 * returned as the errno encoded in the unsigned long result.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
        unsigned long addr, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
        void *region = io_uring_validate_mmap_request(file, pgoff, len);

        if (!IS_ERR(region))
                return (unsigned long) region;

        return PTR_ERR(region);
}

#endif /* !CONFIG_MMU */

static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
        DEFINE_WAIT(wait);

        do {
                if (!io_sqring_full(ctx))
                        break;
                prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);

                if (!io_sqring_full(ctx))
                        break;
                schedule();
        } while (!signal_pending(current));

        finish_wait(&ctx->sqo_sq_wait, &wait);
        return 0;
}

/*
 * Decode the (argp, argsz) pair passed to io_uring_enter().
 *
 * Without IORING_ENTER_EXT_ARG, @argp is the sigset pointer itself and no
 * timeout is supplied. With it, @argp points to a struct
 * io_uring_getevents_arg whose size must match *@argsz exactly and whose
 * pad field must be zero; the sigmask pointer, sigmask size and timespec
 * pointer are then taken from that struct.
 *
 * Returns 0 on success, -EINVAL or -EFAULT otherwise. Outputs are only
 * written on success.
 */
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
                          struct __kernel_timespec __user **ts,
                          const sigset_t __user **sig)
{
        struct io_uring_getevents_arg ext;

        if (flags & IORING_ENTER_EXT_ARG) {
                /* both sides must agree on the layout of the extended arg */
                if (*argsz != sizeof(ext))
                        return -EINVAL;
                if (copy_from_user(&ext, argp, sizeof(ext)))
                        return -EFAULT;
                if (ext.pad)
                        return -EINVAL;

                *ts = u64_to_user_ptr(ext.ts);
                *sig = u64_to_user_ptr(ext.sigmask);
                *argsz = ext.sigmask_sz;
                return 0;
        }

        /* legacy form: argp is just a pointer to the sigset_t */
        *ts = NULL;
        *sig = (const sigset_t __user *) argp;
        return 0;
}

/*
 * io_uring_enter() system call: submit up to @to_submit SQEs on the ring
 * referred to by @fd and/or wait for @min_complete completions.
 *
 * @flags may only contain IORING_ENTER_GETEVENTS, IORING_ENTER_SQ_WAKEUP,
 * IORING_ENTER_SQ_WAIT and IORING_ENTER_EXT_ARG. @argp/@argsz are decoded
 * by io_get_ext_arg() when IORING_ENTER_GETEVENTS is set.
 *
 * Returns the number of submitted entries if that is non-zero, otherwise
 * the result of the last step performed (0 or a negative errno).
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                u32, min_complete, u32, flags, const void __user *, argp,
                size_t, argsz)
{
        struct io_ring_ctx *ctx;
        int submitted = 0;
        struct fd f;
        long ret;

        io_run_task_work();

        /* reject any flag bit we do not know about */
        if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
                               IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
                return -EINVAL;

        f = fdget(fd);
        if (unlikely(!f.file))
                return -EBADF;

        /* the fd must be an io_uring instance */
        ret = -EOPNOTSUPP;
        if (unlikely(f.file->f_op != &io_uring_fops))
                goto out_fput;

        /* pin the ctx for the duration of the call; fails if refs were killed */
        ret = -ENXIO;
        ctx = f.file->private_data;
        if (unlikely(!percpu_ref_tryget(&ctx->refs)))
                goto out_fput;

        /* rings created with R_DISABLED must be enabled before use */
        ret = -EBADFD;
        if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
                goto out;

        /*
         * For SQ polling, the thread will do all submissions and completions.
         * Just return the requested submit count, and wake the thread if
         * we were asked to.
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                io_cqring_overflow_flush(ctx);

                /* no SQ thread: report -EOWNERDEAD */
                if (unlikely(ctx->sq_data->thread == NULL)) {
                        ret = -EOWNERDEAD;
                        goto out;
                }
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT) {
                        ret = io_sqpoll_wait_sq(ctx);
                        if (ret)
                                goto out;
                }
                submitted = to_submit;
        } else if (to_submit) {
                ret = io_uring_add_tctx_node(ctx);
                if (unlikely(ret))
                        goto out;
                mutex_lock(&ctx->uring_lock);
                submitted = io_submit_sqes(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);

                /* short (or failed) submission: skip the wait phase */
                if (submitted != to_submit)
                        goto out;
        }
        if (flags & IORING_ENTER_GETEVENTS) {
                const sigset_t __user *sig;
                struct __kernel_timespec __user *ts;

                ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
                if (unlikely(ret))
                        goto out;

                /* never wait for more completions than the CQ ring holds */
                min_complete = min(min_complete, ctx->cq_entries);

                /*
                 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
                 * space applications don't need to do io completion events
                 * polling again, they can rely on io_sq_thread to do polling
                 * work, which can reduce cpu usage and uring_lock contention.
                 */
                if (ctx->flags & IORING_SETUP_IOPOLL &&
                    !(ctx->flags & IORING_SETUP_SQPOLL)) {
                        ret = io_iopoll_check(ctx, min_complete);
                } else {
                        ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
                }
        }

out:
        percpu_ref_put(&ctx->refs);
out_fput:
        fdput(f);
        /* a non-zero submit count takes precedence over ret */
        return submitted ? submitted : ret;
}

#ifdef CONFIG_PROC_FS
/*
 * Print one registered personality to @m: its @id, then the uid, gid,
 * supplementary group and effective capability lines of @cred. All ids are
 * mapped with from_k{u,g}id_munged() against seq_user_ns(m).
 * Always returns 0.
 */
static int io_uring_show_cred(struct seq_file *m, unsigned int id,
                const struct cred *cred)
{
        struct user_namespace *ns = seq_user_ns(m);
        struct group_info *groups;
        const char *sep = "";
        kernel_cap_t eff;
        unsigned word;
        int n;

        seq_printf(m, "%5d\n", id);

        /* real, effective, saved and filesystem uid */
        seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(ns, cred->uid));
        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(ns, cred->euid));
        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(ns, cred->suid));
        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(ns, cred->fsuid));

        /* real, effective, saved and filesystem gid */
        seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(ns, cred->gid));
        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(ns, cred->egid));
        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(ns, cred->sgid));
        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(ns, cred->fsgid));

        /* supplementary groups, space separated */
        seq_puts(m, "\n\tGroups:\t");
        groups = cred->group_info;
        for (n = 0; n < groups->ngroups; n++) {
                seq_put_decimal_ull(m, sep,
                                        from_kgid_munged(ns, groups->gid[n]));
                sep = " ";
        }

        /* effective capabilities, highest 32-bit word first */
        seq_puts(m, "\n\tCapEff:\t");
        eff = cred->cap_effective;
        CAP_FOR_EACH_U32(word)
                seq_put_hex_ll(m, NULL, eff.cap[CAP_LAST_U32 - word], 8);
        seq_putc(m, '\n');

        return 0;
}

/*
 * Write the fdinfo text for @ctx to @m: SQ thread pid/cpu, registered
 * files, registered buffers, personalities and the poll hash list.
 *
 * ctx->uring_lock is only try-locked. If that fails, the counts are still
 * printed but the per-entry listings guarded by has_lock (files, buffers,
 * personalities, SQ thread details) are skipped. The poll list is walked
 * under ctx->completion_lock regardless.
 */
static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
        /* -1 is printed when the SQ thread info is not available */
        int sq_pid = -1, sq_cpu = -1;
        bool has_lock;
        int i;

        /*
         * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
         * since fdinfo case grabs it in the opposite direction of normal use
         * cases. If we fail to get the lock, we just don't iterate any
         * structures that could be going away outside the io_uring mutex.
         */
        has_lock = mutex_trylock(&ctx->uring_lock);

        if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
                struct io_sq_data *sq = ctx->sq_data;

                /* sq->thread is only read while holding sq->lock */
                if (mutex_trylock(&sq->lock)) {
                        if (sq->thread) {
                                sq_pid = task_pid_nr(sq->thread);
                                sq_cpu = task_cpu(sq->thread);
                        }
                        mutex_unlock(&sq->lock);
                }
        }

        seq_printf(m, "SqThread:\t%d\n", sq_pid);
        seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
        seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
        /* one line per registered file slot; empty slots print <none> */
        for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
                struct file *f = io_file_from_index(ctx, i);

                if (f)
                        seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
                else
                        seq_printf(m, "%5u: <none>\n", i);
        }
        seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
        /* one line per registered buffer: start address / length */
        for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
                struct io_mapped_ubuf *buf = ctx->user_bufs[i];
                unsigned int len = buf->ubuf_end - buf->ubuf;

                seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
        }
        if (has_lock && !xa_empty(&ctx->personalities)) {
                unsigned long index;
                const struct cred *cred;

                seq_printf(m, "Personalities:\n");
                xa_for_each(&ctx->personalities, index, cred)
                        io_uring_show_cred(m, index, cred);
        }
        seq_printf(m, "PollList:\n");
        /* walk every bucket of the cancel hash under completion_lock */
        spin_lock(&ctx->completion_lock);
        for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
                struct hlist_head *list = &ctx->cancel_hash[i];
                struct io_kiocb *req;

                hlist_for_each_entry(req, list, hash_node)
                        seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
                                        req->task->task_works != NULL);
        }
        spin_unlock(&ctx->completion_lock);
        if (has_lock)
                mutex_unlock(&ctx->uring_lock);
}

/*
 * ->show_fdinfo hook: emit ring details for @f, but only if a reference on
 * the ctx can still be obtained. Prints nothing otherwise.
 */
static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct io_ring_ctx *ctx = f->private_data;

        if (!percpu_ref_tryget(&ctx->refs))
                return;

        __io_uring_show_fdinfo(ctx, m);
        percpu_ref_put(&ctx->refs);
}
#endif

/*
 * File operations backing an io_uring fd. The nommu-only hooks are wired
 * up when CONFIG_MMU is not set, and ->show_fdinfo only when procfs is
 * built in. Other code in this file compares f_op against this table to
 * recognise io_uring files.
 */
static const struct file_operations io_uring_fops = {
        .release        = io_uring_release,
        .mmap                = io_uring_mmap,
#ifndef CONFIG_MMU
        .get_unmapped_area = io_uring_nommu_get_unmapped_area,
        .mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
        .poll                = io_uring_poll,
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = io_uring_show_fdinfo,
#endif
};

/* Return true if @file is an io_uring file, i.e. its f_op is io_uring_fops. */
bool io_is_uring_fops(struct file *file)
{
        return file->f_op == &io_uring_fops;
}

/*
 * Allocate the shared SQ/CQ ring structure and the SQE array for @ctx,
 * sized from p->sq_entries / p->cq_entries, and initialise the ring masks
 * and entry counts.
 *
 * Returns 0 on success, -EOVERFLOW if a size computation saturates, or
 * -ENOMEM on allocation failure. If the SQE array cannot be set up, the
 * already allocated rings are freed and ctx->rings is reset to NULL.
 */
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
                                  struct io_uring_params *p)
{
        size_t bytes, sq_array_offset;
        struct io_rings *r;
        int err;

        /* make sure these are sane, as we already accounted them */
        ctx->sq_entries = p->sq_entries;
        ctx->cq_entries = p->cq_entries;

        bytes = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
        if (bytes == SIZE_MAX)
                return -EOVERFLOW;

        r = io_mem_alloc(bytes);
        if (!r)
                return -ENOMEM;

        ctx->rings = r;
        ctx->sq_array = (u32 *)((char *)r + sq_array_offset);
        r->sq_ring_mask = p->sq_entries - 1;
        r->cq_ring_mask = p->cq_entries - 1;
        r->sq_ring_entries = p->sq_entries;
        r->cq_ring_entries = p->cq_entries;

        bytes = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
        if (bytes == SIZE_MAX) {
                err = -EOVERFLOW;
                goto free_rings;
        }

        ctx->sq_sqes = io_mem_alloc(bytes);
        if (!ctx->sq_sqes) {
                err = -ENOMEM;
                goto free_rings;
        }

        return 0;

free_rings:
        io_mem_free(ctx->rings);
        ctx->rings = NULL;
        return err;
}

/*
 * Reserve a descriptor, run io_uring_add_tctx_node() for @ctx and, if that
 * succeeds, bind @file to the descriptor.
 *
 * Returns the new fd, or a negative errno. On failure the reserved
 * descriptor is released again; @file is left for the caller to drop.
 */
static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
{
        int fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
        int err;

        if (fd < 0)
                return fd;

        err = io_uring_add_tctx_node(ctx);
        if (!err) {
                fd_install(fd, file);
                return fd;
        }

        put_unused_fd(fd);
        return err;
}

/*
 * Allocate an anonymous-inode file named "[io_uring]" for @ctx. This is
 * what constitutes the application visible backing of an io_uring
 * instance: the application mmaps the resulting fd to gain access to the
 * SQ/CQ ring details.
 *
 * The file uses io_uring_fops, carries @ctx as its private data and is
 * opened O_RDWR | O_CLOEXEC. The return value is whatever
 * anon_inode_getfile() returns (callers check it with IS_ERR()).
 */
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
        return anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
                                  O_RDWR | O_CLOEXEC);
}

/*
 * Create a new io_uring instance.
 *
 * @entries: requested SQ ring size; must be non-zero and is rounded up to
 *           a power of two (clamped to IORING_MAX_ENTRIES only when
 *           IORING_SETUP_CLAMP is set, otherwise oversize is -EINVAL).
 * @p:       kernel copy of the setup parameters; sq/cq sizes, ring offsets
 *           and feature flags are filled in here.
 * @params:  user pointer that receives the updated *@p.
 *
 * Returns the installed ring fd on success or a negative errno. Failures
 * after the ctx has been allocated tear it down through
 * io_ring_ctx_wait_and_kill(), except for a failed fd install, where the
 * fput() of the already created file takes care of cleanup.
 */
static int io_uring_create(unsigned entries, struct io_uring_params *p,
                           struct io_uring_params __user *params)
{
        struct io_ring_ctx *ctx;
        struct file *file;
        int ret;

        if (!entries)
                return -EINVAL;
        if (entries > IORING_MAX_ENTRIES) {
                if (!(p->flags & IORING_SETUP_CLAMP))
                        return -EINVAL;
                entries = IORING_MAX_ENTRIES;
        }

        /*
         * Use twice as many entries for the CQ ring. It's possible for the
         * application to drive a higher depth than the size of the SQ ring,
         * since the sqes are only used at submission time. This allows for
         * some flexibility in overcommitting a bit. If the application has
         * set IORING_SETUP_CQSIZE, it will have passed in the desired number
         * of CQ ring entries manually.
         */
        p->sq_entries = roundup_pow_of_two(entries);
        if (p->flags & IORING_SETUP_CQSIZE) {
                /*
                 * If IORING_SETUP_CQSIZE is set, we do the same roundup
                 * to a power-of-two, if it isn't already. We do NOT impose
                 * any cq vs sq ring sizing.
                 */
                if (!p->cq_entries)
                        return -EINVAL;
                if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
                        if (!(p->flags & IORING_SETUP_CLAMP))
                                return -EINVAL;
                        p->cq_entries = IORING_MAX_CQ_ENTRIES;
                }
                p->cq_entries = roundup_pow_of_two(p->cq_entries);
                /* the CQ ring may not be smaller than the SQ ring */
                if (p->cq_entries < p->sq_entries)
                        return -EINVAL;
        } else {
                p->cq_entries = 2 * p->sq_entries;
        }

        ctx = io_ring_ctx_alloc(p);
        if (!ctx)
                return -ENOMEM;
        ctx->compat = in_compat_syscall();
        /* only record the user when the task lacks CAP_IPC_LOCK */
        if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
                ctx->user = get_uid(current_user());

        /*
         * This is just grabbed for accounting purposes. When a process exits,
         * the mm is exited and dropped before the files, hence we need to hang
         * on to this mm purely for the purposes of being able to unaccount
         * memory (locked/pinned vm). It's not used for anything else.
         */
        mmgrab(current->mm);
        ctx->mm_account = current->mm;

        ret = io_allocate_scq_urings(ctx, p);
        if (ret)
                goto err;

        ret = io_sq_offload_create(ctx, p);
        if (ret)
                goto err;
        /* always set a rsrc node */
        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                goto err;
        io_rsrc_node_switch(ctx, NULL);

        /* report where the SQ ring fields live inside the mmap'ed region */
        memset(&p->sq_off, 0, sizeof(p->sq_off));
        p->sq_off.head = offsetof(struct io_rings, sq.head);
        p->sq_off.tail = offsetof(struct io_rings, sq.tail);
        p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
        p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
        p->sq_off.flags = offsetof(struct io_rings, sq_flags);
        p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
        p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

        /* same for the CQ ring fields */
        memset(&p->cq_off, 0, sizeof(p->cq_off));
        p->cq_off.head = offsetof(struct io_rings, cq.head);
        p->cq_off.tail = offsetof(struct io_rings, cq.tail);
        p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
        p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
        p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
        p->cq_off.cqes = offsetof(struct io_rings, cqes);
        p->cq_off.flags = offsetof(struct io_rings, cq_flags);

        /* feature bits advertised to userspace */
        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
                        IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
                        IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
                        IORING_FEAT_RSRC_TAGS;

        if (copy_to_user(params, p, sizeof(*p))) {
                ret = -EFAULT;
                goto err;
        }

        file = io_uring_get_file(ctx);
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
                goto err;
        }

        /*
         * Install ring fd as the very last thing, so we don't risk someone
         * having closed it before we finish setup
         */
        ret = io_uring_install_fd(ctx, file);
        if (ret < 0) {
                /* fput will clean it up */
                fput(file);
                return ret;
        }

        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
        return ret;
err:
        io_ring_ctx_wait_and_kill(ctx);
        return ret;
}

/*
 * Set up an io_uring context and return its fd. The application asks for a
 * ring size; the actual sq/cq ring sizes (among other things) are returned
 * in the params structure that was passed in.
 *
 * Fails with -EFAULT if @params cannot be read, and with -EINVAL if any
 * reserved field is non-zero or an unknown setup flag is set.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
        struct io_uring_params p;
        int idx;

        if (copy_from_user(&p, params, sizeof(p)))
                return -EFAULT;

        /* reserved words must be zero */
        for (idx = 0; idx < ARRAY_SIZE(p.resv); idx++)
                if (p.resv[idx])
                        return -EINVAL;

        /* only the setup flags listed here are accepted */
        if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
                        IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
                        IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
                        IORING_SETUP_R_DISABLED))
                return -EINVAL;

        return io_uring_create(entries, &p, params);
}

/*
 * io_uring_setup() system call entry point; forwards both arguments
 * unchanged to the static io_uring_setup() helper and returns its result.
 */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
                struct io_uring_params __user *, params)
{
        return io_uring_setup(entries, params);
}

/*
 * Fill a userspace struct io_uring_probe with the opcodes this kernel
 * knows about.
 *
 * @arg must point to a probe structure with room for @nr_args ops, and the
 * whole area must be zero-filled on entry (-EINVAL otherwise). At most
 * IORING_OP_LAST entries are populated; each one whose io_op_defs[] entry
 * is not marked not_supported gets IO_URING_OP_SUPPORTED set.
 *
 * Returns 0, -EOVERFLOW, -ENOMEM, -EFAULT or -EINVAL.
 */
static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
        struct io_uring_probe *probe;
        size_t bytes;
        int op, ret;

        bytes = struct_size(probe, ops, nr_args);
        if (bytes == SIZE_MAX)
                return -EOVERFLOW;

        probe = kzalloc(bytes, GFP_KERNEL);
        if (!probe)
                return -ENOMEM;

        if (copy_from_user(probe, arg, bytes)) {
                ret = -EFAULT;
                goto free_probe;
        }
        /* userspace must hand us an all-zero structure */
        if (memchr_inv(probe, 0, bytes)) {
                ret = -EINVAL;
                goto free_probe;
        }

        probe->last_op = IORING_OP_LAST - 1;
        if (nr_args > IORING_OP_LAST)
                nr_args = IORING_OP_LAST;

        for (op = 0; op < nr_args; op++) {
                probe->ops[op].op = op;
                if (!io_op_defs[op].not_supported)
                        probe->ops[op].flags = IO_URING_OP_SUPPORTED;
        }
        probe->ops_len = op;

        ret = copy_to_user(arg, probe, bytes) ? -EFAULT : 0;

free_probe:
        kfree(probe);
        return ret;
}

/*
 * Register the current task's credentials as a personality of @ctx.
 *
 * A reference on the creds is taken and stored in ctx->personalities under
 * a cyclically allocated id in [0, USHRT_MAX]. Returns that id, or a
 * negative errno (in which case the cred reference is dropped again).
 */
static int io_register_personality(struct io_ring_ctx *ctx)
{
        const struct cred *creds = get_current_cred();
        u32 id;
        int err;

        err = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
                        XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
        if (err >= 0)
                return id;

        put_cred(creds);
        return err;
}

/*
 * Install the set of restrictions described by the @nr_args entries of
 * struct io_uring_restriction at @arg.
 *
 * Only allowed while the ring is still IORING_SETUP_R_DISABLED (-EBADFD
 * otherwise) and only once per ring (-EBUSY otherwise). Any invalid entry
 * makes the whole call fail with -EINVAL and wipes ctx->restrictions.
 */
static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
                                    unsigned int nr_args)
{
        struct io_uring_restriction *table;
        size_t bytes;
        int idx, err = 0;

        /* Restrictions allowed only if rings started disabled */
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        /* We allow only a single restrictions registration */
        if (ctx->restrictions.registered)
                return -EBUSY;

        if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
                return -EINVAL;

        bytes = array_size(nr_args, sizeof(*table));
        if (bytes == SIZE_MAX)
                return -EOVERFLOW;

        table = memdup_user(arg, bytes);
        if (IS_ERR(table))
                return PTR_ERR(table);

        for (idx = 0; idx < nr_args; idx++) {
                const struct io_uring_restriction *entry = &table[idx];

                switch (entry->opcode) {
                case IORING_RESTRICTION_REGISTER_OP:
                        /* whitelist one io_uring_register() opcode */
                        if (entry->register_op >= IORING_REGISTER_LAST) {
                                err = -EINVAL;
                                goto done;
                        }
                        __set_bit(entry->register_op,
                                  ctx->restrictions.register_op);
                        break;
                case IORING_RESTRICTION_SQE_OP:
                        /* whitelist one SQE opcode */
                        if (entry->sqe_op >= IORING_OP_LAST) {
                                err = -EINVAL;
                                goto done;
                        }
                        __set_bit(entry->sqe_op, ctx->restrictions.sqe_op);
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
                        ctx->restrictions.sqe_flags_allowed = entry->sqe_flags;
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
                        ctx->restrictions.sqe_flags_required = entry->sqe_flags;
                        break;
                default:
                        err = -EINVAL;
                        goto done;
                }
        }

done:
        /* Reset all restrictions if an error happened */
        if (err)
                memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
        else
                ctx->restrictions.registered = true;

        kfree(table);
        return err;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        if (ctx->restrictions.registered)
                ctx->restricted = 1;

        ctx->flags &= ~IORING_SETUP_R_DISABLED;
        if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        return 0;
}

/*
 * Common tail of the resource update paths: reject an offset + count that
 * overflows 32 bits, start a rsrc node switch and dispatch to the file or
 * buffer updater selected by @type. Unknown types return -EINVAL.
 */
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
                                     struct io_uring_rsrc_update2 *up,
                                     unsigned nr_args)
{
        __u32 end;
        int ret;

        if (check_add_overflow(up->offset, nr_args, &end))
                return -EOVERFLOW;

        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                return ret;

        if (type == IORING_RSRC_FILE)
                return __io_sqe_files_update(ctx, up, nr_args);
        if (type == IORING_RSRC_BUFFER)
                return __io_sqe_buffers_update(ctx, up, nr_args);

        return -EINVAL;
}

/*
 * Update registered files from a struct io_uring_rsrc_update at @arg.
 *
 * Only sizeof(struct io_uring_rsrc_update) bytes are copied from
 * userspace; the local update2 structure is zeroed first so the remaining
 * fields stay zero. @nr_args must be non-zero and the reserved fields must
 * be clear.
 */
static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
                                    unsigned nr_args)
{
        struct io_uring_rsrc_update2 upd;

        if (!nr_args)
                return -EINVAL;

        memset(&upd, 0, sizeof(upd));
        if (copy_from_user(&upd, arg, sizeof(struct io_uring_rsrc_update)))
                return -EFAULT;

        if (upd.resv)
                return -EINVAL;
        if (upd.resv2)
                return -EINVAL;

        return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &upd, nr_args);
}

/*
 * Update registered resources of @type from a struct io_uring_rsrc_update2
 * at @arg. @size must equal the structure size exactly; the element count
 * comes from the structure's nr field, which must be non-zero, and the
 * reserved fields must be clear.
 */
static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
                                   unsigned size, unsigned type)
{
        struct io_uring_rsrc_update2 upd;

        if (size != sizeof(upd))
                return -EINVAL;
        if (copy_from_user(&upd, arg, sizeof(upd)))
                return -EFAULT;

        if (!upd.nr)
                return -EINVAL;
        if (upd.resv || upd.resv2)
                return -EINVAL;

        return __io_register_rsrc_update(ctx, type, &upd, upd.nr);
}

/*
 * Register files or buffers (selected by @type) described by a struct
 * io_uring_rsrc_register at @arg. @size must match the structure size, nr
 * must be non-zero and the reserved fields must be clear. Unknown types
 * return -EINVAL.
 */
static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int size, unsigned int type)
{
        struct io_uring_rsrc_register reg;

        /* keep it extendible */
        if (size != sizeof(reg))
                return -EINVAL;

        memset(&reg, 0, sizeof(reg));
        if (copy_from_user(&reg, arg, size))
                return -EFAULT;
        if (!reg.nr || reg.resv || reg.resv2)
                return -EINVAL;

        if (type == IORING_RSRC_FILE)
                return io_sqe_files_register(ctx, u64_to_user_ptr(reg.data),
                                             reg.nr, u64_to_user_ptr(reg.tags));
        if (type == IORING_RSRC_BUFFER)
                return io_sqe_buffers_register(ctx, u64_to_user_ptr(reg.data),
                                               reg.nr, u64_to_user_ptr(reg.tags));

        return -EINVAL;
}

/*
 * Set the CPU affinity of the current task's io-wq from a user supplied
 * bitmask of @len bytes (truncated to cpumask_size()).
 *
 * Compat callers have their bitmap converted with compat_get_bitmap().
 * Returns -EINVAL if the task has no io-wq, -ENOMEM, -EFAULT, or the
 * result of io_wq_cpu_affinity().
 */
static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
                                unsigned len)
{
        struct io_uring_task *tctx = current->io_uring;
        cpumask_var_t mask;
        int ret;

        if (!tctx || !tctx->io_wq)
                return -EINVAL;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        /* start from an empty mask; userspace may supply fewer bytes */
        cpumask_clear(mask);
        if (len > cpumask_size())
                len = cpumask_size();

#ifdef CONFIG_COMPAT
        if (in_compat_syscall())
                ret = compat_get_bitmap(cpumask_bits(mask),
                                        (const compat_ulong_t __user *)arg,
                                        len * 8 /* CHAR_BIT */);
        else
#endif
                ret = copy_from_user(mask, arg, len);

        if (ret)
                ret = -EFAULT;
        else
                ret = io_wq_cpu_affinity(tctx->io_wq, mask);

        free_cpumask_var(mask);
        return ret;
}

/*
 * Drop a previously set io-wq CPU affinity for the current task by passing
 * a NULL mask to io_wq_cpu_affinity(). Returns -EINVAL if the task has no
 * io-wq.
 */
static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
        struct io_uring_task *tctx = current->io_uring;

        if (tctx && tctx->io_wq)
                return io_wq_cpu_affinity(tctx->io_wq, NULL);

        return -EINVAL;
}

/*
 * Apply new io-wq worker limits supplied by userspace.
 *
 * @arg points to two __u32 values, each of which must not exceed INT_MAX;
 * a zero entry leaves the corresponding ctx->iowq_limits[] slot unchanged.
 * On success the contents of new_count after io_wq_max_workers() ran (or
 * zeroes if no io-wq was available) are copied back to @arg.
 *
 * For SQPOLL rings the limits are applied to the SQ thread's io-wq only;
 * otherwise they are applied to the caller's io-wq and then propagated to
 * every task on ctx->tctx_list.
 *
 * Called with ctx->uring_lock held and returns with it held. The lock is
 * dropped temporarily whenever sqd->lock is taken or the sqd reference is
 * put, to respect the sqd->lock -> ctx->uring_lock ordering.
 *
 * Returns 0, -EFAULT, -EINVAL or the error from io_wq_max_workers().
 */
static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
                                        void __user *arg)
        __must_hold(&ctx->uring_lock)
{
        struct io_tctx_node *node;
        struct io_uring_task *tctx = NULL;
        struct io_sq_data *sqd = NULL;
        __u32 new_count[2];
        int i, ret;

        if (copy_from_user(new_count, arg, sizeof(new_count)))
                return -EFAULT;
        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i] > INT_MAX)
                        return -EINVAL;

        if (ctx->flags & IORING_SETUP_SQPOLL) {
                sqd = ctx->sq_data;
                if (sqd) {
                        /*
                         * Observe the correct sqd->lock -> ctx->uring_lock
                         * ordering. Fine to drop uring_lock here, we hold
                         * a ref to the ctx.
                         */
                        refcount_inc(&sqd->refs);
                        mutex_unlock(&ctx->uring_lock);
                        mutex_lock(&sqd->lock);
                        mutex_lock(&ctx->uring_lock);
                        if (sqd->thread)
                                tctx = sqd->thread->io_uring;
                }
        } else {
                tctx = current->io_uring;
        }

        BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

        /* remember the non-zero limits so later users inherit them */
        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i])
                        ctx->iowq_limits[i] = new_count[i];
        ctx->iowq_limits_set = true;

        ret = -EINVAL;
        if (tctx && tctx->io_wq) {
                ret = io_wq_max_workers(tctx->io_wq, new_count);
                if (ret)
                        goto err;
        } else {
                /* no io-wq to query: report zeroes back to userspace */
                memset(new_count, 0, sizeof(new_count));
        }

        if (sqd) {
                /*
                 * io_put_sq_data() may take sqd->lock itself. Calling it
                 * with uring_lock held would invert the
                 * sqd->lock -> uring_lock ordering and can deadlock, so
                 * drop uring_lock around the unlock/put and re-take it
                 * before continuing.
                 */
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);
        }

        if (copy_to_user(arg, new_count, sizeof(new_count)))
                return -EFAULT;

        /* that's it for SQPOLL, only the SQPOLL task creates requests */
        if (sqd)
                return 0;

        /* now propagate the restriction to all registered users */
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                struct io_uring_task *node_tctx = node->task->io_uring;

                if (WARN_ON_ONCE(!node_tctx->io_wq))
                        continue;

                for (i = 0; i < ARRAY_SIZE(new_count); i++)
                        new_count[i] = ctx->iowq_limits[i];
                /* ignore errors, it always returns zero anyway */
                (void)io_wq_max_workers(node_tctx->io_wq, new_count);
        }
        return 0;
err:
        if (sqd) {
                /* same lock ordering concern as on the success path */
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);
        }
        return ret;
}

/*
 * Tell whether register opcode @op requires the ring to be quiesced first.
 * The opcodes listed below can run without quiescing; every other opcode
 * must quiesce.
 */
static bool io_register_op_must_quiesce(int op)
{
        bool must_quiesce = true;

        switch (op) {
        case IORING_REGISTER_BUFFERS:
        case IORING_UNREGISTER_BUFFERS:
        case IORING_REGISTER_FILES:
        case IORING_UNREGISTER_FILES:
        case IORING_REGISTER_FILES_UPDATE:
        case IORING_REGISTER_PROBE:
        case IORING_REGISTER_PERSONALITY:
        case IORING_UNREGISTER_PERSONALITY:
        case IORING_REGISTER_FILES2:
        case IORING_REGISTER_FILES_UPDATE2:
        case IORING_REGISTER_BUFFERS2:
        case IORING_REGISTER_BUFFERS_UPDATE:
        case IORING_REGISTER_IOWQ_AFF:
        case IORING_UNREGISTER_IOWQ_AFF:
        case IORING_REGISTER_IOWQ_MAX_WORKERS:
                must_quiesce = false;
                break;
        default:
                break;
        }

        return must_quiesce;
}

/*
 * Kill the ctx percpu ref and wait for the reference completion.
 *
 * Entered with ctx->uring_lock held; the lock is released for the
 * duration of the wait and re-taken before returning. Returns 0 once the
 * completion has fired. If io_run_task_work_sig() reports an error while
 * waiting, the refs are resurrected and that error is returned.
 */
static int io_ctx_quiesce(struct io_ring_ctx *ctx)
{
        long err;

        percpu_ref_kill(&ctx->refs);

        /*
         * The uring mutex must not be held across the drain wait: a thread
         * that is inside io_uring_enter() may need uring_lock to make
         * progress, and holding it here could deadlock. Releasing it is
         * safe because no new references can be taken once the percpu ref
         * has been killed.
         */
        mutex_unlock(&ctx->uring_lock);
        for (;;) {
                err = wait_for_completion_interruptible(&ctx->ref_comp);
                if (err == 0)
                        break;

                /* the wait was interrupted: run task work, stop on error */
                err = io_run_task_work_sig();
                if (err < 0)
                        break;
        }
        mutex_lock(&ctx->uring_lock);

        if (!err)
                return 0;

        io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
        return err;
}

/*
 * Dispatch a single io_uring_register() opcode against @ctx.
 *
 * Returns -ENXIO if the ctx refs are already dying and -EACCES if the ring
 * is restricted and @opcode is not set in restrictions.register_op.
 * Opcodes for which io_register_op_must_quiesce() is true have the ctx
 * quiesced before dispatch and its refs/completion re-initialised after
 * it. Unknown opcodes and rejected @arg/@nr_args combinations give
 * -EINVAL; otherwise the selected handler's return value is passed back.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                               void __user *arg, unsigned nr_args)
        __releases(ctx->uring_lock)
        __acquires(ctx->uring_lock)
{
        int ret;

        /*
         * We're inside the ring mutex, if the ref is already dying, then
         * someone else killed the ctx or is already going through
         * io_uring_register().
         */
        if (percpu_ref_is_dying(&ctx->refs))
                return -ENXIO;

        /* restricted rings may only use opcodes set in register_op */
        if (ctx->restricted) {
                /* clamp the opcode before it is used as a bitmap index */
                opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
                if (!test_bit(opcode, ctx->restrictions.register_op))
                        return -EACCES;
        }

        if (io_register_op_must_quiesce(opcode)) {
                /* io_ctx_quiesce() drops and re-takes uring_lock */
                ret = io_ctx_quiesce(ctx);
                if (ret)
                        return ret;
        }

        switch (opcode) {
        case IORING_REGISTER_BUFFERS:
                ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_BUFFERS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_buffers_unregister(ctx);
                break;
        case IORING_REGISTER_FILES:
                ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_FILES:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_files_unregister(ctx);
                break;
        case IORING_REGISTER_FILES_UPDATE:
                ret = io_register_files_update(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_EVENTFD:
        case IORING_REGISTER_EVENTFD_ASYNC:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg);
                if (ret)
                        break;
                /* record which of the two eventfd opcodes was used */
                if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
                        ctx->eventfd_async = 1;
                else
                        ctx->eventfd_async = 0;
                break;
        case IORING_UNREGISTER_EVENTFD:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_eventfd_unregister(ctx);
                break;
        case IORING_REGISTER_PROBE:
                ret = -EINVAL;
                if (!arg || nr_args > 256)
                        break;
                ret = io_probe(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_personality(ctx);
                break;
        case IORING_UNREGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg)
                        break;
                /*
                 * nr_args is forwarded as the handler's second argument
                 * (presumably the personality id - confirm in the handler)
                 */
                ret = io_unregister_personality(ctx, nr_args);
                break;
        case IORING_REGISTER_ENABLE_RINGS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_enable_rings(ctx);
                break;
        case IORING_REGISTER_RESTRICTIONS:
                ret = io_register_restrictions(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_FILES2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_FILES_UPDATE2:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_BUFFERS2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_BUFFERS_UPDATE:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (!arg || !nr_args)
                        break;
                ret = io_register_iowq_aff(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_unregister_iowq_aff(ctx);
                break;
        case IORING_REGISTER_IOWQ_MAX_WORKERS:
                ret = -EINVAL;
                if (!arg || nr_args != 2)
                        break;
                ret = io_register_iowq_max_workers(ctx, arg);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        /* undo the quiesce done above, whatever the handler returned */
        if (io_register_op_must_quiesce(opcode)) {
                /* bring the ctx back to life */
                percpu_ref_reinit(&ctx->refs);
                reinit_completion(&ctx->ref_comp);
        }
        return ret;
}

/*
 * io_uring_register() system call entry point.
 *
 * Rejects opcodes at or beyond IORING_REGISTER_LAST with -EINVAL, a bad
 * @fd with -EBADF and a file that is not an io_uring instance with
 * -EOPNOTSUPP. Otherwise runs pending task work, then performs the
 * registration under ctx->uring_lock and traces the outcome.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
                void __user *, arg, unsigned int, nr_args)
{
        struct io_ring_ctx *ctx;
        struct fd f;
        long ret;

        if (opcode >= IORING_REGISTER_LAST)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        if (f.file->f_op == &io_uring_fops) {
                ctx = f.file->private_data;

                io_run_task_work();

                mutex_lock(&ctx->uring_lock);
                ret = __io_uring_register(ctx, opcode, arg, nr_args);
                mutex_unlock(&ctx->uring_lock);

                trace_io_uring_register(ctx, opcode, ctx->nr_user_files,
                                        ctx->nr_user_bufs,
                                        ctx->cq_ev_fd != NULL, ret);
        } else {
                /* not an io_uring file descriptor */
                ret = -EOPNOTSUPP;
        }

        fdput(f);
        return ret;
}

/*
 * Init-time setup for io_uring.
 *
 * Verifies at build time that struct io_uring_sqe is 64 bytes and that
 * each listed member sits at the expected offset with the expected size,
 * checks a few related size/limit invariants, and then creates the slab
 * cache used for struct io_kiocb. Always returns 0.
 */
static int __init io_uring_init(void)
{
/* Assert that stype.ename lives at eoffset and has the size of etype. */
#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
        BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
        BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
} while (0)

/* Same check, specialised for members of struct io_uring_sqe. */
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
        __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
        BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
        BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
        BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
        BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
        BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
        BUILD_BUG_SQE_ELEM(8,  __u64,  off);
        BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
        BUILD_BUG_SQE_ELEM(16, __u64,  addr);
        BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
        BUILD_BUG_SQE_ELEM(24, __u32,  len);
        BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
        BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
        BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
        BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
        BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
        BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
        BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
        BUILD_BUG_SQE_ELEM(42, __u16,  personality);
        BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
        BUILD_BUG_SQE_ELEM(44, __u32,  file_index);

        /* the update structs must be the same size / fit in the v2 form */
        BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
                     sizeof(struct io_uring_rsrc_update));
        BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
                     sizeof(struct io_uring_rsrc_update2));

        /* ->buf_index is u16 */
        BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

        /* should fit into one byte */
        BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));

        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
        BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT);
        return 0;
}
__initcall(io_uring_init);





























    1 











    1 



































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_IRQFLAGS_H_
#define _X86_IRQFLAGS_H_

#include <asm/processor-flags.h>

#ifndef __ASSEMBLY__

#include <asm/nospec-branch.h>

/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
#define __cpuidle __section(".cpuidle.text")

/*
 * Interrupt control:
 */

/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
extern inline unsigned long native_save_fl(void);
/*
 * Return the current flags register value, read by pushing it with
 * "pushf" and popping the pushed word into @flags.
 */
extern __always_inline unsigned long native_save_fl(void)
{
        unsigned long flags;

        /*
         * "=rm" is safe here, because "pop" adjusts the stack before
         * it evaluates its effective address -- this is part of the
         * documented behavior of the "pop" instruction.
         */
        asm volatile("# __raw_save_flags\n\t"
                     "pushf ; pop %0"
                     : "=rm" (flags)
                     : /* no input */
                     : "memory");

        return flags;
}

extern inline void native_restore_fl(unsigned long flags);
/*
 * Load @flags into the flags register by pushing the value and executing
 * "popf". Condition codes ("cc") and memory are listed as clobbered.
 */
extern inline void native_restore_fl(unsigned long flags)
{
        asm volatile("push %0 ; popf"
                     : /* no output */
                     :"g" (flags)
                     :"memory", "cc");
}

/* Disable interrupts by executing "cli"; "memory" is listed as clobbered. */
static __always_inline void native_irq_disable(void)
{
        asm volatile("cli": : :"memory");
}

/* Enable interrupts by executing "sti"; "memory" is listed as clobbered. */
static __always_inline void native_irq_enable(void)
{
        asm volatile("sti": : :"memory");
}

/*
 * Call mds_idle_clear_cpu_buffers(), then execute "sti; hlt" as a single
 * asm statement so interrupts are enabled immediately before halting.
 * Placed in the .cpuidle.text section via __cpuidle.
 */
static inline __cpuidle void native_safe_halt(void)
{
        mds_idle_clear_cpu_buffers();
        asm volatile("sti; hlt": : :"memory");
}

/*
 * Call mds_idle_clear_cpu_buffers(), then execute "hlt" without touching
 * the interrupt flag. Placed in the .cpuidle.text section via __cpuidle.
 */
static inline __cpuidle void native_halt(void)
{
        mds_idle_clear_cpu_buffers();
        asm volatile("hlt": : :"memory");
}

#endif

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
#include <linux/types.h>

/* Non-paravirt build: return the flags register via native_save_fl(). */
static __always_inline unsigned long arch_local_save_flags(void)
{
        return native_save_fl();
}

/* Non-paravirt build: write @flags back via native_restore_fl(). */
static __always_inline void arch_local_irq_restore(unsigned long flags)
{
        native_restore_fl(flags);
}

/* Non-paravirt build: disable interrupts via native_irq_disable(). */
static __always_inline void arch_local_irq_disable(void)
{
        native_irq_disable();
}

/* Non-paravirt build: enable interrupts via native_irq_enable(). */
static __always_inline void arch_local_irq_enable(void)
{
        native_irq_enable();
}

/*
 * Used in the idle loop; sti takes one instruction cycle
 * to complete:
 *
 * Non-paravirt build: forwards to native_safe_halt(), which enables
 * interrupts and halts.
 */
static inline __cpuidle void arch_safe_halt(void)
{
        native_safe_halt();
}

/*
 * Used when interrupts are already enabled or to
 * shutdown the processor:
 *
 * Non-paravirt build: forwards to native_halt(), which halts without
 * changing the interrupt flag.
 */
static inline __cpuidle void halt(void)
{
        native_halt();
}

/*
 * For spinlocks, etc: snapshot the flags register, disable interrupts,
 * and hand the snapshot back so the caller can restore it later.
 */
static __always_inline unsigned long arch_local_irq_save(void)
{
        unsigned long saved;

        saved = arch_local_save_flags();
        arch_local_irq_disable();

        return saved;
}
#else

/*
 * Assembler-side helpers (non-paravirt). ENABLE_INTERRUPTS() and
 * DISABLE_INTERRUPTS() expand to bare sti/cli; their argument is unused.
 */
#define ENABLE_INTERRUPTS(x)        sti
#define DISABLE_INTERRUPTS(x)        cli

#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_ENTRY
/* Copy the flags register into %rax by way of the stack. */
#define SAVE_FLAGS(x)                pushfq; popq %rax
#endif

/* 64-bit: interrupt return jumps to native_iret. */
#define INTERRUPT_RETURN        jmp native_iret
/* Return to 64-bit user mode: swapgs, CLEAR_CPU_BUFFERS, then sysretq. */
#define USERGS_SYSRET64                                \
        swapgs;                                        \
        CLEAR_CPU_BUFFERS;                        \
        sysretq;
/* Return to 32-bit user mode: swapgs followed by sysretl. */
#define USERGS_SYSRET32                                \
        swapgs;                                        \
        sysretl

#else
/* 32-bit: interrupt return is a plain iret. */
#define INTERRUPT_RETURN                iret
#endif

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT_XXL */

#ifndef __ASSEMBLY__
/*
 * Report whether a saved @flags value has interrupts disabled, i.e.
 * whether X86_EFLAGS_IF is clear in it. Returns 1 if clear, 0 if set.
 */
static __always_inline int arch_irqs_disabled_flags(unsigned long flags)
{
        return (flags & X86_EFLAGS_IF) == 0;
}

/*
 * Report whether interrupts are disabled right now: read the live flags
 * and test them with arch_irqs_disabled_flags().
 */
static __always_inline int arch_irqs_disabled(void)
{
        return arch_irqs_disabled_flags(arch_local_save_flags());
}
#else
#ifdef CONFIG_X86_64
/*
 * SWAPGS for assembler code. With CONFIG_XEN_PV it is wrapped in
 * ALTERNATIVE with an empty replacement keyed on X86_FEATURE_XENPV;
 * otherwise it is a plain swapgs.
 */
#ifdef CONFIG_XEN_PV
#define SWAPGS        ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
#else
#define SWAPGS        swapgs
#endif
#endif
#endif /* !__ASSEMBLY__ */

#endif































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PROCESSOR_H
#define _ASM_X86_PROCESSOR_H

#include <asm/processor-flags.h>

/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
struct io_bitmap;
struct vm86;

#include <asm/math_emu.h>
#include <asm/segment.h>
#include <asm/types.h>
#include <uapi/asm/sigcontext.h>
#include <asm/current.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
#include <asm/percpu.h>
#include <asm/msr.h>
#include <asm/desc_defs.h>
#include <asm/nops.h>
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
#include <asm/unwind_hints.h>
#include <asm/vmxfeatures.h>
#include <asm/vdso/processor.h>

#include <linux/personality.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/math64.h>
#include <linux/err.h>
#include <linux/irqflags.h>
#include <linux/mem_encrypt.h>

/*
 * We handle most unaligned accesses in hardware.  On the other hand
 * unaligned DMA can be quite expensive on some Nehalem processors.
 *
 * Based on this we disable the IP header alignment in network drivers.
 */
#define NET_IP_ALIGN        0

#define HBP_NUM 4

/*
 * These alignment constraints are for performance in the vSMP case,
 * but in the task_struct case we must also meet hardware imposed
 * alignment requirements of the FPU state:
 */
#ifdef CONFIG_X86_VSMP
# define ARCH_MIN_TASKALIGN                (1 << INTERNODE_CACHE_SHIFT)
# define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
#else
# define ARCH_MIN_TASKALIGN                __alignof__(union fpregs_state)
# define ARCH_MIN_MMSTRUCT_ALIGN        0
#endif

/*
 * Indices into the tlb_lli_* and tlb_lld_* arrays declared below;
 * NR_INFO is the array length.
 */
enum tlb_infos {
        ENTRIES,
        NR_INFO
};

extern u16 __read_mostly tlb_lli_4k[NR_INFO];
extern u16 __read_mostly tlb_lli_2m[NR_INFO];
extern u16 __read_mostly tlb_lli_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_4k[NR_INFO];
extern u16 __read_mostly tlb_lld_2m[NR_INFO];
extern u16 __read_mostly tlb_lld_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_1g[NR_INFO];

/*
 *  CPU type and hardware bug flags. Kept separately for each CPU.
 *  Members of this structure are referenced in head_32.S, so think twice
 *  before touching them. [mj]
 */

/*
 * Per-CPU identification and capability record. boot_cpu_data,
 * new_cpu_data and the per-CPU cpu_info variable are all of this type.
 */
struct cpuinfo_x86 {
        __u8                        x86;                /* CPU family */
        __u8                        x86_vendor;        /* CPU vendor */
        __u8                        x86_model;
        __u8                        x86_stepping;
#ifdef CONFIG_X86_64
        /* Number of 4K pages in DTLB/ITLB combined (in pages): */
        int                        x86_tlbsize;
#endif
#ifdef CONFIG_X86_VMX_FEATURE_NAMES
        __u32                        vmx_capability[NVMXINTS];
#endif
        __u8                        x86_virt_bits;
        __u8                        x86_phys_bits;
        /* CPUID returned core id bits: */
        __u8                        x86_coreid_bits;
        __u8                        cu_id;
        /* Max extended CPUID function supported: */
        __u32                        extended_cpuid_level;
        /* Maximum supported CPUID level, -1=no CPUID: */
        int                        cpuid_level;
        /*
         * Align to size of unsigned long because the x86_capability array
         * is passed to bitops which require the alignment. Use unnamed
         * union to enforce the array is aligned to size of unsigned long.
         */
        union {
                __u32                x86_capability[NCAPINTS + NBUGINTS];
                unsigned long        x86_capability_alignment;
        };
        char                        x86_vendor_id[16];
        char                        x86_model_id[64];
        /* in KB - valid for CPUS which support this call: */
        unsigned int                x86_cache_size;
        int                        x86_cache_alignment;        /* In bytes */
        /* Cache QoS architectural values, valid only on the BSP: */
        int                        x86_cache_max_rmid;        /* max index */
        int                        x86_cache_occ_scale;        /* scale to bytes */
        int                        x86_cache_mbm_width_offset;
        int                        x86_power;
        unsigned long                loops_per_jiffy;
        /* cpuid returned max cores value: */
        u16                        x86_max_cores;
        u16                        apicid;
        u16                        initial_apicid;
        u16                        x86_clflush_size;
        /* number of cores as seen by the OS: */
        u16                        booted_cores;
        /* Physical processor id: */
        u16                        phys_proc_id;
        /* Logical processor id: */
        u16                        logical_proc_id;
        /* Core id: */
        u16                        cpu_core_id;
        u16                        cpu_die_id;
        u16                        logical_die_id;
        /* Index into per_cpu list: */
        u16                        cpu_index;
        u32                        microcode;
        /* Address space bits used by the cache internally */
        u8                        x86_cache_bits;
        unsigned                initialized : 1;
} __randomize_layout;

/* The four 32-bit register values involved in a CPUID invocation. */
struct cpuid_regs {
        u32 eax, ebx, ecx, edx;
};

/*
 * Numeric index for each CPUID register, starting at 0 and following the
 * member order of struct cpuid_regs.
 */
enum cpuid_regs_idx {
        CPUID_EAX = 0,
        CPUID_EBX,
        CPUID_ECX,
        CPUID_EDX,
};

#define X86_VENDOR_INTEL        0
#define X86_VENDOR_CYRIX        1
#define X86_VENDOR_AMD                2
#define X86_VENDOR_UMC                3
#define X86_VENDOR_CENTAUR        5
#define X86_VENDOR_TRANSMETA        7
#define X86_VENDOR_NSC                8
#define X86_VENDOR_HYGON        9
#define X86_VENDOR_ZHAOXIN        10
#define X86_VENDOR_NUM                11

#define X86_VENDOR_UNKNOWN        0xff

/*
 * capabilities of CPUs
 */
extern struct cpuinfo_x86        boot_cpu_data;
extern struct cpuinfo_x86        new_cpu_data;

extern __u32                        cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32                        cpu_caps_set[NCAPINTS + NBUGINTS];

#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu)                per_cpu(cpu_info, cpu)
#else
#define cpu_info                boot_cpu_data
#define cpu_data(cpu)                boot_cpu_data
#endif

extern const struct seq_operations cpuinfo_op;

#define cache_line_size()        (boot_cpu_data.x86_cache_alignment)

extern void cpu_detect(struct cpuinfo_x86 *c);

/*
 * Return a single-bit value whose bit position is the boot CPU's
 * x86_cache_bits, minus one, minus PAGE_SHIFT.
 */
static inline unsigned long long l1tf_pfn_limit(void)
{
        const int shift = boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT;

        return BIT_ULL(shift);
}

extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);

#ifdef CONFIG_X86_32
extern int have_cpuid_p(void);
#else
/* When CONFIG_X86_32 is not set this is a constant: always returns 1. */
static inline int have_cpuid_p(void)
{
        return 1;
}
#endif
/*
 * Execute the CPUID instruction. *@eax and *@ecx are read as inputs; all
 * four pointed-to values are overwritten with the resulting
 * eax/ebx/ecx/edx register contents.
 */
static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
                                unsigned int *ecx, unsigned int *edx)
{
        /* ecx is often an input as well as an output. */
        asm volatile("cpuid"
            : "=a" (*eax),
              "=b" (*ebx),
              "=c" (*ecx),
              "=d" (*edx)
            : "0" (*eax), "2" (*ecx)
            : "memory");
}

/*
 * Generate native_cpuid_<reg>(op): call native_cpuid() with eax = @op and
 * ecx = 0, then return the register named by @reg.
 */
#define native_cpuid_reg(reg)                                        \
static inline unsigned int native_cpuid_##reg(unsigned int op)        \
{                                                                \
        unsigned int eax = op, ebx, ecx = 0, edx;                \
                                                                \
        native_cpuid(&eax, &ebx, &ecx, &edx);                        \
                                                                \
        return reg;                                                \
}

/*
 * Native CPUID functions returning a single datum.
 */
native_cpuid_reg(eax)
native_cpuid_reg(ebx)
native_cpuid_reg(ecx)
native_cpuid_reg(edx)

/*
 * Friendlier CR3 helpers.
 *
 * read_cr3_pa() returns the value of __read_cr3() with everything outside
 * CR3_ADDR_MASK cleared.
 */
static inline unsigned long read_cr3_pa(void)
{
        unsigned long cr3 = __read_cr3();

        return cr3 & CR3_ADDR_MASK;
}

/*
 * Same as read_cr3_pa(), but reads CR3 through __native_read_cr3():
 * returns that value with everything outside CR3_ADDR_MASK cleared.
 */
static inline unsigned long native_read_cr3_pa(void)
{
        unsigned long cr3 = __native_read_cr3();

        return cr3 & CR3_ADDR_MASK;
}

/* Write __sme_pa(@pgdir) to CR3 through write_cr3(). */
static inline void load_cr3(pgd_t *pgdir)
{
        write_cr3(__sme_pa(pgdir));
}

/*
 * Note that while the legacy 'TSS' name comes from 'Task State Segment',
 * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
 * unrelated to the task-switch mechanism:
 */
#ifdef CONFIG_X86_32
/*
 * This is the TSS defined by the hardware.
 *
 * 32-bit variant; the struct is packed. Each 16-bit selector is followed
 * by a second unsigned short (__blh, __ss0h, ...), presumably padding the
 * slot to 32 bits - confirm against the hardware layout.
 */
struct x86_hw_tss {
        unsigned short                back_link, __blh;
        unsigned long                sp0;
        unsigned short                ss0, __ss0h;
        unsigned long                sp1;

        /*
         * We don't use ring 1, so ss1 is a convenient scratch space in
         * the same cacheline as sp0.  We use ss1 to cache the value in
         * MSR_IA32_SYSENTER_CS.  When we context switch
         * MSR_IA32_SYSENTER_CS, we first check if the new value being
         * written matches ss1, and, if it's not, then we wrmsr the new
         * value and update ss1.
         *
         * The only reason we context switch MSR_IA32_SYSENTER_CS is
         * that we set it to zero in vm86 tasks to avoid corrupting the
         * stack if we were to go through the sysenter path from vm86
         * mode.
         */
        unsigned short                ss1;        /* MSR_IA32_SYSENTER_CS */

        unsigned short                __ss1h;
        unsigned long                sp2;
        unsigned short                ss2, __ss2h;
        unsigned long                __cr3;
        unsigned long                ip;
        unsigned long                flags;
        unsigned long                ax;
        unsigned long                cx;
        unsigned long                dx;
        unsigned long                bx;
        unsigned long                sp;
        unsigned long                bp;
        unsigned long                si;
        unsigned long                di;
        unsigned short                es, __esh;
        unsigned short                cs, __csh;
        unsigned short                ss, __ssh;
        unsigned short                ds, __dsh;
        unsigned short                fs, __fsh;
        unsigned short                gs, __gsh;
        unsigned short                ldt, __ldth;
        unsigned short                trace;
        unsigned short                io_bitmap_base;

} __attribute__((packed));
#else
/*
 * Hardware TSS layout used when CONFIG_X86_32 is not set; the struct is
 * packed. sp1 and sp2 are repurposed as described on each member.
 */
struct x86_hw_tss {
        u32                        reserved1;
        u64                        sp0;

        /*
         * We store cpu_current_top_of_stack in sp1 so it's always accessible.
         * Linux does not use ring 1, so sp1 is not otherwise needed.
         */
        u64                        sp1;

        /*
         * Since Linux does not use ring 2, the 'sp2' slot is unused by
         * hardware.  entry_SYSCALL_64 uses it as scratch space to stash
         * the user RSP value.
         */
        u64                        sp2;

        u64                        reserved2;
        u64                        ist[7];
        u32                        reserved3;
        u32                        reserved4;
        u16                        reserved5;
        u16                        io_bitmap_base;

} __attribute__((packed));
#endif

/*
 * IO-bitmap sizes:
 */
#define IO_BITMAP_BITS                        65536
#define IO_BITMAP_BYTES                        (IO_BITMAP_BITS / BITS_PER_BYTE)
#define IO_BITMAP_LONGS                        (IO_BITMAP_BYTES / sizeof(long))

#define IO_BITMAP_OFFSET_VALID_MAP                                \
        (offsetof(struct tss_struct, io_bitmap.bitmap) -        \
         offsetof(struct tss_struct, x86_tss))

#define IO_BITMAP_OFFSET_VALID_ALL                                \
        (offsetof(struct tss_struct, io_bitmap.mapall) -        \
         offsetof(struct tss_struct, x86_tss))

#ifdef CONFIG_X86_IOPL_IOPERM
/*
 * sizeof(unsigned long) coming from an extra "long" at the end of the
 * iobitmap. The limit is inclusive, i.e. the last valid byte.
 */
# define __KERNEL_TSS_LIMIT        \
        (IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \
         sizeof(unsigned long) - 1)
#else
# define __KERNEL_TSS_LIMIT        \
        (offsetof(struct tss_struct, x86_tss) + sizeof(struct x86_hw_tss) - 1)
#endif

/* Base offset outside of TSS_LIMIT so unprivileged IO causes #GP */
#define IO_BITMAP_OFFSET_INVALID        (__KERNEL_TSS_LIMIT + 1)

/* An entry stack: a byte array of exactly PAGE_SIZE bytes. */
struct entry_stack {
        char        stack[PAGE_SIZE];
};

/* A struct entry_stack wrapped so that it is aligned to PAGE_SIZE. */
struct entry_stack_page {
        struct entry_stack stack;
} __aligned(PAGE_SIZE);

/*
 * All IO bitmap related data stored in the TSS:
 *
 * Both bitmap[] and mapall[] hold IO_BITMAP_LONGS + 1 longs, i.e. one
 * extra long beyond the IO_BITMAP_BITS worth of permission bits.
 */
struct x86_io_bitmap {
        /* The sequence number of the last active bitmap. */
        u64                        prev_sequence;

        /*
         * Store the dirty size of the last io bitmap offender. The next
         * one will have to do the cleanup as the switch out to a non io
         * bitmap user will just set x86_tss.io_bitmap_base to a value
         * outside of the TSS limit. So for sane tasks there is no need to
         * actually touch the io_bitmap at all.
         */
        unsigned int                prev_max;

        /*
         * The extra 1 is there because the CPU will access an
         * additional byte beyond the end of the IO permission
         * bitmap. The extra byte must be all 1 bits, and must
         * be within the limit.
         */
        unsigned long                bitmap[IO_BITMAP_LONGS + 1];

        /*
         * Special I/O bitmap to emulate IOPL(3). All bytes zero,
         * except the additional byte at the end.
         */
        unsigned long                mapall[IO_BITMAP_LONGS + 1];
};

/*
 * Page-aligned TSS container: the hardware-defined part (x86_tss) followed
 * by the I/O bitmap data. The IO_BITMAP_OFFSET_* macros are computed
 * relative to the x86_tss member of this structure.
 */
struct tss_struct {
        /*
         * The fixed hardware portion.  This must not cross a page boundary
         * at risk of violating the SDM's advice and potentially triggering
         * errata.
         */
        struct x86_hw_tss        x86_tss;

        /* Software bookkeeping plus the I/O permission bitmaps. */
        struct x86_io_bitmap        io_bitmap;
} __aligned(PAGE_SIZE);

DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);

/*
 * Per CPU interrupt stacks: IRQ_STACK_SIZE bytes of storage, aligned to
 * IRQ_STACK_SIZE.
 */
struct irq_stack {
        char                stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);

DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);

#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif

#ifdef CONFIG_X86_64
/*
 * Fixed-layout per-CPU data: 40 bytes of padding (gs_base) so that
 * stack_canary lands at offset 40, followed by the canary itself.
 */
struct fixed_percpu_data {
        /*
         * GCC hardcodes the stack canary as %gs:40.  Since the
         * irq_stack is the object at %gs:0, we reserve the bottom
         * 48 bytes of the irq stack for the canary.
         *
         * Once we are willing to require -mstack-protector-guard-symbol=
         * support for x86_64 stackprotector, we can get rid of this.
         */
        char                gs_base[40];
        unsigned long        stack_canary;
};

DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible;
DECLARE_INIT_PER_CPU(fixed_percpu_data);

/*
 * Return, as an integer, the address of the gs_base member of @cpu's
 * fixed_percpu_data instance.
 */
static inline unsigned long cpu_kernelmode_gs_base(int cpu)
{
        unsigned long base;

        base = (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);

        return base;
}

DECLARE_PER_CPU(unsigned int, irq_count);
extern asmlinkage void ignore_sysret(void);

/* Save actual FS/GS selectors and bases to current->thread */
void current_save_fsgs(void);
#else        /* X86_64 */
#ifdef CONFIG_STACKPROTECTOR
DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
#endif
/* Per CPU softirq stack pointer */
DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
#endif        /* X86_64 */

extern unsigned int fpu_kernel_xstate_size;
extern unsigned int fpu_user_xstate_size;

struct perf_event;

/*
 * Per-thread architecture state: cached TLS descriptors, saved stack
 * pointer and segment state, ptrace/debug register bookkeeping, fault
 * information, I/O permission state and, last, the FPU state.
 */
struct thread_struct {
        /* Cached TLS descriptors: */
        struct desc_struct        tls_array[GDT_ENTRY_TLS_ENTRIES];
#ifdef CONFIG_X86_32
        unsigned long                sp0;
#endif
        unsigned long                sp;
#ifdef CONFIG_X86_32
        unsigned long                sysenter_cs;
#else
        unsigned short                es;
        unsigned short                ds;
        unsigned short                fsindex;
        unsigned short                gsindex;
#endif

#ifdef CONFIG_X86_64
        unsigned long                fsbase;
        unsigned long                gsbase;
#else
        /*
         * XXX: this could presumably be unsigned short.  Alternatively,
         * 32-bit kernels could be taught to use fsindex instead.
         */
        unsigned long fs;
        unsigned long gs;
#endif

        /* Save middle states of ptrace breakpoints */
        struct perf_event        *ptrace_bps[HBP_NUM];
        /* Debug status used for traps, single steps, etc... */
        unsigned long           virtual_dr6;
        /* Keep track of the exact dr7 value set by the user */
        unsigned long           ptrace_dr7;
        /* Fault info: */
        unsigned long                cr2;
        unsigned long                trap_nr;
        unsigned long                error_code;
#ifdef CONFIG_VM86
        /* Virtual 86 mode info */
        struct vm86                *vm86;
#endif
        /* IO permissions: */
        struct io_bitmap        *io_bitmap;

        /*
         * IOPL. Privilege level dependent I/O permission which is
         * emulated via the I/O bitmap to prevent user space from disabling
         * interrupts.
         */
        unsigned long                iopl_emul;

        /* Single-bit flags. */
        unsigned int                iopl_warn:1;
        unsigned int                sig_on_uaccess_err:1;

        /* Floating point and extended processor state */
        struct fpu                fpu;
        /*
         * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at
         * the end.
         */
};

/*
 * Whitelist the FPU state from the task_struct for hardened usercopy.
 *
 * @offset: receives the offset of fpu.state inside struct thread_struct
 * @size:   receives fpu_kernel_xstate_size
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                unsigned long *size)
{
        *offset = offsetof(struct thread_struct, fpu.state);
        *size = fpu_kernel_xstate_size;
}

/* Store @sp0 into the sp0 slot of this CPU's cpu_tss_rw.x86_tss. */
static inline void native_load_sp0(unsigned long sp0)
{
        this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}

/*
 * Issue the SWAPGS instruction on CONFIG_X86_64 builds; on other builds
 * the function body is empty. The "memory" clobber keeps the compiler from
 * moving memory accesses across the instruction.
 */
static __always_inline void native_swapgs(void)
{
#ifdef CONFIG_X86_64
        asm volatile("swapgs" ::: "memory");
#endif
}

/* Return this CPU's cpu_current_top_of_stack value. */
static inline unsigned long current_top_of_stack(void)
{
        unsigned long top;

        /*
         *  tss.sp0 cannot be read directly here: on x86_32 it is special in
         *  and around vm86 mode, and on x86_64 it is special because of the
         *  entry trampoline.
         */
        top = this_cpu_read_stable(cpu_current_top_of_stack);

        return top;
}

/*
 * Return true when the current stack pointer lies less than THREAD_SIZE
 * bytes below current_top_of_stack(). The subtraction is unsigned, so a
 * stack pointer above the top wraps to a large value and yields false.
 */
static inline bool on_thread_stack(void)
{
        unsigned long depth;

        depth = (unsigned long)(current_top_of_stack() -
                                current_stack_pointer);

        return depth < THREAD_SIZE;
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define __cpuid                        native_cpuid

/*
 * Non-paravirt (!CONFIG_PARAVIRT_XXL) variant of load_sp0(): forwards
 * directly to native_load_sp0().
 */
static inline void load_sp0(unsigned long sp0)
{
        native_load_sp0(sp0);
}

#endif /* CONFIG_PARAVIRT_XXL */

/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);

unsigned long get_wchan(struct task_struct *p);

/*
 * Generic CPUID function
 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
 * resulting in stale register contents being returned.
 *
 * @op:  value loaded into *eax before the query
 * @eax, @ebx, @ecx, @edx: in/out register values handed to __cpuid();
 *       *eax is preset to @op and *ecx to 0, *ebx and *edx are passed
 *       through as-is.
 */
static inline void cpuid(unsigned int op,
                         unsigned int *eax, unsigned int *ebx,
                         unsigned int *ecx, unsigned int *edx)
{
        *eax = op;
        *ecx = 0;
        __cpuid(eax, ebx, ecx, edx);
}

/*
 * Some CPUID calls want 'count' to be placed in ecx.
 *
 * Same as cpuid(), except that *ecx is preset to @count instead of 0
 * before __cpuid() is invoked.
 */
static inline void cpuid_count(unsigned int op, int count,
                               unsigned int *eax, unsigned int *ebx,
                               unsigned int *ecx, unsigned int *edx)
{
        *eax = op;
        *ecx = count;
        __cpuid(eax, ebx, ecx, edx);
}

/*
 * CPUID functions returning a single datum
 *
 * Each helper runs cpuid() for @op and hands back exactly one of the four
 * result registers; the other three are discarded.
 */
static inline unsigned int cpuid_eax(unsigned int op)
{
        unsigned int a, b, c, d;

        cpuid(op, &a, &b, &c, &d);

        return a;
}

static inline unsigned int cpuid_ebx(unsigned int op)
{
        unsigned int a, b, c, d;

        cpuid(op, &a, &b, &c, &d);

        return b;
}

static inline unsigned int cpuid_ecx(unsigned int op)
{
        unsigned int a, b, c, d;

        cpuid(op, &a, &b, &c, &d);

        return c;
}

static inline unsigned int cpuid_edx(unsigned int op)
{
        unsigned int a, b, c, d;

        cpuid(op, &a, &b, &c, &d);

        return d;
}

extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void amd_e400_c1e_apic_setup(void);

extern unsigned long                boot_option_idle_override;

enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
                         IDLE_POLL};

extern void enable_sep_cpu(void);
extern int sysenter_setup(void);


/* Defined in head.S */
extern struct desc_ptr                early_gdt_descr;

extern void switch_to_new_gdt(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
extern void cpu_init_secondary(void);
extern void cpu_init_exception_handling(void);
extern void cr4_init(void);

/*
 * Read MSR_IA32_DEBUGCTLMSR and return its value.
 *
 * When CONFIG_X86_DEBUGCTLMSR is not set, CPUs whose boot_cpu_data.x86
 * is below 6 are not queried and 0 is returned instead.
 */
static inline unsigned long get_debugctlmsr(void)
{
        unsigned long val = 0;

#ifndef CONFIG_X86_DEBUGCTLMSR
        if (boot_cpu_data.x86 < 6)
                return 0;
#endif
        rdmsrl(MSR_IA32_DEBUGCTLMSR, val);

        return val;
}

/*
 * Write @debugctlmsr to MSR_IA32_DEBUGCTLMSR.
 *
 * When CONFIG_X86_DEBUGCTLMSR is not set, the write is skipped on CPUs
 * whose boot_cpu_data.x86 is below 6.
 */
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
        /* Mirror the guard used by get_debugctlmsr(). */
        if (boot_cpu_data.x86 < 6)
                return;
#endif
        wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
}

extern void set_task_blockstep(struct task_struct *task, bool on);

/* Boot loader type from the setup header: */
extern int                        bootloader_type;
extern int                        bootloader_version;

extern char                        ignore_fpu_irq;

#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
#define ARCH_HAS_PREFETCHW
#define ARCH_HAS_SPINLOCK_PREFETCH

/*
 * BASE_PREFETCH is the first instruction string handed to
 * alternative_input() by prefetch() and prefetchw(): empty on 32-bit
 * builds, "prefetcht0" on 64-bit builds. ARCH_HAS_PREFETCH is only
 * defined on 32-bit builds.
 */
#ifdef CONFIG_X86_32
# define BASE_PREFETCH                ""
# define ARCH_HAS_PREFETCH
#else
# define BASE_PREFETCH                "prefetcht0 %P1"
#endif

/*
 * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
 *
 * It is not worth caring about 3dnow prefetches for the K6
 * because they are microcoded there and very slow.
 *
 * @x: address to prefetch; passed to the instruction as an "m" operand.
 *
 * Presumably the alternatives mechanism substitutes "prefetchnta" for
 * BASE_PREFETCH when X86_FEATURE_XMM is available - confirm against
 * alternative_input().
 */
static inline void prefetch(const void *x)
{
        alternative_input(BASE_PREFETCH, "prefetchnta %P1",
                          X86_FEATURE_XMM,
                          "m" (*(const char *)x));
}

/*
 * 3dnow prefetch to get an exclusive cache line.
 * Useful for spinlocks to avoid one state transition in the
 * cache coherency protocol:
 *
 * @x: address to prefetch; passed to the instruction as an "m" operand.
 *
 * Presumably "prefetchw" replaces BASE_PREFETCH when
 * X86_FEATURE_3DNOWPREFETCH is available - confirm against
 * alternative_input().
 */
static __always_inline void prefetchw(const void *x)
{
        alternative_input(BASE_PREFETCH, "prefetchw %P1",
                          X86_FEATURE_3DNOWPREFETCH,
                          "m" (*(const char *)x));
}

/* Prefetch the lock at @x for writing; thin wrapper around prefetchw(). */
static inline void spin_lock_prefetch(const void *x)
{
        prefetchw(x);
}

/*
 * Address just past init_stack, minus TOP_OF_KERNEL_STACK_PADDING.
 */
#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
                           TOP_OF_KERNEL_STACK_PADDING)

/* Address immediately after @task's pt_regs, i.e. task_pt_regs(task) + 1. */
#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))

/*
 * Pointer to @task's struct pt_regs: the last pt_regs-sized slot below
 * task_stack_page(task) + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING.
 */
#define task_pt_regs(task) \
({                                                                        \
        unsigned long __ptr = (unsigned long)task_stack_page(task);        \
        __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;                \
        ((struct pt_regs *)__ptr) - 1;                                        \
})

#ifdef CONFIG_X86_32
/* 32-bit initializer for struct thread_struct: sets sp0 and sysenter_cs. */
#define INIT_THREAD  {                                                          \
        .sp0                        = TOP_OF_INIT_STACK,                          \
        .sysenter_cs                = __KERNEL_CS,                                  \
}

/* On 32-bit, KSTK_ESP reads the sp field of the task's pt_regs. */
#define KSTK_ESP(task)                (task_pt_regs(task)->sp)

#else
/* 64-bit initializer for struct thread_struct: no fields are set explicitly. */
#define INIT_THREAD { }

/* On 64-bit, KSTK_ESP is an out-of-line function. */
extern unsigned long KSTK_ESP(struct task_struct *task);

#endif /* CONFIG_X86_32 */

extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
                                               unsigned long new_sp);

/*
 * This decides where the kernel will search for a free chunk of vm
 * space during mmap's.
 *
 * __TASK_UNMAPPED_BASE(task_size) is one third of @task_size passed
 * through PAGE_ALIGN(). The argument is parenthesized so that an
 * expression argument such as "a + b" is divided as a whole rather than
 * only its last term.
 */
#define __TASK_UNMAPPED_BASE(task_size)        (PAGE_ALIGN((task_size) / 3))
#define TASK_UNMAPPED_BASE                __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)

#define KSTK_EIP(task)                (task_pt_regs(task)->ip)

/* Get/set a process' ability to use the timestamp counter instruction */
#define GET_TSC_CTL(adr)        get_tsc_mode((adr))
#define SET_TSC_CTL(val)        set_tsc_mode((val))

extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);

DECLARE_PER_CPU(u64, msr_misc_features_shadow);

/*
 * AMD-specific helpers. With CONFIG_CPU_SUP_AMD they are implemented
 * out of line; otherwise inline stubs return 0 / false or do nothing so
 * callers need no #ifdef.
 */
#ifdef CONFIG_CPU_SUP_AMD
extern u16 amd_get_nb_id(int cpu);
extern u32 amd_get_nodes_per_socket(void);
extern bool cpu_has_ibpb_brtype_microcode(void);
extern void amd_clear_divider(void);
#else
static inline u16 amd_get_nb_id(int cpu)                { return 0; }
static inline u32 amd_get_nodes_per_socket(void)        { return 0; }
static inline bool cpu_has_ibpb_brtype_microcode(void)        { return false; }
static inline void amd_clear_divider(void)                { }
#endif

/*
 * Scan CPUID leaves 0x40000000 .. 0x4000ff00 in steps of 0x100 for a
 * hypervisor signature.
 *
 * @sig:    12-byte signature compared against the ebx/ecx/edx outputs
 * @leaves: if non-zero, additionally require (eax - base) >= @leaves
 *
 * Returns the first matching base leaf, or 0 if none matches.
 */
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
        uint32_t leaf, eax, regs[3];

        for (leaf = 0x40000000; leaf < 0x40010000; leaf += 0x100) {
                cpuid(leaf, &eax, &regs[0], &regs[1], &regs[2]);

                /* Signature mismatch: try the next candidate base. */
                if (memcmp(sig, regs, 12) != 0)
                        continue;

                /* Caller asked for a minimum leaf range that is not met. */
                if (leaves != 0 && (eax - leaf) < leaves)
                        continue;

                return leaf;
        }

        return 0;
}

extern unsigned long arch_align_stack(unsigned long sp);
void free_init_pages(const char *what, unsigned long begin, unsigned long end);
extern void free_kernel_image_pages(const char *what, void *begin, void *end);

void default_idle(void);
/*
 * With CONFIG_XEN, xen_set_default_idle() is a real function. Without it
 * the bare name expands to the constant 0.
 * NOTE(review): in the !CONFIG_XEN case a call written with parentheses
 * would expand to "0()" - confirm no such caller is built without Xen.
 */
#ifdef        CONFIG_XEN
bool xen_set_default_idle(void);
#else
#define xen_set_default_idle 0
#endif

void __noreturn stop_this_cpu(void *dummy);
void microcode_check(struct cpuinfo_x86 *prev_info);
void store_cpu_caps(struct cpuinfo_x86 *info);

/*
 * Selectable L1TF mitigation modes; the active mode is exported through
 * the l1tf_mitigation variable declared below.
 */
enum l1tf_mitigations {
        L1TF_MITIGATION_OFF,
        L1TF_MITIGATION_FLUSH_NOWARN,
        L1TF_MITIGATION_FLUSH,
        L1TF_MITIGATION_FLUSH_NOSMT,
        L1TF_MITIGATION_FULL,
        L1TF_MITIGATION_FULL_FORCE
};

extern enum l1tf_mitigations l1tf_mitigation;

/* Selectable MDS mitigation modes. */
enum mds_mitigations {
        MDS_MITIGATION_OFF,
        MDS_MITIGATION_FULL,
        MDS_MITIGATION_VMWERV,
};

extern bool gds_ucode_mitigated(void);

#endif /* _ASM_X86_PROCESSOR_H */













































































































































































































































































































































































































































    1 












    1 




    1 












    1 














    1 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_DEFS_H
#define _ASM_X86_PGTABLE_DEFS_H

#include <linux/const.h>
#include <linux/mem_encrypt.h>

#include <asm/page_types.h>

#define FIRST_USER_ADDRESS        0UL

/*
 * Bit positions of the page-table entry flags. The _PAGE_* masks further
 * down are built by shifting 1 left by these positions. Note that PSE and
 * PAT share bit 7; which one applies depends on the page size.
 */
#define _PAGE_BIT_PRESENT        0        /* is present */
#define _PAGE_BIT_RW                1        /* writeable */
#define _PAGE_BIT_USER                2        /* userspace addressable */
#define _PAGE_BIT_PWT                3        /* page write through */
#define _PAGE_BIT_PCD                4        /* page cache disabled */
#define _PAGE_BIT_ACCESSED        5        /* was accessed (raised by CPU) */
#define _PAGE_BIT_DIRTY                6        /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE                7        /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT                7        /* on 4KB pages */
#define _PAGE_BIT_GLOBAL        8        /* Global TLB entry PPro+ */
#define _PAGE_BIT_SOFTW1        9        /* available for programmer */
#define _PAGE_BIT_SOFTW2        10        /* " */
#define _PAGE_BIT_SOFTW3        11        /* " */
#define _PAGE_BIT_PAT_LARGE        12        /* On 2MB or 1GB pages */
#define _PAGE_BIT_SOFTW4        58        /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0        59        /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1        60        /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2        61        /* Protection Keys, bit 3/4 */
#define _PAGE_BIT_PKEY_BIT3        62        /* Protection Keys, bit 4/4 */
#define _PAGE_BIT_NX                63        /* No execute: only valid after cpuid check */

/*
 * Kernel-defined uses of the software-available bits. SPECIAL and
 * CPA_TEST both map onto SOFTW1.
 */
#define _PAGE_BIT_SPECIAL        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_UFFD_WP        _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
#define _PAGE_BIT_SOFT_DIRTY        _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP        _PAGE_BIT_SOFTW4

/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE        _PAGE_BIT_GLOBAL

/* Single-bit masks of type pteval_t, one per _PAGE_BIT_* position. */
#define _PAGE_PRESENT        (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW        (_AT(pteval_t, 1) << _PAGE_BIT_RW)
#define _PAGE_USER        (_AT(pteval_t, 1) << _PAGE_BIT_USER)
#define _PAGE_PWT        (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
#define _PAGE_PCD        (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
#define _PAGE_ACCESSED        (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE        (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL        (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_SOFTW3        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
#define _PAGE_PAT        (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL        (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST        (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
/* The protection-key masks collapse to 0 when pkeys are not configured. */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
#else
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 0))
#endif

/* All four protection-key bits combined. */
#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
                         _PAGE_PKEY_BIT1 | \
                         _PAGE_PKEY_BIT2 | \
                         _PAGE_PKEY_BIT3)

/* Dirty + accessed on 64-bit and PAE builds, 0 otherwise. */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif

/* Soft-dirty mask for present entries; 0 without CONFIG_MEM_SOFT_DIRTY. */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

/*
 * Tracking the soft dirty bit when a page goes to swap is tricky.
 * We need a bit which can be stored in the pte _and_ does not conflict
 * with the swap entry format. On x86 bits 1-4 are *not* involved
 * in swap entry computation, but bit 7 is used for thp migration,
 * so we borrow bit 1 for soft dirty tracking.
 *
 * Please note that this bit must be treated as the swap dirty page
 * mark if and only if the PTE/PMD has the present bit clear!
 */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SWP_SOFT_DIRTY        _PAGE_RW
#else
#define _PAGE_SWP_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

/*
 * userfaultfd write-protect masks: SOFTW2 for present entries and the
 * USER bit for swap entries; both 0 without arch support.
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define _PAGE_UFFD_WP                (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
#define _PAGE_SWP_UFFD_WP        _PAGE_USER
#else
#define _PAGE_UFFD_WP                (_AT(pteval_t, 0))
#define _PAGE_SWP_UFFD_WP        (_AT(pteval_t, 0))
#endif

/*
 * _PAGE_NX and _PAGE_DEVMAP use bits 63 and 58 and therefore only exist
 * on 64-bit and PAE builds; elsewhere they evaluate to 0.
 */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX        (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#define _PAGE_DEVMAP        (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
#else
#define _PAGE_NX        (_AT(pteval_t, 0))
#define _PAGE_DEVMAP        (_AT(pteval_t, 0))
#endif

/* Same bit as _PAGE_GLOBAL; see _PAGE_BIT_PROTNONE. */
#define _PAGE_PROTNONE        (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)

/*
 * Set of bits not changed in pte_modify.  The pte's
 * protection key is treated like _PAGE_RW, for
 * instance, and is *not* included in this mask since
 * pte_modify() does modify it.
 *
 * _COMMON_PAGE_CHG_MASK holds the bits shared by both variants;
 * _PAGE_CHG_MASK adds _PAGE_PAT, _HPAGE_CHG_MASK adds _PAGE_PSE and
 * _PAGE_PAT_LARGE.
 */
#define _COMMON_PAGE_CHG_MASK        (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |               \
                                 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |\
                                 _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \
                                 _PAGE_UFFD_WP)
#define _PAGE_CHG_MASK        (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)

/*
 * The cache modes defined here are used to translate between pure SW usage
 * and the HW defined cache mode bits and/or PAT entries.
 *
 * The resulting bits for PWT, PCD and PAT should be chosen in a way
 * to have the WB mode at index 0 (all bits clear). This is the default
 * right now and likely would break too much if changed.
 *
 * _PAGE_CACHE_MODE_NUM (8) is not a mode itself; six modes (0-5) are
 * named, leaving values 6 and 7 unnamed.
 */
#ifndef __ASSEMBLY__
enum page_cache_mode {
        _PAGE_CACHE_MODE_WB       = 0,
        _PAGE_CACHE_MODE_WC       = 1,
        _PAGE_CACHE_MODE_UC_MINUS = 2,
        _PAGE_CACHE_MODE_UC       = 3,
        _PAGE_CACHE_MODE_WT       = 4,
        _PAGE_CACHE_MODE_WP       = 5,

        _PAGE_CACHE_MODE_NUM      = 8
};
#endif

/* Memory-encryption bit, taken from the run-time sme_me_mask value. */
#define _PAGE_ENC                (_AT(pteval_t, sme_me_mask))

/* Cache-control bits for 4KB entries and for large-page entries. */
#define _PAGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
#define _PAGE_LARGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)

/* Protection bits for the UC and WP cache modes, looked up at run time. */
#define _PAGE_NOCACHE                (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP                (cachemode2protval(_PAGE_CACHE_MODE_WP))

/* Four-character shorthands used to lay out the protection tables below. */
#define __PP _PAGE_PRESENT
#define __RW _PAGE_RW
#define _USR _PAGE_USER
#define ___A _PAGE_ACCESSED
#define ___D _PAGE_DIRTY
#define ___G _PAGE_GLOBAL
#define __NX _PAGE_NX

#define _ENC _PAGE_ENC
#define __WP _PAGE_CACHE_WP
#define __NC _PAGE_NOCACHE
#define _PSE _PAGE_PSE

/* pgprot_t accessors/constructors; __pg() is a short alias of __pgprot(). */
#define pgprot_val(x)                ((x).pgprot)
#define __pgprot(x)                ((pgprot_t) { (x) } )
#define __pg(x)                        __pgprot(x)

/* Identical repeat of the _PAGE_PAT_LARGE definition earlier in this file. */
#define _PAGE_PAT_LARGE                (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)

/*
 * Protection value tables. Each column holds one flag, or 0 when the flag
 * is absent, in the order:
 *   present | writable | user | accessed | no-exec | dirty | PSE | global
 * optionally followed by an encryption (_ENC) or cache-mode (__NC, __WP)
 * term. The PAGE_* entries are pgprot_t values; the __PAGE_KERNEL* and
 * *_TABLE* entries are raw flag values.
 */
#define PAGE_NONE             __pg(   0|   0|   0|___A|   0|   0|   0|___G)
#define PAGE_SHARED             __pg(__PP|__RW|_USR|___A|__NX|   0|   0|   0)
#define PAGE_SHARED_EXEC     __pg(__PP|__RW|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY_NOEXEC     __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_COPY_EXEC             __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY             __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY             __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY_EXEC   __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)

#define __PAGE_KERNEL                 (__PP|__RW|   0|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_EXEC         (__PP|__RW|   0|___A|   0|___D|   0|___G)
#define _KERNPG_TABLE_NOENC         (__PP|__RW|   0|___A|   0|___D|   0|   0)
#define _KERNPG_TABLE                 (__PP|__RW|   0|___A|   0|___D|   0|   0| _ENC)
#define _PAGE_TABLE_NOENC         (__PP|__RW|_USR|___A|   0|___D|   0|   0)
#define _PAGE_TABLE                 (__PP|__RW|_USR|___A|   0|___D|   0|   0| _ENC)
#define __PAGE_KERNEL_RO         (__PP|   0|   0|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_ROX         (__PP|   0|   0|___A|   0|___D|   0|___G)
#define __PAGE_KERNEL_NOCACHE         (__PP|__RW|   0|___A|__NX|___D|   0|___G| __NC)
#define __PAGE_KERNEL_VVAR         (__PP|   0|_USR|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_LARGE         (__PP|__RW|   0|___A|__NX|___D|_PSE|___G)
#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW|   0|___A|   0|___D|_PSE|___G)
#define __PAGE_KERNEL_WP         (__PP|__RW|   0|___A|__NX|___D|   0|___G| __WP)


/* The I/O variants are plain aliases of the regular kernel protections. */
#define __PAGE_KERNEL_IO                __PAGE_KERNEL
#define __PAGE_KERNEL_IO_NOCACHE        __PAGE_KERNEL_NOCACHE


#ifndef __ASSEMBLY__

/* Raw kernel protections with (ENC) and without (NOENC) the encryption bit. */
#define __PAGE_KERNEL_ENC        (__PAGE_KERNEL    | _ENC)
#define __PAGE_KERNEL_ENC_WP        (__PAGE_KERNEL_WP | _ENC)
#define __PAGE_KERNEL_NOENC        (__PAGE_KERNEL    |    0)
#define __PAGE_KERNEL_NOENC_WP        (__PAGE_KERNEL_WP |    0)

/* Build a pgprot_t from @x after masking with __default_kernel_pte_mask. */
#define __pgprot_mask(x)        __pgprot((x) & __default_kernel_pte_mask)

/*
 * pgprot_t versions of the kernel protections. All include _ENC except
 * the *_NOENC variants and the two I/O variants below.
 */
#define PAGE_KERNEL                __pgprot_mask(__PAGE_KERNEL            | _ENC)
#define PAGE_KERNEL_NOENC        __pgprot_mask(__PAGE_KERNEL            |    0)
#define PAGE_KERNEL_RO                __pgprot_mask(__PAGE_KERNEL_RO         | _ENC)
#define PAGE_KERNEL_EXEC        __pgprot_mask(__PAGE_KERNEL_EXEC       | _ENC)
#define PAGE_KERNEL_EXEC_NOENC        __pgprot_mask(__PAGE_KERNEL_EXEC       |    0)
#define PAGE_KERNEL_ROX                __pgprot_mask(__PAGE_KERNEL_ROX        | _ENC)
#define PAGE_KERNEL_NOCACHE        __pgprot_mask(__PAGE_KERNEL_NOCACHE    | _ENC)
#define PAGE_KERNEL_LARGE        __pgprot_mask(__PAGE_KERNEL_LARGE      | _ENC)
#define PAGE_KERNEL_LARGE_EXEC        __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
#define PAGE_KERNEL_VVAR        __pgprot_mask(__PAGE_KERNEL_VVAR       | _ENC)

#define PAGE_KERNEL_IO                __pgprot_mask(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE        __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE)

#endif        /* __ASSEMBLY__ */

/*
 * Protection lookup tables indexed by the three-bit suffix "xwr"
 * (execute, write, read).
 * NOTE(review): __P* presumably serves private mappings (write maps to
 * PAGE_COPY*) and __S* shared mappings (write maps to PAGE_SHARED*) -
 * confirm against the users of these tables.
 */
/*         xwr */
#define __P000        PAGE_NONE
#define __P001        PAGE_READONLY
#define __P010        PAGE_COPY
#define __P011        PAGE_COPY
#define __P100        PAGE_READONLY_EXEC
#define __P101        PAGE_READONLY_EXEC
#define __P110        PAGE_COPY_EXEC
#define __P111        PAGE_COPY_EXEC

#define __S000        PAGE_NONE
#define __S001        PAGE_READONLY
#define __S010        PAGE_SHARED
#define __S011        PAGE_SHARED
#define __S100        PAGE_READONLY_EXEC
#define __S101        PAGE_READONLY_EXEC
#define __S110        PAGE_SHARED_EXEC
#define __S111        PAGE_SHARED_EXEC

/*
 * early identity mapping  pte attrib macros.
 *
 * 64-bit reuses __PAGE_KERNEL_LARGE_EXEC; 32-bit uses the literal
 * attribute values below.
 */
#ifdef CONFIG_X86_64
#define __PAGE_KERNEL_IDENT_LARGE_EXEC        __PAGE_KERNEL_LARGE_EXEC
#else
#define PTE_IDENT_ATTR         0x003                /* PRESENT+RW */
#define PDE_IDENT_ATTR         0x063                /* PRESENT+RW+DIRTY+ACCESSED */
#define PGD_IDENT_ATTR         0x001                /* PRESENT (no other attributes) */
#endif

#ifdef CONFIG_X86_32
# include <asm/pgtable_32_types.h>
#else
# include <asm/pgtable_64_types.h>
#endif

#ifndef __ASSEMBLY__

#include <linux/types.h>

/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK                ((pteval_t)PHYSICAL_PAGE_MASK)

/*
 *  Extracts the flags from a (pte|pmd|pud|pgd)val_t
 *  This includes the protection key value.
 *  Defined as the bitwise complement of PTE_PFN_MASK.
 */
#define PTE_FLAGS_MASK                (~PTE_PFN_MASK)

/* Wrapper struct so protection values are a distinct type. */
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;

/* Wrapper struct so top-level (PGD) entries are a distinct type. */
typedef struct { pgdval_t pgd; } pgd_t;

/* Return @prot with the _PAGE_NX bit OR-ed in. */
static inline pgprot_t pgprot_nx(pgprot_t prot)
{
        pgprotval_t bits = pgprot_val(prot) | _PAGE_NX;

        return __pgprot(bits);
}
#define pgprot_nx pgprot_nx

#ifdef CONFIG_X86_PAE

/*
 * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
 * use it here.
 */

/* PAGE_MASK as a signed long, then limited to the physical address width. */
#define PGD_PAE_PAGE_MASK        ((signed long)PAGE_MASK)
#define PGD_PAE_PHYS_MASK        (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)

/*
 * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
 * All other bits are Reserved MBZ
 */
#define PGD_ALLOWED_BITS        (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
                                 _PAGE_PWT | _PAGE_PCD | \
                                 _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)

#else
/* No need to mask any bits for !PAE */
#define PGD_ALLOWED_BITS        (~0ULL)
#endif

/* Build a pgd_t from @val, keeping only the bits in PGD_ALLOWED_BITS. */
static inline pgd_t native_make_pgd(pgdval_t val)
{
        pgd_t pgd = { val & PGD_ALLOWED_BITS };

        return pgd;
}

/* Return the raw value of @pgd, restricted to PGD_ALLOWED_BITS. */
static inline pgdval_t native_pgd_val(pgd_t pgd)
{
        pgdval_t raw = pgd.pgd;

        return raw & PGD_ALLOWED_BITS;
}

/* Return only the flag bits (PTE_FLAGS_MASK) of @pgd. */
static inline pgdval_t pgd_flags(pgd_t pgd)
{
        pgdval_t val = native_pgd_val(pgd);

        return val & PTE_FLAGS_MASK;
}

#if CONFIG_PGTABLE_LEVELS > 4
typedef struct { p4dval_t p4d; } p4d_t;

static inline p4d_t native_make_p4d(pudval_t val)
{
        return (p4d_t) { val };
}

static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        return p4d.p4d;
}
#else
#include <asm-generic/pgtable-nop4d.h>

/*
 * Folded P4D level: a p4d_t wraps a pgd_t, so construction goes through
 * native_make_pgd(). The parameter is typed p4dval_t to match
 * native_p4d_val().
 */
static inline p4d_t native_make_p4d(p4dval_t val)
{
        return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
}

/* Return the raw value of the pgd_t embedded in @p4d. */
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        return native_pgd_val(p4d.pgd);
}
#endif

#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;

static inline pud_t native_make_pud(pmdval_t val)
{
        return (pud_t) { val };
}

static inline pudval_t native_pud_val(pud_t pud)
{
        return pud.pud;
}
#else
#include <asm-generic/pgtable-nopud.h>

/*
 * Folded PUD level: a pud_t wraps a p4d_t which wraps a pgd_t, so the
 * value is stored through native_make_pgd().
 */
static inline pud_t native_make_pud(pudval_t val)
{
        pud_t pud = { .p4d.pgd = native_make_pgd(val) };

        return pud;
}

/* Return the raw value of the pgd_t embedded in @pud. */
static inline pudval_t native_pud_val(pud_t pud)
{
        pgd_t inner = pud.p4d.pgd;

        return native_pgd_val(inner);
}
#endif

#if CONFIG_PGTABLE_LEVELS > 2
/* Real PMD level (more than two page-table levels): distinct entry type. */
typedef struct { pmdval_t pmd; } pmd_t;

/* Wrap the raw value @val in a pmd_t. */
static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t pmd = { val };

        return pmd;
}

/* Return the raw value stored in @pmd. */
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        pmdval_t raw = pmd.pmd;

        return raw;
}
#else
#include <asm-generic/pgtable-nopmd.h>

/*
 * Folded PMD level: a pmd_t wraps pud_t -> p4d_t -> pgd_t, so the value
 * is stored through native_make_pgd().
 */
static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t pmd = { .pud.p4d.pgd = native_make_pgd(val) };

        return pmd;
}

/* Return the raw value of the pgd_t embedded in @pmd. */
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        pgd_t inner = pmd.pud.p4d.pgd;

        return native_pgd_val(inner);
}
#endif

/* PFN mask for a p4d entry; the entry itself is not inspected. */
static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
{
        /* No 512 GiB huge pages yet */
        p4dval_t mask = PTE_PFN_MASK;

        return mask;
}

/* Flag mask for a p4d entry: the complement of its PFN mask. */
static inline p4dval_t p4d_flags_mask(p4d_t p4d)
{
        p4dval_t pfn_bits = p4d_pfn_mask(p4d);

        return ~pfn_bits;
}

/* Flag bits of a p4d entry: everything outside the PFN mask. */
static inline p4dval_t p4d_flags(p4d_t p4d)
{
        p4dval_t val = native_p4d_val(p4d);
        p4dval_t mask = p4d_flags_mask(p4d);

        return val & mask;
}

/*
 * PFN mask for a pud entry: PHYSICAL_PUD_PAGE_MASK when _PAGE_PSE is set
 * in the entry, PTE_PFN_MASK otherwise.
 */
static inline pudval_t pud_pfn_mask(pud_t pud)
{
        pudval_t val = native_pud_val(pud);

        if (!(val & _PAGE_PSE))
                return PTE_PFN_MASK;

        return PHYSICAL_PUD_PAGE_MASK;
}

/* Flag mask for a pud entry: the complement of its PFN mask. */
static inline pudval_t pud_flags_mask(pud_t pud)
{
        pudval_t pfn_bits = pud_pfn_mask(pud);

        return ~pfn_bits;
}

/* Flag bits of a pud entry: everything outside the PFN mask. */
static inline pudval_t pud_flags(pud_t pud)
{
        pudval_t val = native_pud_val(pud);
        pudval_t mask = pud_flags_mask(pud);

        return val & mask;
}

/*
 * PFN mask for a pmd entry: PHYSICAL_PMD_PAGE_MASK when _PAGE_PSE is set
 * in the entry, PTE_PFN_MASK otherwise.
 */
static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
{
        pmdval_t val = native_pmd_val(pmd);

        if (!(val & _PAGE_PSE))
                return PTE_PFN_MASK;

        return PHYSICAL_PMD_PAGE_MASK;
}

/* Flag mask for a pmd entry: the complement of its PFN mask. */
static inline pmdval_t pmd_flags_mask(pmd_t pmd)
{
        pmdval_t pfn_bits = pmd_pfn_mask(pmd);

        return ~pfn_bits;
}

/* Flag bits of a pmd entry: everything outside the PFN mask. */
static inline pmdval_t pmd_flags(pmd_t pmd)
{
        pmdval_t val = native_pmd_val(pmd);
        pmdval_t mask = pmd_flags_mask(pmd);

        return val & mask;
}

/* Wrap a raw pte value in the pte_t container type. */
static inline pte_t native_make_pte(pteval_t val)
{
        pte_t entry = { .pte = val };

        return entry;
}

/* Return the raw value stored in a pte_t. */
static inline pteval_t native_pte_val(pte_t pte)
{
        pteval_t raw = pte.pte;

        return raw;
}

/* Flag bits of a pte: its value restricted to PTE_FLAGS_MASK. */
static inline pteval_t pte_flags(pte_t pte)
{
        pteval_t val = native_pte_val(pte);

        return val & PTE_FLAGS_MASK;
}

/*
 * Pack the PWT, PCD and PAT bits of 'cb' into a 3-bit index
 * (bit 0 = PWT, bit 1 = PCD, bit 2 = PAT).  'cb' is evaluated three times.
 */
#define __pte2cm_idx(cb)                                        \
        ((((cb) >> _PAGE_BIT_PWT) & 1) |                        \
         (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) |                        \
         (((cb) >> (_PAGE_BIT_PAT - 2)) & 4))
/*
 * Inverse of __pte2cm_idx(): spread the low three bits of 'i' back to the
 * PWT, PCD and PAT bit positions.  'i' is evaluated three times.
 */
#define __cm_idx2pte(i)                                                \
        ((((i) & 1) << _PAGE_BIT_PWT) |                                \
         (((i) & 2) << (_PAGE_BIT_PCD - 1)) |                        \
         (((i) & 4) << (_PAGE_BIT_PAT - 2)))

/*
 * Prototype only; the definition is not in this header.
 * NOTE(review): presumably returns the protection-value encoding of the
 * page cache mode @pcm - confirm against the definition.
 */
unsigned long cachemode2protval(enum page_cache_mode pcm);

/*
 * Convert a protection value from the 4k layout to the large-page layout:
 * clear both PAT positions, then re-insert the _PAGE_PAT bit at the
 * _PAGE_PAT_LARGE position.
 */
static inline pgprotval_t protval_4k_2_large(pgprotval_t val)
{
        pgprotval_t rest = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);
        pgprotval_t pat = val & _PAGE_PAT;

        return rest | (pat << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
}
/* pgprot_t wrapper around protval_4k_2_large(). */
static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
{
        pgprotval_t val = pgprot_val(pgprot);

        return __pgprot(protval_4k_2_large(val));
}
/*
 * Convert a protection value from the large-page layout to the 4k layout:
 * clear both PAT positions, then re-insert the _PAGE_PAT_LARGE bit at the
 * _PAGE_PAT position.
 */
static inline pgprotval_t protval_large_2_4k(pgprotval_t val)
{
        pgprotval_t rest = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);
        pgprotval_t pat_large = val & _PAGE_PAT_LARGE;

        return rest | (pat_large >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
}
/* pgprot_t wrapper around protval_large_2_4k(). */
static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
{
        pgprotval_t val = pgprot_val(pgprot);

        return __pgprot(protval_large_2_4k(val));
}


/* pgtable_t is a pointer to struct page. */
typedef struct page *pgtable_t;

/* Objects and functions below are only declared here; definitions live elsewhere. */
extern pteval_t __supported_pte_mask;
extern pteval_t __default_kernel_pte_mask;
extern void set_nx(void);
extern int nx_enabled;

/*
 * The self-referential defines make the function names visible to the
 * preprocessor (NOTE(review): presumably so generic code can test for an
 * arch-provided version with #ifdef - confirm).
 */
#define pgprot_writecombine        pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);

#define pgprot_writethrough        pgprot_writethrough
extern pgprot_t pgprot_writethrough(pgprot_t prot);

/* Indicate that x86 has its own track and untrack pfn vma functions */
#define __HAVE_PFNMAP_TRACKING

#define __HAVE_PHYS_MEM_ACCESS_PROT
/* Forward declaration: only a pointer to struct file is needed below. */
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                              unsigned long size, pgprot_t vma_prot);

/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);

/* On non-32-bit builds native_pagetable_init is just another name for paging_init. */
#ifdef CONFIG_X86_32
extern void native_pagetable_init(void);
#else
#define native_pagetable_init        paging_init
#endif

/* Forward declaration: only a pointer to struct seq_file is needed below. */
struct seq_file;
extern void arch_report_meminfo(struct seq_file *m);

/*
 * Page-table level identifiers.  Values are consecutive starting at zero,
 * so PG_LEVEL_NUM equals the number of levels listed before it (including
 * PG_LEVEL_NONE).
 */
enum pg_level {
        PG_LEVEL_NONE = 0,
        PG_LEVEL_4K,
        PG_LEVEL_2M,
        PG_LEVEL_1G,
        PG_LEVEL_512G,
        PG_LEVEL_NUM,        /* count of the entries above; keep last */
};

/*
 * update_page_count() has a real (out-of-line) implementation only when
 * CONFIG_PROC_FS is enabled; otherwise it is an empty inline stub so that
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_PROC_FS
extern void update_page_count(int level, unsigned long pages);
#else
static inline void update_page_count(int level, unsigned long pages) { }
#endif

/*
 * Helper function that returns the kernel pagetable entry controlling
 * the virtual address 'address'. NULL means no pagetable entry present.
 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
 * as a pte too.
 *
 * 'level' is passed by pointer (NOTE(review): presumably an output that
 * receives the enum pg_level of the returned entry - confirm against the
 * definition).
 */
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
                                    unsigned int *level);

/* Forward declaration: only a pointer to struct mm_struct is needed below. */
struct mm_struct;
extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
                                   unsigned int *level);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
/* The two helpers below are marked __init. */
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
                                          unsigned long address,
                                          unsigned numpages,
                                          unsigned long page_flags);
extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
                                            unsigned long numpages);
#endif        /* !__ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_DEFS_H */































































































































































































































































































































































































































    1 































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NODEMASK_H
#define __LINUX_NODEMASK_H

/*
 * Nodemasks provide a bitmap suitable for representing the
 * set of Node's in a system, one bit position per Node number.
 *
 * See detailed comments in the file linux/bitmap.h describing the
 * data type on which these nodemasks are based.
 *
 * For details of nodemask_parse_user(), see bitmap_parse_user() in
 * lib/bitmap.c.  For details of nodelist_parse(), see bitmap_parselist(),
 * also in bitmap.c.  For details of node_remap(), see bitmap_bitremap in
 * lib/bitmap.c.  For details of nodes_remap(), see bitmap_remap in
 * lib/bitmap.c.  For details of nodes_onto(), see bitmap_onto in
 * lib/bitmap.c.  For details of nodes_fold(), see bitmap_fold in
 * lib/bitmap.c.
 *
 * The available nodemask operations are:
 *
 * void node_set(node, mask)                turn on bit 'node' in mask
 * void node_clear(node, mask)                turn off bit 'node' in mask
 * void nodes_setall(mask)                set all bits
 * void nodes_clear(mask)                clear all bits
 * int node_isset(node, mask)                true iff bit 'node' set in mask
 * int node_test_and_set(node, mask)        test and set bit 'node' in mask
 *
 * void nodes_and(dst, src1, src2)        dst = src1 & src2  [intersection]
 * void nodes_or(dst, src1, src2)        dst = src1 | src2  [union]
 * void nodes_xor(dst, src1, src2)        dst = src1 ^ src2
 * void nodes_andnot(dst, src1, src2)        dst = src1 & ~src2
 * void nodes_complement(dst, src)        dst = ~src
 *
 * int nodes_equal(mask1, mask2)        Does mask1 == mask2?
 * int nodes_intersects(mask1, mask2)        Do mask1 and mask2 intersect?
 * int nodes_subset(mask1, mask2)        Is mask1 a subset of mask2?
 * int nodes_empty(mask)                Is mask empty (no bits set)?
 * int nodes_full(mask)                        Is mask full (all bits set)?
 * int nodes_weight(mask)                Hamming weight - number of set bits
 *
 * void nodes_shift_right(dst, src, n)        Shift right
 * void nodes_shift_left(dst, src, n)        Shift left
 *
 * unsigned int first_node(mask)        Number lowest set bit, or MAX_NUMNODES
 * unsigned int next_node(node, mask)        Next node past 'node', or MAX_NUMNODES
 * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first,
 *                                        or MAX_NUMNODES
 * unsigned int first_unset_node(mask)        First node not set in mask, or
 *                                        MAX_NUMNODES
 *
 * nodemask_t nodemask_of_node(node)        Return nodemask with bit 'node' set
 * NODE_MASK_ALL                        Initializer - all bits set
 * NODE_MASK_NONE                        Initializer - no bits set
 * unsigned long *nodes_addr(mask)        Array of unsigned long's in mask
 *
 * int nodemask_parse_user(ubuf, ulen, mask)        Parse ascii string as nodemask
 * int nodelist_parse(buf, map)                Parse ascii string as nodelist
 * int node_remap(oldbit, old, new)        newbit = map(old, new)(oldbit)
 * void nodes_remap(dst, src, old, new)        *dst = map(old, new)(src)
 * void nodes_onto(dst, orig, relmap)        *dst = orig relative to relmap
 * void nodes_fold(dst, orig, sz)        dst bits = orig bits mod sz
 *
 * for_each_node_mask(node, mask)        for-loop node over mask
 *
 * int num_online_nodes()                Number of online Nodes
 * int num_possible_nodes()                Number of all possible Nodes
 *
 * int node_random(mask)                Random node with set bit in mask
 *
 * int node_online(node)                Is some node online?
 * int node_possible(node)                Is some node possible?
 *
 * node_set_online(node)                set bit 'node' in node_online_map
 * node_set_offline(node)                clear bit 'node' in node_online_map
 *
 * for_each_node(node)                        for-loop node over node_possible_map
 * for_each_online_node(node)                for-loop node over node_online_map
 *
 * Subtlety:
 * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
 *    to generate slightly worse code.  So use a simple one-line #define
 *    for node_isset(), instead of wrapping an inline inside a macro, the
 *    way we do the other calls.
 *
 * NODEMASK_SCRATCH
 * When doing above logical AND, OR, XOR, Remap operations the callers tend to
 * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large,
 * nodemask_t's consume too much stack space.  NODEMASK_SCRATCH is a helper
 * for such situations. See below and CPUMASK_ALLOC also.
 */

#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/minmax.h>
#include <linux/numa.h>

/* A nodemask is a bitmap of MAX_NUMNODES bits wrapped in a struct. */
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
/* Used for its type via typeof() in nodemask_of_node(); declared only. */
extern nodemask_t _unused_nodemask_arg_;

/**
 * nodemask_pr_args - printf args to output a nodemask
 * @maskp: nodemask to be printed
 *
 * Expands to the two arguments (bit count, then bitmap pointer) consumed by
 * a '%*pb[l]' conversion.  A NULL @maskp yields 0 and NULL.
 */
#define nodemask_pr_args(maskp)        __nodemask_pr_numnodes(maskp), \
                                __nodemask_pr_bits(maskp)
/* Width argument: MAX_NUMNODES for a real mask, 0 for NULL. */
static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
{
        if (!m)
                return 0;

        return MAX_NUMNODES;
}
/* Pointer argument: the mask's bitmap words, or NULL for a NULL mask. */
static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
{
        if (!m)
                return NULL;

        return m->bits;
}

/*
 * The inline keyword gives the compiler room to decide to inline, or
 * not inline a function as it sees best.  However, as these functions
 * are called in both __init and non-__init functions, if they are not
 * inlined we will end up with a section mis-match error (of the type of
 * freeable items not being freed).  So we must use __always_inline here
 * to fix the problem.  If other functions in the future also end up in
 * this situation they will also need to be annotated as __always_inline
 */
/* node_set - turn on bit 'node' in the mask 'dst' (via set_bit()). */
#define node_set(node, dst) __node_set((node), &(dst))
static __always_inline void __node_set(int node, volatile nodemask_t *dstp)
{
        volatile unsigned long *words = dstp->bits;

        set_bit(node, words);
}

/* node_clear - turn off bit 'node' in the mask 'dst' (via clear_bit()). */
#define node_clear(node, dst) __node_clear((node), &(dst))
static inline void __node_clear(int node, volatile nodemask_t *dstp)
{
        volatile unsigned long *words = dstp->bits;

        clear_bit(node, words);
}

/* nodes_setall - set all MAX_NUMNODES bits of 'dst'. */
#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
{
        unsigned long *words = dstp->bits;

        bitmap_fill(words, nbits);
}

/* nodes_clear - clear all MAX_NUMNODES bits of 'dst'. */
#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
{
        unsigned long *words = dstp->bits;

        bitmap_zero(words, nbits);
}

/*
 * node_isset - result of test_bit() for bit 'node' in 'nodemask'.
 * No static inline type checking - see Subtlety (1) above.
 */
#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)

/*
 * node_test_and_set - set bit 'node' in 'nodemask' and return the result
 * of test_and_set_bit() for it.
 */
#define node_test_and_set(node, nodemask) \
                        __node_test_and_set((node), &(nodemask))
static inline bool __node_test_and_set(int node, nodemask_t *addr)
{
        unsigned long *words = addr->bits;

        return test_and_set_bit(node, words);
}

/* nodes_and - dst = src1 & src2 (intersection). */
#define nodes_and(dst, src1, src2) \
                        __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
                               const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        bitmap_and(dstp->bits, lhs, rhs, nbits);
}

/* nodes_or - dst = src1 | src2 (union). */
#define nodes_or(dst, src1, src2) \
                        __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
                              const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        bitmap_or(dstp->bits, lhs, rhs, nbits);
}

/* nodes_xor - dst = src1 ^ src2. */
#define nodes_xor(dst, src1, src2) \
                        __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
                               const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        bitmap_xor(dstp->bits, lhs, rhs, nbits);
}

/* nodes_andnot - dst = src1 & ~src2. */
#define nodes_andnot(dst, src1, src2) \
                        __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
                                  const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        bitmap_andnot(dstp->bits, lhs, rhs, nbits);
}

/* nodes_complement - dst = ~src. */
#define nodes_complement(dst, src) \
                        __nodes_complement(&(dst), &(src), MAX_NUMNODES)
static inline void __nodes_complement(nodemask_t *dstp,
                                      const nodemask_t *srcp, unsigned int nbits)
{
        const unsigned long *from = srcp->bits;

        bitmap_complement(dstp->bits, from, nbits);
}

/* nodes_equal - does src1 == src2? */
#define nodes_equal(src1, src2) \
                        __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_equal(const nodemask_t *src1p,
                                 const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        return bitmap_equal(lhs, rhs, nbits);
}

/* nodes_intersects - do src1 and src2 share any set bit? */
#define nodes_intersects(src1, src2) \
                        __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_intersects(const nodemask_t *src1p,
                                      const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        return bitmap_intersects(lhs, rhs, nbits);
}

/* nodes_subset - is src1 a subset of src2? */
#define nodes_subset(src1, src2) \
                        __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_subset(const nodemask_t *src1p,
                                  const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        return bitmap_subset(lhs, rhs, nbits);
}

/* nodes_empty - is the mask empty (no bits set)? */
#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
{
        const unsigned long *words = srcp->bits;

        return bitmap_empty(words, nbits);
}

/* nodes_full - is the mask full (all bits set)? */
#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits)
{
        const unsigned long *words = srcp->bits;

        return bitmap_full(words, nbits);
}

/* nodes_weight - Hamming weight: number of set bits in the mask. */
#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
{
        const unsigned long *words = srcp->bits;

        return bitmap_weight(words, nbits);
}

/* nodes_shift_right - dst = src shifted right by n bits. */
#define nodes_shift_right(dst, src, n) \
                        __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
static inline void __nodes_shift_right(nodemask_t *dstp,
                                       const nodemask_t *srcp, int n, int nbits)
{
        const unsigned long *from = srcp->bits;

        bitmap_shift_right(dstp->bits, from, n, nbits);
}

/* nodes_shift_left - dst = src shifted left by n bits. */
#define nodes_shift_left(dst, src, n) \
                        __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
static inline void __nodes_shift_left(nodemask_t *dstp,
                                      const nodemask_t *srcp, int n, int nbits)
{
        const unsigned long *from = srcp->bits;

        bitmap_shift_left(dstp->bits, from, n, nbits);
}

/* FIXME: better would be to fix all architectures to never return
          > MAX_NUMNODES, then the silly min_ts could be dropped. */

#define first_node(src) __first_node(&(src))
static inline unsigned int __first_node(const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

#define next_node(n, src) __next_node((n), &(src))
static inline unsigned int __next_node(int n, const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

/*
 * Find the next present node in src, starting after node n, wrapping around to
 * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
 *
 * Only the prototype of __next_node_in() appears here; it is not an inline.
 */
#define next_node_in(n, src) __next_node_in((n), &(src))
unsigned int __next_node_in(int node, const nodemask_t *srcp);

/* Reset *mask so that 'node' is its only set bit. */
static inline void init_nodemask_of_node(nodemask_t *mask, int node)
{
        __nodes_clear(mask, MAX_NUMNODES);
        __node_set(node, mask);
}

/*
 * nodemask_of_node - evaluate to a nodemask_t with only bit 'node' set.
 *
 * When the whole mask is exactly one unsigned long wide, the single word is
 * assigned directly; otherwise init_nodemask_of_node() clears the mask and
 * sets the bit.  The temporary takes its type from _unused_nodemask_arg_
 * via typeof().
 */
#define nodemask_of_node(node)                                                \
({                                                                        \
        typeof(_unused_nodemask_arg_) m;                                \
        if (sizeof(m) == sizeof(unsigned long)) {                        \
                m.bits[0] = 1UL << (node);                                \
        } else {                                                        \
                init_nodemask_of_node(&m, (node));                        \
        }                                                                \
        m;                                                                \
})

#define first_unset_node(mask) __first_unset_node(&(mask))
static inline unsigned int __first_unset_node(const nodemask_t *maskp)
{
        return min_t(unsigned int, MAX_NUMNODES,
                        find_first_zero_bit(maskp->bits, MAX_NUMNODES));
}

/* Value of the final bitmap word when every valid node bit is set. */
#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

/*
 * NODE_MASK_ALL - initializer with all bits set.  When the mask fits in one
 * word only the last (and only) word is initialized; otherwise all earlier
 * words are filled with ~0UL and the last word gets NODE_MASK_LAST_WORD.
 */
#if MAX_NUMNODES <= BITS_PER_LONG

#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#else

#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL,                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#endif

/* NODE_MASK_NONE - initializer with no bits set. */
#define NODE_MASK_NONE                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] =  0UL                        \
} })

/* nodes_addr - the array of unsigned longs backing the mask. */
#define nodes_addr(src) ((src).bits)

/*
 * nodemask_parse_user - parse an ASCII string from user space as a
 * nodemask; returns whatever bitmap_parse_user() returns.
 */
#define nodemask_parse_user(ubuf, ulen, dst) \
                __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
static inline int __nodemask_parse_user(const char __user *buf, int len,
                                        nodemask_t *dstp, int nbits)
{
        unsigned long *words = dstp->bits;

        return bitmap_parse_user(buf, len, words, nbits);
}

/*
 * nodelist_parse - parse an ASCII string as a node list; returns whatever
 * bitmap_parselist() returns.
 */
#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
{
        unsigned long *words = dstp->bits;

        return bitmap_parselist(buf, words, nbits);
}

/* node_remap - newbit = map(old, new)(oldbit); see bitmap_bitremap(). */
#define node_remap(oldbit, old, new) \
                __node_remap((oldbit), &(old), &(new), MAX_NUMNODES)
static inline int __node_remap(int oldbit,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        const unsigned long *from = oldp->bits;
        const unsigned long *to = newp->bits;

        return bitmap_bitremap(oldbit, from, to, nbits);
}

/* nodes_remap - dst = map(old, new)(src); see bitmap_remap(). */
#define nodes_remap(dst, src, old, new) \
                __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES)
static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        const unsigned long *from = oldp->bits;
        const unsigned long *to = newp->bits;

        bitmap_remap(dstp->bits, srcp->bits, from, to, nbits);
}

/* nodes_onto - dst = orig relative to relmap; see bitmap_onto(). */
#define nodes_onto(dst, orig, relmap) \
                __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
                const nodemask_t *relmapp, int nbits)
{
        const unsigned long *orig_words = origp->bits;
        const unsigned long *rel_words = relmapp->bits;

        bitmap_onto(dstp->bits, orig_words, rel_words, nbits);
}

/*
 * nodes_fold - dst bits = orig bits mod sz; see bitmap_fold().
 *
 * 'sz' is parenthesized like the arguments of the other wrapper macros in
 * this file.
 */
#define nodes_fold(dst, orig, sz) \
                __nodes_fold(&(dst), &(orig), (sz), MAX_NUMNODES)
static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
                int sz, int nbits)
{
        bitmap_fold(dstp->bits, origp->bits, sz, nbits);
}

/*
 * for_each_node_mask - loop 'node' over every bit set in 'mask'.
 *
 * With more than one possible node the loop walks first_node()/next_node()
 * until the value leaves the [0, MAX_NUMNODES) range.  With a single
 * possible node the body runs once for node 0, and only if the mask is not
 * empty.  'node' must be an lvalue and is evaluated several times.
 */
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)                                    \
        for ((node) = first_node(mask);                                    \
             ((node) >= 0) && (node) < MAX_NUMNODES;                    \
             (node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
#define for_each_node_mask(node, mask)                                  \
        for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++)
#endif /* MAX_NUMNODES */

/*
 * Bitmasks that are kept for all the nodes.
 *
 * Each enumerator indexes one nodemask in node_states[]; NR_NODE_STATES is
 * the array size.
 */
enum node_states {
        N_POSSIBLE,                /* The node could become online at some point */
        N_ONLINE,                /* The node is online */
        N_NORMAL_MEMORY,        /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
        N_HIGH_MEMORY,                /* The node has regular or high memory */
#else
        /* Alias: shares the N_NORMAL_MEMORY slot instead of adding one. */
        N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
        N_MEMORY,                /* The node has memory(regular, high, movable) */
        N_CPU,                /* The node has one or more cpus */
        N_GENERIC_INITIATOR,        /* The node has one or more Generic Initiators */
        NR_NODE_STATES                /* Number of distinct states; keep last */
};

/*
 * The following particular system nodemasks and operations
 * on them manage all possible and online nodes.
 *
 * node_states[] holds one nodemask per enum node_states value; it is only
 * declared here.
 */

extern nodemask_t node_states[NR_NODE_STATES];

#if MAX_NUMNODES > 1
static inline int node_state(int node, enum node_states state)
{
        return node_isset(node, node_states[state]);
}

static inline void node_set_state(int node, enum node_states state)
{
        __node_set(node, &node_states[state]);
}

static inline void node_clear_state(int node, enum node_states state)
{
        __node_clear(node, &node_states[state]);
}

static inline int num_node_state(enum node_states state)
{
        return nodes_weight(node_states[state]);
}

/* Loop '__node' over every node set in the nodemask for '__state'. */
#define for_each_node_state(__node, __state) \
        for_each_node_mask((__node), node_states[__state])

/* Lowest-numbered node in the N_ONLINE / N_MEMORY mask, or MAX_NUMNODES. */
#define first_online_node        __first_node(&node_states[N_ONLINE])
#define first_memory_node        __first_node(&node_states[N_MEMORY])
/* Next node after 'nid' in the N_ONLINE mask, or MAX_NUMNODES. */
static inline unsigned int next_online_node(int nid)
{
        return __next_node(nid, &node_states[N_ONLINE]);
}
/* Next node after 'nid' in the N_MEMORY mask, or MAX_NUMNODES. */
static inline unsigned int next_memory_node(int nid)
{
        return __next_node(nid, &node_states[N_MEMORY]);
}

/* Declared only; nr_online_nodes is refreshed by the two helpers below. */
extern unsigned int nr_node_ids;
extern unsigned int nr_online_nodes;

/* Mark 'nid' online, then recompute nr_online_nodes from the N_ONLINE mask. */
static inline void node_set_online(int nid)
{
        node_set_state(nid, N_ONLINE);
        /* Must follow the bit update so the count reflects it. */
        nr_online_nodes = num_node_state(N_ONLINE);
}

/* Mark 'nid' offline, then recompute nr_online_nodes from the N_ONLINE mask. */
static inline void node_set_offline(int nid)
{
        node_clear_state(nid, N_ONLINE);
        /* Must follow the bit update so the count reflects it. */
        nr_online_nodes = num_node_state(N_ONLINE);
}

#else

/* Single-node build: only node 0 exists, and it is in every state. */
static inline int node_state(int node, enum node_states state)
{
        return !node;
}

/* Single-node build: there is no state to change. */
static inline void node_set_state(int node, enum node_states state)
{
        /* intentionally empty */
}

/* Single-node build: there is no state to change. */
static inline void node_clear_state(int node, enum node_states state)
{
        /* intentionally empty */
}

/* Single-node build: every state always contains exactly one node. */
static inline int num_node_state(enum node_states state)
{
        return 1;
}

/* Single-node build: the loop body runs exactly once, with node == 0. */
#define for_each_node_state(node, __state) \
        for ( (node) = 0; (node) == 0; (node) = 1)

/*
 * Single-node build: node 0 is the first (and only) online/memory node, so
 * there is never a "next" one.  next_memory_node() is provided alongside
 * next_online_node() so both helpers exist in either configuration.
 */
#define first_online_node        0
#define first_memory_node        0
#define next_online_node(nid)        (MAX_NUMNODES)
#define next_memory_node(nid)        (MAX_NUMNODES)
#define nr_node_ids                1U
#define nr_online_nodes                1U

/* These forward to the node_*_state() helpers with N_ONLINE. */
#define node_set_online(node)           node_set_state((node), N_ONLINE)
#define node_set_offline(node)           node_clear_state((node), N_ONLINE)

#endif

/*
 * node_random - random node with set bit in mask.
 *
 * The real implementation is out of line and only built for NUMA
 * configurations with more than one possible node; otherwise the stub
 * always returns node 0 and ignores the mask.
 */
#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
extern int node_random(const nodemask_t *maskp);
#else
static inline int node_random(const nodemask_t *mask)
{
        return 0;
}
#endif

/* Names for the N_ONLINE and N_POSSIBLE entries of node_states[]. */
#define node_online_map         node_states[N_ONLINE]
#define node_possible_map         node_states[N_POSSIBLE]

/* Counts and membership tests for the online / possible states. */
#define num_online_nodes()        num_node_state(N_ONLINE)
#define num_possible_nodes()        num_node_state(N_POSSIBLE)
#define node_online(node)        node_state((node), N_ONLINE)
#define node_possible(node)        node_state((node), N_POSSIBLE)

/* Loops over all possible nodes / all online nodes. */
#define for_each_node(node)           for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

/*
 * For nodemask scratch area.
 * NODEMASK_ALLOC(type, name, gfp_flags) declares a pointer 'name' to an
 * object of the specified type.  When NODES_SHIFT > 8 the object is
 * obtained with kmalloc(sizeof(*name), gfp_flags) and NODEMASK_FREE()
 * kfree()s it; otherwise 'name' points at a local object '_name',
 * gfp_flags is unused and NODEMASK_FREE() does nothing.
 * NOTE(review): in the kmalloc case callers presumably need to check
 * 'name' for NULL before use - confirm at the call sites.
 */
#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */
#define NODEMASK_ALLOC(type, name, gfp_flags)        \
                        type *name = kmalloc(sizeof(*name), gfp_flags)
#define NODEMASK_FREE(m)                        kfree(m)
#else
#define NODEMASK_ALLOC(type, name, gfp_flags)        type _##name, *name = &_##name
#define NODEMASK_FREE(m)                        do {} while (0)
#endif

/* An example structure for using NODEMASK_ALLOC, used in mempolicy. */
struct nodemask_scratch {
        nodemask_t        mask1;
        nodemask_t        mask2;
};

/*
 * NODEMASK_SCRATCH(x) declares 'x' as a pointer to a struct
 * nodemask_scratch via NODEMASK_ALLOC with GFP_KERNEL | __GFP_NORETRY;
 * NODEMASK_SCRATCH_FREE(x) releases it via NODEMASK_FREE.
 */
#define NODEMASK_SCRATCH(x)                                                \
                        NODEMASK_ALLOC(struct nodemask_scratch, x,        \
                                        GFP_KERNEL | __GFP_NORETRY)
#define NODEMASK_SCRATCH_FREE(x)        NODEMASK_FREE(x)


#endif /* __LINUX_NODEMASK_H */

































    1 










































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timer

#if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMER_H

#include <linux/tracepoint.h>
#include <linux/hrtimer.h>
#include <linux/timer.h>

/*
 * timer_class - event class for timer events that record only the timer
 * pointer, which is printed as "timer=%p".
 */
DECLARE_EVENT_CLASS(timer_class,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer),

        TP_STRUCT__entry(
                __field( void *,        timer        )
        ),

        TP_fast_assign(
                __entry->timer        = timer;
        ),

        TP_printk("timer=%p", __entry->timer)
);

/**
 * timer_init - called when the timer is initialized
 * @timer:        pointer to struct timer_list
 *
 * Uses the timer_class layout, so only the timer pointer is recorded.
 */
DEFINE_EVENT(timer_class, timer_init,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/*
 * Render a timer flags value as a "|"-separated list of one-letter codes:
 * M = TIMER_MIGRATING, D = TIMER_DEFERRABLE, P = TIMER_PINNED,
 * I = TIMER_IRQSAFE.
 */
#define decode_timer_flags(flags)                        \
        __print_flags(flags, "|",                        \
                {  TIMER_MIGRATING,        "M" },                \
                {  TIMER_DEFERRABLE,        "D" },                \
                {  TIMER_PINNED,        "P" },                \
                {  TIMER_IRQSAFE,        "I" })

/**
 * timer_start - called when the timer is started
 * @timer:        pointer to struct timer_list
 * @expires:        the timers expiry time
 * @flags:        the timer flags word; the cpu, idx and flags values of the
 *                trace output are all decoded from it
 *
 * The current value of jiffies is recorded together with @expires so that
 * the remaining timeout (expires - now) can be printed.
 */
TRACE_EVENT(timer_start,

        TP_PROTO(struct timer_list *timer,
                unsigned long expires,
                unsigned int flags),

        TP_ARGS(timer, expires, flags),

        TP_STRUCT__entry(
                __field( void *,        timer                )
                __field( void *,        function        )
                __field( unsigned long,        expires                )
                __field( unsigned long,        now                )
                __field( unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->function        = timer->function;
                __entry->expires        = expires;
                __entry->now                = jiffies;
                __entry->flags                = flags;
        ),

        TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s",
                  __entry->timer, __entry->function, __entry->expires,
                  (long)__entry->expires - __entry->now,
                  __entry->flags & TIMER_CPUMASK,
                  __entry->flags >> TIMER_ARRAYSHIFT,
                  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
);

/**
 * timer_expire_entry - called immediately before the timer callback
 * @timer:        pointer to struct timer_list
 * @baseclk:        clock value supplied by the caller; recorded and printed
 *                as-is (presumably the clock of the timer base - confirm
 *                against the caller)
 *
 * Allows to determine the timer latency. The current value of jiffies is
 * recorded as "now" and the callback is taken from timer->function.
 */
TRACE_EVENT(timer_expire_entry,

        TP_PROTO(struct timer_list *timer, unsigned long baseclk),

        TP_ARGS(timer, baseclk),

        TP_STRUCT__entry(
                __field( void *,        timer        )
                __field( unsigned long,        now        )
                __field( void *,        function)
                __field( unsigned long,        baseclk        )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->now                = jiffies;
                __entry->function        = timer->function;
                __entry->baseclk        = baseclk;
        ),

        TP_printk("timer=%p function=%ps now=%lu baseclk=%lu",
                  __entry->timer, __entry->function, __entry->now,
                  __entry->baseclk)
);

/**
 * timer_expire_exit - called immediately after the timer callback returns
 * @timer:        pointer to struct timer_list
 *
 * When used in combination with the timer_expire_entry tracepoint we can
 * determine the runtime of the timer callback function.
 *
 * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might
 * be invalid. We solely track the pointer.
 */
DEFINE_EVENT(timer_class, timer_expire_exit,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/**
 * timer_cancel - called when the timer is canceled
 * @timer:        pointer to struct timer_list
 *
 * Instance of timer_class: only the timer pointer is recorded.
 */
DEFINE_EVENT(timer_class, timer_cancel,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/*
 * Map a clock id to its symbolic name for trace output. Only the four
 * clocks listed below have a name in this table.
 */
#define decode_clockid(type)                                                \
        __print_symbolic(type,                                                \
                { CLOCK_REALTIME,        "CLOCK_REALTIME"        },        \
                { CLOCK_MONOTONIC,        "CLOCK_MONOTONIC"        },        \
                { CLOCK_BOOTTIME,        "CLOCK_BOOTTIME"        },        \
                { CLOCK_TAI,                "CLOCK_TAI"                })

/*
 * Map an hrtimer mode value to a readable string. Each combined mode
 * (ABS/REL with optional PINNED and SOFT/HARD) has its own table entry
 * because the value is looked up as a whole, not decoded bit by bit.
 */
#define decode_hrtimer_mode(mode)                                        \
        __print_symbolic(mode,                                                \
                { HRTIMER_MODE_ABS,                "ABS"                },        \
                { HRTIMER_MODE_REL,                "REL"                },        \
                { HRTIMER_MODE_ABS_PINNED,        "ABS|PINNED"        },        \
                { HRTIMER_MODE_REL_PINNED,        "REL|PINNED"        },        \
                { HRTIMER_MODE_ABS_SOFT,        "ABS|SOFT"        },        \
                { HRTIMER_MODE_REL_SOFT,        "REL|SOFT"        },        \
                { HRTIMER_MODE_ABS_PINNED_SOFT,        "ABS|PINNED|SOFT" },        \
                { HRTIMER_MODE_REL_PINNED_SOFT,        "REL|PINNED|SOFT" },        \
                { HRTIMER_MODE_ABS_HARD,        "ABS|HARD" },                \
                { HRTIMER_MODE_REL_HARD,        "REL|HARD" },                \
                { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" },        \
                { HRTIMER_MODE_REL_PINNED_HARD,        "REL|PINNED|HARD" })

/**
 * hrtimer_init - called when the hrtimer is initialized
 * @hrtimer:        pointer to struct hrtimer
 * @clockid:        the hrtimers clock
 * @mode:        the hrtimers mode
 *
 * @clockid and @mode are stored as raw values and turned into names by
 * decode_clockid() and decode_hrtimer_mode() when the event is printed.
 */
TRACE_EVENT(hrtimer_init,

        TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid,
                 enum hrtimer_mode mode),

        TP_ARGS(hrtimer, clockid, mode),

        TP_STRUCT__entry(
                __field( void *,                hrtimer                )
                __field( clockid_t,                clockid                )
                __field( enum hrtimer_mode,        mode                )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->clockid        = clockid;
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer,
                  decode_clockid(__entry->clockid),
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_start - called when the hrtimer is started
 * @hrtimer: pointer to struct hrtimer
 * @mode: the mode the hrtimer is started with; printed through
 *        decode_hrtimer_mode()
 *
 * The callback is read from hrtimer->function, and the expiry values are
 * read with hrtimer_get_expires() and hrtimer_get_softexpires().
 */
TRACE_EVENT(hrtimer_start,

        TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),

        TP_ARGS(hrtimer, mode),

        TP_STRUCT__entry(
                __field( void *,        hrtimer                )
                __field( void *,        function        )
                __field( s64,                expires                )
                __field( s64,                softexpires        )
                __field( enum hrtimer_mode,        mode        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->function        = hrtimer->function;
                __entry->expires        = hrtimer_get_expires(hrtimer);
                __entry->softexpires        = hrtimer_get_softexpires(hrtimer);
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu "
                  "mode=%s", __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->expires,
                  (unsigned long long) __entry->softexpires,
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_expire_entry - called immediately before the hrtimer callback
 * @hrtimer:        pointer to struct hrtimer
 * @now:        pointer to variable which contains current time of the
 *                timers base.
 *
 * Allows to determine the timer latency. @now is dereferenced when the
 * event is recorded, so it must point to a valid ktime_t at that time.
 */
TRACE_EVENT(hrtimer_expire_entry,

        TP_PROTO(struct hrtimer *hrtimer, ktime_t *now),

        TP_ARGS(hrtimer, now),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
                __field( s64,                now        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->now                = *now;
                __entry->function        = hrtimer->function;
        ),

        TP_printk("hrtimer=%p function=%ps now=%llu",
                  __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->now)
);

/*
 * hrtimer_class - event class shared by the hrtimer events that only need
 * to identify the hrtimer. The struct hrtimer pointer is stored and printed
 * as an opaque value; it is never dereferenced by this class.
 */
DECLARE_EVENT_CLASS(hrtimer_class,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
        ),

        TP_printk("hrtimer=%p", __entry->hrtimer)
);

/**
 * hrtimer_expire_exit - called immediately after the hrtimer callback returns
 * @hrtimer:        pointer to struct hrtimer
 *
 * When used in combination with the hrtimer_expire_entry tracepoint we can
 * determine the runtime of the callback function.
 *
 * Instance of hrtimer_class: only the hrtimer pointer is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * hrtimer_cancel - called when the hrtimer is canceled
 * @hrtimer:        pointer to struct hrtimer
 *
 * Instance of hrtimer_class: only the hrtimer pointer is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_cancel,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * itimer_state - called when itimer is started or canceled
 * @which:        name of the interval timer
 * @value:        the itimers value, itimer is canceled if value->it_value is
 *                zero, otherwise it is started
 * @expires:        the itimers expiry time
 *
 * The seconds and nanoseconds of value->it_value and value->it_interval are
 * copied into the event. On output each pair is shown as
 * "seconds.microseconds": the nanosecond part is divided by NSEC_PER_USEC.
 */
TRACE_EVENT(itimer_state,

        TP_PROTO(int which, const struct itimerspec64 *const value,
                 unsigned long long expires),

        TP_ARGS(which, value, expires),

        TP_STRUCT__entry(
                __field(        int,                        which                )
                __field(        unsigned long long,        expires                )
                __field(        long,                        value_sec        )
                __field(        long,                        value_nsec        )
                __field(        long,                        interval_sec        )
                __field(        long,                        interval_nsec        )
        ),

        TP_fast_assign(
                __entry->which                = which;
                __entry->expires        = expires;
                __entry->value_sec        = value->it_value.tv_sec;
                __entry->value_nsec        = value->it_value.tv_nsec;
                __entry->interval_sec        = value->it_interval.tv_sec;
                __entry->interval_nsec        = value->it_interval.tv_nsec;
        ),

        TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld",
                  __entry->which, __entry->expires,
                  __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC,
                  __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC)
);

/**
 * itimer_expire - called when itimer expires
 * @which:        type of the interval timer
 * @pid:        pid of the process which owns the timer
 * @now:        current time, used to calculate the latency of itimer
 *
 * @pid is a struct pid pointer; the event stores the numeric id obtained
 * from pid_nr() rather than the pointer.
 */
TRACE_EVENT(itimer_expire,

        TP_PROTO(int which, struct pid *pid, unsigned long long now),

        TP_ARGS(which, pid, now),

        TP_STRUCT__entry(
                __field( int ,                        which        )
                __field( pid_t,                        pid        )
                __field( unsigned long long,        now        )
        ),

        TP_fast_assign(
                __entry->which        = which;
                __entry->now        = now;
                __entry->pid        = pid_nr(pid);
        ),

        TP_printk("which=%d pid=%d now=%llu", __entry->which,
                  (int) __entry->pid, __entry->now)
);

#ifdef CONFIG_NO_HZ_COMMON

/*
 * List of tick dependency names. The list is written once in terms of the
 * helper macros tick_dep_mask_name(), tick_dep_name() and
 * tick_dep_name_end(), and is expanded twice below with different
 * definitions of those helpers:
 *   1) to emit TRACE_DEFINE_ENUM() for every TICK_DEP_BIT_/TICK_DEP_MASK_
 *      constant, and
 *   2) to build the { mask, "name" } table used by show_tick_dep_name().
 */
#define TICK_DEP_NAMES                                        \
                tick_dep_mask_name(NONE)                \
                tick_dep_name(POSIX_TIMER)                \
                tick_dep_name(PERF_EVENTS)                \
                tick_dep_name(SCHED)                        \
                tick_dep_name(CLOCK_UNSTABLE)                \
                tick_dep_name(RCU)                        \
                tick_dep_name_end(RCU_EXP)

/* Drop any earlier definitions of the helpers before (re)defining them. */
#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/* The MASK will convert to their bits and they need to be processed too */
#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
/* NONE only has a mask defined for it */
#define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);

/* First expansion: one TRACE_DEFINE_ENUM() per constant. */
TICK_DEP_NAMES

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/*
 * Second set of helper definitions: each entry becomes a { mask, "name" }
 * pair. Only the last entry (tick_dep_name_end) omits the trailing comma.
 */
#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }

/* Translate a TICK_DEP_MASK_* value into its name for trace output. */
#define show_tick_dep_name(val)                                \
        __print_symbolic(val, TICK_DEP_NAMES)

/*
 * tick_stop - tick stop tracepoint
 * @success:        recorded and printed as a plain integer
 * @dependency:        recorded as an integer and printed by name through
 *                show_tick_dep_name(), i.e. it is matched against the
 *                TICK_DEP_MASK_* values
 *
 * NOTE(review): the exact meaning of @success and the point at which the
 * event fires are defined by the caller, which is not visible here.
 */
TRACE_EVENT(tick_stop,

        TP_PROTO(int success, int dependency),

        TP_ARGS(success, dependency),

        TP_STRUCT__entry(
                __field( int ,                success        )
                __field( int ,                dependency )
        ),

        TP_fast_assign(
                __entry->success        = success;
                __entry->dependency        = dependency;
        ),

        TP_printk("success=%d dependency=%s",  __entry->success, \
                        show_tick_dep_name(__entry->dependency))
);
#endif

#endif /*  _TRACE_TIMER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * bvec iterator
 *
 * Copyright (C) 2001 Ming Lei <ming.lei@canonical.com>
 */
#ifndef __LINUX_BVEC_ITER_H
#define __LINUX_BVEC_ITER_H

#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/limits.h>
#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/types.h>

struct page;

/**
 * struct bio_vec - a contiguous range of physical memory addresses
 * @bv_page:   First page associated with the address range.
 * @bv_len:    Number of bytes in the address range.
 * @bv_offset: Start of the address range relative to the start of @bv_page.
 *
 * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len:
 *
 *   nth_page(@bv_page, n) == @bv_page + n
 *
 * This holds because page_is_mergeable() checks the above property.
 *
 * A bvec may therefore cover more than one page: @bv_offset + @bv_len is
 * allowed to exceed PAGE_SIZE, and the pages after @bv_page are reached by
 * plain pointer arithmetic on @bv_page.
 */
struct bio_vec {
        struct page        *bv_page;
        unsigned int        bv_len;
        unsigned int        bv_offset;
};

/*
 * struct bvec_iter - cursor into an array of struct bio_vec.
 *
 * bi_idx selects the current array element and bi_bvec_done is the number
 * of bytes already consumed inside that element; bi_size is the number of
 * bytes left in the whole iteration.
 */
struct bvec_iter {
        sector_t                bi_sector;        /* device address in 512 byte
                                                   sectors */
        unsigned int                bi_size;        /* residual I/O count */

        unsigned int                bi_idx;                /* current index into bvl_vec */

        unsigned int            bi_bvec_done;        /* number of bytes completed in
                                                   current bvec */
};

/*
 * struct bvec_iter_all - state for walking bvecs one page-sized piece at a
 * time with bvec_advance().
 */
struct bvec_iter_all {
        struct bio_vec        bv;        /* the piece produced by the last bvec_advance() */
        int                idx;        /* bumped each time a source bvec is fully consumed */
        unsigned        done;        /* bytes of the current source bvec handed out so far */
};

/*
 * various member access, note that bio_data should of course not be used
 * on highmem page vectors
 */
/* Pointer to the array element the iterator currently indexes. */
#define __bvec_iter_bvec(bvec, iter)        (&(bvec)[(iter).bi_idx])

/* multi-page (mp_bvec) helpers */
/* First page of the current bvec (not adjusted for bytes already done). */
#define mp_bvec_iter_page(bvec, iter)                                \
        (__bvec_iter_bvec((bvec), (iter))->bv_page)

/*
 * Bytes still available in the current bvec, capped by the bytes left in
 * the iteration as a whole.
 */
#define mp_bvec_iter_len(bvec, iter)                                \
        min((iter).bi_size,                                        \
            __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)

/* Byte offset of the current position, relative to the bvec's first page. */
#define mp_bvec_iter_offset(bvec, iter)                                \
        (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)

/* Index, counted from the bvec's first page, of the page holding the current position. */
#define mp_bvec_iter_page_idx(bvec, iter)                        \
        (mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)

/*
 * Build a temporary struct bio_vec describing the not yet consumed part of
 * the current bvec. The result may span several pages.
 */
#define mp_bvec_iter_bvec(bvec, iter)                                \
((struct bio_vec) {                                                \
        .bv_page        = mp_bvec_iter_page((bvec), (iter)),        \
        .bv_len                = mp_bvec_iter_len((bvec), (iter)),        \
        .bv_offset        = mp_bvec_iter_offset((bvec), (iter)),        \
})

/* For building single-page bvec in flight */
/* Offset of the current position within its own page. */
 #define bvec_iter_offset(bvec, iter)                                \
        (mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)

/*
 * Bytes available at the current position without crossing the end of the
 * current page.
 */
#define bvec_iter_len(bvec, iter)                                \
        min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),                \
              PAGE_SIZE - bvec_iter_offset((bvec), (iter)))

/* The page that actually contains the current position. */
#define bvec_iter_page(bvec, iter)                                \
        (mp_bvec_iter_page((bvec), (iter)) +                        \
         mp_bvec_iter_page_idx((bvec), (iter)))

/*
 * Build a temporary struct bio_vec for the current position that never
 * extends past the end of one page.
 */
#define bvec_iter_bvec(bvec, iter)                                \
((struct bio_vec) {                                                \
        .bv_page        = bvec_iter_page((bvec), (iter)),        \
        .bv_len                = bvec_iter_len((bvec), (iter)),        \
        .bv_offset        = bvec_iter_offset((bvec), (iter)),        \
})

/*
 * bvec_iter_advance - move an iterator forward by @bytes
 * @bv:    the bio_vec array the iterator walks
 * @iter:  iterator to update
 * @bytes: number of bytes to consume
 *
 * Returns true on success. If @bytes exceeds the bytes left in @iter a
 * one-time warning is issued, iter->bi_size is set to 0 and false is
 * returned; bi_idx and bi_bvec_done are left untouched in that case.
 */
static inline bool bvec_iter_advance(const struct bio_vec *bv,
                struct bvec_iter *iter, unsigned bytes)
{
        unsigned int cur;

        if (WARN_ONCE(bytes > iter->bi_size,
                     "Attempted to advance past end of bvec iter\n")) {
                iter->bi_size = 0;
                return false;
        }

        iter->bi_size -= bytes;

        /* Measure from the start of the current bvec, not from the cursor. */
        bytes += iter->bi_bvec_done;

        /* Step over every bvec that is consumed completely. */
        for (cur = iter->bi_idx; bytes && bytes >= bv[cur].bv_len; cur++)
                bytes -= bv[cur].bv_len;

        iter->bi_bvec_done = bytes;
        iter->bi_idx = cur;
        return true;
}

/*
 * Step the iterator to the start of the next bvec without changing
 * bi_size. Used by for_each_bvec() when the current bvec has zero length.
 */
static inline void bvec_iter_skip_zero_bvec(struct bvec_iter *iter)
{
        iter->bi_idx += 1;
        iter->bi_bvec_done = 0;
}

/*
 * for_each_bvec - walk a bio_vec array in single-page pieces
 * @bvl:     struct bio_vec variable that receives the current piece
 * @bio_vec: the bio_vec array to walk
 * @iter:    struct bvec_iter variable used as the cursor
 * @start:   initial iterator value, copied into @iter
 *
 * The loop runs while @iter still has bytes left (bi_size != 0). After each
 * pass the iterator is advanced by the length of the piece just visited;
 * a zero-length piece is stepped over with bvec_iter_skip_zero_bvec()
 * instead.
 */
#define for_each_bvec(bvl, bio_vec, iter, start)                        \
        for (iter = (start);                                                \
             (iter).bi_size &&                                                \
                ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1);        \
             (bvl).bv_len ? (void)bvec_iter_advance((bio_vec), &(iter),        \
                     (bvl).bv_len) : bvec_iter_skip_zero_bvec(&(iter)))

/*
 * for iterating one bio from start to end
 *
 * Starts at element 0 with nothing consumed. bi_size is UINT_MAX, so the
 * iterator's own size check does not bound the walk; presumably the user
 * of this initializer stops the iteration by other means - confirm
 * against the callers.
 */
#define BVEC_ITER_ALL_INIT (struct bvec_iter)                                \
{                                                                        \
        .bi_sector        = 0,                                                \
        .bi_size        = UINT_MAX,                                        \
        .bi_idx                = 0,                                                \
        .bi_bvec_done        = 0,                                                \
}

/*
 * Reset @iter_all to the beginning (index 0, nothing done) and return a
 * pointer to its embedded bio_vec, which bvec_advance() fills in.
 */
static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all)
{
        iter_all->idx = 0;
        iter_all->done = 0;

        return &iter_all->bv;
}

/*
 * bvec_advance - produce the next page-bounded piece of @bvec
 * @bvec:     the source bvec being split up
 * @iter_all: iteration state; iter_all->bv receives the piece
 *
 * The first piece starts at the page and in-page offset that bv_offset
 * points to; every later piece starts at offset 0 of the following page.
 * Once the whole of @bvec has been handed out, iter_all->idx is
 * incremented and iter_all->done is reset to 0.
 */
static inline void bvec_advance(const struct bio_vec *bvec,
                                struct bvec_iter_all *iter_all)
{
        struct bio_vec *piece = &iter_all->bv;
        unsigned int remaining = bvec->bv_len - iter_all->done;

        if (!iter_all->done) {
                /* First piece: locate the starting page and in-page offset. */
                piece->bv_page = bvec->bv_page + (bvec->bv_offset >> PAGE_SHIFT);
                piece->bv_offset = bvec->bv_offset & ~PAGE_MASK;
        } else {
                /* Later pieces begin at the top of the next page. */
                piece->bv_page++;
                piece->bv_offset = 0;
        }

        /* Never run past the end of the page or the end of the bvec. */
        piece->bv_len = min_t(unsigned int, PAGE_SIZE - piece->bv_offset,
                              remaining);
        iter_all->done += piece->bv_len;

        if (iter_all->done != bvec->bv_len)
                return;

        iter_all->idx++;
        iter_all->done = 0;
}

#endif /* __LINUX_BVEC_ITER_H */
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RATELIMIT_H
#define _LINUX_RATELIMIT_H

#include <linux/ratelimit_types.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/*
 * ratelimit_state_init - set up a ratelimit state
 * @rs:       state to initialize
 * @interval: value stored in rs->interval
 * @burst:    value stored in rs->burst
 *
 * The whole structure is zeroed first, so every other field starts at 0.
 */
static inline void ratelimit_state_init(struct ratelimit_state *rs,
                                        int interval, int burst)
{
        memset(rs, 0, sizeof(*rs));

        rs->burst        = burst;
        rs->interval        = interval;
        raw_spin_lock_init(&rs->lock);
}

/*
 * ratelimit_default_init - set up a ratelimit state with the defaults
 * @rs: state to initialize
 *
 * Same as ratelimit_state_init() with DEFAULT_RATELIMIT_INTERVAL and
 * DEFAULT_RATELIMIT_BURST.
 */
static inline void ratelimit_default_init(struct ratelimit_state *rs)
{
        /*
         * Plain call, no "return": ISO C does not allow a return statement
         * with an expression in a function returning void.
         */
        ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL,
                             DEFAULT_RATELIMIT_BURST);
}

/*
 * ratelimit_state_exit - report suppressed messages when a state is retired
 * @rs: state being released
 *
 * Does nothing unless RATELIMIT_MSG_ON_RELEASE is set in rs->flags. With
 * the flag set and a non-zero rs->missed count, one warning naming the
 * current task and the count is printed and the count is cleared.
 */
static inline void ratelimit_state_exit(struct ratelimit_state *rs)
{
        if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE) || !rs->missed)
                return;

        pr_warn("%s: %d output lines suppressed due to ratelimiting\n",
                current->comm, rs->missed);
        rs->missed = 0;
}

/*
 * ratelimit_set_flags - replace the flags of a ratelimit state
 * @rs:    state to modify
 * @flags: new value for rs->flags; previously set flags are overwritten,
 *         not OR-ed in
 */
static inline void
ratelimit_set_flags(struct ratelimit_state *rs, unsigned long flags)
{
        rs->flags = flags;
}

extern struct ratelimit_state printk_ratelimit_state;

#ifdef CONFIG_PRINTK

/*
 * WARN_ON_RATELIMIT - WARN_ON() limited by a caller supplied ratelimit state
 * @condition: expression to test; evaluated exactly once
 * @state:     pointer handed to __ratelimit()
 *
 * __ratelimit() is consulted only when @condition is true. The value of
 * the whole expression is the truth value of @condition, whether or not a
 * warning was actually emitted.
 */
#define WARN_ON_RATELIMIT(condition, state)        ({                \
        bool __rtn_cond = !!(condition);                        \
        WARN_ON(__rtn_cond && __ratelimit(state));                \
        __rtn_cond;                                                \
})

/*
 * WARN_RATELIMIT - WARN() limited by a ratelimit state private to the call
 * site
 * @condition: expression to test; evaluated exactly once
 * @format:    printf-style format passed on to WARN()
 *
 * Each use of the macro gets its own static state, initialized with
 * DEFAULT_RATELIMIT_INTERVAL and DEFAULT_RATELIMIT_BURST. The value of the
 * whole expression is the truth value of @condition.
 */
#define WARN_RATELIMIT(condition, format, ...)                        \
({                                                                \
        static DEFINE_RATELIMIT_STATE(_rs,                        \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);        \
        int rtn = !!(condition);                                \
                                                                \
        if (unlikely(rtn && __ratelimit(&_rs)))                        \
                WARN(rtn, format, ##__VA_ARGS__);                \
                                                                \
        rtn;                                                        \
})

#else

/*
 * Without CONFIG_PRINTK no rate limiting is applied: @state is ignored and
 * the condition goes straight to WARN_ON().
 */
#define WARN_ON_RATELIMIT(condition, state)                        \
        WARN_ON(condition)

/*
 * Without CONFIG_PRINTK this is a plain WARN(); the value of the expression
 * is whatever WARN() returns.
 */
#define WARN_RATELIMIT(condition, format, ...)                        \
({                                                                \
        int rtn = WARN(condition, format, ##__VA_ARGS__);        \
        rtn;                                                        \
})

#endif

#endif /* _LINUX_RATELIMIT_H */






































    1 
    1 










































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Statically sized hash table implementation
 * (C) 2012  Sasha Levin <levinsasha928@gmail.com>
 */

#ifndef _LINUX_HASHTABLE_H
#define _LINUX_HASHTABLE_H

#include <linux/list.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/rculist.h>

/*
 * DEFINE_HASHTABLE - define a hashtable with 1 << @bits buckets, every
 * bucket initialized to HLIST_HEAD_INIT.
 */
#define DEFINE_HASHTABLE(name, bits)                                                \
        struct hlist_head name[1 << (bits)] =                                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/* Same as DEFINE_HASHTABLE(), with the array marked __read_mostly. */
#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits)                                \
        struct hlist_head name[1 << (bits)] __read_mostly =                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/*
 * DECLARE_HASHTABLE - declare a hashtable with 1 << @bits buckets without
 * an initializer; the buckets can be set up later with hash_init().
 */
#define DECLARE_HASHTABLE(name, bits)                                           \
        struct hlist_head name[1 << (bits)]

#define HASH_SIZE(name) (ARRAY_SIZE(name))
#define HASH_BITS(name) ilog2(HASH_SIZE(name))

/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */
#define hash_min(val, bits)                                                        \
        (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits))

/*
 * __hash_init - initialize the first @sz buckets of @ht
 * @ht: array of bucket heads
 * @sz: number of buckets to initialize
 */
static inline void __hash_init(struct hlist_head *ht, unsigned int sz)
{
        unsigned int bkt = 0;

        while (bkt < sz) {
                INIT_HLIST_HEAD(&ht[bkt]);
                bkt++;
        }
}

/**
 * hash_init - initialize a hash table
 * @hashtable: hashtable to be initialized
 *
 * Calculates the size of the hashtable from the given parameter and
 * initializes every bucket through __hash_init().
 *
 * This has to be a macro since HASH_SIZE() relies on ARRAY_SIZE(), which
 * needs the array itself and will not work on a pointer to it.
 */
#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable))

/**
 * hash_add - add an object to a hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * The bucket is selected with hash_min() of @key and the node is inserted
 * at the head of that bucket's list.
 */
#define hash_add(hashtable, node, key)                                                \
        hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_add_rcu - add an object to a rcu enabled hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * Same bucket selection as hash_add(), but the node is inserted with
 * hlist_add_head_rcu().
 */
#define hash_add_rcu(hashtable, node, key)                                        \
        hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_hashed - check whether an object is in any hashtable
 * @node: the &struct hlist_node of the object to be checked
 *
 * Return: false when hlist_unhashed() reports the node as unhashed,
 * true otherwise.
 */
static inline bool hash_hashed(struct hlist_node *node)
{
        if (hlist_unhashed(node))
                return false;

        return true;
}

/*
 * __hash_empty - check whether the first @sz buckets of @ht are all empty
 * @ht: array of bucket heads
 * @sz: number of buckets to examine
 *
 * Returns false as soon as a non-empty bucket is found, true otherwise.
 */
static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz)
{
        unsigned int bkt = 0;

        while (bkt < sz) {
                if (!hlist_empty(&ht[bkt]))
                        return false;
                bkt++;
        }

        return true;
}

/**
 * hash_empty - check whether a hashtable is empty
 * @hashtable: hashtable to check
 *
 * Evaluates to true when every bucket of @hashtable is empty.
 *
 * This has to be a macro since HASH_SIZE() relies on ARRAY_SIZE(), which
 * needs the array itself and will not work on a pointer to it.
 */
#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable))

/**
 * hash_del - remove an object from a hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Thin wrapper around hlist_del_init(); the hashtable itself is not needed
 * to unlink the node.
 */
static inline void hash_del(struct hlist_node *node)
{
        hlist_del_init(node);
}

/**
 * hash_del_rcu - remove an object from a rcu enabled hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Thin wrapper around hlist_del_init_rcu(); the hashtable itself is not
 * needed to unlink the node.
 */
static inline void hash_del_rcu(struct hlist_node *node)
{
        hlist_del_init_rcu(node);
}

/**
 * hash_for_each - iterate over a hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Expands to two nested loops: an outer loop over the buckets and an inner
 * hlist_for_each_entry() over the entries of each bucket. The outer loop
 * only continues while @obj is NULL; presumably the inner loop leaves @obj
 * NULL when it runs to completion, so that a break in the loop body also
 * ends the bucket loop - confirm against hlist_for_each_entry().
 */
#define hash_for_each(name, bkt, obj, member)                                \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry(obj, &name[bkt], member)

/**
 * hash_for_each_rcu - iterate over a rcu enabled hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Same two-level loop as hash_for_each(), with hlist_for_each_entry_rcu()
 * as the inner per-bucket loop.
 */
#define hash_for_each_rcu(name, bkt, obj, member)                        \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry_rcu(obj, &name[bkt], member)

/**
 * hash_for_each_safe - iterate over a hashtable safe against removal of
 * hash entry
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @tmp: a &struct hlist_node used for temporary storage
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Same two-level loop as hash_for_each(), with
 * hlist_for_each_entry_safe() as the inner per-bucket loop.
 */
#define hash_for_each_safe(name, bkt, tmp, obj, member)                        \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry_safe(obj, tmp, &name[bkt], member)

/**
 * hash_for_each_possible - iterate over all possible objects hashing to the
 * same bucket
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * Only the bucket selected by hash_min() of @key is walked. Every entry in
 * that bucket is visited, so the loop body still has to compare keys.
 */
#define hash_for_each_possible(name, obj, member, key)                        \
        hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_rcu - iterate over all possible objects hashing to the
 * same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 * @cond: optional extra argument(s), forwarded unchanged to
 *        hlist_for_each_entry_rcu() after @member (presumably a lockdep
 *        condition - confirm against hlist_for_each_entry_rcu())
 */
#define hash_for_each_possible_rcu(name, obj, member, key, cond...)        \
        hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\
                member, ## cond)

/**
 * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing
 * to the same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * This is the same as hash_for_each_possible_rcu() except that it does
 * not do any RCU debugging or tracing.
 */
#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \
        hlist_for_each_entry_rcu_notrace(obj, \
                &name[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_safe - iterate over all possible objects hashing to the
 * same bucket safe against removals
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @tmp: a &struct hlist_node used for temporary storage
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * Only the bucket selected by hash_min(@key, HASH_BITS(@name)) is walked, via
 * hlist_for_each_entry_safe() with @tmp as its scratch node.  No key
 * comparison is performed by the macro itself.
 */
#define hash_for_each_possible_safe(name, obj, tmp, member, key)        \
        hlist_for_each_entry_safe(obj, tmp,\
                &name[hash_min(key, HASH_BITS(name))], member)


#endif





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * descriptor table internals; you almost certainly want file.h instead.
 */

#ifndef __LINUX_FDTABLE_H
#define __LINUX_FDTABLE_H

#include <linux/posix_types.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/nospec.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>

#include <linux/atomic.h>

/*
 * NR_OPEN_DEFAULT sizes the default fd array.  It must not drop below
 * BITS_PER_LONG, because that is the granularity returned by copy_fdset().
 * NR_OPEN_MAX is the all-ones unsigned int value.
 */
#define NR_OPEN_DEFAULT BITS_PER_LONG
#define NR_OPEN_MAX     (~0U)

/*
 * One descriptor table: an array of file pointers indexed by fd number,
 * together with the bitmaps that describe it.
 */
struct fdtable {
        /* Exclusive upper bound for fd lookups; see __fcheck_files(). */
        unsigned int max_fds;
        struct file __rcu **fd;      /* current fd array */
        /* Bitmap indexed by fd; queried with test_bit() by close_on_exec(). */
        unsigned long *close_on_exec;
        /* Bitmap indexed by fd; queried with test_bit() by fd_is_open(). */
        unsigned long *open_fds;
        /*
         * NOTE(review): not referenced in this header; presumably a
         * summary bitmap over open_fds -- confirm against the users.
         */
        unsigned long *full_fds_bits;
        /*
         * NOTE(review): presumably used to free the table after an RCU
         * grace period -- confirm against the code that frees it.
         */
        struct rcu_head rcu;
};

/*
 * Report whether bit @fd is set in the close_on_exec bitmap of @fdt.
 * No range check is done on @fd here.
 */
static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)
{
        const unsigned long *cloexec_bits = fdt->close_on_exec;

        return test_bit(fd, cloexec_bits);
}

/*
 * Report whether bit @fd is set in the open_fds bitmap of @fdt.
 * No range check is done on @fd here.
 */
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
        const unsigned long *open_bits = fdt->open_fds;

        return test_bit(fd, open_bits);
}

/*
 * Open file table structure
 *
 * Holds the RCU-managed pointer to the current struct fdtable plus an
 * embedded table and small inline arrays.
 * NOTE(review): the embedded fdtab and the *_init / fd_array members are
 * presumably the initial storage that fdt points at -- confirm against the
 * code that initialises this structure.
 */
struct files_struct {
  /*
   * read mostly part
   */
        /* NOTE(review): presumably the reference count -- confirm. */
        atomic_t count;
        bool resize_in_progress;
        wait_queue_head_t resize_wait;

        /* Current table; read through the rcu_dereference_*() helpers below. */
        struct fdtable __rcu *fdt;
        struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
        /* Lock named in the lockdep checks of files_fdtable()/fcheck_files(). */
        spinlock_t file_lock ____cacheline_aligned_in_smp;
        unsigned int next_fd;
        /* One-word bitmaps (a single unsigned long each). */
        unsigned long close_on_exec_init[1];
        unsigned long open_fds_init[1];
        unsigned long full_fds_bits_init[1];
        /* Inline array with NR_OPEN_DEFAULT file pointer slots. */
        struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct file_operations;
struct vfsmount;
struct dentry;

/*
 * RCU-dereference @fdtfd through rcu_dereference_check(), supplying
 * lockdep_is_held(&(files)->file_lock) as the extra condition, so that
 * holding @files->file_lock is accepted in place of an RCU read-side section.
 */
#define rcu_dereference_check_fdtable(files, fdtfd) \
        rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))

/*
 * Fetch the current fdtable pointer (@files->fdt) via
 * rcu_dereference_check_fdtable().  @files appears more than once in the
 * expansion, so do not pass an expression with side effects.
 */
#define files_fdtable(files) \
        rcu_dereference_check_fdtable((files), (files)->fdt)

/*
 * Unchecked lookup of @fd in @files: returns the file pointer stored in the
 * slot, or NULL when @fd is not below the table's max_fds.
 *
 * No lockdep/RCU checking is done here.  The caller must ensure that the fd
 * table isn't shared, or hold rcu or the file lock.
 */
static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd)
{
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);

        if (fd >= fdt->max_fds)
                return NULL;

        /* Clamp the index so it stays in bounds even under speculation. */
        fd = array_index_nospec(fd, fdt->max_fds);

        return rcu_dereference_raw(fdt->fd[fd]);
}

/*
 * Checked variant of __fcheck_files(): raises an RCU lockdep warning unless
 * the caller is inside an RCU read-side section or holds files->file_lock,
 * then performs the same lookup.
 */
static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)
{
        RCU_LOCKDEP_WARN(!(rcu_read_lock_held() ||
                           lockdep_is_held(&files->file_lock)),
                         "suspicious rcu_dereference_check() usage");

        return __fcheck_files(files, fd);
}

/*
 * Check whether the specified fd has an open file.
 *
 * Shorthand for fcheck_files(current->files, fd): the lookup is done in the
 * calling task's table and evaluates to whatever fcheck_files() returns, so
 * the same locking requirements apply.
 */
#define fcheck(fd)        fcheck_files(current->files, fd)

struct task_struct;

struct files_struct *get_files_struct(struct task_struct *);
void put_files_struct(struct files_struct *fs);
void reset_files_struct(struct files_struct *);
int unshare_files(struct files_struct **);
struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
                int (*)(const void *, struct file *, unsigned),
                const void *);

extern int __alloc_fd(struct files_struct *files,
                      unsigned start, unsigned end, unsigned flags);
extern void __fd_install(struct files_struct *files,
                      unsigned int fd, struct file *file);
extern int __close_fd(struct files_struct *files,
                      unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern int close_fd_get_file(unsigned int fd, struct file **res);
extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
                      struct files_struct **new_fdp);

extern struct kmem_cache *files_cachep;

#endif /* __LINUX_FDTABLE_H */


























    1 









    1 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ERR_H
#define _LINUX_ERR_H

#include <linux/compiler.h>
#include <linux/types.h>

#include <asm/errno.h>

/*
 * Kernel pointers have redundant information, so we can use a
 * scheme where we can return either an error code or a normal
 * pointer with the same return value.
 *
 * This should be a per-architecture thing, to allow different
 * error and pointer decisions.
 *
 * MAX_ERRNO bounds the encodable error codes: IS_ERR_VALUE() treats a value
 * as an error when, viewed as an unsigned long, it is at least
 * (unsigned long)-MAX_ERRNO, i.e. it lies in [-MAX_ERRNO, -1].
 */
#define MAX_ERRNO        4095

#ifndef __ASSEMBLY__

/*
 * True when @x, viewed as an unsigned long, is at least
 * (unsigned long)-MAX_ERRNO, i.e. it encodes an error code in
 * [-MAX_ERRNO, -1].  The whole comparison is wrapped in unlikely().
 * NOTE(review): the intermediate (void *) cast presumably exists to provoke
 * a compiler diagnostic when @x is not pointer-sized -- confirm.
 */
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)

/*
 * Encode @error as a pointer value by a plain cast; PTR_ERR() is the
 * reverse conversion.
 */
static inline void * __must_check ERR_PTR(long error)
{
        void *encoded = (void *) error;

        return encoded;
}

/*
 * Convert @ptr back to a long by a plain cast; ERR_PTR() is the reverse
 * conversion.  No check is made that @ptr actually holds an error value.
 */
static inline long __must_check PTR_ERR(__force const void *ptr)
{
        long decoded = (long) ptr;

        return decoded;
}

/*
 * Report whether @ptr falls in the error range tested by IS_ERR_VALUE().
 * A NULL pointer is not in that range.
 */
static inline bool __must_check IS_ERR(__force const void *ptr)
{
        unsigned long value = (unsigned long)ptr;

        return IS_ERR_VALUE(value);
}

/*
 * Report whether @ptr is either NULL or in the error range tested by
 * IS_ERR_VALUE().  The NULL test comes first and is marked unlikely().
 */
static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
        unsigned long value = (unsigned long)ptr;

        return unlikely(!ptr) || IS_ERR_VALUE(value);
}

/**
 * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
 * @ptr: The pointer to cast.
 *
 * Hands @ptr back as a plain void pointer so that an error-valued pointer of
 * one type can be returned as another pointer type, while making it obvious
 * at the call site that this is what is going on.
 *
 * Return: @ptr with its const qualifier dropped; the value is unchanged.
 */
static inline void * __must_check ERR_CAST(__force const void *ptr)
{
        /* Only the const qualifier is removed. */
        void *unqualified = (void *) ptr;

        return unqualified;
}

/*
 * Return PTR_ERR(@ptr) when IS_ERR(@ptr) holds, and 0 for every other
 * pointer (including NULL).
 */
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
        if (!IS_ERR(ptr))
                return 0;

        return PTR_ERR(ptr);
}

#endif

#endif /* _LINUX_ERR_H */







































































    1 




















    1 


    1 







































    1 































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Access vector cache interface for object managers.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
#ifndef _SELINUX_AVC_H_
#define _SELINUX_AVC_H_

#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/audit.h>
#include <linux/lsm_audit.h>
#include <linux/in6.h>
#include "flask.h"
#include "av_permissions.h"
#include "security.h"

/*
 * An entry in the AVC.
 */
struct avc_entry;

struct task_struct;
struct inode;
struct sock;
struct sk_buff;

/*
 * AVC statistics
 *
 * A plain set of unsigned counters.  When CONFIG_SECURITY_SELINUX_AVC_STATS
 * is set, this header declares a per-CPU instance named avc_cache_stats.
 * NOTE(review): the code that updates the counters is not visible here; the
 * meaning of each field is presumably what its name says -- confirm.
 */
struct avc_cache_stats {
        unsigned int lookups;
        unsigned int misses;
        unsigned int allocations;
        unsigned int reclaims;
        unsigned int frees;
};

/*
 * We only need this data after we have decided to send an audit message.
 *
 * The members carry the same names and types as the state, ssid, tsid,
 * tclass, requested, audited, denied and result arguments of
 * slow_avc_audit().
 * NOTE(review): presumably filled in from those arguments when an audit
 * record is generated -- confirm against the slow_avc_audit() definition.
 */
struct selinux_audit_data {
        u32 ssid;
        u32 tsid;
        u16 tclass;
        u32 requested;
        u32 audited;
        u32 denied;
        int result;
        struct selinux_state *state;
};

/*
 * AVC operations
 */

void __init avc_init(void);

/*
 * Work out which of the @requested permissions need an audit record.
 *
 * @requested: permissions that were asked for
 * @avd:       decision supplying the allowed/auditallow/auditdeny vectors
 * @result:    outcome of the permission check; only tested for non-zero
 * @auditdeny: optional mask (0 disables it), see the comment in the body
 * @deniedp:   out-parameter that receives the denied permission set
 *
 * Returns the permissions to audit; 0 means nothing has to be audited.
 */
static inline u32 avc_audit_required(u32 requested,
                              struct av_decision *avd,
                              int result,
                              u32 auditdeny,
                              u32 *deniedp)
{
        u32 denied = requested & ~avd->allowed;
        u32 audited;

        if (unlikely(denied)) {
                /*
                 * auditdeny is TRICKY!  When the caller passes a non-zero
                 * @auditdeny mask and none of its bits are set in
                 * avd->auditdeny (a clear bit means the policy has an
                 * explicit dontaudit rule for that permission), then NO
                 * denial is audited at all.  This is unrelated to the
                 * permissions that were actually denied.  For example:
                 *
                 * denied == READ
                 * avd.auditdeny & ACCESS == 0 (not set means explicit rule)
                 * auditdeny & ACCESS == 1
                 *
                 * The denial is NOT audited even though the denied
                 * permission was READ and the auditdeny check was for
                 * ACCESS.
                 */
                if (auditdeny && !(auditdeny & avd->auditdeny))
                        audited = 0;
                else
                        audited = denied & avd->auditdeny;
        } else if (result) {
                /*
                 * Nothing was denied according to @avd, yet the check
                 * failed: report everything requested as denied and audited.
                 */
                denied = requested;
                audited = requested;
        } else {
                audited = requested & avd->auditallow;
        }

        *deniedp = denied;
        return audited;
}

int slow_avc_audit(struct selinux_state *state,
                   u32 ssid, u32 tsid, u16 tclass,
                   u32 requested, u32 audited, u32 denied, int result,
                   struct common_audit_data *a);

/**
 * avc_audit - Audit the granting or denial of permissions.
 * @state: SELinux state, handed on to slow_avc_audit()
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions
 * @avd: access vector decisions
 * @result: result from avc_has_perm_noaudit
 * @a:  auxiliary audit data
 * @flags: VFS walk flags
 *
 * Audit the granting or denial of permissions in accordance
 * with the policy.  Typically avc_has_perm() calls this after a
 * permission check, but callers of avc_has_perm_noaudit() may also
 * call it directly to keep the permission check apart from the
 * auditing.  That split is useful, for example, when the check has to
 * run under a lock which should be dropped before the auditing code
 * is entered.
 *
 * Return: 0 when avc_audit_required() reports nothing to audit; -ECHILD when
 * an audit record is needed but @flags contains MAY_NOT_BLOCK; otherwise the
 * value returned by slow_avc_audit().
 */
static inline int avc_audit(struct selinux_state *state,
                            u32 ssid, u32 tsid,
                            u16 tclass, u32 requested,
                            struct av_decision *avd,
                            int result,
                            struct common_audit_data *a,
                            int flags)
{
        u32 denied_perms;
        u32 to_audit = avc_audit_required(requested, avd, result, 0,
                                          &denied_perms);

        if (likely(!to_audit))
                return 0;

        /* fall back to ref-walk if we have to generate audit */
        if (flags & MAY_NOT_BLOCK)
                return -ECHILD;

        return slow_avc_audit(state, ssid, tsid, tclass, requested,
                              to_audit, denied_perms, result, a);
}

/*
 * Distinct single-bit flag values that can be OR-ed together.
 * NOTE(review): presumably meant for the flags argument of
 * avc_has_perm_noaudit(), which is declared next to them -- confirm.
 */
#define AVC_STRICT 1 /* Ignore permissive mode. */
#define AVC_EXTENDED_PERMS 2        /* update extended permissions */
#define AVC_NONBLOCKING    4        /* non blocking */
int avc_has_perm_noaudit(struct selinux_state *state,
                         u32 ssid, u32 tsid,
                         u16 tclass, u32 requested,
                         unsigned flags,
                         struct av_decision *avd);

int avc_has_perm(struct selinux_state *state,
                 u32 ssid, u32 tsid,
                 u16 tclass, u32 requested,
                 struct common_audit_data *auditdata);
int avc_has_perm_flags(struct selinux_state *state,
                       u32 ssid, u32 tsid,
                       u16 tclass, u32 requested,
                       struct common_audit_data *auditdata,
                       int flags);

int avc_has_extended_perms(struct selinux_state *state,
                           u32 ssid, u32 tsid, u16 tclass, u32 requested,
                           u8 driver, u8 perm, struct common_audit_data *ad);


u32 avc_policy_seqno(struct selinux_state *state);

/*
 * Event identifiers; each value is a distinct power of two, so several can
 * be OR-ed into one mask.
 * NOTE(review): presumably the bits accepted in the events argument of
 * avc_add_callback() -- confirm against its definition.
 */
#define AVC_CALLBACK_GRANT                1
#define AVC_CALLBACK_TRY_REVOKE                2
#define AVC_CALLBACK_REVOKE                4
#define AVC_CALLBACK_RESET                8
#define AVC_CALLBACK_AUDITALLOW_ENABLE        16
#define AVC_CALLBACK_AUDITALLOW_DISABLE        32
#define AVC_CALLBACK_AUDITDENY_ENABLE        64
#define AVC_CALLBACK_AUDITDENY_DISABLE        128
#define AVC_CALLBACK_ADD_XPERMS                256

int avc_add_callback(int (*callback)(u32 event), u32 events);

/* Exported to selinuxfs */
struct selinux_avc;
int avc_get_hash_stats(struct selinux_avc *avc, char *page);
unsigned int avc_get_cache_threshold(struct selinux_avc *avc);
void avc_set_cache_threshold(struct selinux_avc *avc,
                             unsigned int cache_threshold);

/* Attempt to free avc node cache */
void avc_disable(void);

#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DECLARE_PER_CPU(struct avc_cache_stats, avc_cache_stats);
#endif

#endif /* _SELINUX_AVC_H_ */















































































































































































































































































































































































































































































































    1 









    1 




















































    1 










    1 















































































































































































































































































































































































































    1 



















































































































































































































































































































































    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





































































































































































































































































































































































































































































































    1 







































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/mm_types.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
#include <linux/fcntl.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>

struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
struct kiocb;
struct kobject;
struct pipe_inode_info;
struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
struct fscrypt_info;
struct fscrypt_operations;
struct fsverity_info;
struct fsverity_operations;
struct fs_context;
struct fs_parameter_spec;

extern void __init inode_init(void);
extern void __init inode_init_early(void);
extern void __init files_init(void);
extern void __init files_maxfiles_init(void);

extern struct files_stat_struct files_stat;
extern unsigned long get_max_files(void);
extern unsigned int sysctl_nr_open;
extern struct inodes_stat_t inodes_stat;
extern int leases_enable, lease_break_time;
extern int sysctl_protected_symlinks;
extern int sysctl_protected_hardlinks;
extern int sysctl_protected_fifos;
extern int sysctl_protected_regular;

typedef __kernel_rwf_t rwf_t;

struct buffer_head;
/*
 * Function type of a block-mapping callback: it receives the inode, a
 * block number (iblock), a buffer_head to fill in (bh_result) and a
 * create flag, and returns an int.
 * NOTE(review): presumably 0 on success or a negative errno -- confirm
 * against the implementations.
 */
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
/*
 * Function type of an I/O-done callback: it receives the kiocb, a file
 * offset, a byte count and an opaque private pointer, and returns an int.
 * NOTE(review): presumably used for direct-I/O completion -- confirm.
 */
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);

/*
 * MAY_* access-mask bits.  Every constant is a distinct single bit, so
 * they can be or-ed together.
 * NOTE(review): presumably the mask handed to the permission checks --
 * confirm against the callers.
 */
#define MAY_EXEC                0x00000001
#define MAY_WRITE                0x00000002
#define MAY_READ                0x00000004
#define MAY_APPEND                0x00000008
#define MAY_ACCESS                0x00000010
#define MAY_OPEN                0x00000020
#define MAY_CHDIR                0x00000040
/* called from RCU mode, don't block */
#define MAY_NOT_BLOCK                0x00000080

/*
 * Flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
 * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open().
 *
 * Every flag is a distinct single bit cast to fmode_t.  Bits 0x400000
 * through 0x2000000 are not assigned in this list.
 */

/* file is open for reading */
#define FMODE_READ                ((__force fmode_t)0x1)
/* file is open for writing */
#define FMODE_WRITE                ((__force fmode_t)0x2)
/* file is seekable */
#define FMODE_LSEEK                ((__force fmode_t)0x4)
/* file can be accessed using pread */
#define FMODE_PREAD                ((__force fmode_t)0x8)
/* file can be accessed using pwrite */
#define FMODE_PWRITE                ((__force fmode_t)0x10)
/* File is opened for execution with sys_execve / sys_uselib */
#define FMODE_EXEC                ((__force fmode_t)0x20)
/* File is opened with O_NDELAY (only set for block devices) */
#define FMODE_NDELAY                ((__force fmode_t)0x40)
/* File is opened with O_EXCL (only set for block devices) */
#define FMODE_EXCL                ((__force fmode_t)0x80)
/*
 * File is opened using open(.., 3, ..) and is writeable only for ioctls
 * (special hack for floppy.c)
 */
#define FMODE_WRITE_IOCTL        ((__force fmode_t)0x100)
/* 32bit hashes as llseek() offset (for directories) */
#define FMODE_32BITHASH         ((__force fmode_t)0x200)
/* 64bit hashes as llseek() offset (for directories) */
#define FMODE_64BITHASH         ((__force fmode_t)0x400)

/*
 * Don't update ctime and mtime.
 *
 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
 */
#define FMODE_NOCMTIME                ((__force fmode_t)0x800)

/* Expect random access pattern */
#define FMODE_RANDOM                ((__force fmode_t)0x1000)

/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
#define FMODE_UNSIGNED_OFFSET        ((__force fmode_t)0x2000)

/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH                ((__force fmode_t)0x4000)

/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS        ((__force fmode_t)0x8000)
/* Write access to underlying fs */
#define FMODE_WRITER                ((__force fmode_t)0x10000)
/* Has read method(s) */
#define FMODE_CAN_READ          ((__force fmode_t)0x20000)
/* Has write method(s) */
#define FMODE_CAN_WRITE         ((__force fmode_t)0x40000)

/*
 * NOTE(review): the next two flags carry no description here; presumably
 * "the open has completed" and "the open created the file" -- confirm
 * against the open path.
 */
#define FMODE_OPENED                ((__force fmode_t)0x80000)
#define FMODE_CREATED                ((__force fmode_t)0x100000)

/* File is stream-like */
#define FMODE_STREAM                ((__force fmode_t)0x200000)

/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY                ((__force fmode_t)0x4000000)

/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT                ((__force fmode_t)0x8000000)

/* File represents mount that needs unmounting */
#define FMODE_NEED_UNMOUNT        ((__force fmode_t)0x10000000)

/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT                ((__force fmode_t)0x20000000)

/* File supports async buffered reads */
#define FMODE_BUF_RASYNC        ((__force fmode_t)0x40000000)

/*
 * Attribute flags.  These should be or-ed together to figure out what
 * has been changed!
 *
 * Each flag is a single bit; bit 10 is not assigned in this list.
 */
#define ATTR_MODE        (1 << 0)
#define ATTR_UID        (1 << 1)
#define ATTR_GID        (1 << 2)
#define ATTR_SIZE        (1 << 3)
#define ATTR_ATIME        (1 << 4)
#define ATTR_MTIME        (1 << 5)
#define ATTR_CTIME        (1 << 6)
#define ATTR_ATIME_SET        (1 << 7)
#define ATTR_MTIME_SET        (1 << 8)
#define ATTR_FORCE        (1 << 9) /* Not a change, but a change it */
#define ATTR_KILL_SUID        (1 << 11)
#define ATTR_KILL_SGID        (1 << 12)
#define ATTR_FILE        (1 << 13)
#define ATTR_KILL_PRIV        (1 << 14)
#define ATTR_OPEN        (1 << 15) /* Truncating from open(O_TRUNC) */
#define ATTR_TIMES_SET        (1 << 16)
#define ATTR_TOUCH        (1 << 17)

/*
 * A whiteout is represented by a char device.  The following constants
 * define the mode and the device number to use for it; both are zero.
 */
#define WHITEOUT_MODE 0
#define WHITEOUT_DEV 0

/*
 * This is the Inode Attributes structure, used for notify_change().  It
 * uses the above definitions as flags, to know which values have changed.
 * Also, in this manner, a Filesystem can look at only the values it cares
 * about.  Basically, these are the attributes that the VFS layer can
 * request to change from the FS layer.
 *
 * Derek Atkins <warlord@MIT.EDU> 94-10-20
 */
/*
 * Set of inode attribute values requested to change, plus a mask telling
 * which of them are meaningful.
 * NOTE(review): the pairing of each ia_* field with the same-named ATTR_*
 * bit is inferred from the names -- confirm against notify_change().
 */
struct iattr {
        unsigned int        ia_valid;        /* or-ed ATTR_* flags */
        umode_t                ia_mode;
        kuid_t                ia_uid;
        kgid_t                ia_gid;
        loff_t                ia_size;
        struct timespec64 ia_atime;
        struct timespec64 ia_mtime;
        struct timespec64 ia_ctime;

        /*
         * Not an attribute, but an auxiliary info for filesystems wanting to
         * implement an ftruncate() like method.  NOTE: filesystem should
         * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
         */
        struct file        *ia_file;
};

/*
 * Includes for diskquotas.
 */
#include <linux/quota.h>

/*
 * Maximum number of layers of fs stack.  It needs to be limited to
 * prevent kernel stack overflow.
 */
#define FILESYSTEM_MAX_STACK_DEPTH 2

/**
 * enum positive_aop_returns - aop return codes with specific semantics
 *
 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
 *                             completed, that the page is still locked, and
 *                             should be considered active.  The VM uses this hint
 *                             to return the page to the active list -- it won't
 *                             be a candidate for writeback again in the near
 *                             future.  Other callers must be careful to unlock
 *                             the page if they get this return.  Returned by
 *                             writepage().
 *
 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
 *                          unlocked it and the page might have been truncated.
 *                          The caller should back up to acquiring a new page and
 *                          trying again.  The aop will be taking reasonable
 *                          precautions not to livelock.  If the caller held a page
 *                          reference, it should drop it before retrying.  Returned
 *                          by readpage().
 *
 * address_space_operation functions return these large constants to indicate
 * special semantics to the caller.  These are much larger than the bytes in a
 * page to allow for functions that return the number of bytes operated on in a
 * given page.
 */

enum positive_aop_returns {
        AOP_WRITEPAGE_ACTIVATE        = 0x80000,
        AOP_TRUNCATED_PAGE        = 0x80001,
};

/*
 * AOP_FLAG_* bits.
 * NOTE(review): presumably the values of the 'flags' argument of
 * ->write_begin() / pagecache_write_begin() -- confirm against the callers.
 */
#define AOP_FLAG_CONT_EXPAND                0x0001 /* called from cont_expand */
#define AOP_FLAG_NOFS                        0x0002 /* used by filesystem to direct
                                                * helper code (eg buffer layer)
                                                * to clear GFP_FS from alloc */

/*
 * oh the beauties of C type declarations.
 */
struct page;
struct address_space;
struct writeback_control;
struct readahead_control;

/*
 * Write life time hint values.
 * Stored in struct inode as u8.
 *
 * WRITE_LIFE_NOT_SET is 0; every other enumerator takes the value of the
 * RWH_WRITE_LIFE_* constant with the matching name.
 */
enum rw_hint {
        WRITE_LIFE_NOT_SET        = 0,
        WRITE_LIFE_NONE                = RWH_WRITE_LIFE_NONE,
        WRITE_LIFE_SHORT        = RWH_WRITE_LIFE_SHORT,
        WRITE_LIFE_MEDIUM        = RWH_WRITE_LIFE_MEDIUM,
        WRITE_LIFE_LONG                = RWH_WRITE_LIFE_LONG,
        WRITE_LIFE_EXTREME        = RWH_WRITE_LIFE_EXTREME,
};

/*
 * IOCB_* flag bits.
 * NOTE(review): presumably the values stored in kiocb->ki_flags -- confirm
 * against the users.
 *
 * The first group reuses the RWF_* values, cast to int.  Each replacement
 * list is fully parenthesized so the macro behaves as a single primary
 * expression wherever it is used (for instance as the operand of sizeof).
 */
/* Match RWF_* bits to IOCB bits */
#define IOCB_HIPRI                ((__force int) RWF_HIPRI)
#define IOCB_DSYNC                ((__force int) RWF_DSYNC)
#define IOCB_SYNC                ((__force int) RWF_SYNC)
#define IOCB_NOWAIT                ((__force int) RWF_NOWAIT)
#define IOCB_APPEND                ((__force int) RWF_APPEND)

/* non-RWF related bits - start at 16 (bits 21 and 22 are not assigned here) */
#define IOCB_EVENTFD                (1 << 16)
#define IOCB_DIRECT                (1 << 17)
#define IOCB_WRITE                (1 << 18)
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ                (1 << 19)
#define IOCB_NOIO                (1 << 20)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW                (1 << 23)

/*
 * I/O control block.  ki_filp is kept in front of the randomized region;
 * every other field sits between the randomized_struct_fields markers.
 */
struct kiocb {
        struct file                *ki_filp;

        /* The 'ki_filp' pointer is shared in a union for aio */
        randomized_struct_fields_start

        loff_t                        ki_pos;
        /* completion callback; NULL means synchronous, see is_sync_kiocb() */
        void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
        void                        *private;
        int                        ki_flags;        /* NOTE(review): presumably IOCB_* bits -- confirm */
        u16                        ki_hint;
        u16                        ki_ioprio; /* See linux/ioprio.h */
        union {
                unsigned int                ki_cookie; /* for ->iopoll */
                struct wait_page_queue        *ki_waitq; /* for async buffered IO */
        };

        randomized_struct_fields_end
};

/* A kiocb counts as synchronous when no completion callback is installed. */
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
        return !kiocb->ki_complete;
}

/*
 * "descriptor" for what we're up to with a read.
 * This allows us to use the same read code yet
 * have multiple different users of the data that
 * we read from a file.
 *
 * The simplest case just copies the data to user
 * mode.
 *
 * NOTE(review): the per-field meanings below are inferred from the field
 * names -- confirm against the read actors.
 */
typedef struct {
        size_t written;                /* presumably the amount handled so far */
        size_t count;                /* presumably the amount still wanted */
        union {
                char __user *buf;        /* user-space destination */
                void *data;                /* opaque destination for other users */
        } arg;
        int error;
} read_descriptor_t;

/*
 * Pointer to a read actor: called with the read descriptor, a page and two
 * unsigned long values, and returns an int.
 * NOTE(review): the two unsigned longs are presumably an offset within the
 * page and a size -- confirm against the actors.
 */
typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
                unsigned long, unsigned long);

/*
 * Method table of an address_space; struct address_space refers to one
 * through its a_ops pointer.
 */
struct address_space_operations {
        int (*writepage)(struct page *page, struct writeback_control *wbc);
        int (*readpage)(struct file *, struct page *);

        /* Write back some dirty pages from this mapping. */
        int (*writepages)(struct address_space *, struct writeback_control *);

        /* Set a page dirty.  Return true if this dirtied it */
        int (*set_page_dirty)(struct page *page);

        /*
         * Reads in the requested pages.  Unlike ->readpage(), this is
         * PURELY used for read-ahead!
         */
        int (*readpages)(struct file *filp, struct address_space *mapping,
                        struct list_head *pages, unsigned nr_pages);
        void (*readahead)(struct readahead_control *);

        int (*write_begin)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata);
        int (*write_end)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);

        /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
        sector_t (*bmap)(struct address_space *, sector_t);
        void (*invalidatepage) (struct page *, unsigned int, unsigned int);
        int (*releasepage) (struct page *, gfp_t);
        void (*freepage)(struct page *);
        ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
        /*
         * migrate the contents of a page to the specified target. If
         * migrate_mode is MIGRATE_ASYNC, it must not block.
         */
        int (*migratepage) (struct address_space *,
                        struct page *, struct page *, enum migrate_mode);
        bool (*isolate_page)(struct page *, isolate_mode_t);
        void (*putback_page)(struct page *);
        int (*launder_page) (struct page *);
        int (*is_partially_uptodate) (struct page *, unsigned long,
                                        unsigned long);
        void (*is_dirty_writeback) (struct page *, bool *, bool *);
        int (*error_remove_page)(struct address_space *, struct page *);

        /* swapfile support */
        int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
                                sector_t *span);
        void (*swap_deactivate)(struct file *file);
};

extern const struct address_space_operations empty_aops;

/*
 * pagecache_write_begin/pagecache_write_end must be used by general code
 * to write into the pagecache.
 */
int pagecache_write_begin(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata);

int pagecache_write_end(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);

/**
 * struct address_space - Contents of a cacheable, mappable object.
 * @host: Owner, either the inode or the block_device.
 * @i_pages: Cached pages.
 * @gfp_mask: Memory allocation flags to use for allocating pages.
 * @i_mmap_writable: Number of VM_SHARED mappings.
 * @nr_thps: Number of THPs in the pagecache (non-shmem only).  Only present
 *           when CONFIG_READ_ONLY_THP_FOR_FS is defined.
 * @i_mmap: Tree of private and shared mappings.
 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 * @nrpages: Number of page entries, protected by the i_pages lock.
 * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
 * @writeback_index: Writeback starts here.
 * @a_ops: Methods.
 * @flags: Error bits and flags (AS_*).
 * @wb_err: The most recent error which has occurred.
 * @private_lock: For use by the owner of the address_space.
 * @private_list: For use by the owner of the address_space.
 * @private_data: For use by the owner of the address_space.
 */
struct address_space {
        struct inode                *host;
        struct xarray                i_pages;
        gfp_t                        gfp_mask;
        atomic_t                i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        /* number of thp, only for non-shmem files */
        atomic_t                nr_thps;
#endif
        struct rb_root_cached        i_mmap;
        struct rw_semaphore        i_mmap_rwsem;
        unsigned long                nrpages;
        unsigned long                nrexceptional;
        pgoff_t                        writeback_index;
        const struct address_space_operations *a_ops;
        unsigned long                flags;
        errseq_t                wb_err;
        spinlock_t                private_lock;
        struct list_head        private_list;
        void                        *private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
/*
 * About the aligned(sizeof(long)) attribute on struct address_space:
 * on most architectures that alignment is already the case; but it
 * must be enforced here for CRIS, to let the least significant bit
 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
 */

/*
 * XArray tags, for tagging dirty and writeback pages in the pagecache.
 * Each tag is an alias for one of the XArray marks XA_MARK_0..XA_MARK_2.
 */
#define PAGECACHE_TAG_DIRTY        XA_MARK_0
#define PAGECACHE_TAG_WRITEBACK        XA_MARK_1
#define PAGECACHE_TAG_TOWRITE        XA_MARK_2

/*
 * Tell whether any page of @mapping is marked with @tag; the answer comes
 * from xa_marked() on the mapping's i_pages.
 */
static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
{
        struct xarray *pages = &mapping->i_pages;

        return xa_marked(pages, tag);
}

/* Take @mapping's i_mmap_rwsem with down_write(). */
static inline void i_mmap_lock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_write(sem);
}

/*
 * Try to take @mapping's i_mmap_rwsem with down_write_trylock() and hand
 * back that function's result unchanged.
 */
static inline int i_mmap_trylock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        return down_write_trylock(sem);
}

/* Drop @mapping's i_mmap_rwsem with up_write(). */
static inline void i_mmap_unlock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_write(sem);
}

/* Take @mapping's i_mmap_rwsem with down_read(). */
static inline void i_mmap_lock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_read(sem);
}

/* Drop @mapping's i_mmap_rwsem with up_read(). */
static inline void i_mmap_unlock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_read(sem);
}

/*
 * Debug check on @mapping's i_mmap_rwsem, delegated entirely to
 * lockdep_assert_held(); nothing is returned.
 */
static inline void i_mmap_assert_locked(struct address_space *mapping)
{
        lockdep_assert_held(&mapping->i_mmap_rwsem);
}

/*
 * Debug check on @mapping's i_mmap_rwsem, delegated entirely to
 * lockdep_assert_held_write(); nothing is returned.
 */
static inline void i_mmap_assert_write_locked(struct address_space *mapping)
{
        lockdep_assert_held_write(&mapping->i_mmap_rwsem);
}

/*
 * Might pages of this file be mapped into userspace?
 * Yields 1 when the i_mmap tree has at least one node, 0 when it is empty.
 */
static inline int mapping_mapped(struct address_space *mapping)
{
        if (RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
                return 0;
        return 1;
}

/*
 * Might pages of this file have been modified in userspace?
 *
 * i_mmap_writable counts every VM_SHARED vma: do_mmap marks a vma as
 * VM_SHARED if it is shared and the file was opened for writing, i.e. the
 * vma may be mprotected writable even if it is read-only right now.
 *
 * A negative i_mmap_writable means no new writable mappings are allowed.
 * Writable mappings can only be denied while none exists.
 */
static inline int mapping_writably_mapped(struct address_space *mapping)
{
        int writable = atomic_read(&mapping->i_mmap_writable);

        return writable > 0;
}

/*
 * Bump i_mmap_writable unless it is negative.
 * Returns 0 when the counter was incremented, -EPERM otherwise.
 */
static inline int mapping_map_writable(struct address_space *mapping)
{
        if (atomic_inc_unless_negative(&mapping->i_mmap_writable))
                return 0;
        return -EPERM;
}

static inline void mapping_unmap_writable(struct address_space *mapping)
{
        atomic_dec(&mapping->i_mmap_writable);
}

/*
 * Lower i_mmap_writable unless it is positive.
 * Returns 0 when the counter was decremented, -EBUSY otherwise.
 */
static inline int mapping_deny_writable(struct address_space *mapping)
{
        if (atomic_dec_unless_positive(&mapping->i_mmap_writable))
                return 0;
        return -EBUSY;
}

static inline void mapping_allow_writable(struct address_space *mapping)
{
        atomic_inc(&mapping->i_mmap_writable);
}

/*
 * Use sequence counter to get consistent i_size on 32-bit processors.
 *
 * When BITS_PER_LONG is 32 and CONFIG_SMP is defined, __NEED_I_SIZE_ORDERED
 * is defined and i_size_ordered_init() runs seqcount_init() on the inode's
 * i_size_seqcount; in every other configuration it is an empty statement.
 *
 * NOTE(review): the macro uses its argument unparenthesised
 * (&inode->i_size_seqcount), so callers must pass a plain pointer
 * expression.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
#define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif

struct posix_acl;
/*
 * Sentinel pointer values for ACL slots.  Both are odd (-1 and -3), so
 * is_uncached_acl() reports true for either of them.
 */
#define ACL_NOT_CACHED ((void *)(-1))
#define ACL_DONT_CACHE ((void *)(-3))

/*
 * Build a sentinel posix_acl pointer from @task: the task address advanced
 * by exactly one byte.
 */
static inline struct posix_acl *
uncached_acl_sentinel(struct task_struct *task)
{
        char *base = (char *)task;

        return (void *)(base + 1);
}

/* True when the least significant bit of the @acl pointer value is set. */
static inline bool
is_uncached_acl(struct posix_acl *acl)
{
        long bits = (long)acl;

        return bits & 1;
}

/*
 * IOP_* flag bits; each is a distinct single bit.
 * NOTE(review): presumably kept in inode->i_opflags -- confirm against
 * the users.
 */
#define IOP_FASTPERM        0x0001
#define IOP_LOOKUP        0x0002
#define IOP_NOFOLLOW        0x0004
#define IOP_XATTR        0x0008
#define IOP_DEFAULT_READLINK        0x0010

struct fsnotify_mark_connector;

/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'.
 *
 * Several members exist only under a config option (see the #ifdef
 * blocks below), and the whole structure is marked __randomize_layout.
 */
struct inode {
        umode_t                        i_mode;
        unsigned short                i_opflags;
        kuid_t                        i_uid;
        kgid_t                        i_gid;
        unsigned int                i_flags;

#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl        *i_acl;
        struct posix_acl        *i_default_acl;
#endif

        const struct inode_operations        *i_op;
        struct super_block        *i_sb;
        struct address_space        *i_mapping;

#ifdef CONFIG_SECURITY
        void                        *i_security;
#endif

        /* Stat data, not accessed from path walking */
        unsigned long                i_ino;
        /*
         * Filesystems may only read i_nlink directly.  They shall use the
         * following functions for modification:
         *
         *    (set|clear|inc|drop)_nlink
         *    inode_(inc|dec)_link_count
         *
         * i_nlink is the const view and __i_nlink the writable view of the
         * same storage.
         */
        union {
                const unsigned int i_nlink;
                unsigned int __i_nlink;
        };
        dev_t                        i_rdev;
        loff_t                        i_size;
        struct timespec64        i_atime;
        struct timespec64        i_mtime;
        struct timespec64        i_ctime;
        spinlock_t                i_lock;        /* i_blocks, i_bytes, maybe i_size */
        unsigned short          i_bytes;
        u8                        i_blkbits;
        u8                        i_write_hint;
        blkcnt_t                i_blocks;

        /* only present on 32-bit SMP builds, see __NEED_I_SIZE_ORDERED */
#ifdef __NEED_I_SIZE_ORDERED
        seqcount_t                i_size_seqcount;
#endif

        /* Misc */
        unsigned long                i_state;
        struct rw_semaphore        i_rwsem;

        unsigned long                dirtied_when;        /* jiffies of first dirtying */
        unsigned long                dirtied_time_when;

        struct hlist_node        i_hash;
        struct list_head        i_io_list;        /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback        *i_wb;                /* the associated cgroup wb */

        /* foreign inode detection, see wbc_detach_inode() */
        int                        i_wb_frn_winner;
        u16                        i_wb_frn_avg_time;
        u16                        i_wb_frn_history;
#endif
        struct list_head        i_lru;                /* inode LRU list */
        struct list_head        i_sb_list;
        struct list_head        i_wb_list;        /* backing dev writeback list */
        /* i_dentry and i_rcu share storage */
        union {
                struct hlist_head        i_dentry;
                struct rcu_head                i_rcu;
        };
        atomic64_t                i_version;
        atomic64_t                i_sequence; /* see futex */
        atomic_t                i_count;
        atomic_t                i_dio_count;
        atomic_t                i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
        atomic_t                i_readcount; /* struct files open RO */
#endif
        /* i_fop and free_inode share storage */
        union {
                const struct file_operations        *i_fop;        /* former ->i_op->default_file_ops */
                void (*free_inode)(struct inode *);
        };
        struct file_lock_context        *i_flctx;
        struct address_space        i_data;
        struct list_head        i_devices;
        /*
         * The members below share storage.
         * NOTE(review): which one is meaningful presumably depends on the
         * kind of inode -- confirm against the users.
         */
        union {
                struct pipe_inode_info        *i_pipe;
                struct block_device        *i_bdev;
                struct cdev                *i_cdev;
                char                        *i_link;
                unsigned                i_dir_seq;
        };

        __u32                        i_generation;

#ifdef CONFIG_FSNOTIFY
        __u32                        i_fsnotify_mask; /* all events this inode cares about */
        struct fsnotify_mark_connector __rcu        *i_fsnotify_marks;
#endif

#ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_info        *i_crypt_info;
#endif

#ifdef CONFIG_FS_VERITY
        struct fsverity_info        *i_verity_info;
#endif

        void                        *i_private; /* fs or device private pointer */
} __randomize_layout;

struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);

/*
 * Return 2 to the power of @node->i_blkbits.
 *
 * The shift is done on an unsigned constant so the computation matches the
 * unsigned return type and never shifts a bit into the sign position of a
 * signed int.
 */
static inline unsigned int i_blocksize(const struct inode *node)
{
        return 1U << node->i_blkbits;
}

/* Report hlist_unhashed() for the inode's i_hash node. */
static inline int inode_unhashed(struct inode *inode)
{
        struct hlist_node *node = &inode->i_hash;

        return hlist_unhashed(node);
}

/*
 * __mark_inode_dirty expects inodes to be hashed.  Special inodes are not
 * wanted in the fileset inode space, so they are made to look hashed
 * without being put on any list.  hlist_del() will work fine on them and
 * needs no locking.
 */
static inline void inode_fake_hash(struct inode *inode)
{
        struct hlist_node *node = &inode->i_hash;

        hlist_add_fake(node);
}

/*
 * inode->i_mutex nesting subclasses for the lock validator (the lock is
 * the i_rwsem member of struct inode; the constants keep the I_MUTEX_
 * prefix):
 *
 * 0: the object of the current VFS operation
 * 1: parent
 * 2: child/target
 * 3: xattr
 * 4: second non-directory
 * 5: second parent (when locking independent directories in rename)
 *
 * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two
 * non-directories at once.
 *
 * The locking order between these classes is
 * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory
 */
enum inode_i_mutex_lock_class
{
        I_MUTEX_NORMAL,                /* 0 */
        I_MUTEX_PARENT,                /* 1 */
        I_MUTEX_CHILD,                /* 2 */
        I_MUTEX_XATTR,                /* 3 */
        I_MUTEX_NONDIR2,        /* 4 */
        I_MUTEX_PARENT2,        /* 5 */
};

/* Take the inode's i_rwsem with down_write(). */
static inline void inode_lock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write(sem);
}

/* Drop the inode's i_rwsem with up_write(). */
static inline void inode_unlock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_write(sem);
}

/* Take the inode's i_rwsem with down_read(). */
static inline void inode_lock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read(sem);
}

/* Drop the inode's i_rwsem with up_read(). */
static inline void inode_unlock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_read(sem);
}

/*
 * Try to take the inode's i_rwsem with down_write_trylock() and pass that
 * function's result through unchanged.
 */
static inline int inode_trylock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_write_trylock(sem);
}

/*
 * Try to take the inode's i_rwsem with down_read_trylock() and pass that
 * function's result through unchanged.
 */
static inline int inode_trylock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_read_trylock(sem);
}

/* Report rwsem_is_locked() for the inode's i_rwsem. */
static inline int inode_is_locked(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return rwsem_is_locked(sem);
}

/*
 * Take the inode's i_rwsem with down_write_nested(), forwarding @subclass
 * as the nesting subclass.
 */
static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write_nested(sem, subclass);
}

/*
 * Take the inode's i_rwsem with down_read_nested(), forwarding @subclass
 * as the nesting subclass.
 */
static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read_nested(sem, subclass);
}

void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);

/*
 * NOTE: in a 32bit arch with a preemptable kernel and
 * an UP compile the i_size_read/write must be atomic
 * with respect to the local cpu (unlike with preempt disabled),
 * but they don't need to be atomic with respect to other cpus like in
 * true SMP (so they need to either locally disable irq around
 * the read or for example on x86 they can be still implemented as a
 * cmpxchg8b without the need of the lock prefix). For SMP compiles
 * and 64bit archs it makes no difference if preempt is enabled or not.
 *
 * Returns inode->i_size, read in one of three ways chosen at compile time.
 */
static inline loff_t i_size_read(const struct inode *inode)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        /* 32-bit SMP: re-read until i_size_seqcount did not change meanwhile */
        loff_t i_size;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&inode->i_size_seqcount);
                i_size = inode->i_size;
        } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
        return i_size;
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        /* 32-bit preemptible non-SMP: read with preemption disabled */
        loff_t i_size;

        preempt_disable();
        i_size = inode->i_size;
        preempt_enable();
        return i_size;
#else
        /* every other configuration: plain read */
        return inode->i_size;
#endif
}

/*
 * i_size_write - store a new value into inode->i_size.
 *
 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
 *
 * NOTE(review): "i_mutex" presumably refers to the lock taken by
 * inode_lock(), which in this file is i_rwsem -- confirm.
 */
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        /* Seqcount write section, entered with preemption disabled. */
        preempt_disable();
        write_seqcount_begin(&inode->i_size_seqcount);
        inode->i_size = i_size;
        write_seqcount_end(&inode->i_size_seqcount);
        preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        /* Mirrors the preempt-disabled read in i_size_read(). */
        preempt_disable();
        inode->i_size = i_size;
        preempt_enable();
#else
        /* Remaining configurations store the field directly. */
        inode->i_size = i_size;
#endif
}

static inline unsigned iminor(const struct inode *inode)
{
        return MINOR(inode->i_rdev);
}

static inline unsigned imajor(const struct inode *inode)
{
        return MAJOR(inode->i_rdev);
}

/*
 * Owner information used for signal delivery on a file (SIGIO etc.).
 * Embedded in struct file as f_owner.
 */
struct fown_struct {
        rwlock_t lock;          /* protects pid, uid, euid fields */
        struct pid *pid;        /* pid or -pgrp where SIGIO should be sent */
        enum pid_type pid_type;        /* Kind of process group SIGIO should be sent to */
        kuid_t uid, euid;        /* uid/euid of process setting the owner */
        int signum;                /* posix.1b rt signal to be delivered on IO */
};

/*
 * Track a single file's readahead state.
 *
 * Embedded in struct file as f_ra. The window tested by ra_has_index()
 * is [start, start + size).
 */
struct file_ra_state {
        pgoff_t start;                        /* where readahead started */
        unsigned int size;                /* # of readahead pages */
        unsigned int async_size;        /* do asynchronous readahead when
                                           there are only # of pages ahead */

        unsigned int ra_pages;                /* Maximum readahead window */
        unsigned int mmap_miss;                /* Cache miss stat for mmap accesses */
        loff_t prev_pos;                /* Cache last read() position */
};

/*
 * ra_has_index - check if @index falls in the readahead window.
 * @ra: readahead state describing the window
 * @index: page index to test
 *
 * Returns 1 when ra->start <= @index < ra->start + ra->size, else 0.
 */
static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
{
        /* Below the start of the window. */
        if (index < ra->start)
                return 0;

        return index < ra->start + ra->size;
}

/*
 * struct file - in-kernel representation of an open file.
 *
 * The reference count lives in f_count (see get_file() and file_count()).
 * The field order is subject to __randomize_layout, so code must not rely
 * on it.
 */
struct file {
        union {
                struct llist_node        fu_llist;
                struct rcu_head         fu_rcuhead;
        } f_u;
        struct path                f_path;
        struct inode                *f_inode;        /* cached value */
        const struct file_operations        *f_op;

        /*
         * Protects f_ep_links, f_flags.
         * Must not be taken from IRQ context.
         */
        spinlock_t                f_lock;
        enum rw_hint                f_write_hint;
        atomic_long_t                f_count;        /* reference count */
        unsigned int                 f_flags;
        fmode_t                        f_mode;
        struct mutex                f_pos_lock;
        loff_t                        f_pos;
        struct fown_struct        f_owner;
        const struct cred        *f_cred;
        struct file_ra_state        f_ra;        /* readahead state */

        u64                        f_version;
#ifdef CONFIG_SECURITY
        void                        *f_security;
#endif
        /* needed for tty driver, and maybe others */
        void                        *private_data;

#ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
        struct list_head        f_ep_links;
        struct list_head        f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
        struct address_space        *f_mapping;
        errseq_t                f_wb_err;
        errseq_t                f_sb_err; /* for syncfs */
} __randomize_layout
  __attribute__((aligned(4)));        /* lest something weird decides that 2 is OK */

/*
 * Variable-length file handle: a fixed header followed by the opaque
 * identifier bytes in the flexible array f_handle[].
 * NOTE(review): handle_bytes presumably gives the length of f_handle[] --
 * confirm against the users of this struct.
 */
struct file_handle {
        __u32 handle_bytes;
        int handle_type;
        /* file identifier */
        unsigned char f_handle[];
};

static inline struct file *get_file(struct file *f)
{
        atomic_long_inc(&f->f_count);
        return f;
}
/*
 * get_file_rcu_many(x, cnt): add @cnt to x->f_count unless it is currently
 *                            0 (atomic_long_add_unless()).
 * get_file_rcu(x):           get_file_rcu_many() with a count of 1.
 * file_count(x):             current value of x->f_count.
 */
#define get_file_rcu_many(x, cnt)        \
        atomic_long_add_unless(&(x)->f_count, (cnt), 0)
#define get_file_rcu(x) get_file_rcu_many((x), 1)
#define file_count(x)        atomic_long_read(&(x)->f_count)

/* 2^31 - 1 */
#define        MAX_NON_LFS        ((1UL<<31) - 1)

/*
 * Page cache limit. The filesystems should put that into their s_maxbytes
 * limits, otherwise bad things can happen in VM.
 *
 * 32-bit: ULONG_MAX shifted left by PAGE_SHIFT; 64-bit: LLONG_MAX.
 */
#if BITS_PER_LONG==32
#define MAX_LFS_FILESIZE        ((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
#define MAX_LFS_FILESIZE         ((loff_t)LLONG_MAX)
#endif

/*
 * Lock flag bits; each value is a distinct power of two so they can be
 * OR-ed together (see FL_CLOSE_POSIX below).
 * NOTE(review): presumably stored in file_lock.fl_flags -- confirm.
 */
#define FL_POSIX        1
#define FL_FLOCK        2
#define FL_DELEG        4        /* NFSv4 delegation */
#define FL_ACCESS        8        /* not trying to lock, just looking */
#define FL_EXISTS        16        /* when unlocking, test for existence */
#define FL_LEASE        32        /* lease held on this file */
#define FL_CLOSE        64        /* unlock on close */
#define FL_SLEEP        128        /* A blocking lock */
#define FL_DOWNGRADE_PENDING        256 /* Lease is being downgraded */
#define FL_UNLOCK_PENDING        512 /* Lease is being broken */
#define FL_OFDLCK        1024        /* lock is "owned" by struct file */
#define FL_LAYOUT        2048        /* outstanding pNFS layout */

/* POSIX lock that is being dropped on close */
#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)

/*
 * Special return value from posix_lock_file() and vfs_lock_file() for
 * asynchronous locking.
 */
#define FILE_LOCK_DEFERRED 1

/* legacy typedef, should eventually be removed */
/* Opaque lock-owner cookie (a plain void pointer). */
typedef void *fl_owner_t;

/* Forward declaration; the full definition follows further down. */
struct file_lock;

/*
 * Per-lock callbacks, referenced from file_lock.fl_ops ("Callbacks for
 * filesystems").
 * NOTE(review): argument order of fl_copy_lock (destination vs. source) is
 * not visible here -- confirm against the callers.
 */
struct file_lock_operations {
        void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
        void (*fl_release_private)(struct file_lock *);
};

/*
 * Lock-manager callbacks, referenced from file_lock.fl_lmops ("Callbacks
 * for lockmanagers"). Only the signatures are defined here; the semantics
 * of each hook are set by the code that invokes them.
 */
struct lock_manager_operations {
        fl_owner_t (*lm_get_owner)(fl_owner_t);
        void (*lm_put_owner)(fl_owner_t);
        void (*lm_notify)(struct file_lock *);        /* unblock callback */
        int (*lm_grant)(struct file_lock *, int);
        bool (*lm_break)(struct file_lock *);
        int (*lm_change)(struct file_lock *, int, struct list_head *);
        void (*lm_setup)(struct file_lock *, void **);
        bool (*lm_breaker_owns_lease)(struct file_lock *);
};

/*
 * A lock manager as passed to locks_start_grace() / locks_end_grace().
 */
struct lock_manager {
        struct list_head list;        /* list linkage */
        /*
         * NFSv4 and up also want opens blocked during the grace period;
         * NLM doesn't care:
         */
        bool block_opens;
};

/*
 * Grace-period interface for lock managers. Only the prototypes are
 * visible here; the implementations live elsewhere.
 */
struct net;
void locks_start_grace(struct net *, struct lock_manager *);
void locks_end_grace(struct lock_manager *);
bool locks_in_grace(struct net *);
bool opens_in_grace(struct net *);

/* that will die - we need it for nfs_lock_info */
#include <linux/nfs_fs_i.h>

/*
 * struct file_lock represents a generic "file lock". It's used to represent
 * POSIX byte range locks, BSD (flock) locks, and leases. It's important to
 * note that the same struct is used to represent both a request for a lock and
 * the lock itself, but the same object is never used for both.
 *
 * FIXME: should we create a separate "struct lock_request" to help distinguish
 * these two uses?
 *
 * The various i_flctx lists are ordered by:
 *
 * 1) lock owner
 * 2) lock range start
 * 3) lock range end
 *
 * Obviously, the last two criteria only matter for POSIX locks.
 */
struct file_lock {
        struct file_lock *fl_blocker;        /* The lock, that is blocking us */
        struct list_head fl_list;        /* link into file_lock_context */
        struct hlist_node fl_link;        /* node in global lists */
        struct list_head fl_blocked_requests;        /* list of requests with
                                                 * ->fl_blocker pointing here
                                                 */
        struct list_head fl_blocked_member;        /* node in
                                                 * ->fl_blocker->fl_blocked_requests
                                                 */
        fl_owner_t fl_owner;
        unsigned int fl_flags;
        unsigned char fl_type;
        unsigned int fl_pid;
        int fl_link_cpu;                /* what cpu's list is this on? */
        wait_queue_head_t fl_wait;
        struct file *fl_file;
        loff_t fl_start;
        loff_t fl_end;

        struct fasync_struct *        fl_fasync; /* for lease break notifications */
        /* for lease breaks: */
        unsigned long fl_break_time;
        unsigned long fl_downgrade_time;

        const struct file_lock_operations *fl_ops;        /* Callbacks for filesystems */
        const struct lock_manager_operations *fl_lmops;        /* Callbacks for lockmanagers */
        /* Filesystem-private per-lock state; one member is used per lock. */
        union {
                struct nfs_lock_info        nfs_fl;
                struct nfs4_lock_info        nfs4_fl;
                struct {
                        struct list_head link;        /* link in AFS vnode's pending_locks list */
                        int state;                /* state of grant or error if -ve */
                        unsigned int        debug_id;
                } afs;
        } fl_u;
} __randomize_layout;

/*
 * Lock context holding three lists, one each for flock, POSIX and lease
 * locks, together with a spinlock.
 * NOTE(review): flc_lock presumably protects all three lists -- confirm.
 */
struct file_lock_context {
        spinlock_t                flc_lock;
        struct list_head        flc_flock;
        struct list_head        flc_posix;
        struct list_head        flc_lease;
};

/* The following constant reflects the upper bound of the file/locking space */
#ifndef OFFSET_MAX
/*
 * INT_LIMIT(x) - value of type @x with every bit set except the top one,
 * i.e. the largest value of a signed integer type @x.
 *
 * The shift is done on an unsigned 64-bit value: shifting a signed 1 into
 * the sign bit ("(x)1 << (width - 1)") is undefined behaviour in C.
 * @x must be an integer type at least as wide as int and no wider than
 * 64 bits.
 */
#define INT_LIMIT(x)        ((x)((1ULL << (sizeof(x) * 8 - 1)) - 1))
#define OFFSET_MAX        INT_LIMIT(loff_t)
#define OFFT_OFFSET_MAX        INT_LIMIT(off_t)
#endif

/* Signal-delivery helper for a file owner; defined outside this header. */
extern void send_sigio(struct fown_struct *fown, int fd, int band);

/* locks_inode(f) is an alias for file_inode(f). */
#define locks_inode(f) file_inode(f)

#ifdef CONFIG_FILE_LOCKING
/*
 * fcntl() lock and lease entry points, declared when CONFIG_FILE_LOCKING
 * is enabled; inline stubs with the same names are provided otherwise.
 * The 64-bit flock variants only exist on 32-bit builds.
 */
extern int fcntl_getlk(struct file *, unsigned int, struct flock *);
extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
                        struct flock *);

#if BITS_PER_LONG == 32
extern int fcntl_getlk64(struct file *, unsigned int, struct flock64 *);
extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
                        struct flock64 *);
#endif

extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp);

/* fs/locks.c */
/*
 * Core file-locking API. Most of these have inline stubs in the
 * !CONFIG_FILE_LOCKING branch below; locks_free_lock(), locks_alloc_lock(),
 * locks_release_private() and the lease notifier pair are declared only
 * here.
 */
void locks_free_lock_context(struct inode *inode);
void locks_free_lock(struct file_lock *fl);
extern void locks_init_lock(struct file_lock *);
extern struct file_lock * locks_alloc_lock(void);
extern void locks_copy_lock(struct file_lock *, struct file_lock *);
extern void locks_copy_conflock(struct file_lock *, struct file_lock *);
extern void locks_remove_posix(struct file *, fl_owner_t);
extern void locks_remove_file(struct file *);
extern void locks_release_private(struct file_lock *);
extern void posix_test_lock(struct file *, struct file_lock *);
extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
extern int locks_delete_block(struct file_lock *);
extern int vfs_test_lock(struct file *, struct file_lock *);
extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
bool vfs_inode_has_locks(struct inode *inode);
extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
extern void lease_get_mtime(struct inode *, struct timespec64 *time);
extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
extern int vfs_setlease(struct file *, long, struct file_lock **, void **);
extern int lease_modify(struct file_lock *, int, struct list_head *);

/* Lease notifier registration. */
struct notifier_block;
extern int lease_register_notifier(struct notifier_block *);
extern void lease_unregister_notifier(struct notifier_block *);

/* Lock reporting into a seq_file. */
struct files_struct;
extern void show_fd_locks(struct seq_file *f,
                         struct file *filp, struct files_struct *files);
#else /* !CONFIG_FILE_LOCKING */
/*
 * Stub for builds without CONFIG_FILE_LOCKING: always fails with -EINVAL.
 *
 * The flock argument is a plain kernel pointer, matching the prototype used
 * when CONFIG_FILE_LOCKING is enabled, so callers type-check identically
 * in both configurations.
 */
static inline int fcntl_getlk(struct file *file, unsigned int cmd,
                              struct flock *flock)
{
        return -EINVAL;
}

/*
 * Stub for builds without CONFIG_FILE_LOCKING: always fails with -EACCES.
 *
 * The flock argument is a plain kernel pointer, matching the prototype used
 * when CONFIG_FILE_LOCKING is enabled.
 */
static inline int fcntl_setlk(unsigned int fd, struct file *file,
                              unsigned int cmd, struct flock *flock)
{
        return -EACCES;
}

#if BITS_PER_LONG == 32
/*
 * Stub for builds without CONFIG_FILE_LOCKING: always fails with -EINVAL.
 *
 * The flock64 argument is a plain kernel pointer, matching the prototype
 * used when CONFIG_FILE_LOCKING is enabled.
 */
static inline int fcntl_getlk64(struct file *file, unsigned int cmd,
                                struct flock64 *flock)
{
        return -EINVAL;
}

/*
 * Stub for builds without CONFIG_FILE_LOCKING: always fails with -EACCES.
 *
 * The flock64 argument is a plain kernel pointer, matching the prototype
 * used when CONFIG_FILE_LOCKING is enabled.
 */
static inline int fcntl_setlk64(unsigned int fd, struct file *file,
                                unsigned int cmd, struct flock64 *flock)
{
        return -EACCES;
}
#endif
/* !CONFIG_FILE_LOCKING stub: setting a lease always fails with -EINVAL. */
static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always reports F_UNLCK. */
static inline int fcntl_getlease(struct file *filp)
{
        return F_UNLCK;
}

/* !CONFIG_FILE_LOCKING stub: does nothing. */
static inline void
locks_free_lock_context(struct inode *inode)
{
}

/* !CONFIG_FILE_LOCKING stub: intentionally a no-op. */
static inline void locks_init_lock(struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: intentionally a no-op. */
static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: intentionally a no-op. */
static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: intentionally a no-op. */
static inline void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
}

/* !CONFIG_FILE_LOCKING stub: intentionally a no-op. */
static inline void locks_remove_file(struct file *filp)
{
}

/* !CONFIG_FILE_LOCKING stub: intentionally a no-op; @fl is left untouched. */
static inline void posix_test_lock(struct file *filp, struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: always fails with -ENOLCK. */
static inline int posix_lock_file(struct file *filp, struct file_lock *fl,
                                  struct file_lock *conflock)
{
        return -ENOLCK;
}

/* !CONFIG_FILE_LOCKING stub: always returns -ENOENT. */
static inline int locks_delete_block(struct file_lock *waiter)
{
        return -ENOENT;
}

/* !CONFIG_FILE_LOCKING stub: always returns 0 and leaves @fl untouched. */
static inline int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -ENOLCK. */
static inline int vfs_lock_file(struct file *filp, unsigned int cmd,
                                struct file_lock *fl, struct file_lock *conf)
{
        return -ENOLCK;
}

/* !CONFIG_FILE_LOCKING stub: always returns 0. */
static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: always returns false. */
static inline bool vfs_inode_has_locks(struct inode *inode)
{
        return false;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -ENOLCK. */
static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        return -ENOLCK;
}

/* !CONFIG_FILE_LOCKING stub: always returns 0. */
static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: intentionally a no-op; @time is not modified. */
static inline void lease_get_mtime(struct inode *inode, struct timespec64 *time)
{
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int generic_setlease(struct file *filp, long arg,
                                    struct file_lock **flp, void **priv)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int vfs_setlease(struct file *filp, long arg,
                               struct file_lock **lease, void **priv)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int lease_modify(struct file_lock *fl, int arg,
                               struct list_head *dispose)
{
        return -EINVAL;
}

struct files_struct;

/* !CONFIG_FILE_LOCKING stub: intentionally a no-op; nothing is written to @f. */
static inline void show_fd_locks(struct seq_file *f, struct file *filp,
                                 struct files_struct *files)
{
}
#endif /* !CONFIG_FILE_LOCKING */

/**
 * file_inode - inode cached in a struct file
 * @f: file to query
 *
 * Returns @f->f_inode.
 */
static inline struct inode *file_inode(const struct file *f)
{
        struct inode *inode = f->f_inode;

        return inode;
}

/**
 * file_dentry - dentry for a file as resolved by d_real()
 * @file: file to query
 *
 * Returns d_real() of the file's path dentry and its cached inode.
 */
static inline struct dentry *file_dentry(const struct file *file)
{
        struct dentry *dentry = file->f_path.dentry;

        return d_real(dentry, file->f_inode);
}

/**
 * locks_lock_file_wait - locks_lock_inode_wait() on a file's inode
 * @filp: file whose inode (via locks_inode()) is operated on
 * @fl: lock passed through unchanged
 *
 * Returns whatever locks_lock_inode_wait() returns.
 */
static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
{
        struct inode *inode = locks_inode(filp);

        return locks_lock_inode_wait(inode, fl);
}

/*
 * One entry of a singly linked fasync list (see fa_next).
 * NOTE(review): 'magic' is presumably set to FASYNC_MAGIC for sanity
 * checking -- confirm against the fasync implementation.
 */
struct fasync_struct {
        rwlock_t                fa_lock;
        int                        magic;
        int                        fa_fd;
        struct fasync_struct        *fa_next; /* singly linked list */
        struct file                *fa_file;
        struct rcu_head                fa_rcu;
};

#define FASYNC_MAGIC 0x4601

/* SMP safe fasync helpers: */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
extern int fasync_remove_entry(struct file *, struct fasync_struct **);
extern struct fasync_struct *fasync_alloc(void);
extern void fasync_free(struct fasync_struct *);

/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);

/* File-owner management (operates on filp->f_owner) and SIGURG delivery. */
extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, unsigned long arg, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);

/*
 * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
 * represented in both.
 *
 * The bit numbers are not contiguous; bits that have no SB_* name here
 * are simply not defined in this list.
 */
#define SB_RDONLY       BIT(0)        /* Mount read-only */
#define SB_NOSUID       BIT(1)        /* Ignore suid and sgid bits */
#define SB_NODEV        BIT(2)        /* Disallow access to device special files */
#define SB_NOEXEC       BIT(3)        /* Disallow program execution */
#define SB_SYNCHRONOUS  BIT(4)        /* Writes are synced at once */
#define SB_MANDLOCK     BIT(6)        /* Allow mandatory locks on an FS */
#define SB_DIRSYNC      BIT(7)        /* Directory modifications are synchronous */
#define SB_NOATIME      BIT(10)        /* Do not update access times. */
#define SB_NODIRATIME   BIT(11)        /* Do not update directory access times */
#define SB_SILENT       BIT(15)
#define SB_POSIXACL     BIT(16)        /* VFS does not apply the umask */
#define SB_INLINECRYPT  BIT(17)        /* Use blk-crypto for encrypted files */
#define SB_KERNMOUNT    BIT(22)        /* this is a kern_mount call */
#define SB_I_VERSION    BIT(23)        /* Update inode I_version field */
#define SB_LAZYTIME     BIT(25)        /* Update the on-disk [acm]times lazily */

/* These sb flags are internal to the kernel */
#define SB_SUBMOUNT     BIT(26)
#define SB_FORCE        BIT(27)
#define SB_NOSEC        BIT(28)
#define SB_BORN         BIT(29)
#define SB_ACTIVE       BIT(30)
#define SB_NOUSER       BIT(31)

/* These flags relate to encoding and casefolding */
#define SB_ENC_STRICT_MODE_FL        (1 << 0)

/*
 * sb_has_strict_encoding - test SB_ENC_STRICT_MODE_FL on a superblock
 * @sb: pointer to an object with an s_encoding_flags member
 *
 * Evaluates to non-zero when the flag is set. The argument is
 * parenthesised so that any pointer expression (e.g. "&obj" or
 * "a ? b : c") can be passed safely.
 */
#define sb_has_strict_encoding(sb) \
        ((sb)->s_encoding_flags & SB_ENC_STRICT_MODE_FL)

/*
 *        Umount options
 *
 *        Each option is a distinct bit, so they can be OR-ed together.
 */

#define MNT_FORCE        0x00000001        /* Attempt to forcibly umount */
#define MNT_DETACH        0x00000002        /* Just detach from the tree */
#define MNT_EXPIRE        0x00000004        /* Mark for expiry */
#define UMOUNT_NOFOLLOW        0x00000008        /* Don't follow symlink on umount */
#define UMOUNT_UNUSED        0x80000000        /* Flag guaranteed to be unused */

/* sb->s_iflags */
/* Internal flags; each is a distinct bit of super_block.s_iflags. */
#define SB_I_CGROUPWB        0x00000001        /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC        0x00000002        /* Ignore executables on this fs */
#define SB_I_NODEV        0x00000004        /* Ignore devices on this fs */
#define SB_I_STABLE_WRITES 0x00000008        /* don't modify blks until WB is done */

/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE                0x00000010 /* fstype already mounted */
#define SB_I_IMA_UNVERIFIABLE_SIGNATURE        0x00000020
#define SB_I_UNTRUSTED_MOUNTER                0x00000040

#define SB_I_SKIP_SYNC        0x00000100        /* Skip superblock at global sync */

/* Possible states of 'frozen' field */
enum {
        SB_UNFROZEN = 0,                /* FS is unfrozen */
        SB_FREEZE_WRITE        = 1,                /* Writes, dir ops, ioctls frozen */
        SB_FREEZE_PAGEFAULT = 2,        /* Page faults stopped as well */
        SB_FREEZE_FS = 3,                /* For internal FS use (e.g. to stop
                                         * internal threads if needed) */
        SB_FREEZE_COMPLETE = 4,                /* ->freeze_fs finished successfully */
};

/*
 * Number of freeze levels that have their own rw_sem in struct sb_writers
 * (SB_FREEZE_WRITE .. SB_FREEZE_FS); the semaphore for a level is stored
 * at index level - 1.
 */
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

/*
 * Freeze-protection state of a superblock (super_block.s_writers).
 * rw_sem[] holds one per-cpu rwsem per freeze level, indexed by level - 1.
 */
struct sb_writers {
        int                                frozen;                /* Is sb frozen? */
        wait_queue_head_t                wait_unfrozen;        /* for get_super_thawed() */
        struct percpu_rw_semaphore        rw_sem[SB_FREEZE_LEVELS];
};

/*
 * struct super_block - in-kernel representation of a mounted filesystem
 * instance.
 *
 * Several members only exist under their CONFIG_* option. The field order
 * is subject to __randomize_layout, so code must not rely on it.
 */
struct super_block {
        struct list_head        s_list;                /* Keep this first */
        dev_t                        s_dev;                /* search index; _not_ kdev_t */
        unsigned char                s_blocksize_bits;
        unsigned long                s_blocksize;
        loff_t                        s_maxbytes;        /* Max file size */
        struct file_system_type        *s_type;
        const struct super_operations        *s_op;
        const struct dquot_operations        *dq_op;
        const struct quotactl_ops        *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long                s_flags;        /* SB_* flags */
        unsigned long                s_iflags;        /* internal SB_I_* flags */
        unsigned long                s_magic;
        struct dentry                *s_root;
        struct rw_semaphore        s_umount;
        int                        s_count;
        atomic_t                s_active;
#ifdef CONFIG_SECURITY
        void                    *s_security;
#endif
        const struct xattr_handler **s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
        const struct fscrypt_operations        *s_cop;
        struct fscrypt_keyring        *s_master_keys; /* master crypto keys in use */
#endif
#ifdef CONFIG_FS_VERITY
        const struct fsverity_operations *s_vop;
#endif
#ifdef CONFIG_UNICODE
        struct unicode_map *s_encoding;
        __u16 s_encoding_flags;        /* SB_ENC_* flags */
#endif
        struct hlist_bl_head        s_roots;        /* alternate root dentries for NFS */
        struct list_head        s_mounts;        /* list of mounts; _not_ for fs use */
        struct block_device        *s_bdev;
        struct backing_dev_info *s_bdi;
        struct mtd_info                *s_mtd;
        struct hlist_node        s_instances;
        unsigned int                s_quota_types;        /* Bitmask of supported quota types */
        struct quota_info        s_dquot;        /* Diskquota specific options */

        struct sb_writers        s_writers;        /* freeze protection state */

        /*
         * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
         * s_fsnotify_marks together for cache efficiency. They are frequently
         * accessed and rarely modified.
         */
        void                        *s_fs_info;        /* Filesystem private info */

        /* Granularity of c/m/atime in ns (cannot be worse than a second) */
        u32                        s_time_gran;
        /* Time limits for c/m/atime in seconds */
        time64_t                   s_time_min;
        time64_t                   s_time_max;
#ifdef CONFIG_FSNOTIFY
        __u32                        s_fsnotify_mask;
        struct fsnotify_mark_connector __rcu        *s_fsnotify_marks;
#endif

        char                        s_id[32];        /* Informational name */
        uuid_t                        s_uuid;                /* UUID */

        unsigned int                s_max_links;
        fmode_t                        s_mode;

        /*
         * The next field is for VFS *only*. No filesystems have any business
         * even looking at it. You had been warned.
         */
        struct mutex s_vfs_rename_mutex;        /* Kludge */

        /*
         * Filesystem subtype.  If non-empty the filesystem type field
         * in /proc/mounts will be "type.subtype"
         */
        const char *s_subtype;

        const struct dentry_operations *s_d_op; /* default d_op for dentries */

        /*
         * Saved pool identifier for cleancache (-1 means none)
         */
        int cleancache_poolid;

        struct shrinker s_shrink;        /* per-sb shrinker handle */

        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;

        /* Pending fsnotify inode refs */
        atomic_long_t s_fsnotify_inode_refs;

        /* Being remounted read-only */
        int s_readonly_remount;

        /* per-sb errseq_t for reporting writeback errors via syncfs */
        errseq_t s_wb_err;

        /* AIO completions deferred from interrupt context */
        struct workqueue_struct *s_dio_done_wq;
        struct hlist_head s_pins;

        /*
         * Owning user namespace and default context in which to
         * interpret filesystem uids, gids, quotas, device nodes,
         * xattrs and security labels.
         */
        struct user_namespace *s_user_ns;

        /*
         * The list_lru structure is essentially just a pointer to a table
         * of per-node lru lists, each of which has its own spinlock.
         * There is no need to put them into separate cachelines.
         */
        struct list_lru                s_dentry_lru;
        struct list_lru                s_inode_lru;
        struct rcu_head                rcu;
        struct work_struct        destroy_work;

        struct mutex                s_sync_lock;        /* sync serialisation lock */

        /*
         * Indicates how deep in a filesystem stack this SB is
         */
        int s_stack_depth;

        /* s_inode_list_lock protects s_inodes */
        spinlock_t                s_inode_list_lock ____cacheline_aligned_in_smp;
        struct list_head        s_inodes;        /* all inodes */

        spinlock_t                s_inode_wblist_lock;
        struct list_head        s_inodes_wb;        /* writeback inodes */
} __randomize_layout;

/*
 * Helper functions so that in most cases filesystems will
 * not need to deal directly with kuid_t and kgid_t and can
 * instead deal with the raw numeric values that are stored
 * in the filesystem.
 */

/*
 * i_uid_read - raw uid of @inode, i.e. inode->i_uid passed through
 * from_kuid() with the owning superblock's s_user_ns.
 */
static inline uid_t i_uid_read(const struct inode *inode)
{
        struct user_namespace *ns = inode->i_sb->s_user_ns;

        return from_kuid(ns, inode->i_uid);
}

/*
 * i_gid_read - raw gid of @inode, i.e. inode->i_gid passed through
 * from_kgid() with the owning superblock's s_user_ns.
 */
static inline gid_t i_gid_read(const struct inode *inode)
{
        struct user_namespace *ns = inode->i_sb->s_user_ns;

        return from_kgid(ns, inode->i_gid);
}

/*
 * i_uid_write - store raw @uid into inode->i_uid after converting it with
 * make_kuid() and the owning superblock's s_user_ns.
 */
static inline void i_uid_write(struct inode *inode, uid_t uid)
{
        struct user_namespace *ns = inode->i_sb->s_user_ns;

        inode->i_uid = make_kuid(ns, uid);
}

/*
 * i_gid_write - store raw @gid into inode->i_gid after converting it with
 * make_kgid() and the owning superblock's s_user_ns.
 */
static inline void i_gid_write(struct inode *inode, gid_t gid)
{
        struct user_namespace *ns = inode->i_sb->s_user_ns;

        inode->i_gid = make_kgid(ns, gid);
}

/*
 * Timestamp helpers defined outside this header.
 * NOTE(review): inode_set_ctime_current() presumably stores the current
 * time as the inode's ctime and returns it -- confirm in the definition.
 */
struct timespec64 current_time(struct inode *inode);
struct timespec64 inode_set_ctime_current(struct inode *inode);

/**
 * inode_get_ctime - fetch the current ctime from the inode
 * @inode: inode from which to fetch ctime
 *
 * Grab the current ctime from the inode and return it.
 */
static inline struct timespec64 inode_get_ctime(const struct inode *inode)
{
        return inode->i_ctime;
}

/**
 * inode_set_ctime_to_ts - set the ctime in the inode
 * @inode: inode in which to set the ctime
 * @ts: value to set in the ctime field
 *
 * Set the ctime in @inode to @ts
 */
static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->i_ctime = ts;
        return ts;
}

/**
 * inode_set_ctime - set the ctime in the inode
 * @inode: inode in which to set the ctime
 * @sec: tv_sec value to set
 * @nsec: tv_nsec value to set
 *
 * Builds { @sec, @nsec } and hands it to inode_set_ctime_to_ts(), whose
 * return value is passed back to the caller.
 */
static inline struct timespec64 inode_set_ctime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_ctime_to_ts(inode, (struct timespec64) {
                .tv_sec  = sec,
                .tv_nsec = nsec,
        });
}

/*
 * Snapshotting support.
 */

/*
 * These are internal functions, please use sb_start_{write,pagefault,intwrite}
 * instead.
 */

/*
 * __sb_end_write - drop the read side of the rwsem for freeze level @level.
 * The semaphore for a level lives at index @level - 1 of s_writers.rw_sem.
 */
static inline void __sb_end_write(struct super_block *sb, int level)
{
        percpu_up_read(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * __sb_start_write - take the read side of the rwsem for freeze level
 * @level (stored at index @level - 1 of s_writers.rw_sem).
 */
static inline void __sb_start_write(struct super_block *sb, int level)
{
        percpu_down_read(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * __sb_start_write_trylock - try to take the read side of the rwsem for
 * freeze level @level; returns the result of percpu_down_read_trylock().
 */
static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
{
        return percpu_down_read_trylock(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Call percpu_rwsem_acquire() / percpu_rwsem_release() on the rwsem for
 * freeze level @lev (index lev - 1), passing 1 and _THIS_IP_.
 */
#define __sb_writers_acquired(sb, lev)        \
        percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_writers_release(sb, lev)        \
        percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)

/**
 * sb_end_write - drop write access to a superblock
 * @sb: the super we wrote to
 *
 * Decrement number of writers to the filesystem. Wake up possible waiters
 * wanting to freeze the filesystem.
 *
 * Releases the SB_FREEZE_WRITE level; counterpart of sb_start_write().
 */
static inline void sb_end_write(struct super_block *sb)
{
        __sb_end_write(sb, SB_FREEZE_WRITE);
}

/**
 * sb_end_pagefault - drop write access to a superblock from a page fault
 * @sb: the super we wrote to
 *
 * Decrement number of processes handling write page fault to the filesystem.
 * Wake up possible waiters wanting to freeze the filesystem.
 *
 * Releases the SB_FREEZE_PAGEFAULT level; counterpart of
 * sb_start_pagefault().
 */
static inline void sb_end_pagefault(struct super_block *sb)
{
        __sb_end_write(sb, SB_FREEZE_PAGEFAULT);
}

/**
 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
 * @sb: the super we wrote to
 *
 * Decrement fs-internal number of writers to the filesystem.  Wake up possible
 * waiters wanting to freeze the filesystem.
 *
 * Releases the SB_FREEZE_FS level; counterpart of sb_start_intwrite().
 */
static inline void sb_end_intwrite(struct super_block *sb)
{
        __sb_end_write(sb, SB_FREEZE_FS);
}

/**
 * sb_start_write - get write access to a superblock
 * @sb: the super we write to
 *
 * When a process wants to write data or metadata to a file system (i.e. dirty
 * a page or an inode), it should embed the operation in a sb_start_write() -
 * sb_end_write() pair to get exclusion against file system freezing. This
 * function increments number of writers preventing freezing. If the file
 * system is already frozen, the function waits until the file system is
 * thawed.
 *
 * Since freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. Generally,
 * freeze protection should be the outermost lock. In particular, we have:
 *
 * sb_start_write
 *   -> i_mutex                        (write path, truncate, directory ops, ...)
 *   -> s_umount                (freeze_super, thaw_super)
 *
 * Takes the SB_FREEZE_WRITE level via __sb_start_write().
 */
static inline void sb_start_write(struct super_block *sb)
{
        __sb_start_write(sb, SB_FREEZE_WRITE);
}

/**
 * sb_start_write_trylock - try to get write access to a superblock
 * @sb: the super we write to
 *
 * Trylock variant of sb_start_write() for the SB_FREEZE_WRITE level.
 * Returns the result of __sb_start_write_trylock().
 */
static inline bool sb_start_write_trylock(struct super_block *sb)
{
        return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
}

/**
 * sb_start_pagefault - get write access to a superblock from a page fault
 * @sb: the super we write to
 *
 * When a process starts handling write page fault, it should embed the
 * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
 * exclusion against file system freezing. This is needed since the page fault
 * is going to dirty a page. This function increments number of running page
 * faults preventing freezing. If the file system is already frozen, the
 * function waits until the file system is thawed.
 *
 * Since page fault freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. It is advised to
 * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
 * handling code implies lock dependency:
 *
 * mmap_lock
 *   -> sb_start_pagefault
 *
 * Takes the SB_FREEZE_PAGEFAULT level via __sb_start_write().
 */
static inline void sb_start_pagefault(struct super_block *sb)
{
        __sb_start_write(sb, SB_FREEZE_PAGEFAULT);
}

/*
 * sb_start_intwrite - get write access to a superblock for internal fs purposes
 * @sb: the super we write to
 *
 * This is the third level of protection against filesystem freezing. It is
 * free for use by a filesystem. The only requirement is that it must rank
 * below sb_start_pagefault.
 *
 * For example filesystem can call sb_start_intwrite() when starting a
 * transaction which somewhat eases handling of freezing for internal sources
 * of filesystem changes (internal fs threads, discarding preallocation on file
 * close, etc.).
 */
static inline void sb_start_intwrite(struct super_block *sb)
{
        /* Same helper as sb_start_write(), but at the SB_FREEZE_FS level. */
        const int freeze_level = SB_FREEZE_FS;

        __sb_start_write(sb, freeze_level);
}

static inline bool sb_start_intwrite_trylock(struct super_block *sb)
{
        /* Pass on whatever the trylock helper reports for SB_FREEZE_FS. */
        const bool acquired = __sb_start_write_trylock(sb, SB_FREEZE_FS);

        return acquired;
}


/*
 * Declaration only.  Presumably reports whether the caller owns @inode or
 * holds a capability that overrides ownership -- confirm against the
 * definition, which is not part of this header.
 */
extern bool inode_owner_or_capable(const struct inode *inode);

/*
 * VFS helper functions..
 */
/*
 * Declarations only.  Each helper takes the directory inode and the dentry
 * being operated on; all return int.
 * NOTE(review): the trailing "struct inode **" of vfs_link(), vfs_unlink()
 * and vfs_rename() is presumably the delegated_inode out-parameter used with
 * try_break_deleg()/break_deleg_wait() -- confirm against the definitions.
 */
extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
extern int vfs_rmdir(struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);

static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
        return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
}

/* Declaration only; returns a dentry pointer. */
extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
                                  int open_flag);

/*
 * Declaration only.  @f is a caller-supplied callback taking a dentry, a
 * mode and an opaque pointer; the final void * is presumably the value
 * handed to @f -- confirm against the definition.
 */
int vfs_mkobj(struct dentry *, umode_t,
                int (*f)(struct dentry *, umode_t, void *),
                void *);

/* File-based attribute helpers (declarations only). */
int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);

/* Declaration only; same argument shape as file_operations::unlocked_ioctl. */
extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);

/*
 * With CONFIG_COMPAT this is a real function with the same signature as
 * file_operations::compat_ioctl.  Without it the name expands to NULL, so
 * it can still appear as a function-pointer initializer (presumably
 * ".compat_ioctl = compat_ptr_ioctl") without #ifdefs at the use site.
 */
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
                                        unsigned long arg);
#else
#define compat_ptr_ioctl NULL
#endif

/*
 * VFS file helper functions.
 */
/*
 * Declarations only.
 * NOTE(review): judging by the names, inode_init_owner() initializes the
 * ownership of a new @inode from its parent @dir and @mode, and
 * mode_strip_sgid() returns @mode with the setgid bit possibly removed --
 * confirm against the definitions.
 */
extern void inode_init_owner(struct inode *inode, const struct inode *dir,
                        umode_t mode);
extern bool may_open_dev(const struct path *path);
umode_t mode_strip_sgid(const struct inode *dir, umode_t mode);

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 */
/* Forward declaration so filldir_t can take a pointer to the context. */
struct dir_context;
/*
 * Callback type stored in dir_context::actor.
 * NOTE(review): the unnamed arguments are presumably (name, name length,
 * offset, inode number, d_type) -- confirm against an implementation.
 */
typedef int (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
                         unsigned);

/*
 * Directory iteration state: the callback to invoke and a position.
 * Passed to file_operations::iterate / ::iterate_shared.
 */
struct dir_context {
        filldir_t actor;
        loff_t pos;
};

/*
 * These flags let !MMU mmap() govern direct device mapping vs immediate
 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
 *
 * NOMMU_MAP_COPY:        Copy can be mapped (MAP_PRIVATE)
 * NOMMU_MAP_DIRECT:        Can be mapped directly (MAP_SHARED)
 * NOMMU_MAP_READ:        Can be mapped for reading
 * NOMMU_MAP_WRITE:        Can be mapped for writing
 * NOMMU_MAP_EXEC:        Can be mapped for execution
 */
/* COPY and DIRECT are private bit values; READ/WRITE/EXEC alias VM_MAY*. */
#define NOMMU_MAP_COPY                0x00000001
#define NOMMU_MAP_DIRECT        0x00000008
#define NOMMU_MAP_READ                VM_MAYREAD
#define NOMMU_MAP_WRITE                VM_MAYWRITE
#define NOMMU_MAP_EXEC                VM_MAYEXEC

/* Mask of the three access-type capability bits (read, write, exec). */
#define NOMMU_VMFLAGS \
        (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)

/*
 * These flags control the behavior of the remap_file_range function pointer.
 * If it is called with len == 0 that means "remap to end of source file".
 * See Documentation/filesystems/vfs.rst for more details about this call.
 *
 * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
 * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
 */
/* Single-bit flags for the remap_flags argument; may be OR-ed together. */
#define REMAP_FILE_DEDUP                (1 << 0)
#define REMAP_FILE_CAN_SHORTEN                (1 << 1)

/*
 * These flags signal that the caller is ok with altering various aspects of
 * the behavior of the remap operation.  The changes must be made by the
 * implementation; the vfs remap helper functions can take advantage of them.
 * Flags in this category exist to preserve the quirky behavior of the hoisted
 * btrfs clone/dedupe ioctls.
 */
/* Currently REMAP_FILE_CAN_SHORTEN is the only advisory flag. */
#define REMAP_FILE_ADVISORY                (REMAP_FILE_CAN_SHORTEN)

/*
 * These flags control the behavior of vfs_copy_file_range().
 * They are not available to the user via syscall.
 *
 * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops
 */
/* Bit 0 of the vfs_copy_file_range() flags word. */
#define COPY_FILE_SPLICE                (1 << 0)

struct iov_iter;

/*
 * Table of per-file callbacks plus three data members (owner,
 * mmap_supported_flags, may_pollfree).  The call_read_iter(),
 * call_write_iter() and call_mmap() helpers in this file invoke the
 * matching members through file->f_op.
 *
 * The struct is tagged __randomize_layout, so code should presumably not
 * depend on member order and should fill it with designated initializers.
 */
struct file_operations {
        /* Module pinned by fops_get() and released by fops_put(). */
        struct module *owner;
        loff_t (*llseek) (struct file *, loff_t, int);
        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iopoll)(struct kiocb *kiocb, bool spin);
        /* Directory iteration; both variants receive a struct dir_context. */
        int (*iterate) (struct file *, struct dir_context *);
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        int (*mmap) (struct file *, struct vm_area_struct *);
        unsigned long mmap_supported_flags;
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
        int (*fsync) (struct file *, loff_t, loff_t, int datasync);
        int (*fasync) (int, struct file *, int);
        int (*lock) (struct file *, int, struct file_lock *);
        ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
        unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
        int (*check_flags)(int);
        int (*flock) (struct file *, int, struct file_lock *);
        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
        ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
        int (*setlease)(struct file *, long, struct file_lock **, void **);
        long (*fallocate)(struct file *file, int mode, loff_t offset,
                          loff_t len);
        void (*show_fdinfo)(struct seq_file *m, struct file *f);
        /* Only present on !CONFIG_MMU builds. */
#ifndef CONFIG_MMU
        unsigned (*mmap_capabilities)(struct file *);
#endif
        ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                        loff_t, size_t, unsigned int);
        /* remap_flags takes the REMAP_FILE_* bits defined above. */
        loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
        bool may_pollfree;
} __randomize_layout;

/*
 * Table of per-inode callbacks (name lookup, namespace operations such as
 * create/link/unlink/rename, attribute and ACL access).  The type is
 * declared ____cacheline_aligned.
 */
struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
        const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
        int (*permission) (struct inode *, int);
        struct posix_acl * (*get_acl)(struct inode *, int);

        int (*readlink) (struct dentry *, char __user *,int);

        /* Namespace operations: leading inode is presumably the parent dir. */
        int (*create) (struct inode *,struct dentry *, umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
        int (*unlink) (struct inode *,struct dentry *);
        int (*symlink) (struct inode *,struct dentry *,const char *);
        int (*mkdir) (struct inode *,struct dentry *,umode_t);
        int (*rmdir) (struct inode *,struct dentry *);
        int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
        int (*rename) (struct inode *, struct dentry *,
                        struct inode *, struct dentry *, unsigned int);
        int (*setattr) (struct dentry *, struct iattr *);
        int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                      u64 len);
        /* Same argument shape as generic_update_time()/inode_update_time(). */
        int (*update_time)(struct inode *, struct timespec64 *, int);
        int (*atomic_open)(struct inode *, struct dentry *,
                           struct file *, unsigned open_flag,
                           umode_t create_mode);
        int (*tmpfile) (struct inode *, struct dentry *, umode_t);
        int (*set_acl)(struct inode *, struct posix_acl *, int);
} ____cacheline_aligned;

static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
                                     struct iov_iter *iter)
{
        return file->f_op->read_iter(kio, iter);
}

static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
                                      struct iov_iter *iter)
{
        return file->f_op->write_iter(kio, iter);
}

static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* Fetch the ->mmap method; no NULL check is made here. */
        int (*mmap_fn)(struct file *, struct vm_area_struct *) =
                file->f_op->mmap;

        return mmap_fn(file, vma);
}

/* Plain read/write entry points (declarations only). */
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
/*
 * Copy helpers: (file_in, pos_in, file_out, pos_out, len, flags).
 * COPY_FILE_SPLICE above is documented as a vfs_copy_file_range() flag.
 */
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
                                       struct file *file_out, loff_t pos_out,
                                       size_t len, unsigned int flags);
/*
 * Remap (clone/dedupe) helpers; remap_flags takes the REMAP_FILE_* bits.
 * Note that generic_remap_file_range_prep() receives the length through a
 * pointer (@count) and returns int, unlike the loff_t-returning helpers.
 */
extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                         struct file *file_out, loff_t pos_out,
                                         loff_t *count,
                                         unsigned int remap_flags);
extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
                                  struct file *file_out, loff_t pos_out,
                                  loff_t len, unsigned int remap_flags);
extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
                                        struct file *dst_file, loff_t dst_pos,
                                        loff_t len, unsigned int remap_flags);


/*
 * Table of per-superblock callbacks: inode allocation and teardown,
 * writeback, freeze/thaw, statfs/remount, /proc-style "show" hooks, quota
 * I/O (CONFIG_QUOTA only) and cached-object accounting hooks.
 */
struct super_operations {
        /* Inode lifetime hooks. */
           struct inode *(*alloc_inode)(struct super_block *sb);
        void (*destroy_inode)(struct inode *);
        void (*free_inode)(struct inode *);

           void (*dirty_inode) (struct inode *, int flags);
        int (*write_inode) (struct inode *, struct writeback_control *wbc);
        int (*drop_inode) (struct inode *);
        void (*evict_inode) (struct inode *);
        void (*put_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
        /* Freeze/thaw hooks; see freeze_super()/thaw_super() declared below. */
        int (*freeze_super) (struct super_block *);
        int (*freeze_fs) (struct super_block *);
        int (*thaw_super) (struct super_block *);
        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*umount_begin) (struct super_block *);

        /* Hooks that print into a seq_file. */
        int (*show_options)(struct seq_file *, struct dentry *);
        int (*show_devname)(struct seq_file *, struct dentry *);
        int (*show_path)(struct seq_file *, struct dentry *);
        int (*show_stats)(struct seq_file *, struct dentry *);
        /* Quota hooks exist only when CONFIG_QUOTA is enabled. */
#ifdef CONFIG_QUOTA
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
        struct dquot **(*get_dquots)(struct inode *);
#endif
        int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
        /* Both take a shrink_control; presumably used by the sb shrinker. */
        long (*nr_cached_objects)(struct super_block *,
                                  struct shrink_control *);
        long (*free_cached_objects)(struct super_block *,
                                    struct shrink_control *);
};

/*
 * Inode flags - they have no relation to superblock flags now
 */
/* One bit per flag; tested against inode->i_flags by the IS_*() macros. */
#define S_SYNC                (1 << 0)  /* Writes are synced at once */
#define S_NOATIME        (1 << 1)  /* Do not update access times */
#define S_APPEND        (1 << 2)  /* Append-only file */
#define S_IMMUTABLE        (1 << 3)  /* Immutable file */
#define S_DEAD                (1 << 4)  /* removed, but still open directory */
#define S_NOQUOTA        (1 << 5)  /* Inode is not counted to quota */
#define S_DIRSYNC        (1 << 6)  /* Directory modifications are synchronous */
#define S_NOCMTIME        (1 << 7)  /* Do not update file c/mtime */
#define S_SWAPFILE        (1 << 8)  /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE        (1 << 9)  /* Inode is fs-internal */
#define S_IMA                (1 << 10) /* Inode has an associated IMA struct */
#define S_AUTOMOUNT        (1 << 11) /* Automount/referral quasi-directory */
#define S_NOSEC                (1 << 12) /* no suid or xattr security attributes */
/*
 * Without CONFIG_FS_DAX the flag is 0, so "(inode)->i_flags & S_DAX"
 * (IS_DAX) is a constant-zero expression.
 */
#ifdef CONFIG_FS_DAX
#define S_DAX                (1 << 13) /* Direct Access, avoiding the page cache */
#else
#define S_DAX                0          /* Make all the DAX code disappear */
#endif
#define S_ENCRYPTED        (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD        (1 << 15) /* Casefolded file */
#define S_VERITY        (1 << 16) /* Verity file (using fs/verity/) */

/*
 * Note that nosuid etc flags are inode-specific: setting some file-system
 * flags just means all the inodes inherit those flags by default. It might be
 * possible to override it selectively if you really wanted to with some
 * ioctl() that is not currently implemented.
 *
 * Exception: SB_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, it is possible to change a filesystem's flags with it mounted
 * with files in use.  This means that all of the inodes will not have their
 * i_flags updated.  Hence, i_flags no longer inherit the superblock mount
 * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
 */
/*
 * Test superblock flag(s) @flg via @inode's superblock.  Evaluates to the
 * masked bits (non-zero if any bit of @flg is set), not to 0/1.
 */
#define __IS_FLG(inode, flg)        ((inode)->i_sb->s_flags & (flg))

/* True when SB_RDONLY is set in the superblock's s_flags. */
static inline bool sb_rdonly(const struct super_block *sb)
{
        return (sb->s_flags & SB_RDONLY) != 0;
}
/*
 * Predicates on an inode.  Those built on __IS_FLG() look at the
 * superblock's s_flags; the rest look at inode->i_flags; IS_SYNC and
 * IS_DIRSYNC accept either source.  Except for IS_RDONLY (a bool), they
 * evaluate to masked bits or a logical 0/1 and are meant for truth tests.
 */
#define IS_RDONLY(inode)        sb_rdonly((inode)->i_sb)
#define IS_SYNC(inode)                (__IS_FLG(inode, SB_SYNCHRONOUS) || \
                                        ((inode)->i_flags & S_SYNC))
#define IS_DIRSYNC(inode)        (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \
                                        ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode)        __IS_FLG(inode, SB_MANDLOCK)
/* Note: true for read-only superblocks as well as SB_NOATIME ones. */
#define IS_NOATIME(inode)        __IS_FLG(inode, SB_RDONLY|SB_NOATIME)
#define IS_I_VERSION(inode)        __IS_FLG(inode, SB_I_VERSION)

#define IS_NOQUOTA(inode)        ((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode)        ((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode)        ((inode)->i_flags & S_IMMUTABLE)
#define IS_POSIXACL(inode)        __IS_FLG(inode, SB_POSIXACL)

#define IS_DEADDIR(inode)        ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode)        ((inode)->i_flags & S_NOCMTIME)
#define IS_SWAPFILE(inode)        ((inode)->i_flags & S_SWAPFILE)
#define IS_PRIVATE(inode)        ((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode)                ((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode)        ((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode)                ((inode)->i_flags & S_NOSEC)
#define IS_DAX(inode)                ((inode)->i_flags & S_DAX)
#define IS_ENCRYPTED(inode)        ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode)        ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode)        ((inode)->i_flags & S_VERITY)

/*
 * True when @inode is a whiteout: a character device whose device number is
 * WHITEOUT_DEV.  @inode is parenthesized at every use so that argument
 * expressions such as "p + 1" or "&obj" expand correctly; it is evaluated
 * up to twice, so it must not have side effects.
 */
#define IS_WHITEOUT(inode)        (S_ISCHR((inode)->i_mode) && \
                                 (inode)->i_rdev == WHITEOUT_DEV)

static inline bool HAS_UNMAPPED_ID(struct inode *inode)
{
        /* The inode is "mapped" only when both its uid and gid are valid. */
        const bool fully_mapped = uid_valid(inode->i_uid) &&
                                  gid_valid(inode->i_gid);

        return !fully_mapped;
}

static inline enum rw_hint file_write_hint(struct file *file)
{
        /* A hint set on the file wins; otherwise fall back to the inode's. */
        if (file->f_write_hint == WRITE_LIFE_NOT_SET)
                return file_inode(file)->i_write_hint;

        return file->f_write_hint;
}

/*
 * Forward declaration: init_sync_kiocb() below calls iocb_flags(), whose
 * definition is not in this part of the file.
 */
static inline int iocb_flags(struct file *file);

static inline u16 ki_hint_validate(enum rw_hint hint)
{
        /*
         * Largest value the ki_hint member can hold: -1 converted to the
         * member's type (all ones if that type is unsigned, as the u16
         * return type suggests).
         */
        typeof(((struct kiocb *)0)->ki_hint) max_hint = -1;

        /* A hint that does not fit in ki_hint is replaced by 0. */
        if (hint > max_hint)
                return 0;

        return hint;
}

static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = iocb_flags(filp),
                .ki_hint = ki_hint_validate(file_write_hint(filp)),
                .ki_ioprio = get_current_ioprio(),
        };
}

static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
                               struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = kiocb_src->ki_flags,
                .ki_hint = kiocb_src->ki_hint,
                .ki_ioprio = kiocb_src->ki_ioprio,
                .ki_pos = kiocb_src->ki_pos,
        };
}

/*
 * Inode state bits.  Protected by inode->i_lock
 *
 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
 *
 * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
 * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
 * various stages of removing an inode.
 *
 * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
 *
 * I_DIRTY_SYNC                Inode is dirty, but doesn't have to be written on
 *                        fdatasync().  i_atime is the usual cause.
 * I_DIRTY_DATASYNC        Data-related inode changes pending. We keep track of
 *                        these changes separately from I_DIRTY_SYNC so that we
 *                        don't have to write inode on fdatasync() when only
 *                        mtime has changed in it.
 * I_DIRTY_PAGES        Inode has dirty pages.  Inode itself may be clean.
 * I_NEW                Serves as both a mutex and completion notification.
 *                        New inodes set I_NEW.  If two processes both create
 *                        the same inode, one of them will release its inode and
 *                        wait for I_NEW to be released before returning.
 *                        Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
 *                        also cause waiting on I_NEW, without I_NEW actually
 *                        being set.  find_inode() uses this to prevent returning
 *                        nearly-dead inodes.
 * I_WILL_FREE                Must be set when calling write_inode_now() if i_count
 *                        is zero.  I_FREEING must be set when I_WILL_FREE is
 *                        cleared.
 * I_FREEING                Set when inode is about to be freed but still has dirty
 *                        pages or buffers attached or the inode itself is still
 *                        dirty.
 * I_CLEAR                Added by clear_inode().  In this state the inode is
 *                        clean and can be destroyed.  Inode keeps I_FREEING.
 *
 *                        Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
 *                        prohibited for many purposes.  iget() must wait for
 *                        the inode to be completely released, then create it
 *                        anew.  Other functions will just ignore such inodes,
 *                        if appropriate.  I_NEW is used for waiting.
 *
 * I_SYNC                Writeback of inode is running. The bit is set during
 *                        data writeback, and cleared with a wakeup on the bit
 *                        address once it is done. The bit is also used to pin
 *                        the inode in memory for flusher thread.
 *
 * I_REFERENCED                Marks the inode as recently referenced on the LRU list.
 *
 * I_DIO_WAKEUP                Never set.  Only used as a key for wait_on_bit().
 *
 * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
 *                        synchronize competing switching instances and to tell
 *                        wb stat updates to grab the i_pages lock.  See
 *                        inode_switch_wbs_work_fn() for details.
 *
 * I_OVL_INUSE                Used by overlayfs to get exclusive ownership on upper
 *                        and work dirs among overlayfs mounts.
 *
 * I_CREATING                New object's inode in the middle of setting up.
 *
 * I_DONTCACHE                Evict inode as soon as it is not used anymore.
 *
 * I_SYNC_QUEUED        Inode is queued in b_io or b_more_io writeback lists.
 *                        Used to detect that mark_inode_dirty() should not move
 *                         inode between dirty lists.
 *
 * Q: What is the difference between I_WILL_FREE and I_FREEING?
 */
/*
 * Bit assignments for the inode state bits.  The double-underscore names
 * (__I_NEW, __I_SYNC, __I_DIO_WAKEUP) are bit *numbers*; the matching mask
 * is built from each.  Bit 12 has no mask in this list.
 */
#define I_DIRTY_SYNC                (1 << 0)
#define I_DIRTY_DATASYNC        (1 << 1)
#define I_DIRTY_PAGES                (1 << 2)
#define __I_NEW                        3
#define I_NEW                        (1 << __I_NEW)
#define I_WILL_FREE                (1 << 4)
#define I_FREEING                (1 << 5)
#define I_CLEAR                        (1 << 6)
#define __I_SYNC                7
#define I_SYNC                        (1 << __I_SYNC)
#define I_REFERENCED                (1 << 8)
#define __I_DIO_WAKEUP                9
#define I_DIO_WAKEUP                (1 << __I_DIO_WAKEUP)
#define I_LINKABLE                (1 << 10)
#define I_DIRTY_TIME                (1 << 11)
#define I_WB_SWITCH                (1 << 13)
#define I_OVL_INUSE                (1 << 14)
#define I_CREATING                (1 << 15)
#define I_DONTCACHE                (1 << 16)
#define I_SYNC_QUEUED                (1 << 17)

/*
 * Composite masks: I_DIRTY_INODE covers the inode itself, I_DIRTY adds
 * dirty pages, I_DIRTY_ALL additionally includes I_DIRTY_TIME.
 */
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)

/* Worker taking the set of I_DIRTY_* bits to apply; defined elsewhere. */
extern void __mark_inode_dirty(struct inode *, int);

/* Mark @inode dirty with the full I_DIRTY mask (inode state and pages). */
static inline void mark_inode_dirty(struct inode *inode)
{
        const int dirty_bits = I_DIRTY;

        __mark_inode_dirty(inode, dirty_bits);
}

/* Mark @inode dirty with only the I_DIRTY_SYNC bit. */
static inline void mark_inode_dirty_sync(struct inode *inode)
{
        const int dirty_bits = I_DIRTY_SYNC;

        __mark_inode_dirty(inode, dirty_bits);
}

/*
 * Link-count manipulation primitives (declarations only).  Going by the
 * names they increment, decrement, zero and set the inode's link count;
 * none of them marks the inode dirty here (see inode_inc_link_count() and
 * inode_dec_link_count() for variants that do).
 */
extern void inc_nlink(struct inode *inode);
extern void drop_nlink(struct inode *inode);
extern void clear_nlink(struct inode *inode);
extern void set_nlink(struct inode *inode, unsigned int nlink);

/* Call inc_nlink() on @inode, then mark the inode dirty. */
static inline void inode_inc_link_count(struct inode *inode)
{
        inc_nlink(inode);
        /* Dirty the inode only after the count has been changed. */
        mark_inode_dirty(inode);
}

/* Call drop_nlink() on @inode, then mark the inode dirty. */
static inline void inode_dec_link_count(struct inode *inode)
{
        drop_nlink(inode);
        /* Dirty the inode only after the count has been changed. */
        mark_inode_dirty(inode);
}

/*
 * Distinct single-bit values, so they can be OR-ed together.  Presumably
 * they form the int "flags" argument of inode_update_time() and
 * generic_update_time() -- confirm against the callers.
 */
enum file_time_flags {
        S_ATIME   = 1 << 0,
        S_MTIME   = 1 << 1,
        S_CTIME   = 1 << 2,
        S_VERSION = 1 << 3,
};

/*
 * Access-time helpers (declarations only).  touch_atime() is what
 * file_accessed() calls; inode_update_time() has the same argument shape as
 * inode_operations::update_time.
 */
extern bool atime_needs_update(const struct path *, struct inode *);
extern void touch_atime(const struct path *);
int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);

static inline void file_accessed(struct file *file)
{
        /* Files opened with O_NOATIME never have their atime touched here. */
        if (file->f_flags & O_NOATIME)
                return;

        touch_atime(&file->f_path);
}

/* Declaration only; returns an int status for @file. */
extern int file_modified(struct file *file);

/*
 * Inode write-out helpers (declarations only).  sync_inode() is driven by a
 * writeback_control; sync_inode_metadata() takes a plain @wait flag.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc);
int sync_inode_metadata(struct inode *inode, int wait);

/*
 * Describes one filesystem type; passed to register_filesystem() and
 * unregister_filesystem().  The FS_* macros defined inline below are the
 * bit values for @fs_flags.
 */
struct file_system_type {
        const char *name;
        int fs_flags;
#define FS_REQUIRES_DEV                1 
#define FS_BINARY_MOUNTDATA        2
#define FS_HAS_SUBTYPE                4
#define FS_USERNS_MOUNT                8        /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM        16        /* Disable fanotify permission events */
#define FS_THP_SUPPORT                8192        /* Remove once all fs converted */
#define FS_RENAME_DOES_D_MOVE        32768        /* FS will handle d_move() during rename() internally. */
        /* Two mount entry points: fs_context based, and the older ->mount. */
        int (*init_fs_context)(struct fs_context *);
        const struct fs_parameter_spec *parameters;
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
        void (*kill_sb) (struct super_block *);
        struct module *owner;
        /* Link to another file_system_type; presumably the registration list. */
        struct file_system_type * next;
        struct hlist_head fs_supers;

        /* Lock class keys, one per lock kind (s_writers: one per freeze level). */
        struct lock_class_key s_lock_key;
        struct lock_class_key s_umount_key;
        struct lock_class_key s_vfs_rename_key;
        struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

        struct lock_class_key i_lock_key;
        struct lock_class_key i_mutex_key;
        struct lock_class_key i_mutex_dir_key;
};

/*
 * Declare the module alias "fs-<NAME>".  NAME must be a string literal,
 * because it is joined to "fs-" by literal concatenation.
 */
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)

/*
 * Mount helpers (declarations only).  Each mount_*() variant that takes a
 * @fill_super callback passes it a super_block, an opaque pointer and an
 * int; all return a dentry pointer.
 */
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
/* Superblock teardown helpers. */
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
void kill_anon_super(struct super_block *sb);
void kill_litter_super(struct super_block *sb);
void deactivate_super(struct super_block *sb);
void deactivate_locked_super(struct super_block *sb);
/* Anonymous-superblock helpers. */
int set_anon_super(struct super_block *s, void *data);
int set_anon_super_fc(struct super_block *s, struct fs_context *fc);
int get_anon_bdev(dev_t *);
void free_anon_bdev(dev_t);
/*
 * Superblock lookup/creation.  Both take a @test and a @set callback; the
 * _fc variant hands them the fs_context, the other an opaque @data pointer.
 */
struct super_block *sget_fc(struct fs_context *fc,
                            int (*test)(struct super_block *, struct fs_context *),
                            int (*set)(struct super_block *, struct fs_context *));
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags, void *data);

/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
/*
 * fops_get(): evaluates to @fops when it is non-NULL and try_module_get()
 * on its owner succeeds, otherwise to NULL.
 * fops_put(): drops the owner's module reference when @fops is non-NULL.
 * Both macros evaluate @fops more than once, so avoid side effects in it.
 */
#define fops_get(fops) \
        (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
#define fops_put(fops) \
        do { if (fops) module_put((fops)->owner); } while(0)
/*
 * This one is to be used *ONLY* from ->open() instances.
 * fops must be non-NULL, pinned down *and* module dependencies
 * should be sufficient to pin the caller down as well.
 */
/*
 * Steps: latch @f once in __file, drop the module reference of the old
 * f_op, then install @fops.  The assignment happens inside BUG_ON(), which
 * fires if the new @fops is NULL.
 */
#define replace_fops(f, fops) \
        do {        \
                struct file *__file = (f); \
                fops_put(__file->f_op); \
                BUG_ON(!(__file->f_op = (fops))); \
        } while(0)

/* Filesystem type registration (declarations only). */
extern int register_filesystem(struct file_system_type *);
extern int unregister_filesystem(struct file_system_type *);
/* Mount management. */
extern struct vfsmount *kern_mount(struct file_system_type *);
extern void kern_unmount(struct vfsmount *mnt);
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
extern long do_mount(const char *, const char __user *,
                     const char *, unsigned long, void *);
extern struct vfsmount *collect_mounts(const struct path *);
extern void drop_collected_mounts(struct vfsmount *);
/* Calls the given callback with a vfsmount and the opaque void * argument. */
extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
                          struct vfsmount *);
/* statfs by path, by user-space path string, and by file descriptor. */
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
/* Freeze/thaw entry points; cf. the sb_start_*() freeze protection helpers. */
extern int freeze_super(struct super_block *super);
extern int thaw_super(struct super_block *super);
extern bool our_mnt(struct vfsmount *mnt);
/* printf-style: @fmt is argument 2, variadic arguments start at 3. */
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);

/* Declaration only; returns an int (by its name, the caller's umask). */
extern int current_umask(void);

/*
 * Inode reference helpers (declarations only).  try_break_deleg() pairs
 * ihold() with the iput() done in break_deleg_wait().
 */
extern void ihold(struct inode * inode);
extern void iput(struct inode *);
/* Same argument shape as inode_operations::update_time. */
extern int generic_update_time(struct inode *, struct timespec64 *, int);

/* /sys/fs: the kobject behind that sysfs directory (defined elsewhere). */
extern struct kobject *fs_kobj;

/*
 * INT_MAX masked with PAGE_MASK, i.e. (assuming PAGE_MASK clears the
 * in-page offset bits) the largest page-aligned value that fits in an int.
 */
#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)

#ifdef CONFIG_MANDATORY_FILE_LOCKING
/*
 * Real implementations live elsewhere when CONFIG_MANDATORY_FILE_LOCKING
 * is set; the #else branch supplies inline stubs that return 0.
 * locks_mandatory_area() is called below with (inode, file, start, end,
 * lock type).
 */
extern int locks_mandatory_locked(struct file *);
extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char);

/*
 * Candidates for mandatory locking have the setgid bit set
 * but no group execute bit -  an otherwise meaningless combination.
 */

static inline int __mandatory_lock(struct inode *ino)
{
        /* Of the two bits examined, only setgid may be set. */
        const int examined = S_ISGID | S_IXGRP;

        return (ino->i_mode & examined) == S_ISGID;
}

/*
 * ... and these candidates should be on SB_MANDLOCK mounted fs,
 * otherwise these will be advisory locks
 */

static inline int mandatory_lock(struct inode *ino)
{
        /* The superblock must have SB_MANDLOCK set ... */
        if (!IS_MANDLOCK(ino))
                return 0;

        /* ... and the inode's mode bits must make it a candidate. */
        return __mandatory_lock(ino) ? 1 : 0;
}

static inline int locks_verify_locked(struct file *file)
{
        /* Only consult the lock list when mandatory locking applies. */
        return mandatory_lock(locks_inode(file)) ?
                locks_mandatory_locked(file) : 0;
}

static inline int locks_verify_truncate(struct inode *inode,
                                    struct file *f,
                                    loff_t size)
{
        loff_t first, last;

        /* Nothing to check without a lock context or mandatory locking. */
        if (!inode->i_flctx || !mandatory_lock(inode))
                return 0;

        /*
         * The byte range whose write locks matter lies between the new size
         * and the current size, whichever direction the file is resized.
         */
        if (size < inode->i_size) {
                first = size;
                last = inode->i_size - 1;
        } else {
                first = inode->i_size;
                last = size - 1;
        }

        return locks_mandatory_area(inode, f, first, last, F_WRLCK);
}

#else /* !CONFIG_MANDATORY_FILE_LOCKING */

/*
 * Stubs used when CONFIG_MANDATORY_FILE_LOCKING is not set: every check
 * ignores its arguments and returns 0.
 */
static inline int locks_mandatory_locked(struct file *file)
{
        return 0;
}

/* Stub: always 0. */
static inline int locks_mandatory_area(struct inode *inode, struct file *filp,
                                       loff_t start, loff_t end, unsigned char type)
{
        return 0;
}

/* Stub: no inode is ever a mandatory-lock candidate. */
static inline int __mandatory_lock(struct inode *inode)
{
        return 0;
}

/* Stub: mandatory locking never applies. */
static inline int mandatory_lock(struct inode *inode)
{
        return 0;
}

/* Stub: always 0. */
static inline int locks_verify_locked(struct file *file)
{
        return 0;
}

/*
 * Stub used when CONFIG_MANDATORY_FILE_LOCKING is not set: ignores its
 * arguments and always returns 0.
 *
 * @size is loff_t, matching the CONFIG_MANDATORY_FILE_LOCKING variant, so
 * the signature does not depend on the configuration and a 64-bit file
 * size is not narrowed on the way in where size_t is 32 bits wide.
 */
static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
                                        loff_t size)
{
        return 0;
}

#endif /* CONFIG_MANDATORY_FILE_LOCKING */


#ifdef CONFIG_FILE_LOCKING
static inline int break_lease(struct inode *inode, unsigned int mode)
{
        /*
         * The check below takes no lock, so any refcounts taken earlier
         * must be visible before i_flctx->flc_lease is examined; otherwise
         * we could race with a task installing a new lease on this file.
         */
        smp_mb();

        /* No lock context at all: nothing to break. */
        if (!inode->i_flctx)
                return 0;

        /* Lock context present but no lease on the list. */
        if (list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_LEASE);
}

static inline int break_deleg(struct inode *inode, unsigned int mode)
{
        /*
         * The check below takes no lock, so any refcounts taken earlier
         * must be visible before i_flctx->flc_lease is examined; otherwise
         * we could race with a task installing a new lease on this file.
         */
        smp_mb();

        /* No lock context at all: nothing to break. */
        if (!inode->i_flctx)
                return 0;

        /* Lock context present but the lease list is empty. */
        if (list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_DELEG);
}

/*
 * Attempt a non-blocking delegation break on @inode.
 *
 * If the break would block (-EWOULDBLOCK) and the caller supplied
 * @delegated_inode, the inode is stored there and an extra reference is taken
 * with ihold().  The result of break_deleg() is returned in every case.
 */
static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        const int err = break_deleg(inode, O_WRONLY|O_NONBLOCK);

        if (err != -EWOULDBLOCK || !delegated_inode)
                return err;

        *delegated_inode = inode;
        ihold(inode);
        return err;
}

/*
 * Run break_deleg() with O_WRONLY (no O_NONBLOCK) on *@delegated_inode, then
 * drop the inode reference with iput() and reset the caller's pointer to
 * NULL.  Returns the result of break_deleg().
 */
static inline int break_deleg_wait(struct inode **delegated_inode)
{
        const int err = break_deleg(*delegated_inode, O_WRONLY);

        iput(*delegated_inode);
        *delegated_inode = NULL;

        return err;
}

/*
 * Break any layout lease on @inode.  With @wait false the break is requested
 * with O_NONBLOCK added to O_WRONLY.  Returns 0 when the lease list is empty,
 * otherwise the result of __break_lease() for FL_LAYOUT.
 */
static inline int break_layout(struct inode *inode, bool wait)
{
        int open_mode = O_WRONLY;

        /* Full barrier ahead of the lockless look at i_flctx->flc_lease. */
        smp_mb();

        if (!inode->i_flctx || list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        if (!wait)
                open_mode |= O_NONBLOCK;

        return __break_lease(inode, open_mode, FL_LAYOUT);
}

#else /* !CONFIG_FILE_LOCKING */
/* Stub used when CONFIG_FILE_LOCKING is not set: always returns 0. */
static inline int break_lease(struct inode *inode, unsigned int mode)
{
        return 0;
}

/* Stub used when CONFIG_FILE_LOCKING is not set: always returns 0. */
static inline int break_deleg(struct inode *inode, unsigned int mode)
{
        return 0;
}

/*
 * Stub used when CONFIG_FILE_LOCKING is not set: always returns 0 and never
 * writes to *@delegated_inode.
 */
static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        return 0;
}

/*
 * Stub used when CONFIG_FILE_LOCKING is not set.  It hits BUG(): the
 * try_break_deleg() stub in this configuration never stores a delegated
 * inode, so there should be nothing for a caller to wait on.
 */
static inline int break_deleg_wait(struct inode **delegated_inode)
{
        BUG();
        return 0;
}

/* Stub used when CONFIG_FILE_LOCKING is not set: always returns 0. */
static inline int break_layout(struct inode *inode, bool wait)
{
        return 0;
}

#endif /* CONFIG_FILE_LOCKING */

/* fs/open.c */
struct audit_names;
/*
 * Name record returned by the getname*() helpers declared in this header and
 * released with putname().
 *
 * The static_assert that follows the struct checks at build time that the
 * offset of @iname is a multiple of sizeof(long).
 */
struct filename {
        const char                *name;        /* pointer to actual string */
        const __user char        *uptr;        /* original userland pointer */
        int                        refcnt;        /* reference count -- presumably managed by getname*()/putname(); confirm */
        struct audit_names        *aname;        /* audit record pointer; only forward-declared here */
        const char                iname[];        /* flexible array member; presumably inline name storage -- confirm */
};
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);

/*
 * Truncate, fallocate and file-open entry points.  These are declarations
 * only; the definitions are provided elsewhere.
 */
extern long vfs_truncate(const struct path *, loff_t);
extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
                       struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
                        loff_t len);
extern long do_sys_open(int dfd, const char __user *filename, int flags,
                        umode_t mode);
/* Open variants taking, in turn, a struct filename, a kernel string, a root dentry/mount pair, or a path. */
extern struct file *file_open_name(struct filename *, int, umode_t);
extern struct file *filp_open(const char *, int, umode_t);
extern struct file *file_open_root(struct dentry *, struct vfsmount *,
                                   const char *, int, umode_t);
extern struct file * dentry_open(const struct path *, int, const struct cred *);
extern struct file * open_with_fake_path(const struct path *, int,
                                         struct inode*, const struct cred *);
/*
 * Open another struct file for the same path as @file by calling
 * dentry_open() with @file's f_path, f_flags and f_cred.  Returns whatever
 * dentry_open() returns.
 */
static inline struct file *file_clone_open(struct file *file)
{
        return dentry_open(&file->f_path, file->f_flags, file->f_cred);
}
/* Counterpart to the open helpers; takes the file and an fl_owner_t id. */
extern int filp_close(struct file *, fl_owner_t id);

/*
 * Helpers producing a struct filename from a user pointer (getname*) or a
 * kernel string (getname_kernel), and putname() to release one.
 */
extern struct filename *getname_flags(const char __user *, int, int *);
extern struct filename *getname(const char __user *);
extern struct filename *getname_kernel(const char *);
extern void putname(struct filename *name);

/* finish_open() takes the open callback to invoke; finish_no_open() takes none. */
extern int finish_open(struct file *file, struct dentry *dentry,
                        int (*open)(struct inode *, struct file *));
extern int finish_no_open(struct file *file, struct dentry *dentry);

/* fs/dcache.c */
/* Both initialisers are marked __init. */
extern void __init vfs_caches_init_early(void);
extern void __init vfs_caches_init(void);

/* Slab cache backing the __getname()/__putname() macros below. */
extern struct kmem_cache *names_cachep;

/* Allocate (GFP_KERNEL) / free one object from names_cachep. */
#define __getname()                kmem_cache_alloc(names_cachep, GFP_KERNEL)
#define __putname(name)                kmem_cache_free(names_cachep, (void *)(name))

extern struct super_block *blockdev_superblock;
/*
 * Returns true only when CONFIG_BLOCK is enabled and @sb is the
 * blockdev_superblock; the IS_ENABLED() test comes first in the && chain.
 */
static inline bool sb_is_blkdev_sb(struct super_block *sb)
{
        return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
}

/* Declarations only; the definitions are provided elsewhere. */
void emergency_thaw_all(void);
extern int sync_filesystem(struct super_block *);
/* Default file_operations tables, by name for block and character devices. */
extern const struct file_operations def_blk_fops;
extern const struct file_operations def_chr_fops;

/* fs/char_dev.c */
/* Limit on character device major numbers (NOTE(review): confirm whether the bound is exclusive). */
#define CHRDEV_MAJOR_MAX 512
/* Marks the bottom of the first segment of free char majors */
#define CHRDEV_MAJOR_DYN_END 234
/*
 * Marks the top and bottom of the second segment of free char majors.
 * Note that START (511) is numerically above END (384).
 */
#define CHRDEV_MAJOR_DYN_EXT_START 511
#define CHRDEV_MAJOR_DYN_EXT_END 384

/*
 * Character device number-range and driver registration interface.
 * Declarations only; the definitions are provided elsewhere.
 */
extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
extern int register_chrdev_region(dev_t, unsigned, const char *);
extern int __register_chrdev(unsigned int major, unsigned int baseminor,
                             unsigned int count, const char *name,
                             const struct file_operations *fops);
extern void __unregister_chrdev(unsigned int major, unsigned int baseminor,
                                unsigned int count, const char *name);
extern void unregister_chrdev_region(dev_t, unsigned);
extern void chrdev_show(struct seq_file *,off_t);

/*
 * Convenience wrapper around __register_chrdev() that always passes
 * baseminor 0 and count 256.  Returns the result of __register_chrdev().
 */
static inline int register_chrdev(unsigned int major, const char *name,
                                  const struct file_operations *fops)
{
        return __register_chrdev(major, 0, 256, name, fops);
}

/*
 * Convenience wrapper around __unregister_chrdev() using the same
 * baseminor 0 / count 256 range that register_chrdev() registers.
 */
static inline void unregister_chrdev(unsigned int major, const char *name)
{
        __unregister_chrdev(major, 0, 256, name);
}

/* Takes the inode, a mode and a device number. */
extern void init_special_inode(struct inode *, umode_t, dev_t);

/* Invalid inode operations -- fs/bad_inode.c */
extern void make_bad_inode(struct inode *);
extern bool is_bad_inode(struct inode *);

/* Invalidate pages of @mapping in the page-index range [start, end]; returns an unsigned long. */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end);

/* Same range arguments, plus an out-parameter @nr_pagevec; returns nothing. */
void invalidate_mapping_pagevec(struct address_space *mapping,
                                pgoff_t start, pgoff_t end,
                                unsigned long *nr_pagevec);

/*
 * Invalidate the page cache of @inode over the whole index range (0 to -1),
 * but only for regular files, directories and symbolic links.  Any other
 * inode type is left untouched.
 */
static inline void invalidate_remote_inode(struct inode *inode)
{
        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
            !S_ISLNK(inode->i_mode))
                return;

        invalidate_mapping_pages(inode->i_mapping, 0, -1);
}
/*
 * Page-cache invalidation, writeback and wait primitives operating on an
 * address_space (or an inode for write_inode_now()).  Declarations only.
 */
extern int invalidate_inode_pages2(struct address_space *mapping);
extern int invalidate_inode_pages2_range(struct address_space *mapping,
                                         pgoff_t start, pgoff_t end);
extern int write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
/* The *_range variants take byte offsets (loff_t). */
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
                                   loff_t lend);
extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte);

/*
 * Whole-file form of filemap_fdatawait_range(): covers byte offsets 0 through
 * LLONG_MAX of @mapping and returns that function's result.
 */
static inline int filemap_fdatawait(struct address_space *mapping)
{
        return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
}

/*
 * Byte-range page-cache queries and writeback helpers, plus the writeback
 * error helpers used by filemap_set_wb_err().  Declarations only.
 */
extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
                                  loff_t lend);
extern int filemap_write_and_wait_range(struct address_space *mapping,
                                        loff_t lstart, loff_t lend);
extern int __filemap_fdatawrite_range(struct address_space *mapping,
                                loff_t start, loff_t end, int sync_mode);
extern int filemap_fdatawrite_range(struct address_space *mapping,
                                loff_t start, loff_t end);
extern int filemap_check_errors(struct address_space *mapping);
extern void __filemap_set_wb_err(struct address_space *mapping, int err);

/*
 * Whole-file form of filemap_write_and_wait_range(): covers byte offsets 0
 * through LLONG_MAX of @mapping and returns that function's result.
 */
static inline int filemap_write_and_wait(struct address_space *mapping)
{
        return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
}

/*
 * struct file based wait/writeback helpers.  All three are __must_check, so
 * callers are expected to consume the return value.
 */
extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
                                                loff_t lend);
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
                                                loff_t start, loff_t end);

/*
 * Whole-file form of file_write_and_wait_range(): covers byte offsets 0
 * through LLONG_MAX of @file and returns that function's result.
 */
static inline int file_write_and_wait(struct file *file)
{
        return file_write_and_wait_range(file, 0, LLONG_MAX);
}

/**
 * filemap_set_wb_err - set a writeback error on an address_space
 * @mapping: mapping in which to set writeback error
 * @err: error to be set in mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * filemap_set_wb_err to record the error in the mapping so that it will be
 * automatically reported whenever fsync is called on the file.
 *
 * A zero @err is a no-op: __filemap_set_wb_err() is only called for a
 * non-zero value.
 */
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
{
        /* Fastpath for common case of no error */
        if (unlikely(err))
                __filemap_set_wb_err(mapping, err);
}

/**
 * filemap_check_wb_err - has an error occurred since the mark was sampled?
 * @mapping: mapping to check for writeback errors
 * @since: previously-sampled errseq_t
 *
 * Grab the errseq_t value from the mapping, and see if it has changed "since"
 * the given value was sampled.
 *
 * If it has then report the latest error set, otherwise return 0.
 *
 * Return: the result of errseq_check() on @mapping->wb_err against @since.
 */
static inline int filemap_check_wb_err(struct address_space *mapping,
                                        errseq_t since)
{
        return errseq_check(&mapping->wb_err, since);
}

/**
 * filemap_sample_wb_err - sample the current errseq_t to test for later errors
 * @mapping: mapping to be sampled
 *
 * Writeback errors are always reported relative to a particular sample point
 * in the past. This function provides those sample points.
 *
 * Return: the result of errseq_sample() on @mapping->wb_err, suitable as the
 * @since argument of filemap_check_wb_err().
 */
static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
{
        return errseq_sample(&mapping->wb_err);
}

/**
 * file_sample_sb_err - sample the current errseq_t to test for later errors
 * @file: file pointer to be sampled
 *
 * Grab the most current superblock-level errseq_t value for the given
 * struct file.
 *
 * Return: the result of errseq_sample() on the s_wb_err field of the
 * superblock reached through @file->f_path.dentry->d_sb.
 */
static inline errseq_t file_sample_sb_err(struct file *file)
{
        return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
}

/* fsync entry points; vfs_fsync_range() takes an inclusive-looking [start, end] byte range -- confirm against the definition. */
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
                           int datasync);
extern int vfs_fsync(struct file *file, int datasync);

/* Takes an offset and a byte count rather than a start/end pair. */
extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
                                unsigned int flags);

/*
 * Flush a just-completed write when the kiocb asks for synchronous semantics.
 *
 * @iocb->ki_pos must already point past the written data; the range synced is
 * the @count bytes ending at ki_pos - 1.  Without IOCB_DSYNC nothing is
 * synced.  The datasync argument handed to vfs_fsync_range() is 0 when
 * IOCB_SYNC is set and 1 otherwise.
 *
 * Returns @count on success, or the error from vfs_fsync_range().
 */
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{
        int err;

        if (!(iocb->ki_flags & IOCB_DSYNC))
                return count;

        err = vfs_fsync_range(iocb->ki_filp,
                              iocb->ki_pos - count, iocb->ki_pos - 1,
                              (iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
        if (err)
                return err;

        return count;
}

/* Emergency sync/remount entry points; no arguments, no return value. */
extern void emergency_sync(void);
extern void emergency_remount(void);

/*
 * bmap() is only implemented when CONFIG_BLOCK is enabled; otherwise the
 * inline stub below ignores its arguments and returns -EINVAL.
 */
#ifdef CONFIG_BLOCK
extern int bmap(struct inode *inode, sector_t *block);
#else
static inline int bmap(struct inode *inode,  sector_t *block)
{
        return -EINVAL;
}
#endif

/* Attribute-change and permission-check entry points.  Declarations only. */
extern int notify_change(struct dentry *, struct iattr *, struct inode **);
extern int inode_permission(struct inode *, int);
extern int generic_permission(struct inode *, int);
extern int __check_sticky(struct inode *dir, struct inode *inode);

/*
 * Returns true if @inode has any of the execute bits (S_IXUGO) set in its
 * mode, or if it is a directory.
 */
static inline bool execute_ok(struct inode *inode)
{
        return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
}

/*
 * Returns true when the file-type bits (S_IFMT) of @inode->i_mode differ from
 * those of @mode.  Bits outside S_IFMT are ignored by the comparison.
 */
static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
{
        return (inode->i_mode ^ mode) & S_IFMT;
}

/*
 * Call sb_start_write() on the superblock of @file's inode, but only when
 * that inode is a regular file.  Other file types are a no-op.
 */
static inline void file_start_write(struct file *file)
{
        if (S_ISREG(file_inode(file)->i_mode))
                sb_start_write(file_inode(file)->i_sb);
}

/*
 * Non-blocking counterpart of file_start_write().  For a regular file the
 * result of sb_start_write_trylock() is returned; every other file type
 * reports true without touching the superblock.
 */
static inline bool file_start_write_trylock(struct file *file)
{
        if (S_ISREG(file_inode(file)->i_mode))
                return sb_start_write_trylock(file_inode(file)->i_sb);

        return true;
}

/*
 * Counterpart of file_start_write(): for a regular file, call
 * __sb_end_write() with SB_FREEZE_WRITE on the inode's superblock.  Other
 * file types are a no-op.
 */
static inline void file_end_write(struct file *file)
{
        if (S_ISREG(file_inode(file)->i_mode))
                __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
}

/*
 * get_write_access() gets write permission for a file.
 * put_write_access() releases this write permission.
 * This is used for regular files.
 * We cannot support write (and maybe mmap read-write shared) accesses and
 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
 * can have the following values:
 * 0: no writers, no VM_DENYWRITE mappings
 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
 * > 0: (i_writecount) users are writing to the file.
 *
 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 * except for the cases where we don't hold i_writecount yet. Then we need to
 * use {get,deny}_write_access() - these functions check the sign and refuse
 * to do the change if sign is wrong.
 */
/*
 * Try to bump @inode->i_writecount.  The increment is refused while the
 * counter is negative; in that case -ETXTBSY is returned, otherwise 0.
 */
static inline int get_write_access(struct inode *inode)
{
        if (atomic_inc_unless_negative(&inode->i_writecount))
                return 0;

        return -ETXTBSY;
}
static inline int deny_write_access(struct file *file)
{
        struct inode *inode = file_inode(file);
        return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
}
/*
 * Release a write reference by unconditionally decrementing
 * @inode->i_writecount (no sign check, unlike get_write_access()).
 */
static inline void put_write_access(struct inode * inode)
{
        atomic_dec(&inode->i_writecount);
}
/*
 * Undo a deny_write_access(): unconditionally increment i_writecount of
 * @file's inode.  A NULL @file is accepted and ignored.
 */
static inline void allow_write_access(struct file *file)
{
        if (!file)
                return;

        atomic_inc(&file_inode(file)->i_writecount);
}
/*
 * Returns true when @inode->i_writecount is strictly positive.  This is a
 * plain atomic_read() snapshot; the value may change right after the call.
 */
static inline bool inode_is_open_for_write(const struct inode *inode)
{
        return atomic_read(&inode->i_writecount) > 0;
}

#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
/*
 * Decrement @inode->i_readcount.  Hitting this with the counter already at
 * zero is treated as a fatal bug (BUG_ON).
 */
static inline void i_readcount_dec(struct inode *inode)
{
        BUG_ON(!atomic_read(&inode->i_readcount));
        atomic_dec(&inode->i_readcount);
}
/* Increment @inode->i_readcount. */
static inline void i_readcount_inc(struct inode *inode)
{
        atomic_inc(&inode->i_readcount);
}
#else
/* No-op variant used when neither CONFIG_IMA nor CONFIG_FILE_LOCKING is set. */
static inline void i_readcount_dec(struct inode *inode)
{
}
/* No-op variant used when neither CONFIG_IMA nor CONFIG_FILE_LOCKING is set. */
static inline void i_readcount_inc(struct inode *inode)
{
}
#endif
extern int do_pipe_flags(int *, int);

/*
 * Read/write helpers that take kernel buffers (void *), not __user
 * pointers, plus open_exec().  Declarations only.
 */
extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
extern struct file * open_exec(const char *);
 
/* fs/dcache.c -- generic fs support functions */
/* Ancestry predicates on dentries and on paths respectively. */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);

/* Takes a caller-supplied buffer and its length; returns a char pointer. */
extern char *file_path(struct file *, char *, int);

#include <linux/err.h>

/* needed for stackable file system support */
extern loff_t default_llseek(struct file *file, loff_t offset, int whence);

extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);

/* Inode initialisation and basic inode-lifetime helpers.  Declarations only. */
extern int inode_init_always(struct super_block *, struct inode *);
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
extern int inode_needs_sync(struct inode *inode);
extern int generic_delete_inode(struct inode *inode);
/*
 * Returns non-zero when @inode has no links left (i_nlink == 0) or is not
 * hashed according to inode_unhashed(); 0 otherwise.
 */
static inline int generic_drop_inode(struct inode *inode)
{
        return !inode->i_nlink || inode_unhashed(inode);
}
extern void d_mark_dontcache(struct inode *inode);

/*
 * Inode lookup helpers.  The "5" variants take a hash value plus a @test
 * callback and opaque @data; ilookup() takes just an inode number.
 */
extern struct inode *ilookup5_nowait(struct super_block *sb,
                unsigned long hashval, int (*test)(struct inode *, void *),
                void *data);
extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data);
extern struct inode *ilookup(struct super_block *sb, unsigned long ino);

/* Lookup-or-insert helpers; these additionally take a @set callback. */
extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *),
                void *data);
extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
/* The @match callback here also receives an unsigned long argument. */
extern struct inode *find_inode_nowait(struct super_block *,
                                       unsigned long,
                                       int (*match)(struct inode *,
                                                    unsigned long, void *),
                                       void *data);
/* _rcu variants -- presumably to be called under RCU protection; confirm. */
extern struct inode *find_inode_rcu(struct super_block *, unsigned long,
                                    int (*)(struct inode *, void *), void *);
extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long);
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
/*
 * lockdep_annotate_inode_mutex_key() is only implemented when
 * CONFIG_DEBUG_LOCK_ALLOC is enabled; otherwise it is an empty inline
 * function.  The stub is not followed by a ';' because a lone semicolon at
 * file scope is an empty declaration, which ISO C does not permit.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
#else
static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }
#endif
/* New-inode completion/abort helpers, inode number allocation and eviction.  Declarations only. */
extern void unlock_new_inode(struct inode *);
extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);

/*
 * Userspace may rely on the inode number being non-zero. For example, glibc
 * simply ignores files with zero i_ino in unlink() and other places.
 *
 * As an additional complication, if userspace was compiled with
 * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the
 * lower 32 bits, so we need to check that those aren't zero explicitly. With
 * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but
 * better safe than sorry.
 *
 * Returns true when the low 32 bits of @ino are all zero.
 */
static inline bool is_zero_ino(ino_t ino)
{
        return (u32)ino == 0;
}

/* Inode reference, allocation and teardown helpers.  Declarations only. */
extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
extern struct inode *new_inode_pseudo(struct super_block *sb);
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
/* Privilege-bit helpers operating on an inode and on a file respectively. */
extern int setattr_should_drop_suidgid(struct inode *);
extern int file_remove_privs(struct file *);

extern void __insert_inode_hash(struct inode *, unsigned long hashval);
/*
 * Hash @inode using its own inode number (i_ino) as the hash value passed to
 * __insert_inode_hash().
 */
static inline void insert_inode_hash(struct inode *inode)
{
        __insert_inode_hash(inode, inode->i_ino);
}

extern void __remove_inode_hash(struct inode *);
/*
 * Unhash @inode via __remove_inode_hash(), unless it is already unhashed or
 * its i_hash node is a "fake" entry as reported by hlist_fake().
 */
static inline void remove_inode_hash(struct inode *inode)
{
        if (inode_unhashed(inode) || hlist_fake(&inode->i_hash))
                return;

        __remove_inode_hash(inode);
}

extern void inode_sb_list_add(struct inode *inode);

/* Block size setters for a superblock; both take and return int. */
extern int sb_set_blocksize(struct super_block *, int);
extern int sb_min_blocksize(struct super_block *, int);

/*
 * Generic mmap, write-check and read/write iterator implementations.
 * Declarations only; the definitions are provided elsewhere.
 */
extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
extern int generic_write_check_limits(struct file *file, loff_t pos,
                loff_t *count);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
                struct iov_iter *to, ssize_t already_read);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);

/*
 * iov_iter based read/write on a struct file.  The first pair takes a
 * position pointer and rwf_t flags; the second pair takes a kiocb instead.
 */
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter);
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter);

/* fs/block_dev.c */
/* Block device read/write iterators, fsync and page sync.  Declarations only. */
extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from);
extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                        int datasync);
extern void block_sync_page(struct page *page);

/* fs/splice.c */
/* Splice helpers moving data between a file and a pipe_inode_info.  Declarations only. */
extern ssize_t generic_file_splice_read(struct file *, loff_t *,
                struct pipe_inode_info *, size_t, unsigned int);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
                struct file *, loff_t *, size_t, unsigned int);
extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
                struct file *out, loff_t *, size_t len, unsigned int flags);
/* File-to-file variant: takes an input and an output file with separate positions. */
extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
                loff_t *opos, size_t len, unsigned int flags);


/* Initialise a file_ra_state for the given mapping. */
extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
/*
 * llseek implementations.  All return loff_t; the *_size variants take
 * explicit size limits from the caller.
 */
extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
extern loff_t no_llseek(struct file *file, loff_t offset, int whence);
extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
                int whence, loff_t maxsize, loff_t eof);
extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
                int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
extern loff_t no_seek_end_llseek(struct file *, loff_t, int);
/* ->open style helpers: each takes the inode and the file being opened. */
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
extern int stream_open(struct inode * inode, struct file * filp);

#ifdef CONFIG_BLOCK
/* Function type of the optional submit hook passed to __blockdev_direct_IO(). */
typedef void (dio_submit_t)(struct bio *bio, struct inode *inode,
                            loff_t file_offset);

/* Bit flags for the @flags argument of __blockdev_direct_IO(). */
enum {
        /* need locking between buffered and direct access */
        DIO_LOCKING        = 0x01,

        /* filesystem does not support filling holes */
        DIO_SKIP_HOLES        = 0x02,
};

/*
 * Full-featured direct I/O entry point: takes the block device, a get_block
 * callback, optional end_io/submit_io hooks and DIO_* flags.
 */
ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                             struct block_device *bdev, struct iov_iter *iter,
                             get_block_t get_block,
                             dio_iodone_t end_io, dio_submit_t submit_io,
                             int flags);

/*
 * Common-case wrapper around __blockdev_direct_IO(): uses the block device
 * of the inode's superblock (inode->i_sb->s_bdev), passes NULL for both the
 * end_io and submit_io hooks, and sets DIO_LOCKING | DIO_SKIP_HOLES.
 * Returns the result of __blockdev_direct_IO().
 */
static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
                                         struct inode *inode,
                                         struct iov_iter *iter,
                                         get_block_t get_block)
{
        return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
                        get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES);
}
#endif

/* Presumably waits until the i_dio_count of @inode drops to zero -- confirm against the definition. */
void inode_dio_wait(struct inode *inode);

/*
 * inode_dio_begin - signal start of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * Increments @inode->i_dio_count to record that a direct I/O request is in
 * flight.  Each call must be balanced by inode_dio_end(), which drops the
 * count again and wakes waiters once it reaches zero.
 */
static inline void inode_dio_begin(struct inode *inode)
{
        atomic_inc(&inode->i_dio_count);
}

/*
 * inode_dio_end - signal finish of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * This is called once we've finished processing a direct I/O request,
 * and is used to wake up callers waiting for direct I/O to be quiesced.
 * The wakeup on the __I_DIO_WAKEUP bit of i_state is only issued when the
 * decrement brings i_dio_count to zero.
 */
static inline void inode_dio_end(struct inode *inode)
{
        if (atomic_dec_and_test(&inode->i_dio_count))
                wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
void dio_warn_stale_pagecache(struct file *filp);

/* Takes a flag value together with a mask argument. */
extern void inode_set_flags(struct inode *inode, unsigned int flags,
                            unsigned int mask);

extern const struct file_operations generic_ro_fops;

/* True if mode @m denotes a character device, block device, FIFO or socket. */
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))

/* Symlink read/create helpers, including the page-based implementations.  Declarations only. */
extern int readlink_copy(char __user *, int, const char *);
extern int page_readlink(struct dentry *, char __user *, int);
extern const char *page_get_link(struct dentry *, struct inode *,
                                 struct delayed_call *);
extern void page_put_link(void *);
extern int __page_symlink(struct inode *inode, const char *symname, int len,
                int nofs);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
/* Attribute retrieval into a struct kstat. */
extern void generic_fillattr(struct inode *, struct kstat *);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
/*
 * Byte accounting on an inode.  NOTE(review): the double-underscore variants
 * presumably expect the caller to provide locking -- confirm.
 */
void __inode_add_bytes(struct inode *inode, loff_t bytes);
void inode_add_bytes(struct inode *inode, loff_t bytes);
void __inode_sub_bytes(struct inode *inode, loff_t bytes);
void inode_sub_bytes(struct inode *inode, loff_t bytes);
/*
 * Returns i_blocks shifted left by 9 (i.e. multiplied by 512) plus i_bytes.
 * i_blocks is cast to loff_t before the shift so the arithmetic is done in
 * loff_t.  No locking is taken here.
 */
static inline loff_t __inode_get_bytes(struct inode *inode)
{
        return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
}
/* Out-of-line counterparts of the byte accounting helpers. */
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
const char *simple_get_link(struct dentry *, struct inode *,
                            struct delayed_call *);
extern const struct inode_operations simple_symlink_inode_operations;

/* Directory iteration driven by a dir_context. */
extern int iterate_dir(struct file *, struct dir_context *);

/* stat helpers filling a struct kstat, by (dfd, user path, flags) or by fd. */
int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
                int flags);
int vfs_fstat(int fd, struct kstat *stat);

/*
 * Shorthand for vfs_fstatat() relative to AT_FDCWD with no flags.  Returns
 * the result of vfs_fstatat().
 */
static inline int vfs_stat(const char __user *filename, struct kstat *stat)
{
        return vfs_fstatat(AT_FDCWD, filename, stat, 0);
}
/*
 * Shorthand for vfs_fstatat() relative to AT_FDCWD with AT_SYMLINK_NOFOLLOW
 * set.  Returns the result of vfs_fstatat().
 */
static inline int vfs_lstat(const char __user *name, struct kstat *stat)
{
        return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
}

extern const char *vfs_get_link(struct dentry *, struct delayed_call *);
extern int vfs_readlink(struct dentry *, char __user *, int);

/* Filesystem type get/put and lookup by name. */
extern struct file_system_type *get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
/*
 * Superblock lookup by block device, the matching drop_super*() release
 * calls, and iteration over superblocks with a callback.
 */
extern struct super_block *get_super(struct block_device *);
extern struct super_block *get_super_thawed(struct block_device *);
extern struct super_block *get_super_exclusive_thawed(struct block_device *bdev);
extern struct super_block *get_active_super(struct block_device *bdev);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
extern void iterate_supers(void (*)(struct super_block *, void *), void *);
extern void iterate_supers_type(struct file_system_type *,
                                void (*)(struct super_block *, void *), void *);

/*
 * dcache-backed directory operations and the "simple_*" / "noop_*" helper
 * implementations.  Declarations only; the definitions are provided
 * elsewhere.
 */
extern int dcache_dir_open(struct inode *, struct file *);
extern int dcache_dir_close(struct inode *, struct file *);
extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
extern int dcache_readdir(struct file *, struct dir_context *);
extern int simple_setattr(struct dentry *, struct iattr *);
extern int simple_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int simple_statfs(struct dentry *, struct kstatfs *);
extern int simple_open(struct inode *inode, struct file *file);
extern int simple_link(struct dentry *, struct inode *, struct dentry *);
extern int simple_unlink(struct inode *, struct dentry *);
extern int simple_rmdir(struct inode *, struct dentry *);
extern int simple_rename(struct inode *, struct dentry *,
                         struct inode *, struct dentry *, unsigned int);
/* Takes an optional-looking per-dentry @callback -- confirm whether NULL is allowed. */
extern void simple_recursive_removal(struct dentry *,
                              void (*callback)(struct dentry *));
extern int noop_fsync(struct file *, loff_t, loff_t, int);
extern int noop_set_page_dirty(struct page *page);
extern void noop_invalidatepage(struct page *page, unsigned int offset,
                unsigned int length);
extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
extern int simple_empty(struct dentry *);
extern int simple_readpage(struct file *file, struct page *page);
extern int simple_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata);
extern int simple_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata);
extern int always_delete_dentry(const struct dentry *);
extern struct inode *alloc_anon_inode(struct super_block *);
extern int simple_nosetlease(struct file *, long, struct file_lock **, void **);
extern const struct dentry_operations simple_dentry_operations;

extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
extern const struct file_operations simple_dir_operations;
extern const struct inode_operations simple_dir_inode_operations;
extern void make_empty_dir_inode(struct inode *inode);
extern bool is_empty_dir_inode(struct inode *inode);
/* One file description (name, file_operations, mode) as consumed by simple_fill_super(). */
struct tree_descr { const char *name; const struct file_operations *ops; int mode; };
struct dentry *d_alloc_name(struct dentry *, const char *);
extern int simple_fill_super(struct super_block *, unsigned long,
                             const struct tree_descr *);
/* Pin/release pair sharing a caller-owned mount pointer and counter. */
extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
extern void simple_release_fs(struct vfsmount **mount, int *count);

/*
 * Copy between a kernel buffer and a user buffer at position *@ppos:
 * simple_read_from_buffer() has the __user pointer as destination,
 * simple_write_to_buffer() has it as source.
 */
extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
                        loff_t *ppos, const void *from, size_t available);
extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count);

/* Generic fsync implementations taking (file, start, end, datasync). */
extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
extern int generic_file_fsync(struct file *, loff_t, loff_t, int);

extern int generic_check_addressable(unsigned, u64);

/* Dentry hash/compare helpers; only declared when CONFIG_UNICODE is enabled. */
#ifdef CONFIG_UNICODE
extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
                                const char *str, const struct qstr *name);
#endif

/*
 * Page migration helpers.  Without CONFIG_MIGRATION both names expand to
 * NULL, so they can still be used as initialisers for function pointers.
 */
#ifdef CONFIG_MIGRATION
extern int buffer_migrate_page(struct address_space *,
                                struct page *, struct page *,
                                enum migrate_mode);
extern int buffer_migrate_page_norefs(struct address_space *,
                                struct page *, struct page *,
                                enum migrate_mode);
#else
#define buffer_migrate_page NULL
#define buffer_migrate_page_norefs NULL
#endif

/* setattr helpers: validation (returning int) and the copy step (void). */
extern int setattr_prepare(struct dentry *, struct iattr *);
extern int inode_newsize_ok(const struct inode *, loff_t offset);
extern void setattr_copy(struct inode *inode, const struct iattr *attr);

extern int file_update_time(struct file *file);

static inline bool vma_is_dax(const struct vm_area_struct *vma)
{
        return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
}

/*
 * Report whether @vma is a DAX mapping that is not backed by a
 * character-device inode.  Returns false when the VMA has no file or
 * when vma_is_dax() rejects it.
 */
static inline bool vma_is_fsdax(struct vm_area_struct *vma)
{
        if (!vma->vm_file || !vma_is_dax(vma))
                return false;

        /* A character-device inode here means device-dax, not fs-dax. */
        return !S_ISCHR(file_inode(vma->vm_file)->i_mode);
}

/*
 * Build the IOCB_* flag word implied by the open flags of @file.
 *
 * O_APPEND, O_DIRECT and __O_SYNC map one-to-one onto IOCB_APPEND,
 * IOCB_DIRECT and IOCB_SYNC.  IOCB_DSYNC is set either by O_DSYNC or
 * when IS_SYNC() holds for the mapping's host inode.
 */
static inline int iocb_flags(struct file *file)
{
        int flags = 0;

        flags |= (file->f_flags & O_APPEND) ? IOCB_APPEND : 0;
        flags |= (file->f_flags & O_DIRECT) ? IOCB_DIRECT : 0;
        flags |= ((file->f_flags & O_DSYNC) ||
                  IS_SYNC(file->f_mapping->host)) ? IOCB_DSYNC : 0;
        flags |= (file->f_flags & __O_SYNC) ? IOCB_SYNC : 0;

        return flags;
}

/*
 * Translate per-call RWF_* @flags into IOCB_* bits and OR them into
 * @ki->ki_flags.
 *
 * Returns 0 on success (including when @flags is zero, in which case
 * @ki is left untouched), or -EOPNOTSUPP when @flags contains a bit
 * outside RWF_SUPPORTED, or when RWF_NOWAIT is requested on a file
 * whose f_mode lacks FMODE_NOWAIT.  On error @ki is not modified.
 */
static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
{
        int extra;

        /* make sure there's no overlap between RWF and private IOCB flags */
        BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);

        if (!flags)
                return 0;
        if (unlikely(flags & ~RWF_SUPPORTED))
                return -EOPNOTSUPP;
        if ((flags & RWF_NOWAIT) && !(ki->ki_filp->f_mode & FMODE_NOWAIT))
                return -EOPNOTSUPP;

        /* The supported RWF bits are copied through verbatim... */
        extra = (__force int) (flags & RWF_SUPPORTED);
        /* ...and two of them additionally imply a further IOCB bit. */
        if (flags & RWF_NOWAIT)
                extra |= IOCB_NOIO;
        if (flags & RWF_SYNC)
                extra |= IOCB_DSYNC;

        ki->ki_flags |= extra;
        return 0;
}

/*
 * Return the inode number of @dentry's parent directory.
 *
 * The read is done under dentry->d_lock.  The lock may not be strictly
 * necessary: if the parent's inode number could change under us, the
 * caller would presumably already have a deeper race to worry about.
 */
static inline ino_t parent_ino(struct dentry *dentry)
{
        ino_t ino;

        spin_lock(&dentry->d_lock);
        ino = dentry->d_parent->d_inode->i_ino;
        spin_unlock(&dentry->d_lock);

        return ino;
}

/* Transaction based IO helpers */

/*
 * An argresp is stored in an allocated page and holds the
 * size of the argument or response, along with its content
 */
struct simple_transaction_argresp {
        ssize_t size;   /* size of the argument or response held in data[] */
        char data[];    /* flexible array member: the content itself */
};

/* Bytes left for data[] in one page once the argresp header is accounted for. */
#define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))

char *simple_transaction_get(struct file *file, const char __user *buf,
                                size_t size);
ssize_t simple_transaction_read(struct file *file, char __user *buf,
                                size_t size, loff_t *pos);
int simple_transaction_release(struct inode *inode, struct file *file);

void simple_transaction_set(struct file *file, size_t n);

/*
 * simple attribute files
 *
 * These attributes behave similar to those in sysfs:
 *
 * Writing to an attribute immediately sets a value, an open file can be
 * written to multiple times.
 *
 * Reading from an attribute creates a buffer from the value that might get
 * read with multiple read calls. When the attribute has been read
 * completely, no further read calls are possible until the file is opened
 * again.
 *
 * All attributes contain a text representation of a numeric value
 * that are accessed with the get() and set() functions.
 */
/*
 * DEFINE_SIMPLE_ATTRIBUTE_XSIGNED - emit an open handler and a
 * file_operations instance for one simple attribute.
 *
 * @__fops:      name of the generated file_operations object; the open
 *               handler is named <__fops>_open
 * @__get:       getter handed to simple_attr_open()
 * @__set:       setter handed to simple_attr_open()
 * @__fmt:       format string; it is passed to __simple_attr_check_format()
 *               together with a 0ull argument, and to simple_attr_open()
 * @__is_signed: selects simple_attr_write_signed when true, otherwise
 *               simple_attr_write, as the .write handler
 */
#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed)        \
static int __fops ## _open(struct inode *inode, struct file *file)        \
{                                                                        \
        __simple_attr_check_format(__fmt, 0ull);                        \
        return simple_attr_open(inode, file, __get, __set, __fmt);        \
}                                                                        \
static const struct file_operations __fops = {                                \
        .owner         = THIS_MODULE,                                                \
        .open         = __fops ## _open,                                        \
        .release = simple_attr_release,                                        \
        .read         = simple_attr_read,                                        \
        .write         = (__is_signed) ? simple_attr_write_signed : simple_attr_write,        \
        .llseek         = generic_file_llseek,                                        \
}

/* Unsigned flavour: expands the XSIGNED variant with __is_signed = false. */
#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt)                \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false)

/* Signed flavour: expands the XSIGNED variant with __is_signed = true. */
#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt)        \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true)

/*
 * Compile-time format checker: the __printf(1, 2) annotation makes the
 * compiler validate @fmt against the variadic arguments.  The body is
 * intentionally empty, so a call has no runtime effect.
 */
static inline __printf(1, 2)
void __simple_attr_check_format(const char *fmt, ...)
{
        /* don't do anything, just let the compiler check the arguments; */
}

int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt);
int simple_attr_release(struct inode *inode, struct file *file);
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos);
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos);
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                                 size_t len, loff_t *ppos);

struct ctl_table;
int proc_nr_files(struct ctl_table *table, int write,
                  void *buffer, size_t *lenp, loff_t *ppos);
int proc_nr_dentry(struct ctl_table *table, int write,
                  void *buffer, size_t *lenp, loff_t *ppos);
int proc_nr_inodes(struct ctl_table *table, int write,
                   void *buffer, size_t *lenp, loff_t *ppos);
int __init get_filesystem_list(char *buf);

#define __FMODE_EXEC                ((__force int) FMODE_EXEC)
#define __FMODE_NONOTIFY        ((__force int) FMODE_NONOTIFY)

/*
 * Look up the access-mode bits of x in a four-entry octal table:
 * index 0 -> 4, 1 -> 2, 2 -> 6, 3 -> 6.
 * NOTE(review): the values look like read/write permission masks for
 * O_RDONLY / O_WRONLY / O_RDWR - confirm against the callers.
 */
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
/*
 * Derive an fmode_t from open flags: the access-mode bits of (flag + 1)
 * combined with the __FMODE_NONOTIFY bit of flag.
 *
 * Every use of the argument is parenthesized so that an expression
 * argument such as OPEN_FMODE(a | b) is evaluated as (a | b) + 1 and
 * not as a | (b + 1).  The argument is still expanded twice, so it
 * must not have side effects.
 */
#define OPEN_FMODE(flag) ((__force fmode_t)((((flag) + 1) & O_ACCMODE) | \
                                            ((flag) & __FMODE_NONOTIFY)))

/* True when @mode has the set-user-ID or the set-group-ID bit set. */
static inline bool is_sxid(umode_t mode)
{
        return (mode & S_ISUID) || (mode & S_ISGID);
}

/*
 * Run the sticky-bit check for @inode in directory @dir.
 *
 * Returns 0 straight away when @dir does not have S_ISVTX set;
 * otherwise returns whatever __check_sticky() reports.
 */
static inline int check_sticky(struct inode *dir, struct inode *inode)
{
        if (dir->i_mode & S_ISVTX)
                return __check_sticky(dir, inode);

        return 0;
}

/*
 * Set S_NOSEC in @inode->i_flags, but only when the inode is neither
 * setuid nor setgid and its superblock has SB_NOSEC set.
 */
static inline void inode_has_no_xattr(struct inode *inode)
{
        if (is_sxid(inode->i_mode))
                return;
        if (!(inode->i_sb->s_flags & SB_NOSEC))
                return;

        inode->i_flags |= S_NOSEC;
}

/* True when @inode is the inode of its superblock's root dentry. */
static inline bool is_root_inode(struct inode *inode)
{
        const struct inode *root_inode = inode->i_sb->s_root->d_inode;

        return root_inode == inode;
}

/*
 * Hand one directory entry to @ctx->actor at the current @ctx->pos.
 * Returns true when the actor returned 0, false otherwise.
 */
static inline bool dir_emit(struct dir_context *ctx,
                            const char *name, int namelen,
                            u64 ino, unsigned type)
{
        return !ctx->actor(ctx, name, namelen, ctx->pos, ino, type);
}
static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
{
        return ctx->actor(ctx, ".", 1, ctx->pos,
                          file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0;
}
static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
{
        return ctx->actor(ctx, "..", 2, ctx->pos,
                          parent_ino(file->f_path.dentry), DT_DIR) == 0;
}
/*
 * Emit "." and ".." as needed, driven by @ctx->pos: position 0 emits
 * "." and advances to 1, position 1 emits ".." and advances to 2.
 * Returns false as soon as one emit fails (the position is then left
 * where it was), true otherwise.
 */
static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
{
        if (ctx->pos == 0) {
                if (dir_emit_dot(file, ctx))
                        ctx->pos = 1;
                else
                        return false;
        }

        if (ctx->pos == 1) {
                if (dir_emit_dotdot(file, ctx))
                        ctx->pos = 2;
                else
                        return false;
        }

        return true;
}
static inline bool dir_relax(struct inode *inode)
{
        inode_unlock(inode);
        inode_lock(inode);
        return !IS_DEADDIR(inode);
}

static inline bool dir_relax_shared(struct inode *inode)
{
        inode_unlock_shared(inode);
        inode_lock_shared(inode);
        return !IS_DEADDIR(inode);
}

extern bool path_noexec(const struct path *path);
extern void inode_nohighmem(struct inode *inode);

/* mm/fadvise.c */
extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
                       int advice);
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
                           int advice);

int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
                             unsigned int flags);

int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
                             struct fsxattr *fa);

/*
 * Reset @fa to all-zero bytes and then record @xflags in fsx_xflags.
 * The whole object is cleared with memset(), so every other field
 * (and any padding) ends up zero.
 */
static inline void simple_fill_fsxattr(struct fsxattr *fa, __u32 xflags)
{
        memset(fa, 0, sizeof(*fa));

        fa->fsx_xflags = xflags;
}

/*
 * Flush file data before changing attributes.
 *
 * Waits for direct I/O on @inode via inode_dio_wait(), then writes
 * back the inode's mapping and waits for that to finish.  Returns the
 * result of filemap_write_and_wait().
 *
 * The caller must hold whatever locks are required to prevent further
 * writes to this file until the flags have been set.
 */
static inline int inode_drain_writes(struct inode *inode)
{
        int err;

        inode_dio_wait(inode);
        err = filemap_write_and_wait(inode->i_mapping);

        return err;
}

#endif /* _LINUX_FS_H */




















    1 


    1 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_64_H
#define _ASM_X86_PAGE_64_H

#include <asm/page_64_types.h>

#ifndef __ASSEMBLY__
#include <asm/alternative.h>

/* duplicate of the declaration in bootmem.h */
extern unsigned long max_pfn;
extern unsigned long phys_base;

extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;

/*
 * Translate address @x with no sanity checking.
 *
 * The offset of @x from __START_KERNEL_map is computed with unsigned
 * wrap-around.  If the subtraction did not wrap (x > offset), phys_base
 * is added; if it wrapped, i.e. @x was below __START_KERNEL_map,
 * (__START_KERNEL_map - PAGE_OFFSET) is added instead.
 */
static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
        unsigned long offset = x - __START_KERNEL_map;
        unsigned long bias;

        /* x > offset holds exactly when the subtraction did not wrap */
        if (x > offset)
                bias = phys_base;
        else
                bias = __START_KERNEL_map - PAGE_OFFSET;

        return offset + bias;
}

/*
 * With CONFIG_DEBUG_VIRTUAL, __phys_addr() and __phys_addr_symbol() are
 * out-of-line functions declared here.  Without it, __phys_addr() maps
 * to __phys_addr_nodebug() and __phys_addr_symbol() is computed inline
 * as x - __START_KERNEL_map + phys_base.
 */
#ifdef CONFIG_DEBUG_VIRTUAL
extern unsigned long __phys_addr(unsigned long);
extern unsigned long __phys_addr_symbol(unsigned long);
#else
#define __phys_addr(x)                __phys_addr_nodebug(x)
#define __phys_addr_symbol(x) \
        ((unsigned long)(x) - __START_KERNEL_map + phys_base)
#endif

#define __phys_reloc_hide(x)        (x)

#ifdef CONFIG_FLATMEM
#define pfn_valid(pfn)          ((pfn) < max_pfn)
#endif

void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);

/*
 * Clear one page through alternative_call_2(): clear_page_orig is the
 * first (default) target, clear_page_rep is paired with
 * X86_FEATURE_REP_GOOD and clear_page_erms with X86_FEATURE_ERMS.
 * NOTE(review): which variant wins when both features are present is
 * decided by alternative_call_2(), not visible here - confirm there.
 */
static inline void clear_page(void *page)
{
        alternative_call_2(clear_page_orig,
                           clear_page_rep, X86_FEATURE_REP_GOOD,
                           clear_page_erms, X86_FEATURE_ERMS,
                           /* output operand 0: page, "D" register constraint */
                           "=D" (page),
                           /* input: page, tied to the same location as operand 0 */
                           "0" (page)
                           /* clobbers: flags, memory, rax and rcx */
                           : "cc", "memory", "rax", "rcx");
}

void copy_page(void *to, void *from);

#endif        /* !__ASSEMBLY__ */

#ifdef CONFIG_X86_VSYSCALL_EMULATION
# define __HAVE_ARCH_GATE_AREA 1
#endif

#endif /* _ASM_X86_PAGE_64_H */






















































































































































































































































































































































































































































































































































































    1 

























































































































































































































    1 








































































































































































































































































































































































































































































































    1 
    1 
    1 



































































































































































































    1 




    1 
















































    1 
































    1 


    1 




    1 











































    1 











































































































































































































































































































































































































    1 

















































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Security plug functions
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2016 Mellanox Technologies
 */

#define pr_fmt(fmt) "LSM: " fmt

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/lsm_hooks.h>
#include <linux/integrity.h>
#include <linux/ima.h>
#include <linux/evm.h>
#include <linux/fsnotify.h>
#include <linux/mman.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/backing-dev.h>
#include <linux/string.h>
#include <linux/msg.h>
#include <net/flow.h>

#define MAX_LSM_EVM_XATTR        2

/* How many LSMs were built into the kernel? */
#define LSM_COUNT (__end_lsm_info - __start_lsm_info)

/*
 * These are descriptions of the reasons that can be passed to the
 * security_locked_down() LSM hook. Placing this array here allows
 * all security modules to use the same descriptions for auditing
 * purposes.
 */
/*
 * Indexed by lockdown reason.  The array has
 * LOCKDOWN_CONFIDENTIALITY_MAX + 1 slots so that the last designated
 * index below is in range; any index without an initializer is NULL.
 */
const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
        [LOCKDOWN_NONE] = "none",
        [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading",
        [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port",
        [LOCKDOWN_EFI_TEST] = "/dev/efi_test access",
        [LOCKDOWN_KEXEC] = "kexec of unsigned images",
        [LOCKDOWN_HIBERNATION] = "hibernation",
        [LOCKDOWN_PCI_ACCESS] = "direct PCI access",
        [LOCKDOWN_IOPORT] = "raw io port access",
        [LOCKDOWN_MSR] = "raw MSR access",
        [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables",
        [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage",
        [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO",
        [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters",
        [LOCKDOWN_MMIOTRACE] = "unsafe mmio",
        [LOCKDOWN_DEBUGFS] = "debugfs access",
        [LOCKDOWN_XMON_WR] = "xmon write access",
        [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM",
        [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM",
        /*
         * NOTE(review): the entries above appear to be the
         * integrity-level reasons and the ones below the
         * confidentiality-level reasons - confirm against the enum.
         */
        [LOCKDOWN_INTEGRITY_MAX] = "integrity",
        [LOCKDOWN_KCORE] = "/proc/kcore access",
        [LOCKDOWN_KPROBES] = "use of kprobes",
        [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM",
        [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM",
        [LOCKDOWN_PERF] = "unsafe use of perf",
        [LOCKDOWN_TRACEFS] = "use of tracefs",
        [LOCKDOWN_XMON_RW] = "xmon read and write access",
        [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
};

struct security_hook_heads security_hook_heads __lsm_ro_after_init;
static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);

static struct kmem_cache *lsm_file_cache;
static struct kmem_cache *lsm_inode_cache;

char *lsm_names;
static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init;

/* Boot-time LSM user choice */
static __initdata const char *chosen_lsm_order;
static __initdata const char *chosen_major_lsm;

static __initconst const char * const builtin_lsm_order = CONFIG_LSM;

/* Ordered list of LSMs to initialize. */
static __initdata struct lsm_info **ordered_lsms;
static __initdata struct lsm_info *exclusive;

static __initdata bool debug;
/*
 * pr_info() that only fires when the init-time "debug" flag is set.
 * Wrapped in do { } while (0) so it can be used as a single statement.
 */
#define init_debug(...)                                                \
        do {                                                        \
                if (debug)                                        \
                        pr_info(__VA_ARGS__);                        \
        } while (0)

/*
 * Report an LSM's enabled state.  An LSM with no enabled pointer
 * counts as disabled; otherwise the pointed-to flag decides.
 */
static bool __init is_enabled(struct lsm_info *lsm)
{
        return lsm->enabled && *lsm->enabled;
}

/*
 * Shared storage for the enabled state of LSMs that did not supply an
 * enable variable of their own.
 */
static int lsm_enabled_true __initdata = 1;
static int lsm_enabled_false __initdata = 0;

/*
 * Mark an LSM's enabled flag.
 *
 * If the LSM has its own enable variable, the new state is written
 * through the pointer.  If it has none, or already points at one of the
 * two shared flags above, the pointer itself is switched to the shared
 * flag that matches @enabled, so the shared flags are never written.
 */
static void __init set_enabled(struct lsm_info *lsm, bool enabled)
{
        bool uses_private_flag = lsm->enabled &&
                                 lsm->enabled != &lsm_enabled_true &&
                                 lsm->enabled != &lsm_enabled_false;

        if (uses_private_flag) {
                *lsm->enabled = enabled;
                return;
        }

        lsm->enabled = enabled ? &lsm_enabled_true : &lsm_enabled_false;
}

/*
 * Is an LSM already listed in the ordered LSMs list?
 * Walks ordered_lsms up to its NULL terminator.
 */
static bool __init exists_ordered_lsm(struct lsm_info *lsm)
{
        struct lsm_info **slot = ordered_lsms;

        while (*slot) {
                if (*slot == lsm)
                        return true;
                slot++;
        }

        return false;
}

/* Index of the next free slot in ordered_lsms. */
static int last_lsm __initdata;

/*
 * Append an LSM to the list of ordered LSMs to initialize.
 *
 * @from names the source of the selection and is used only in the
 * warning and debug messages.  A duplicate is silently ignored, and a
 * warning is raised (nothing is appended) once all LSM_COUNT slots are
 * in use.
 */
static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
{
        /* Ignore duplicate selections. */
        if (exists_ordered_lsm(lsm))
                return;

        if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from))
                return;

        /* An LSM with no enable state yet defaults to enabled. */
        if (lsm->enabled == NULL)
                lsm->enabled = &lsm_enabled_true;

        ordered_lsms[last_lsm] = lsm;
        last_lsm++;

        init_debug("%s ordering: %s (%sabled)\n", from, lsm->name,
                   is_enabled(lsm) ? "en" : "dis");
}

/*
 * Is an LSM allowed to be initialized?
 *
 * A disabled LSM is never allowed.  An LSM flagged LSM_FLAG_EXCLUSIVE
 * is refused (with a debug message) once another exclusive LSM has
 * been recorded in "exclusive".
 */
static bool __init lsm_allowed(struct lsm_info *lsm)
{
        bool exclusive_taken;

        if (!is_enabled(lsm))
                return false;

        exclusive_taken = (lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive;
        if (!exclusive_taken)
                return true;

        init_debug("exclusive disabled: %s\n", lsm->name);
        return false;
}

/*
 * Reserve room for one module inside a shared blob.
 *
 * On entry *need is the number of bytes the module wants and *lbs is the
 * running total for the blob.  On return *need holds the module's offset
 * within the blob and *lbs has grown by the requested size.  Requests of
 * zero or less leave both values untouched.
 */
static void __init lsm_set_blob_size(int *need, int *lbs)
{
        int start;

        if (*need <= 0)
                return;

        start = *lbs;
        *lbs = start + *need;
        *need = start;
}

/*
 * Fold one module's blob requirements into the global blob_sizes totals,
 * turning each requested size in @needed into that module's offset.
 * A NULL @needed means the module uses no blobs.
 */
static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
{
        struct lsm_blob_sizes *total = &blob_sizes;

        if (needed == NULL)
                return;

        lsm_set_blob_size(&needed->lbs_cred, &total->lbs_cred);
        lsm_set_blob_size(&needed->lbs_file, &total->lbs_file);

        /*
         * The first module that wants inode space also causes an
         * rcu_head to be reserved at the very start of the inode blob.
         */
        if (total->lbs_inode == 0 && needed->lbs_inode)
                total->lbs_inode = sizeof(struct rcu_head);
        lsm_set_blob_size(&needed->lbs_inode, &total->lbs_inode);

        lsm_set_blob_size(&needed->lbs_ipc, &total->lbs_ipc);
        lsm_set_blob_size(&needed->lbs_msg_msg, &total->lbs_msg_msg);
        lsm_set_blob_size(&needed->lbs_task, &total->lbs_task);
}

/*
 * Pre-initialization step for one LSM: settle whether it will run, claim
 * the exclusive slot if applicable, and account for its blob space.
 */
static void __init prepare_lsm(struct lsm_info *lsm)
{
        bool allowed = lsm_allowed(lsm);

        /* Store the verdict so later exclusive LSMs see it. */
        set_enabled(lsm, allowed);
        if (!allowed)
                return;

        if (!exclusive && (lsm->flags & LSM_FLAG_EXCLUSIVE)) {
                exclusive = lsm;
                init_debug("exclusive chosen: %s\n", lsm->name);
        }

        lsm_set_blob_sizes(lsm->blobs);
}

/*
 * Call the init routine of @lsm when it is enabled.  A non-zero result
 * from the routine triggers a warning but is otherwise ignored.
 */
static void __init initialize_lsm(struct lsm_info *lsm)
{
        int ret;

        if (!is_enabled(lsm))
                return;

        init_debug("initializing %s\n", lsm->name);
        ret = lsm->init();
        WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret);
}

/*
 * Populate the ordered LSMs list from a comma-separated LSM name list.
 *
 * @order:  comma-separated list of LSM names, in the desired order
 * @origin: label for where @order came from; used only in debug messages
 *
 * Steps, in order:
 *  1. every LSM_ORDER_FIRST LSM is appended;
 *  2. if "security=" was given, all other legacy major LSMs are disabled;
 *  3. each name in @order that matches a LSM_ORDER_MUTABLE LSM is appended;
 *  4. if "security=" was given and its LSM is not yet listed, it is appended;
 *  5. every LSM that did not make it into the list is disabled.
 *
 * @order is duplicated because strsep() modifies the string it walks.
 */
static void __init ordered_lsm_parse(const char *order, const char *origin)
{
        struct lsm_info *lsm;
        char *sep, *name, *next;

        /* LSM_ORDER_FIRST is always first. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order == LSM_ORDER_FIRST)
                        append_ordered_lsm(lsm, "first");
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                struct lsm_info *major;

                /*
                 * To match the original "security=" behavior, this
                 * explicitly does NOT fallback to another Legacy Major
                 * if the selected one was separately disabled: disable
                 * all non-matching Legacy Major LSMs.
                 */
                for (major = __start_lsm_info; major < __end_lsm_info;
                     major++) {
                        if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
                            strcmp(major->name, chosen_major_lsm) != 0) {
                                set_enabled(major, false);
                                init_debug("security=%s disabled: %s\n",
                                           chosen_major_lsm, major->name);
                        }
                }
        }

        sep = kstrdup(order, GFP_KERNEL);
        /*
         * Without the copy the walk below would be skipped and every
         * mutable LSM would be disabled without any indication.  Stop
         * instead, as the other early allocation failures here do.
         */
        if (!sep)
                panic("%s: Cannot duplicate LSM order list.\n", __func__);
        next = sep;
        /* Walk the list, looking for matching LSMs. */
        while ((name = strsep(&next, ",")) != NULL) {
                bool found = false;

                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (lsm->order == LSM_ORDER_MUTABLE &&
                            strcmp(lsm->name, name) == 0) {
                                append_ordered_lsm(lsm, origin);
                                found = true;
                        }
                }

                if (!found)
                        init_debug("%s ignored: %s\n", origin, name);
        }

        /* Append the "security=" LSM if the list did not already name it. */
        if (chosen_major_lsm) {
                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (exists_ordered_lsm(lsm))
                                continue;
                        if (strcmp(lsm->name, chosen_major_lsm) == 0)
                                append_ordered_lsm(lsm, "security=");
                }
        }

        /* Disable all LSMs not in the ordered list. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (exists_ordered_lsm(lsm))
                        continue;
                set_enabled(lsm, false);
                init_debug("%s disabled: %s\n", origin, lsm->name);
        }

        kfree(sep);
}

/*
 * Forward declarations: these helpers are defined further down in the file
 * but are already called by ordered_lsm_init() and security_init() below.
 */
static void __init lsm_early_cred(struct cred *cred);
static void __init lsm_early_task(struct task_struct *task);

static int lsm_append(const char *new, char **result);

/*
 * Build the ordered LSM list and bring up every LSM in it.
 *
 * The order comes from "lsm=" when given (in which case "security=" is
 * dropped with a notice), otherwise from the built-in order.  Each listed
 * LSM is first prepared, which fixes its enabled state and blob offsets.
 * Then the file and inode blob caches are created if needed, the current
 * task and its cred get their blobs, and finally each LSM's init routine
 * is run.  The temporary list is freed before returning.
 */
static void __init ordered_lsm_init(void)
{
        struct lsm_info **lsm;

        /* One extra zeroed slot acts as the list terminator. */
        ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms),
                                GFP_KERNEL);
        /* Every helper below dereferences the list; fail clearly instead. */
        if (!ordered_lsms)
                panic("%s: Cannot allocate ordered LSM list.\n", __func__);

        if (chosen_lsm_order) {
                if (chosen_major_lsm) {
                        pr_info("security= is ignored because it is superseded by lsm=\n");
                        chosen_major_lsm = NULL;
                }
                ordered_lsm_parse(chosen_lsm_order, "cmdline");
        } else
                ordered_lsm_parse(builtin_lsm_order, "builtin");

        for (lsm = ordered_lsms; *lsm; lsm++)
                prepare_lsm(*lsm);

        init_debug("cred blob size     = %d\n", blob_sizes.lbs_cred);
        init_debug("file blob size     = %d\n", blob_sizes.lbs_file);
        init_debug("inode blob size    = %d\n", blob_sizes.lbs_inode);
        init_debug("ipc blob size      = %d\n", blob_sizes.lbs_ipc);
        init_debug("msg_msg blob size  = %d\n", blob_sizes.lbs_msg_msg);
        init_debug("task blob size     = %d\n", blob_sizes.lbs_task);

        /*
         * Create any kmem_caches needed for blobs
         */
        if (blob_sizes.lbs_file)
                lsm_file_cache = kmem_cache_create("lsm_file_cache",
                                                   blob_sizes.lbs_file, 0,
                                                   SLAB_PANIC, NULL);
        if (blob_sizes.lbs_inode)
                lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
                                                    blob_sizes.lbs_inode, 0,
                                                    SLAB_PANIC, NULL);

        /* Blobs for the running task must exist before any LSM init runs. */
        lsm_early_cred((struct cred *) current->cred);
        lsm_early_task(current);
        for (lsm = ordered_lsms; *lsm; lsm++)
                initialize_lsm(*lsm);

        kfree(ordered_lsms);
}

/*
 * Set up the hook lists and bring up the early LSMs.
 *
 * security_hook_heads is treated as a flat array of hlist_head and every
 * entry is initialized.  Each LSM in the early section is then defaulted
 * to enabled if it has no enable variable, prepared and initialized.
 *
 * Always returns 0.
 */
int __init early_security_init(void)
{
        struct hlist_head *heads = (struct hlist_head *) &security_hook_heads;
        const size_t nr_heads = sizeof(security_hook_heads) /
                                sizeof(struct hlist_head);
        struct lsm_info *early;
        size_t n;

        for (n = 0; n < nr_heads; n++)
                INIT_HLIST_HEAD(&heads[n]);

        early = __start_early_lsm_info;
        while (early < __end_early_lsm_info) {
                if (early->enabled == NULL)
                        early->enabled = &lsm_enabled_true;
                prepare_lsm(early);
                initialize_lsm(early);
                early++;
        }

        return 0;
}

/**
 * security_init - initializes the security framework
 *
 * Records the names of the early LSMs that are actually enabled, then
 * loads the remaining LSMs in their configured order.
 *
 * This should be called early in the kernel initialization sequence.
 *
 * Return: always 0.
 */
int __init security_init(void)
{
        struct lsm_info *lsm;

        pr_info("Security Framework initializing\n");

        /*
         * Append the names of the early LSM modules now that kmalloc() is
         * available.
         *
         * early_security_init() gives every early LSM a non-NULL ->enabled
         * pointer, so testing the pointer would list disabled LSMs as
         * well; is_enabled() looks at the value behind it.  A failed
         * append is not treated as fatal here.
         */
        for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
                if (is_enabled(lsm))
                        lsm_append(lsm->name, &lsm_names);
        }

        /* Load LSMs in specified order. */
        ordered_lsm_init();

        return 0;
}

/*
 * Handler for the "security=" boot parameter.
 *
 * Stores the pointer to the parameter value (not a copy) in
 * chosen_major_lsm; it is consumed later by the LSM ordering code.
 * Always returns 1.
 */
static int __init choose_major_lsm(char *str)
{
        chosen_major_lsm = str;
        return 1;
}
__setup("security=", choose_major_lsm);

/*
 * Handler for the "lsm=" boot parameter: explicitly choose LSM
 * initialization order.
 *
 * Stores the pointer to the parameter value (not a copy) in
 * chosen_lsm_order.  When set, ordered_lsm_init() uses it in place of the
 * built-in order.  Always returns 1.
 */
static int __init choose_lsm_order(char *str)
{
        chosen_lsm_order = str;
        return 1;
}
__setup("lsm=", choose_lsm_order);

/*
 * Handler for the "lsm.debug" boot parameter: enable LSM order debugging.
 *
 * Sets the debug flag; @str is not used.  Always returns 1.
 */
static int __init enable_debug(char *str)
{
        debug = true;
        return 1;
}
__setup("lsm.debug", enable_debug);

/*
 * Check whether @lsm equals the final entry of the comma-separated @list.
 *
 * Warns and returns false when either argument is NULL.  The final entry
 * is whatever follows the last comma, or the whole string if there is no
 * comma.
 */
static bool match_last_lsm(const char *list, const char *lsm)
{
        const char *comma, *tail;

        if (WARN_ON(!list || !lsm))
                return false;

        comma = strrchr(list, ',');
        /* Step over the comma; an empty tail is handled by strcmp(). */
        tail = comma ? comma + 1 : list;

        return strcmp(tail, lsm) == 0;
}

/*
 * Append the name @new to the comma-separated string held in *@result.
 *
 * An empty (NULL) *@result becomes a copy of @new.  If @new is already the
 * last name in the list nothing changes.  Otherwise a new "old,new" string
 * replaces *@result and the old string is freed.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails, in which case a
 * previously existing *@result is left untouched.
 */
static int lsm_append(const char *new, char **result)
{
        char *joined;

        if (!*result) {
                *result = kstrdup(new, GFP_KERNEL);
                return *result ? 0 : -ENOMEM;
        }

        /* Check if it is the last registered name */
        if (match_last_lsm(*result, new))
                return 0;

        joined = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
        if (!joined)
                return -ENOMEM;

        kfree(*result);
        *result = joined;
        return 0;
}

/**
 * security_add_hooks - Add a modules hooks to the hook lists.
 * @hooks: the hooks to add
 * @count: the number of hooks to add
 * @lsm: the name of the security module
 *
 * Each LSM has to register its hooks with the infrastructure.  Every entry
 * is tagged with @lsm and added to the tail of the list its ->head points
 * at.  Once the slab allocator is available the name is also appended to
 * lsm_names; failure to do so panics.
 */
void __init security_add_hooks(struct security_hook_list *hooks, int count,
                                char *lsm)
{
        int i;

        for (i = 0; i < count; i++) {
                struct security_hook_list *entry = &hooks[i];

                entry->lsm = lsm;
                hlist_add_tail_rcu(&entry->list, entry->head);
        }

        /*
         * Don't try to append during early_security_init(), we'll come back
         * and fix this up afterwards.
         */
        if (!slab_is_available())
                return;

        if (lsm_append(lsm, &lsm_names) < 0)
                panic("%s - Cannot get early memory.\n", __func__);
}

/**
 * call_blocking_lsm_notifier - run the blocking LSM notifier chain
 * @event: event passed to each notifier
 * @data: opaque pointer passed to each notifier
 *
 * Return: the result of blocking_notifier_call_chain().
 */
int call_blocking_lsm_notifier(enum lsm_event event, void *data)
{
        int rc;

        rc = blocking_notifier_call_chain(&blocking_lsm_notifier_chain,
                                          event, data);
        return rc;
}
EXPORT_SYMBOL(call_blocking_lsm_notifier);

/**
 * register_blocking_lsm_notifier - add a notifier to the blocking LSM chain
 * @nb: notifier block to add
 *
 * Return: the result of blocking_notifier_chain_register().
 */
int register_blocking_lsm_notifier(struct notifier_block *nb)
{
        int rc;

        rc = blocking_notifier_chain_register(&blocking_lsm_notifier_chain,
                                              nb);
        return rc;
}
EXPORT_SYMBOL(register_blocking_lsm_notifier);

/**
 * unregister_blocking_lsm_notifier - remove a notifier from the blocking
 *                                    LSM chain
 * @nb: notifier block to remove
 *
 * Return: the result of blocking_notifier_chain_unregister().
 */
int unregister_blocking_lsm_notifier(struct notifier_block *nb)
{
        int rc;

        rc = blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain,
                                                nb);
        return rc;
}
EXPORT_SYMBOL(unregister_blocking_lsm_notifier);

/**
 * lsm_cred_alloc - allocate a composite cred blob
 * @cred: the cred that needs a blob
 * @gfp: allocation type
 *
 * Allocate one zeroed blob, sized for all the modules, and attach it to
 * @cred.  When no module uses cred space, @cred->security is set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
{
        if (!blob_sizes.lbs_cred) {
                cred->security = NULL;
                return 0;
        }

        cred->security = kzalloc(blob_sizes.lbs_cred, gfp);
        return cred->security ? 0 : -ENOMEM;
}

/**
 * lsm_early_cred - during initialization allocate a composite cred blob
 * @cred: the cred that needs a blob
 *
 * Allocate the cred blob for all the modules with GFP_KERNEL; panics if
 * the allocation fails.
 */
static void __init lsm_early_cred(struct cred *cred)
{
        if (lsm_cred_alloc(cred, GFP_KERNEL))
                panic("%s: Early cred alloc failed.\n", __func__);
}

/**
 * lsm_file_alloc - allocate a composite file blob
 * @file: the file that needs a blob
 *
 * Allocate one zeroed blob from lsm_file_cache and attach it to @file.
 * When the cache was never created, @file->f_security is set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_file_alloc(struct file *file)
{
        if (lsm_file_cache) {
                file->f_security = kmem_cache_zalloc(lsm_file_cache,
                                                     GFP_KERNEL);
                return file->f_security ? 0 : -ENOMEM;
        }

        file->f_security = NULL;
        return 0;
}

/**
 * lsm_inode_alloc - allocate a composite inode blob
 * @inode: the inode that needs a blob
 *
 * Allocate one zeroed blob from lsm_inode_cache (using GFP_NOFS) and
 * attach it to @inode.  When the cache was never created,
 * @inode->i_security is set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
int lsm_inode_alloc(struct inode *inode)
{
        if (lsm_inode_cache) {
                inode->i_security = kmem_cache_zalloc(lsm_inode_cache,
                                                      GFP_NOFS);
                return inode->i_security ? 0 : -ENOMEM;
        }

        inode->i_security = NULL;
        return 0;
}

/**
 * lsm_task_alloc - allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate one zeroed blob, sized for all the modules, and attach it to
 * @task.  When no module uses task space, @task->security is set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_task_alloc(struct task_struct *task)
{
        if (!blob_sizes.lbs_task) {
                task->security = NULL;
                return 0;
        }

        task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
        return task->security ? 0 : -ENOMEM;
}

/**
 * lsm_ipc_alloc - allocate a composite ipc blob
 * @kip: the ipc that needs a blob
 *
 * Allocate one zeroed blob, sized for all the modules, and attach it to
 * @kip.  When no module uses ipc space, @kip->security is set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
{
        if (!blob_sizes.lbs_ipc) {
                kip->security = NULL;
                return 0;
        }

        kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL);
        return kip->security ? 0 : -ENOMEM;
}

/**
 * lsm_msg_msg_alloc - allocate a composite msg_msg blob
 * @mp: the msg_msg that needs a blob
 *
 * Allocate one zeroed msg_msg blob, sized for all the modules, and attach
 * it to @mp.  When no module uses msg_msg space, @mp->security is set to
 * NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_msg_msg_alloc(struct msg_msg *mp)
{
        if (!blob_sizes.lbs_msg_msg) {
                mp->security = NULL;
                return 0;
        }

        mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL);
        return mp->security ? 0 : -ENOMEM;
}

/**
 * lsm_early_task - during initialization allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate the task blob for all the modules; panics if the allocation
 * fails.
 */
static void __init lsm_early_task(struct task_struct *task)
{
        if (lsm_task_alloc(task))
                panic("%s: Early task alloc failed.\n", __func__);
}

/*
 * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
 * can be accessed with:
 *
 *        LSM_RET_DEFAULT(<hook_name>)
 *
 * The macros below define static constants for the default value of each
 * LSM hook.
 *
 * LSM_HOOK() is defined so that including linux/lsm_hook_defs.h expands,
 * for every hook whose RET is "int", to
 *
 *        static const int <hook_name>_default = (DEFAULT);
 *
 * and to nothing for hooks whose RET is "void".  LSM_HOOK is undefined
 * again right after the include.
 */
#define LSM_RET_DEFAULT(NAME) (NAME##_default)
#define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME)
#define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \
        static const int LSM_RET_DEFAULT(NAME) = (DEFAULT);
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME)

#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/*
 * Hook list operation macros.
 *
 * call_void_hook:
 *        This is a hook that does not return a value.
 *
 * call_int_hook:
 *        This is a hook that returns a value.
 */

/*
 * call_void_hook(FUNC, args...): walk security_hook_heads.FUNC and call
 * every registered FUNC hook with the given arguments, in list order.
 * Nothing is returned and the walk never stops early.
 */
#define call_void_hook(FUNC, ...)                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \
                        P->hook.FUNC(__VA_ARGS__);                \
        } while (0)

/*
 * call_int_hook(FUNC, IRC, args...): walk security_hook_heads.FUNC and call
 * each registered FUNC hook with the given arguments, in list order,
 * stopping at the first hook that returns non-zero.
 *
 * The expression evaluates to:
 *   - IRC when no hook is registered;
 *   - the first non-zero hook return value, if any;
 *   - 0 when every registered hook returned 0.
 */
#define call_int_hook(FUNC, IRC, ...) ({                        \
        int RC = IRC;                                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \
                        RC = P->hook.FUNC(__VA_ARGS__);                \
                        if (RC != 0)                                \
                                break;                                \
                }                                                \
        } while (0);                                                \
        RC;                                                        \
})

/* Security operations */

/**
 * security_binder_set_context_mgr - run the binder_set_context_mgr hooks
 * @mgr: cred passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_binder_set_context_mgr(const struct cred *mgr)
{
        int rc;

        rc = call_int_hook(binder_set_context_mgr, 0, mgr);
        return rc;
}

/**
 * security_binder_transaction - run the binder_transaction hooks
 * @from: first cred passed to each hook
 * @to: second cred passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_binder_transaction(const struct cred *from,
                                const struct cred *to)
{
        int rc;

        rc = call_int_hook(binder_transaction, 0, from, to);
        return rc;
}

/**
 * security_binder_transfer_binder - run the binder_transfer_binder hooks
 * @from: first cred passed to each hook
 * @to: second cred passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_binder_transfer_binder(const struct cred *from,
                                    const struct cred *to)
{
        int rc;

        rc = call_int_hook(binder_transfer_binder, 0, from, to);
        return rc;
}

/**
 * security_binder_transfer_file - run the binder_transfer_file hooks
 * @from: first cred passed to each hook
 * @to: second cred passed to each hook
 * @file: file passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_binder_transfer_file(const struct cred *from,
                                  const struct cred *to, struct file *file)
{
        int rc;

        rc = call_int_hook(binder_transfer_file, 0, from, to, file);
        return rc;
}

/**
 * security_ptrace_access_check - run the ptrace_access_check hooks
 * @child: task passed to each hook
 * @mode: mode value passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        int rc;

        rc = call_int_hook(ptrace_access_check, 0, child, mode);
        return rc;
}

/**
 * security_ptrace_traceme - run the ptrace_traceme hooks
 * @parent: task passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_ptrace_traceme(struct task_struct *parent)
{
        int rc;

        rc = call_int_hook(ptrace_traceme, 0, parent);
        return rc;
}

/**
 * security_capget - run the capget hooks
 * @target: task passed to each hook
 * @effective: capability set pointer passed to each hook
 * @inheritable: capability set pointer passed to each hook
 * @permitted: capability set pointer passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_capget(struct task_struct *target,
                     kernel_cap_t *effective,
                     kernel_cap_t *inheritable,
                     kernel_cap_t *permitted)
{
        int rc;

        rc = call_int_hook(capget, 0, target,
                           effective, inheritable, permitted);
        return rc;
}

/**
 * security_capset - run the capset hooks
 * @new: cred passed to each hook
 * @old: cred passed to each hook
 * @effective: capability set pointer passed to each hook
 * @inheritable: capability set pointer passed to each hook
 * @permitted: capability set pointer passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_capset(struct cred *new, const struct cred *old,
                    const kernel_cap_t *effective,
                    const kernel_cap_t *inheritable,
                    const kernel_cap_t *permitted)
{
        int rc;

        rc = call_int_hook(capset, 0, new, old,
                           effective, inheritable, permitted);
        return rc;
}

/**
 * security_capable - run the capable hooks
 * @cred: cred passed to each hook
 * @ns: user namespace passed to each hook
 * @cap: capability number passed to each hook
 * @opts: option bits passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_capable(const struct cred *cred,
                     struct user_namespace *ns,
                     int cap,
                     unsigned int opts)
{
        int rc;

        rc = call_int_hook(capable, 0, cred, ns, cap, opts);
        return rc;
}

/**
 * security_quotactl - run the quotactl hooks
 * @cmds: passed to each hook
 * @type: passed to each hook
 * @id: passed to each hook
 * @sb: superblock passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_quotactl(int cmds, int type, int id, struct super_block *sb)
{
        int rc;

        rc = call_int_hook(quotactl, 0, cmds, type, id, sb);
        return rc;
}

/**
 * security_quota_on - run the quota_on hooks
 * @dentry: dentry passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_quota_on(struct dentry *dentry)
{
        int rc;

        rc = call_int_hook(quota_on, 0, dentry);
        return rc;
}

/**
 * security_syslog - run the syslog hooks
 * @type: value passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_syslog(int type)
{
        int rc;

        rc = call_int_hook(syslog, 0, type);
        return rc;
}

/**
 * security_settime64 - run the settime hooks
 * @ts: timespec passed to each hook
 * @tz: timezone passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_settime64(const struct timespec64 *ts, const struct timezone *tz)
{
        int rc;

        rc = call_int_hook(settime, 0, ts, tz);
        return rc;
}

/**
 * security_vm_enough_memory_mm - poll the LSMs, then call __vm_enough_memory()
 * @mm: mm passed to each hook and to __vm_enough_memory()
 * @pages: page count passed to each hook and to __vm_enough_memory()
 *
 * Each vm_enough_memory hook answers with a positive value when it thinks
 * __vm_enough_memory() should be called with cap_sys_admin set.  The flag
 * stays set only if every hook agrees; the first hook that returns zero or
 * a negative value clears it and ends the poll.  With no hooks registered
 * the flag is set.
 *
 * Return: the result of __vm_enough_memory().
 */
int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
{
        struct security_hook_list *hp;
        int cap_sys_admin = 1;

        hlist_for_each_entry(hp, &security_hook_heads.vm_enough_memory, list) {
                if (hp->hook.vm_enough_memory(mm, pages) > 0)
                        continue;

                /* One dissenting module is enough to clear the flag. */
                cap_sys_admin = 0;
                break;
        }

        return __vm_enough_memory(mm, pages, cap_sys_admin);
}

/**
 * security_bprm_creds_for_exec - run the bprm_creds_for_exec hooks
 * @bprm: binprm passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        int rc;

        rc = call_int_hook(bprm_creds_for_exec, 0, bprm);
        return rc;
}

/**
 * security_bprm_creds_from_file - run the bprm_creds_from_file hooks
 * @bprm: binprm passed to each hook
 * @file: file passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
{
        int rc;

        rc = call_int_hook(bprm_creds_from_file, 0, bprm, file);
        return rc;
}

/**
 * security_bprm_check - run the bprm_check_security hooks, then IMA
 * @bprm: binprm passed to each hook and to ima_bprm_check()
 *
 * ima_bprm_check() is only consulted when no hook returned non-zero.
 *
 * Return: the first non-zero hook result, otherwise the result of
 * ima_bprm_check().
 */
int security_bprm_check(struct linux_binprm *bprm)
{
        int ret = call_int_hook(bprm_check_security, 0, bprm);

        return ret ? ret : ima_bprm_check(bprm);
}

/**
 * security_bprm_committing_creds - run the bprm_committing_creds hooks
 * @bprm: binprm passed to each hook
 *
 * Every registered hook is called; nothing is returned.
 */
void security_bprm_committing_creds(struct linux_binprm *bprm)
{
        call_void_hook(bprm_committing_creds, bprm);
}

/**
 * security_bprm_committed_creds - run the bprm_committed_creds hooks
 * @bprm: binprm passed to each hook
 *
 * Every registered hook is called; nothing is returned.
 */
void security_bprm_committed_creds(struct linux_binprm *bprm)
{
        call_void_hook(bprm_committed_creds, bprm);
}

/**
 * security_fs_context_dup - run the fs_context_dup hooks
 * @fc: first fs_context passed to each hook
 * @src_fc: second fs_context passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
        int rc;

        rc = call_int_hook(fs_context_dup, 0, fc, src_fc);
        return rc;
}

/**
 * security_fs_context_parse_param - offer a mount parameter to every LSM
 * @fc: fs_context passed to each hook
 * @param: parameter passed to each hook
 *
 * Unlike call_int_hook(), a hook returning 0 does not end the walk and a
 * hook returning -ENOPARAM is skipped.  Any other hook result ends the
 * walk and is returned as is.
 *
 * Return: 0 if at least one hook returned 0 and none returned anything
 * other than 0 or -ENOPARAM; -ENOPARAM if no hook returned 0 (including
 * when none is registered); otherwise the first other hook result.
 */
int security_fs_context_parse_param(struct fs_context *fc,
                                    struct fs_parameter *param)
{
        struct security_hook_list *hp;
        int rc = -ENOPARAM;

        hlist_for_each_entry(hp, &security_hook_heads.fs_context_parse_param,
                             list) {
                int trc = hp->hook.fs_context_parse_param(fc, param);

                if (trc == -ENOPARAM)
                        continue;
                if (trc != 0)
                        return trc;
                rc = 0;
        }

        return rc;
}

/**
 * security_sb_alloc - run the sb_alloc_security hooks
 * @sb: superblock passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_alloc(struct super_block *sb)
{
        int rc;

        rc = call_int_hook(sb_alloc_security, 0, sb);
        return rc;
}

/**
 * security_sb_free - run the sb_free_security hooks
 * @sb: superblock passed to each hook
 *
 * Every registered hook is called; nothing is returned.
 */
void security_sb_free(struct super_block *sb)
{
        call_void_hook(sb_free_security, sb);
}

/**
 * security_free_mnt_opts - release LSM mount options
 * @mnt_opts: address of the mount options pointer
 *
 * When *@mnt_opts is non-NULL, it is passed to every sb_free_mnt_opts hook
 * and then reset to NULL.  A NULL *@mnt_opts makes this a no-op.
 */
void security_free_mnt_opts(void **mnt_opts)
{
        if (*mnt_opts) {
                call_void_hook(sb_free_mnt_opts, *mnt_opts);
                *mnt_opts = NULL;
        }
}
EXPORT_SYMBOL(security_free_mnt_opts);

/**
 * security_sb_eat_lsm_opts - run the sb_eat_lsm_opts hooks
 * @options: option string passed to each hook
 * @mnt_opts: address of the mount options pointer, passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        int rc;

        rc = call_int_hook(sb_eat_lsm_opts, 0, options, mnt_opts);
        return rc;
}
EXPORT_SYMBOL(security_sb_eat_lsm_opts);

/**
 * security_sb_remount - run the sb_remount hooks
 * @sb: superblock passed to each hook
 * @mnt_opts: mount options passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_remount(struct super_block *sb,
                        void *mnt_opts)
{
        int rc;

        rc = call_int_hook(sb_remount, 0, sb, mnt_opts);
        return rc;
}
EXPORT_SYMBOL(security_sb_remount);

/**
 * security_sb_kern_mount - run the sb_kern_mount hooks
 * @sb: superblock passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_kern_mount(struct super_block *sb)
{
        int rc;

        rc = call_int_hook(sb_kern_mount, 0, sb);
        return rc;
}

/**
 * security_sb_show_options - run the sb_show_options hooks
 * @m: seq_file passed to each hook
 * @sb: superblock passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        int rc;

        rc = call_int_hook(sb_show_options, 0, m, sb);
        return rc;
}

/**
 * security_sb_statfs - run the sb_statfs hooks
 * @dentry: dentry passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_statfs(struct dentry *dentry)
{
        int rc;

        rc = call_int_hook(sb_statfs, 0, dentry);
        return rc;
}

/**
 * security_sb_mount - run the sb_mount hooks
 * @dev_name: passed to each hook
 * @path: passed to each hook
 * @type: passed to each hook
 * @flags: passed to each hook
 * @data: passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_mount(const char *dev_name, const struct path *path,
                       const char *type, unsigned long flags, void *data)
{
        int rc;

        rc = call_int_hook(sb_mount, 0, dev_name, path, type, flags, data);
        return rc;
}

/**
 * security_sb_umount - run the sb_umount hooks
 * @mnt: mount passed to each hook
 * @flags: flags passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_umount(struct vfsmount *mnt, int flags)
{
        int rc;

        rc = call_int_hook(sb_umount, 0, mnt, flags);
        return rc;
}

/**
 * security_sb_pivotroot - run the sb_pivotroot hooks
 * @old_path: first path passed to each hook
 * @new_path: second path passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_pivotroot(const struct path *old_path, const struct path *new_path)
{
        int rc;

        rc = call_int_hook(sb_pivotroot, 0, old_path, new_path);
        return rc;
}

/**
 * security_sb_set_mnt_opts - run the sb_set_mnt_opts hooks
 * @sb: superblock passed to each hook
 * @mnt_opts: mount options passed to each hook
 * @kern_flags: passed to each hook
 * @set_kern_flags: passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0.
 * When no hook is registered the result is -EOPNOTSUPP if @mnt_opts is
 * non-NULL and 0 otherwise.
 */
int security_sb_set_mnt_opts(struct super_block *sb,
                                void *mnt_opts,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        int rc;

        rc = call_int_hook(sb_set_mnt_opts,
                           mnt_opts ? -EOPNOTSUPP : 0, sb,
                           mnt_opts, kern_flags, set_kern_flags);
        return rc;
}
EXPORT_SYMBOL(security_sb_set_mnt_opts);

/**
 * security_sb_clone_mnt_opts - run the sb_clone_mnt_opts hooks
 * @oldsb: first superblock passed to each hook
 * @newsb: second superblock passed to each hook
 * @kern_flags: passed to each hook
 * @set_kern_flags: passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_sb_clone_mnt_opts(const struct super_block *oldsb,
                                struct super_block *newsb,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        int rc;

        rc = call_int_hook(sb_clone_mnt_opts, 0, oldsb, newsb,
                           kern_flags, set_kern_flags);
        return rc;
}
EXPORT_SYMBOL(security_sb_clone_mnt_opts);

/**
 * security_add_mnt_opt - run the sb_add_mnt_opt hooks
 * @option: passed to each hook
 * @val: passed to each hook
 * @len: passed to each hook
 * @mnt_opts: address of the mount options pointer, passed to each hook
 *
 * Return: the first non-zero hook result, 0 if every hook returns 0, or
 * -EINVAL when no hook is registered.
 */
int security_add_mnt_opt(const char *option, const char *val, int len,
                         void **mnt_opts)
{
        int rc;

        rc = call_int_hook(sb_add_mnt_opt, -EINVAL,
                           option, val, len, mnt_opts);
        return rc;
}
EXPORT_SYMBOL(security_add_mnt_opt);

/**
 * security_move_mount - run the move_mount hooks
 * @from_path: first path passed to each hook
 * @to_path: second path passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_move_mount(const struct path *from_path, const struct path *to_path)
{
        int rc;

        rc = call_int_hook(move_mount, 0, from_path, to_path);
        return rc;
}

/**
 * security_path_notify - run the path_notify hooks
 * @path: path passed to each hook
 * @mask: mask passed to each hook
 * @obj_type: object type passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_path_notify(const struct path *path, u64 mask,
                                unsigned int obj_type)
{
        int rc;

        rc = call_int_hook(path_notify, 0, path, mask, obj_type);
        return rc;
}

/**
 * security_inode_alloc - allocate the inode blob and run the LSM hooks
 * @inode: the inode being set up
 *
 * The composite blob is allocated first; then the inode_alloc_security
 * hooks are run.  If a hook fails, security_inode_free() is called to undo
 * the work before the error is returned.
 *
 * Return: 0 on success, the blob allocation error, or the first non-zero
 * hook result.
 */
int security_inode_alloc(struct inode *inode)
{
        int err = lsm_inode_alloc(inode);

        if (unlikely(err))
                return err;

        err = call_int_hook(inode_alloc_security, 0, inode);
        if (unlikely(err)) {
                security_inode_free(inode);
                return err;
        }

        return 0;
}

/*
 * RCU callback queued by security_inode_free(): hands an inode blob back
 * to lsm_inode_cache.
 */
static void inode_free_by_rcu(struct rcu_head *head)
{
        /*
         * The rcu head is at the start of the inode blob, so @head is
         * also the address of the blob that was allocated from the cache.
         */
        kmem_cache_free(lsm_inode_cache, head);
}

/**
 * security_inode_free - tear down the security state of an inode
 * @inode: the inode being released
 *
 * Calls integrity_inode_free(), then the inode_free_security hooks, and
 * finally queues the inode blob (if any) for freeing via RCU.
 */
void security_inode_free(struct inode *inode)
{
        void *blob;

        integrity_inode_free(inode);
        call_void_hook(inode_free_security, inode);

        /*
         * A path walk may still hold a reference to the inode, so
         * security_inode_permission() can run even after the
         * inode_free_security hooks have been called.  The VFS ideally
         * would not allow that, but changing it is a much bigger job.
         * Until then the blob is released through RCU and the
         * inode->i_security pointer is deliberately left in place; the
         * inode itself is also freed only after the grace period.
         */
        blob = inode->i_security;
        if (!blob)
                return;

        call_rcu((struct rcu_head *)blob, inode_free_by_rcu);
}

/**
 * security_dentry_init_security - run the dentry_init_security hooks
 * @dentry: passed to each hook
 * @mode: passed to each hook
 * @name: passed to each hook
 * @ctx: passed to each hook
 * @ctxlen: passed to each hook
 *
 * Return: the first non-zero hook result, 0 if every hook returns 0, or
 * -EOPNOTSUPP when no hook is registered.
 */
int security_dentry_init_security(struct dentry *dentry, int mode,
                                        const struct qstr *name, void **ctx,
                                        u32 *ctxlen)
{
        int rc;

        rc = call_int_hook(dentry_init_security, -EOPNOTSUPP, dentry, mode,
                           name, ctx, ctxlen);
        return rc;
}
EXPORT_SYMBOL(security_dentry_init_security);

/**
 * security_dentry_create_files_as - run the dentry_create_files_as hooks
 * @dentry: passed to each hook
 * @mode: passed to each hook
 * @name: passed to each hook
 * @old: passed to each hook
 * @new: passed to each hook
 *
 * Return: the first non-zero hook result, or 0 if every hook returns 0 or
 * none is registered.
 */
int security_dentry_create_files_as(struct dentry *dentry, int mode,
                                    struct qstr *name,
                                    const struct cred *old, struct cred *new)
{
        int rc;

        rc = call_int_hook(dentry_create_files_as, 0, dentry, mode,
                           name, old, new);
        return rc;
}
EXPORT_SYMBOL(security_dentry_create_files_as);

/**
 * security_inode_init_security - collect and install initial security xattrs
 * @inode: the new inode; private inodes are skipped
 * @dir: passed to the inode_init_security hooks
 * @qstr: passed to the inode_init_security hooks
 * @initxattrs: callback that installs the collected xattrs, or NULL
 * @fs_data: opaque pointer handed to @initxattrs
 *
 * Without @initxattrs the inode_init_security hooks are simply run with
 * NULL output pointers and their result is returned unchanged.
 *
 * With @initxattrs, the hooks fill the first slot of a local, zeroed xattr
 * array, evm_inode_init_security() may fill the second, and @initxattrs is
 * then called with the array.  Each step runs only if the previous one
 * returned 0.  Every xattr value stored in the array is freed before
 * returning, whatever the outcome.
 *
 * Return: 0 for private inodes; otherwise the result of the steps above,
 * with -EOPNOTSUPP reported as 0 on the @initxattrs path.
 */
int security_inode_init_security(struct inode *inode, struct inode *dir,
                                 const struct qstr *qstr,
                                 const initxattrs initxattrs, void *fs_data)
{
        struct xattr new_xattrs[MAX_LSM_EVM_XATTR + 1];
        struct xattr *lsm_xattr, *evm_xattr, *xattr;
        int ret;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        if (!initxattrs)
                return call_int_hook(inode_init_security, -EOPNOTSUPP, inode,
                                     dir, qstr, NULL, NULL, NULL);

        /* The zeroed last slot terminates the cleanup walk below. */
        memset(new_xattrs, 0, sizeof(new_xattrs));
        lsm_xattr = new_xattrs;
        evm_xattr = lsm_xattr + 1;

        ret = call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir, qstr,
                            &lsm_xattr->name,
                            &lsm_xattr->value,
                            &lsm_xattr->value_len);
        if (!ret)
                ret = evm_inode_init_security(inode, lsm_xattr, evm_xattr);
        if (!ret)
                ret = initxattrs(inode, new_xattrs, fs_data);

        xattr = new_xattrs;
        while (xattr->value != NULL) {
                kfree(xattr->value);
                xattr++;
        }

        if (ret == -EOPNOTSUPP)
                return 0;
        return ret;
}
EXPORT_SYMBOL(security_inode_init_security);

/**
 * security_old_inode_init_security - run the inode_init_security hooks
 * @inode: the new inode; private inodes are rejected
 * @dir: passed to each hook
 * @qstr: passed to each hook
 * @name: passed to each hook
 * @value: passed to each hook
 * @len: passed to each hook
 *
 * Return: -EOPNOTSUPP for private inodes or when no hook is registered;
 * otherwise the first non-zero hook result, or 0 if every hook returns 0.
 */
int security_old_inode_init_security(struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr, const char **name,
                                     void **value, size_t *len)
{
        int rc = -EOPNOTSUPP;

        if (unlikely(IS_PRIVATE(inode)))
                return rc;

        rc = call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir,
                           qstr, name, value, len);
        return rc;
}
EXPORT_SYMBOL(security_old_inode_init_security);

#ifdef CONFIG_SECURITY_PATH
/**
 * security_path_mknod - run the path_mknod hooks
 * @dir: parent path; nothing is checked if its backing inode is private
 * @dentry: passed to each hook
 * @mode: passed to each hook
 * @dev: passed to each hook
 *
 * Return: 0 for a private directory inode; otherwise the first non-zero
 * hook result, or 0 if every hook returns 0 or none is registered.
 */
int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode,
                        unsigned int dev)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                return 0;

        rc = call_int_hook(path_mknod, 0, dir, dentry, mode, dev);
        return rc;
}
EXPORT_SYMBOL(security_path_mknod);

/**
 * security_path_mkdir - run the path_mkdir hooks
 * @dir: parent path; nothing is checked if its backing inode is private
 * @dentry: passed to each hook
 * @mode: passed to each hook
 *
 * Return: 0 for a private directory inode; otherwise the first non-zero
 * hook result, or 0 if every hook returns 0 or none is registered.
 */
int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                return 0;

        rc = call_int_hook(path_mkdir, 0, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL(security_path_mkdir);

/**
 * security_path_rmdir - run the path_rmdir hooks
 * @dir: parent path; nothing is checked if its backing inode is private
 * @dentry: passed to each hook
 *
 * Return: 0 for a private directory inode; otherwise the first non-zero
 * hook result, or 0 if every hook returns 0 or none is registered.
 */
int security_path_rmdir(const struct path *dir, struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                return 0;

        rc = call_int_hook(path_rmdir, 0, dir, dentry);
        return rc;
}

/**
 * security_path_unlink - run the path_unlink hooks
 * @dir: parent path; nothing is checked if its backing inode is private
 * @dentry: passed to each hook
 *
 * Return: 0 for a private directory inode; otherwise the first non-zero
 * hook result, or 0 if every hook returns 0 or none is registered.
 */
int security_path_unlink(const struct path *dir, struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                return 0;

        rc = call_int_hook(path_unlink, 0, dir, dentry);
        return rc;
}
EXPORT_SYMBOL(security_path_unlink);

/**
 * security_path_symlink - run the path_symlink hooks
 * @dir: parent path; nothing is checked if its backing inode is private
 * @dentry: passed to each hook
 * @old_name: passed to each hook
 *
 * Return: 0 for a private directory inode; otherwise the first non-zero
 * hook result, or 0 if every hook returns 0 or none is registered.
 */
int security_path_symlink(const struct path *dir, struct dentry *dentry,
                          const char *old_name)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                return 0;

        rc = call_int_hook(path_symlink, 0, dir, dentry, old_name);
        return rc;
}

/**
 * security_path_link - run the path_link hooks
 * @old_dentry: existing dentry; nothing is checked if its backing inode is
 *              private
 * @new_dir: passed to each hook
 * @new_dentry: passed to each hook
 *
 * Return: 0 for a private inode; otherwise the first non-zero hook result,
 * or 0 if every hook returns 0 or none is registered.
 */
int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
                       struct dentry *new_dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;

        rc = call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry);
        return rc;
}

/**
 * security_path_rename - run the path_rename hooks
 * @old_dir: source parent path
 * @old_dentry: source dentry
 * @new_dir: destination parent path
 * @new_dentry: destination dentry
 * @flags: rename flags; only RENAME_EXCHANGE is examined here
 *
 * Nothing is checked when the source inode is private, or when the
 * destination is positive and its inode is private.  With RENAME_EXCHANGE
 * the hooks are first run with source and destination swapped; a non-zero
 * result from that pass is returned immediately.
 *
 * Return: 0 for private inodes; otherwise the first non-zero hook result,
 * or 0 if every hook returns 0 or none is registered.
 */
int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
                         const struct path *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        int err;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        if (flags & RENAME_EXCHANGE) {
                /* An exchange also moves the target onto the source. */
                err = call_int_hook(path_rename, 0, new_dir, new_dentry,
                                    old_dir, old_dentry);
                if (err)
                        return err;
        }

        err = call_int_hook(path_rename, 0, old_dir, old_dentry, new_dir,
                            new_dentry);
        return err;
}
EXPORT_SYMBOL(security_path_rename);

/**
 * security_path_truncate - run the path_truncate hooks
 * @path: path passed to each hook; nothing is checked if its backing inode
 *        is private
 *
 * Return: 0 for a private inode; otherwise the first non-zero hook result,
 * or 0 if every hook returns 0 or none is registered.
 */
int security_path_truncate(const struct path *path)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                return 0;

        rc = call_int_hook(path_truncate, 0, path);
        return rc;
}

/**
 * security_path_chmod - run the path_chmod hooks
 * @path: path passed to each hook; nothing is checked if its backing inode
 *        is private
 * @mode: passed to each hook
 *
 * Return: 0 for a private inode; otherwise the first non-zero hook result,
 * or 0 if every hook returns 0 or none is registered.
 */
int security_path_chmod(const struct path *path, umode_t mode)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                return 0;

        rc = call_int_hook(path_chmod, 0, path, mode);
        return rc;
}

/*
 * Run the path_chown LSM hook with @path, @uid and @gid.  Returns 0 without
 * calling the hook when the backing inode of @path's dentry is IS_PRIVATE.
 */
int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        struct inode *inode = d_backing_inode(path->dentry);

        if (likely(!IS_PRIVATE(inode)))
                return call_int_hook(path_chown, 0, path, uid, gid);
        return 0;
}

/* Run the path_chroot LSM hook on @path and return its result. */
int security_path_chroot(const struct path *path)
{
        int rc = call_int_hook(path_chroot, 0, path);

        return rc;
}
#endif

/*
 * Run the inode_create LSM hook with @dir, @dentry and @mode.  Returns 0
 * without calling the hook when @dir is IS_PRIVATE.
 */
int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int rc = 0;

        if (likely(!IS_PRIVATE(dir)))
                rc = call_int_hook(inode_create, 0, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_create);

/*
 * Run the inode_link LSM hook with @old_dentry, @dir and @new_dentry.
 * Returns 0 without calling the hook when the backing inode of @old_dentry
 * is IS_PRIVATE.
 */
int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                         struct dentry *new_dentry)
{
        struct inode *old_inode = d_backing_inode(old_dentry);

        if (likely(!IS_PRIVATE(old_inode)))
                return call_int_hook(inode_link, 0, old_dentry, dir,
                                     new_dentry);
        return 0;
}

/*
 * Run the inode_unlink LSM hook with @dir and @dentry.  Returns 0 without
 * calling the hook when the backing inode of @dentry is IS_PRIVATE.
 */
int security_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_backing_inode(dentry);

        if (likely(!IS_PRIVATE(victim)))
                return call_int_hook(inode_unlink, 0, dir, dentry);
        return 0;
}

/*
 * Run the inode_symlink LSM hook with @dir, @dentry and @old_name.  Returns 0
 * without calling the hook when @dir is IS_PRIVATE.
 */
int security_inode_symlink(struct inode *dir, struct dentry *dentry,
                            const char *old_name)
{
        int rc = 0;

        if (likely(!IS_PRIVATE(dir)))
                rc = call_int_hook(inode_symlink, 0, dir, dentry, old_name);
        return rc;
}

/*
 * Run the inode_mkdir LSM hook with @dir, @dentry and @mode.  Returns 0
 * without calling the hook when @dir is IS_PRIVATE.
 */
int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int rc = 0;

        if (likely(!IS_PRIVATE(dir)))
                rc = call_int_hook(inode_mkdir, 0, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_mkdir);

/*
 * Run the inode_rmdir LSM hook with @dir and @dentry.  Returns 0 without
 * calling the hook when the backing inode of @dentry is IS_PRIVATE.
 */
int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_backing_inode(dentry);

        if (likely(!IS_PRIVATE(victim)))
                return call_int_hook(inode_rmdir, 0, dir, dentry);
        return 0;
}

/*
 * Run the inode_mknod LSM hook with @dir, @dentry, @mode and @dev.  Returns 0
 * without calling the hook when @dir is IS_PRIVATE.
 */
int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        int rc = 0;

        if (likely(!IS_PRIVATE(dir)))
                rc = call_int_hook(inode_mknod, 0, dir, dentry, mode, dev);
        return rc;
}

/*
 * Run the inode_rename LSM hook for a rename of @old_dentry (in @old_dir) to
 * @new_dentry (in @new_dir).
 *
 * Returns 0 without calling the hook when @old_dentry's backing inode is
 * IS_PRIVATE, or when @new_dentry is positive and its backing inode is
 * IS_PRIVATE.  When @flags has RENAME_EXCHANGE set the hook is first called
 * with the new/old pairs swapped; a non-zero result from that call is
 * returned immediately.
 */
int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry,
                           unsigned int flags)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        if (flags & RENAME_EXCHANGE) {
                /* Exchange: also check the move in the opposite direction. */
                rc = call_int_hook(inode_rename, 0, new_dir, new_dentry,
                                   old_dir, old_dentry);
                if (rc)
                        return rc;
        }

        rc = call_int_hook(inode_rename, 0, old_dir, old_dentry,
                           new_dir, new_dentry);
        return rc;
}

/*
 * Run the inode_readlink LSM hook on @dentry.  Returns 0 without calling the
 * hook when the backing inode of @dentry is IS_PRIVATE.
 */
int security_inode_readlink(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        if (likely(!IS_PRIVATE(inode)))
                return call_int_hook(inode_readlink, 0, dentry);
        return 0;
}

/*
 * Run the inode_follow_link LSM hook with @dentry, @inode and @rcu.
 * Returns 0 without calling the hook when @inode is IS_PRIVATE.
 */
int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
                               bool rcu)
{
        int rc = 0;

        if (likely(!IS_PRIVATE(inode)))
                rc = call_int_hook(inode_follow_link, 0, dentry, inode, rcu);
        return rc;
}

/*
 * Run the inode_permission LSM hook with @inode and @mask.  Returns 0
 * without calling the hook when @inode is IS_PRIVATE.
 */
int security_inode_permission(struct inode *inode, int mask)
{
        int rc = 0;

        if (likely(!IS_PRIVATE(inode)))
                rc = call_int_hook(inode_permission, 0, inode, mask);
        return rc;
}

/*
 * Run the inode_setattr LSM hook with @dentry and @attr and, if that yields
 * 0, return the result of evm_inode_setattr().  Returns 0 without calling
 * either when the backing inode of @dentry is IS_PRIVATE.
 */
int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        rc = call_int_hook(inode_setattr, 0, dentry, attr);
        return rc ? rc : evm_inode_setattr(dentry, attr);
}
EXPORT_SYMBOL_GPL(security_inode_setattr);

/*
 * Run the inode_getattr LSM hook on @path.  Returns 0 without calling the
 * hook when the backing inode of @path's dentry is IS_PRIVATE.
 */
int security_inode_getattr(const struct path *path)
{
        struct inode *inode = d_backing_inode(path->dentry);

        if (likely(!IS_PRIVATE(inode)))
                return call_int_hook(inode_getattr, 0, path);
        return 0;
}

/*
 * Check a request to set extended attribute @name on @dentry.
 *
 * Returns 0 immediately when the backing inode of @dentry is IS_PRIVATE.
 * Otherwise the checks run in order and the first non-zero result is
 * returned: the inode_setxattr hook (or cap_inode_setxattr() when the hook
 * call yields 1), then ima_inode_setxattr(), then evm_inode_setxattr().
 */
int security_inode_setxattr(struct dentry *dentry, const char *name,
                            const void *value, size_t size, int flags)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /*
         * SELinux and Smack integrate the cap call,
         * so assume that all LSMs supplying this call do so.
         */
        rc = call_int_hook(inode_setxattr, 1, dentry, name, value, size,
                           flags);
        if (rc == 1)
                rc = cap_inode_setxattr(dentry, name, value, size, flags);
        if (!rc)
                rc = ima_inode_setxattr(dentry, name, value, size);
        if (!rc)
                rc = evm_inode_setxattr(dentry, name, value, size);
        return rc;
}

/*
 * Run the inode_post_setxattr LSM hook and then evm_inode_post_setxattr().
 * Neither is called when the backing inode of @dentry is IS_PRIVATE.
 */
void security_inode_post_setxattr(struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        struct inode *inode = d_backing_inode(dentry);

        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_setxattr, dentry, name, value, size, flags);
        evm_inode_post_setxattr(dentry, name, value, size);
}

/*
 * Run the inode_getxattr LSM hook with @dentry and @name.  Returns 0 without
 * calling the hook when the backing inode of @dentry is IS_PRIVATE.
 */
int security_inode_getxattr(struct dentry *dentry, const char *name)
{
        struct inode *inode = d_backing_inode(dentry);

        if (likely(!IS_PRIVATE(inode)))
                return call_int_hook(inode_getxattr, 0, dentry, name);
        return 0;
}

/*
 * Run the inode_listxattr LSM hook on @dentry.  Returns 0 without calling
 * the hook when the backing inode of @dentry is IS_PRIVATE.
 */
int security_inode_listxattr(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        if (likely(!IS_PRIVATE(inode)))
                return call_int_hook(inode_listxattr, 0, dentry);
        return 0;
}

/*
 * Check a request to remove extended attribute @name from @dentry.
 *
 * Returns 0 immediately when the backing inode of @dentry is IS_PRIVATE.
 * Otherwise the checks run in order and the first non-zero result is
 * returned: the inode_removexattr hook (or cap_inode_removexattr() when the
 * hook call yields 1), then ima_inode_removexattr(), then
 * evm_inode_removexattr().
 */
int security_inode_removexattr(struct dentry *dentry, const char *name)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /*
         * SELinux and Smack integrate the cap call,
         * so assume that all LSMs supplying this call do so.
         */
        rc = call_int_hook(inode_removexattr, 1, dentry, name);
        if (rc == 1)
                rc = cap_inode_removexattr(dentry, name);
        if (!rc)
                rc = ima_inode_removexattr(dentry, name);
        if (!rc)
                rc = evm_inode_removexattr(dentry, name);
        return rc;
}

/* Run the inode_need_killpriv LSM hook on @dentry and return its result. */
int security_inode_need_killpriv(struct dentry *dentry)
{
        int rc = call_int_hook(inode_need_killpriv, 0, dentry);

        return rc;
}

/* Run the inode_killpriv LSM hook on @dentry and return its result. */
int security_inode_killpriv(struct dentry *dentry)
{
        int rc = call_int_hook(inode_killpriv, 0, dentry);

        return rc;
}

/*
 * Walk the inode_getsecurity hook list and return the first result that
 * differs from LSM_RET_DEFAULT(inode_getsecurity).  The default is returned
 * when @inode is IS_PRIVATE or when no hook produces a different value.
 */
int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
{
        struct security_hook_list *entry;
        int rc = LSM_RET_DEFAULT(inode_getsecurity);

        if (unlikely(IS_PRIVATE(inode)))
                return rc;

        /*
         * Only one module will provide an attribute with a given name.
         */
        hlist_for_each_entry(entry, &security_hook_heads.inode_getsecurity, list) {
                rc = entry->hook.inode_getsecurity(inode, name, buffer, alloc);
                if (rc != LSM_RET_DEFAULT(inode_getsecurity))
                        break;
        }
        return rc;
}

/*
 * Walk the inode_setsecurity hook list and return the first result that
 * differs from LSM_RET_DEFAULT(inode_setsecurity).  The default is returned
 * when @inode is IS_PRIVATE or when no hook produces a different value.
 */
int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
{
        struct security_hook_list *entry;
        int rc = LSM_RET_DEFAULT(inode_setsecurity);

        if (unlikely(IS_PRIVATE(inode)))
                return rc;

        /*
         * Only one module will provide an attribute with a given name.
         */
        hlist_for_each_entry(entry, &security_hook_heads.inode_setsecurity, list) {
                rc = entry->hook.inode_setsecurity(inode, name, value, size,
                                                   flags);
                if (rc != LSM_RET_DEFAULT(inode_setsecurity))
                        break;
        }
        return rc;
}

/*
 * Run the inode_listsecurity LSM hook with @inode, @buffer and @buffer_size.
 * Returns 0 without calling the hook when @inode is IS_PRIVATE.
 */
int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
{
        int rc = 0;

        if (likely(!IS_PRIVATE(inode)))
                rc = call_int_hook(inode_listsecurity, 0, inode, buffer,
                                   buffer_size);
        return rc;
}
EXPORT_SYMBOL(security_inode_listsecurity);

/*
 * Run the inode_getsecid LSM hook with @inode and @secid.  Note that, unlike
 * the cred/task/ipc getsecid wrappers in this file, *@secid is not zeroed
 * here before the hook is called.
 */
void security_inode_getsecid(struct inode *inode, u32 *secid)
{
        call_void_hook(inode_getsecid, inode, secid);
}

/* Run the inode_copy_up LSM hook with @src and @new and return its result. */
int security_inode_copy_up(struct dentry *src, struct cred **new)
{
        int rc = call_int_hook(inode_copy_up, 0, src, new);

        return rc;
}
EXPORT_SYMBOL(security_inode_copy_up);

int security_inode_copy_up_xattr(const char *name)
{
        struct security_hook_list *hp;
        int rc;

        /*
         * The implementation can return 0 (accept the xattr), 1 (discard the
         * xattr), -EOPNOTSUPP if it does not know anything about the xattr or
         * any other error code incase of an error.
         */
        hlist_for_each_entry(hp,
                &security_hook_heads.inode_copy_up_xattr, list) {
                rc = hp->hook.inode_copy_up_xattr(name);
                if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr))
                        return rc;
        }

        return LSM_RET_DEFAULT(inode_copy_up_xattr);
}
EXPORT_SYMBOL(security_inode_copy_up_xattr);

/*
 * Run the kernfs_init_security LSM hook with @kn_dir and @kn and return its
 * result.
 */
int security_kernfs_init_security(struct kernfs_node *kn_dir,
                                  struct kernfs_node *kn)
{
        int rc = call_int_hook(kernfs_init_security, 0, kn_dir, kn);

        return rc;
}

/*
 * Run the file_permission LSM hook with @file and @mask; a non-zero hook
 * result is returned as is, otherwise the result of fsnotify_perm() is
 * returned.
 */
int security_file_permission(struct file *file, int mask)
{
        int rc = call_int_hook(file_permission, 0, file, mask);

        return rc ? rc : fsnotify_perm(file, mask);
}

/*
 * Call lsm_file_alloc() for @file and then run the file_alloc_security LSM
 * hook.  If the hook fails, security_file_free() is called before the error
 * is returned.
 */
int security_file_alloc(struct file *file)
{
        int rc;

        rc = lsm_file_alloc(file);
        if (!rc) {
                rc = call_int_hook(file_alloc_security, 0, file);
                if (unlikely(rc))
                        security_file_free(file);
        }
        return rc;
}

/*
 * Run the file_free_security LSM hook, then, if @file->f_security is set,
 * clear it and return the blob to lsm_file_cache.
 */
void security_file_free(struct file *file)
{
        void *blob;

        call_void_hook(file_free_security, file);

        blob = file->f_security;
        if (!blob)
                return;

        file->f_security = NULL;
        kmem_cache_free(lsm_file_cache, blob);
}

/* Run the file_ioctl LSM hook with @file, @cmd and @arg; return its result. */
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        int rc = call_int_hook(file_ioctl, 0, file, cmd, arg);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_ioctl);

/**
 * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Compat counterpart of security_file_ioctl(), for correctly handling 32-bit
 * processes running on 64-bit kernels.  Dispatches to the file_ioctl_compat
 * LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl_compat(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        int rc = call_int_hook(file_ioctl_compat, 0, file, cmd, arg);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_ioctl_compat);

/*
 * Compute the protection bits to check for a mapping of @file requested with
 * @prot.  PROT_EXEC is added only when PROT_READ is requested without
 * PROT_EXEC, the current task's personality has READ_IMPLIES_EXEC, and the
 * mapping is either anonymous or backed by a file that is not on a noexec
 * mount; in every other case @prot is returned unchanged.
 */
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
        /*
         * Do we have PROT_READ and does the application expect
         * it to imply PROT_EXEC?  If not, nothing to talk about...
         */
        if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ ||
            !(current->personality & READ_IMPLIES_EXEC))
                return prot;

        /* An anonymous mapping always gets the implied PROT_EXEC. */
        if (!file)
                return prot | PROT_EXEC;

        /* anything on noexec mount won't get PROT_EXEC */
        if (path_noexec(&file->f_path))
                return prot;

#ifndef CONFIG_MMU
        /*
         * On !MMU we additionally need NOMMU_MAP_EXEC (== VM_MAYEXEC)
         * from the file's mmap capabilities, when it reports any.
         */
        if (file->f_op->mmap_capabilities) {
                unsigned caps = file->f_op->mmap_capabilities(file);

                if (!(caps & NOMMU_MAP_EXEC))
                        return prot;
        }
#endif
        return prot | PROT_EXEC;
}

/*
 * Run the mmap_file LSM hook with the requested @prot and the value adjusted
 * by mmap_prot(); a non-zero hook result is returned as is, otherwise the
 * result of ima_file_mmap() is returned.
 */
int security_mmap_file(struct file *file, unsigned long prot,
                        unsigned long flags)
{
        unsigned long prot_adj = mmap_prot(file, prot);
        int rc;

        rc = call_int_hook(mmap_file, 0, file, prot, prot_adj, flags);
        return rc ? rc : ima_file_mmap(file, prot, prot_adj, flags);
}

/* Run the mmap_addr LSM hook on @addr and return its result. */
int security_mmap_addr(unsigned long addr)
{
        int rc = call_int_hook(mmap_addr, 0, addr);

        return rc;
}

/*
 * Run the file_mprotect LSM hook with @vma, @reqprot and @prot; a non-zero
 * hook result is returned as is, otherwise the result of ima_file_mprotect()
 * is returned.
 */
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                            unsigned long prot)
{
        int rc = call_int_hook(file_mprotect, 0, vma, reqprot, prot);

        return rc ? rc : ima_file_mprotect(vma, prot);
}

/* Run the file_lock LSM hook with @file and @cmd and return its result. */
int security_file_lock(struct file *file, unsigned int cmd)
{
        int rc = call_int_hook(file_lock, 0, file, cmd);

        return rc;
}

/* Run the file_fcntl LSM hook with @file, @cmd and @arg; return its result. */
int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        int rc = call_int_hook(file_fcntl, 0, file, cmd, arg);

        return rc;
}

/* Run the file_set_fowner LSM hook on @file. */
void security_file_set_fowner(struct file *file)
{
        call_void_hook(file_set_fowner, file);
}

/*
 * Run the file_send_sigiotask LSM hook with @tsk, @fown and @sig and return
 * its result.
 */
int security_file_send_sigiotask(struct task_struct *tsk,
                                  struct fown_struct *fown, int sig)
{
        int rc = call_int_hook(file_send_sigiotask, 0, tsk, fown, sig);

        return rc;
}

/* Run the file_receive LSM hook on @file and return its result. */
int security_file_receive(struct file *file)
{
        int rc = call_int_hook(file_receive, 0, file);

        return rc;
}

/*
 * Run the file_open LSM hook on @file; a non-zero hook result is returned as
 * is, otherwise the result of fsnotify_perm(@file, MAY_OPEN) is returned.
 */
int security_file_open(struct file *file)
{
        int rc = call_int_hook(file_open, 0, file);

        return rc ? rc : fsnotify_perm(file, MAY_OPEN);
}

/*
 * Call lsm_task_alloc() for @task and then run the task_alloc LSM hook with
 * @clone_flags.  If the hook fails, security_task_free() is called before
 * the error is returned.
 */
int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
{
        int rc;

        rc = lsm_task_alloc(task);
        if (!rc) {
                rc = call_int_hook(task_alloc, 0, task, clone_flags);
                if (unlikely(rc))
                        security_task_free(task);
        }
        return rc;
}

/*
 * Run the task_free LSM hook, then kfree() @task->security and reset the
 * pointer to NULL.
 */
void security_task_free(struct task_struct *task)
{
        void *blob;

        call_void_hook(task_free, task);

        /* Read the pointer only after the hooks have run. */
        blob = task->security;
        kfree(blob);
        task->security = NULL;
}

/*
 * Call lsm_cred_alloc() for @cred with @gfp and then run the
 * cred_alloc_blank LSM hook.  If the hook fails, security_cred_free() is
 * called before the error is returned.
 */
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
        int rc;

        rc = lsm_cred_alloc(cred, gfp);
        if (!rc) {
                rc = call_int_hook(cred_alloc_blank, 0, cred, gfp);
                if (unlikely(rc))
                        security_cred_free(cred);
        }
        return rc;
}

/*
 * Run the cred_free LSM hook, then kfree() @cred->security and reset the
 * pointer to NULL.  Does nothing when @cred->security is already NULL.
 */
void security_cred_free(struct cred *cred)
{
        void *blob;

        /*
         * There is a failure case in prepare_creds() that
         * may result in a call here with ->security being NULL.
         */
        if (unlikely(!cred->security))
                return;

        call_void_hook(cred_free, cred);

        /* Read the pointer only after the hooks have run. */
        blob = cred->security;
        kfree(blob);
        cred->security = NULL;
}

/*
 * Call lsm_cred_alloc() for @new with @gfp and then run the cred_prepare LSM
 * hook with @new, @old and @gfp.  If the hook fails, security_cred_free() is
 * called on @new before the error is returned.
 */
int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
{
        int rc;

        rc = lsm_cred_alloc(new, gfp);
        if (!rc) {
                rc = call_int_hook(cred_prepare, 0, new, old, gfp);
                if (unlikely(rc))
                        security_cred_free(new);
        }
        return rc;
}

/* Run the cred_transfer LSM hook with @new and @old. */
void security_transfer_creds(struct cred *new, const struct cred *old)
{
        call_void_hook(cred_transfer, new, old);
}

/*
 * Zero *@secid and then run the cred_getsecid LSM hook with @c and @secid,
 * so *@secid stays 0 unless a hook writes to it.
 */
void security_cred_getsecid(const struct cred *c, u32 *secid)
{
        *secid = 0;
        call_void_hook(cred_getsecid, c, secid);
}
EXPORT_SYMBOL(security_cred_getsecid);

/* Run the kernel_act_as LSM hook with @new and @secid; return its result. */
int security_kernel_act_as(struct cred *new, u32 secid)
{
        int rc = call_int_hook(kernel_act_as, 0, new, secid);

        return rc;
}

/*
 * Run the kernel_create_files_as LSM hook with @new and @inode and return
 * its result.
 */
int security_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        int rc = call_int_hook(kernel_create_files_as, 0, new, inode);

        return rc;
}

/*
 * Run the kernel_module_request LSM hook on @kmod_name; a non-zero hook
 * result is returned as is, otherwise the result of
 * integrity_kernel_module_request() is returned.
 */
int security_kernel_module_request(char *kmod_name)
{
        int rc = call_int_hook(kernel_module_request, 0, kmod_name);

        return rc ? rc : integrity_kernel_module_request(kmod_name);
}

/*
 * Run the kernel_read_file LSM hook with @file, @id and @contents; a
 * non-zero hook result is returned as is, otherwise the result of
 * ima_read_file() is returned.
 */
int security_kernel_read_file(struct file *file, enum kernel_read_file_id id,
                              bool contents)
{
        int rc = call_int_hook(kernel_read_file, 0, file, id, contents);

        return rc ? rc : ima_read_file(file, id, contents);
}
EXPORT_SYMBOL_GPL(security_kernel_read_file);

/*
 * Run the kernel_post_read_file LSM hook with @file, @buf, @size and @id; a
 * non-zero hook result is returned as is, otherwise the result of
 * ima_post_read_file() is returned.
 */
int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
                                   enum kernel_read_file_id id)
{
        int rc = call_int_hook(kernel_post_read_file, 0, file, buf, size, id);

        return rc ? rc : ima_post_read_file(file, buf, size, id);
}
EXPORT_SYMBOL_GPL(security_kernel_post_read_file);

/*
 * Run the kernel_load_data LSM hook with @id and @contents; a non-zero hook
 * result is returned as is, otherwise the result of ima_load_data() is
 * returned.
 */
int security_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        int rc = call_int_hook(kernel_load_data, 0, id, contents);

        return rc ? rc : ima_load_data(id, contents);
}
EXPORT_SYMBOL_GPL(security_kernel_load_data);

/*
 * Run the kernel_post_load_data LSM hook with @buf, @size, @id and
 * @description; a non-zero hook result is returned as is, otherwise the
 * result of ima_post_load_data() is returned.
 */
int security_kernel_post_load_data(char *buf, loff_t size,
                                   enum kernel_load_data_id id,
                                   char *description)
{
        int rc = call_int_hook(kernel_post_load_data, 0, buf, size, id,
                               description);

        return rc ? rc : ima_post_load_data(buf, size, id, description);
}
EXPORT_SYMBOL_GPL(security_kernel_post_load_data);

/*
 * Run the task_fix_setuid LSM hook with @new, @old and @flags and return its
 * result.
 */
int security_task_fix_setuid(struct cred *new, const struct cred *old,
                             int flags)
{
        int rc = call_int_hook(task_fix_setuid, 0, new, old, flags);

        return rc;
}

/*
 * Run the task_fix_setgid LSM hook with @new, @old and @flags and return its
 * result.
 */
int security_task_fix_setgid(struct cred *new, const struct cred *old,
                                 int flags)
{
        int rc = call_int_hook(task_fix_setgid, 0, new, old, flags);

        return rc;
}

/* Run the task_setpgid LSM hook with @p and @pgid and return its result. */
int security_task_setpgid(struct task_struct *p, pid_t pgid)
{
        int rc = call_int_hook(task_setpgid, 0, p, pgid);

        return rc;
}

/* Run the task_getpgid LSM hook on @p and return its result. */
int security_task_getpgid(struct task_struct *p)
{
        int rc = call_int_hook(task_getpgid, 0, p);

        return rc;
}

/* Run the task_getsid LSM hook on @p and return its result. */
int security_task_getsid(struct task_struct *p)
{
        int rc = call_int_hook(task_getsid, 0, p);

        return rc;
}

/*
 * Zero *@secid and then run the task_getsecid LSM hook with @p and @secid,
 * so *@secid stays 0 unless a hook writes to it.
 */
void security_task_getsecid(struct task_struct *p, u32 *secid)
{
        *secid = 0;
        call_void_hook(task_getsecid, p, secid);
}
EXPORT_SYMBOL(security_task_getsecid);

/* Run the task_setnice LSM hook with @p and @nice and return its result. */
int security_task_setnice(struct task_struct *p, int nice)
{
        int rc = call_int_hook(task_setnice, 0, p, nice);

        return rc;
}

/* Run the task_setioprio LSM hook with @p and @ioprio; return its result. */
int security_task_setioprio(struct task_struct *p, int ioprio)
{
        int rc = call_int_hook(task_setioprio, 0, p, ioprio);

        return rc;
}

/* Run the task_getioprio LSM hook on @p and return its result. */
int security_task_getioprio(struct task_struct *p)
{
        int rc = call_int_hook(task_getioprio, 0, p);

        return rc;
}

/*
 * Run the task_prlimit LSM hook with @cred, @tcred and @flags and return its
 * result.
 */
int security_task_prlimit(const struct cred *cred, const struct cred *tcred,
                          unsigned int flags)
{
        int rc = call_int_hook(task_prlimit, 0, cred, tcred, flags);

        return rc;
}

/*
 * Run the task_setrlimit LSM hook with @p, @resource and @new_rlim and
 * return its result.
 */
int security_task_setrlimit(struct task_struct *p, unsigned int resource,
                struct rlimit *new_rlim)
{
        int rc = call_int_hook(task_setrlimit, 0, p, resource, new_rlim);

        return rc;
}

/* Run the task_setscheduler LSM hook on @p and return its result. */
int security_task_setscheduler(struct task_struct *p)
{
        int rc = call_int_hook(task_setscheduler, 0, p);

        return rc;
}

/* Run the task_getscheduler LSM hook on @p and return its result. */
int security_task_getscheduler(struct task_struct *p)
{
        int rc = call_int_hook(task_getscheduler, 0, p);

        return rc;
}

/* Run the task_movememory LSM hook on @p and return its result. */
int security_task_movememory(struct task_struct *p)
{
        int rc = call_int_hook(task_movememory, 0, p);

        return rc;
}

/*
 * Run the task_kill LSM hook with @p, @info, @sig and @cred and return its
 * result.
 */
int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                        int sig, const struct cred *cred)
{
        int rc = call_int_hook(task_kill, 0, p, info, sig, cred);

        return rc;
}

/*
 * Walk the task_prctl hook list with the given prctl arguments.
 *
 * Hooks returning LSM_RET_DEFAULT(task_prctl) are ignored.  Any other result
 * is remembered; the walk stops at the first such result that is non-zero.
 * Returns the last remembered result, or the default when every hook
 * returned the default.
 */
int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                         unsigned long arg4, unsigned long arg5)
{
        struct security_hook_list *entry;
        int result = LSM_RET_DEFAULT(task_prctl);
        int hook_rc;

        hlist_for_each_entry(entry, &security_hook_heads.task_prctl, list) {
                hook_rc = entry->hook.task_prctl(option, arg2, arg3, arg4, arg5);
                if (hook_rc == LSM_RET_DEFAULT(task_prctl))
                        continue;

                result = hook_rc;
                if (hook_rc != 0)
                        break;
        }
        return result;
}

/* Run the task_to_inode LSM hook with @p and @inode. */
void security_task_to_inode(struct task_struct *p, struct inode *inode)
{
        call_void_hook(task_to_inode, p, inode);
}

/* Run the ipc_permission LSM hook with @ipcp and @flag; return its result. */
int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        int rc = call_int_hook(ipc_permission, 0, ipcp, flag);

        return rc;
}

/*
 * Zero *@secid and then run the ipc_getsecid LSM hook with @ipcp and @secid,
 * so *@secid stays 0 unless a hook writes to it.
 */
void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
{
        *secid = 0;
        call_void_hook(ipc_getsecid, ipcp, secid);
}

/*
 * Call lsm_msg_msg_alloc() for @msg and then run the msg_msg_alloc_security
 * LSM hook.  If the hook fails, security_msg_msg_free() is called before the
 * error is returned.
 */
int security_msg_msg_alloc(struct msg_msg *msg)
{
        int rc;

        rc = lsm_msg_msg_alloc(msg);
        if (likely(!rc)) {
                rc = call_int_hook(msg_msg_alloc_security, 0, msg);
                if (unlikely(rc))
                        security_msg_msg_free(msg);
        }
        return rc;
}

/*
 * Run the msg_msg_free_security LSM hook, then kfree() @msg->security and
 * reset the pointer to NULL.
 */
void security_msg_msg_free(struct msg_msg *msg)
{
        void *blob;

        call_void_hook(msg_msg_free_security, msg);

        /* Read the pointer only after the hooks have run. */
        blob = msg->security;
        kfree(blob);
        msg->security = NULL;
}

/*
 * Call lsm_ipc_alloc() for @msq and then run the msg_queue_alloc_security
 * LSM hook.  If the hook fails, security_msg_queue_free() is called before
 * the error is returned.
 */
int security_msg_queue_alloc(struct kern_ipc_perm *msq)
{
        int rc;

        rc = lsm_ipc_alloc(msq);
        if (likely(!rc)) {
                rc = call_int_hook(msg_queue_alloc_security, 0, msq);
                if (unlikely(rc))
                        security_msg_queue_free(msq);
        }
        return rc;
}

/*
 * Run the msg_queue_free_security LSM hook, then kfree() @msq->security and
 * reset the pointer to NULL.
 */
void security_msg_queue_free(struct kern_ipc_perm *msq)
{
        void *blob;

        call_void_hook(msg_queue_free_security, msq);

        /* Read the pointer only after the hooks have run. */
        blob = msq->security;
        kfree(blob);
        msq->security = NULL;
}

/*
 * Run the msg_queue_associate LSM hook with @msq and @msqflg and return its
 * result.
 */
int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        int rc = call_int_hook(msg_queue_associate, 0, msq, msqflg);

        return rc;
}

/* Run the msg_queue_msgctl LSM hook with @msq and @cmd; return its result. */
int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        int rc = call_int_hook(msg_queue_msgctl, 0, msq, cmd);

        return rc;
}

/*
 * Run the msg_queue_msgsnd LSM hook with @msq, @msg and @msqflg and return
 * its result.
 */
int security_msg_queue_msgsnd(struct kern_ipc_perm *msq,
                               struct msg_msg *msg, int msqflg)
{
        int rc = call_int_hook(msg_queue_msgsnd, 0, msq, msg, msqflg);

        return rc;
}

/*
 * Run the msg_queue_msgrcv LSM hook with @msq, @msg, @target, @type and
 * @mode and return its result.
 */
int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                               struct task_struct *target, long type, int mode)
{
        int rc = call_int_hook(msg_queue_msgrcv, 0, msq, msg, target, type,
                               mode);

        return rc;
}

/*
 * Call lsm_ipc_alloc() for @shp and then run the shm_alloc_security LSM
 * hook.  If the hook fails, security_shm_free() is called before the error
 * is returned.
 */
int security_shm_alloc(struct kern_ipc_perm *shp)
{
        int rc;

        rc = lsm_ipc_alloc(shp);
        if (likely(!rc)) {
                rc = call_int_hook(shm_alloc_security, 0, shp);
                if (unlikely(rc))
                        security_shm_free(shp);
        }
        return rc;
}

/*
 * Run the shm_free_security LSM hook, then kfree() @shp->security and reset
 * the pointer to NULL.
 */
void security_shm_free(struct kern_ipc_perm *shp)
{
        void *blob;

        call_void_hook(shm_free_security, shp);

        /* Read the pointer only after the hooks have run. */
        blob = shp->security;
        kfree(blob);
        shp->security = NULL;
}

/* Run the shm_associate LSM hook with @shp and @shmflg; return its result. */
int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        int rc = call_int_hook(shm_associate, 0, shp, shmflg);

        return rc;
}

/* Run the shm_shmctl LSM hook with @shp and @cmd and return its result. */
int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        int rc = call_int_hook(shm_shmctl, 0, shp, cmd);

        return rc;
}

/*
 * Run the shm_shmat LSM hook with @shp, @shmaddr and @shmflg and return its
 * result.
 */
int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg)
{
        int rc = call_int_hook(shm_shmat, 0, shp, shmaddr, shmflg);

        return rc;
}

/*
 * Call lsm_ipc_alloc() for @sma and then run the sem_alloc_security LSM
 * hook.  If the hook fails, security_sem_free() is called before the error
 * is returned.
 */
int security_sem_alloc(struct kern_ipc_perm *sma)
{
        int rc;

        rc = lsm_ipc_alloc(sma);
        if (likely(!rc)) {
                rc = call_int_hook(sem_alloc_security, 0, sma);
                if (unlikely(rc))
                        security_sem_free(sma);
        }
        return rc;
}

/*
 * Run the sem_free_security LSM hook, then kfree() @sma->security and reset
 * the pointer to NULL.
 */
void security_sem_free(struct kern_ipc_perm *sma)
{
        void *blob;

        call_void_hook(sem_free_security, sma);

        /* Read the pointer only after the hooks have run. */
        blob = sma->security;
        kfree(blob);
        sma->security = NULL;
}

/* Run the sem_associate LSM hook with @sma and @semflg; return its result. */
int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        int rc = call_int_hook(sem_associate, 0, sma, semflg);

        return rc;
}

/* Run the sem_semctl LSM hook with @sma and @cmd and return its result. */
int security_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        int rc = call_int_hook(sem_semctl, 0, sma, cmd);

        return rc;
}

/*
 * Run the sem_semop LSM hook with @sma, @sops, @nsops and @alter and return
 * its result.
 */
int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
                        unsigned nsops, int alter)
{
        int rc = call_int_hook(sem_semop, 0, sma, sops, nsops, alter);

        return rc;
}

/*
 * Run the d_instantiate LSM hook with @dentry and @inode.  The hook is
 * skipped when @inode is non-NULL and IS_PRIVATE; a NULL @inode is still
 * passed to the hook.
 */
void security_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        if (likely(!inode || !IS_PRIVATE(inode)))
                call_void_hook(d_instantiate, dentry, inode);
}
EXPORT_SYMBOL(security_d_instantiate);

/*
 * Call the getprocattr hook of the first entry on the hook list that
 * matches @lsm and return its result.  A NULL @lsm matches the first entry;
 * otherwise @lsm is compared with the entry's lsm name using strcmp().
 * Returns LSM_RET_DEFAULT(getprocattr) when no entry matches.
 */
int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
                                char **value)
{
        struct security_hook_list *entry;

        hlist_for_each_entry(entry, &security_hook_heads.getprocattr, list) {
                if (lsm == NULL || !strcmp(lsm, entry->lsm))
                        return entry->hook.getprocattr(p, name, value);
        }
        return LSM_RET_DEFAULT(getprocattr);
}

/*
 * Call the setprocattr hook of the first entry on the hook list that
 * matches @lsm and return its result.  A NULL @lsm matches the first entry;
 * otherwise @lsm is compared with the entry's lsm name using strcmp().
 * Returns LSM_RET_DEFAULT(setprocattr) when no entry matches.
 */
int security_setprocattr(const char *lsm, const char *name, void *value,
                         size_t size)
{
        struct security_hook_list *entry;

        hlist_for_each_entry(entry, &security_hook_heads.setprocattr, list) {
                if (lsm == NULL || !strcmp(lsm, entry->lsm))
                        return entry->hook.setprocattr(name, value, size);
        }
        return LSM_RET_DEFAULT(setprocattr);
}

/* Run the netlink_send LSM hook with @sk and @skb and return its result. */
int security_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        int rc = call_int_hook(netlink_send, 0, sk, skb);

        return rc;
}

/* Run the ismaclabel LSM hook on @name and return its result. */
int security_ismaclabel(const char *name)
{
        int rc = call_int_hook(ismaclabel, 0, name);

        return rc;
}
EXPORT_SYMBOL(security_ismaclabel);

/*
 * Walk the secid_to_secctx hook list with @secid, @secdata and @seclen and
 * return the first result that differs from
 * LSM_RET_DEFAULT(secid_to_secctx); the default is returned when no hook
 * produces a different value.
 */
int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
{
        struct security_hook_list *entry;
        int rc = LSM_RET_DEFAULT(secid_to_secctx);

        /*
         * Currently, only one LSM can implement secid_to_secctx (i.e this
         * LSM hook is not "stackable").
         */
        hlist_for_each_entry(entry, &security_hook_heads.secid_to_secctx, list) {
                rc = entry->hook.secid_to_secctx(secid, secdata, seclen);
                if (rc != LSM_RET_DEFAULT(secid_to_secctx))
                        break;
        }

        return rc;
}
EXPORT_SYMBOL(security_secid_to_secctx);

/*
 * Zero *@secid, then run the secctx_to_secid LSM hook with @secdata, @seclen
 * and @secid and return its result.
 */
int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        *secid = 0;
        rc = call_int_hook(secctx_to_secid, 0, secdata, seclen, secid);
        return rc;
}
EXPORT_SYMBOL(security_secctx_to_secid);

/* Run the release_secctx LSM hook with @secdata and @seclen. */
void security_release_secctx(char *secdata, u32 seclen)
{
        call_void_hook(release_secctx, secdata, seclen);
}
EXPORT_SYMBOL(security_release_secctx);

/* Run the inode_invalidate_secctx LSM hook on @inode. */
void security_inode_invalidate_secctx(struct inode *inode)
{
        call_void_hook(inode_invalidate_secctx, inode);
}
EXPORT_SYMBOL(security_inode_invalidate_secctx);

/*
 * Run the inode_notifysecctx LSM hook with @inode, @ctx and @ctxlen and
 * return its result.
 */
int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        int rc = call_int_hook(inode_notifysecctx, 0, inode, ctx, ctxlen);

        return rc;
}
EXPORT_SYMBOL(security_inode_notifysecctx);

/*
 * Run the inode_setsecctx LSM hook with @dentry, @ctx and @ctxlen and return
 * its result.
 */
int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        int rc = call_int_hook(inode_setsecctx, 0, dentry, ctx, ctxlen);

        return rc;
}
EXPORT_SYMBOL(security_inode_setsecctx);

/*
 * Walk the inode_getsecctx hook list with @inode, @ctx and @ctxlen and
 * return the first result that differs from
 * LSM_RET_DEFAULT(inode_getsecctx); the default is returned when no hook
 * produces a different value.
 */
int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
        struct security_hook_list *entry;
        int rc = LSM_RET_DEFAULT(inode_getsecctx);

        /*
         * Only one module will provide a security context.
         */
        hlist_for_each_entry(entry, &security_hook_heads.inode_getsecctx, list) {
                rc = entry->hook.inode_getsecctx(inode, ctx, ctxlen);
                if (rc != LSM_RET_DEFAULT(inode_getsecctx))
                        break;
        }

        return rc;
}
EXPORT_SYMBOL(security_inode_getsecctx);

#ifdef CONFIG_WATCH_QUEUE
/*
 * Run the post_notification LSM hook with @w_cred, @cred and @n and return
 * its result.
 */
int security_post_notification(const struct cred *w_cred,
                               const struct cred *cred,
                               struct watch_notification *n)
{
        int rc = call_int_hook(post_notification, 0, w_cred, cred, n);

        return rc;
}
#endif /* CONFIG_WATCH_QUEUE */

#ifdef CONFIG_KEY_NOTIFICATIONS
/* Run the watch_key LSM hook on @key and return its result. */
int security_watch_key(struct key *key)
{
        int rc = call_int_hook(watch_key, 0, key);

        return rc;
}
#endif

#ifdef CONFIG_SECURITY_NETWORK

/*
 * Run the unix_stream_connect LSM hook with @sock, @other and @newsk and
 * return its result.
 */
int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk)
{
        int rc = call_int_hook(unix_stream_connect, 0, sock, other, newsk);

        return rc;
}
EXPORT_SYMBOL(security_unix_stream_connect);

/* Run the unix_may_send LSM hook with @sock and @other; return its result. */
int security_unix_may_send(struct socket *sock,  struct socket *other)
{
        int rc = call_int_hook(unix_may_send, 0, sock, other);

        return rc;
}
EXPORT_SYMBOL(security_unix_may_send);

/*
 * Run the socket_create LSM hook with @family, @type, @protocol and @kern
 * and return its result.
 */
int security_socket_create(int family, int type, int protocol, int kern)
{
        int rc = call_int_hook(socket_create, 0, family, type, protocol, kern);

        return rc;
}

/*
 * Run the socket_post_create LSM hook with @sock, @family, @type, @protocol
 * and @kern and return its result.
 */
int security_socket_post_create(struct socket *sock, int family,
                                int type, int protocol, int kern)
{
        int rc = call_int_hook(socket_post_create, 0, sock, family, type,
                               protocol, kern);

        return rc;
}

/*
 * Run the socket_socketpair LSM hook with @socka and @sockb and return its
 * result.
 */
int security_socket_socketpair(struct socket *socka, struct socket *sockb)
{
        int rc = call_int_hook(socket_socketpair, 0, socka, sockb);

        return rc;
}
EXPORT_SYMBOL(security_socket_socketpair);

/*
 * Run the socket_bind LSM hook with @sock, @address and @addrlen and return
 * its result.
 */
int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
{
        int rc = call_int_hook(socket_bind, 0, sock, address, addrlen);

        return rc;
}

/*
 * Run the socket_connect LSM hook with @sock, @address and @addrlen and
 * return its result.
 */
int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen)
{
        int rc = call_int_hook(socket_connect, 0, sock, address, addrlen);

        return rc;
}

/* Run the socket_listen LSM hook with @sock and @backlog; return its result. */
int security_socket_listen(struct socket *sock, int backlog)
{
        int rc = call_int_hook(socket_listen, 0, sock, backlog);

        return rc;
}

/* Run the socket_accept LSM hook with @sock and @newsock; return its result. */
int security_socket_accept(struct socket *sock, struct socket *newsock)
{
        int rc = call_int_hook(socket_accept, 0, sock, newsock);

        return rc;
}

/*
 * Run the socket_sendmsg LSM hook with @sock, @msg and @size and return its
 * result.
 */
int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
        int rc = call_int_hook(socket_sendmsg, 0, sock, msg, size);

        return rc;
}

/*
 * Run the socket_recvmsg LSM hook with @sock, @msg, @size and @flags and
 * return its result.
 */
int security_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                            int size, int flags)
{
        int rc = call_int_hook(socket_recvmsg, 0, sock, msg, size, flags);

        return rc;
}

/* Run the socket_getsockname LSM hook on @sock and return its result. */
int security_socket_getsockname(struct socket *sock)
{
        int rc = call_int_hook(socket_getsockname, 0, sock);

        return rc;
}

/* Run the socket_getpeername LSM hook on @sock and return its result. */
int security_socket_getpeername(struct socket *sock)
{
        int rc = call_int_hook(socket_getpeername, 0, sock);

        return rc;
}

/*
 * Run the socket_getsockopt LSM hook with @sock, @level and @optname and
 * return its result.
 */
int security_socket_getsockopt(struct socket *sock, int level, int optname)
{
        int rc = call_int_hook(socket_getsockopt, 0, sock, level, optname);

        return rc;
}

/*
 * Run the socket_setsockopt LSM hook with @sock, @level and @optname and
 * return its result.
 */
int security_socket_setsockopt(struct socket *sock, int level, int optname)
{
        int rc = call_int_hook(socket_setsockopt, 0, sock, level, optname);

        return rc;
}

/* Run the socket_shutdown LSM hook with @sock and @how; return its result. */
int security_socket_shutdown(struct socket *sock, int how)
{
        int rc = call_int_hook(socket_shutdown, 0, sock, how);

        return rc;
}

/*
 * Run the socket_sock_rcv_skb LSM hook with @sk and @skb and return its
 * result.
 */
int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int rc = call_int_hook(socket_sock_rcv_skb, 0, sk, skb);

        return rc;
}
EXPORT_SYMBOL(security_sock_rcv_skb);

/*
 * Walk the socket_getpeersec_stream hook list with @sock, @optval, @optlen
 * and @len and return the first result that differs from
 * LSM_RET_DEFAULT(socket_getpeersec_stream); the default is returned when no
 * hook produces a different value.
 */
int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
                                      sockptr_t optlen, unsigned int len)
{
        struct security_hook_list *entry;
        int rc = LSM_RET_DEFAULT(socket_getpeersec_stream);

        /*
         * Only one module will provide a security context.
         */
        hlist_for_each_entry(entry, &security_hook_heads.socket_getpeersec_stream,
                             list) {
                rc = entry->hook.socket_getpeersec_stream(sock, optval, optlen,
                                                          len);
                if (rc != LSM_RET_DEFAULT(socket_getpeersec_stream))
                        break;
        }
        return rc;
}

/*
 * Query the registered LSMs for the peer security id of a datagram.
 *
 * The socket_getpeersec_dgram hook list is walked in order; the first
 * module whose result differs from LSM_RET_DEFAULT(socket_getpeersec_dgram)
 * ends the walk and that result is returned. When the list is empty, or
 * every module answers with the default, the default value is returned.
 * Exported with EXPORT_SYMBOL().
 */
int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
{
        struct security_hook_list *entry;
        int ret = LSM_RET_DEFAULT(socket_getpeersec_dgram);

        /* Only one module is expected to supply a security context. */
        hlist_for_each_entry(entry,
                             &security_hook_heads.socket_getpeersec_dgram,
                             list) {
                ret = entry->hook.socket_getpeersec_dgram(sock, skb, secid);
                if (ret != LSM_RET_DEFAULT(socket_getpeersec_dgram))
                        break;
        }
        return ret;
}
EXPORT_SYMBOL(security_socket_getpeersec_dgram);

/*
 * Invoke the sk_alloc_security LSM hook with @sk, @family and @priority and
 * return the call_int_hook() result (0 is the macro's second argument -
 * presumably the default result; confirm against the macro).
 */
int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
{
        return call_int_hook(sk_alloc_security, 0, sk, family, priority);
}

/* Invoke the sk_free_security LSM hook for @sk; no value is returned. */
void security_sk_free(struct sock *sk)
{
        call_void_hook(sk_free_security, sk);
}

/*
 * Invoke the sk_clone_security LSM hook with the source socket @sk and the
 * new socket @newsk; no value is returned. Exported with EXPORT_SYMBOL().
 */
void security_sk_clone(const struct sock *sk, struct sock *newsk)
{
        call_void_hook(sk_clone_security, sk, newsk);
}
EXPORT_SYMBOL(security_sk_clone);

/*
 * Invoke the sk_getsecid LSM hook with @sk and the address of
 * @flic->flowic_secid (presumably so the hook can store the socket's secid
 * in the flow - confirm against the hook). No value is returned.
 * Exported with EXPORT_SYMBOL().
 */
void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic)
{
        call_void_hook(sk_getsecid, sk, &flic->flowic_secid);
}
EXPORT_SYMBOL(security_sk_classify_flow);

/*
 * Invoke the req_classify_flow LSM hook with @req and @flic; no value is
 * returned. Exported with EXPORT_SYMBOL().
 */
void security_req_classify_flow(const struct request_sock *req,
                                struct flowi_common *flic)
{
        call_void_hook(req_classify_flow, req, flic);
}
EXPORT_SYMBOL(security_req_classify_flow);

/*
 * Invoke the sock_graft LSM hook with @sk and @parent; no value is
 * returned. Exported with EXPORT_SYMBOL().
 */
void security_sock_graft(struct sock *sk, struct socket *parent)
{
        call_void_hook(sock_graft, sk, parent);
}
EXPORT_SYMBOL(security_sock_graft);

/*
 * Invoke the inet_conn_request LSM hook with @sk, @skb and @req and return
 * the call_int_hook() result (0 is the macro's second argument - presumably
 * the default result; confirm against the macro).
 * Exported with EXPORT_SYMBOL().
 */
int security_inet_conn_request(struct sock *sk,
                        struct sk_buff *skb, struct request_sock *req)
{
        return call_int_hook(inet_conn_request, 0, sk, skb, req);
}
EXPORT_SYMBOL(security_inet_conn_request);

/* Invoke the inet_csk_clone LSM hook with @newsk and @req; returns nothing. */
void security_inet_csk_clone(struct sock *newsk,
                        const struct request_sock *req)
{
        call_void_hook(inet_csk_clone, newsk, req);
}

/*
 * Invoke the inet_conn_established LSM hook with @sk and @skb; no value is
 * returned. Exported with EXPORT_SYMBOL().
 */
void security_inet_conn_established(struct sock *sk,
                        struct sk_buff *skb)
{
        call_void_hook(inet_conn_established, sk, skb);
}
EXPORT_SYMBOL(security_inet_conn_established);

/*
 * Invoke the secmark_relabel_packet LSM hook with @secid and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro). Exported with EXPORT_SYMBOL().
 */
int security_secmark_relabel_packet(u32 secid)
{
        return call_int_hook(secmark_relabel_packet, 0, secid);
}
EXPORT_SYMBOL(security_secmark_relabel_packet);

/*
 * Invoke the secmark_refcount_inc LSM hook (it takes no arguments); no
 * value is returned. Exported with EXPORT_SYMBOL().
 */
void security_secmark_refcount_inc(void)
{
        call_void_hook(secmark_refcount_inc);
}
EXPORT_SYMBOL(security_secmark_refcount_inc);

/*
 * Invoke the secmark_refcount_dec LSM hook (it takes no arguments); no
 * value is returned. Exported with EXPORT_SYMBOL().
 */
void security_secmark_refcount_dec(void)
{
        call_void_hook(secmark_refcount_dec);
}
EXPORT_SYMBOL(security_secmark_refcount_dec);

/*
 * Invoke the tun_dev_alloc_security LSM hook with the out-pointer @security
 * and return the call_int_hook() result (0 is the macro's second argument -
 * presumably the default result; confirm against the macro).
 * Exported with EXPORT_SYMBOL().
 */
int security_tun_dev_alloc_security(void **security)
{
        return call_int_hook(tun_dev_alloc_security, 0, security);
}
EXPORT_SYMBOL(security_tun_dev_alloc_security);

/*
 * Invoke the tun_dev_free_security LSM hook with @security; no value is
 * returned. Exported with EXPORT_SYMBOL().
 */
void security_tun_dev_free_security(void *security)
{
        call_void_hook(tun_dev_free_security, security);
}
EXPORT_SYMBOL(security_tun_dev_free_security);

/*
 * Invoke the tun_dev_create LSM hook (no hook arguments) and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro). Exported with EXPORT_SYMBOL().
 */
int security_tun_dev_create(void)
{
        return call_int_hook(tun_dev_create, 0);
}
EXPORT_SYMBOL(security_tun_dev_create);

/*
 * Invoke the tun_dev_attach_queue LSM hook with @security and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro). Exported with EXPORT_SYMBOL().
 */
int security_tun_dev_attach_queue(void *security)
{
        return call_int_hook(tun_dev_attach_queue, 0, security);
}
EXPORT_SYMBOL(security_tun_dev_attach_queue);

/*
 * Invoke the tun_dev_attach LSM hook with @sk and @security and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro). Exported with EXPORT_SYMBOL().
 */
int security_tun_dev_attach(struct sock *sk, void *security)
{
        return call_int_hook(tun_dev_attach, 0, sk, security);
}
EXPORT_SYMBOL(security_tun_dev_attach);

/*
 * Invoke the tun_dev_open LSM hook with @security and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro). Exported with EXPORT_SYMBOL().
 */
int security_tun_dev_open(void *security)
{
        return call_int_hook(tun_dev_open, 0, security);
}
EXPORT_SYMBOL(security_tun_dev_open);

/*
 * Invoke the sctp_assoc_request LSM hook with @ep and @skb and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro). Exported with EXPORT_SYMBOL().
 */
int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb)
{
        return call_int_hook(sctp_assoc_request, 0, ep, skb);
}
EXPORT_SYMBOL(security_sctp_assoc_request);

/*
 * Invoke the sctp_bind_connect LSM hook with @sk, @optname, @address and
 * @addrlen and return the call_int_hook() result (0 is the macro's second
 * argument - presumably the default result; confirm against the macro).
 * Exported with EXPORT_SYMBOL().
 */
int security_sctp_bind_connect(struct sock *sk, int optname,
                               struct sockaddr *address, int addrlen)
{
        return call_int_hook(sctp_bind_connect, 0, sk, optname,
                             address, addrlen);
}
EXPORT_SYMBOL(security_sctp_bind_connect);

/*
 * Invoke the sctp_sk_clone LSM hook with @ep, @sk and @newsk; no value is
 * returned. Exported with EXPORT_SYMBOL().
 */
void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk,
                            struct sock *newsk)
{
        call_void_hook(sctp_sk_clone, ep, sk, newsk);
}
EXPORT_SYMBOL(security_sctp_sk_clone);

#endif        /* CONFIG_SECURITY_NETWORK */

#ifdef CONFIG_SECURITY_INFINIBAND

/*
 * Invoke the ib_pkey_access LSM hook with @sec, @subnet_prefix and @pkey and
 * return the call_int_hook() result (0 is the macro's second argument -
 * presumably the default result; confirm against the macro).
 * Exported with EXPORT_SYMBOL().
 */
int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey)
{
        return call_int_hook(ib_pkey_access, 0, sec, subnet_prefix, pkey);
}
EXPORT_SYMBOL(security_ib_pkey_access);

/*
 * Invoke the ib_endport_manage_subnet LSM hook with @sec, @dev_name and
 * @port_num and return the call_int_hook() result (0 is the macro's second
 * argument - presumably the default result; confirm against the macro).
 * Exported with EXPORT_SYMBOL().
 */
int security_ib_endport_manage_subnet(void *sec, const char *dev_name, u8 port_num)
{
        return call_int_hook(ib_endport_manage_subnet, 0, sec, dev_name, port_num);
}
EXPORT_SYMBOL(security_ib_endport_manage_subnet);

/*
 * Invoke the ib_alloc_security LSM hook with the out-pointer @sec and
 * return the call_int_hook() result (0 is the macro's second argument -
 * presumably the default result; confirm against the macro).
 * Exported with EXPORT_SYMBOL().
 */
int security_ib_alloc_security(void **sec)
{
        return call_int_hook(ib_alloc_security, 0, sec);
}
EXPORT_SYMBOL(security_ib_alloc_security);

/*
 * Invoke the ib_free_security LSM hook with @sec; no value is returned.
 * Exported with EXPORT_SYMBOL().
 */
void security_ib_free_security(void *sec)
{
        call_void_hook(ib_free_security, sec);
}
EXPORT_SYMBOL(security_ib_free_security);
#endif        /* CONFIG_SECURITY_INFINIBAND */

#ifdef CONFIG_SECURITY_NETWORK_XFRM

/*
 * Invoke the xfrm_policy_alloc_security LSM hook with @ctxp, @sec_ctx and
 * @gfp and return the call_int_hook() result (0 is the macro's second
 * argument - presumably the default result; confirm against the macro).
 * Exported with EXPORT_SYMBOL().
 */
int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
                               struct xfrm_user_sec_ctx *sec_ctx,
                               gfp_t gfp)
{
        return call_int_hook(xfrm_policy_alloc_security, 0, ctxp, sec_ctx, gfp);
}
EXPORT_SYMBOL(security_xfrm_policy_alloc);

/*
 * Invoke the xfrm_policy_clone_security LSM hook with @old_ctx and the
 * out-pointer @new_ctxp and return the call_int_hook() result (0 is the
 * macro's second argument - presumably the default result; confirm).
 */
int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                              struct xfrm_sec_ctx **new_ctxp)
{
        return call_int_hook(xfrm_policy_clone_security, 0, old_ctx, new_ctxp);
}

/*
 * Invoke the xfrm_policy_free_security LSM hook with @ctx; no value is
 * returned. Exported with EXPORT_SYMBOL().
 */
void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
{
        call_void_hook(xfrm_policy_free_security, ctx);
}
EXPORT_SYMBOL(security_xfrm_policy_free);

/*
 * Invoke the xfrm_policy_delete_security LSM hook with @ctx and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
{
        return call_int_hook(xfrm_policy_delete_security, 0, ctx);
}

/*
 * Invoke the xfrm_state_alloc LSM hook with @x and @sec_ctx and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro). Exported with EXPORT_SYMBOL().
 */
int security_xfrm_state_alloc(struct xfrm_state *x,
                              struct xfrm_user_sec_ctx *sec_ctx)
{
        return call_int_hook(xfrm_state_alloc, 0, x, sec_ctx);
}
EXPORT_SYMBOL(security_xfrm_state_alloc);

/*
 * Invoke the xfrm_state_alloc_acquire LSM hook with @x, @polsec and @secid
 * and return the call_int_hook() result (0 is the macro's second argument -
 * presumably the default result; confirm against the macro).
 */
int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                      struct xfrm_sec_ctx *polsec, u32 secid)
{
        return call_int_hook(xfrm_state_alloc_acquire, 0, x, polsec, secid);
}

/*
 * Invoke the xfrm_state_delete_security LSM hook with @x and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro). Exported with EXPORT_SYMBOL().
 */
int security_xfrm_state_delete(struct xfrm_state *x)
{
        return call_int_hook(xfrm_state_delete_security, 0, x);
}
EXPORT_SYMBOL(security_xfrm_state_delete);

/* Invoke the xfrm_state_free_security LSM hook with @x; returns nothing. */
void security_xfrm_state_free(struct xfrm_state *x)
{
        call_void_hook(xfrm_state_free_security, x);
}

/*
 * Invoke the xfrm_policy_lookup LSM hook with @ctx, @fl_secid and @dir and
 * return the call_int_hook() result (0 is the macro's second argument -
 * presumably the default result; confirm against the macro).
 */
int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir)
{
        return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid, dir);
}

int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
                                       struct xfrm_policy *xp,
                                       const struct flowi_common *flic)
{
        struct security_hook_list *hp;
        int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match);

        /*
         * Since this function is expected to return 0 or 1, the judgment
         * becomes difficult if multiple LSMs supply this call. Fortunately,
         * we can use the first LSM's judgment because currently only SELinux
         * supplies this call.
         *
         * For speed optimization, we explicitly break the loop rather than
         * using the macro
         */
        hlist_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match,
                                list) {
                rc = hp->hook.xfrm_state_pol_flow_match(x, xp, flic);
                break;
        }
        return rc;
}

/*
 * Invoke the xfrm_decode_session LSM hook with @skb, @secid and a trailing
 * literal 1 as the hook's third argument (its meaning is not visible here;
 * security_skb_classify_flow() passes 0 in the same position). Returns the
 * call_int_hook() result; 0 is the macro's second argument - presumably the
 * default result, confirm against the macro.
 */
int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
{
        return call_int_hook(xfrm_decode_session, 0, skb, secid, 1);
}

/*
 * Run the xfrm_decode_session hook for @skb, handing it the address of
 * @flic->flowic_secid and 0 as the last hook argument. A non-zero hook
 * result is treated as fatal via BUG_ON(). Exported with EXPORT_SYMBOL().
 */
void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic)
{
        int err;

        err = call_int_hook(xfrm_decode_session, 0, skb,
                            &flic->flowic_secid, 0);

        /* This call site does not tolerate a failing hook. */
        BUG_ON(err);
}
EXPORT_SYMBOL(security_skb_classify_flow);

#endif        /* CONFIG_SECURITY_NETWORK_XFRM */

#ifdef CONFIG_KEYS

/*
 * Invoke the key_alloc LSM hook with @key, @cred and @flags and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_key_alloc(struct key *key, const struct cred *cred,
                       unsigned long flags)
{
        return call_int_hook(key_alloc, 0, key, cred, flags);
}

/* Invoke the key_free LSM hook with @key; no value is returned. */
void security_key_free(struct key *key)
{
        call_void_hook(key_free, key);
}

/*
 * Invoke the key_permission LSM hook with @key_ref, @cred and @need_perm and
 * return the call_int_hook() result (0 is the macro's second argument -
 * presumably the default result; confirm against the macro).
 */
int security_key_permission(key_ref_t key_ref, const struct cred *cred,
                            enum key_need_perm need_perm)
{
        return call_int_hook(key_permission, 0, key_ref, cred, need_perm);
}

/*
 * Invoke the key_getsecurity LSM hook with @key and @_buffer.
 *
 * *@_buffer is cleared to NULL before the hook runs, so the caller sees
 * NULL there unless a hook stores something. Returns the call_int_hook()
 * result (0 is the macro's second argument - presumably the default
 * result; confirm against the macro).
 */
int security_key_getsecurity(struct key *key, char **_buffer)
{
        *_buffer = NULL;
        return call_int_hook(key_getsecurity, 0, key, _buffer);
}

#endif        /* CONFIG_KEYS */

#ifdef CONFIG_AUDIT

/*
 * Invoke the audit_rule_init LSM hook with @field, @op, @rulestr and the
 * out-pointer @lsmrule and return the call_int_hook() result (0 is the
 * macro's second argument - presumably the default result; confirm).
 */
int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule)
{
        return call_int_hook(audit_rule_init, 0, field, op, rulestr, lsmrule);
}

/*
 * Invoke the audit_rule_known LSM hook with @krule and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_audit_rule_known(struct audit_krule *krule)
{
        return call_int_hook(audit_rule_known, 0, krule);
}

/* Invoke the audit_rule_free LSM hook with @lsmrule; returns nothing. */
void security_audit_rule_free(void *lsmrule)
{
        call_void_hook(audit_rule_free, lsmrule);
}

/*
 * Invoke the audit_rule_match LSM hook with @secid, @field, @op and
 * @lsmrule and return the call_int_hook() result (0 is the macro's second
 * argument - presumably the default result; confirm against the macro).
 */
int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
{
        return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule);
}
#endif /* CONFIG_AUDIT */

#ifdef CONFIG_BPF_SYSCALL
/*
 * Invoke the bpf LSM hook with @cmd, @attr and @size and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        return call_int_hook(bpf, 0, cmd, attr, size);
}
/*
 * Invoke the bpf_map LSM hook with @map and @fmode and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        return call_int_hook(bpf_map, 0, map, fmode);
}
/*
 * Invoke the bpf_prog LSM hook with @prog and return the call_int_hook()
 * result (0 is the macro's second argument - presumably the default result;
 * confirm against the macro).
 */
int security_bpf_prog(struct bpf_prog *prog)
{
        return call_int_hook(bpf_prog, 0, prog);
}
/*
 * Invoke the bpf_map_alloc_security LSM hook with @map and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_bpf_map_alloc(struct bpf_map *map)
{
        return call_int_hook(bpf_map_alloc_security, 0, map);
}
/*
 * Invoke the bpf_prog_alloc_security LSM hook with @aux and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
{
        return call_int_hook(bpf_prog_alloc_security, 0, aux);
}
/* Invoke the bpf_map_free_security LSM hook with @map; returns nothing. */
void security_bpf_map_free(struct bpf_map *map)
{
        call_void_hook(bpf_map_free_security, map);
}
/* Invoke the bpf_prog_free_security LSM hook with @aux; returns nothing. */
void security_bpf_prog_free(struct bpf_prog_aux *aux)
{
        call_void_hook(bpf_prog_free_security, aux);
}
#endif /* CONFIG_BPF_SYSCALL */

/*
 * Invoke the locked_down LSM hook with the lockdown reason @what and return
 * the call_int_hook() result (0 is the macro's second argument - presumably
 * the default result; confirm against the macro).
 * Exported with EXPORT_SYMBOL().
 */
int security_locked_down(enum lockdown_reason what)
{
        return call_int_hook(locked_down, 0, what);
}
EXPORT_SYMBOL(security_locked_down);

#ifdef CONFIG_PERF_EVENTS
/*
 * Invoke the perf_event_open LSM hook with @attr and @type and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_perf_event_open(struct perf_event_attr *attr, int type)
{
        return call_int_hook(perf_event_open, 0, attr, type);
}

/*
 * Invoke the perf_event_alloc LSM hook with @event and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_perf_event_alloc(struct perf_event *event)
{
        return call_int_hook(perf_event_alloc, 0, event);
}

/* Invoke the perf_event_free LSM hook with @event; returns nothing. */
void security_perf_event_free(struct perf_event *event)
{
        call_void_hook(perf_event_free, event);
}

/*
 * Invoke the perf_event_read LSM hook with @event and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_perf_event_read(struct perf_event *event)
{
        return call_int_hook(perf_event_read, 0, event);
}

/*
 * Invoke the perf_event_write LSM hook with @event and return the
 * call_int_hook() result (0 is the macro's second argument - presumably the
 * default result; confirm against the macro).
 */
int security_perf_event_write(struct perf_event *event)
{
        return call_int_hook(perf_event_write, 0, event);
}
#endif /* CONFIG_PERF_EVENTS */

















































    1 











    1 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_WORD_AT_A_TIME_H
#define _ASM_WORD_AT_A_TIME_H

#include <linux/kernel.h>

/*
 * This is largely generic for little-endian machines, but the
 * optimal byte mask counting is probably going to be something
 * that is architecture-specific. If you have a reliably fast
 * bit count instruction, that might be better than the multiply
 * and shift, for example.
 */
/*
 * Constants consumed by has_zero(): @one_bits is subtracted from the word
 * under test and @high_bits masks the result.
 */
struct word_at_a_time {
        const unsigned long one_bits, high_bits;
};

/* Initializer for struct word_at_a_time: one_bits = 0x01 and high_bits = 0x80 in every byte (via REPEAT_BYTE). */
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }

#ifdef CONFIG_64BIT

/*
 * 64-bit variant.
 *
 * Jan Achrenius on G+: a micro-optimized form of the simpler
 * "(mask & ONEBYTES) * ONEBYTES >> 56" which accepts the bytemask
 * as-is, without masking it first.
 */
static inline long count_masked_bytes(unsigned long mask)
{
        const unsigned long multiplier = 0x0001020304050608ul;

        /* The count ends up in the top byte of the product. */
        return (mask * multiplier) >> 56;
}

#else        /* 32-bit case */

/*
 * 32-bit variant (Carl Chatfield / Jan Achrenius, G+).
 *
 * Maps the bytemasks (000000 0000ff 00ffff ffffff) to ( 1 1 2 3 ) with an
 * add and a shift, then ANDs with the mask itself so that the all-zero
 * mask yields 0 instead of 1.
 */
static inline long count_masked_bytes(long mask)
{
        long biased = (mask + 0x0ff0001) >> 23;

        return mask & biased;
}

#endif

/*
 * Test whether the word @a contains a zero byte.
 *
 * Computes ((a - c->one_bits) & ~a) & c->high_bits, stores that mask in
 * *@bits and also returns it; the result is nonzero if @a has a zero byte.
 */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
        unsigned long lowered = a - c->one_bits;
        unsigned long found = lowered & ~a & c->high_bits;

        *bits = found;
        return found;
}

/*
 * Pass-through step: returns @bits unchanged. @a and @c are accepted but
 * not used by this implementation.
 */
static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
        return bits;
}

/*
 * Turn the marker word @bits into a bytemask.
 *
 * (bits - 1) & ~bits keeps exactly the bits below the lowest set bit of
 * @bits; shifting that right by 7 gives the returned mask. For a lowest
 * marker of 0x80 in byte k this is 0xff in each of the k bytes below it.
 */
static inline unsigned long create_zero_mask(unsigned long bits)
{
        const unsigned long below_lowest = (bits - 1) & ~bits;

        return below_lowest >> 7;
}

/*
 * The mask we created is directly usable as a bytemask, so this macro
 * expands to its argument unchanged.
 */
#define zero_bytemask(mask) (mask)

/*
 * Returns count_masked_bytes(@mask). NOTE(review): @mask is presumably the
 * bytemask produced by create_zero_mask(), making the result the index of
 * the first zero byte - confirm against the callers.
 */
static inline unsigned long find_zero(unsigned long mask)
{
        return count_masked_bytes(mask);
}

/*
 * Load an unaligned word from kernel space.
 *
 * In the (very unlikely) case of the word being a page-crosser
 * and the next page not being mapped, take the exception and
 * return zeroes in the non-existing part.
 *
 * @addr: address to read one unsigned long from.
 *
 * Asm operands: %0 = ret, %1 = dummy (pinned to the c register by "=&c"),
 * %2 = the memory operand *addr, %3 = -sizeof(unsigned long),
 * %4 = sizeof(unsigned long) - 1.
 *
 * Label 1 is the plain load. The code at label 3 lives in the .fixup
 * section and is paired with label 1 by _ASM_EXTABLE(1b, 3b): it ANDs the
 * address with %3 to round it down to a word boundary, loads that aligned
 * word, then shifts it right by (address & %4) * 8 bits and jumps back to
 * label 2. NOTE(review): that a fault at label 1 is redirected to label 3
 * is inferred from the macro name and the comment above - confirm against
 * the _ASM_EXTABLE definition.
 */
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
        unsigned long ret, dummy;

        asm(
                "1:\tmov %2,%0\n"
                "2:\n"
                ".section .fixup,\"ax\"\n"
                "3:\t"
                "lea %2,%1\n\t"
                "and %3,%1\n\t"
                "mov (%1),%0\n\t"
                "leal %2,%%ecx\n\t"
                "andl %4,%%ecx\n\t"
                "shll $3,%%ecx\n\t"
                "shr %%cl,%0\n\t"
                "jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                :"=&r" (ret),"=&c" (dummy)
                :"m" (*(unsigned long *)addr),
                 "i" (-sizeof(unsigned long)),
                 "i" (sizeof(unsigned long)-1));
        return ret;
}

#endif /* _ASM_WORD_AT_A_TIME_H */

























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
// SPDX-License-Identifier: GPL-2.0
/*
 *  hrtimers - High-resolution kernel timers
 *
 *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
 *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
 *
 *  data type definitions, declarations, prototypes
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 */
#ifndef _LINUX_HRTIMER_H
#define _LINUX_HRTIMER_H

#include <linux/hrtimer_defs.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/seqlock.h>
#include <linux/timer.h>
#include <linux/timerqueue.h>

struct hrtimer_clock_base;
struct hrtimer_cpu_base;

/*
 * Mode arguments of xxx_hrtimer functions:
 *
 * HRTIMER_MODE_ABS                - Time value is absolute
 * HRTIMER_MODE_REL                - Time value is relative to now
 * HRTIMER_MODE_PINNED                - Timer is bound to CPU (is only considered
 *                                  when starting the timer)
 * HRTIMER_MODE_SOFT                - Timer callback function will be executed in
 *                                  soft irq context
 * HRTIMER_MODE_HARD                - Timer callback function will be executed in
 *                                  hard irq context even on PREEMPT_RT.
 */
enum hrtimer_mode {
        /* Base flags. ABS has the value 0, i.e. it is the absence of REL. */
        HRTIMER_MODE_ABS        = 0x00,
        HRTIMER_MODE_REL        = 0x01,
        HRTIMER_MODE_PINNED        = 0x02,
        HRTIMER_MODE_SOFT        = 0x04,
        HRTIMER_MODE_HARD        = 0x08,

        /* Everything below is a pre-combined OR of the base flags above. */
        HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
        HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,

        HRTIMER_MODE_ABS_SOFT        = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT,
        HRTIMER_MODE_REL_SOFT        = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT,

        HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
        HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,

        HRTIMER_MODE_ABS_HARD        = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
        HRTIMER_MODE_REL_HARD        = HRTIMER_MODE_REL | HRTIMER_MODE_HARD,

        HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
        HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
};

/*
 * Return values for the callback function
 */
/* This is the return type of the callback stored in struct hrtimer::function. */
enum hrtimer_restart {
        HRTIMER_NORESTART,        /* Timer is not restarted */
        HRTIMER_RESTART,        /* Timer must be restarted */
};

/*
 * Values to track state of the timer
 *
 * Possible states:
 *
 * 0x00                inactive
 * 0x01                enqueued into rbtree
 *
 * The callback state is not part of the timer->state because clearing it would
 * mean touching the timer after the callback, this makes it impossible to free
 * the timer from the callback function.
 *
 * Therefore we track the callback state in:
 *
 *        timer->base->cpu_base->running == timer
 *
 * On SMP it is possible to have a "callback function running and enqueued"
 * status. It happens for example when a posix timer expired and the callback
 * queued a signal. Between dropping the lock which protects the posix timer
 * and reacquiring the base lock of the hrtimer, another CPU can deliver the
 * signal and rearm the timer.
 *
 * All state transitions are protected by cpu_base->lock.
 */
/* Values of hrtimer::state: 0x00 = inactive, 0x01 = enqueued into the rbtree. */
#define HRTIMER_STATE_INACTIVE        0x00
#define HRTIMER_STATE_ENQUEUED        0x01

/**
 * struct hrtimer - the basic hrtimer structure
 * @node:        timerqueue node, which also manages node.expires,
 *                the absolute expiry time in the hrtimers internal
 *                representation. The time is related to the clock on
 *                which the timer is based. It is set up by adding
 *                slack to the _softexpires value. For non-range timers
 *                it is identical to _softexpires.
 * @_softexpires: the absolute earliest expiry time of the hrtimer.
 *                The time which was given as expiry time when the timer
 *                was armed.
 * @function:        timer expiry callback function
 * @base:        pointer to the timer base (per cpu and per clock)
 * @state:        state information (See bit values above)
 * @is_rel:        Set if the timer was armed relative
 * @is_soft:        Set if hrtimer will be expired in soft interrupt context.
 * @is_hard:        Set if hrtimer will be expired in hard interrupt context
 *                even on RT.
 *
 * The hrtimer structure must be initialized by hrtimer_init()
 */
struct hrtimer {
        /* node.expires holds the hard expiry (soft expiry plus slack) */
        struct timerqueue_node                node;
        /* earliest expiry, as given when the timer was armed */
        ktime_t                                _softexpires;
        enum hrtimer_restart                (*function)(struct hrtimer *);
        struct hrtimer_clock_base        *base;
        /* HRTIMER_STATE_INACTIVE or HRTIMER_STATE_ENQUEUED */
        u8                                state;
        /* the three flags below are stored as one byte each */
        u8                                is_rel;
        u8                                is_soft;
        u8                                is_hard;
};

/**
 * struct hrtimer_sleeper - simple sleeper structure
 * @timer:        embedded timer structure
 * @task:        task to wake up
 *
 * task is set to NULL, when the timer expires.
 */
struct hrtimer_sleeper {
        /* embedded timer */
        struct hrtimer timer;
        /* task to wake up; set to NULL when the timer expires */
        struct task_struct *task;
};

/*
 * Alignment attribute applied to struct hrtimer_clock_base:
 * ____cacheline_aligned on 64-bit builds, nothing otherwise.
 */
#ifdef CONFIG_64BIT
# define __hrtimer_clock_base_align        ____cacheline_aligned
#else
# define __hrtimer_clock_base_align
#endif

/**
 * struct hrtimer_clock_base - the timer base for a specific clock
 * @cpu_base:                per cpu clock base
 * @index:                clock type index for per_cpu support when moving a
 *                        timer to a base on another cpu.
 * @clockid:                clock id for per_cpu support
 * @seq:                seqcount around __run_hrtimer
 * @running:                pointer to the currently running hrtimer
 * @active:                red black tree root node for the active timers
 * @get_time:                function to retrieve the current time of the clock
 * @offset:                offset of this clock to the monotonic base
 */
struct hrtimer_clock_base {
        /* back-pointer to the owning per-CPU base */
        struct hrtimer_cpu_base        *cpu_base;
        unsigned int                index;
        clockid_t                clockid;
        /* seqcount around __run_hrtimer */
        seqcount_raw_spinlock_t        seq;
        struct hrtimer                *running;
        /* queue of the active timers of this clock */
        struct timerqueue_head        active;
        /* reads the current time of this clock */
        ktime_t                        (*get_time)(void);
        ktime_t                        offset;
} __hrtimer_clock_base_align;

/*
 * Identifiers of the clock bases: four clocks (MONOTONIC, REALTIME,
 * BOOTTIME, TAI), each followed later by a _SOFT counterpart.
 * HRTIMER_MAX_CLOCK_BASES is the number of entries and sizes
 * hrtimer_cpu_base::clock_base[].
 */
enum  hrtimer_base_type {
        HRTIMER_BASE_MONOTONIC,
        HRTIMER_BASE_REALTIME,
        HRTIMER_BASE_BOOTTIME,
        HRTIMER_BASE_TAI,
        HRTIMER_BASE_MONOTONIC_SOFT,
        HRTIMER_BASE_REALTIME_SOFT,
        HRTIMER_BASE_BOOTTIME_SOFT,
        HRTIMER_BASE_TAI_SOFT,
        HRTIMER_MAX_CLOCK_BASES,
};

/**
 * struct hrtimer_cpu_base - the per cpu clock bases
 * @lock:                lock protecting the base and associated clock bases
 *                        and timers
 * @cpu:                cpu number
 * @active_bases:        Bitfield to mark bases with active timers
 * @clock_was_set_seq:        Sequence counter of clock was set events
 * @hres_active:        State of high resolution mode
 * @in_hrtirq:                hrtimer_interrupt() is currently executing
 * @hang_detected:        The last hrtimer interrupt detected a hang
 * @softirq_activated:        indicates whether the softirq has been raised; if so,
 *                        softirq related settings do not need to be updated.
 * @nr_events:                Total number of hrtimer interrupt events
 * @nr_retries:                Total number of hrtimer interrupt retries
 * @nr_hangs:                Total number of hrtimer interrupt hangs
 * @max_hang_time:        Maximum time spent in hrtimer_interrupt
 * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are
 *                         expired
 * @online:                CPU is online from an hrtimers point of view
 * @timer_waiters:        A hrtimer_cancel() invocation waits for the timer
 *                        callback to finish.
 * @expires_next:        absolute time of the next event, is required for remote
 *                        hrtimer enqueue; it is the total first expiry time (hard
 *                        and soft hrtimer are taken into account)
 * @next_timer:                Pointer to the first expiring timer
 * @softirq_expires_next: Time to check whether the soft queues also need to be expired
 * @softirq_next_timer: Pointer to the first expiring softirq based timer
 * @clock_base:                array of clock bases for this cpu
 *
 * Note: next_timer is just an optimization for __remove_hrtimer().
 *         Do not dereference the pointer because it is not reliable on
 *         cross cpu removals.
 */
struct hrtimer_cpu_base {
        raw_spinlock_t                        lock;
        unsigned int                        cpu;
        unsigned int                        active_bases;
        unsigned int                        clock_was_set_seq;
        /* five single-bit state flags packed into one unsigned int */
        unsigned int                        hres_active                : 1,
                                        in_hrtirq                : 1,
                                        hang_detected                : 1,
                                        softirq_activated       : 1,
                                        online                        : 1;
#ifdef CONFIG_HIGH_RES_TIMERS
        /* interrupt statistics, only present with CONFIG_HIGH_RES_TIMERS */
        unsigned int                        nr_events;
        unsigned short                        nr_retries;
        unsigned short                        nr_hangs;
        unsigned int                        max_hang_time;
#endif
#ifdef CONFIG_PREEMPT_RT
        /* only present with CONFIG_PREEMPT_RT */
        spinlock_t                        softirq_expiry_lock;
        atomic_t                        timer_waiters;
#endif
        ktime_t                                expires_next;
        /* optimization hint only; must not be dereferenced (see Note above) */
        struct hrtimer                        *next_timer;
        ktime_t                                softirq_expires_next;
        struct hrtimer                        *softirq_next_timer;
        /* one entry per enum hrtimer_base_type value */
        struct hrtimer_clock_base        clock_base[HRTIMER_MAX_CLOCK_BASES];
} ____cacheline_aligned;

/*
 * Set both the hard expiry (node.expires) and the soft expiry
 * (_softexpires) of @timer to @time, i.e. an expiry without any range.
 */
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
        timer->node.expires = time;
        timer->_softexpires = time;
}

/*
 * Set the soft expiry of @timer to @time and the hard expiry to
 * ktime_add_safe(@time, @delta).
 * NOTE(review): ktime_add_safe() presumably clamps on overflow - confirm.
 */
static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
{
        timer->_softexpires = time;
        timer->node.expires = ktime_add_safe(time, delta);
}

/*
 * Same as hrtimer_set_expires_range(), but the range @delta is given as a
 * u64 and converted with ns_to_ktime() before being added to @time.
 */
static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta)
{
        timer->_softexpires = time;
        timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta));
}

/*
 * Set both the hard and the soft expiry of @timer from the raw s64 value
 * @tv64, assigned without any conversion.
 */
static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
{
        timer->node.expires = tv64;
        timer->_softexpires = tv64;
}

/*
 * Move both the hard and the soft expiry of @timer forward by @time, each
 * through ktime_add_safe().
 */
static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
{
        timer->node.expires = ktime_add_safe(timer->node.expires, time);
        timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
}

/*
 * Move both the hard and the soft expiry of @timer forward by @ns.
 * Note this uses ktime_add_ns(), not the ktime_add_safe() used by
 * hrtimer_add_expires().
 */
static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
{
        timer->node.expires = ktime_add_ns(timer->node.expires, ns);
        timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
}

/* Return the hard expiry time (node.expires) of @timer. */
static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
{
        return timer->node.expires;
}

/* Return the soft (earliest) expiry time (_softexpires) of @timer. */
static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
{
        return timer->_softexpires;
}

/* Return the hard expiry time of @timer as a plain s64. */
static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
{
        return timer->node.expires;
}
/* Return the soft expiry time of @timer as a plain s64. */
static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
{
        return timer->_softexpires;
}

/* Return the hard expiry time of @timer converted with ktime_to_ns(). */
static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
{
        return ktime_to_ns(timer->node.expires);
}

/*
 * Return the hard expiry time of @timer minus the current time of its
 * clock base (timer->base->get_time()).
 */
static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
{
        return ktime_sub(timer->node.expires, timer->base->get_time());
}

static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
{
        return timer->base->get_time();
}

static inline int hrtimer_is_hres_active(struct hrtimer *timer)
{
        return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
                timer->base->cpu_base->hres_active : 0;
}

/*
 * hrtimer_resolution is a real variable when CONFIG_HIGH_RES_TIMERS is set
 * and the constant LOW_RES_NSEC (cast to unsigned int) otherwise.
 * hrtimer_interrupt() is only declared in the high resolution case.
 */
#ifdef CONFIG_HIGH_RES_TIMERS
struct clock_event_device;

extern void hrtimer_interrupt(struct clock_event_device *dev);

extern unsigned int hrtimer_resolution;

#else

#define hrtimer_resolution        (unsigned int)LOW_RES_NSEC

#endif

/*
 * Return the time left until the hard expiry of @timer, measured from @now.
 *
 * With CONFIG_TIME_LOW_RES enabled, relative timers (timer->is_rel) have
 * hrtimer_resolution subtracted from the result, undoing the extra that
 * hrtimer_start_range_ns() added to prevent short timeouts.
 */
static inline ktime_t
__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now)
{
        ktime_t remaining = ktime_sub(timer->node.expires, now);

        /* Nothing to compensate for unless this is a low-res relative timer. */
        if (!IS_ENABLED(CONFIG_TIME_LOW_RES) || !timer->is_rel)
                return remaining;

        return remaining - hrtimer_resolution;
}

/*
 * Convenience wrapper: calls __hrtimer_expires_remaining_adjusted() with
 * the current time of the timer's own clock base as "now".
 */
static inline ktime_t
hrtimer_expires_remaining_adjusted(const struct hrtimer *timer)
{
        return __hrtimer_expires_remaining_adjusted(timer,
                                                    timer->base->get_time());
}

/*
 * timerfd_clock_was_set() is provided elsewhere when CONFIG_TIMERFD is set;
 * without it the call compiles to an empty inline stub.
 */
#ifdef CONFIG_TIMERFD
extern void timerfd_clock_was_set(void);
#else
static inline void timerfd_clock_was_set(void) { }
#endif
/* Defined outside this header. */
extern void hrtimers_resume(void);

/* Declares the per-CPU variable tick_cpu_device of type struct tick_device. */
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);

/*
 * hrtimer_cancel_wait_running() has an out-of-line implementation on
 * PREEMPT_RT. Otherwise it is an inline that merely executes cpu_relax()
 * and does not look at @timer.
 *
 * Both variants take a const pointer so that a caller holding a
 * const struct hrtimer * builds the same way in either configuration.
 */
#ifdef CONFIG_PREEMPT_RT
void hrtimer_cancel_wait_running(const struct hrtimer *timer);
#else
static inline void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
        cpu_relax();
}
#endif

/* Exported timer functions: */

/*
 * Initialize timers: hrtimer_init() is the required initializer for a
 * struct hrtimer; hrtimer_init_sleeper() takes a struct hrtimer_sleeper
 * with the same clock/mode arguments. Both are defined outside this header.
 */
extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
                         enum hrtimer_mode mode);
extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
                                 enum hrtimer_mode mode);

/*
 * On-stack variants. With CONFIG_DEBUG_OBJECTS_TIMERS they are real
 * functions defined elsewhere; without it the init helpers simply forward
 * to hrtimer_init() / hrtimer_init_sleeper() and
 * destroy_hrtimer_on_stack() is an empty stub.
 */
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
                                  enum hrtimer_mode mode);
extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
                                          clockid_t clock_id,
                                          enum hrtimer_mode mode);

extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
#else
/* Plain forwarder to hrtimer_init(). */
static inline void hrtimer_init_on_stack(struct hrtimer *timer,
                                         clockid_t which_clock,
                                         enum hrtimer_mode mode)
{
        hrtimer_init(timer, which_clock, mode);
}

/* Plain forwarder to hrtimer_init_sleeper(). */
static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
                                                 clockid_t clock_id,
                                                 enum hrtimer_mode mode)
{
        hrtimer_init_sleeper(sl, clock_id, mode);
}

/* Nothing to tear down in this configuration. */
static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
#endif

/* Basic timer operations: */
/*
 * Start @timer with expiry @tim, an additional @range_ns and the given
 * @mode.  The inline start helpers in this header are wrappers around this
 * function.
 */
extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                   u64 range_ns, const enum hrtimer_mode mode);

/**
 * hrtimer_start - (re)start an hrtimer
 * @timer:        the timer to be added
 * @tim:        expiry time
 * @mode:        timer mode: absolute (HRTIMER_MODE_ABS) or
 *                relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
 *                softirq based mode is considered for debug purpose only!
 *
 * Convenience wrapper around hrtimer_start_range_ns() with a range of 0 ns.
 */
static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
                                 const enum hrtimer_mode mode)
{
        /* A range of 0 means no slack is passed on for the expiry. */
        hrtimer_start_range_ns(timer, tim, 0, mode);
}

/*
 * Cancellation entry points.  Only the prototypes are visible here; the
 * meaning of the int return value is defined by the implementation.
 */
extern int hrtimer_cancel(struct hrtimer *timer);
extern int hrtimer_try_to_cancel(struct hrtimer *timer);

/*
 * Start @timer from the expiry values already stored in it: the soft expiry
 * becomes the start time and the distance from soft to hard expiry (in
 * nanoseconds) becomes the range handed to hrtimer_start_range_ns().
 */
static inline void hrtimer_start_expires(struct hrtimer *timer,
                                         enum hrtimer_mode mode)
{
        ktime_t soft_exp = hrtimer_get_softexpires(timer);
        ktime_t hard_exp = hrtimer_get_expires(timer);
        u64 range_ns = ktime_to_ns(ktime_sub(hard_exp, soft_exp));

        hrtimer_start_range_ns(timer, soft_exp, range_ns, mode);
}

/*
 * Sleeper counterpart of hrtimer_start_expires().
 * NOTE(review): presumably starts the sleeper's timer from its stored expiry
 * values - confirm against the implementation.
 */
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
                                   enum hrtimer_mode mode);

/*
 * Restart @timer from the expiry values already stored in it.  The timer is
 * always started in absolute mode (HRTIMER_MODE_ABS).
 */
static inline void hrtimer_restart(struct hrtimer *timer)
{
        hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}

/* Query timers: */
/*
 * Out-of-line worker; @adjust selects whether an adjustment is applied to
 * the result (details live in the implementation).
 */
extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);

/* Remaining time of @timer without adjustment (adjust == false). */
static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
{
        return __hrtimer_get_remaining(timer, false);
}

/*
 * Next-event queries.  NOTE(review): the u64 results are presumably expiry
 * times in nanoseconds - confirm against the implementation.
 */
extern u64 hrtimer_get_next_event(void);
extern u64 hrtimer_next_event_without(const struct hrtimer *exclude);

/* Activity check; takes a const pointer, i.e. @timer is not modified. */
extern bool hrtimer_active(const struct hrtimer *timer);

/**
 * hrtimer_is_queued = check, whether the timer is on one of the queues
 * @timer:        Timer to check
 *
 * Returns: True if the timer is queued, false otherwise
 *
 * The function can be used lockless, but it gives only a current snapshot.
 */
static inline bool hrtimer_is_queued(struct hrtimer *timer)
{
        /* The READ_ONCE pairs with the update functions of timer->state */
        return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED);
}

/*
 * Returns non-zero when @timer is the timer its clock base currently
 * records as running, i.e. its callback function is being executed.
 */
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
        return timer == timer->base->running;
}

/*
 * Forward a hrtimer so it expires after now.  @now is supplied by the
 * caller; hrtimer_forward_now() is the variant that reads it from the
 * timer's clock base.
 */
extern u64
hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);

/**
 * hrtimer_forward_now - forward the timer expiry so it expires after now
 * @timer:        hrtimer to forward
 * @interval:        the interval to forward
 *
 * Samples the current time of the timer's clock base and forwards the
 * expiry of @timer past it via hrtimer_forward(). Returns the number of
 * overruns.
 *
 * Can be safely called from the callback function of @timer. If
 * called from other contexts @timer must neither be enqueued nor
 * running the callback and the caller needs to take care of
 * serialization.
 *
 * Note: This only updates the timer expiry value and does not requeue
 * the timer.
 */
static inline u64 hrtimer_forward_now(struct hrtimer *timer,
                                      ktime_t interval)
{
        ktime_t now = timer->base->get_time();

        return hrtimer_forward(timer, now, interval);
}

/* Precise sleep: */

extern int nanosleep_copyout(struct restart_block *, struct timespec64 *);
extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
                              const clockid_t clockid);

/*
 * The schedule_hrtimeout*() variants differ only in which of @delta and
 * @clock_id the caller can supply; all take the expiry by pointer.
 */
extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
                                    const enum hrtimer_mode mode);
extern int schedule_hrtimeout_range_clock(ktime_t *expires,
                                          u64 delta,
                                          const enum hrtimer_mode mode,
                                          clockid_t clock_id);
extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);

/* Soft interrupt function to run the hrtimer queues: */
extern void hrtimer_run_queues(void);

/* Bootup initialization: */
extern void __init hrtimers_init(void);

/* Show pending timers: */
extern void sysrq_timer_list_show(void);

/* Per-CPU setup hook; takes the CPU number. */
int hrtimers_prepare_cpu(unsigned int cpu);
/*
 * hrtimers_cpu_dying() only exists with CONFIG_HOTPLUG_CPU.  Otherwise the
 * name expands to NULL - presumably so it can still be used where a callback
 * pointer is expected; verify against the users.
 */
#ifdef CONFIG_HOTPLUG_CPU
int hrtimers_cpu_dying(unsigned int cpu);
#else
#define hrtimers_cpu_dying        NULL
#endif

#endif














































































































































































    1 










    1 










































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Percpu refcounts:
 * (C) 2012 Google, Inc.
 * Author: Kent Overstreet <koverstreet@google.com>
 *
 * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
 * atomic_dec_and_test() - but percpu.
 *
 * There's one important difference between percpu refs and normal atomic_t
 * refcounts; you have to keep track of your initial refcount, and then when you
 * start shutting down you call percpu_ref_kill() _before_ dropping the initial
 * refcount.
 *
 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
 * than an atomic_t - this is because of the way shutdown works, see
 * percpu_ref_kill()/PERCPU_COUNT_BIAS.
 *
 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
 * puts the ref back in single atomic_t mode, collecting the per cpu refs and
 * issuing the appropriate barriers, and then marks the ref as shutting down so
 * that percpu_ref_put() will check for the ref hitting 0.  After it returns,
 * it's safe to drop the initial ref.
 *
 * USAGE:
 *
 * See fs/aio.c for some example usage; it's used there for struct kioctx, which
 * is created when userspace calls io_setup(), and destroyed when userspace
 * calls io_destroy() or the process exits.
 *
 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
 * removes the kioctx from the process's table of kioctxs and kills percpu_ref.
 * After that, there can't be any new users of the kioctx (from lookup_ioctx())
 * and it's then safe to drop the initial ref with percpu_ref_put().
 *
 * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
 * to synchronize with RCU protected lookup_ioctx().  percpu_ref operations don't
 * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
 * with RCU protection, it must be done explicitly.
 *
 * Code that does a two stage shutdown like this often needs some kind of
 * explicit synchronization to ensure the initial refcount can only be dropped
 * once.  percpu_ref_kill() returns void and must be called precisely once, so
 * it does not provide that synchronization by itself; the code needs some
 * other mechanism to synchronize teardown.
 */

#ifndef _LINUX_PERCPU_REFCOUNT_H
#define _LINUX_PERCPU_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/gfp.h>

struct percpu_ref;
/* Callback type of the release and confirm hooks; receives the ref itself. */
typedef void (percpu_ref_func_t)(struct percpu_ref *);

/* flags set in the lower bits of percpu_ref->percpu_count_ptr */
enum {
        __PERCPU_REF_ATOMIC        = 1LU << 0,        /* operating in atomic mode */
        __PERCPU_REF_DEAD        = 1LU << 1,        /* (being) killed */
        /* both flags combined; tested as one mask by __ref_is_percpu() */
        __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,

        /* number of low bits of the pointer value used for the flags above */
        __PERCPU_REF_FLAG_BITS        = 2,
};

/* @flags for percpu_ref_init(); each flag occupies its own bit */
enum {
        /*
         * Start w/ ref == 1 in atomic mode.  Can be switched to percpu
         * operation using percpu_ref_switch_to_percpu().  If initialized
         * with this flag, the ref will stay in atomic mode until
         * percpu_ref_switch_to_percpu() is invoked on it.
         * Implies ALLOW_REINIT.
         */
        PERCPU_REF_INIT_ATOMIC        = 1 << 0,

        /*
         * Start dead w/ ref == 0 in atomic mode.  Must be revived with
         * percpu_ref_reinit() before used.  Implies INIT_ATOMIC and
         * ALLOW_REINIT.
         */
        PERCPU_REF_INIT_DEAD        = 1 << 1,

        /*
         * Allow switching from atomic mode to percpu mode.
         * (Implied by both INIT_ATOMIC and INIT_DEAD, see above.)
         */
        PERCPU_REF_ALLOW_REINIT        = 1 << 2,
};

/*
 * Slow-path state of a percpu_ref, kept out of struct percpu_ref itself and
 * reached through percpu_ref->data.
 */
struct percpu_ref_data {
        /* shared counter used by get/put whenever the ref is not in percpu mode */
        atomic_long_t                count;
        /* called by percpu_ref_put_many() when @count drops to zero */
        percpu_ref_func_t        *release;
        /* NOTE(review): presumably the pending confirm callback of a mode switch - confirm */
        percpu_ref_func_t        *confirm_switch;
        /* NOTE(review): presumably mirror PERCPU_REF_INIT_ATOMIC / ALLOW_REINIT - confirm */
        bool                        force_atomic:1;
        bool                        allow_reinit:1;
        /* RCU callback head; how it is used is not visible in this header */
        struct rcu_head                rcu;
        /* NOTE(review): presumably back-pointer to the owning percpu_ref - confirm */
        struct percpu_ref        *ref;
};

/*
 * The part of a percpu refcount that users embed: one tagged pointer for the
 * fast path plus a pointer to the remaining state.
 */
struct percpu_ref {
        /*
         * The low bit of the pointer indicates whether the ref is in percpu
         * mode; if set, then get/put will manipulate the atomic_t.
         *
         * I.e. the value is the percpu counter pointer with the
         * __PERCPU_REF_ATOMIC / __PERCPU_REF_DEAD flags in its low bits;
         * __ref_is_percpu() only uses it as a pointer when neither is set.
         */
        unsigned long                percpu_count_ptr;

        /*
         * 'percpu_ref' is often embedded into user structure, and only
         * 'percpu_count_ptr' is required in fast path, move other fields
         * into 'percpu_ref_data', so we can reduce memory footprint in
         * fast path.
         */
        struct percpu_ref_data  *data;
};

/*
 * Out-of-line part of the API: setup/teardown, mode switching, kill and
 * revival.  The inline helpers in this header only touch percpu_count_ptr,
 * data->count and data->release.
 */
int __must_check percpu_ref_init(struct percpu_ref *ref,
                                 percpu_ref_func_t *release, unsigned int flags,
                                 gfp_t gfp);
void percpu_ref_exit(struct percpu_ref *ref);
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_switch);
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill);
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
bool percpu_ref_is_zero(struct percpu_ref *ref);

/**
 * percpu_ref_kill - drop the initial ref
 * @ref: percpu_ref to kill
 *
 * Must be used to drop the initial ref on a percpu refcount; must be called
 * precisely once before shutdown.
 *
 * Switches @ref into atomic mode before gathering up the percpu counters
 * and dropping the initial ref.
 *
 * There are no implied RCU grace periods between kill and release.
 *
 * Equivalent to percpu_ref_kill_and_confirm() with a NULL confirm_kill
 * callback.
 */
static inline void percpu_ref_kill(struct percpu_ref *ref)
{
        percpu_ref_kill_and_confirm(ref, NULL);
}

/*
 * Internal helper.  Don't use outside percpu-refcount proper.  The
 * function doesn't return the pointer and lets the caller test it for NULL
 * because doing so forces the compiler to generate two conditional
 * branches as it can't assume that @ref->percpu_count is not NULL.
 *
 * @ref:           percpu_ref to inspect
 * @percpu_countp: out parameter; written only when true is returned
 *
 * Returns true and stores the percpu counter pointer in *@percpu_countp when
 * neither __PERCPU_REF_ATOMIC nor __PERCPU_REF_DEAD is set; returns false
 * otherwise and leaves *@percpu_countp untouched.
 */
static inline bool __ref_is_percpu(struct percpu_ref *ref,
                                          unsigned long __percpu **percpu_countp)
{
        unsigned long percpu_ptr;

        /*
         * The value of @ref->percpu_count_ptr is tested for
         * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then
         * used as a pointer.  If the compiler generates a separate fetch
         * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in
         * between contaminating the pointer value, meaning that
         * READ_ONCE() is required when fetching it.
         *
         * The dependency ordering from the READ_ONCE() pairs
         * with smp_store_release() in __percpu_ref_switch_to_percpu().
         */
        percpu_ptr = READ_ONCE(ref->percpu_count_ptr);

        /*
         * Theoretically, the following could test just ATOMIC; however,
         * then we'd have to mask off DEAD separately as DEAD may be
         * visible without ATOMIC if we race with percpu_ref_kill().  DEAD
         * implies ATOMIC anyway.  Test them together.
         */
        if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD))
                return false;

        /* No flag bits set: the snapshot is a clean percpu pointer. */
        *percpu_countp = (unsigned long __percpu *)percpu_ptr;
        return true;
}

/**
 * percpu_ref_get_many - increment a percpu refcount
 * @ref: percpu_ref to get
 * @nr: number of references to get
 *
 * Analogous to atomic_long_add().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_add(*percpu_count, nr);
        else
                atomic_long_add(nr, &ref->data->count);

        rcu_read_unlock();
}

/**
 * percpu_ref_get - increment a percpu refcount
 * @ref: percpu_ref to get
 *
 * Analogous to atomic_long_inc().  Shorthand for percpu_ref_get_many()
 * with a count of 1.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get(struct percpu_ref *ref)
{
        percpu_ref_get_many(ref, 1);
}

/**
 * percpu_ref_tryget_many - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 * @nr: number of references to get
 *
 * Increment a percpu refcount  by @nr unless its count already reached zero.
 * Returns %true on success; %false on failure.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
                                          unsigned long nr)
{
        unsigned long __percpu *percpu_count;
        bool ret;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count)) {
                this_cpu_add(*percpu_count, nr);
                ret = true;
        } else {
                ret = atomic_long_add_unless(&ref->data->count, nr, 0);
        }

        rcu_read_unlock();

        return ret;
}

/**
 * percpu_ref_tryget - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless its count already reached zero.
 * Returns %true on success; %false on failure.
 *
 * Shorthand for percpu_ref_tryget_many() with a count of 1.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
        return percpu_ref_tryget_many(ref, 1);
}

/**
 * percpu_ref_tryget_live - try to increment a live percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless it has already been killed.  Returns
 * %true on success; %false on failure.
 *
 * Completion of percpu_ref_kill() in itself doesn't guarantee that this
 * function will fail.  For such guarantee, percpu_ref_kill_and_confirm()
 * should be used.  After the confirm_kill callback is invoked, it's
 * guaranteed that no new reference will be given out by
 * percpu_ref_tryget_live().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        bool ret = false;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count)) {
                /* percpu mode: neither ATOMIC nor DEAD is set, always succeeds */
                this_cpu_inc(*percpu_count);
                ret = true;
        } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
                /* atomic but not dead: succeed only while the count is non-zero */
                ret = atomic_long_inc_not_zero(&ref->data->count);
        }
        /* DEAD set: fall through with ret == false */

        rcu_read_unlock();

        return ret;
}

/**
 * percpu_ref_put_many - decrement a percpu refcount
 * @ref: percpu_ref to put
 * @nr: number of references to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_sub(*percpu_count, nr);
        else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count)))
                ref->data->release(ref);

        rcu_read_unlock();
}

/**
 * percpu_ref_put - decrement a percpu refcount
 * @ref: percpu_ref to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
 *
 * Shorthand for percpu_ref_put_many() with a count of 1.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put(struct percpu_ref *ref)
{
        percpu_ref_put_many(ref, 1);
}

/**
 * percpu_ref_is_dying - test whether a percpu refcount is dying or dead
 * @ref: percpu_ref to test
 *
 * Returns %true if the __PERCPU_REF_DEAD flag is set on @ref, i.e. it is
 * dying or dead.
 *
 * This function is safe to call as long as @ref is between init and exit
 * and the caller is responsible for synchronizing against state changes.
 */
static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
{
        unsigned long flags = ref->percpu_count_ptr;

        return (flags & __PERCPU_REF_DEAD) != 0;
}

#endif






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_VSYSCALL_H
#define _ASM_X86_VSYSCALL_H

#include <linux/seqlock.h>
#include <uapi/asm/vsyscall.h>
#include <asm/page_types.h>

/*
 * vsyscall emulation hooks.  With CONFIG_X86_VSYSCALL_EMULATION they are
 * real external functions; without it map_vsyscall() is a no-op and
 * emulate_vsyscall() always reports "not handled".
 *
 * NOTE(review): set_vsyscall_pgtable_user_bits() has no stub in the #else
 * branch - presumably its callers are guarded by the same config option;
 * confirm.
 */
#ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void);
extern void set_vsyscall_pgtable_user_bits(pgd_t *root);

/*
 * Called on instruction fetch fault in vsyscall page.
 * Returns true if handled.
 */
extern bool emulate_vsyscall(unsigned long error_code,
                             struct pt_regs *regs, unsigned long address);
#else
static inline void map_vsyscall(void) {}
/* Emulation disabled: never handles the fault. */
static inline bool emulate_vsyscall(unsigned long error_code,
                                    struct pt_regs *regs, unsigned long address)
{
        return false;
}
#endif

/*
 * The (legacy) vsyscall page is the lone page in the kernel portion
 * of the address space that has user-accessible permissions.
 *
 * Returns true when @vaddr, rounded down to its page, is VSYSCALL_ADDR.
 */
static inline bool is_vsyscall_vaddr(unsigned long vaddr)
{
        unsigned long page_start = vaddr & PAGE_MASK;

        return unlikely(page_start == VSYSCALL_ADDR);
}

#endif /* _ASM_X86_VSYSCALL_H */



































































    1 
    1 


























































































































































































































































































    1 


    1 



    1 





    1 









    1 

    1 









    1 








    1 



























































































    1 



    1 

    1 


    1 
    1 

    1 

    1 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
// SPDX-License-Identifier: GPL-2.0-only
/*
 *        linux/kernel/resource.c
 *
 * Copyright (C) 1999        Linus Torvalds
 * Copyright (C) 1999        Martin Mares <mj@ucw.cz>
 *
 * Arbitrary resource management.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/export.h>
#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/resource_ext.h>
#include <asm/io.h>


/*
 * Top-level I/O port resource, covering 0..IO_SPACE_LIMIT.  It is the tree
 * handed to the "ioports" proc entry in ioresources_init().
 */
struct resource ioport_resource = {
        .name        = "PCI IO",
        .start        = 0,
        .end        = IO_SPACE_LIMIT,
        .flags        = IORESOURCE_IO,
};
EXPORT_SYMBOL(ioport_resource);

/*
 * Top-level memory resource; it is the tree handed to the "iomem" proc
 * entry in ioresources_init().
 */
struct resource iomem_resource = {
        .name        = "PCI mem",
        .start        = 0,
        /* NOTE(review): presumably wraps to the largest resource_size_t value - confirm the type is unsigned */
        .end        = -1,
        .flags        = IORESOURCE_MEM,
};
EXPORT_SYMBOL(iomem_resource);

/* constraints to be met while allocating resources */
struct resource_constraint {
        /* range limits and required alignment */
        resource_size_t min, max, align;
        /* optional alignment callback returning a resource_size_t */
        resource_size_t (*alignf)(void *, const struct resource *,
                        resource_size_t, resource_size_t);
        /* NOTE(review): presumably passed as the void * argument of @alignf - confirm */
        void *alignf_data;
};

/* Held for reading by r_start()/r_stop() while a proc listing walks the tree. */
static DEFINE_RWLOCK(resource_lock);

/*
 * For memory hotplug, there is no way to free resource entries allocated
 * by boot mem after the system is up. So for reusing the resource entry
 * we need to remember the resource.
 *
 * The entries are chained through ->sibling (see free_resource()) and the
 * list is protected by bootmem_resource_lock.
 */
static struct resource *bootmem_resource_free;
static DEFINE_SPINLOCK(bootmem_resource_lock);

/*
 * Return the resource that follows @p.
 *
 * With @sibling_only set this is simply the next sibling.  Otherwise the
 * walk is depth-first: the first child if there is one, else the next
 * sibling of @p or of the closest ancestor that has one.  Returns NULL when
 * there is nothing left.
 */
static struct resource *next_resource(struct resource *p, bool sibling_only)
{
        struct resource *node = p;

        if (!sibling_only) {
                /* Descend first ... */
                if (node->child)
                        return node->child;

                /* ... otherwise climb until a level with a sibling is found. */
                for (; !node->sibling && node->parent; node = node->parent)
                        ;
        }

        return node->sibling;
}

/*
 * Iterator step: bump *@pos and return the resource following @v in a full
 * (children included) walk, or NULL at the end.
 */
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct resource *cur = v;

        ++*pos;
        return next_resource(cur, false);
}

#ifdef CONFIG_PROC_FS

/* Upper bound on the nesting depth r_show() counts when indenting an entry. */
enum { MAX_IORES_LEVEL = 5 };

/*
 * seq_file start callback: take resource_lock for reading and skip forward
 * to the *@pos'th entry below the root stored in the proc entry.  Returns
 * that entry, or NULL when the tree has fewer entries.  The lock is
 * dropped again in r_stop().
 */
static void *r_start(struct seq_file *m, loff_t *pos)
        __acquires(resource_lock)
{
        struct resource *root = PDE_DATA(file_inode(m->file));
        struct resource *cur;
        loff_t idx = 0;

        read_lock(&resource_lock);

        cur = root->child;
        while (cur && idx < *pos)
                cur = r_next(m, cur, &idx);

        return cur;
}

/* seq_file stop callback: drop the read lock taken in r_start(). */
static void r_stop(struct seq_file *m, void *v)
        __releases(resource_lock)
{
        read_unlock(&resource_lock);
}

/*
 * seq_file ->show callback: print one resource as
 * "<indent><start>-<end> : <name>".
 *
 * The indent is two spaces per level below the root, with the level count
 * capped at MAX_IORES_LEVEL.  Addresses are printed with 4 hex digits when
 * the root ends below 0x10000 and with 8 otherwise.  Openers without
 * CAP_SYS_ADMIN in the initial user namespace see zeros instead of the
 * real addresses.
 */
static int r_show(struct seq_file *m, void *v)
{
        struct resource *root = PDE_DATA(file_inode(m->file));
        struct resource *r = v;
        struct resource *anc = r;
        unsigned long long start = 0, end = 0;
        const char *label = r->name ? r->name : "<BAD>";
        int width;
        int depth = 0;

        if (root->end < 0x10000)
                width = 4;
        else
                width = 8;

        /* Count the levels between r and the root, giving up at the cap */
        while (depth < MAX_IORES_LEVEL && anc->parent != root) {
                anc = anc->parent;
                depth++;
        }

        if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
                start = r->start;
                end = r->end;
        }

        seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
                        depth * 2, "",
                        width, start,
                        width, end,
                        label);
        return 0;
}

/*
 * seq_file iterator shared by the "ioports" and "iomem" proc entries;
 * resource_lock is read-held from ->start until ->stop.
 */
static const struct seq_operations resource_op = {
        .start        = r_start,
        .next        = r_next,
        .stop        = r_stop,
        .show        = r_show,
};

/*
 * Create the "ioports" and "iomem" proc entries, both driven by resource_op,
 * with ioport_resource and iomem_resource as their respective private data.
 * The results of proc_create_seq_data() are not checked; always returns 0.
 */
static int __init ioresources_init(void)
{
        proc_create_seq_data("ioports", 0, NULL, &resource_op,
                        &ioport_resource);
        proc_create_seq_data("iomem", 0, NULL, &resource_op, &iomem_resource);
        return 0;
}
__initcall(ioresources_init);

#endif /* CONFIG_PROC_FS */

/*
 * Dispose of a resource entry.  NULL is ignored.
 *
 * Entries living on a slab page are handed to kfree().  Anything else
 * cannot be freed, so it is pushed onto the bootmem_resource_free list
 * (chained through ->sibling) for alloc_resource() to reuse.
 */
static void free_resource(struct resource *res)
{
        if (!res)
                return;

        if (PageSlab(virt_to_head_page(res))) {
                kfree(res);
                return;
        }

        spin_lock(&bootmem_resource_lock);
        res->sibling = bootmem_resource_free;
        bootmem_resource_free = res;
        spin_unlock(&bootmem_resource_lock);
}

/*
 * Obtain a zeroed resource entry.
 *
 * An entry parked on the bootmem_resource_free list is reused if one is
 * available; otherwise a fresh one comes from kzalloc() using @flags.
 * Returns NULL only when the list is empty and kzalloc() fails.
 */
static struct resource *alloc_resource(gfp_t flags)
{
        struct resource *res;

        /* Pop the head of the recycled-entry list, if any */
        spin_lock(&bootmem_resource_lock);
        res = bootmem_resource_free;
        if (res)
                bootmem_resource_free = res->sibling;
        spin_unlock(&bootmem_resource_lock);

        if (!res)
                return kzalloc(sizeof(*res), flags);

        memset(res, 0, sizeof(*res));
        return res;
}

/*
 * Link @new into @root's child list, which is kept sorted by address.
 *
 * Returns NULL on success.  On failure returns the conflicting entry: the
 * first child overlapping [new->start, new->end], or @root itself when the
 * range is inverted or not fully inside @root.
 */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
        resource_size_t start = new->start;
        resource_size_t end = new->end;
        struct resource **link;
        struct resource *cur;

        if (end < start || start < root->start || end > root->end)
                return root;

        for (link = &root->child; (cur = *link) != NULL; link = &cur->sibling) {
                /* First child entirely above the range: insert in front of it */
                if (cur->start > end)
                        break;
                /* Not entirely below the range either, so it overlaps */
                if (cur->end >= start)
                        return cur;
        }

        new->sibling = cur;
        *link = new;
        new->parent = root;
        return NULL;
}

/*
 * Unlink @old from its parent's child list.
 *
 * With @release_child set, or when @old has no children, @old is simply
 * removed from the list; any children stay attached to @old.  Otherwise
 * the children are re-parented to @old's parent and spliced into the list
 * at @old's former position.
 *
 * Returns 0 on success, -EINVAL if @old is not among its parent's children.
 */
static int __release_resource(struct resource *old, bool release_child)
{
        struct resource **link = &old->parent->child;
        struct resource *last;

        /* Find the pointer slot that currently refers to old */
        while (*link && *link != old)
                link = &(*link)->sibling;

        if (!*link)
                return -EINVAL;

        if (release_child || !old->child) {
                *link = old->sibling;
        } else {
                /* Re-parent every child, ending on the last one */
                last = old->child;
                last->parent = old->parent;
                while (last->sibling) {
                        last = last->sibling;
                        last->parent = old->parent;
                }

                *link = old->child;
                last->sibling = old->sibling;
        }

        old->parent = NULL;
        return 0;
}

/*
 * Recursively detach every descendant of @r.
 *
 * Each detached resource loses its parent, sibling and child links and is
 * rebased to start at 0 while keeping its size and flags.
 */
static void __release_child_resources(struct resource *r)
{
        struct resource *child = r->child;
        struct resource *next;

        r->child = NULL;

        for (; child; child = next) {
                resource_size_t size;

                /* Remember the successor before the link is cleared */
                next = child->sibling;

                child->parent = NULL;
                child->sibling = NULL;
                __release_child_resources(child);

                printk(KERN_DEBUG "release child resource %pR\n", child);

                /* need to restore size, and keep flags */
                size = resource_size(child);
                child->start = 0;
                child->end = size - 1;
        }
}

/*
 * Detach all descendants of @r from the tree.  Locked wrapper around
 * __release_child_resources(): takes resource_lock for writing.
 */
void release_child_resources(struct resource *r)
{
        write_lock(&resource_lock);
        __release_child_resources(r);
        write_unlock(&resource_lock);
}

/**
 * request_resource_conflict - request and reserve an I/O or memory resource
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 *
 * Locked wrapper around __request_resource(): takes resource_lock for
 * writing.
 *
 * Returns NULL for success, conflict resource on error.
 */
struct resource *request_resource_conflict(struct resource *root, struct resource *new)
{
        struct resource *conflict;

        write_lock(&resource_lock);
        conflict = __request_resource(root, new);
        write_unlock(&resource_lock);
        return conflict;
}

/**
 * request_resource - request and reserve an I/O or memory resource
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 *
 * Same as request_resource_conflict(), but reports any conflict as an
 * error code instead of returning the conflicting resource.
 *
 * Returns 0 for success, -EBUSY if the resource could not be reserved.
 */
int request_resource(struct resource *root, struct resource *new)
{
        if (request_resource_conflict(root, new))
                return -EBUSY;

        return 0;
}

EXPORT_SYMBOL(request_resource);

/**
 * release_resource - release a previously reserved resource
 * @old: resource pointer
 *
 * Unlinks @old from its parent under resource_lock (write).  Children of
 * @old, if any, stay attached to @old.
 *
 * Returns 0 on success, -EINVAL if @old is not found among its parent's
 * children.
 */
int release_resource(struct resource *old)
{
        int retval;

        write_lock(&resource_lock);
        retval = __release_resource(old, true);
        write_unlock(&resource_lock);
        return retval;
}

EXPORT_SYMBOL(release_resource);

/**
 * find_next_iomem_res - find the lowest iomem resource overlapping a range
 * @start:        start address of the range searched
 * @end:        end address of the range searched
 * @flags:        flags which the resource must have (all of them)
 * @desc:        descriptor the resource must have, or IORES_DESC_NONE for any
 * @first_lvl:        walk only the first level children, if set
 * @res:        return ptr, filled in if a resource is found
 *
 * Searches below iomem_resource under resource_lock (read).  Unless
 * @first_lvl is true, the search also descends into children once a
 * resource overlapping the range has been reached.
 *
 * On a match, @*res is overwritten with the part of the resource that lies
 * within [@start..@end], together with its flags, desc and parent.
 *
 * Returns 0 if a resource is found, -ENODEV if none is found, and -EINVAL
 * if @res is NULL or @start >= @end.
 */
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
                               unsigned long flags, unsigned long desc,
                               bool first_lvl, struct resource *res)
{
        bool siblings_only = true;
        struct resource *p;
        int ret = -ENODEV;

        if (!res || start >= end)
                return -EINVAL;

        read_lock(&resource_lock);

        p = iomem_resource.child;
        while (p) {
                /* Already past the searched range: nothing will match */
                if (p->start > end)
                        break;

                if (p->end >= start) {
                        /*
                         * This resource overlaps the range.  From now on
                         * children are visited too, unless the caller asked
                         * for the first level only.
                         */
                        siblings_only = first_lvl;

                        if ((p->flags & flags) == flags &&
                            (desc == IORES_DESC_NONE || desc == p->desc)) {
                                /* Match: report the overlapping part */
                                *res = (struct resource) {
                                        .start = max(start, p->start),
                                        .end = min(end, p->end),
                                        .flags = p->flags,
                                        .desc = p->desc,
                                        .parent = p->parent,
                                };
                                ret = 0;
                                break;
                        }
                }

                p = next_resource(p, siblings_only);
        }

        read_unlock(&resource_lock);
        return ret;
}

/*
 * Call @func on each piece of [@start..@end] that belongs to an iomem
 * resource matching @flags and @desc, in ascending address order.
 *
 * @first_lvl restricts the search to first-level children of
 * iomem_resource.  The walk stops at the first non-zero return from @func.
 *
 * Returns the last value returned by @func, or -EINVAL if @func was never
 * called (empty range or no matching resource).
 */
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
                                 unsigned long flags, unsigned long desc,
                                 bool first_lvl, void *arg,
                                 int (*func)(struct resource *, void *))
{
        struct resource res;
        int ret = -EINVAL;

        while (start < end &&
               !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
                ret = (*func)(&res, arg);
                if (ret)
                        break;

                /*
                 * res.end is clipped to end, so reaching it means the whole
                 * range has been covered.  Stop here instead of computing
                 * res.end + 1: when end is the highest representable address
                 * that sum wraps to 0 and the walk would start over forever.
                 */
                if (res.end >= end)
                        break;

                start = res.end + 1;
        }

        return ret;
}

/**
 * walk_iomem_res_desc - call a function on matching iomem resource ranges
 *
 * Walks through iomem resources and calls func() with matching resource
 * ranges. This walks through whole tree and not just first level children.
 * All the memory ranges which overlap start,end and also match flags and
 * desc are valid candidates.
 *
 * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
 * @flags: I/O resource flags
 * @start: start addr
 * @end: end addr
 * @arg: function argument for the callback @func
 * @func: callback function that is called for each qualifying resource area
 *
 * Returns whatever __walk_iomem_res_desc() returns for these arguments.
 *
 * NOTE: For a new descriptor search, define a new IORES_DESC in
 * <linux/ioport.h> and set it in 'desc' of a target resource entry.
 */
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
                u64 end, void *arg, int (*func)(struct resource *, void *))
{
        return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
}
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);

/*
 * This function calls the @func callback against all memory ranges of type
 * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY.
 * Now, this function is only for System RAM, it deals with full ranges and
 * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate
 * ranges.
 *
 * The descriptor is not checked (IORES_DESC_NONE) and the whole tree is
 * searched, not only first-level children.
 */
int walk_system_ram_res(u64 start, u64 end, void *arg,
                        int (*func)(struct resource *, void *))
{
        unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

        return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, false,
                                     arg, func);
}

/*
 * This function calls the @func callback against all memory ranges, which
 * are ranges marked as IORESOURCE_MEM and IORESOURCE_BUSY.
 *
 * The descriptor is not checked (IORES_DESC_NONE) and the whole tree is
 * searched, not only first-level children.
 */
int walk_mem_res(u64 start, u64 end, void *arg,
                 int (*func)(struct resource *, void *))
{
        unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;

        return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, false,
                                     arg, func);
}

/*
 * This function calls the @func callback against all memory ranges of type
 * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
 * It is to be used only for System RAM.
 *
 * This will find System RAM ranges that are children of top-level resources
 * in addition to top-level System RAM resources.
 */
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                          void *arg, int (*func)(unsigned long, unsigned long, void *))
{
        resource_size_t start, end;
        unsigned long flags;
        struct resource res;
        unsigned long pfn, end_pfn;
        int ret = -EINVAL;

        start = (u64) start_pfn << PAGE_SHIFT;
        end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
        flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        while (start < end &&
               !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
                                    false, &res)) {
                pfn = PFN_UP(res.start);
                end_pfn = PFN_DOWN(res.end + 1);
                if (end_pfn > pfn)
                        ret = (*func)(pfn, end_pfn - pfn, arg);
                if (ret)
                        break;
                start = res.end + 1;
        }
        return ret;
}

/*
 * walk_system_ram_range() callback used by page_is_ram(): returning 1 stops
 * the walk at the first System RAM range and makes 1 the walk's result.
 */
static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
        return 1;
}

/*
 * This generic page_is_ram() returns true if specified address is
 * registered as System RAM in iomem_resource list.
 *
 * It walks the single page @pfn with __is_ram() as callback; the result is
 * 1 only if the callback ran, and 0 for any other outcome of the walk.
 */
int __weak page_is_ram(unsigned long pfn)
{
        return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
}
EXPORT_SYMBOL_GPL(page_is_ram);

/**
 * region_intersects() - determine intersection of region with known resources
 * @start: region start address
 * @size: size of region
 * @flags: flags of resource (in iomem_resource)
 * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE
 *
 * Check if the specified region partially overlaps or fully eclipses a
 * resource identified by @flags and @desc (optional with IORES_DESC_NONE).
 * Only the direct children of iomem_resource are examined.
 *
 * Return REGION_DISJOINT if the region does not overlap @flags/@desc,
 * return REGION_MIXED if the region overlaps @flags/@desc and another
 * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc
 * and no other defined resource. Note that REGION_INTERSECTS is also
 * returned in the case when the specified region overlaps RAM and undefined
 * memory holes.
 *
 * region_intersects() is used by memory remapping functions to ensure
 * the user is not remapping RAM and is a vast speed up over walking
 * through the resource table page by page.
 */
int region_intersects(resource_size_t start, size_t size, unsigned long flags,
                      unsigned long desc)
{
        struct resource res = {
                .start = start,
                .end = start + size - 1,
        };
        struct resource *p;
        int matching = 0;
        int foreign = 0;

        read_lock(&resource_lock);
        for (p = iomem_resource.child; p; p = p->sibling) {
                if (!resource_overlaps(p, &res))
                        continue;

                if ((p->flags & flags) == flags &&
                    (desc == IORES_DESC_NONE || desc == p->desc))
                        matching++;
                else
                        foreign++;
        }
        read_unlock(&resource_lock);

        /* Without any overlapping resource of the wanted kind: disjoint */
        if (!matching)
                return REGION_DISJOINT;

        return foreign ? REGION_MIXED : REGION_INTERSECTS;
}
EXPORT_SYMBOL_GPL(region_intersects);

/*
 * Weak default that leaves @avail untouched.  __find_resource() calls this
 * on each candidate range after clipping it to the constraint's min/max.
 * NOTE(review): presumably architectures override it to carve reserved
 * areas out of @avail - confirm against the arch implementations.
 */
void __weak arch_remove_reservations(struct resource *avail)
{
}

/*
 * Default alignf callback, installed by allocate_resource() when the caller
 * passes none: place the allocation at the very start of the available
 * range.  @data, @size and @align are ignored.
 */
static resource_size_t simple_align_resource(void *data,
                                             const struct resource *avail,
                                             resource_size_t size,
                                             resource_size_t align)
{
        return avail->start;
}

/*
 * Narrow @res in place so that it does not extend outside [@min, @max]:
 * the start is raised to @min and the end lowered to @max where needed.
 */
static void resource_clip(struct resource *res, resource_size_t min,
                          resource_size_t max)
{
        if (min > res->start)
                res->start = min;

        if (max < res->end)
                res->end = max;
}

/*
 * Find empty slot in the resource tree with the given range and
 * alignment constraints.
 *
 * Walks the gaps between @root's direct children in address order.  Each gap
 * is clipped to the constraint's min/max, passed through
 * arch_remove_reservations(), aligned, and then offered to the constraint's
 * alignf callback to pick the start address.  The first gap able to hold
 * @size bytes wins.
 *
 * If @old is non-NULL, the range it currently occupies is treated as free,
 * so that it can be resized or moved.
 *
 * Returns 0 with new->start and new->end filled in, or -EBUSY if no gap
 * fits.  The tree itself is not modified.
 */
static int __find_resource(struct resource *root, struct resource *old,
                         struct resource *new,
                         resource_size_t  size,
                         struct resource_constraint *constraint)
{
        struct resource *this = root->child;
        /* tmp: current gap; avail: gap after clipping/alignment; alloc: proposed placement */
        struct resource tmp = *new, avail, alloc;

        tmp.start = root->start;
        /*
         * Skip past an allocated resource that starts at 0, since the assignment
         * of this->start - 1 to tmp->end below would cause an underflow.
         */
        if (this && this->start == root->start) {
                tmp.start = (this == old) ? old->start : this->end + 1;
                this = this->sibling;
        }
        for(;;) {
                /* The gap ends just before the next child, or at the end of root */
                if (this)
                        tmp.end = (this == old) ?  this->end : this->start - 1;
                else
                        tmp.end = root->end;

                /* Empty gap: move on to the next one */
                if (tmp.end < tmp.start)
                        goto next;

                resource_clip(&tmp, constraint->min, constraint->max);
                arch_remove_reservations(&tmp);

                /* Check for overflow after ALIGN() */
                avail.start = ALIGN(tmp.start, constraint->align);
                avail.end = tmp.end;
                avail.flags = new->flags & ~IORESOURCE_UNSET;
                if (avail.start >= tmp.start) {
                        alloc.flags = avail.flags;
                        alloc.start = constraint->alignf(constraint->alignf_data, &avail,
                                        size, constraint->align);
                        alloc.end = alloc.start + size - 1;
                        /* Reject a wrapped-around end or a placement outside the gap */
                        if (alloc.start <= alloc.end &&
                            resource_contains(&avail, &alloc)) {
                                new->start = alloc.start;
                                new->end = alloc.end;
                                return 0;
                        }
                }

                /* Stop after the tail gap, or when the child reaches the end of root */
next:                if (!this || this->end == root->end)
                        break;

                /* The next gap starts after this child, unless this child is old itself */
                if (this != old)
                        tmp.start = this->end + 1;
                this = this->sibling;
        }
        return -EBUSY;
}

/*
 * Find empty slot in the resource tree given range and alignment.
 *
 * Same as __find_resource() with no existing resource to exempt (old is
 * NULL).  Returns 0 with new->start/new->end set, or -EBUSY.
 */
static int find_resource(struct resource *root, struct resource *new,
                        resource_size_t size,
                        struct resource_constraint  *constraint)
{
        return  __find_resource(root, NULL, new, size, constraint);
}

/**
 * reallocate_resource - allocate a slot in the resource tree given range & alignment.
 *        The resource will be relocated if the new size cannot be reallocated in the
 *        current location.
 *
 * @root: root resource descriptor
 * @old:  resource descriptor desired by caller
 * @newsize: new size of the resource descriptor
 * @constraint: the size and alignment constraints to be met.
 *
 * Runs under resource_lock (write).  If the slot found covers @old, or lies
 * inside it, @old is resized in place.  Otherwise @old is unlinked and
 * inserted again at the new location.  A resource that has children is
 * only ever grown in place.
 *
 * Returns 0 on success, the error from __find_resource() if no slot is
 * found, or -EBUSY if @old has children and the slot does not cover it.
 */
static int reallocate_resource(struct resource *root, struct resource *old,
                               resource_size_t newsize,
                               struct resource_constraint *constraint)
{
        struct resource new = *old;
        struct resource *conflict;
        int err;

        write_lock(&resource_lock);

        err = __find_resource(root, old, &new, newsize, constraint);
        if (err)
                goto unlock;

        if (!resource_contains(&new, old)) {
                /* Shrinking or moving is refused while children exist */
                if (old->child) {
                        err = -EBUSY;
                        goto unlock;
                }

                if (!resource_contains(old, &new)) {
                        /* The new slot is elsewhere: unlink and insert again */
                        __release_resource(old, true);
                        *old = new;
                        conflict = __request_resource(root, old);
                        BUG_ON(conflict);
                        goto unlock;
                }
        }

        /* Grow or shrink in place */
        old->start = new.start;
        old->end = new.end;

unlock:
        write_unlock(&resource_lock);
        return err;
}


/**
 * allocate_resource - allocate empty slot in the resource tree given range & alignment.
 *         The resource will be reallocated with a new size if it was already allocated
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 * @size: requested resource region size
 * @min: minimum boundary to allocate
 * @max: maximum boundary to allocate
 * @align: alignment requested, in bytes
 * @alignf: alignment function, optional, called if not NULL
 * @alignf_data: arbitrary data to pass to the @alignf function
 *
 * When @alignf is NULL the allocation is placed at the start of the first
 * suitable gap.  A @new that already has a parent is handed to
 * reallocate_resource() instead.
 *
 * Returns 0 on success, -EBUSY (or the error from the slot search) when no
 * suitable slot exists.
 */
int allocate_resource(struct resource *root, struct resource *new,
                      resource_size_t size, resource_size_t min,
                      resource_size_t max, resource_size_t align,
                      resource_size_t (*alignf)(void *,
                                                const struct resource *,
                                                resource_size_t,
                                                resource_size_t),
                      void *alignf_data)
{
        struct resource_constraint constraint = {
                .min = min,
                .max = max,
                .align = align,
                .alignf = alignf ? alignf : simple_align_resource,
                .alignf_data = alignf_data,
        };
        int err;

        /* Already in a tree: try reallocating with the new constraints */
        if (new->parent)
                return reallocate_resource(root, new, size, &constraint);

        write_lock(&resource_lock);
        err = find_resource(root, new, size, &constraint);
        if (err >= 0) {
                if (__request_resource(root, new))
                        err = -EBUSY;
        }
        write_unlock(&resource_lock);

        return err;
}

EXPORT_SYMBOL(allocate_resource);

/**
 * lookup_resource - find an existing resource by a resource start address
 * @root: root resource descriptor
 * @start: resource start address
 *
 * Only the direct children of @root are searched, under resource_lock
 * (read).
 *
 * Returns a pointer to the resource if found, NULL otherwise
 */
struct resource *lookup_resource(struct resource *root, resource_size_t start)
{
        struct resource *res;

        read_lock(&resource_lock);
        res = root->child;
        while (res && res->start != start)
                res = res->sibling;
        read_unlock(&resource_lock);

        return res;
}

/*
 * Insert a resource into the resource tree. If successful, return NULL,
 * otherwise return the conflicting resource (compare to __request_resource())
 *
 * Unlike __request_resource(), a conflict is not necessarily fatal:
 *  - if the conflicting resource strictly covers @new, the insertion is
 *    retried one level down, with the conflicting resource as parent;
 *  - if the conflicting resources all lie within @new, @new takes their
 *    place in the parent's list and they become its children.
 * A partial overlap cannot be resolved and is returned as the conflict.
 */
static struct resource * __insert_resource(struct resource *parent, struct resource *new)
{
        struct resource *first, *next;

        /* Descend as long as the conflicting resource strictly covers new */
        for (;; parent = first) {
                first = __request_resource(parent, new);
                if (!first)
                        return first;

                /* new is not inside parent (or its range is inverted) */
                if (first == parent)
                        return first;
                if (WARN_ON(first == new))        /* duplicated insertion */
                        return first;

                /* The conflict does not cover new: new may have to cover it instead */
                if ((first->start > new->start) || (first->end < new->end))
                        break;
                /* Identical range: new becomes the parent of the conflict */
                if ((first->start == new->start) && (first->end == new->end))
                        break;
        }

        /* Find the last sibling, starting at first, that new has to swallow */
        for (next = first; ; next = next->sibling) {
                /* Partial overlap? Bad, and unfixable */
                if (next->start < new->start || next->end > new->end)
                        return next;
                if (!next->sibling)
                        break;
                if (next->sibling->start > new->end)
                        break;
        }

        /* new takes over the run first..next as its children */
        new->parent = parent;
        new->sibling = next->sibling;
        new->child = first;

        /* Cut the run off from the remaining siblings and re-parent it */
        next->sibling = NULL;
        for (next = first; next; next = next->sibling)
                next->parent = new;

        /* Put new where first used to be in the parent's child list */
        if (parent->child == first) {
                parent->child = new;
        } else {
                next = parent->child;
                while (next->sibling != first)
                        next = next->sibling;
                next->sibling = new;
        }
        return NULL;
}

/**
 * insert_resource_conflict - Inserts resource in the resource tree
 * @parent: parent of the new resource
 * @new: new resource to insert
 *
 * Returns NULL on success, conflict resource if the resource can't be inserted.
 *
 * This function is equivalent to request_resource_conflict when no conflict
 * happens. If a conflict happens, and the conflicting resources
 * entirely fit within the range of the new resource, then the new
 * resource is inserted and the conflicting resources become children of
 * the new resource.
 *
 * The insertion is done by __insert_resource() with resource_lock held
 * for writing.
 *
 * This function is intended for producers of resources, such as FW modules
 * and bus drivers.
 */
struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
{
        struct resource *conflict;

        write_lock(&resource_lock);
        conflict = __insert_resource(parent, new);
        write_unlock(&resource_lock);
        return conflict;
}

/**
 * insert_resource - Inserts a resource in the resource tree
 * @parent: parent of the new resource
 * @new: new resource to insert
 *
 * Same as insert_resource_conflict(), but reports any conflict as an error
 * code instead of returning the conflicting resource.
 *
 * Returns 0 on success, -EBUSY if the resource can't be inserted.
 *
 * This function is intended for producers of resources, such as FW modules
 * and bus drivers.
 */
int insert_resource(struct resource *parent, struct resource *new)
{
        if (insert_resource_conflict(parent, new))
                return -EBUSY;

        return 0;
}
EXPORT_SYMBOL_GPL(insert_resource);

/**
 * insert_resource_expand_to_fit - Insert a resource into the resource tree
 * @root: root resource descriptor
 * @new: new resource to insert
 *
 * Insert a resource into the resource tree, possibly expanding it in order
 * to make it encompass any conflicting resources.
 *
 * Nothing is done if @new already has a parent.  The insertion is abandoned
 * without any report when @root itself is returned as the conflict, i.e.
 * when @new does not fit inside @root.  Runs with resource_lock held for
 * writing.
 */
void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
{
        if (new->parent)
                return;

        write_lock(&resource_lock);
        for (;;) {
                struct resource *conflict;

                conflict = __insert_resource(root, new);
                if (!conflict)
                        break;
                if (conflict == root)
                        break;

                /* Ok, expand resource to cover the conflict, then try again .. */
                if (conflict->start < new->start)
                        new->start = conflict->start;
                if (conflict->end > new->end)
                        new->end = conflict->end;

                /* Informational only: give the message an explicit log level */
                pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
        }
        write_unlock(&resource_lock);
}

/**
 * remove_resource - Remove a resource in the resource tree
 * @old: resource to remove
 *
 * Returns 0 on success, -EINVAL if the resource is not valid.
 *
 * This function removes a resource previously inserted by insert_resource()
 * or insert_resource_conflict(), and moves the children (if any) up to
 * where they were before.  insert_resource() and insert_resource_conflict()
 * insert a new resource, and move any conflicting resources down to the
 * children of the new resource.
 *
 * Unlike release_resource(), which unlinks @old with its children still
 * attached to it, the children here are re-parented to @old's parent.
 *
 * insert_resource(), insert_resource_conflict() and remove_resource() are
 * intended for producers of resources, such as FW modules and bus drivers.
 */
int remove_resource(struct resource *old)
{
        int retval;

        write_lock(&resource_lock);
        retval = __release_resource(old, false);
        write_unlock(&resource_lock);
        return retval;
}
EXPORT_SYMBOL_GPL(remove_resource);

/*
 * Change @res to cover [@start, @start + @size - 1].
 *
 * The new range must stay inside the parent, must not touch the siblings
 * on either side, and must still contain every child of @res.  A resource
 * without a parent is only checked against its children.
 *
 * Returns 0 on success, -EBUSY (leaving @res untouched) otherwise.
 */
static int __adjust_resource(struct resource *res, resource_size_t start,
                                resource_size_t size)
{
        struct resource *parent = res->parent;
        resource_size_t end = start + size - 1;
        struct resource *it;

        if (parent) {
                if (start < parent->start || end > parent->end)
                        return -EBUSY;

                /* Must not run into the following sibling */
                if (res->sibling && res->sibling->start <= end)
                        return -EBUSY;

                /* Must not run into the preceding sibling, if there is one */
                it = parent->child;
                if (it != res) {
                        while (it->sibling != res)
                                it = it->sibling;
                        if (start <= it->end)
                                return -EBUSY;
                }
        }

        /* Every existing child has to remain inside the new range */
        for (it = res->child; it; it = it->sibling) {
                if (it->start < start || it->end > end)
                        return -EBUSY;
        }

        res->start = start;
        res->end = end;
        return 0;
}

/**
 * adjust_resource - modify a resource's start and size
 * @res: resource to modify
 * @start: new start value
 * @size: new size
 *
 * Given an existing resource, change its start and size to match the
 * arguments.  Returns 0 on success, -EBUSY if it can't fit.
 * Existing children of the resource are assumed to be immutable.
 *
 * Locked wrapper around __adjust_resource(): takes resource_lock for
 * writing.
 */
int adjust_resource(struct resource *res, resource_size_t start,
                    resource_size_t size)
{
        int result;

        write_lock(&resource_lock);
        result = __adjust_resource(res, start, size);
        write_unlock(&resource_lock);
        return result;
}
EXPORT_SYMBOL(adjust_resource);

/*
 * Reserve as much of [@start, @end] under @root as is still free, as one or
 * more busy resources named @name.
 *
 * Each part is requested directly under @root.  When a request conflicts
 * with an existing child, the part is trimmed to what lies before the
 * conflict, and whatever lies after the conflict is queued in next_res to be
 * requested in turn.  Allocation failures silently end the operation.
 *
 * The only caller in this file, reserve_region_with_split(), holds
 * resource_lock for writing around this call.
 */
static void __init
__reserve_region_with_split(struct resource *root, resource_size_t start,
                            resource_size_t end, const char *name)
{
        struct resource *parent = root;
        struct resource *conflict;
        struct resource *res = alloc_resource(GFP_ATOMIC);
        /* Remainder beyond the last conflict, still waiting to be requested */
        struct resource *next_res = NULL;
        int type = resource_type(root);

        if (!res)
                return;

        res->name = name;
        res->start = start;
        res->end = end;
        res->flags = type | IORESOURCE_BUSY;
        res->desc = IORES_DESC_NONE;

        while (1) {

                conflict = __request_resource(parent, res);
                if (!conflict) {
                        /* res is in; carry on with the queued remainder, if any */
                        if (!next_res)
                                break;
                        res = next_res;
                        next_res = NULL;
                        continue;
                }

                /* conflict covered whole area */
                if (conflict->start <= res->start &&
                                conflict->end >= res->end) {
                        free_resource(res);
                        WARN_ON(next_res);
                        break;
                }

                /* failed, split and try again */
                if (conflict->start > res->start) {
                        /* Keep the part below the conflict in res ... */
                        end = res->end;
                        res->end = conflict->start - 1;
                        /* ... and queue the part above it, if there is one */
                        if (conflict->end < end) {
                                next_res = alloc_resource(GFP_ATOMIC);
                                if (!next_res) {
                                        free_resource(res);
                                        break;
                                }
                                next_res->name = name;
                                next_res->start = conflict->end + 1;
                                next_res->end = end;
                                next_res->flags = type | IORESOURCE_BUSY;
                                next_res->desc = IORES_DESC_NONE;
                        }
                } else {
                        /* The conflict covers the head of res: retry with the tail */
                        res->start = conflict->end + 1;
                }
        }

}

/*
 * Reserve [@start, @end] below @root under the name @name.
 *
 * A request that sticks out of @root is reported together with a stack dump.
 * If it still overlaps @root it is clamped to @root's bounds and carried out;
 * if it lies completely outside, nothing is reserved.  The actual work is
 * delegated to __reserve_region_with_split() with resource_lock write-held.
 */
void __init
reserve_region_with_split(struct resource *root, resource_size_t start,
                          resource_size_t end, const char *name)
{
        int usable = 1;

        write_lock(&resource_lock);

        if (start < root->start || end > root->end) {
                pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
                       (unsigned long long)start, (unsigned long long)end,
                       root);

                if (end < root->start || start > root->end) {
                        /* no overlap with root at all: give up */
                        usable = 0;
                } else {
                        /* partial overlap: trim to what root can hold */
                        if (start < root->start)
                                start = root->start;
                        if (end > root->end)
                                end = root->end;
                        pr_err("fixing request to [0x%llx-0x%llx]\n",
                               (unsigned long long)start,
                               (unsigned long long)end);
                }
                dump_stack();
        }

        if (usable)
                __reserve_region_with_split(root, start, end, name);

        write_unlock(&resource_lock);
}

/**
 * resource_alignment - report the alignment a resource asks for
 * @res: resource to inspect
 *
 * Exactly one of the two alignment flags has to be set for a valid answer:
 * IORESOURCE_SIZEALIGN selects the resource's size, IORESOURCE_STARTALIGN
 * selects its start address.
 *
 * Returns the alignment on success, 0 (invalid alignment) when neither or
 * both flags are set.
 */
resource_size_t resource_alignment(struct resource *res)
{
        const unsigned long align_flags =
                res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN);

        if (align_flags == IORESOURCE_SIZEALIGN)
                return resource_size(res);

        if (align_flags == IORESOURCE_STARTALIGN)
                return res->start;

        return 0;
}

/*
 * This is compatibility stuff for IO resources.
 *
 * Note how this, unlike the above, knows about
 * the IO flag meanings (busy etc).
 *
 * request_region creates a new busy region.
 *
 * release_region releases a matching busy region.
 */

/*
 * Tasks that hit a busy IORESOURCE_MUXED region in __request_region() sleep
 * on this queue; __release_region() wakes it when a muxed region is released.
 */
static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);

/**
 * __request_region - create a new busy resource region
 * @parent: parent resource descriptor
 * @start: resource start address
 * @n: resource region size
 * @name: reserving caller's ID string
 * @flags: IO resource flags
 *
 * Allocates a resource covering [@start, @start + @n - 1], marks it
 * IORESOURCE_BUSY and inserts it below @parent.  When the range collides
 * with a non-busy resource the request descends into that resource and is
 * retried there.  When it collides with a busy resource and both sides carry
 * IORESOURCE_MUXED, the caller sleeps uninterruptibly until a muxed region
 * is released and then retries.  Any other collision fails the request.
 *
 * Returns the new resource, or NULL if the allocation failed or the range
 * conflicts with a busy region.
 */
struct resource * __request_region(struct resource *parent,
                                   resource_size_t start, resource_size_t n,
                                   const char *name, int flags)
{
        DECLARE_WAITQUEUE(wait, current);
        struct resource *res = alloc_resource(GFP_KERNEL);
        struct resource *orig_parent = parent;

        if (!res)
                return NULL;

        res->name = name;
        res->start = start;
        res->end = start + n - 1;

        write_lock(&resource_lock);

        for (;;) {
                struct resource *conflict;

                /* Recomputed on every pass: @parent may have moved down. */
                res->flags = resource_type(parent) | resource_ext_type(parent);
                res->flags |= IORESOURCE_BUSY | flags;
                res->desc = parent->desc;

                conflict = __request_resource(parent, res);
                if (!conflict)
                        break;
                /*
                 * mm/hmm.c reserves physical addresses which then
                 * become unavailable to other users.  Conflicts are
                 * not expected.  Warn to aid debugging if encountered.
                 */
                if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
                        pr_warn("Unaddressable device %s %pR conflicts with %pR\n",
                                conflict->name, conflict, res);
                }
                /* A non-busy conflict is a container: retry inside it. */
                if (conflict != parent) {
                        if (!(conflict->flags & IORESOURCE_BUSY)) {
                                parent = conflict;
                                continue;
                        }
                }
                if (conflict->flags & flags & IORESOURCE_MUXED) {
                        add_wait_queue(&muxed_resource_wait, &wait);
                        /*
                         * Change the task state before dropping the lock.
                         * A release can only happen once the lock is free;
                         * if its wake_up() ran while we were still
                         * TASK_RUNNING it would be a no-op and the
                         * following schedule() could sleep forever.
                         */
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        write_unlock(&resource_lock);
                        schedule();
                        remove_wait_queue(&muxed_resource_wait, &wait);
                        write_lock(&resource_lock);
                        continue;
                }
                /* Uhhuh, that didn't work out.. */
                free_resource(res);
                res = NULL;
                break;
        }
        write_unlock(&resource_lock);

        /* Only requests made directly against iomem_resource revoke devmem. */
        if (res && orig_parent == &iomem_resource)
                revoke_devmem(res);

        return res;
}
EXPORT_SYMBOL(__request_region);

/**
 * __release_region - release a previously reserved resource region
 * @parent: parent resource descriptor
 * @start: resource start address
 * @n: resource region size
 *
 * The described resource region must match a currently busy region exactly.
 * Non-busy resources that contain the range are descended into.  If no
 * matching busy region is found a warning is printed and nothing is freed.
 */
void __release_region(struct resource *parent, resource_size_t start,
                      resource_size_t n)
{
        resource_size_t end = start + n - 1;
        struct resource **link = &parent->child;
        struct resource *cur;

        write_lock(&resource_lock);

        while ((cur = *link) != NULL) {
                /* Does not contain the range: try the next sibling. */
                if (cur->start > start || cur->end < end) {
                        link = &cur->sibling;
                        continue;
                }

                /* Container that is not busy itself: look at its children. */
                if (!(cur->flags & IORESOURCE_BUSY)) {
                        link = &cur->child;
                        continue;
                }

                /* Busy, but not an exact match: nothing to release. */
                if (cur->start != start || cur->end != end)
                        break;

                /* Unlink under the lock, wake and free after dropping it. */
                *link = cur->sibling;
                write_unlock(&resource_lock);
                if (cur->flags & IORESOURCE_MUXED)
                        wake_up(&muxed_resource_wait);
                free_resource(cur);
                return;
        }

        write_unlock(&resource_lock);

        printk(KERN_WARNING "Trying to free nonexistent resource "
                "<%016llx-%016llx>\n", (unsigned long long)start,
                (unsigned long long)end);
}
EXPORT_SYMBOL(__release_region);

#ifdef CONFIG_MEMORY_HOTREMOVE
/**
 * release_mem_region_adjustable - release a previously reserved memory region
 * @start: resource start address
 * @size: resource region size
 *
 * This interface is intended for memory hot-delete.  The requested region
 * is released from a currently busy memory resource.  The requested region
 * must either match exactly or fit into a single busy resource entry.  In
 * the latter case, the remaining resource is adjusted accordingly.
 * Existing children of the busy memory resource must be immutable in the
 * request.
 *
 * The search starts at iomem_resource.  A request reaching outside of
 * iomem_resource triggers a one-time warning and is ignored.
 *
 * Note:
 * - Additional release conditions, such as overlapping region, can be
 *   supported after they are confirmed as valid cases.
 * - When a busy memory resource gets split into two entries, the code
 *   assumes that all children remain in the lower address entry for
 *   simplicity.  Enhance this logic when necessary.
 */
void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
{
        struct resource *parent = &iomem_resource;
        struct resource *new_res = NULL;
        bool alloc_nofail = false;
        struct resource **p;
        struct resource *res;
        resource_size_t end;

        end = start + size - 1;
        if (WARN_ON_ONCE((start < parent->start) || (end > parent->end)))
                return;

        /*
         * We free up quite a lot of memory on memory hotunplug (esp., memmap),
         * just before releasing the region. This is highly unlikely to
         * fail - let's play safe and make it never fail as the caller cannot
         * perform any error handling (e.g., trying to re-add memory will fail
         * similarly).
         */
retry:
        /*
         * Allocate the spare entry for a possible split before taking the
         * lock.  The first attempt may fail; __GFP_NOFAIL is only added on
         * the retry pass.
         */
        new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0));

        p = &parent->child;
        write_lock(&resource_lock);

        while ((res = *p)) {
                /* this resource starts at or beyond the end of the request: stop */
                if (res->start >= end)
                        break;

                /* look at the next sibling if the range does not fit into this one */
                if (res->start > start || res->end < end) {
                        p = &res->sibling;
                        continue;
                }

                /* only memory resources are handled here */
                if (!(res->flags & IORESOURCE_MEM))
                        break;

                /* containing resource is not busy: descend into its children */
                if (!(res->flags & IORESOURCE_BUSY)) {
                        p = &res->child;
                        continue;
                }

                /* found the target resource; let's adjust accordingly */
                if (res->start == start && res->end == end) {
                        /* free the whole entry */
                        *p = res->sibling;
                        free_resource(res);
                } else if (res->start == start && res->end != end) {
                        /* adjust the start */
                        WARN_ON_ONCE(__adjust_resource(res, end + 1,
                                                       res->end - end));
                } else if (res->start != start && res->end == end) {
                        /* adjust the end */
                        WARN_ON_ONCE(__adjust_resource(res, res->start,
                                                       start - res->start));
                } else {
                        /* split into two entries - we need a new resource */
                        if (!new_res) {
                                /* the lock is held, so only an atomic allocation here */
                                new_res = alloc_resource(GFP_ATOMIC);
                                if (!new_res) {
                                        /* drop the lock and restart with __GFP_NOFAIL */
                                        alloc_nofail = true;
                                        write_unlock(&resource_lock);
                                        goto retry;
                                }
                        }
                        /* new_res describes the part above the released range */
                        new_res->name = res->name;
                        new_res->start = end + 1;
                        new_res->end = res->end;
                        new_res->flags = res->flags;
                        new_res->desc = res->desc;
                        new_res->parent = res->parent;
                        new_res->sibling = res->sibling;
                        new_res->child = NULL;

                        /* shrink res to the part below the released range */
                        if (WARN_ON_ONCE(__adjust_resource(res, res->start,
                                                           start - res->start)))
                                break;
                        /* link the upper part in; clear new_res so it is not freed below */
                        res->sibling = new_res;
                        new_res = NULL;
                }

                break;
        }

        write_unlock(&resource_lock);
        /*
         * Drop the spare entry if it was not consumed by a split.  new_res is
         * NULL here when it was linked into the tree (or never allocated);
         * NOTE(review): this relies on free_resource() accepting NULL.
         */
        free_resource(new_res);
}
#endif        /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_MEMORY_HOTPLUG
static bool system_ram_resources_mergeable(struct resource *r1,
                                           struct resource *r2)
{
        /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */
        return r1->flags == r2->flags && r1->end + 1 == r2->start &&
               r1->name == r2->name && r1->desc == r2->desc &&
               !r1->child && !r2->child;
}

/*
 * merge_system_ram_resource - mark the System RAM resource mergeable and try to
 * merge it with adjacent, mergeable resources
 * @res: resource descriptor
 *
 * This interface is intended for memory hotplug, whereby lots of contiguous
 * system ram resources are added (e.g., via add_memory*()) by a driver, and
 * the actual resource boundaries are not of interest (e.g., it might be
 * relevant for DIMMs). Only resources that are marked mergeable, that have the
 * same parent, and that don't have any children are considered. All mergeable
 * resources must be immutable during the request.
 *
 * @res must already be a busy System RAM resource; otherwise a one-time
 * warning is emitted and nothing is changed.
 *
 * Note:
 * - The caller has to make sure that no pointers to resources that are
 *   marked mergeable are used anymore after this call - the resource might
 *   be freed and the pointer might be stale!
 * - release_mem_region_adjustable() will split on demand on memory hotunplug
 */
void merge_system_ram_resource(struct resource *res)
{
        const unsigned long required = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        struct resource *next;
        struct resource *prev;

        if (WARN_ON_ONCE((res->flags & required) != required))
                return;

        write_lock(&resource_lock);
        res->flags |= IORESOURCE_SYSRAM_MERGEABLE;

        /* Absorb the following sibling into @res if the two are mergeable. */
        next = res->sibling;
        if (next && system_ram_resources_mergeable(res, next)) {
                res->sibling = next->sibling;
                res->end = next->end;
                free_resource(next);
        }

        /* Locate the sibling directly in front of @res, if there is one. */
        for (prev = res->parent->child; prev; prev = prev->sibling) {
                if (prev->sibling == res)
                        break;
        }

        /* Let the preceding sibling absorb @res; @res is gone afterwards. */
        if (prev && system_ram_resources_mergeable(prev, res)) {
                prev->sibling = res->sibling;
                prev->end = res->end;
                free_resource(res);
        }

        write_unlock(&resource_lock);
}
#endif        /* CONFIG_MEMORY_HOTPLUG */

/*
 * Managed region resource
 */
/*
 * devres release callback for devm_request_resource(): @ptr is the devres
 * payload, which stores a pointer to the resource to release.
 */
static void devm_resource_release(struct device *dev, void *ptr)
{
        struct resource *managed = *(struct resource **)ptr;

        release_resource(managed);
}

/**
 * devm_request_resource() - request and reserve an I/O or memory resource
 * @dev: device for which to request the resource
 * @root: root of the resource tree from which to request the resource
 * @new: descriptor of the resource to request
 *
 * Device-managed counterpart of request_resource().  The resource is released
 * automatically when the device is unbound from its driver, so an explicit
 * release is normally not needed.  Drivers that do have to release early
 * (for ordering reasons, say) must use devm_release_resource() and not the
 * plain release_resource().
 *
 * If the new resource conflicts with an existing one, an error message
 * naming both is printed.
 *
 * Returns 0 on success, -ENOMEM if the devres bookkeeping could not be
 * allocated, or -EBUSY on a resource conflict.
 */
int devm_request_resource(struct device *dev, struct resource *root,
                          struct resource *new)
{
        struct resource **slot;
        struct resource *clash;

        slot = devres_alloc(devm_resource_release, sizeof(*slot), GFP_KERNEL);
        if (!slot)
                return -ENOMEM;

        *slot = new;

        clash = request_resource_conflict(root, new);
        if (!clash) {
                /* Success: hand the bookkeeping entry over to the device. */
                devres_add(dev, slot);
                return 0;
        }

        dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
                new, clash->name, clash);
        devres_free(slot);
        return -EBUSY;
}
EXPORT_SYMBOL(devm_request_resource);

/*
 * devres match callback: true when the resource pointer stored in the devres
 * payload @res is the one passed as @data.
 */
static int devm_resource_match(struct device *dev, void *res, void *data)
{
        struct resource *stored = *(struct resource **)res;

        return stored == data;
}

/**
 * devm_release_resource() - release a previously requested resource
 * @dev: device for which to release the resource
 * @new: descriptor of the resource to release
 *
 * Releases a resource previously requested using devm_request_resource().
 * A warning is raised if devres_release() reports a failure for @new.
 */
void devm_release_resource(struct device *dev, struct resource *new)
{
        int err;

        err = devres_release(dev, devm_resource_release, devm_resource_match,
                             new);
        WARN_ON(err);
}
EXPORT_SYMBOL(devm_release_resource);

/*
 * devres payload recording the arguments of one __devm_request_region()
 * call so the same region can later be passed to __release_region().
 * The member order matters: __devm_release_region() fills this structure
 * with a positional initializer.
 */
struct region_devres {
        /* resource tree the region was requested from */
        struct resource *parent;
        /* start address of the region */
        resource_size_t start;
        /* size of the region */
        resource_size_t n;
};

/*
 * devres release callback for __devm_request_region(): give back the region
 * recorded in the struct region_devres payload @res.
 */
static void devm_region_release(struct device *dev, void *res)
{
        struct region_devres *dr = res;

        __release_region(dr->parent, dr->start, dr->n);
}

/*
 * devres match callback: non-zero when the recorded region @res has the same
 * parent, start and size as the region described by @match_data.
 */
static int devm_region_match(struct device *dev, void *res, void *match_data)
{
        struct region_devres *have = res;
        struct region_devres *want = match_data;

        if (have->parent != want->parent)
                return 0;
        if (have->start != want->start)
                return 0;

        return have->n == want->n;
}

/*
 * Device-managed wrapper around __request_region() (called with flags 0).
 * On success the region is recorded in @dev's devres list so that
 * devm_region_release() can give it back later.
 *
 * Returns the new resource, or NULL if either the devres allocation or the
 * region request failed.
 */
struct resource *
__devm_request_region(struct device *dev, struct resource *parent,
                      resource_size_t start, resource_size_t n, const char *name)
{
        struct region_devres *record;
        struct resource *region;

        record = devres_alloc(devm_region_release, sizeof(struct region_devres),
                              GFP_KERNEL);
        if (!record)
                return NULL;

        record->parent = parent;
        record->start = start;
        record->n = n;

        region = __request_region(parent, start, n, name, 0);
        if (!region) {
                /* nothing was reserved, so there is nothing to track */
                devres_free(record);
                return NULL;
        }

        devres_add(dev, record);
        return region;
}
EXPORT_SYMBOL(__devm_request_region);

/*
 * Release a region obtained with __devm_request_region() right away and
 * drop the matching devres record.  A warning is raised if devres_destroy()
 * reports a failure for that record.
 */
void __devm_release_region(struct device *dev, struct resource *parent,
                           resource_size_t start, resource_size_t n)
{
        struct region_devres match_data = {
                .parent = parent,
                .start = start,
                .n = n,
        };
        int err;

        __release_region(parent, start, n);

        err = devres_destroy(dev, devm_region_release, devm_region_match,
                             &match_data);
        WARN_ON(err);
}
EXPORT_SYMBOL(__devm_release_region);

/*
 * Reserve I/O ports or memory based on "reserve=" kernel parameter.
 */
#define MAXRESERVE 4
/*
 * Handler for the "reserve=" boot parameter.  @str is consumed as a list of
 * start,size pairs; each pair becomes a busy resource named "reserved".
 * At most MAXRESERVE regions are recorded; a slot is only counted as used
 * when request_resource() succeeds.  Always returns 1.
 */
static int __init reserve_setup(char *str)
{
        static int reserved;
        static struct resource reserve[MAXRESERVE];

        for (;;) {
                unsigned int io_start, io_num;
                struct resource *parent;
                struct resource *res;
                int x = reserved;

                /*
                 * NOTE(review): presumably 2 means "number followed by a
                 * comma" and 0 means "no number" - confirm against
                 * get_option().
                 */
                if (get_option(&str, &io_start) != 2)
                        break;
                if (get_option(&str, &io_num) == 0)
                        break;

                /* Table full: keep consuming pairs, but record nothing. */
                if (x >= MAXRESERVE)
                        continue;

                res = &reserve[x];
                res->name = "reserved";
                res->start = io_start;
                res->end = io_start + io_num - 1;
                res->desc = IORES_DESC_NONE;
                res->child = NULL;

                /*
                 * If the region starts below 0x10000, we assume it's
                 * I/O port space; otherwise assume it's memory.
                 */
                if (io_start < 0x10000) {
                        res->flags = IORESOURCE_IO | IORESOURCE_BUSY;
                        parent = &ioport_resource;
                } else {
                        res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                        parent = &iomem_resource;
                }

                if (request_resource(parent, res) == 0)
                        reserved = x + 1;
        }

        return 1;
}
__setup("reserve=", reserve_setup);

/*
 * Check if the requested addr and size spans more than any slot in the
 * iomem resource tree.
 *
 * Walks the resources below iomem_resource with resource_lock read-held.
 * A non-busy resource that overlaps [addr, addr + size - 1] without
 * containing it at page granularity is reported with a warning.
 *
 * Returns 0 if no such resource exists, -1 otherwise.
 */
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
{
        struct resource *p = &iomem_resource;
        int err = 0;
        /*
         * Only passed to r_next() as its position argument and never read
         * here; initialised so the helper never sees an indeterminate value.
         */
        loff_t l = 0;

        read_lock(&resource_lock);
        for (p = p->child; p ; p = r_next(NULL, p, &l)) {
                /*
                 * We can probably skip the resources without
                 * IORESOURCE_IO attribute?
                 */
                /* no overlap with the request: not interesting */
                if (p->start >= addr + size)
                        continue;
                if (p->end < addr)
                        continue;
                /* request lies inside this resource (page granularity): fine */
                if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
                    PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
                        continue;
                /*
                 * if a resource is "BUSY", it's not a hardware resource
                 * but a driver mapping of such a resource; we don't want
                 * to warn for those; some drivers legitimately map only
                 * partial hardware resources. (example: vesafb)
                 */
                if (p->flags & IORESOURCE_BUSY)
                        continue;

                printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
                       (unsigned long long)addr,
                       (unsigned long long)(addr + size - 1),
                       p->name, p);
                err = -1;
                break;
        }
        read_unlock(&resource_lock);

        return err;
}

/*
 * When zero, iomem_is_exclusive() reports every address as not exclusive.
 * The build-time default follows CONFIG_STRICT_DEVMEM; the "iomem=" boot
 * parameter (see strict_iomem()) can override it.
 */
#ifdef CONFIG_STRICT_DEVMEM
static int strict_iomem_checks = 1;
#else
static int strict_iomem_checks;
#endif

/*
 * check if an address is reserved in the iomem resource tree
 * returns true if reserved, false if not reserved.
 *
 * The check covers the whole page containing @addr and is skipped (result
 * false) when strict_iomem_checks is zero.
 */
bool iomem_is_exclusive(u64 addr)
{
        struct resource *p = &iomem_resource;
        bool err = false;
        /*
         * Only passed to r_next() as its position argument and never read
         * here; initialised so the helper never sees an indeterminate value.
         */
        loff_t l = 0;
        int size = PAGE_SIZE;

        if (!strict_iomem_checks)
                return false;

        addr = addr & PAGE_MASK;

        read_lock(&resource_lock);
        for (p = p->child; p ; p = r_next(NULL, p, &l)) {
                /*
                 * We can probably skip the resources without
                 * IORESOURCE_IO attribute?
                 */
                /* this resource starts beyond the page: stop searching */
                if (p->start >= addr + size)
                        break;
                if (p->end < addr)
                        continue;
                /*
                 * A resource is exclusive if IORESOURCE_EXCLUSIVE is set
                 * or CONFIG_IO_STRICT_DEVMEM is enabled and the
                 * resource is busy.
                 */
                if ((p->flags & IORESOURCE_BUSY) == 0)
                        continue;
                if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
                                || p->flags & IORESOURCE_EXCLUSIVE) {
                        err = true;
                        break;
                }
        }
        read_unlock(&resource_lock);

        return err;
}

/*
 * Allocate a zeroed resource_entry followed by @extra_size additional bytes.
 * The entry refers to @res when one is given, otherwise to the resource
 * embedded in the entry itself.
 *
 * Returns the new entry, or NULL if the allocation failed.
 */
struct resource_entry *resource_list_create_entry(struct resource *res,
                                                  size_t extra_size)
{
        struct resource_entry *entry;

        entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
        if (!entry)
                return NULL;

        INIT_LIST_HEAD(&entry->node);
        if (res)
                entry->res = res;
        else
                entry->res = &entry->__res;

        return entry;
}
EXPORT_SYMBOL(resource_list_create_entry);

void resource_list_free(struct list_head *head)
{
        struct resource_entry *entry, *tmp;

        list_for_each_entry_safe(entry, tmp, head, node)
                resource_list_destroy_entry(entry);
}
EXPORT_SYMBOL(resource_list_free);

#ifdef CONFIG_DEVICE_PRIVATE
/*
 * Find an unused physical range of @size bytes (rounded up to a multiple of
 * 1UL << PA_SECTION_SHIFT) inside @base and request it as a memory region
 * called @name.  Candidates are probed from the top of @base, capped at the
 * limit given by MAX_PHYSMEM_BITS, downwards in steps of the rounded size.
 * With a non-NULL @dev the region is requested device-managed.
 *
 * Returns the region, tagged IORES_DESC_DEVICE_PRIVATE_MEMORY, on success,
 * ERR_PTR(-ENOMEM) if requesting the first free candidate fails, or
 * ERR_PTR(-ERANGE) if no free candidate exists.
 */
static struct resource *__request_free_mem_region(struct device *dev,
                struct resource *base, unsigned long size, const char *name)
{
        resource_size_t end, addr;
        struct resource *res;

        size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
        end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);

        for (addr = end - size + 1UL;
             addr > size && addr >= base->start;
             addr -= size) {
                if (region_intersects(addr, size, 0, IORES_DESC_NONE) ==
                                REGION_DISJOINT) {
                        /* first free candidate: take it or fail */
                        res = dev ? devm_request_mem_region(dev, addr, size, name)
                                  : request_mem_region(addr, size, name);
                        if (!res)
                                return ERR_PTR(-ENOMEM);

                        res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
                        return res;
                }
        }

        return ERR_PTR(-ERANGE);
}

/**
 * devm_request_free_mem_region - find free region for device private memory
 *
 * @dev: device struct to bind the resource to
 * @base: resource tree to look in
 * @size: size in bytes of the device memory to add
 *
 * This function tries to find an empty range of physical address big enough to
 * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
 * memory, which in turn allocates struct pages.
 *
 * The region is named after @dev (dev_name()) and requested device-managed.
 * The result is whatever __request_free_mem_region() returns: the region on
 * success, an ERR_PTR() value otherwise.
 */
struct resource *devm_request_free_mem_region(struct device *dev,
                struct resource *base, unsigned long size)
{
        const char *name = dev_name(dev);

        return __request_free_mem_region(dev, base, size, name);
}
EXPORT_SYMBOL_GPL(devm_request_free_mem_region);

/*
 * Variant of devm_request_free_mem_region() that is not tied to a device:
 * the caller supplies @name, and the region is requested without devres
 * tracking.  Returns what __request_free_mem_region() returns.
 */
struct resource *request_free_mem_region(struct resource *base,
                unsigned long size, const char *name)
{
        struct device *no_dev = NULL;

        return __request_free_mem_region(no_dev, base, size, name);
}
EXPORT_SYMBOL_GPL(request_free_mem_region);

#endif /* CONFIG_DEVICE_PRIVATE */

/*
 * Handler for the "iomem=" boot parameter: "relaxed" clears
 * strict_iomem_checks, "strict" sets it.  If both words occur, "strict"
 * wins because it is applied last.  Always returns 1.
 */
static int __init strict_iomem(char *str)
{
        const char *relaxed = strstr(str, "relaxed");
        const char *strict = strstr(str, "strict");

        if (relaxed)
                strict_iomem_checks = 0;
        if (strict)
                strict_iomem_checks = 1;

        return 1;
}

__setup("iomem=", strict_iomem);






























































































































































































































































































































































































































































































































































































































































































    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/ipc/shm.c
 * Copyright (C) 1992, 1993 Krishna Balasubramanian
 *         Many improvements/fixes by Bruno Haible.
 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
 *
 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
 * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
 * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Better ipc lock (kern_ipc_perm.lock) handling
 * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013.
 */

#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/mount.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>

#include <linux/uaccess.h>

#include "util.h"

/*
 * In-kernel descriptor of one System V shared memory segment.
 * The generic ipc descriptor is embedded as shm_perm, so code in this file
 * converts between the two with container_of().
 */
struct shmid_kernel /* private to the kernel */
{
        struct kern_ipc_perm        shm_perm;	/* embedded generic ipc descriptor */
        struct file                *shm_file;	/* backing file; cleared by shm_destroy() */
        unsigned long                shm_nattch;	/* attach count: ++ in __shm_open(), -- in shm_close() */
        unsigned long                shm_segsz;	/* segment size in bytes, as requested at creation */
        time64_t                shm_atim;	/* stamped in __shm_open() */
        time64_t                shm_dtim;	/* stamped in shm_close() */
        time64_t                shm_ctim;	/* stamped when the segment is created */
        struct pid                *shm_cprid;	/* tgid of the creating task */
        struct pid                *shm_lprid;	/* tgid of the last task that attached or detached */
        struct user_struct        *mlock_user;	/* handed to shmem_lock()/user_shm_unlock() on destroy */

        /*
         * The task that created the shm object, kept for
         * task_lock(shp->shm_creator)
         */
        struct task_struct        *shm_creator;

        /*
         * List by creator. task_lock(->shm_creator) required for read/write.
         * If list_empty(), then the creator is dead already.
         */
        struct list_head        shm_clist;
        struct ipc_namespace        *ns;	/* namespace the segment was added to (set in newseg()) */
} __randomize_layout;

/*
 * shm_mode upper byte flags (octal). They are OR'ed into shm_perm.mode
 * next to the permission bits.
 */
#define SHM_DEST        01000        /* segment will be destroyed on last detach */
#define SHM_LOCKED        02000   /* segment will not be swapped */

/*
 * Per-file state hung off file->private_data of the files that use
 * shm_file_operations. It ties such a file to one segment id in one
 * namespace and to the underlying file that really backs the memory.
 */
struct shm_file_data {
        int id;				/* segment id, looked up again via shm_lock(ns, id) */
        struct ipc_namespace *ns;	/* reference dropped in shm_release() */
        struct file *file;		/* underlying file; fput() in shm_release() */
        const struct vm_operations_struct *vm_ops;	/* underlying vm_ops, saved by shm_mmap() */
};

/* Access file->private_data as a struct shm_file_data pointer (usable as an lvalue). */
#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))

/* Operation tables are defined further down; declared here for early users. */
static const struct file_operations shm_file_operations;
static const struct vm_operations_struct shm_vm_ops;

/* The shm id set of an ipc namespace. */
#define shm_ids(ns)        ((ns)->ids[IPC_SHM_IDS])

/* Unlock a segment through its embedded kern_ipc_perm. */
#define shm_unlock(shp)                        \
        ipc_unlock(&(shp)->shm_perm)

/* Forward declarations of functions used before their definition. */
static int newseg(struct ipc_namespace *, struct ipc_params *);
static void shm_open(struct vm_area_struct *vma);
static void shm_close(struct vm_area_struct *vma);
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
#endif

/*
 * Initialise the shm state of an ipc namespace: limits taken from the
 * SHMMAX/SHMALL/SHMMNI constants, forced-RMID off, zero pages accounted,
 * and a freshly initialised id set.
 */
void shm_init_ns(struct ipc_namespace *ns)
{
        /* accounting and policy start out cleared */
        ns->shm_tot = 0;
        ns->shm_rmid_forced = 0;

        /* limits */
        ns->shm_ctlmni = SHMMNI;
        ns->shm_ctlall = SHMALL;
        ns->shm_ctlmax = SHMMAX;

        ipc_init_ids(&shm_ids(ns));
}

/*
 * Remove a segment, or mark it for removal if it is still attached.
 *
 * Called with shm_ids.rwsem (writer) and the shp structure locked.
 * Only shm_ids.rwsem remains locked on exit.
 */
static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{
        struct shmid_kernel *seg = container_of(ipcp, struct shmid_kernel,
                                                shm_perm);

        WARN_ON(ns != seg->ns);

        /* Nobody attached: the segment can go right away. */
        if (!seg->shm_nattch) {
                shm_destroy(ns, seg);
                return;
        }

        /* Still attached: flag it so the last detach destroys it ... */
        seg->shm_perm.mode |= SHM_DEST;
        /* ... and do not find it any more */
        ipc_set_key_private(&shm_ids(ns), &seg->shm_perm);
        shm_unlock(seg);
}

#ifdef CONFIG_IPC_NS
/*
 * Tear down the shm state of a namespace: hand the id set to free_ipcs()
 * with do_shm_rmid() as the per-object callback, then destroy the idr and
 * the key hash table.
 */
void shm_exit_ns(struct ipc_namespace *ns)
{
        struct ipc_ids *ids = &shm_ids(ns);

        free_ipcs(ns, ids, do_shm_rmid);
        idr_destroy(&ids->ipcs_idr);
        rhashtable_destroy(&ids->key_ht);
}
#endif

/*
 * Initcall that sets up the shm state of the initial ipc namespace.
 * Always reports success.
 */
static int __init ipc_ns_init(void)
{
        struct ipc_namespace *init_ns = &init_ipc_ns;

        shm_init_ns(init_ns);

        return 0;
}

pure_initcall(ipc_ns_init);

/*
 * Column header of the sysvipc/shm proc file. The layout is wider when
 * longs are more than 32 bits.
 */
#if BITS_PER_LONG <= 32
#define SHM_PROC_HEADER \
        "       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n"
#else
#define SHM_PROC_HEADER \
        "       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n"
#endif

/* Register the "sysvipc/shm" proc interface for the shm id set. */
void __init shm_init(void)
{
        ipc_init_proc_interface("sysvipc/shm", SHM_PROC_HEADER,
                                IPC_SHM_IDS, sysvipc_shm_proc_show);
}

/*
 * Look up segment @id in @ns through ipc_obtain_object_idr().
 * Returns the enclosing shmid_kernel, or the lookup's ERR_PTR unchanged.
 */
static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id)
{
        struct kern_ipc_perm *perm;

        perm = ipc_obtain_object_idr(&shm_ids(ns), id);
        if (!IS_ERR(perm))
                return container_of(perm, struct shmid_kernel, shm_perm);

        return ERR_CAST(perm);
}

/*
 * Look up segment @id in @ns through ipc_obtain_object_check().
 * Returns the enclosing shmid_kernel, or the lookup's ERR_PTR unchanged.
 */
static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id)
{
        struct kern_ipc_perm *perm;

        perm = ipc_obtain_object_check(&shm_ids(ns), id);
        if (!IS_ERR(perm))
                return container_of(perm, struct shmid_kernel, shm_perm);

        return ERR_CAST(perm);
}

/*
 * shm_lock_(check_) routines are called in the paths where the rwsem
 * is not necessarily held.
 *
 * On success the segment is returned locked, with the RCU read lock still
 * held. On failure an ERR_PTR is returned and nothing is held; callers
 * must check the returned pointer and error out as appropriate.
 */
static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
{
        struct kern_ipc_perm *perm;

        rcu_read_lock();
        perm = ipc_obtain_object_idr(&shm_ids(ns), id);
        if (!IS_ERR(perm)) {
                ipc_lock_object(perm);
                /*
                 * ipc_rmid() may have already freed the ID while
                 * ipc_lock_object() was spinning, so re-validate the
                 * structure now that we own the lock.
                 */
                if (ipc_valid_object(perm))
                        return container_of(perm, struct shmid_kernel,
                                            shm_perm);

                /* Lost the race with RMID: the ID names a removed identifier. */
                ipc_unlock_object(perm);
                perm = ERR_PTR(-EIDRM);
        }

        rcu_read_unlock();
        return ERR_CAST(perm);
}

/*
 * Lock a segment the caller already has a pointer to: take the RCU read
 * lock first, then the object lock of the embedded kern_ipc_perm.
 */
static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
{
        struct kern_ipc_perm *perm = &ipcp->shm_perm;

        rcu_read_lock();
        ipc_lock_object(perm);
}

/*
 * RCU callback that releases a segment descriptor: walks from the rcu head
 * to the kern_ipc_perm and on to the enclosing shmid_kernel, frees the
 * security state and then the descriptor itself.
 */
static void shm_rcu_free(struct rcu_head *head)
{
        struct kern_ipc_perm *perm;
        struct shmid_kernel *seg;

        perm = container_of(head, struct kern_ipc_perm, rcu);
        seg = container_of(perm, struct shmid_kernel, shm_perm);

        security_shm_free(&seg->shm_perm);
        kvfree(seg);
}

/*
 * Unlink shp from its creator's list of segments.
 *
 * It has to be called with shp locked.
 * It must be called before ipc_rmid()
 */
static inline void shm_clist_rm(struct shmid_kernel *shp)
{
        /* ensure that shm_creator does not disappear */
        rcu_read_lock();

        /*
         * A concurrent exit_shm may have unlinked the entry already;
         * in that case there is nothing left to do.
         */
        if (!list_empty(&shp->shm_clist)) {
                /*
                 * shp->shm_creator is guaranteed to be valid *only*
                 * while shp->shm_clist is not empty.
                 */
                struct task_struct *owner = shp->shm_creator;

                task_lock(owner);
                /* a nop if the entry was removed from the list meanwhile */
                list_del_init(&shp->shm_clist);
                task_unlock(owner);
        }

        rcu_read_unlock();
}

/*
 * Take a segment off its creator's list, then remove it from the id set
 * of its namespace. The list removal has to come first.
 */
static inline void shm_rmid(struct shmid_kernel *s)
{
        struct kern_ipc_perm *perm = &s->shm_perm;

        shm_clist_rm(s);
        ipc_rmid(&shm_ids(s->ns), perm);
}


/*
 * Account one more attach of the segment behind @vma.
 *
 * Returns 0 on success, the error from shm_lock() if the id can no longer
 * be locked, or -EINVAL if the id now belongs to a different segment.
 */
static int __shm_open(struct vm_area_struct *vma)
{
        struct shm_file_data *sfd = shm_file_data(vma->vm_file);
        struct shmid_kernel *seg;
        int err;

        seg = shm_lock(sfd->ns, sfd->id);
        if (IS_ERR(seg))
                return PTR_ERR(seg);

        if (seg->shm_file == sfd->file) {
                seg->shm_atim = ktime_get_real_seconds();
                ipc_update_pid(&seg->shm_lprid, task_tgid(current));
                seg->shm_nattch++;
                err = 0;
        } else {
                /* ID was reused */
                err = -EINVAL;
        }

        shm_unlock(seg);
        return err;
}

/*
 * vm_ops->open callback. This is called by fork, once for every shm attach.
 *
 * A failure means we raced in the idr lookup or with shm_destroy();
 * either way the ID is busted, so all we can do is warn (once).
 */
static void shm_open(struct vm_area_struct *vma)
{
        WARN_ON_ONCE(__shm_open(vma));
}

/*
 * shm_destroy - free the struct shmid_kernel
 *
 * @ns: namespace
 * @shp: struct to free
 *
 * It has to be called with shp and shm_ids.rwsem (writer) locked,
 * but returns with shp unlocked and freed.
 *
 * The statement order matters: the segment is removed from the id set
 * while still locked, and the file and pid references are only dropped
 * after the lock has been released.
 */
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
        struct file *shm_file;

        /* Detach the file from shp; the local copy is used after unlocking. */
        shm_file = shp->shm_file;
        shp->shm_file = NULL;
        /* Give back the segment's pages (size rounded up to whole pages). */
        ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
        shm_rmid(shp);
        shm_unlock(shp);
        /*
         * Undo lock accounting against mlock_user.
         * NOTE(review): presumably shmem_lock(..., 0, ...) unlocks a
         * SHM_LOCK'ed shmem file - confirm against mm/shmem.c.
         */
        if (!is_file_hugepages(shm_file))
                shmem_lock(shm_file, 0, shp->mlock_user);
        else if (shp->mlock_user)
                user_shm_unlock(i_size_read(file_inode(shm_file)),
                                shp->mlock_user);
        fput(shm_file);
        /* Drop the creator / last-operation pid references. */
        ipc_update_pid(&shp->shm_cprid, NULL);
        ipc_update_pid(&shp->shm_lprid, NULL);
        /* Drop our reference; shm_rcu_free is the release callback. */
        ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
}

/*
 * shm_may_destroy - identifies whether shm segment should be destroyed now
 *
 * Returns true if and only if there are no active users of the segment and
 * one of the following is true:
 *
 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
 *
 * 2) sysctl kernel.shm_rmid_forced is set to 1.
 */
static bool shm_may_destroy(struct shmid_kernel *shp)
{
        return (shp->shm_nattch == 0) &&
               (shp->ns->shm_rmid_forced ||
                (shp->shm_perm.mode & SHM_DEST));
}

/*
 * vm_ops->close callback: remove the attach descriptor vma.
 * Frees the memory for the segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
static void shm_close(struct vm_area_struct *vma)
{
        struct shm_file_data *sfd = shm_file_data(vma->vm_file);
        struct ipc_namespace *ns = sfd->ns;
        struct shmid_kernel *seg;

        down_write(&shm_ids(ns).rwsem);

        /* remove from the list of attaches of the shm segment */
        seg = shm_lock(ns, sfd->id);

        /*
         * A failed lookup means we raced in the idr lookup or with
         * shm_destroy(). Either way, the ID is busted: warn once and
         * treat the close as a no-op.
         */
        if (!WARN_ON_ONCE(IS_ERR(seg))) {
                ipc_update_pid(&seg->shm_lprid, task_tgid(current));
                seg->shm_dtim = ktime_get_real_seconds();
                seg->shm_nattch--;

                if (shm_may_destroy(seg))
                        shm_destroy(ns, seg);
                else
                        shm_unlock(seg);
        }

        up_write(&shm_ids(ns).rwsem);
}

/*
 * idr_for_each() callback: destroy the segment if it is orphaned.
 * @p is the kern_ipc_perm of the segment, @data the ipc namespace.
 * Always returns 0.
 *
 * Called with ns->shm_ids(ns).rwsem locked
 */
static int shm_try_destroy_orphaned(int id, void *p, void *data)
{
        struct kern_ipc_perm *perm = p;
        struct shmid_kernel *seg = container_of(perm, struct shmid_kernel,
                                                shm_perm);
        struct ipc_namespace *ns = data;

        /*
         * Only segments without users whose originating process has
         * already exited (empty creator list entry) are destroyed.
         *
         * As shp->* are changed under rwsem, it's safe to skip shp locking
         * for these checks.
         */
        if (list_empty(&seg->shm_clist) && shm_may_destroy(seg)) {
                shm_lock_by_ptr(seg);
                shm_destroy(ns, seg);
        }

        return 0;
}

void shm_destroy_orphaned(struct ipc_namespace *ns)
{
        down_write(&shm_ids(ns).rwsem);
        if (shm_ids(ns).in_use)
                idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
        up_write(&shm_ids(ns).rwsem);
}

/*
 * exit_shm - drop the exiting task's list of created segments
 *
 * Walks task->sysvshm.shm_clist one entry at a time. Every entry is
 * unlinked from the list. When the segment's namespace has shm_rmid_forced
 * set and a namespace reference can still be taken, the segment is also
 * destroyed if shm_may_destroy() allows it.
 *
 * Locking assumes this will only be called with task == current
 */
void exit_shm(struct task_struct *task)
{
        for (;;) {
                struct shmid_kernel *shp;
                struct ipc_namespace *ns;

                task_lock(task);

                /* List drained: done. */
                if (list_empty(&task->sysvshm.shm_clist)) {
                        task_unlock(task);
                        break;
                }

                shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
                                shm_clist);

                /*
                 * 1) Get a pointer to the ipc namespace. Note that this
                 * pointer is guaranteed to be valid because the lifetime
                 * of shp is always shorter than the lifetime of the
                 * namespace in which shp lives.
                 * Since we hold task_lock, shp cannot be freed.
                 */
                ns = shp->ns;

                /*
                 * 2) If kernel.shm_rmid_forced is not set then only keep track of
                 * which shmids are orphaned, so that a later set of the sysctl
                 * can clean them up.
                 */
                if (!ns->shm_rmid_forced)
                        goto unlink_continue;

                /*
                 * 3) Get a reference to the namespace.
                 *    The refcount could already be 0. If it is 0, then
                 *    the shm objects will be freed by free_ipc_work().
                 */
                ns = get_ipc_ns_not_zero(ns);
                if (!ns) {
unlink_continue:
                        /* Just orphan the segment and move on to the next one. */
                        list_del_init(&shp->shm_clist);
                        task_unlock(task);
                        continue;
                }

                /*
                 * 4) Get a reference to shp.
                 *   This cannot fail: shm_clist_rm() is called before
                 *   ipc_rmid(), thus the refcount cannot be 0.
                 */
                WARN_ON(!ipc_rcu_getref(&shp->shm_perm));

                /*
                 * 5) Unlink the shm segment from the list of segments
                 *    created by current.
                 *    This must be done last. After unlinking,
                 *    only the refcounts obtained above prevent IPC_RMID
                 *    from destroying the segment or the namespace.
                 */
                list_del_init(&shp->shm_clist);

                task_unlock(task);

                /*
                 * 6) We have all references.
                 *    Thus lock and, if needed, destroy shp.
                 */
                down_write(&shm_ids(ns).rwsem);
                shm_lock_by_ptr(shp);
                /*
                 * rcu_read_lock was implicitly taken in shm_lock_by_ptr, so
                 * it is safe to call ipc_rcu_putref here
                 */
                ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);

                if (ipc_valid_object(&shp->shm_perm)) {
                        if (shm_may_destroy(shp))
                                shm_destroy(ns, shp);
                        else
                                shm_unlock(shp);
                } else {
                        /*
                         * Someone else deleted the shp from the namespace
                         * idr/kht while we were waiting.
                         * Just unlock and continue.
                         */
                        shm_unlock(shp);
                }

                up_write(&shm_ids(ns).rwsem);
                put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
        }
}

static vm_fault_t shm_fault(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct shm_file_data *sfd = shm_file_data(file);

        return sfd->vm_ops->fault(vmf);
}

static int shm_split(struct vm_area_struct *vma, unsigned long addr)
{
        struct file *file = vma->vm_file;
        struct shm_file_data *sfd = shm_file_data(file);

        if (sfd->vm_ops->split)
                return sfd->vm_ops->split(vma, addr);

        return 0;
}

static unsigned long shm_pagesize(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct shm_file_data *sfd = shm_file_data(file);

        if (sfd->vm_ops->pagesize)
                return sfd->vm_ops->pagesize(vma);

        return PAGE_SIZE;
}

#ifdef CONFIG_NUMA
static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
{
        struct file *file = vma->vm_file;
        struct shm_file_data *sfd = shm_file_data(file);
        int err = 0;

        if (sfd->vm_ops->set_policy)
                err = sfd->vm_ops->set_policy(vma, new);
        return err;
}

static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
                                        unsigned long addr)
{
        struct file *file = vma->vm_file;
        struct shm_file_data *sfd = shm_file_data(file);
        struct mempolicy *pol = NULL;

        if (sfd->vm_ops->get_policy)
                pol = sfd->vm_ops->get_policy(vma, addr);
        else if (vma->vm_policy)
                pol = vma->vm_policy;

        return pol;
}
#endif

/*
 * f_op->mmap of the shm wrapper file: account the attach, let the
 * underlying file set up the mapping, then interpose shm_vm_ops on the vma
 * while remembering the underlying vm_ops in the file data.
 *
 * Returns 0 on success or the error from __shm_open() / call_mmap().
 */
static int shm_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct shm_file_data *sfd = shm_file_data(file);
        int err;

        /*
         * In case of remap_file_pages() emulation, the file can represent an
         * IPC ID that was removed, and possibly even reused by another shm
         * segment already.  Propagate this case as an error to caller.
         */
        err = __shm_open(vma);
        if (err)
                return err;

        err = call_mmap(sfd->file, vma);
        if (!err) {
                /* keep the real ops for forwarding, then take over the vma */
                sfd->vm_ops = vma->vm_ops;
#ifdef CONFIG_MMU
                WARN_ON(!sfd->vm_ops->fault);
#endif
                vma->vm_ops = &shm_vm_ops;
                return 0;
        }

        /* undo the attach accounted above */
        shm_close(vma);
        return err;
}

static int shm_release(struct inode *ino, struct file *file)
{
        struct shm_file_data *sfd = shm_file_data(file);

        put_ipc_ns(sfd->ns);
        fput(sfd->file);
        shm_file_data(file) = NULL;
        kfree(sfd);
        return 0;
}

/*
 * f_op->fsync of the shm wrapper file: forward to the underlying file's
 * fsync, or fail with -EINVAL when it has none.
 */
static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
        struct file *backing = shm_file_data(file)->file;

        if (backing->f_op->fsync)
                return backing->f_op->fsync(backing, start, end, datasync);

        return -EINVAL;
}

/*
 * f_op->fallocate of the shm wrapper file: forward to the underlying
 * file's fallocate, or fail with -EOPNOTSUPP when it has none.
 *
 * NOTE(review): the handler is invoked with the wrapper @file rather than
 * the underlying file, unlike shm_fsync() and shm_get_unmapped_area().
 * Confirm that this is intended before changing it.
 */
static long shm_fallocate(struct file *file, int mode, loff_t offset,
                          loff_t len)
{
        const struct file_operations *fops = shm_file_data(file)->file->f_op;

        if (fops->fallocate)
                return fops->fallocate(file, mode, offset, len);

        return -EOPNOTSUPP;
}

static unsigned long shm_get_unmapped_area(struct file *file,
        unsigned long addr, unsigned long len, unsigned long pgoff,
        unsigned long flags)
{
        struct shm_file_data *sfd = shm_file_data(file);

        return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
                                                pgoff, flags);
}

/* File operations of the shm wrapper file; the shm_* handlers forward to the underlying file. */
static const struct file_operations shm_file_operations = {
        .llseek                 = noop_llseek,
        .mmap                   = shm_mmap,
        .get_unmapped_area      = shm_get_unmapped_area,
        .fsync                  = shm_fsync,
        .fallocate              = shm_fallocate,
        .release                = shm_release,
};

/*
 * shm_file_operations_huge carries the same handlers as
 * shm_file_operations. It stays a separate object because
 * is_file_shm_hugepages() identifies hugepage shm files by the address of
 * their f_op table.
 */
static const struct file_operations shm_file_operations_huge = {
        .llseek                 = noop_llseek,
        .mmap                   = shm_mmap,
        .get_unmapped_area      = shm_get_unmapped_area,
        .fsync                  = shm_fsync,
        .fallocate              = shm_fallocate,
        .release                = shm_release,
};

/* True when @file uses the hugepage variant of the shm file operations. */
bool is_file_shm_hugepages(struct file *file)
{
        const struct file_operations *fops = file->f_op;

        return fops == &shm_file_operations_huge;
}

/*
 * vm_ops installed on shm mappings by shm_mmap(). open/close maintain the
 * attach accounting; the other handlers forward to the underlying vm_ops.
 */
static const struct vm_operations_struct shm_vm_ops = {
        .open           = shm_open,     /* a new vm-area is opened */
        .close          = shm_close,    /* the vm-area is released */
        .fault          = shm_fault,
        .split          = shm_split,
        .pagesize       = shm_pagesize,
#ifdef CONFIG_NUMA
        .get_policy     = shm_get_policy,
        .set_policy     = shm_set_policy,
#endif
};

/**
 * newseg - Create a new shared memory segment
 * @ns: namespace
 * @params: ptr to the structure that contains key, size and shmflg
 *
 * Called with shm_ids.rwsem held as a writer.
 *
 * Return: the id of the new segment on success, otherwise a negative errno:
 * -EINVAL when the size is outside [SHMMIN, ns->shm_ctlmax] or no hstate
 * matches the requested huge page size, -ENOSPC when the page count
 * overflows or would push ns->shm_tot past ns->shm_ctlall, -ENOMEM when the
 * descriptor cannot be allocated, or the error reported by
 * security_shm_alloc(), the backing file setup or ipc_addid().
 */
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
        key_t key = params->key;
        int shmflg = params->flg;
        size_t size = params->u.size;
        int error;
        struct shmid_kernel *shp;
        size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        struct file *file;
        char name[13];  /* "SYSV" + 8 hex digits + NUL */
        vm_flags_t acctflag = 0;

        if (size < SHMMIN || size > ns->shm_ctlmax)
                return -EINVAL;

        /* rounding up to whole pages wrapped around */
        if (numpages << PAGE_SHIFT < size)
                return -ENOSPC;

        /* namespace-wide page total would overflow or exceed the limit */
        if (ns->shm_tot + numpages < ns->shm_tot ||
                        ns->shm_tot + numpages > ns->shm_ctlall)
                return -ENOSPC;

        shp = kvmalloc(sizeof(*shp), GFP_KERNEL_ACCOUNT);
        if (unlikely(!shp))
                return -ENOMEM;

        shp->shm_perm.key = key;
        shp->shm_perm.mode = (shmflg & S_IRWXUGO);
        shp->mlock_user = NULL;

        shp->shm_perm.security = NULL;
        error = security_shm_alloc(&shp->shm_perm);
        if (error) {
                kvfree(shp);
                return error;
        }

        /* bounded formatting: never write past name[] */
        snprintf(name, sizeof(name), "SYSV%08x", key);
        if (shmflg & SHM_HUGETLB) {
                struct hstate *hs;
                size_t hugesize;

                hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
                if (!hs) {
                        error = -EINVAL;
                        goto no_file;
                }
                hugesize = ALIGN(size, huge_page_size(hs));

                /* hugetlb_file_setup applies strict accounting */
                if (shmflg & SHM_NORESERVE)
                        acctflag = VM_NORESERVE;
                file = hugetlb_file_setup(name, hugesize, acctflag,
                                  &shp->mlock_user, HUGETLB_SHMFS_INODE,
                                (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
        } else {
                /*
                 * Do not allow no accounting for OVERCOMMIT_NEVER, even
                 * if it's asked for.
                 */
                if  ((shmflg & SHM_NORESERVE) &&
                                sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        acctflag = VM_NORESERVE;
                file = shmem_kernel_file_setup(name, size, acctflag);
        }
        error = PTR_ERR(file);
        if (IS_ERR(file))
                goto no_file;

        shp->shm_cprid = get_pid(task_tgid(current));
        shp->shm_lprid = NULL;
        shp->shm_atim = shp->shm_dtim = 0;
        shp->shm_ctim = ktime_get_real_seconds();
        shp->shm_segsz = size;
        shp->shm_nattch = 0;
        shp->shm_file = file;
        shp->shm_creator = current;

        /* ipc_addid() locks shp upon success. */
        error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
        if (error < 0)
                goto no_id;

        shp->ns = ns;

        /* link the segment into the creator's list, see exit_shm() */
        task_lock(current);
        list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
        task_unlock(current);

        /*
         * shmid gets reported as "inode#" in /proc/pid/maps.
         * proc-ps tools use this. Changing this will break them.
         */
        file_inode(file)->i_ino = shp->shm_perm.id;

        ns->shm_tot += numpages;
        error = shp->shm_perm.id;

        /* release what ipc_addid() took: the object lock and the RCU read lock */
        ipc_unlock_object(&shp->shm_perm);
        rcu_read_unlock();
        return error;

no_id:
        /* undo everything done after the backing file was created */
        ipc_update_pid(&shp->shm_cprid, NULL);
        ipc_update_pid(&shp->shm_lprid, NULL);
        if (is_file_hugepages(file) && shp->mlock_user)
                user_shm_unlock(size, shp->mlock_user);
        fput(file);
        ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
        return error;
no_file:
        /* no file and no id yet: free the descriptor through the RCU callback */
        call_rcu(&shp->shm_perm.rcu, shm_rcu_free);
        return error;
}

/*
 * Extra check for shmget() on an existing segment: the segment must be at
 * least as large as the requested size. Returns 0 if so, else -EINVAL.
 *
 * Called with shm_ids.rwsem and ipcp locked.
 */
static int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params)
{
        struct shmid_kernel *seg = container_of(ipcp, struct shmid_kernel,
                                                shm_perm);

        return (seg->shm_segsz < params->u.size) ? -EINVAL : 0;
}

/*
 * Kernel-side implementation of shmget(): bundle the arguments into an
 * ipc_params and let ipcget() find or create the segment in the caller's
 * ipc namespace. Returns whatever ipcget() returns.
 */
long ksys_shmget(key_t key, size_t size, int shmflg)
{
        static const struct ipc_ops shm_ops = {
                .getnew = newseg,
                .associate = security_shm_associate,
                .more_checks = shm_more_checks,
        };
        struct ipc_namespace *ns = current->nsproxy->ipc_ns;
        struct ipc_params shm_params = {
                .key = key,
                .flg = shmflg,
                .u.size = size,
        };

        return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
}

/* shmget system call entry point: a thin wrapper around ksys_shmget(). */
SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
{
        return ksys_shmget(key, size, shmflg);
}

/*
 * Copy a shmid64_ds to user space in the layout selected by @version:
 * as is for IPC_64, converted to the old struct shmid_ds for IPC_OLD.
 *
 * Returns the result of copy_to_user(), or -EINVAL (converted to unsigned
 * long) for an unknown version.
 */
static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
{
        if (version == IPC_64)
                return copy_to_user(buf, in, sizeof(*in));

        if (version == IPC_OLD) {
                struct shmid_ds old;

                /* start from all zeroes so nothing unset reaches user space */
                memset(&old, 0, sizeof(old));
                ipc64_perm_to_ipc_perm(&in->shm_perm, &old.shm_perm);
                old.shm_segsz        = in->shm_segsz;
                old.shm_atime        = in->shm_atime;
                old.shm_dtime        = in->shm_dtime;
                old.shm_ctime        = in->shm_ctime;
                old.shm_cpid        = in->shm_cpid;
                old.shm_lpid        = in->shm_lpid;
                old.shm_nattch        = in->shm_nattch;

                return copy_to_user(buf, &old, sizeof(old));
        }

        return -EINVAL;
}

/*
 * Read a segment description from user space in the layout selected by
 * @version. For IPC_64 the whole shmid64_ds is copied; for IPC_OLD only
 * uid, gid and mode of shm_perm are taken over from the old struct.
 *
 * Returns 0 on success, -EFAULT if the user copy fails, or -EINVAL for an
 * unknown version (both converted to unsigned long).
 */
static inline unsigned long
copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
{
        if (version == IPC_64) {
                if (copy_from_user(out, buf, sizeof(*out)))
                        return -EFAULT;
                return 0;
        }

        if (version == IPC_OLD) {
                struct shmid_ds old;

                if (copy_from_user(&old, buf, sizeof(old)))
                        return -EFAULT;

                out->shm_perm.uid        = old.shm_perm.uid;
                out->shm_perm.gid        = old.shm_perm.gid;
                out->shm_perm.mode        = old.shm_perm.mode;

                return 0;
        }

        return -EINVAL;
}

/*
 * Copy the shm limits to user space in the layout selected by @version:
 * as is for IPC_64, converted to the old struct shminfo for IPC_OLD, where
 * shmmax is clamped to INT_MAX.
 *
 * Returns the result of copy_to_user(), or -EINVAL (converted to unsigned
 * long) for an unknown version.
 */
static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
{
        if (version == IPC_64)
                return copy_to_user(buf, in, sizeof(*in));

        if (version == IPC_OLD) {
                struct shminfo old;

                /* the old field is an int: clamp instead of truncating */
                if (in->shmmax > INT_MAX)
                        old.shmmax = INT_MAX;
                else
                        old.shmmax = (int)in->shmmax;

                old.shmmin        = in->shmmin;
                old.shmmni        = in->shmmni;
                old.shmseg        = in->shmseg;
                old.shmall        = in->shmall;

                return copy_to_user(buf, &old, sizeof(old));
        }

        return -EINVAL;
}

/*
 * Calculate and add used RSS and swap pages of a shm.
 * The counts are added to *rss_add and *swp_add; swap is only counted for
 * non-hugepage segments when CONFIG_SHMEM is enabled.
 *
 * Called with shm_ids.rwsem held as a reader
 */
static void shm_add_rss_swap(struct shmid_kernel *shp,
        unsigned long *rss_add, unsigned long *swp_add)
{
        struct file *backing = shp->shm_file;
        struct inode *inode = file_inode(backing);

        if (is_file_hugepages(backing)) {
                struct hstate *hs = hstate_file(backing);

                /* each entry of the mapping stands for one huge page */
                *rss_add += pages_per_huge_page(hs) * inode->i_mapping->nrpages;
        } else {
#ifdef CONFIG_SHMEM
                struct shmem_inode_info *shmem = SHMEM_I(inode);

                /* read both counters under the shmem inode lock */
                spin_lock_irq(&shmem->lock);
                *rss_add += inode->i_mapping->nrpages;
                *swp_add += shmem->swapped;
                spin_unlock_irq(&shmem->lock);
#else
                *rss_add += inode->i_mapping->nrpages;
#endif
        }
}

/*
 * Sum resident and swapped pages over every segment in @ns, storing the
 * totals in *rss and *swp.
 * Called with shm_ids.rwsem held as a reader
 */
static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
                unsigned long *swp)
{
        int remaining;
        int idx = 0;

        *rss = 0;
        *swp = 0;

        /*
         * The idr may be sparse, so walk indices upwards until as many
         * live entries have been seen as the namespace says are in use.
         */
        remaining = shm_ids(ns).in_use;
        while (remaining > 0) {
                struct kern_ipc_perm *perm;

                perm = idr_find(&shm_ids(ns).ipcs_idr, idx++);
                if (!perm)
                        continue;

                shm_add_rss_swap(container_of(perm, struct shmid_kernel, shm_perm),
                                 rss, swp);
                remaining--;
        }
}

/*
 * This function handles some shmctl commands which require the rwsem
 * to be held in write mode.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 *
 * @ns:      ipc namespace the segment id is looked up in
 * @shmid:   id of the segment to operate on
 * @cmd:     IPC_RMID or IPC_SET; anything else yields -EINVAL
 * @shmid64: new permission data; its shm_perm is passed to
 *           ipcctl_obtain_check() and, for IPC_SET, to ipc_update_perm()
 *
 * Returns 0 on success or a negative errno from the lookup, the security
 * hook or ipc_update_perm().
 */
static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
                       struct shmid64_ds *shmid64)
{
        struct kern_ipc_perm *ipcp;
        struct shmid_kernel *shp;
        int err;

        down_write(&shm_ids(ns).rwsem);
        rcu_read_lock();

        ipcp = ipcctl_obtain_check(ns, &shm_ids(ns), shmid, cmd,
                                      &shmid64->shm_perm, 0);
        if (IS_ERR(ipcp)) {
                err = PTR_ERR(ipcp);
                goto out_unlock1;
        }

        shp = container_of(ipcp, struct shmid_kernel, shm_perm);

        err = security_shm_shmctl(&shp->shm_perm, cmd);
        if (err)
                goto out_unlock1;

        /* err is 0 from here on unless a branch below overwrites it. */
        switch (cmd) {
        case IPC_RMID:
                ipc_lock_object(&shp->shm_perm);
                /* do_shm_rmid unlocks the ipc object and rcu */
                do_shm_rmid(ns, ipcp);
                /* Only the rwsem is still held, so skip both unlock labels. */
                goto out_up;
        case IPC_SET:
                ipc_lock_object(&shp->shm_perm);
                err = ipc_update_perm(&shmid64->shm_perm, ipcp);
                if (err)
                        goto out_unlock0;
                /* Record the time of the successful permission change. */
                shp->shm_ctim = ktime_get_real_seconds();
                break;
        default:
                /* The object lock was never taken on this path. */
                err = -EINVAL;
                goto out_unlock1;
        }

        /* Labels release in reverse order of acquisition. */
out_unlock0:
        ipc_unlock_object(&shp->shm_perm);
out_unlock1:
        rcu_read_unlock();
out_up:
        up_write(&shm_ids(ns).rwsem);
        return err;
}

/*
 * Fill @shminfo with the shm limits of @ns for the IPC_INFO command.
 *
 * Returns a negative errno if the security hook refuses the request.
 * Otherwise returns the value of ipc_get_maxidx() for the namespace,
 * with negative values reported as 0.
 */
static int shmctl_ipc_info(struct ipc_namespace *ns,
                           struct shminfo64 *shminfo)
{
        int max_idx;
        int err = security_shm_shmctl(NULL, IPC_INFO);

        if (err)
                return err;

        memset(shminfo, 0, sizeof(*shminfo));
        shminfo->shmseg = ns->shm_ctlmni;
        shminfo->shmmni = shminfo->shmseg;
        shminfo->shmmax = ns->shm_ctlmax;
        shminfo->shmall = ns->shm_ctlall;
        shminfo->shmmin = SHMMIN;

        down_read(&shm_ids(ns).rwsem);
        max_idx = ipc_get_maxidx(&shm_ids(ns));
        up_read(&shm_ids(ns).rwsem);

        return max_idx < 0 ? 0 : max_idx;
}

/*
 * Fill @shm_info with usage statistics of @ns for the SHM_INFO command.
 *
 * The counters are sampled with shm_ids.rwsem held for reading.  The two
 * swap_* members are always reported as 0.
 *
 * Returns a negative errno if the security hook refuses the request.
 * Otherwise returns the value of ipc_get_maxidx() for the namespace,
 * with negative values reported as 0.
 */
static int shmctl_shm_info(struct ipc_namespace *ns,
                           struct shm_info *shm_info)
{
        int max_idx;
        int err = security_shm_shmctl(NULL, SHM_INFO);

        if (err)
                return err;

        memset(shm_info, 0, sizeof(*shm_info));

        down_read(&shm_ids(ns).rwsem);
        shm_info->used_ids = shm_ids(ns).in_use;
        shm_get_stat(ns, &shm_info->shm_rss, &shm_info->shm_swp);
        shm_info->shm_tot = ns->shm_tot;
        shm_info->swap_attempts = 0;
        shm_info->swap_successes = 0;
        max_idx = ipc_get_maxidx(&shm_ids(ns));
        up_read(&shm_ids(ns).rwsem);

        return max_idx < 0 ? 0 : max_idx;
}

/*
 * Gather the state of one segment into @tbuf for IPC_STAT, SHM_STAT and
 * SHM_STAT_ANY.
 *
 * @ns:    ipc namespace to look the segment up in
 * @shmid: identifier handed in by the caller; resolved with
 *         shm_obtain_object() for SHM_STAT/SHM_STAT_ANY and with
 *         shm_obtain_object_check() for IPC_STAT
 * @cmd:   one of IPC_STAT, SHM_STAT, SHM_STAT_ANY
 * @tbuf:  kernel buffer to fill; zeroed first, so it is all-zero on error
 *
 * Returns 0 for IPC_STAT and the segment's full id for the two SHM_STAT
 * variants on success, or a negative errno.
 */
static int shmctl_stat(struct ipc_namespace *ns, int shmid,
                        int cmd, struct shmid64_ds *tbuf)
{
        struct shmid_kernel *shp;
        int err;

        memset(tbuf, 0, sizeof(*tbuf));

        rcu_read_lock();
        if (cmd == SHM_STAT || cmd == SHM_STAT_ANY) {
                shp = shm_obtain_object(ns, shmid);
                if (IS_ERR(shp)) {
                        err = PTR_ERR(shp);
                        goto out_unlock;
                }
        } else { /* IPC_STAT */
                shp = shm_obtain_object_check(ns, shmid);
                if (IS_ERR(shp)) {
                        err = PTR_ERR(shp);
                        goto out_unlock;
                }
        }

        /*
         * Semantically SHM_STAT_ANY ought to be identical to
         * that functionality provided by the /proc/sysvipc/
         * interface. As such, only audit these calls and
         * do not do traditional S_IRUGO permission checks on
         * the ipc object.
         */
        if (cmd == SHM_STAT_ANY)
                audit_ipc_obj(&shp->shm_perm);
        else {
                /* Preload the error so a failed check can jump straight out. */
                err = -EACCES;
                if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
                        goto out_unlock;
        }

        err = security_shm_shmctl(&shp->shm_perm, cmd);
        if (err)
                goto out_unlock;

        ipc_lock_object(&shp->shm_perm);

        /* The object may have been invalidated before the lock was taken. */
        if (!ipc_valid_object(&shp->shm_perm)) {
                ipc_unlock_object(&shp->shm_perm);
                err = -EIDRM;
                goto out_unlock;
        }

        /* Snapshot the segment under its object lock. */
        kernel_to_ipc64_perm(&shp->shm_perm, &tbuf->shm_perm);
        tbuf->shm_segsz        = shp->shm_segsz;
        tbuf->shm_atime        = shp->shm_atim;
        tbuf->shm_dtime        = shp->shm_dtim;
        tbuf->shm_ctime        = shp->shm_ctim;
#ifndef CONFIG_64BIT
        /* Without CONFIG_64BIT the upper halves of the times are stored separately. */
        tbuf->shm_atime_high = shp->shm_atim >> 32;
        tbuf->shm_dtime_high = shp->shm_dtim >> 32;
        tbuf->shm_ctime_high = shp->shm_ctim >> 32;
#endif
        tbuf->shm_cpid        = pid_vnr(shp->shm_cprid);
        tbuf->shm_lpid        = pid_vnr(shp->shm_lprid);
        tbuf->shm_nattch = shp->shm_nattch;

        if (cmd == IPC_STAT) {
                /*
                 * As defined in SUS:
                 * Return 0 on success
                 */
                err = 0;
        } else {
                /*
                 * SHM_STAT and SHM_STAT_ANY (both Linux specific)
                 * Return the full id, including the sequence number
                 */
                err = shp->shm_perm.id;
        }

        ipc_unlock_object(&shp->shm_perm);
out_unlock:
        rcu_read_unlock();
        return err;
}

/*
 * Implement SHM_LOCK and SHM_UNLOCK for the segment identified by @shmid.
 *
 * @ns:    ipc namespace to look the segment up in
 * @shmid: id of the segment
 * @cmd:   SHM_LOCK or SHM_UNLOCK (any value other than SHM_LOCK is treated
 *         as SHM_UNLOCK here; the callers in this file only pass those two)
 *
 * Returns 0 on success (including the no-op cases: hugepage-backed
 * segments and SHM_UNLOCK of a segment that is not locked), or a negative
 * errno.
 */
static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
{
        struct shmid_kernel *shp;
        struct file *shm_file;
        int err;

        rcu_read_lock();
        shp = shm_obtain_object_check(ns, shmid);
        if (IS_ERR(shp)) {
                err = PTR_ERR(shp);
                goto out_unlock1;
        }

        audit_ipc_obj(&(shp->shm_perm));
        err = security_shm_shmctl(&shp->shm_perm, cmd);
        if (err)
                goto out_unlock1;

        /* err is 0 from here on unless a branch below overwrites it. */
        ipc_lock_object(&shp->shm_perm);

        /* check if shm_destroy() is tearing down shp */
        if (!ipc_valid_object(&shp->shm_perm)) {
                err = -EIDRM;
                goto out_unlock0;
        }

        /*
         * Callers without CAP_IPC_LOCK in the namespace's user namespace
         * must be the owner or creator of the segment, and may only lock
         * if their RLIMIT_MEMLOCK is non-zero.
         */
        if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
                kuid_t euid = current_euid();

                if (!uid_eq(euid, shp->shm_perm.uid) &&
                    !uid_eq(euid, shp->shm_perm.cuid)) {
                        err = -EPERM;
                        goto out_unlock0;
                }
                if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) {
                        err = -EPERM;
                        goto out_unlock0;
                }
        }

        /* Hugepage-backed segments are left alone; the call still succeeds. */
        shm_file = shp->shm_file;
        if (is_file_hugepages(shm_file))
                goto out_unlock0;

        if (cmd == SHM_LOCK) {
                struct user_struct *user = current_user();

                err = shmem_lock(shm_file, 1, user);
                /* Remember who locked it only on the first successful lock. */
                if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
                        shp->shm_perm.mode |= SHM_LOCKED;
                        shp->mlock_user = user;
                }
                goto out_unlock0;
        }

        /* SHM_UNLOCK */
        if (!(shp->shm_perm.mode & SHM_LOCKED))
                goto out_unlock0;
        shmem_lock(shm_file, 0, shp->mlock_user);
        shp->shm_perm.mode &= ~SHM_LOCKED;
        shp->mlock_user = NULL;
        /*
         * Take a file reference before dropping the object lock and RCU so
         * shm_file can still be used by shmem_unlock_mapping() below; it is
         * released by the fput() that follows.
         */
        get_file(shm_file);
        ipc_unlock_object(&shp->shm_perm);
        rcu_read_unlock();
        shmem_unlock_mapping(shm_file->f_mapping);

        fput(shm_file);
        return err;

out_unlock0:
        ipc_unlock_object(&shp->shm_perm);
out_unlock1:
        rcu_read_unlock();
        return err;
}

/*
 * Native shmctl() worker: validate the arguments, run the command through
 * the matching shmctl_*() helper and translate data to or from the user
 * buffer in the layout selected by @version.
 *
 * Returns the helper's non-negative result on success, -EFAULT when the
 * user buffer cannot be accessed, -EINVAL for negative or unknown
 * arguments, or another negative errno from the helper.
 */
static long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf, int version)
{
        struct shmid64_ds shmid64;
        struct ipc_namespace *ns;
        int ret;

        if (shmid < 0 || cmd < 0)
                return -EINVAL;

        ns = current->nsproxy->ipc_ns;

        switch (cmd) {
        case IPC_INFO: {
                struct shminfo64 limits;

                ret = shmctl_ipc_info(ns, &limits);
                if (ret < 0)
                        return ret;
                return copy_shminfo_to_user(buf, &limits, version) ? -EFAULT : ret;
        }
        case SHM_INFO: {
                struct shm_info usage;

                ret = shmctl_shm_info(ns, &usage);
                if (ret < 0)
                        return ret;
                return copy_to_user(buf, &usage, sizeof(usage)) ? -EFAULT : ret;
        }
        case SHM_STAT:
        case SHM_STAT_ANY:
        case IPC_STAT:
                ret = shmctl_stat(ns, shmid, cmd, &shmid64);
                if (ret < 0)
                        return ret;
                return copy_shmid_to_user(buf, &shmid64, version) ? -EFAULT : ret;
        case IPC_SET:
                /* IPC_SET needs the new settings; IPC_RMID shares the rest. */
                if (copy_shmid_from_user(&shmid64, buf, version))
                        return -EFAULT;
                fallthrough;
        case IPC_RMID:
                return shmctl_down(ns, shmid, cmd, &shmid64);
        case SHM_LOCK:
        case SHM_UNLOCK:
                return shmctl_do_lock(ns, shmid, cmd);
        default:
                return -EINVAL;
        }
}

/* shmctl() system call: always uses the IPC_64 structure layout. */
SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
{
        const int version = IPC_64;

        return ksys_shmctl(shmid, cmd, buf, version);
}

#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
/*
 * shmctl() variant for callers whose structure version is derived from
 * @cmd by ipc_parse_version() rather than being fixed.
 */
long ksys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf)
{
        /*
         * ipc_parse_version() receives a pointer to cmd and so may rewrite
         * it in place; it has to run before cmd is passed on below.
         */
        int version = ipc_parse_version(&cmd);

        return ksys_shmctl(shmid, cmd, buf, version);
}

/* old_shmctl() system call: thin entry point around ksys_old_shmctl(). */
SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
{
        long ret = ksys_old_shmctl(shmid, cmd, buf);

        return ret;
}
#endif

#ifdef CONFIG_COMPAT

/*
 * Non-IPC_64 shmid descriptor as exchanged with compat tasks.
 * copy_compat_shmid_to_user() fills it in and copy_compat_shmid_from_user()
 * reads its shm_perm member.  The member order and types define the
 * userspace layout and must not change.
 */
struct compat_shmid_ds {
        struct compat_ipc_perm shm_perm;
        int shm_segsz;
        old_time32_t shm_atime;
        old_time32_t shm_dtime;
        old_time32_t shm_ctime;
        compat_ipc_pid_t shm_cpid;
        compat_ipc_pid_t shm_lpid;
        unsigned short shm_nattch;
        /* The three members below are only ever zeroed by the copy-out path. */
        unsigned short shm_unused;
        compat_uptr_t shm_unused2;
        compat_uptr_t shm_unused3;
};

/*
 * IPC_64 flavour of the shm limits structure as exchanged with compat
 * tasks; filled in by copy_compat_shminfo_to_user().  The member order
 * and types define the userspace layout and must not change.
 */
struct compat_shminfo64 {
        compat_ulong_t shmmax;
        compat_ulong_t shmmin;
        compat_ulong_t shmmni;
        compat_ulong_t shmseg;
        compat_ulong_t shmall;
        /* Only ever zeroed by the copy-out path. */
        compat_ulong_t __unused1;
        compat_ulong_t __unused2;
        compat_ulong_t __unused3;
        compat_ulong_t __unused4;
};

/*
 * SHM_INFO result as exchanged with compat tasks; filled in by
 * put_compat_shm_info().  Member order defines the userspace layout.
 */
struct compat_shm_info {
        compat_int_t used_ids;
        compat_ulong_t shm_tot;
        compat_ulong_t shm_rss;
        compat_ulong_t shm_swp;
        compat_ulong_t swap_attempts;
        compat_ulong_t swap_successes;
};

/*
 * Copy the shm limits in @in to a compat task's buffer, using struct
 * compat_shminfo64 for IPC_64 and the legacy struct shminfo otherwise.
 *
 * Note that @in->shmmax is clamped to INT_MAX in place before conversion,
 * so the caller's structure is modified.
 *
 * Returns the result of copy_to_user() (non-zero on an incomplete copy).
 */
static int copy_compat_shminfo_to_user(void __user *buf, struct shminfo64 *in,
                                        int version)
{
        if (in->shmmax > INT_MAX)
                in->shmmax = INT_MAX;

        if (version != IPC_64) {
                struct shminfo legacy;

                /* Zero first so nothing unset is copied out. */
                memset(&legacy, 0, sizeof(legacy));
                legacy.shmmax = in->shmmax;
                legacy.shmmin = in->shmmin;
                legacy.shmmni = in->shmmni;
                legacy.shmseg = in->shmseg;
                legacy.shmall = in->shmall;
                return copy_to_user(buf, &legacy, sizeof(legacy));
        } else {
                struct compat_shminfo64 wide;

                /* Zero first so the __unused members are copied out as 0. */
                memset(&wide, 0, sizeof(wide));
                wide.shmmax = in->shmmax;
                wide.shmmin = in->shmmin;
                wide.shmmni = in->shmmni;
                wide.shmseg = in->shmseg;
                wide.shmall = in->shmall;
                return copy_to_user(buf, &wide, sizeof(wide));
        }
}

/*
 * Convert the SHM_INFO result @ip to the compat layout and copy it to the
 * user buffer @uip.
 *
 * Returns the result of copy_to_user() (non-zero on an incomplete copy).
 */
static int put_compat_shm_info(struct shm_info *ip,
                                struct compat_shm_info __user *uip)
{
        struct compat_shm_info out;

        /* Start from all-zero before filling in the individual members. */
        memset(&out, 0, sizeof(out));

        out.used_ids = ip->used_ids;

        out.shm_tot = ip->shm_tot;
        out.shm_rss = ip->shm_rss;
        out.shm_swp = ip->shm_swp;

        out.swap_attempts = ip->swap_attempts;
        out.swap_successes = ip->swap_successes;

        return copy_to_user(uip, &out, sizeof(out));
}

/*
 * Copy the segment description @in to a compat task's buffer, using
 * struct compat_shmid64_ds for IPC_64 and struct compat_shmid_ds
 * otherwise.
 *
 * Returns the result of copy_to_user() (non-zero on an incomplete copy).
 */
static int copy_compat_shmid_to_user(void __user *buf, struct shmid64_ds *in,
                                        int version)
{
        if (version != IPC_64) {
                struct compat_shmid_ds old;

                memset(&old, 0, sizeof(old));
                to_compat_ipc_perm(&old.shm_perm, &in->shm_perm);
                old.shm_perm.key = in->shm_perm.key;
                old.shm_segsz = in->shm_segsz;
                old.shm_atime = in->shm_atime;
                old.shm_dtime = in->shm_dtime;
                old.shm_ctime = in->shm_ctime;
                old.shm_cpid = in->shm_cpid;
                old.shm_lpid = in->shm_lpid;
                old.shm_nattch = in->shm_nattch;
                return copy_to_user(buf, &old, sizeof(old));
        } else {
                struct compat_shmid64_ds wide;

                memset(&wide, 0, sizeof(wide));
                to_compat_ipc64_perm(&wide.shm_perm, &in->shm_perm);
                wide.shm_segsz = in->shm_segsz;

                /* Each timestamp is split into a low and a high 32-bit half. */
                wide.shm_atime = lower_32_bits(in->shm_atime);
                wide.shm_atime_high = upper_32_bits(in->shm_atime);
                wide.shm_dtime = lower_32_bits(in->shm_dtime);
                wide.shm_dtime_high = upper_32_bits(in->shm_dtime);
                wide.shm_ctime = lower_32_bits(in->shm_ctime);
                wide.shm_ctime_high = upper_32_bits(in->shm_ctime);

                wide.shm_cpid = in->shm_cpid;
                wide.shm_lpid = in->shm_lpid;
                wide.shm_nattch = in->shm_nattch;
                return copy_to_user(buf, &wide, sizeof(wide));
        }
}

/*
 * Read the permission part of a compat task's shmid descriptor into
 * @out->shm_perm.  @out is zeroed first, so every other member is 0.
 *
 * Returns the result of get_compat_ipc64_perm() for IPC_64 and of
 * get_compat_ipc_perm() otherwise.
 */
static int copy_compat_shmid_from_user(struct shmid64_ds *out, void __user *buf,
                                        int version)
{
        memset(out, 0, sizeof(*out));

        if (version != IPC_64) {
                struct compat_shmid_ds __user *old = buf;

                return get_compat_ipc_perm(&out->shm_perm, &old->shm_perm);
        } else {
                struct compat_shmid64_ds __user *wide = buf;

                return get_compat_ipc64_perm(&out->shm_perm, &wide->shm_perm);
        }
}

/*
 * Compat shmctl() worker: validate the arguments, run the command through
 * the matching shmctl_*() helper and translate data to or from the compat
 * user buffer in the layout selected by @version.
 *
 * @shmid:   segment id (must be non-negative)
 * @cmd:     shmctl command (must be non-negative)
 * @uptr:    user buffer; its layout depends on @cmd and @version
 * @version: IPC_64 or the legacy layout
 *
 * Returns the helper's non-negative result on success, -EFAULT when the
 * user buffer cannot be accessed, -EINVAL for negative or unknown
 * arguments, or another negative errno from the helper.
 */
static long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version)
{
        struct ipc_namespace *ns;
        struct shmid64_ds sem64;
        int err;

        /* Reject bad arguments before touching the namespace, as ksys_shmctl() does. */
        if (cmd < 0 || shmid < 0)
                return -EINVAL;

        ns = current->nsproxy->ipc_ns;

        switch (cmd) {
        case IPC_INFO: {
                struct shminfo64 shminfo;
                err = shmctl_ipc_info(ns, &shminfo);
                if (err < 0)
                        return err;
                if (copy_compat_shminfo_to_user(uptr, &shminfo, version))
                        err = -EFAULT;
                return err;
        }
        case SHM_INFO: {
                struct shm_info shm_info;
                err = shmctl_shm_info(ns, &shm_info);
                if (err < 0)
                        return err;
                if (put_compat_shm_info(&shm_info, uptr))
                        err = -EFAULT;
                return err;
        }
        case IPC_STAT:
        case SHM_STAT_ANY:
        case SHM_STAT:
                err = shmctl_stat(ns, shmid, cmd, &sem64);
                if (err < 0)
                        return err;
                if (copy_compat_shmid_to_user(uptr, &sem64, version))
                        err = -EFAULT;
                return err;

        case IPC_SET:
                /* IPC_SET needs the new settings; IPC_RMID shares the rest. */
                if (copy_compat_shmid_from_user(&sem64, uptr, version))
                        return -EFAULT;
                fallthrough;
        case IPC_RMID:
                return shmctl_down(ns, shmid, cmd, &sem64);
        case SHM_LOCK:
        case SHM_UNLOCK:
                return shmctl_do_lock(ns, shmid, cmd);
        default:
                return -EINVAL;
        }
        /* Every switch arm returns, so nothing is reachable here. */
}

/* Compat shmctl() system call: always uses the IPC_64 structure layout. */
COMPAT_SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, void __user *, uptr)
{
        const int version = IPC_64;

        return compat_ksys_shmctl(shmid, cmd, uptr, version);
}

#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION
/*
 * Compat shmctl() variant whose structure version is derived from @cmd by
 * compat_ipc_parse_version() rather than being fixed.
 */
long compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr)
{
        /*
         * compat_ipc_parse_version() receives a pointer to cmd and so may
         * rewrite it in place; it has to run before cmd is passed on below.
         */
        int version = compat_ipc_parse_version(&cmd);

        return compat_ksys_shmctl(shmid, cmd, uptr, version);
}

/* Compat old_shmctl() system call: thin entry point around compat_ksys_old_shmctl(). */
COMPAT_SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, void __user *, uptr)
{
        long ret = compat_ksys_old_shmctl(shmid, cmd, uptr);

        return ret;
}
#endif
#endif

/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 *
 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
 * "raddr" thing points to kernel space, and there has to be a wrapper around
 * this.
 *
 * @shmid:   id of the segment to attach
 * @shmaddr: requested attach address, or NULL to have one chosen
 * @shmflg:  SHM_RND, SHM_REMAP, SHM_RDONLY and SHM_EXEC are examined here
 * @raddr:   receives the value returned by do_mmap(); note that it is
 *           written even when that value is an error
 * @shmlba:  alignment (a power of two is assumed by the mask arithmetic)
 *           that @shmaddr is checked or rounded against
 *
 * Returns 0 on success or a negative errno.
 */
long do_shmat(int shmid, char __user *shmaddr, int shmflg,
              ulong *raddr, unsigned long shmlba)
{
        struct shmid_kernel *shp;
        unsigned long addr = (unsigned long)shmaddr;
        unsigned long size;
        struct file *file, *base;
        int    err;
        unsigned long flags = MAP_SHARED;
        unsigned long prot;
        int acc_mode;
        struct ipc_namespace *ns;
        struct shm_file_data *sfd;
        int f_flags;
        unsigned long populate = 0;

        err = -EINVAL;
        if (shmid < 0)
                goto out;

        if (addr) {
                if (addr & (shmlba - 1)) {
                        if (shmflg & SHM_RND) {
                                addr &= ~(shmlba - 1);  /* round down */

                                /*
                                 * Ensure that the round-down is non-nil
                                 * when remapping. This can happen for
                                 * cases when addr < shmlba.
                                 */
                                if (!addr && (shmflg & SHM_REMAP))
                                        goto out;
                        } else
                                /*
                                 * Without SHM_RND a misaligned address is
                                 * rejected outright when __ARCH_FORCE_SHMLBA
                                 * is defined; otherwise it is tolerated as
                                 * long as it is page aligned.
                                 */
#ifndef __ARCH_FORCE_SHMLBA
                                if (addr & ~PAGE_MASK)
#endif
                                        goto out;
                }

                flags |= MAP_FIXED;
        } else if ((shmflg & SHM_REMAP))
                /* SHM_REMAP without an explicit address is invalid. */
                goto out;

        /* Derive mmap protection, ipc access mode and file flags from shmflg. */
        if (shmflg & SHM_RDONLY) {
                prot = PROT_READ;
                acc_mode = S_IRUGO;
                f_flags = O_RDONLY;
        } else {
                prot = PROT_READ | PROT_WRITE;
                acc_mode = S_IRUGO | S_IWUGO;
                f_flags = O_RDWR;
        }
        if (shmflg & SHM_EXEC) {
                prot |= PROT_EXEC;
                acc_mode |= S_IXUGO;
        }

        /*
         * We cannot rely on the fs check since SYSV IPC does have an
         * additional creator id...
         */
        ns = current->nsproxy->ipc_ns;
        rcu_read_lock();
        shp = shm_obtain_object_check(ns, shmid);
        if (IS_ERR(shp)) {
                err = PTR_ERR(shp);
                goto out_unlock;
        }

        err = -EACCES;
        if (ipcperms(ns, &shp->shm_perm, acc_mode))
                goto out_unlock;

        err = security_shm_shmat(&shp->shm_perm, shmaddr, shmflg);
        if (err)
                goto out_unlock;

        ipc_lock_object(&shp->shm_perm);

        /* check if shm_destroy() is tearing down shp */
        if (!ipc_valid_object(&shp->shm_perm)) {
                ipc_unlock_object(&shp->shm_perm);
                err = -EIDRM;
                goto out_unlock;
        }

        /*
         * We need to take a reference to the real shm file to prevent the
         * pointer from becoming stale in cases where the lifetime of the outer
         * file extends beyond that of the shm segment.  It's not usually
         * possible, but it can happen during remap_file_pages() emulation as
         * that unmaps the memory, then does ->mmap() via file reference only.
         * We'll deny the ->mmap() if the shm segment was since removed, but to
         * detect shm ID reuse we need to compare the file pointers.
         */
        base = get_file(shp->shm_file);
        /*
         * Temporary attach count for the duration of this call; it is
         * dropped again at out_nattch on success and failure alike.
         */
        shp->shm_nattch++;
        size = i_size_read(file_inode(base));
        ipc_unlock_object(&shp->shm_perm);
        rcu_read_unlock();

        /* From here on every exit must go through out_nattch. */
        err = -ENOMEM;
        sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
        if (!sfd) {
                fput(base);
                goto out_nattch;
        }

        file = alloc_file_clone(base, f_flags,
                          is_file_hugepages(base) ?
                                &shm_file_operations_huge :
                                &shm_file_operations);
        err = PTR_ERR(file);
        if (IS_ERR(file)) {
                kfree(sfd);
                fput(base);
                goto out_nattch;
        }

        /*
         * sfd and the reference on base are now attached to file; the error
         * paths below only fput(file) and do not free them directly
         * (presumably the file's release handler does - confirm).
         */
        sfd->id = shp->shm_perm.id;
        sfd->ns = get_ipc_ns(ns);
        sfd->file = base;
        sfd->vm_ops = NULL;
        file->private_data = sfd;

        err = security_mmap_file(file, prot, flags);
        if (err)
                goto out_fput;

        if (mmap_write_lock_killable(current->mm)) {
                err = -EINTR;
                goto out_fput;
        }

        /*
         * With an explicit address and no SHM_REMAP, refuse ranges that
         * wrap around or overlap an existing mapping.
         */
        if (addr && !(shmflg & SHM_REMAP)) {
                err = -EINVAL;
                if (addr + size < addr)
                        goto invalid;

                if (find_vma_intersection(current->mm, addr, addr + size))
                        goto invalid;
        }

        addr = do_mmap(file, addr, size, prot, flags, 0, &populate, NULL);
        *raddr = addr;
        err = 0;
        if (IS_ERR_VALUE(addr))
                err = (long)addr;
invalid:
        mmap_write_unlock(current->mm);
        if (populate)
                mm_populate(addr, populate);

out_fput:
        fput(file);

out_nattch:
        /*
         * Drop the temporary attach count taken above.  If that leaves the
         * segment destroyable, tear it down now.
         */
        down_write(&shm_ids(ns).rwsem);
        shp = shm_lock(ns, shmid);
        shp->shm_nattch--;

        if (shm_may_destroy(shp))
                shm_destroy(ns, shp);
        else
                shm_unlock(shp);
        up_write(&shm_ids(ns).rwsem);
        return err;

out_unlock:
        rcu_read_unlock();
out:
        return err;
}

/*
 * shmat() system call: attach via do_shmat() with the native SHMLBA and
 * return the resulting address, or a negative errno on failure.
 */
SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
{
        unsigned long attached;
        long rc = do_shmat(shmid, shmaddr, shmflg, &attached, SHMLBA);

        if (rc != 0)
                return rc;

        /*
         * NOTE(review): presumably needed because a valid address can look
         * like an error value once cast to long - confirm.
         */
        force_successful_syscall_return();
        return (long)attached;
}

#ifdef CONFIG_COMPAT

/* Fall back to the native alignment when no compat-specific one is defined. */
#ifndef COMPAT_SHMLBA
#define COMPAT_SHMLBA        SHMLBA
#endif

/*
 * Compat shmat() system call: attach via do_shmat() with COMPAT_SHMLBA and
 * return the resulting address, or a negative errno on failure.
 */
COMPAT_SYSCALL_DEFINE3(shmat, int, shmid, compat_uptr_t, shmaddr, int, shmflg)
{
        unsigned long attached;
        long rc = do_shmat(shmid, compat_ptr(shmaddr), shmflg, &attached,
                           COMPAT_SHMLBA);

        if (rc != 0)
                return rc;

        /*
         * NOTE(review): presumably needed because a valid address can look
         * like an error value once cast to long - confirm.
         */
        force_successful_syscall_return();
        return (long)attached;
}
#endif

/*
 * detach and kill segment if marked destroyed.
 * The work is done in shm_close.
 *
 * @shmaddr: address the segment was attached at; must be page aligned
 *
 * Returns 0 if a matching shm mapping was found and unmapped, -EINVAL if
 * @shmaddr is misaligned or nothing matched, and -EINTR if taking the
 * mmap write lock was interrupted.
 */
long ksys_shmdt(char __user *shmaddr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long addr = (unsigned long)shmaddr;
        int retval = -EINVAL;
#ifdef CONFIG_MMU
        loff_t size = 0;
        struct file *file;
        struct vm_area_struct *next;
#endif

        if (addr & ~PAGE_MASK)
                return retval;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /*
         * This function tries to be smart and unmap shm segments that
         * were modified by partial mlock or munmap calls:
         * - It first determines the size of the shm segment that should be
         *   unmapped: It searches for a vma that is backed by shm and that
         *   started at address shmaddr. It records its size and then unmaps
         *   it.
         * - Then it unmaps all shm vmas that started at shmaddr and that
         *   are within the initially determined size and that are from the
         *   same shm segment from which we determined the size.
         * Errors from do_munmap are ignored: the function only fails if
         * it's called with invalid parameters or if it's called to unmap
         * a part of a vma. Both calls in this function are for full vmas,
         * the parameters are directly copied from the vma itself and always
         * valid - therefore do_munmap cannot fail. (famous last words?)
         */
        /*
         * If it had been mremap()'d, the starting address would not
         * match the usual checks anyway. So assume all vma's are
         * above the starting address given.
         */
        vma = find_vma(mm, addr);

#ifdef CONFIG_MMU
        while (vma) {
                /* Fetch the successor first: do_munmap() below removes vma. */
                next = vma->vm_next;

                /*
                 * Check if the starting address would match, i.e. it's
                 * a fragment created by mprotect() and/or munmap(), or
                 * otherwise it starts at this address with no hassles.
                 */
                if ((vma->vm_ops == &shm_vm_ops) &&
                        (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {

                        /*
                         * Record the file of the shm segment being
                         * unmapped.  With mremap(), someone could place
                         * page from another segment but with equal offsets
                         * in the range we are unmapping.
                         */
                        file = vma->vm_file;
                        size = i_size_read(file_inode(vma->vm_file));
                        do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
                        /*
                         * We discovered the size of the shm segment, so
                         * break out of here and fall through to the next
                         * loop that uses the size information to stop
                         * searching for matching vma's.
                         */
                        retval = 0;
                        vma = next;
                        break;
                }
                vma = next;
        }

        /*
         * We need look no further than the maximum address a fragment
         * could possibly have landed at. Also cast things to loff_t to
         * prevent overflows and make comparisons vs. equal-width types.
         *
         * If the first loop matched nothing, vma is NULL here, so this
         * loop does not run and the unset "file" is never read.
         */
        size = PAGE_ALIGN(size);
        while (vma && (loff_t)(vma->vm_end - addr) <= size) {
                next = vma->vm_next;

                /* finding a matching vma now does not alter retval */
                if ((vma->vm_ops == &shm_vm_ops) &&
                    ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
                    (vma->vm_file == file))
                        do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
                vma = next;
        }

#else        /* CONFIG_MMU */
        /* under NOMMU conditions, the exact address to be destroyed must be
         * given
         */
        if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
                do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
                retval = 0;
        }

#endif

        mmap_write_unlock(mm);
        return retval;
}

/* shmdt() system call: thin entry point around ksys_shmdt(). */
SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
{
        long ret = ksys_shmdt(shmaddr);

        return ret;
}

#ifdef CONFIG_PROC_FS
/*
 * Emit one line describing a shm segment to the seq_file @s.
 * @it is the kern_ipc_perm embedded in the segment's shmid_kernel.
 * Pids, uids and gids are translated into the namespaces obtained from
 * @s.  Always returns 0.
 */
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
{
        struct pid_namespace *pid_ns = ipc_seq_pid_ns(s);
        struct user_namespace *user_ns = seq_user_ns(s);
        struct kern_ipc_perm *perm = it;
        struct shmid_kernel *shp = container_of(perm, struct shmid_kernel, shm_perm);
        unsigned long rss_pages = 0;
        unsigned long swap_pages = 0;
        unsigned long rss_bytes, swap_bytes;

        shm_add_rss_swap(shp, &rss_pages, &swap_pages);
        rss_bytes = rss_pages * PAGE_SIZE;
        swap_bytes = swap_pages * PAGE_SIZE;

        /* Column width for unsigned long values depends on the word size. */
#if BITS_PER_LONG <= 32
#define SIZE_SPEC "%10lu"
#else
#define SIZE_SPEC "%21lu"
#endif

        seq_printf(s,
                   "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
                   "%5lu %5u %5u %5u %5u %10llu %10llu %10llu "
                   SIZE_SPEC " " SIZE_SPEC "\n",
                   shp->shm_perm.key,
                   shp->shm_perm.id,
                   shp->shm_perm.mode,
                   shp->shm_segsz,
                   pid_nr_ns(shp->shm_cprid, pid_ns),
                   pid_nr_ns(shp->shm_lprid, pid_ns),
                   shp->shm_nattch,
                   from_kuid_munged(user_ns, shp->shm_perm.uid),
                   from_kgid_munged(user_ns, shp->shm_perm.gid),
                   from_kuid_munged(user_ns, shp->shm_perm.cuid),
                   from_kgid_munged(user_ns, shp->shm_perm.cgid),
                   shp->shm_atim,
                   shp->shm_dtim,
                   shp->shm_ctim,
                   rss_bytes,
                   swap_bytes);

        return 0;
}
#endif
























































































































































































































































































































































































































































































































































    1 
    1 
    1 
    1 

























    1 






































































































































































































































































































































































































































































































































































































    1 




    1 



    1 

    1 






























    1 






























    1 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the kernel access vector cache (AVC).
 *
 * Authors:  Stephen Smalley, <sds@tycho.nsa.gov>
 *             James Morris <jmorris@redhat.com>
 *
 * Update:   KaiGai, Kohei <kaigai@ak.jp.nec.com>
 *        Replaced the avc_lock spinlock by RCU.
 *
 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/percpu.h>
#include <linux/list.h>
#include <net/sock.h>
#include <linux/un.h>
#include <net/af_unix.h>
#include <linux/ip.h>
#include <linux/audit.h>
#include <linux/ipv6.h>
#include <net/ipv6.h>
#include "avc.h"
#include "avc_ss.h"
#include "classmap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/avc.h>

/*
 * AVC sizing parameters:
 *  - AVC_CACHE_SLOTS: number of hash buckets.  avc_hash() and
 *    avc_reclaim_node() reduce values with "& (AVC_CACHE_SLOTS - 1)", so
 *    this has to stay a power of two for every bucket to be reachable.
 *  - AVC_DEF_CACHE_THRESHOLD: initial value of avc_cache_threshold, the
 *    node count above which avc_alloc_node() runs a reclaim pass.
 *  - AVC_CACHE_RECLAIM: a reclaim pass stops once it has deleted this
 *    many nodes.
 */
#define AVC_CACHE_SLOTS                        512
#define AVC_DEF_CACHE_THRESHOLD                512
#define AVC_CACHE_RECLAIM                16

/*
 * Increment one field of the per-CPU avc_cache_stats structure.  Expands to
 * an empty statement when CONFIG_SECURITY_SELINUX_AVC_STATS is not set.
 */
#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
#define avc_cache_stats_incr(field)        this_cpu_inc(avc_cache_stats.field)
#else
#define avc_cache_stats_incr(field)        do {} while (0)
#endif

/*
 * Payload of one cache entry: the (ssid, tsid, tclass) triple that is used
 * as the lookup key, the cached access vector decision, and the optional
 * extended permissions attached to it.
 */
struct avc_entry {
        u32                        ssid;
        u32                        tsid;
        u16                        tclass;
        struct av_decision        avd;
        /* stays NULL when avc_xperms_populate() had nothing to copy */
        struct avc_xperms_node        *xp_node;
};

/*
 * A cache entry as it is linked into a hash bucket.  Nodes are unlinked
 * with the RCU hlist primitives and handed to call_rcu() through @rhead,
 * with avc_node_free() as the callback.
 */
struct avc_node {
        struct avc_entry        ae;
        struct hlist_node        list; /* anchored in avc_cache->slots[i] */
        struct rcu_head                rhead;
};

/*
 * One extended permissions decision together with the linkage that puts it
 * on the owning avc_xperms_node's xpd_head list.
 */
struct avc_xperms_decision_node {
        struct extended_perms_decision xpd;
        struct list_head xpd_list; /* list of extended_perms_decision */
};

/*
 * Extended permissions attached to a cache entry: the summary in @xp plus
 * the list of per-driver decisions (avc_xperms_decision_node entries).
 */
struct avc_xperms_node {
        struct extended_perms xp;
        struct list_head xpd_head; /* list head of extended_perms_decision */
};

/*
 * The cache proper: a fixed-size hash table of avc_node chains.  Each
 * bucket has its own spinlock in the same index of slots_lock[]; lookups
 * walk the chains with the RCU list iterators instead of taking the lock.
 */
struct avc_cache {
        struct hlist_head        slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
        spinlock_t                slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
        atomic_t                lru_hint;        /* LRU hint for reclaim scan */
        /* incremented in avc_alloc_node(), decremented on delete/kill/replace */
        atomic_t                active_nodes;
        u32                        latest_notif;        /* latest revocation notification */
};

/*
 * Element of the singly linked avc_callbacks list.  avc_add_callback()
 * pushes new elements at the head of that list.
 */
struct avc_callback_node {
        int (*callback) (u32 event);
        u32 events;        /* event set passed to avc_add_callback() */
        struct avc_callback_node *next;
};

/*
 * Per-CPU statistics counters, updated through avc_cache_stats_incr().
 * Only defined when CONFIG_SECURITY_SELINUX_AVC_STATS is set.
 */
#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
#endif

/*
 * Top-level AVC state: the reclaim threshold and the cache it applies to.
 */
struct selinux_avc {
        /* node count above which avc_alloc_node() triggers a reclaim pass */
        unsigned int avc_cache_threshold;
        struct avc_cache avc_cache;
};

/* The single instance; selinux_avc_init() hands out a pointer to it. */
static struct selinux_avc selinux_avc;

void selinux_avc_init(struct selinux_avc **avc)
{
        int i;

        selinux_avc.avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD;
        for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                INIT_HLIST_HEAD(&selinux_avc.avc_cache.slots[i]);
                spin_lock_init(&selinux_avc.avc_cache.slots_lock[i]);
        }
        atomic_set(&selinux_avc.avc_cache.active_nodes, 0);
        atomic_set(&selinux_avc.avc_cache.lru_hint, 0);
        *avc = &selinux_avc;
}

/* Return the node count above which avc_alloc_node() starts reclaiming. */
unsigned int avc_get_cache_threshold(struct selinux_avc *avc)
{
        return avc->avc_cache_threshold;
}

/*
 * Set the reclaim threshold.  The value is stored exactly as given: no
 * range check is applied and no lock is taken here.
 */
void avc_set_cache_threshold(struct selinux_avc *avc,
                             unsigned int cache_threshold)
{
        avc->avc_cache_threshold = cache_threshold;
}

/* Head of the callback list built by avc_add_callback(). */
static struct avc_callback_node *avc_callbacks;
/* Slab caches, all created in avc_init(): */
static struct kmem_cache *avc_node_cachep;                /* struct avc_node */
static struct kmem_cache *avc_xperms_data_cachep;        /* struct extended_perms_data */
static struct kmem_cache *avc_xperms_decision_cachep;        /* struct avc_xperms_decision_node */
static struct kmem_cache *avc_xperms_cachep;                /* struct avc_xperms_node */

/*
 * Map a (ssid, tsid, tclass) key to a bucket index in
 * [0, AVC_CACHE_SLOTS - 1] by XOR-ing the shifted components together and
 * masking the result.
 */
static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
{
        u32 mixed = ssid;

        mixed ^= tsid << 2;
        mixed ^= tclass << 4;

        return mixed & (AVC_CACHE_SLOTS - 1);
}

/**
 * avc_init - Initialize the AVC.
 *
 * Create the four slab caches used by the access vector cache: one each
 * for cache nodes, extended permission nodes, extended permission decision
 * nodes and the extended permission bitmaps.  Every cache is created with
 * SLAB_PANIC and none of the returned pointers is checked here.
 */
void __init avc_init(void)
{
        avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node),
                                        0, SLAB_PANIC, NULL);
        avc_xperms_cachep = kmem_cache_create("avc_xperms_node",
                                        sizeof(struct avc_xperms_node),
                                        0, SLAB_PANIC, NULL);
        avc_xperms_decision_cachep = kmem_cache_create(
                                        "avc_xperms_decision_node",
                                        sizeof(struct avc_xperms_decision_node),
                                        0, SLAB_PANIC, NULL);
        avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data",
                                        sizeof(struct extended_perms_data),
                                        0, SLAB_PANIC, NULL);
}

int avc_get_hash_stats(struct selinux_avc *avc, char *page)
{
        int i, chain_len, max_chain_len, slots_used;
        struct avc_node *node;
        struct hlist_head *head;

        rcu_read_lock();

        slots_used = 0;
        max_chain_len = 0;
        for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                head = &avc->avc_cache.slots[i];
                if (!hlist_empty(head)) {
                        slots_used++;
                        chain_len = 0;
                        hlist_for_each_entry_rcu(node, head, list)
                                chain_len++;
                        if (chain_len > max_chain_len)
                                max_chain_len = chain_len;
                }
        }

        rcu_read_unlock();

        return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
                         "longest chain: %d\n",
                         atomic_read(&avc->avc_cache.active_nodes),
                         slots_used, AVC_CACHE_SLOTS, max_chain_len);
}

/*
 * Find the decision for @driver on @xp_node's decision list, or return
 * NULL when there is none.  A plain linear scan is used because the list
 * is always short: fewer than 5 entries, typically 1.
 */
static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver,
                                        struct avc_xperms_node *xp_node)
{
        struct extended_perms_decision *match = NULL;
        struct avc_xperms_decision_node *cur;

        list_for_each_entry(cur, &xp_node->xpd_head, xpd_list) {
                if (cur->xpd.driver != driver)
                        continue;
                match = &cur->xpd;
                break;
        }

        return match;
}

/*
 * Test bit @perm in one of the bitmaps of @xpd.  @which names the bitmap
 * (XPERMS_ALLOWED, XPERMS_AUDITALLOW or XPERMS_DONTAUDIT); the bitmap is
 * only consulted when the same flag is set in xpd->used.  Any other case
 * yields 0.
 */
static inline unsigned int
avc_xperms_has_perm(struct extended_perms_decision *xpd,
                    u8 perm, u8 which)
{
        if (which == XPERMS_ALLOWED && (xpd->used & XPERMS_ALLOWED))
                return security_xperm_test(xpd->allowed->p, perm);

        if (which == XPERMS_AUDITALLOW && (xpd->used & XPERMS_AUDITALLOW))
                return security_xperm_test(xpd->auditallow->p, perm);

        if (which == XPERMS_DONTAUDIT && (xpd->used & XPERMS_DONTAUDIT))
                return security_xperm_test(xpd->dontaudit->p, perm);

        return 0;
}

/*
 * Mark @driver in the node's driver bitmap and, when a decision for that
 * driver exists and has an "allowed" bitmap, mark @perm as allowed in it.
 */
static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node,
                                u8 driver, u8 perm)
{
        struct extended_perms_decision *decision;

        security_xperm_set(xp_node->xp.drivers.p, driver);

        decision = avc_xperms_decision_lookup(driver, xp_node);
        if (!decision || !decision->allowed)
                return;

        security_xperm_set(decision->allowed->p, perm);
}

static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node)
{
        struct extended_perms_decision *xpd;

        xpd = &xpd_node->xpd;
        if (xpd->allowed)
                kmem_cache_free(avc_xperms_data_cachep, xpd->allowed);
        if (xpd->auditallow)
                kmem_cache_free(avc_xperms_data_cachep, xpd->auditallow);
        if (xpd->dontaudit)
                kmem_cache_free(avc_xperms_data_cachep, xpd->dontaudit);
        kmem_cache_free(avc_xperms_decision_cachep, xpd_node);
}

static void avc_xperms_free(struct avc_xperms_node *xp_node)
{
        struct avc_xperms_decision_node *xpd_node, *tmp;

        if (!xp_node)
                return;

        list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) {
                list_del(&xpd_node->xpd_list);
                avc_xperms_decision_free(xpd_node);
        }
        kmem_cache_free(avc_xperms_cachep, xp_node);
}

/*
 * Copy @src into @dest: the driver, the "used" flags and the full contents
 * of every bitmap flagged in "used".  The bitmaps of @dest that correspond
 * to those flags must already be allocated.
 */
static void avc_copy_xperms_decision(struct extended_perms_decision *dest,
                                        struct extended_perms_decision *src)
{
        dest->driver = src->driver;
        dest->used = src->used;

        if (src->used & XPERMS_ALLOWED)
                memcpy(dest->allowed->p, src->allowed->p,
                       sizeof(dest->allowed->p));

        if (src->used & XPERMS_AUDITALLOW)
                memcpy(dest->auditallow->p, src->auditallow->p,
                       sizeof(dest->auditallow->p));

        if (src->used & XPERMS_DONTAUDIT)
                memcpy(dest->dontaudit->p, src->dontaudit->p,
                       sizeof(dest->dontaudit->p));
}

/*
 * Cheaper variant of avc_copy_xperms_decision(): copies the "used" flags
 * and, from each flagged bitmap, only the single u32 word that holds the
 * bit for @perm.  dest->driver is not written.
 */
static inline void avc_quick_copy_xperms_decision(u8 perm,
                        struct extended_perms_decision *dest,
                        struct extended_perms_decision *src)
{
        /* 32 permission bits per u32: perm / 32 selects the word */
        const u8 word = perm >> 5;

        dest->used = src->used;

        if (src->used & XPERMS_ALLOWED)
                dest->allowed->p[word] = src->allowed->p[word];

        if (src->used & XPERMS_AUDITALLOW)
                dest->auditallow->p[word] = src->auditallow->p[word];

        if (src->used & XPERMS_DONTAUDIT)
                dest->dontaudit->p[word] = src->dontaudit->p[word];
}

/*
 * Allocate a zeroed decision node plus one zeroed bitmap for every
 * XPERMS_* flag set in @which.  All allocations use
 * GFP_NOWAIT | __GFP_NOWARN.  Returns the node, or NULL after undoing any
 * partial allocation.
 */
static struct avc_xperms_decision_node
                *avc_xperms_decision_alloc(u8 which)
{
        const gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
        struct avc_xperms_decision_node *node;

        node = kmem_cache_zalloc(avc_xperms_decision_cachep, gfp);
        if (!node)
                return NULL;

        if (which & XPERMS_ALLOWED) {
                node->xpd.allowed =
                        kmem_cache_zalloc(avc_xperms_data_cachep, gfp);
                if (!node->xpd.allowed)
                        goto fail;
        }

        if (which & XPERMS_AUDITALLOW) {
                node->xpd.auditallow =
                        kmem_cache_zalloc(avc_xperms_data_cachep, gfp);
                if (!node->xpd.auditallow)
                        goto fail;
        }

        if (which & XPERMS_DONTAUDIT) {
                node->xpd.dontaudit =
                        kmem_cache_zalloc(avc_xperms_data_cachep, gfp);
                if (!node->xpd.dontaudit)
                        goto fail;
        }

        return node;

fail:
        /* the node was zeroed, so unset bitmap pointers are NULL and skipped */
        avc_xperms_decision_free(node);
        return NULL;
}

/*
 * Append a copy of @src to the extended permission decisions of @node.
 *
 * @node->ae.xp_node must be non-NULL.  Returns 0 on success or -ENOMEM if
 * the copy could not be allocated; on failure @node is left completely
 * unchanged.
 */
static int avc_add_xperms_decision(struct avc_node *node,
                        struct extended_perms_decision *src)
{
        struct avc_xperms_decision_node *dest_xpd;

        dest_xpd = avc_xperms_decision_alloc(src->used);
        if (!dest_xpd)
                return -ENOMEM;
        avc_copy_xperms_decision(&dest_xpd->xpd, src);
        list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head);
        /*
         * Bump the count only once the decision is really on the list, so
         * that xp.len never exceeds the number of list entries when the
         * allocation above fails.
         */
        node->ae.xp_node->xp.len++;
        return 0;
}

/*
 * Allocate a zeroed extended permissions node with an empty decision list.
 * Uses GFP_NOWAIT | __GFP_NOWARN; returns NULL on allocation failure.
 */
static struct avc_xperms_node *avc_xperms_alloc(void)
{
        struct avc_xperms_node *xp_node =
                kmem_cache_zalloc(avc_xperms_cachep,
                                  GFP_NOWAIT | __GFP_NOWARN);

        if (xp_node)
                INIT_LIST_HEAD(&xp_node->xpd_head);

        return xp_node;
}

/*
 * Give @node a private deep copy of the extended permissions in @src.
 *
 * A source without decisions (xp.len == 0) leaves @node untouched and
 * counts as success.  Returns 0 on success, or -ENOMEM after freeing
 * everything that was allocated here; @node->ae.xp_node is only assigned
 * on success.
 */
static int avc_xperms_populate(struct avc_node *node,
                                struct avc_xperms_node *src)
{
        struct avc_xperms_decision_node *from, *copy;
        struct avc_xperms_node *dest;

        if (src->xp.len == 0)
                return 0;

        dest = avc_xperms_alloc();
        if (!dest)
                return -ENOMEM;

        memcpy(dest->xp.drivers.p, src->xp.drivers.p,
               sizeof(dest->xp.drivers.p));
        dest->xp.len = src->xp.len;

        /* duplicate every decision of the source onto the new list */
        list_for_each_entry(from, &src->xpd_head, xpd_list) {
                copy = avc_xperms_decision_alloc(from->xpd.used);
                if (!copy) {
                        avc_xperms_free(dest);
                        return -ENOMEM;
                }
                avc_copy_xperms_decision(&copy->xpd, &from->xpd);
                list_add(&copy->xpd_list, &dest->xpd_head);
        }

        node->ae.xp_node = dest;
        return 0;
}

/*
 * Work out which of the @requested permissions need an audit record.
 *
 * Three cases:
 *  - some permissions are denied by @avd: audit those flagged in
 *    avd->auditdeny, unless @xpd marks @perm as dontaudit;
 *  - nothing is denied by @avd but @result is non-zero: treat everything
 *    requested as both denied and audited;
 *  - access granted: audit what avd->auditallow flags, but only when @xpd
 *    (if given) marks @perm as auditallow.
 *
 * The denied set is stored in *@deniedp; the audited set is returned.
 */
static inline u32 avc_xperms_audit_required(u32 requested,
                                        struct av_decision *avd,
                                        struct extended_perms_decision *xpd,
                                        u8 perm,
                                        int result,
                                        u32 *deniedp)
{
        u32 denied = requested & ~avd->allowed;
        u32 audited;

        if (unlikely(denied)) {
                audited = denied & avd->auditdeny;
                if (audited && xpd &&
                    avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT))
                        audited &= ~requested;
        } else if (result) {
                denied = requested;
                audited = requested;
        } else {
                audited = requested & avd->auditallow;
                if (audited && xpd &&
                    !avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW))
                        audited &= ~requested;
        }

        *deniedp = denied;
        return audited;
}

/*
 * Audit an extended permission check if required.  Returns 0 when nothing
 * has to be audited (the expected common case), otherwise the result of
 * slow_avc_audit().
 */
static inline int avc_xperms_audit(struct selinux_state *state,
                                   u32 ssid, u32 tsid, u16 tclass,
                                   u32 requested, struct av_decision *avd,
                                   struct extended_perms_decision *xpd,
                                   u8 perm, int result,
                                   struct common_audit_data *ad)
{
        u32 denied;
        u32 audited = avc_xperms_audit_required(requested, avd, xpd, perm,
                                                result, &denied);

        if (unlikely(audited))
                return slow_avc_audit(state, ssid, tsid, tclass, requested,
                                      audited, denied, result, ad);

        return 0;
}

/*
 * RCU callback (see the call_rcu() users avc_node_delete() and
 * avc_node_replace()): release a node that has already been unlinked,
 * including its extended permissions, and count the free in the stats.
 */
static void avc_node_free(struct rcu_head *rhead)
{
        /* recover the node from its embedded rcu_head */
        struct avc_node *node = container_of(rhead, struct avc_node, rhead);
        avc_xperms_free(node->ae.xp_node);
        kmem_cache_free(avc_node_cachep, node);
        avc_cache_stats_incr(frees);
}

/*
 * Unlink @node from its bucket and queue it for freeing via call_rcu().
 * The node count is decremented right away.  No lock is taken here; the
 * caller in avc_reclaim_node() holds the bucket lock around this call.
 */
static void avc_node_delete(struct selinux_avc *avc, struct avc_node *node)
{
        hlist_del_rcu(&node->list);
        call_rcu(&node->rhead, avc_node_free);
        atomic_dec(&avc->avc_cache.active_nodes);
}

/*
 * Free @node immediately, without going through call_rcu(), and drop the
 * node count.  The node is not unlinked here; avc_insert() uses this for a
 * node that has not been added to any bucket yet.
 */
static void avc_node_kill(struct selinux_avc *avc, struct avc_node *node)
{
        avc_xperms_free(node->ae.xp_node);
        kmem_cache_free(avc_node_cachep, node);
        avc_cache_stats_incr(frees);
        atomic_dec(&avc->avc_cache.active_nodes);
}

/*
 * Put @new into the bucket position held by @old and queue @old for
 * freeing via call_rcu().  The node count drops by one: @new was counted
 * when it was allocated and @old goes away.  No lock is taken here; the
 * caller in avc_insert() holds the bucket lock around this call.
 */
static void avc_node_replace(struct selinux_avc *avc,
                             struct avc_node *new, struct avc_node *old)
{
        hlist_replace_rcu(&old->list, &new->list);
        call_rcu(&old->rhead, avc_node_free);
        atomic_dec(&avc->avc_cache.active_nodes);
}

/*
 * Evict cache nodes until AVC_CACHE_RECLAIM of them have been deleted or
 * AVC_CACHE_SLOTS buckets have been tried.  Buckets are picked by
 * advancing the shared lru_hint counter.  Returns the number of nodes
 * deleted.
 */
static inline int avc_reclaim_node(struct selinux_avc *avc)
{
        struct avc_node *node;
        int hvalue, try, ecx;        /* ecx counts the nodes deleted so far */
        unsigned long flags;
        struct hlist_head *head;
        spinlock_t *lock;

        for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) {
                /* next bucket in round-robin order, shared between callers */
                hvalue = atomic_inc_return(&avc->avc_cache.lru_hint) &
                        (AVC_CACHE_SLOTS - 1);
                head = &avc->avc_cache.slots[hvalue];
                lock = &avc->avc_cache.slots_lock[hvalue];

                /* a bucket whose lock is busy is skipped, not waited for */
                if (!spin_trylock_irqsave(lock, flags))
                        continue;

                /*
                 * NOTE(review): nodes are deleted while the plain iterator
                 * is still walking the chain.  Presumably this is safe
                 * because avc_node_delete() only unlinks with
                 * hlist_del_rcu() and defers the free to call_rcu(), which
                 * cannot complete inside this rcu_read_lock() section -
                 * confirm before changing the iterator or the locking.
                 */
                rcu_read_lock();
                hlist_for_each_entry(node, head, list) {
                        avc_node_delete(avc, node);
                        avc_cache_stats_incr(reclaims);
                        ecx++;
                        if (ecx >= AVC_CACHE_RECLAIM) {
                                /* quota reached: release in reverse order and stop */
                                rcu_read_unlock();
                                spin_unlock_irqrestore(lock, flags);
                                goto out;
                        }
                }
                rcu_read_unlock();
                spin_unlock_irqrestore(lock, flags);
        }
out:
        return ecx;
}

static struct avc_node *avc_alloc_node(struct selinux_avc *avc)
{
        struct avc_node *node;

        node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT | __GFP_NOWARN);
        if (!node)
                goto out;

        INIT_HLIST_NODE(&node->list);
        avc_cache_stats_incr(allocations);

        if (atomic_inc_return(&avc->avc_cache.active_nodes) >
            avc->avc_cache_threshold)
                avc_reclaim_node(avc);

out:
        return node;
}

/*
 * Fill in the lookup key (@ssid, @tsid, @tclass) of @node and copy the
 * access vector decision @avd into it.  Extended permissions are handled
 * separately by avc_xperms_populate().
 */
static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid,
                              u16 tclass, struct av_decision *avd)
{
        struct avc_entry *entry = &node->ae;

        entry->ssid = ssid;
        entry->tsid = tsid;
        entry->tclass = tclass;
        memcpy(&entry->avd, avd, sizeof(entry->avd));
}

/*
 * Walk the bucket selected by avc_hash() with the RCU list iterator and
 * return the first node whose key equals (@ssid, @tsid, @tclass), or NULL
 * when the bucket holds no such node.
 */
static inline struct avc_node *avc_search_node(struct selinux_avc *avc,
                                               u32 ssid, u32 tsid, u16 tclass)
{
        struct hlist_head *bucket;
        struct avc_node *cur;

        bucket = &avc->avc_cache.slots[avc_hash(ssid, tsid, tclass)];

        hlist_for_each_entry_rcu(cur, bucket, list) {
                if (cur->ae.ssid != ssid ||
                    cur->ae.tclass != tclass ||
                    cur->ae.tsid != tsid)
                        continue;
                return cur;
        }

        return NULL;
}

/**
 * avc_lookup - Look up an AVC entry.
 * @avc: the access vector cache to search
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 *
 * Look up an AVC entry that is valid for the
 * (@ssid, @tsid), interpreting the permissions
 * based on @tclass.  If a valid AVC entry exists,
 * then this function returns the avc_node.
 * Otherwise, this function returns NULL.
 *
 * Every call is counted as a lookup in the AVC statistics, and a call
 * that finds nothing is additionally counted as a miss.
 */
static struct avc_node *avc_lookup(struct selinux_avc *avc,
                                   u32 ssid, u32 tsid, u16 tclass)
{
        struct avc_node *found;

        avc_cache_stats_incr(lookups);

        found = avc_search_node(avc, ssid, tsid, tclass);
        if (!found)
                avc_cache_stats_incr(misses);

        return found;
}

/*
 * Check or advance the latest revocation sequence number, under a
 * function-local spinlock.
 *
 * With @is_insert set, nothing is modified: the call only verifies that
 * @seqno is not older than latest_notif and returns -EAGAIN (after a
 * warning) if it is.  With @is_insert clear, latest_notif is raised to
 * @seqno when @seqno is newer.  Returns 0 in every other case.
 *
 * Note that @seqno is an int while latest_notif is a u32, so both
 * comparisons are carried out as unsigned.
 */
static int avc_latest_notif_update(struct selinux_avc *avc,
                                   int seqno, int is_insert)
{
        static DEFINE_SPINLOCK(notif_lock);
        unsigned long flag;
        int ret = 0;

        spin_lock_irqsave(&notif_lock, flag);

        if (!is_insert) {
                if (seqno > avc->avc_cache.latest_notif)
                        avc->avc_cache.latest_notif = seqno;
        } else if (seqno < avc->avc_cache.latest_notif) {
                pr_warn("SELinux: avc:  seqno %d < latest_notif %d\n",
                       seqno, avc->avc_cache.latest_notif);
                ret = -EAGAIN;
        }

        spin_unlock_irqrestore(&notif_lock, flag);

        return ret;
}

/**
 * avc_insert - Insert an AVC entry.
 * @avc: the access vector cache to insert into
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @avd: resulting av decision
 * @xp_node: resulting extended permissions
 *
 * Insert an AVC entry for the SID pair
 * (@ssid, @tsid) and class @tclass.
 * The access vectors and the sequence number are
 * normally provided by the security server in
 * response to a security_compute_av() call.  If the
 * sequence number @avd->seqno is not less than the latest
 * revocation notification, then the function copies
 * the access vectors into a cache entry and returns the
 * avc_node inserted. Otherwise, this function returns NULL.
 * NULL is also returned when the node or its copy of the
 * extended permissions cannot be allocated.
 *
 * If an entry with the same key is already present it is replaced
 * by the new one instead of being duplicated.
 */
static struct avc_node *avc_insert(struct selinux_avc *avc,
                                   u32 ssid, u32 tsid, u16 tclass,
                                   struct av_decision *avd,
                                   struct avc_xperms_node *xp_node)
{
        struct avc_node *pos, *node = NULL;
        int hvalue;
        unsigned long flag;
        spinlock_t *lock;
        struct hlist_head *head;

        /* refuse decisions that are older than the latest revocation */
        if (avc_latest_notif_update(avc, avd->seqno, 1))
                return NULL;

        node = avc_alloc_node(avc);
        if (!node)
                return NULL;

        avc_node_populate(node, ssid, tsid, tclass, avd);
        if (avc_xperms_populate(node, xp_node)) {
                /* not linked into any bucket yet, so it can be freed directly */
                avc_node_kill(avc, node);
                return NULL;
        }

        hvalue = avc_hash(ssid, tsid, tclass);
        head = &avc->avc_cache.slots[hvalue];
        lock = &avc->avc_cache.slots_lock[hvalue];
        /* the node is fully built before it becomes visible in the bucket */
        spin_lock_irqsave(lock, flag);
        hlist_for_each_entry(pos, head, list) {
                if (pos->ae.ssid == ssid &&
                        pos->ae.tsid == tsid &&
                        pos->ae.tclass == tclass) {
                        /* same key already cached: swap the old node out */
                        avc_node_replace(avc, node, pos);
                        goto found;
                }
        }
        hlist_add_head_rcu(&node->list, head);
found:
        spin_unlock_irqrestore(lock, flag);
        return node;
}

/**
 * avc_audit_pre_callback - SELinux specific information
 * will be called by generic audit code
 * @ab: the audit buffer
 * @a: audit_data
 *
 * Writes the leading part of an AVC audit record: "avc:  denied" or
 * "avc:  granted" followed by the list of audited permissions.  Each set
 * bit that has a name in the class's permission table is printed by name;
 * any bits left over are printed together as one hexadecimal value.  An
 * empty permission set is reported as " null".
 */
static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
{
        struct common_audit_data *ad = a;
        struct selinux_audit_data *sad = ad->selinux_audit_data;
        u32 av = sad->audited;
        const char **perms;
        unsigned int i;
        u32 perm;

        audit_log_format(ab, "avc:  %s ", sad->denied ? "denied" : "granted");

        if (av == 0) {
                audit_log_format(ab, " null");
                return;
        }

        /* class values start at 1, the table at index 0 */
        perms = secclass_map[sad->tclass-1].perms;

        audit_log_format(ab, " {");

        /*
         * Walk all 32 bits of the access vector.  The mask is unsigned on
         * purpose: it is shifted up to and finally out of bit 31, which is
         * undefined for a signed int.
         */
        for (i = 0, perm = 1; i < sizeof(av) * 8; i++, perm <<= 1) {
                if ((perm & av) && perms[i]) {
                        audit_log_format(ab, " %s", perms[i]);
                        av &= ~perm;
                }
        }

        /* bits without a name in the table */
        if (av)
                audit_log_format(ab, " 0x%x", av);

        audit_log_format(ab, " } for ");
}

/**
 * avc_audit_post_callback - SELinux specific information
 * will be called by generic audit code
 * @ab: the audit buffer
 * @a: audit_data
 *
 * Writes the trailing part of an AVC audit record: source and target
 * context (or the raw SID when it cannot be converted), the class name,
 * the permissive flag for denials and, where available, the raw context
 * strings of invalid contexts.  Also fires the selinux_audited tracepoint.
 */
static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
{
        struct common_audit_data *ad = a;
        struct selinux_audit_data *sad = ad->selinux_audit_data;
        char *scontext = NULL;
        char *tcontext = NULL;
        const char *tclass = NULL;
        u32 scontext_len;
        u32 tcontext_len;
        int rc;

        /* source context, falling back to the numeric SID on failure */
        rc = security_sid_to_context(sad->state, sad->ssid, &scontext,
                                     &scontext_len);
        if (rc)
                audit_log_format(ab, " ssid=%d", sad->ssid);
        else
                audit_log_format(ab, " scontext=%s", scontext);

        /* target context, same fallback */
        rc = security_sid_to_context(sad->state, sad->tsid, &tcontext,
                                     &tcontext_len);
        if (rc)
                audit_log_format(ab, " tsid=%d", sad->tsid);
        else
                audit_log_format(ab, " tcontext=%s", tcontext);

        /* class values start at 1, the table at index 0 */
        tclass = secclass_map[sad->tclass-1].name;
        audit_log_format(ab, " tclass=%s", tclass);

        /* for a denial, a zero result is reported as permissive=1 */
        if (sad->denied)
                audit_log_format(ab, " permissive=%u", sad->result ? 0 : 1);

        /* the context strings are still needed by the tracepoint */
        trace_selinux_audited(sad, scontext, tcontext, tclass);
        kfree(tcontext);
        kfree(scontext);

        /* in case of invalid context report also the actual context string */
        /* scontext and scontext_len are reused as scratch from here on */
        rc = security_sid_to_context_inval(sad->state, sad->ssid, &scontext,
                                           &scontext_len);
        if (!rc && scontext) {
                /* do not log the terminating NUL as part of the string */
                if (scontext_len && scontext[scontext_len - 1] == '\0')
                        scontext_len--;
                audit_log_format(ab, " srawcon=");
                audit_log_n_untrustedstring(ab, scontext, scontext_len);
                kfree(scontext);
        }

        /* same again for the target SID */
        rc = security_sid_to_context_inval(sad->state, sad->tsid, &scontext,
                                           &scontext_len);
        if (!rc && scontext) {
                if (scontext_len && scontext[scontext_len - 1] == '\0')
                        scontext_len--;
                audit_log_format(ab, " trawcon=");
                audit_log_n_untrustedstring(ab, scontext, scontext_len);
                kfree(scontext);
        }
}

/*
 * The slow part of AVC auditing, kept out of line because of its large
 * stack footprint.
 *
 * Packs the arguments into a selinux_audit_data, attaches it to @a (or to
 * a local LSM_AUDIT_DATA_NONE record when @a is NULL) and passes that to
 * common_lsm_audit() with the AVC pre/post callbacks.  Returns 0, or
 * -EINVAL (with a warning) when @tclass is zero or beyond the class map.
 */
noinline int slow_avc_audit(struct selinux_state *state,
                            u32 ssid, u32 tsid, u16 tclass,
                            u32 requested, u32 audited, u32 denied, int result,
                            struct common_audit_data *a)
{
        struct selinux_audit_data sad;
        struct common_audit_data stack_data;

        if (WARN_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)))
                return -EINVAL;

        if (a == NULL) {
                stack_data.type = LSM_AUDIT_DATA_NONE;
                a = &stack_data;
        }

        sad.state = state;
        sad.ssid = ssid;
        sad.tsid = tsid;
        sad.tclass = tclass;
        sad.requested = requested;
        sad.audited = audited;
        sad.denied = denied;
        sad.result = result;

        a->selinux_audit_data = &sad;

        common_lsm_audit(a, avc_audit_pre_callback, avc_audit_post_callback);

        return 0;
}

/**
 * avc_add_callback - Register a callback for security events.
 * @callback: callback function
 * @events: security events
 *
 * Register a callback function for events in the set @events.
 * The new entry is placed at the head of the callback list.
 * Returns %0 on success or -%ENOMEM if insufficient memory
 * exists to add the callback.
 */
int __init avc_add_callback(int (*callback)(u32 event), u32 events)
{
        struct avc_callback_node *entry = kmalloc(sizeof(*entry), GFP_KERNEL);

        if (!entry)
                return -ENOMEM;

        entry->callback = callback;
        entry->events = events;
        entry->next = avc_callbacks;
        avc_callbacks = entry;

        return 0;
}

/**
 * avc_update_node - Update an AVC entry
 * @avc: the access vector cache that holds the entry
 * @event : Updating event
 * @perms : Permission mask bits
 * @driver: extended-permission driver value; only used for
 *          AVC_CALLBACK_GRANT when @flags contains AVC_EXTENDED_PERMS
 * @xperm: extended permission value; only used for AVC_CALLBACK_GRANT
 *         when @flags contains AVC_EXTENDED_PERMS
 * @ssid,@tsid,@tclass : identifier of an AVC entry
 * @seqno : sequence number when decision was made
 * @xpd: extended_perms_decision to be added to the node
 *       (only used for AVC_CALLBACK_ADD_XPERMS)
 * @flags: the AVC_* flags, e.g. AVC_NONBLOCKING, AVC_EXTENDED_PERMS, or 0.
 *
 * If @flags contains AVC_NONBLOCKING, nothing is done and 0 is returned.
 * If a valid AVC entry doesn't exist, this function returns -ENOENT.
 * If the node allocation fails, this function returns -ENOMEM.
 * If copying the extended permissions fails, the error returned by
 * avc_xperms_populate() is passed back.
 * Otherwise, this function builds an updated copy of the AVC entry and
 * replaces the original with it. The original AVC-entry object
 * will be released later by RCU.
 */
static int avc_update_node(struct selinux_avc *avc,
                           u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid,
                           u32 tsid, u16 tclass, u32 seqno,
                           struct extended_perms_decision *xpd,
                           u32 flags)
{
        int hvalue, rc = 0;
        unsigned long flag;
        struct avc_node *pos, *node, *orig = NULL;
        struct hlist_head *head;
        spinlock_t *lock;

        /*
         * If we are in a non-blocking code path, e.g. VFS RCU walk,
         * then we must not add permissions to a cache entry
         * because we will not audit the denial.  Otherwise,
         * during the subsequent blocking retry (e.g. VFS ref walk), we
         * will find the permissions already granted in the cache entry
         * and won't audit anything at all, leading to silent denials in
         * permissive mode that only appear when in enforcing mode.
         *
         * See the corresponding handling of MAY_NOT_BLOCK in avc_audit()
         * and selinux_inode_permission().
         */
        if (flags & AVC_NONBLOCKING)
                return 0;

        /* Allocate the replacement node before taking the slot lock. */
        node = avc_alloc_node(avc);
        if (!node) {
                rc = -ENOMEM;
                goto out;
        }

        /* Lock the target slot */
        hvalue = avc_hash(ssid, tsid, tclass);

        head = &avc->avc_cache.slots[hvalue];
        lock = &avc->avc_cache.slots_lock[hvalue];

        spin_lock_irqsave(lock, flag);

        /* Find the entry matching the identifier and the decision's seqno. */
        hlist_for_each_entry(pos, head, list) {
                if (ssid == pos->ae.ssid &&
                    tsid == pos->ae.tsid &&
                    tclass == pos->ae.tclass &&
                    seqno == pos->ae.avd.seqno){
                        orig = pos;
                        break;
                }
        }

        if (!orig) {
                rc = -ENOENT;
                /* Nothing to replace: give the unused new node back. */
                avc_node_kill(avc, node);
                goto out_unlock;
        }

        /*
         * Copy and replace original node.
         */

        avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd);

        if (orig->ae.xp_node) {
                rc = avc_xperms_populate(node, orig->ae.xp_node);
                if (rc) {
                        avc_node_kill(avc, node);
                        goto out_unlock;
                }
        }

        /* Apply the requested change to the copy; unknown events change nothing. */
        switch (event) {
        case AVC_CALLBACK_GRANT:
                node->ae.avd.allowed |= perms;
                if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS))
                        avc_xperms_allow_perm(node->ae.xp_node, driver, xperm);
                break;
        case AVC_CALLBACK_TRY_REVOKE:
        case AVC_CALLBACK_REVOKE:
                node->ae.avd.allowed &= ~perms;
                break;
        case AVC_CALLBACK_AUDITALLOW_ENABLE:
                node->ae.avd.auditallow |= perms;
                break;
        case AVC_CALLBACK_AUDITALLOW_DISABLE:
                node->ae.avd.auditallow &= ~perms;
                break;
        case AVC_CALLBACK_AUDITDENY_ENABLE:
                node->ae.avd.auditdeny |= perms;
                break;
        case AVC_CALLBACK_AUDITDENY_DISABLE:
                node->ae.avd.auditdeny &= ~perms;
                break;
        case AVC_CALLBACK_ADD_XPERMS:
                avc_add_xperms_decision(node, xpd);
                break;
        }
        avc_node_replace(avc, node, orig);
out_unlock:
        spin_unlock_irqrestore(lock, flag);
out:
        return rc;
}

/**
 * avc_flush - Flush the cache
 * @avc: the access vector cache to empty
 *
 * Visits each of the AVC_CACHE_SLOTS hash slots in turn and deletes every
 * node found there, holding that slot's spinlock with interrupts saved and
 * disabled for the duration.
 */
static void avc_flush(struct selinux_avc *avc)
{
        int slot;

        for (slot = 0; slot < AVC_CACHE_SLOTS; slot++) {
                struct hlist_head *bucket = &avc->avc_cache.slots[slot];
                spinlock_t *bucket_lock = &avc->avc_cache.slots_lock[slot];
                struct avc_node *entry;
                unsigned long irqflags;

                spin_lock_irqsave(bucket_lock, irqflags);
                /*
                 * With preemptable RCU, the outer spinlock does not
                 * prevent RCU grace periods from ending.
                 */
                rcu_read_lock();
                hlist_for_each_entry(entry, bucket, list)
                        avc_node_delete(avc, entry);
                rcu_read_unlock();
                spin_unlock_irqrestore(bucket_lock, irqflags);
        }
}

/**
 * avc_ss_reset - Flush the cache and revalidate migrated permissions.
 * @avc: the access vector cache to reset
 * @seqno: policy sequence number
 *
 * Flushes @avc, invokes every registered callback that subscribed to
 * AVC_CALLBACK_RESET, and then records @seqno through
 * avc_latest_notif_update().  All callbacks are run even if one fails;
 * the first non-zero callback result is what gets returned.
 */
int avc_ss_reset(struct selinux_avc *avc, u32 seqno)
{
        struct avc_callback_node *cb;
        int first_err = 0;

        avc_flush(avc);

        for (cb = avc_callbacks; cb; cb = cb->next) {
                int err;

                if (!(cb->events & AVC_CALLBACK_RESET))
                        continue;

                err = cb->callback(AVC_CALLBACK_RESET);
                /* Remember only the first failure, keep notifying the rest. */
                if (!first_err)
                        first_err = err;
        }

        avc_latest_notif_update(avc, seqno, 0);
        return first_err;
}

/*
 * Slow-path helper function for avc_has_perm_noaudit and
 * avc_has_extended_perms, used when the avc_node lookup fails.
 * We get called with the RCU read lock held, and need to return
 * with it still held, but drop it for the security compute.
 *
 * The decision is written to @avd and the extended permissions to
 * @xp_node->xp; the result is then inserted into the cache and the
 * value returned by avc_insert() is passed back to the caller.
 *
 * Don't inline this, since it's the slow-path and just
 * results in a bigger stack frame.
 */
static noinline
struct avc_node *avc_compute_av(struct selinux_state *state,
                                u32 ssid, u32 tsid,
                                u16 tclass, struct av_decision *avd,
                                struct avc_xperms_node *xp_node)
{
        /* Leave the RCU read-side section around the security server call. */
        rcu_read_unlock();
        /* Start with an empty list of per-driver xperms decisions. */
        INIT_LIST_HEAD(&xp_node->xpd_head);
        security_compute_av(state, ssid, tsid, tclass, avd, &xp_node->xp);
        /* Re-enter the read-side section the caller expects to still hold. */
        rcu_read_lock();
        return avc_insert(state->avc, ssid, tsid, tclass, avd, xp_node);
}

/*
 * Decide what happens to a request that the access vector denied.
 *
 * The denial is enforced (-EACCES) when the caller asked for AVC_STRICT, or
 * when enforcing mode is enabled and the decision is not flagged
 * AVD_FLAGS_PERMISSIVE.  Otherwise the request is let through: the requested
 * permissions are granted on the cached entry via avc_update_node() and 0 is
 * returned.  The result of avc_update_node() is deliberately not propagated.
 */
static noinline int avc_denied(struct selinux_state *state,
                               u32 ssid, u32 tsid,
                               u16 tclass, u32 requested,
                               u8 driver, u8 xperm, unsigned int flags,
                               struct av_decision *avd)
{
        if ((flags & AVC_STRICT) ||
            (enforcing_enabled(state) &&
             !(avd->flags & AVD_FLAGS_PERMISSIVE)))
                return -EACCES;

        avc_update_node(state->avc, AVC_CALLBACK_GRANT, requested, driver,
                        xperm, ssid, tsid, tclass, avd->seqno, NULL, flags);

        return 0;
}

/*
 * The avc extended permissions logic adds an additional 256 bits of
 * permissions to an avc node when extended permissions for that node are
 * specified in the avtab. If the additional 256 permissions is not adequate,
 * as-is the case with ioctls, then multiple may be chained together and the
 * driver field is used to specify which set contains the permission.
 *
 * Check @requested for (@ssid, @tsid, @tclass) and, when extended
 * permissions are defined for the entry, additionally require that @xperm
 * of @driver is allowed.  The outcome is audited through
 * avc_xperms_audit(); a non-zero audit result takes precedence over the
 * permission result in the return value.  Returns -EACCES (after a WARN)
 * if @requested is 0.
 */
int avc_has_extended_perms(struct selinux_state *state,
                           u32 ssid, u32 tsid, u16 tclass, u32 requested,
                           u8 driver, u8 xperm, struct common_audit_data *ad)
{
        struct avc_node *node;
        struct av_decision avd;
        u32 denied;
        struct extended_perms_decision local_xpd;
        struct extended_perms_decision *xpd = NULL;
        struct extended_perms_data allowed;
        struct extended_perms_data auditallow;
        struct extended_perms_data dontaudit;
        struct avc_xperms_node local_xp_node;
        struct avc_xperms_node *xp_node;
        int rc = 0, rc2;

        /* On a cache miss the xperms are computed into this on-stack node. */
        xp_node = &local_xp_node;
        if (WARN_ON(!requested))
                return -EACCES;

        rcu_read_lock();

        node = avc_lookup(state->avc, ssid, tsid, tclass);
        if (unlikely(!node)) {
                node = avc_compute_av(state, ssid, tsid, tclass, &avd, xp_node);
        } else {
                /* Cache hit: work on a private copy of the cached decision. */
                memcpy(&avd, &node->ae.avd, sizeof(avd));
                xp_node = node->ae.xp_node;
        }
        /* if extended permissions are not defined, only consider av_decision */
        if (!xp_node || !xp_node->xp.len)
                goto decision;

        /* Back the local decision with the on-stack permission bitmaps. */
        local_xpd.allowed = &allowed;
        local_xpd.auditallow = &auditallow;
        local_xpd.dontaudit = &dontaudit;

        xpd = avc_xperms_decision_lookup(driver, xp_node);
        if (unlikely(!xpd)) {
                /*
                 * Compute the extended_perms_decision only if the driver
                 * is flagged
                 */
                if (!security_xperm_test(xp_node->xp.drivers.p, driver)) {
                        /* Driver not flagged: deny; xpd is still NULL here. */
                        avd.allowed &= ~requested;
                        goto decision;
                }
                /* Drop the RCU read lock around the security server call. */
                rcu_read_unlock();
                security_compute_xperms_decision(state, ssid, tsid, tclass,
                                                 driver, &local_xpd);
                rcu_read_lock();
                /* Add the new decision to the cached node; result is ignored. */
                avc_update_node(state->avc, AVC_CALLBACK_ADD_XPERMS, requested,
                                driver, xperm, ssid, tsid, tclass, avd.seqno,
                                &local_xpd, 0);
        } else {
                avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
        }
        /* From here on only the local copy of the decision is consulted. */
        xpd = &local_xpd;

        if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED))
                avd.allowed &= ~requested;

decision:
        denied = requested & ~(avd.allowed);
        if (unlikely(denied))
                rc = avc_denied(state, ssid, tsid, tclass, requested,
                                driver, xperm, AVC_EXTENDED_PERMS, &avd);

        rcu_read_unlock();

        /* Auditing happens after the RCU read-side section has ended. */
        rc2 = avc_xperms_audit(state, ssid, tsid, tclass, requested,
                        &avd, xpd, xperm, rc, ad);
        if (rc2)
                return rc2;
        return rc;
}

/**
 * avc_has_perm_noaudit - Check permissions but perform no auditing.
 * @state: SELinux state whose AVC is consulted
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions, interpreted based on @tclass
 * @flags:  AVC_STRICT, AVC_NONBLOCKING, or 0
 * @avd: access vector decisions
 *
 * Look up the decision for the SID pair (@ssid, @tsid) and class @tclass in
 * the AVC; on a cache miss ask the security server for a fresh decision,
 * which is also added to the cache.  A copy of the decision is returned in
 * @avd.  Returns %0 if all @requested permissions are granted, -%EACCES if
 * any permissions are denied, or another -errno upon other errors.
 *
 * This is normally reached through avc_has_perm(), but can be called
 * directly when the permission check and the auditing must be separated,
 * e.g. when a lock has to be held for the check but should be released
 * before auditing.
 */
inline int avc_has_perm_noaudit(struct selinux_state *state,
                                u32 ssid, u32 tsid,
                                u16 tclass, u32 requested,
                                unsigned int flags,
                                struct av_decision *avd)
{
        struct avc_xperms_node scratch_xp;
        struct avc_node *cached;
        int ret = 0;

        if (WARN_ON(!requested))
                return -EACCES;

        rcu_read_lock();

        cached = avc_lookup(state->avc, ssid, tsid, tclass);
        if (likely(cached))
                memcpy(avd, &cached->ae.avd, sizeof(*avd));
        else
                cached = avc_compute_av(state, ssid, tsid, tclass, avd,
                                        &scratch_xp);

        /* Any requested bit missing from the allowed set is a denial. */
        if (unlikely(requested & ~(avd->allowed)))
                ret = avc_denied(state, ssid, tsid, tclass, requested, 0, 0,
                                 flags, avd);

        rcu_read_unlock();
        return ret;
}

/**
 * avc_has_perm - Check permissions and perform any appropriate auditing.
 * @state: SELinux state whose AVC is consulted
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions, interpreted based on @tclass
 * @auditdata: auxiliary audit data
 *
 * Run the permission check through avc_has_perm_noaudit() (with no flags)
 * and then audit the granting or denial of permissions through avc_audit().
 * Return %0 if all @requested permissions are granted, -%EACCES if any
 * permissions are denied, or another -errno upon other errors.  A non-zero
 * result from the audit step takes precedence over the check result.
 */
int avc_has_perm(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass,
                 u32 requested, struct common_audit_data *auditdata)
{
        struct av_decision avd;
        int perm_rc, audit_rc;

        perm_rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested,
                                       0, &avd);
        audit_rc = avc_audit(state, ssid, tsid, tclass, requested, &avd,
                             perm_rc, auditdata, 0);

        return audit_rc ? audit_rc : perm_rc;
}

/*
 * Variant of avc_has_perm() that takes caller flags.  When @flags contains
 * MAY_NOT_BLOCK the permission check is made with AVC_NONBLOCKING; @flags is
 * also handed on unchanged to avc_audit().  A non-zero audit result takes
 * precedence over the permission check result.
 */
int avc_has_perm_flags(struct selinux_state *state,
                       u32 ssid, u32 tsid, u16 tclass, u32 requested,
                       struct common_audit_data *auditdata,
                       int flags)
{
        unsigned int avc_flags = (flags & MAY_NOT_BLOCK) ? AVC_NONBLOCKING : 0;
        struct av_decision avd;
        int perm_rc, audit_rc;

        perm_rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested,
                                       avc_flags, &avd);
        audit_rc = avc_audit(state, ssid, tsid, tclass, requested, &avd,
                             perm_rc, auditdata, flags);

        return audit_rc ? audit_rc : perm_rc;
}

u32 avc_policy_seqno(struct selinux_state *state)
{
        return state->avc->avc_cache.latest_notif;
}

/*
 * Flush the global AVC, provided the node cache was ever created.
 *
 * Note that avc_node_cachep itself is intentionally NOT destroyed here.
 * As the original author explained: another task may have dereferenced
 * security_ops while it still pointed at the SELinux operations, in which
 * case it may be about to use the AVC and need avc_node_cachep.  Wrapping
 * the security.c security_ops call in an rcu_lock could close that window,
 * but was judged not worth it, so we only flush the cache to get that
 * memory back.
 */
void avc_disable(void)
{
        if (!avc_node_cachep)
                return;

        avc_flush(selinux_state.avc);
}

























































    1 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM x86_fpu

#if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FPU_H

#include <linux/tracepoint.h>

/*
 * Event class shared by all x86 FPU tracepoints in this header.
 *
 * Each event records the struct fpu pointer it was given, whether the
 * current thread has TIF_NEED_FPU_LOAD set, and the xfeatures/xcomp_bv
 * words of the XSAVE header.  The XSAVE header is only sampled when the CPU
 * has X86_FEATURE_OSXSAVE; otherwise both fields are recorded as 0 so that
 * TP_printk() never prints fields that were left unassigned.
 */
DECLARE_EVENT_CLASS(x86_fpu,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu),

        TP_STRUCT__entry(
                __field(struct fpu *, fpu)
                __field(bool, load_fpu)
                __field(u64, xfeatures)
                __field(u64, xcomp_bv)
                ),

        TP_fast_assign(
                __entry->fpu                = fpu;
                __entry->load_fpu        = test_thread_flag(TIF_NEED_FPU_LOAD);
                if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
                        __entry->xfeatures = fpu->state.xsave.header.xfeatures;
                        __entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
                } else {
                        /* No XSAVE header to sample: record explicit zeros. */
                        __entry->xfeatures = 0;
                        __entry->xcomp_bv  = 0;
                }
        ),
        TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
                        __entry->fpu,
                        __entry->load_fpu,
                        __entry->xfeatures,
                        __entry->xcomp_bv
        )
);

/*
 * Individual tracepoints of the x86_fpu event class.  Every one takes a
 * single struct fpu pointer and records/prints exactly what the class
 * defines; they differ only in name.  Where each one fires is decided by
 * the callers, which are outside this header.
 */

/* Save path: before and after. */
DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_after_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* Restore path: before and after. */
DEFINE_EVENT(x86_fpu, x86_fpu_before_restore,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_after_restore,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* Register activation and deactivation. */
DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* State initialisation and dropping. */
DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_dropped,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* Copy: source and destination side. */
DEFINE_EVENT(x86_fpu, x86_fpu_copy_src,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

/* xstate check failure. */
DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH asm/trace/
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE fpu
#endif /* _TRACE_FPU_H */

/* This part must be outside protection */
#include <trace/define_trace.h>














    1 
    1 





    1 



    1 

    1 







    1 


    1 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>

/* out-of-line parts */

#ifndef INLINE_COPY_FROM_USER
/*
 * Out-of-line copy of @n bytes from the userspace address @from to the
 * kernel buffer @to.
 *
 * Returns the value left in res: @n if the copy was never attempted
 * (fault injection fired or access_ok() rejected the range), otherwise
 * whatever raw_copy_from_user() returned.  res is treated as the number of
 * trailing bytes of @to that were not filled in, and those bytes are
 * zeroed before returning, so @to never holds uninitialised data.
 */
unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
{
        /* Assume total failure until a copy has actually been attempted. */
        unsigned long res = n;
        might_fault();
        if (!should_fail_usercopy() && likely(access_ok(from, n))) {
                /*
                 * Ensure that bad access_ok() speculation will not
                 * lead to nasty side effects *after* the copy is
                 * finished:
                 */
                barrier_nospec();
                instrument_copy_from_user(to, from, n);
                res = raw_copy_from_user(to, from, n);
        }
        /* Zero the last res bytes of the destination (all of it if res == n). */
        if (unlikely(res))
                memset(to + (n - res), 0, res);
        return res;
}
EXPORT_SYMBOL(_copy_from_user);
#endif

#ifndef INLINE_COPY_TO_USER
/*
 * Out-of-line copy of @n bytes from the kernel buffer @from to the
 * userspace address @to.
 *
 * Returns @n unchanged when nothing was attempted (fault injection fired or
 * access_ok() rejected the destination range); otherwise returns whatever
 * raw_copy_to_user() returned.
 */
unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();

        if (should_fail_usercopy() || unlikely(!access_ok(to, n)))
                return n;

        instrument_copy_to_user(to, from, n);
        return raw_copy_to_user(to, from, n);
}
EXPORT_SYMBOL(_copy_to_user);
#endif

/**
 * check_zeroed_user: check if a userspace buffer only contains zero bytes
 * @from: Source address, in userspace.
 * @size: Size of buffer.
 *
 * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
 * userspace addresses (and is more efficient because we don't care where the
 * first non-zero byte is).
 *
 * The buffer is read one unsigned long at a time, starting from @from
 * rounded down to an unsigned long boundary.
 *
 * Returns:
 *  * 0: There were non-zero bytes present in the buffer.
 *  * 1: The buffer was full of zero bytes (an empty buffer counts as zeroed).
 *  * -EFAULT: access to userspace failed.
 */
int check_zeroed_user(const void __user *from, size_t size)
{
        unsigned long val;
        /* Number of bytes @from lies past the previous word boundary. */
        uintptr_t align = (uintptr_t) from % sizeof(unsigned long);

        if (unlikely(size == 0))
                return 1;

        /* Widen the range downwards so that it starts on a word boundary. */
        from -= align;
        size += align;

        if (!user_read_access_begin(from, size))
                return -EFAULT;

        unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        /*
         * The first word may begin before the caller's buffer; presumably
         * aligned_byte_mask(align) selects those leading bytes, which are
         * then ignored here -- confirm against its definition.
         */
        if (align)
                val &= ~aligned_byte_mask(align);

        /* val always holds the most recently read, not yet tested word. */
        while (size > sizeof(unsigned long)) {
                if (unlikely(val))
                        goto done;

                from += sizeof(unsigned long);
                size -= sizeof(unsigned long);

                unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        }

        /* Partial final word: only keep the bytes inside the buffer. */
        if (size < sizeof(unsigned long))
                val &= aligned_byte_mask(size);

done:
        user_read_access_end();
        return (val == 0);
err_fault:
        /* A faulting read still has to close the user access window. */
        user_read_access_end();
        return -EFAULT;
}
EXPORT_SYMBOL(check_zeroed_user);


























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H

#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>

#include <trace/events/tlb.h>

#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>

extern atomic64_t last_mm_ctx_id;

#ifndef CONFIG_PARAVIRT_XXL
/* No-op stand-in used when CONFIG_PARAVIRT_XXL is not set. */
static inline void paravirt_activate_mm(struct mm_struct *prev,
                                        struct mm_struct *next)
{
}
#endif        /* !CONFIG_PARAVIRT_XXL */

#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
void cr4_update_pce(void *ignored);
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
 * ldt_structs can be allocated, used, and freed, but they are never
 * modified while live.
 */
struct ldt_struct {
        /*
         * Xen requires page-aligned LDTs with special permissions.  This is
         * needed to prevent us from installing evil descriptors such as
         * call gates.  On native, we could merge the ldt_struct and LDT
         * allocations, but it's not worth trying to optimize.
         */
        struct desc_struct        *entries;
        /* Number of descriptors in the entries array. */
        unsigned int                nr_entries;

        /*
         * If PTI is in use, then the entries array is not mapped while we're
         * in user mode.  The whole array will be aliased at the address
         * given by ldt_slot_va(slot).  We use two slots so that we can allocate
         * and map, and enable a new LDT without invalidating the mapping
         * of an older, still-in-use LDT.
         *
         * slot will be -1 if this LDT doesn't have an alias mapping.
         */
        int                        slot;
};

/*
 * Used for LDT copy/destruction.
 *
 * Start @mm without an LDT and initialise its ldt_usr_sem rwsem.
 */
static inline void init_new_context_ldt(struct mm_struct *mm)
{
        mm->context.ldt = NULL;
        init_rwsem(&mm->context.ldt_usr_sem);
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else        /* CONFIG_MODIFY_LDT_SYSCALL */
/*
 * No-op variants of the LDT context helpers for kernels built without
 * CONFIG_MODIFY_LDT_SYSCALL.  ldt_dup_context() always reports success.
 */
static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
        return 0;
}
static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
extern void load_mm_ldt(struct mm_struct *mm);
extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
#else
/* Without CONFIG_MODIFY_LDT_SYSCALL, loading an mm's LDT just clears it. */
static inline void load_mm_ldt(struct mm_struct *mm)
{
        clear_LDT();
}
/*
 * Without CONFIG_MODIFY_LDT_SYSCALL there is nothing to switch; the only
 * thing left is a debug check on preemptible().
 */
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
        DEBUG_LOCKS_WARN_ON(preemptible());
}
#endif

extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);

/*
 * Init a new mm.  Used on mm copies, like at fork()
 * and on mm's that are brand-new, like at execve().
 *
 * @tsk is not used here.  Always returns 0.
 */
static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
{
        mutex_init(&mm->context.lock);

        /* Take the next value of the global context-id counter for this mm. */
        mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
        atomic64_set(&mm->context.tlb_gen, 0);

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* pkey 0 is the default and allocated implicitly */
                mm->context.pkey_allocation_map = 0x1;
                /* -1 means unallocated or invalid */
                mm->context.execute_only_pkey = -1;
        }
#endif
        init_new_context_ldt(mm);
        return 0;
}
/* Tear down the arch context of @mm; only the LDT part needs work here. */
static inline void destroy_context(struct mm_struct *mm)
{
        destroy_context_ldt(mm);
}

extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                      struct task_struct *tsk);

extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                               struct task_struct *tsk);
#define switch_mm_irqs_off switch_mm_irqs_off

/*
 * Make @next the active mm: notify the paravirt layer, then switch to it
 * with a NULL task.
 *
 * The body is wrapped in do { } while (0) with no trailing semicolon so
 * that "activate_mm(a, b);" expands to exactly one statement and the macro
 * can be used safely in an unbraced if/else.
 */
#define activate_mm(prev, next)                        \
do {                                                \
        paravirt_activate_mm((prev), (next));        \
        switch_mm((prev), (next), NULL);        \
} while (0)

/*
 * deactivate_mm(): reset user segment state when leaving an mm.  Neither
 * variant uses its @tsk or @mm argument.
 *  - 32-bit: lazily load 0 into gs.
 *  - 64-bit: load gs index 0 and load 0 into fs.
 */
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        lazy_load_gs(0);                        \
} while (0)
#else
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        load_gs_index(0);                        \
        loadsegment(fs, 0);                        \
} while (0)
#endif

/*
 * Copy the protection-key bookkeeping (allocation map and execute-only
 * pkey) from @oldmm into @mm.  Does nothing unless the kernel is built with
 * CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS and X86_FEATURE_OSPKE is enabled.
 */
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* Duplicate the oldmm pkey state in mm: */
                mm->context.pkey_allocation_map =
                        oldmm->context.pkey_allocation_map;
                mm->context.execute_only_pkey =
                        oldmm->context.execute_only_pkey;
        }
#endif
}

/*
 * Arch hook for duplicating an address space from @oldmm into @mm: copies
 * the pkey state, notifies the paravirt layer, then duplicates the LDT.
 * Returns the result of ldt_dup_context().
 */
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        arch_dup_pkeys(oldmm, mm);
        paravirt_arch_dup_mmap(oldmm, mm);
        return ldt_dup_context(oldmm, mm);
}

/* Arch hook for mmap teardown of @mm: paravirt layer first, then the LDT. */
static inline void arch_exit_mmap(struct mm_struct *mm)
{
        paravirt_arch_exit_mmap(mm);
        ldt_arch_exit_mmap(mm);
}

#ifdef CONFIG_X86_64
/*
 * On a 64-bit kernel an mm is 64-bit unless IA32 emulation is built in and
 * the mm's ia32_compat field is TIF_IA32.
 */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_IA32_EMULATION))
                return true;

        return mm->context.ia32_compat != TIF_IA32;
}
#else
/* On a 32-bit kernel no mm is ever 64-bit. */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        return false;
}
#endif

/* Arch hook for unmapping [@start, @end) in @mm; intentionally empty here. */
static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
                              unsigned long end)
{
}

/*
 * Protection keys are only enforced against the current process: we
 * effectively have no access to PKRU for other processes, nor any way to
 * tell *which* PKRU in a threaded process we could use.
 *
 * Access is therefore always permitted when:
 *  - it is an instruction fetch (pkeys never affect those), or
 *  - the access is flagged @foreign or the VMA is not one from this
 *    process.
 * In every other case the answer comes from checking the VMA's pkey
 * against PKRU for the requested kind of access (@write).
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
                bool write, bool execute, bool foreign)
{
        if (execute || foreign || vma_is_foreign(vma))
                return true;

        return __pkru_allows_pkey(vma_pkey(vma), write);
}

unsigned long __get_current_cr3_fast(void);

#endif /* _ASM_X86_MMU_CONTEXT_H */






























































































    1 


















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_64_H
#define _ASM_X86_PGTABLE_64_H

#include <linux/const.h>
#include <asm/pgtable_64_types.h>

#ifndef __ASSEMBLY__

/*
 * This file contains the functions and defines necessary to modify and use
 * the x86-64 page table tree.
 */
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
#include <asm/fixmap.h>

extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];

#define swapper_pg_dir init_top_pgt

extern void paging_init(void);
/* Nothing to synchronise in this (x86-64) variant of the header. */
static inline void sync_initial_page_table(void) { }

/*
 * Report a bad page-table entry at each level via pr_err(): prints the
 * source file and line of the use site, the address of the entry @e and its
 * raw value as 16 hex digits.  @e must be an lvalue since its address is
 * taken.  p4d_ERROR() is only provided when CONFIG_PGTABLE_LEVELS >= 5.
 */
#define pte_ERROR(e)                                        \
        pr_err("%s:%d: bad pte %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pte_val(e))
#define pmd_ERROR(e)                                        \
        pr_err("%s:%d: bad pmd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pmd_val(e))
#define pud_ERROR(e)                                        \
        pr_err("%s:%d: bad pud %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pud_val(e))

#if CONFIG_PGTABLE_LEVELS >= 5
#define p4d_ERROR(e)                                        \
        pr_err("%s:%d: bad p4d %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), p4d_val(e))
#endif

#define pgd_ERROR(e)                                        \
        pr_err("%s:%d: bad pgd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pgd_val(e))

struct mm_struct;

#define mm_p4d_folded mm_p4d_folded
static inline bool mm_p4d_folded(struct mm_struct *mm)
{
        return !pgtable_l5_enabled();
}

void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);

/* Store @pte into *@ptep using WRITE_ONCE(). */
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
        WRITE_ONCE(*ptep, pte);
}

/*
 * Clear the pte at @ptep by storing an all-zero entry through
 * native_set_pte(). @mm and @addr are accepted but not used.
 */
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
                                    pte_t *ptep)
{
        pte_t cleared = native_make_pte(0);

        native_set_pte(ptep, cleared);
}

/* Forwards to native_set_pte(); no additional work is done here. */
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        native_set_pte(ptep, pte);
}

/* Store @pmd into *@pmdp using WRITE_ONCE(). */
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
        WRITE_ONCE(*pmdp, pmd);
}

/* Clear the pmd at @pmd by storing an all-zero entry via native_set_pmd(). */
static inline void native_pmd_clear(pmd_t *pmd)
{
        pmd_t cleared = native_make_pmd(0);

        native_set_pmd(pmd, cleared);
}

/*
 * Fetch the pte at @xp, clear it, and return the value it held.
 *
 * With CONFIG_SMP the fetch and the clear are one xchg() on xp->pte.
 * Otherwise the entry is read first and then cleared with
 * native_pte_clear().
 */
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
        return native_make_pte(xchg(&xp->pte, 0));
#else
        /*
         * Same as native_local_ptep_get_and_clear, but duplicated here
         * because of a cyclic dependency.
         */
        pte_t ret = *xp;
        native_pte_clear(NULL, 0, xp);
        return ret;
#endif
}

/*
 * Fetch the pmd at @xp, clear it, and return the value it held.
 *
 * With CONFIG_SMP the fetch and the clear are one xchg() on xp->pmd.
 * Otherwise the entry is read first and then cleared with
 * native_pmd_clear().
 */
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
#ifdef CONFIG_SMP
        return native_make_pmd(xchg(&xp->pmd, 0));
#else
        /*
         * Same as native_local_pmdp_get_and_clear, but duplicated here
         * because of a cyclic dependency.
         */
        pmd_t ret = *xp;
        native_pmd_clear(xp);
        return ret;
#endif
}

/* Store @pud into *@pudp using WRITE_ONCE(). */
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
        WRITE_ONCE(*pudp, pud);
}

/* Clear the pud at @pud by storing an all-zero entry via native_set_pud(). */
static inline void native_pud_clear(pud_t *pud)
{
        pud_t cleared = native_make_pud(0);

        native_set_pud(pud, cleared);
}

/*
 * Fetch the pud at @xp, clear it, and return the value it held.
 *
 * With CONFIG_SMP the fetch and the clear are one xchg() on xp->pud.
 * Otherwise the entry is read first and then cleared with
 * native_pud_clear().
 */
static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
#ifdef CONFIG_SMP
        return native_make_pud(xchg(&xp->pud, 0));
#else
        /*
         * Same as native_local_pudp_get_and_clear, but duplicated here
         * because of a cyclic dependency.
         */
        pud_t ret = *xp;

        native_pud_clear(xp);
        return ret;
#endif
}

/*
 * Store @p4d into *@p4dp.
 *
 * When 5-level paging is not enabled and CONFIG_PAGE_TABLE_ISOLATION is
 * built in, the value is first converted to a pgd_t, passed through
 * pti_set_user_pgtbl() together with @p4dp viewed as a pgd_t pointer, and
 * the result is what gets stored. In every other case @p4d is stored as-is.
 */
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
        if (!pgtable_l5_enabled() && IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
                pgd_t as_pgd = native_make_pgd(native_p4d_val(p4d));

                as_pgd = pti_set_user_pgtbl((pgd_t *)p4dp, as_pgd);
                p4d = native_make_p4d(native_pgd_val(as_pgd));
        }

        WRITE_ONCE(*p4dp, p4d);
}

/* Clear the p4d at @p4d by storing an all-zero entry via native_set_p4d(). */
static inline void native_p4d_clear(p4d_t *p4d)
{
        p4d_t cleared = native_make_p4d(0);

        native_set_p4d(p4d, cleared);
}

/*
 * Store a pgd entry: @pgd is first handed to pti_set_user_pgtbl() along
 * with @pgdp, and whatever that returns is written to *@pgdp.
 */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
        pgd_t to_store = pti_set_user_pgtbl(pgdp, pgd);

        WRITE_ONCE(*pgdp, to_store);
}

/* Clear the pgd at @pgd by storing an all-zero entry via native_set_pgd(). */
static inline void native_pgd_clear(pgd_t *pgd)
{
        pgd_t cleared = native_make_pgd(0);

        native_set_pgd(pgd, cleared);
}

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 */

/* PGD - Level 4 access */

/* PUD - Level 3 access */

/* PMD - Level 2 access */

/* PTE - Level 1 access */

/*
 * Encode and de-code a swap entry
 *
 * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
 * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
 * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|F|SD|0| <- swp entry
 *
 * G (8) is aliased and used as a PROT_NONE indicator for
 * !present ptes.  We need to start storing swap entries above
 * there.  We also need to avoid using A and D because of an
 * erratum where they can be incorrectly set by hardware on
 * non-present PTEs.
 *
 * Bits 1-4 are not used in the non-present format and are available for
 * special use, as described below:
 *
 * SD (1) in swp entry is used to store soft dirty bit, which helps us
 * remember soft dirty over page migration
 *
 * F (2) in swp entry is used to record when a pagetable is
 * writeprotected by userfaultfd WP support.
 *
 * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
 * but also L and G.
 *
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 */
/* Width of the swap type field, which occupies the top bits of the entry. */
#define SWP_TYPE_BITS                5

/* Lowest bit used for the offset: the bit just above _PAGE_BIT_PROTNONE. */
#define SWP_OFFSET_FIRST_BIT        (_PAGE_BIT_PROTNONE + 1)

/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT        (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)

/* Build-time failure if MAX_SWAPFILES_SHIFT does not fit in SWP_TYPE_BITS. */
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)

/* Extract the high bits for type */
#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))

/*
 * Invert the stored value, shift up (to get rid of type), then down to get
 * the offset value
 */
#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)

/*
 * Shift the offset up "too far" by TYPE bits, then down again
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 * The type is then OR-ed into the top SWP_TYPE_BITS bits.
 */
#define __swp_entry(type, offset) ((swp_entry_t) { \
        (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
        | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })

/*
 * Conversions between page-table entries and swap entries: the first two
 * wrap pte_val()/pmd_val() in a swp_entry_t, the last two rebuild a pte/pmd
 * from the entry's .val with __pte()/__pmd().
 */
#define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd)                ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x)                (__pte((x).val))
#define __swp_entry_to_pmd(x)                (__pmd((x).val))

extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);

#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN

#define PAGE_AGP    PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1

/*
 * fs/proc/kcore.c
 *
 * kc_vaddr_to_offset() keeps only the bits of @v covered by __VIRTUAL_MASK;
 * kc_offset_to_vaddr() sets every bit of @o that lies outside
 * __VIRTUAL_MASK.
 */
#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
#define        kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)

#define __HAVE_ARCH_PTE_SAME

#define vmemmap ((struct page *)VMEMMAP_START)

extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);

#define gup_fast_permitted gup_fast_permitted
/*
 * Return true only when @end has no bits set at or above
 * __VIRTUAL_MASK_SHIFT. @start is not examined.
 */
static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
{
        return (end >> __VIRTUAL_MASK_SHIFT) == 0;
}

#include <asm/pgtable-invert.h>

#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
































































































































    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


























































    1 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
// SPDX-License-Identifier: GPL-2.0+
/*
 * User-space Probes (UProbes)
 *
 * Copyright (C) IBM Corporation, 2008-2012
 * Authors:
 *        Srikar Dronamraju
 *        Jim Keniston
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 */

#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>        /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h>                /* anon_vma_prepare */
#include <linux/mmu_notifier.h>        /* set_pte_at_notify */
#include <linux/swap.h>                /* try_to_free_swap */
#include <linux/ptrace.h>        /* user_enable_single_step */
#include <linux/kdebug.h>        /* notifier mechanism */
#include "../../mm/internal.h"        /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>

#include <linux/uprobes.h>

/* Number of UPROBE_XOL_SLOT_BYTES-sized slots that fit in one page. */
#define UINSNS_PER_PAGE                        (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
/* The slot limit equals the number of slots in a single page. */
#define MAX_UPROBE_XOL_SLOTS                UINSNS_PER_PAGE

static struct rb_root uprobes_tree = RB_ROOT;
/*
 * allows us to skip the uprobe_mmap if there are no uprobe events active
 * at this time.  Probably a fine grained per inode count is better?
 */
#define no_uprobe_events()        RB_EMPTY_ROOT(&uprobes_tree)

static DEFINE_SPINLOCK(uprobes_treelock);        /* serialize rbtree access */

/* Number of mutexes in uprobes_mmap_mutex[]. */
#define UPROBES_HASH_SZ        13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
/* Select the mutex for @v: @v cast to unsigned long, modulo UPROBES_HASH_SZ. */
#define uprobes_mmap_hash(v)        (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])

DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);

/* Have a copy of original instruction */
#define UPROBE_COPY_INSN        0

/*
 * Per-probe state.
 * NOTE(review): a uprobe appears to be identified by (inode, offset) --
 * confirm against the rb-tree lookup code, which is outside this view.
 */
struct uprobe {
        struct rb_node                rb_node;        /* node in the rb tree */
        refcount_t                ref;
        struct rw_semaphore        register_rwsem;
        struct rw_semaphore        consumer_rwsem;
        struct list_head        pending_list;        /* see uprobes_mmap_mutex[] */
        struct uprobe_consumer        *consumers;
        struct inode                *inode;                /* Also hold a ref to inode */
        loff_t                        offset;
        /*
         * File offset that offset_to_vaddr() translates into a virtual
         * address; valid_ref_ctr_vma() treats 0 as "not set".
         */
        loff_t                        ref_ctr_offset;
        /* presumably holds bits such as UPROBE_COPY_INSN -- TODO confirm */
        unsigned long                flags;

        /*
         * The generic code assumes that it has two members of unknown type
         * owned by the arch-specific code:
         *
         *         insn -        copy_insn() saves the original instruction here for
         *                arch_uprobe_analyze_insn().
         *
         *        ixol -        potentially modified instruction to execute out of
         *                line, copied to xol_area by xol_get_insn_slot().
         */
        struct arch_uprobe        arch;
};

/*
 * One (uprobe, mm) pair. Entries are allocated by delayed_uprobe_add() and
 * linked on delayed_uprobe_list through @list.
 */
struct delayed_uprobe {
        struct list_head list;                /* link in delayed_uprobe_list */
        struct uprobe *uprobe;
        struct mm_struct *mm;
};

static DEFINE_MUTEX(delayed_uprobe_lock);
static LIST_HEAD(delayed_uprobe_list);

/*
 * Execute out of line area: anonymous executable mapping installed
 * by the probed task to execute the copy of the original instruction
 * mangled by set_swbp().
 *
 * On a breakpoint hit, a thread contends for a slot and frees the
 * slot after the single-step. Currently a fixed number of slots is
 * allocated.
 */
struct xol_area {
        wait_queue_head_t                 wq;                /* if all slots are busy */
        atomic_t                         slot_count;        /* number of in-use slots */
        unsigned long                         *bitmap;        /* 0 = free slot */

        /* NOTE(review): presumably describes the mapping at @vaddr -- confirm */
        struct vm_special_mapping        xol_mapping;
        struct page                         *pages[2];
        /*
         * We keep the vma's vm_start rather than a pointer to the vma
         * itself.  The probed process or a naughty kernel module could make
         * the vma go away, and we must handle that reasonably gracefully.
         */
        unsigned long                         vaddr;                /* Page(s) of instruction slots */
};

/*
 * valid_vma: decide whether @vma is a file-backed executable mapping.
 *
 * Among VM_HUGETLB, VM_MAYEXEC and VM_MAYSHARE, only VM_MAYEXEC may be set
 * in vma->vm_flags. When registering, VM_WRITE must be clear as well; the
 * check is relaxed when unregistering because vm_flags might have changed
 * after the breakpoint was inserted.
 *        - is_register: indicates if we are in register context.
 *        - Returns true if the vma has a backing file and passes the
 *          flag test, false otherwise.
 */
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
        vm_flags_t checked = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;

        if (!vma->vm_file)
                return false;

        if (is_register)
                checked |= VM_WRITE;

        return (vma->vm_flags & checked) == VM_MAYEXEC;
}

/*
 * Translate file offset @offset into a virtual address inside @vma, using
 * the vma's start address and the file offset it begins at
 * (vm_pgoff << PAGE_SHIFT). No range check is performed here.
 */
static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
{
        loff_t vma_file_start = (loff_t)vma->vm_pgoff << PAGE_SHIFT;

        return vma->vm_start + offset - vma_file_start;
}

static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
{
        return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

/**
 * __replace_page - replace page in vma by new page.
 * based on replace_page in mm/ksm.c
 *
 * @vma:      vma that holds the pte pointing to page
 * @addr:     address the old @page is mapped at
 * @old_page: the page we are replacing by new_page
 * @new_page: the modified page we replace page by
 *
 * If @new_page is NULL, only unmap @old_page.
 *
 * Returns 0 on success, negative error code otherwise: the error from
 * mem_cgroup_charge() if charging @new_page fails, or -EAGAIN if
 * page_vma_mapped_walk() does not find @old_page mapped at @addr.
 */
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
                                struct page *old_page, struct page *new_page)
{
        struct mm_struct *mm = vma->vm_mm;
        struct page_vma_mapped_walk pvmw = {
                .page = compound_head(old_page),
                .vma = vma,
                .address = addr,
        };
        int err;
        struct mmu_notifier_range range;

        /* The notifier range covers exactly the one page at @addr. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
                                addr + PAGE_SIZE);

        /* Charge first: a failure here returns before the page lock is taken. */
        if (new_page) {
                err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
                if (err)
                        return err;
        }

        /* For try_to_free_swap() and munlock_vma_page() below */
        lock_page(old_page);

        mmu_notifier_invalidate_range_start(&range);
        err = -EAGAIN;
        if (!page_vma_mapped_walk(&pvmw))
                goto unlock;
        VM_BUG_ON_PAGE(addr != pvmw.address, old_page);

        if (new_page) {
                /* Take a reference on new_page and add its anon rmap and LRU entry. */
                get_page(new_page);
                page_add_new_anon_rmap(new_page, vma, addr, false);
                lru_cache_add_inactive_or_unevictable(new_page, vma);
        } else
                /* no new page, just dec_mm_counter for old_page */
                dec_mm_counter(mm, MM_ANONPAGES);

        /* A non-anon old page moves its accounting from the file counter to MM_ANONPAGES. */
        if (!PageAnon(old_page)) {
                dec_mm_counter(mm, mm_counter_file(old_page));
                inc_mm_counter(mm, MM_ANONPAGES);
        }

        /* Clear the old pte (with flush + notify) before installing the new one. */
        flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
        ptep_clear_flush_notify(vma, addr, pvmw.pte);
        if (new_page)
                set_pte_at_notify(mm, addr, pvmw.pte,
                                  mk_pte(new_page, vma->vm_page_prot));

        page_remove_rmap(old_page, false);
        if (!page_mapped(old_page))
                try_to_free_swap(old_page);
        page_vma_mapped_walk_done(&pvmw);

        if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
                munlock_vma_page(old_page);
        put_page(old_page);

        err = 0;
 unlock:
        /* Both the success and the -EAGAIN path end the notifier range and unlock. */
        mmu_notifier_invalidate_range_end(&range);
        unlock_page(old_page);
        return err;
}

/**
 * is_swbp_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 *
 * Default implementation of is_swbp_insn: compares *@insn with
 * UPROBE_SWBP_INSN. It is declared __weak, so another definition can
 * replace it at link time.
 *
 * Returns true if @insn is a breakpoint instruction.
 */
bool __weak is_swbp_insn(uprobe_opcode_t *insn)
{
        return *insn == UPROBE_SWBP_INSN;
}

/**
 * is_trap_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 *
 * Default implementation of is_trap_insn: simply defers to is_swbp_insn().
 * It is declared __weak, so another definition can replace it at link time.
 *
 * Returns true if @insn is a breakpoint instruction.
 *
 * This function is needed for the case where an architecture has multiple
 * trap instructions (like powerpc).
 */
bool __weak is_trap_insn(uprobe_opcode_t *insn)
{
        return is_swbp_insn(insn);
}

/*
 * Copy @len bytes out of @page into @dst, starting at the in-page offset
 * of @vaddr (vaddr & ~PAGE_MASK). The page is mapped with kmap_atomic()
 * only for the duration of the copy.
 */
static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
        unsigned long in_page = vaddr & ~PAGE_MASK;
        void *mapped = kmap_atomic(page);

        memcpy(dst, mapped + in_page, len);
        kunmap_atomic(mapped);
}

/*
 * Copy @len bytes from @src into @page, starting at the in-page offset
 * of @vaddr (vaddr & ~PAGE_MASK). The page is mapped with kmap_atomic()
 * only for the duration of the copy.
 */
static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
{
        unsigned long in_page = vaddr & ~PAGE_MASK;
        void *mapped = kmap_atomic(page);

        memcpy(mapped + in_page, src, len);
        kunmap_atomic(mapped);
}

/*
 * Compare the opcode currently stored at @vaddr in @page with @new_opcode.
 *
 * Returns 0 when nothing needs to be written: either a breakpoint is being
 * installed and one is already there, or a breakpoint is being removed and
 * the current opcode is not a breakpoint. Returns 1 otherwise.
 */
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
{
        uprobe_opcode_t current_opcode;
        bool have_swbp;
        bool want_swbp;

        /*
         * Only UPROBE_SWBP_INSN is recognised in the current opcode. Other
         * 'trap variants' (for instance the conditional trap instructions
         * powerpc supports) are deliberately not checked: uprobes always
         * wins over any other (gdb) breakpoint, so it does not matter
         * whether the underlying instruction is such a variant.
         */
        copy_from_page(page, vaddr, &current_opcode, UPROBE_SWBP_INSN_SIZE);
        have_swbp = is_swbp_insn(&current_opcode);
        want_swbp = is_swbp_insn(new_opcode);

        /* register: already installed?  unregister: was it changed by us? */
        if (have_swbp == want_swbp)
                return 0;

        return 1;
}

static struct delayed_uprobe *
delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct delayed_uprobe *du;

        list_for_each_entry(du, &delayed_uprobe_list, list)
                if (du->uprobe == uprobe && du->mm == mm)
                        return du;
        return NULL;
}

static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct delayed_uprobe *du;

        if (delayed_uprobe_check(uprobe, mm))
                return 0;

        du  = kzalloc(sizeof(*du), GFP_KERNEL);
        if (!du)
                return -ENOMEM;

        du->uprobe = uprobe;
        du->mm = mm;
        list_add(&du->list, &delayed_uprobe_list);
        return 0;
}

/*
 * Unlink @du from its list and free it. A NULL @du triggers WARN_ON()
 * and is otherwise ignored.
 */
static void delayed_uprobe_delete(struct delayed_uprobe *du)
{
        if (!WARN_ON(!du)) {
                list_del(&du->list);
                kfree(du);
        }
}

static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct list_head *pos, *q;
        struct delayed_uprobe *du;

        if (!uprobe && !mm)
                return;

        list_for_each_safe(pos, q, &delayed_uprobe_list) {
                du = list_entry(pos, struct delayed_uprobe, list);

                if (uprobe && du->uprobe != uprobe)
                        continue;
                if (mm && du->mm != mm)
                        continue;

                delayed_uprobe_delete(du);
        }
}

static bool valid_ref_ctr_vma(struct uprobe *uprobe,
                              struct vm_area_struct *vma)
{
        unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);

        return uprobe->ref_ctr_offset &&
                vma->vm_file &&
                file_inode(vma->vm_file) == uprobe->inode &&
                (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
                vma->vm_start <= vaddr &&
                vma->vm_end > vaddr;
}

/*
 * Walk @mm's vma list and return the first vma accepted by
 * valid_ref_ctr_vma() for @uprobe, or NULL when there is none.
 */
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct vm_area_struct *vma = mm->mmap;

        while (vma && !valid_ref_ctr_vma(uprobe, vma))
                vma = vma->vm_next;

        return vma;
}

/*
 * Add the delta @d to the short-sized reference counter located at @vaddr in
 * @mm. The page is pinned with FOLL_WRITE and modified through a temporary
 * atomic kernel mapping.
 *
 * Returns 0 on success, -EINVAL for a zero @vaddr/@d or when the counter
 * would become negative, -EBUSY when no page could be pinned, or the error
 * reported by get_user_pages_remote().
 */
static int
__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
        struct vm_area_struct *vma;
        struct page *page;
        short *counter;
        void *kaddr;
        int ret;

        if (!vaddr || !d)
                return -EINVAL;

        ret = get_user_pages_remote(mm, vaddr, 1,
                        FOLL_WRITE, &page, &vma, NULL);
        /*
         * Exactly one page was requested, so "0 pages pinned" is a failure
         * as well and has to be turned into an error code.
         */
        if (unlikely(ret == 0))
                return -EBUSY;
        if (unlikely(ret < 0))
                return ret;

        kaddr = kmap_atomic(page);
        counter = kaddr + (vaddr & ~PAGE_MASK);

        if (likely(*counter + d >= 0)) {
                *counter += d;
                ret = 0;
        } else {
                pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
                        "curr val: %d, delta: %d\n", vaddr, *counter, d);
                ret = -EINVAL;
        }

        kunmap_atomic(kaddr);
        put_page(page);
        return ret;
}

/*
 * Log a warning that adjusting @uprobe's reference counter in @mm failed.
 * The sign of @d selects the "increment"/"decrement" wording.
 */
static void update_ref_ctr_warn(struct uprobe *uprobe,
                                struct mm_struct *mm, short d)
{
        const char *direction = d > 0 ? "increment" : "decrement";

        pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
                "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
                direction, uprobe->inode->i_ino,
                (unsigned long long) uprobe->offset,
                (unsigned long long) uprobe->ref_ctr_offset, mm);
}

static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
                          short d)
{
        struct vm_area_struct *rc_vma;
        unsigned long rc_vaddr;
        int ret = 0;

        rc_vma = find_ref_ctr_vma(uprobe, mm);

        if (rc_vma) {
                rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
                ret = __update_ref_ctr(mm, rc_vaddr, d);
                if (ret)
                        update_ref_ctr_warn(uprobe, mm, d);

                if (d > 0)
                        return ret;
        }

        mutex_lock(&delayed_uprobe_lock);
        if (d > 0)
                ret = delayed_uprobe_add(uprobe, mm);
        else
                delayed_uprobe_remove(uprobe, mm);
        mutex_unlock(&delayed_uprobe_lock);

        return ret;
}

/*
 * NOTE:
 * Expect the breakpoint instruction to be the smallest size instruction for
 * the architecture. If an arch has variable length instruction and the
 * breakpoint instruction is not of the smallest length instruction
 * supported by that architecture then we need to modify is_trap_at_addr and
 * uprobe_write_opcode accordingly. This would never be a problem for archs
 * that have fixed length instructions.
 *
 * uprobe_write_opcode - write the opcode at a given virtual address.
 * @auprobe: arch specific probepoint information; the enclosing struct uprobe
 *           is recovered from it with container_of().
 * @mm: the probed process address space.
 * @vaddr: the virtual address to store the opcode.
 * @opcode: opcode to be written at @vaddr.
 *
 * Writing a breakpoint opcode (is_swbp_insn() is true) is treated as
 * "register"; writing anything else is treated as "unregister".
 *
 * Called with mm->mmap_lock held for write.
 * NOTE(review): unapply_uprobe() reaches this function through
 * remove_breakpoint() while holding mmap_lock only for read -- confirm the
 * intended locking contract.
 * Return 0 (success) or a negative errno.
 */
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
                        unsigned long vaddr, uprobe_opcode_t opcode)
{
        struct uprobe *uprobe;
        struct page *old_page, *new_page;
        struct vm_area_struct *vma;
        int ret, is_register, ref_ctr_updated = 0;
        bool orig_page_huge = false;
        unsigned int gup_flags = FOLL_FORCE;

        is_register = is_swbp_insn(&opcode);
        uprobe = container_of(auprobe, struct uprobe, arch);

retry:
        if (is_register)
                gup_flags |= FOLL_SPLIT_PMD;
        /* Read the page with vaddr into memory */
        ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
                                    &old_page, &vma, NULL);
        /*
         * NOTE(review): a return of 0 pinned pages is passed back as 0
         * ("success") here, unlike __update_ref_ctr() which maps it to
         * -EBUSY -- confirm this is intended.
         */
        if (ret <= 0)
                return ret;

        /* A result <= 0 ends the operation here and is returned as is. */
        ret = verify_opcode(old_page, vaddr, &opcode);
        if (ret <= 0)
                goto put_old;

        if (WARN(!is_register && PageCompound(old_page),
                 "uprobe unregister should never work on compound page\n")) {
                ret = -EINVAL;
                goto put_old;
        }

        /* We are going to replace instruction, update ref_ctr. */
        /* ref_ctr_updated keeps a retry from adjusting the counter twice. */
        if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
                ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
                if (ret)
                        goto put_old;

                ref_ctr_updated = 1;
        }

        /* Unregister on a non-anonymous page: report success, replace nothing. */
        ret = 0;
        if (!is_register && !PageAnon(old_page))
                goto put_old;

        ret = anon_vma_prepare(vma);
        if (ret)
                goto put_old;

        ret = -ENOMEM;
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
        if (!new_page)
                goto put_old;

        /* Build the replacement: a copy of the old page with @opcode patched in. */
        __SetPageUptodate(new_page);
        copy_highpage(new_page, old_page);
        copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);

        if (!is_register) {
                struct page *orig_page;
                pgoff_t index;

                VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);

                /*
                 * Look up the page-cache page backing this address; if the
                 * restored contents are identical to it, the private copy is
                 * dropped and __replace_page() is called with a NULL new_page.
                 */
                index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
                orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
                                          index);

                if (orig_page) {
                        if (PageUptodate(orig_page) &&
                            pages_identical(new_page, orig_page)) {
                                /* let go new_page */
                                put_page(new_page);
                                new_page = NULL;

                                if (PageCompound(orig_page))
                                        orig_page_huge = true;
                        }
                        put_page(orig_page);
                }
        }

        ret = __replace_page(vma, vaddr, old_page, new_page);
        if (new_page)
                put_page(new_page);
put_old:
        put_page(old_page);

        /* Any step that reported -EAGAIN restarts from the page lookup. */
        if (unlikely(ret == -EAGAIN))
                goto retry;

        /* Revert back reference counter if instruction update failed. */
        if (ret && is_register && ref_ctr_updated)
                update_ref_ctr(uprobe, mm, -1);

        /* try collapse pmd for compound page */
        if (!ret && orig_page_huge)
                collapse_pte_mapped_thp(mm, vaddr);

        return ret;
}

/**
 * set_swbp - store breakpoint at a given address.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to insert the opcode.
 *
 * Weak default: writes UPROBE_SWBP_INSN at @vaddr in @mm through
 * uprobe_write_opcode().
 * Return 0 (success) or a negative errno.
 */
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
        uprobe_opcode_t swbp = UPROBE_SWBP_INSN;

        return uprobe_write_opcode(auprobe, mm, vaddr, swbp);
}

/**
 * set_orig_insn - Restore the original instruction.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to insert the opcode.
 *
 * Weak default: for mm @mm, write the leading opcode-sized part of the saved
 * instruction (@auprobe->insn) back at @vaddr through uprobe_write_opcode().
 * Return 0 (success) or a negative errno.
 */
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
        uprobe_opcode_t *orig = (uprobe_opcode_t *)&auprobe->insn;

        return uprobe_write_opcode(auprobe, mm, vaddr, *orig);
}

/*
 * Take one more reference on @uprobe and hand the same pointer back, so the
 * call can be used directly in a return statement.
 */
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
        refcount_inc(&uprobe->ref);
        return uprobe;
}

/*
 * Drop one reference on @uprobe. When the last reference goes away, any
 * entries still queued for it on delayed_uprobe_list are removed and the
 * uprobe is freed.
 */
static void put_uprobe(struct uprobe *uprobe)
{
        if (!refcount_dec_and_test(&uprobe->ref))
                return;

        /*
         * If the application did munmap(exec_vma) before
         * uprobe_unregister() was called, remove_breakpoint() never got the
         * chance to take this uprobe off delayed_uprobe_list. Do it here.
         */
        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(uprobe, NULL);
        mutex_unlock(&delayed_uprobe_lock);

        kfree(uprobe);
}

/*
 * Three-way comparison used to order uprobes: first by inode pointer, then
 * by offset. Returns -1, 0 or 1 when @l sorts before, equal to or after @r.
 */
static int match_uprobe(struct uprobe *l, struct uprobe *r)
{
        if (l->inode != r->inode)
                return l->inode < r->inode ? -1 : 1;

        if (l->offset != r->offset)
                return l->offset < r->offset ? -1 : 1;

        return 0;
}

/*
 * Look up the uprobe for @inode:@offset in uprobes_tree. On a hit a reference
 * is taken and the uprobe returned; otherwise NULL. No locking is done here;
 * find_uprobe() wraps this call in uprobes_treelock.
 */
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
        struct uprobe key = { .inode = inode, .offset = offset };
        struct rb_node *node;

        for (node = uprobes_tree.rb_node; node; ) {
                struct uprobe *candidate = rb_entry(node, struct uprobe, rb_node);
                int cmp = match_uprobe(&key, candidate);

                if (cmp == 0)
                        return get_uprobe(candidate);

                node = cmp < 0 ? node->rb_left : node->rb_right;
        }

        return NULL;
}

/*
 * Locked lookup of the uprobe registered for @inode:@offset.
 * Takes uprobes_treelock around __find_uprobe(); the result (NULL or a
 * uprobe with an extra reference) is passed through unchanged.
 */
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
        struct uprobe *found;

        spin_lock(&uprobes_treelock);
        found = __find_uprobe(inode, offset);
        spin_unlock(&uprobes_treelock);

        return found;
}

/*
 * Insert @uprobe into uprobes_tree unless an equal inode:offset entry exists.
 *
 * If a matching uprobe is found, a reference is taken on it and it is
 * returned. Otherwise @uprobe is linked into the tree with its refcount set
 * to 2 (access + creation) and NULL is returned.
 */
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
        struct rb_node **link = &uprobes_tree.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct uprobe *existing;
                int cmp;

                parent = *link;
                existing = rb_entry(parent, struct uprobe, rb_node);
                cmp = match_uprobe(uprobe, existing);
                if (cmp == 0)
                        return get_uprobe(existing);

                link = cmp < 0 ? &parent->rb_left : &parent->rb_right;
        }

        rb_link_node(&uprobe->rb_node, parent, link);
        rb_insert_color(&uprobe->rb_node, &uprobes_tree);
        /* get access + creation ref */
        refcount_set(&uprobe->ref, 2);

        return NULL;
}

/*
 * Locked wrapper around __insert_uprobe(); acquires uprobes_treelock.
 *
 * A matching uprobe already in the rbtree:
 *        its access refcount is incremented and it is returned.
 *
 * No matching uprobe:
 *        @uprobe is inserted with a double refcount (access + creation)
 *        and NULL is returned.
 */
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
        struct uprobe *existing;

        spin_lock(&uprobes_treelock);
        existing = __insert_uprobe(uprobe);
        spin_unlock(&uprobes_treelock);

        return existing;
}

/*
 * Warn that a registration for an existing inode:offset used a different
 * ref_ctr_offset (@uprobe) than the one already recorded (@cur_uprobe).
 */
static void
ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
{
        unsigned long long old_ref_ctr = cur_uprobe->ref_ctr_offset;
        unsigned long long new_ref_ctr = uprobe->ref_ctr_offset;

        pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
                "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
                uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
                old_ref_ctr, new_ref_ctr);
}

/*
 * Allocate a uprobe for @inode:@offset and insert it into uprobes_tree, or
 * reuse the uprobe that already exists for that pair.
 *
 * Returns the (new or existing) uprobe, NULL when the allocation fails, or
 * ERR_PTR(-EINVAL) when an existing uprobe has a different ref_ctr_offset.
 */
static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
                                   loff_t ref_ctr_offset)
{
        struct uprobe *fresh, *existing;

        fresh = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
        if (!fresh)
                return NULL;

        fresh->inode = inode;
        fresh->offset = offset;
        fresh->ref_ctr_offset = ref_ctr_offset;
        init_rwsem(&fresh->register_rwsem);
        init_rwsem(&fresh->consumer_rwsem);

        /* add to uprobes_tree, sorted on inode:offset */
        existing = insert_uprobe(fresh);
        if (!existing)
                return fresh;

        /*
         * A uprobe already exists for this inode:offset combination, so the
         * fresh one was never linked into the tree and is simply freed.
         */
        if (existing->ref_ctr_offset != fresh->ref_ctr_offset) {
                ref_ctr_mismatch_warn(existing, fresh);
                put_uprobe(existing);
                kfree(fresh);
                return ERR_PTR(-EINVAL);
        }

        kfree(fresh);
        return existing;
}

/*
 * Push @uc onto the head of @uprobe's singly linked consumer list.
 * consumer_rwsem is held for write while the list is modified.
 */
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        down_write(&uprobe->consumer_rwsem);
        uc->next = uprobe->consumers;
        uprobe->consumers = uc;
        up_write(&uprobe->consumer_rwsem);
}

/*
 * Unlink the consumer @uc from @uprobe's consumer list, under
 * consumer_rwsem held for write.
 * Returns true if @uc was on the list and has been removed, false otherwise.
 */
static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        struct uprobe_consumer **link;
        bool found = false;

        down_write(&uprobe->consumer_rwsem);

        /* Advance to the link that points at @uc, or to the list's end. */
        link = &uprobe->consumers;
        while (*link && *link != uc)
                link = &(*link)->next;

        if (*link) {
                *link = uc->next;
                found = true;
        }

        up_write(&uprobe->consumer_rwsem);

        return found;
}

/*
 * Copy @nbytes bytes found at file offset @offset of @mapping into @insn.
 * The caller (copy_insn()) keeps each request within a single page.
 * Returns 0 on success or the error from reading the page.
 */
static int __copy_insn(struct address_space *mapping, struct file *filp,
                        void *insn, int nbytes, loff_t offset)
{
        pgoff_t index = offset >> PAGE_SHIFT;
        struct page *page;

        /*
         * Make sure the page holding the original instruction is populated
         * and in the page cache. A NULL ->readpage means this must be a
         * shmem_mapping(), see uprobe_register().
         */
        page = mapping->a_ops->readpage ?
                read_mapping_page(mapping, index, filp) :
                shmem_read_mapping_page(mapping, index);
        if (IS_ERR(page))
                return PTR_ERR(page);

        copy_from_page(page, offset, insn, nbytes);
        put_page(page);

        return 0;
}

/*
 * Fill uprobe->arch.insn from the probed file, starting at uprobe->offset and
 * reading page by page. Only bytes that lie before the end of the file are
 * copied.
 *
 * Returns 0 once at least one chunk was copied without error, -EIO if
 * nothing could be read, or the error from __copy_insn().
 */
static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
        struct address_space *mapping = uprobe->inode->i_mapping;
        void *dst = &uprobe->arch.insn;
        int remaining = sizeof(uprobe->arch.insn);
        loff_t pos = uprobe->offset;
        int err = -EIO;

        while (pos < i_size_read(uprobe->inode)) {
                /* Never read across a page boundary in one go. */
                int chunk = min_t(int, remaining, PAGE_SIZE - (pos & ~PAGE_MASK));

                err = __copy_insn(mapping, filp, dst, chunk, pos);
                if (err)
                        break;

                dst += chunk;
                pos += chunk;
                remaining -= chunk;
                if (!remaining)
                        break;
        }

        return err;
}

/*
 * One-time preparation of @uprobe: copy the original instruction from @file,
 * refuse to probe an instruction that already is a trap, and let the
 * architecture analyze it. Success is recorded by setting UPROBE_COPY_INSN,
 * so later calls return early.
 *
 * Returns 0 on success (or when already prepared), -ENOTSUPP when the probed
 * instruction is a trap instruction, or the error from copy_insn() /
 * arch_uprobe_analyze_insn().
 */
static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
                                struct mm_struct *mm, unsigned long vaddr)
{
        int ret = 0;

        /* Fast path without taking the semaphore. */
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                return ret;

        /* TODO: move this into _register, until then we abuse this sem. */
        down_write(&uprobe->consumer_rwsem);
        /* Re-check under the semaphore: another caller may have finished first. */
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                goto out;

        ret = copy_insn(uprobe, file);
        if (ret)
                goto out;

        ret = -ENOTSUPP;
        if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
                goto out;

        ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
        if (ret)
                goto out;

        /* Publish the prepared arch data before the flag becomes visible. */
        smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
        set_bit(UPROBE_COPY_INSN, &uprobe->flags);

 out:
        up_write(&uprobe->consumer_rwsem);

        return ret;
}

static inline bool consumer_filter(struct uprobe_consumer *uc,
                                   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
        return !uc->filter || uc->filter(uc, ctx, mm);
}

static bool filter_chain(struct uprobe *uprobe,
                         enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
        struct uprobe_consumer *uc;
        bool ret = false;

        down_read(&uprobe->consumer_rwsem);
        for (uc = uprobe->consumers; uc; uc = uc->next) {
                ret = consumer_filter(uc, ctx, mm);
                if (ret)
                        break;
        }
        up_read(&uprobe->consumer_rwsem);

        return ret;
}

/*
 * Prepare @uprobe if necessary and write the breakpoint at @vaddr in @mm.
 * MMF_HAS_UPROBES is set before the write and rolled back if this call set
 * it and the write failed; MMF_RECALC_UPROBES is cleared on success.
 *
 * Returns 0 or the error from prepare_uprobe() / set_swbp().
 */
static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long vaddr)
{
        bool first_uprobe;
        int ret;

        ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
        if (ret)
                return ret;

        /*
         * MMF_HAS_UPROBES has to be visible to uprobe_pre_sstep_notifier()
         * in advance: the task may hit this breakpoint immediately after
         * __replace_page().
         */
        first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
        if (first_uprobe)
                set_bit(MMF_HAS_UPROBES, &mm->flags);

        ret = set_swbp(&uprobe->arch, mm, vaddr);
        if (ret) {
                /* Undo the flag only if this call was the one that set it. */
                if (first_uprobe)
                        clear_bit(MMF_HAS_UPROBES, &mm->flags);
                return ret;
        }

        clear_bit(MMF_RECALC_UPROBES, &mm->flags);
        return 0;
}

/*
 * Mark @mm with MMF_RECALC_UPROBES, then restore the original instruction at
 * @vaddr through set_orig_insn(). Returns set_orig_insn()'s result.
 */
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
        set_bit(MMF_RECALC_UPROBES, &mm->flags);
        return set_orig_insn(&uprobe->arch, mm, vaddr);
}

/*
 * True while @uprobe's rb_node is not marked empty. delete_uprobe() clears
 * the node after erasing it from uprobes_tree, which makes this return false.
 */
static inline bool uprobe_is_active(struct uprobe *uprobe)
{
        return !RB_EMPTY_NODE(&uprobe->rb_node);
}
/*
 * Remove @uprobe from uprobes_tree, mark it inactive and drop one reference.
 * Warns and does nothing if the uprobe is not active any more.
 *
 * There could be threads that have already hit the breakpoint. They
 * will recheck the current insn and restart if find_uprobe() fails.
 * See find_active_uprobe().
 */
static void delete_uprobe(struct uprobe *uprobe)
{
        if (WARN_ON(!uprobe_is_active(uprobe)))
                return;

        /* Only the tree manipulation itself runs under uprobes_treelock. */
        spin_lock(&uprobes_treelock);
        rb_erase(&uprobe->rb_node, &uprobes_tree);
        spin_unlock(&uprobes_treelock);
        RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
        put_uprobe(uprobe);
}

/*
 * One (mm, address) location produced by build_map_info() for a probed file
 * offset; entries are chained into a singly linked list.
 */
struct map_info {
        struct map_info *next;  /* next entry, NULL at the end of the list */
        struct mm_struct *mm;   /* address space; build_map_info() took a reference */
        unsigned long vaddr;    /* address of the probed offset inside @mm */
};

/*
 * Free @info and return the entry that followed it, so a list can be torn
 * down with "info = free_map_info(info)". The mm pointer is not touched here.
 */
static inline struct map_info *free_map_info(struct map_info *info)
{
        struct map_info *next = info->next;
        kfree(info);
        return next;
}

/*
 * Build the list of (mm, vaddr) locations at which file offset @offset of
 * @mapping is currently mapped by a vma that valid_vma() accepts.
 *
 * Nodes cannot be allocated with GFP_KERNEL while i_mmap is locked, so the
 * walk is done in passes: a pass consumes the nodes preallocated on "prev"
 * and counts in "more" how many vmas it could not record. If any were missed
 * the lock is dropped, the mm references taken so far are released, the
 * missing nodes are allocated with GFP_KERNEL and the walk starts over.
 *
 * Returns the list (each entry holding an mm reference obtained with
 * mmget_not_zero()), NULL if there is nothing to report, or
 * ERR_PTR(-ENOMEM) if a node could not be allocated.
 */
static struct map_info *
build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
        unsigned long pgoff = offset >> PAGE_SHIFT;
        struct vm_area_struct *vma;
        struct map_info *curr = NULL;   /* result list being built */
        struct map_info *prev = NULL;   /* pool of preallocated, unused nodes */
        struct map_info *info;
        int more = 0;                   /* vmas skipped for lack of a node */

 again:
        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                if (!valid_vma(vma, is_register))
                        continue;

                if (!prev && !more) {
                        /*
                         * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
                         * reclaim. This is optimistic, no harm done if it fails.
                         */
                        prev = kmalloc(sizeof(struct map_info),
                                        GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
                        if (prev)
                                prev->next = NULL;
                }
                if (!prev) {
                        /* No node available: remember to allocate one later. */
                        more++;
                        continue;
                }

                /* Skip address spaces whose user count already dropped to zero. */
                if (!mmget_not_zero(vma->vm_mm))
                        continue;

                /* Move one node from the pool to the head of the result list. */
                info = prev;
                prev = prev->next;
                info->next = curr;
                curr = info;

                info->mm = vma->vm_mm;
                info->vaddr = offset_to_vaddr(vma, offset);
        }
        i_mmap_unlock_read(mapping);

        if (!more)
                goto out;

        /*
         * Some vmas were missed: recycle the nodes used so far into the pool
         * and drop their mm references before retrying from scratch.
         */
        prev = curr;
        while (curr) {
                mmput(curr->mm);
                curr = curr->next;
        }

        /* Top the pool up with the number of nodes that were missing. */
        do {
                info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
                if (!info) {
                        curr = ERR_PTR(-ENOMEM);
                        goto out;
                }
                info->next = prev;
                prev = info;
        } while (--more);

        goto again;
 out:
        /* Free whatever is left in the pool; "curr" is the return value. */
        while (prev)
                prev = free_map_info(prev);
        return curr;
}

/*
 * Install or remove @uprobe's breakpoint in every address space that
 * currently maps the probed file offset.
 *
 * @uprobe: the probe to apply.
 * @new: the consumer being registered; NULL selects the unregister path.
 *
 * Register: only @new's filter is consulted. After the first failure the
 * remaining locations are skipped (their mm references are still dropped)
 * and that error is returned.
 *
 * Unregister: every location is visited; the breakpoint is removed where the
 * mm has MMF_HAS_UPROBES set and no remaining consumer wants it. The first
 * error encountered is returned. Errors are deliberately not OR-ed together,
 * because combining two negative errno values bitwise yields an unrelated
 * errno.
 *
 * Runs with dup_mmap_sem held for write; each mm is processed with its
 * mmap_lock held for write.
 *
 * Returns 0 on success or a negative errno.
 */
static int
register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
        bool is_register = !!new;
        struct map_info *info;
        int err = 0;

        percpu_down_write(&dup_mmap_sem);
        info = build_map_info(uprobe->inode->i_mapping,
                                        uprobe->offset, is_register);
        if (IS_ERR(info)) {
                err = PTR_ERR(info);
                goto out;
        }

        while (info) {
                struct mm_struct *mm = info->mm;
                struct vm_area_struct *vma;

                /* A failed registration stops installing, but still cleans up. */
                if (err && is_register)
                        goto free;

                mmap_write_lock(mm);
                /*
                 * The mapping may have changed since build_map_info() ran:
                 * re-validate that @vaddr still maps the probed offset of
                 * the probed inode.
                 */
                vma = find_vma(mm, info->vaddr);
                if (!vma || !valid_vma(vma, is_register) ||
                    file_inode(vma->vm_file) != uprobe->inode)
                        goto unlock;

                if (vma->vm_start > info->vaddr ||
                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
                        goto unlock;

                if (is_register) {
                        /* consult only the "caller", new consumer. */
                        if (consumer_filter(new,
                                        UPROBE_FILTER_REGISTER, mm))
                                err = install_breakpoint(uprobe, mm, vma, info->vaddr);
                } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
                        if (!filter_chain(uprobe,
                                        UPROBE_FILTER_UNREGISTER, mm)) {
                                int ret = remove_breakpoint(uprobe, mm, info->vaddr);

                                /* Report the first failure as a valid errno. */
                                if (ret && !err)
                                        err = ret;
                        }
                }

 unlock:
                mmap_write_unlock(mm);
 free:
                mmput(mm);
                info = free_map_info(info);
        }
 out:
        percpu_up_write(&dup_mmap_sem);
        return err;
}

/*
 * Detach the consumer @uc from @uprobe and remove the breakpoints that are
 * no longer wanted. If that succeeded and no consumer is left, the uprobe
 * itself is deleted. Warns and returns if @uc was not attached.
 */
static void
__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        int err;

        if (WARN_ON(!consumer_del(uprobe, uc)))
                return;

        err = register_for_each_vma(uprobe, NULL);
        /* TODO : cant unregister? schedule a worker thread */
        if (err || uprobe->consumers)
                return;

        delete_uprobe(uprobe);
}

/*
 * uprobe_unregister - unregister an already registered probe.
 * @inode: the file in which the probe has to be removed.
 * @offset: offset from the start of the file.
 * @uc: identify which probe if multiple probes are colocated.
 *
 * Warns and returns if no uprobe exists for @inode:@offset. Otherwise the
 * consumer is removed under register_rwsem held for write, and the reference
 * taken by find_uprobe() is dropped again.
 */
void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
        struct uprobe *uprobe;

        uprobe = find_uprobe(inode, offset);
        if (WARN_ON(!uprobe))
                return;

        down_write(&uprobe->register_rwsem);
        __uprobe_unregister(uprobe, uc);
        up_write(&uprobe->register_rwsem);
        put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);

/*
 * __uprobe_register - register a probe
 * @inode: the file in which the probe has to be placed.
 * @offset: offset from the start of the file.
 * @ref_ctr_offset: file offset of the reference counter, 0 if there is none.
 * @uc: information on how to handle the probe.
 *
 * Apart from the access refcount, __uprobe_register() takes a creation
 * refcount (through alloc_uprobe) if and only if this @uprobe is getting
 * inserted into the rbtree (i.e. first consumer for a @inode:@offset
 * tuple).  Creation refcount stops uprobe_unregister from freeing the
 * @uprobe even before the register operation is complete. Creation
 * refcount is released when the last @uc for the @uprobe
 * unregisters. Caller of __uprobe_register() is required to keep @inode
 * (and the containing mount) referenced.
 *
 * Return errno if it cannot successfully install probes
 * else return 0 (success)
 */
static int __uprobe_register(struct inode *inode, loff_t offset,
                             loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
        struct uprobe *uprobe;
        int ret;

        /* Uprobe must have at least one set consumer */
        if (!uc->handler && !uc->ret_handler)
                return -EINVAL;

        /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
        if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
                return -EIO;
        /* Racy, just to catch the obvious mistakes */
        if (offset > i_size_read(inode))
                return -EINVAL;

        /*
         * This ensures that copy_from_page(), copy_to_page() and
         * __update_ref_ctr() can't cross page boundary.
         */
        if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
                return -EINVAL;
        if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
                return -EINVAL;

 retry:
        /* NULL: allocation failed; ERR_PTR: ref_ctr_offset mismatch. */
        uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
        if (!uprobe)
                return -ENOMEM;
        if (IS_ERR(uprobe))
                return PTR_ERR(uprobe);

        /*
         * We can race with uprobe_unregister()->delete_uprobe().
         * Check uprobe_is_active() and retry if it is false.
         */
        down_write(&uprobe->register_rwsem);
        ret = -EAGAIN;
        if (likely(uprobe_is_active(uprobe))) {
                consumer_add(uprobe, uc);
                ret = register_for_each_vma(uprobe, uc);
                /* Roll back a partial installation. */
                if (ret)
                        __uprobe_unregister(uprobe, uc);
        }
        up_write(&uprobe->register_rwsem);
        /* Drop the access reference obtained from alloc_uprobe(). */
        put_uprobe(uprobe);

        if (unlikely(ret == -EAGAIN))
                goto retry;
        return ret;
}

/*
 * uprobe_register - register a probe without a reference counter.
 * @inode: the file in which the probe has to be placed.
 * @offset: offset from the start of the file.
 * @uc: consumer to attach to the probe.
 *
 * Same as __uprobe_register() with a ref_ctr_offset of 0.
 * Returns 0 on success or a negative errno.
 */
int uprobe_register(struct inode *inode, loff_t offset,
                    struct uprobe_consumer *uc)
{
        return __uprobe_register(inode, offset, 0, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register);

/*
 * uprobe_register_refctr - register a probe that has a reference counter.
 * @inode: the file in which the probe has to be placed.
 * @offset: offset from the start of the file.
 * @ref_ctr_offset: file offset of the reference counter.
 * @uc: consumer to attach to the probe.
 *
 * Passes all arguments straight to __uprobe_register().
 * Returns 0 on success or a negative errno.
 */
int uprobe_register_refctr(struct inode *inode, loff_t offset,
                           loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
        return __uprobe_register(inode, offset, ref_ctr_offset, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register_refctr);

/*
 * uprobe_apply - add or remove the breakpoints of an already registered probe.
 * @inode: the file the probe was registered on.
 * @offset: offset from the start of the file.
 * @uc: consumer which wants to add more or remove some breakpoints
 * @add: add or remove the breakpoints
 *
 * Returns -ENOENT if there is no uprobe for @inode:@offset or if @uc is not
 * one of its consumers; otherwise the result of register_for_each_vma().
 */
int uprobe_apply(struct inode *inode, loff_t offset,
                        struct uprobe_consumer *uc, bool add)
{
        struct uprobe_consumer *con;
        struct uprobe *uprobe;
        int ret = -ENOENT;

        uprobe = find_uprobe(inode, offset);
        if (WARN_ON(!uprobe))
                return ret;

        down_write(&uprobe->register_rwsem);

        /* Only act when @uc really is attached to this uprobe. */
        con = uprobe->consumers;
        while (con && con != uc)
                con = con->next;

        if (con)
                ret = register_for_each_vma(uprobe, add ? uc : NULL);

        up_write(&uprobe->register_rwsem);
        put_uprobe(uprobe);

        return ret;
}

/*
 * Restore the original instruction of @uprobe in every vma of @mm that maps
 * the probed offset of the probed inode.
 *
 * All matching vmas are visited even after a failure. The first error is
 * returned; errors are deliberately not OR-ed together, because combining
 * two negative errno values bitwise yields an unrelated errno.
 *
 * Takes mm->mmap_lock for read around the walk.
 * Returns 0 on success or a negative errno.
 */
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        int err = 0;

        mmap_read_lock(mm);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                unsigned long vaddr;
                loff_t offset;
                int ret;

                if (!valid_vma(vma, false) ||
                    file_inode(vma->vm_file) != uprobe->inode)
                        continue;

                /* Skip vmas whose file range does not cover the probed offset. */
                offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
                if (uprobe->offset < offset ||
                    uprobe->offset >= offset + vma->vm_end - vma->vm_start)
                        continue;

                vaddr = offset_to_vaddr(vma, uprobe->offset);
                ret = remove_breakpoint(uprobe, mm, vaddr);
                /* Report the first failure as a valid errno. */
                if (ret && !err)
                        err = ret;
        }
        mmap_read_unlock(mm);

        return err;
}

/*
 * Descend uprobes_tree looking for any uprobe of @inode whose offset lies in
 * [@min, @max]. Returns the first such node met on the way down (not
 * necessarily the lowest offset), or NULL if there is none. No locking is
 * done here; callers in this file hold uprobes_treelock.
 */
static struct rb_node *
find_node_in_range(struct inode *inode, loff_t min, loff_t max)
{
        struct rb_node *n = uprobes_tree.rb_node;

        while (n) {
                struct uprobe *u = rb_entry(n, struct uprobe, rb_node);

                if (inode < u->inode ||
                    (inode == u->inode && max < u->offset))
                        n = n->rb_left;
                else if (inode > u->inode || min > u->offset)
                        n = n->rb_right;
                else
                        break;
        }

        return n;
}

/*
 * For a given range in vma, build a list of probes that need to be inserted.
 */
static void build_probe_list(struct inode *inode,
                                struct vm_area_struct *vma,
                                unsigned long start, unsigned long end,
                                struct list_head *head)
{
        loff_t min, max;
        struct rb_node *n, *t;
        struct uprobe *u;

        INIT_LIST_HEAD(head);
        min = vaddr_to_offset(vma, start);
        max = min + (end - start) - 1;

        spin_lock(&uprobes_treelock);
        n = find_node_in_range(inode, min, max);
        if (n) {
                for (t = n; t; t = rb_prev(t)) {
                        u = rb_entry(t, struct uprobe, rb_node);
                        if (u->inode != inode || u->offset < min)
                                break;
                        list_add(&u->pending_list, head);
                        get_uprobe(u);
                }
                for (t = n; (t = rb_next(t)); ) {
                        u = rb_entry(t, struct uprobe, rb_node);
                        if (u->inode != inode || u->offset > max)
                                break;
                        list_add(&u->pending_list, head);
                        get_uprobe(u);
                }
        }
        spin_unlock(&uprobes_treelock);
}

/* @vma contains reference counter, not the probed instruction. */
static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
{
        struct list_head *pos, *q;
        struct delayed_uprobe *du;
        unsigned long vaddr;
        int ret = 0, err = 0;

        mutex_lock(&delayed_uprobe_lock);
        list_for_each_safe(pos, q, &delayed_uprobe_list) {
                du = list_entry(pos, struct delayed_uprobe, list);

                if (du->mm != vma->vm_mm ||
                    !valid_ref_ctr_vma(du->uprobe, vma))
                        continue;

                vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
                ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
                if (ret) {
                        update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
                        if (!err)
                                err = ret;
                }
                delayed_uprobe_delete(du);
        }
        mutex_unlock(&delayed_uprobe_lock);
        return err;
}

/*
 * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
 *
 * Currently we ignore all errors and always return 0, the callers
 * can't handle the failure anyway.
 */
int uprobe_mmap(struct vm_area_struct *vma)
{
        struct list_head tmp_list;
        struct uprobe *uprobe, *u;
        struct inode *inode;

        if (no_uprobe_events())
                return 0;

        if (vma->vm_file &&
            (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
            test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
                delayed_ref_ctr_inc(vma);

        if (!valid_vma(vma, true))
                return 0;

        inode = file_inode(vma->vm_file);
        if (!inode)
                return 0;

        mutex_lock(uprobes_mmap_hash(inode));
        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
        /*
         * We can race with uprobe_unregister(), this uprobe can be already
         * removed. But in this case filter_chain() must return false, all
         * consumers have gone away.
         */
        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
                if (!fatal_signal_pending(current) &&
                    filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
                        install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
                }
                put_uprobe(uprobe);
        }
        mutex_unlock(uprobes_mmap_hash(inode));

        return 0;
}

/*
 * Tell whether the uprobes tree holds a node for @vma's inode whose file
 * offset falls inside the range mapped by [@start, @end).
 *
 * The tree is only consulted under uprobes_treelock; the lock is dropped
 * before returning, so the answer is a snapshot.
 */
static bool
vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
        struct inode *inode = file_inode(vma->vm_file);
        loff_t first = vaddr_to_offset(vma, start);
        /* Inclusive upper bound of the offset range. */
        loff_t last = first + (end - start) - 1;
        struct rb_node *node;

        spin_lock(&uprobes_treelock);
        node = find_node_in_range(inode, first, last);
        spin_unlock(&uprobes_treelock);

        return node != NULL;
}

/*
 * Called in context of a munmap of a vma.
 *
 * If the unmapped range [@start, @end) may contain uprobes, flag the mm
 * with MMF_RECALC_UPROBES.
 */
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
        if (no_uprobe_events())
                return;
        if (!valid_vma(vma, false))
                return;

        /* called by mmput() ? */
        if (!atomic_read(&vma->vm_mm->mm_users))
                return;

        /* Nothing to do without uprobes, or if a recalc is already pending. */
        if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
                return;
        if (test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
                return;

        if (vma_has_uprobes(vma, start, end))
                set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}

/* Slot allocation for XOL */

/*
 * Install @area as the XOL special mapping of @mm and publish it in
 * mm->uprobes_state.xol_area.  Runs with mmap_lock held for writing.
 *
 * Returns 0 on success, -EINTR if taking the lock was interrupted,
 * -EALREADY if @mm already has an xol_area, or the error reported by
 * get_unmapped_area() / _install_special_mapping().
 */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
        struct vm_area_struct *vma;
        int ret = -EALREADY;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* An area is already installed for this mm. */
        if (mm->uprobes_state.xol_area)
                goto unlock;

        if (!area->vaddr) {
                /* Try to map as high as possible, this is only a hint. */
                area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
                                                PAGE_SIZE, 0, 0);
                if (IS_ERR_VALUE(area->vaddr)) {
                        ret = area->vaddr;
                        goto unlock;
                }
        }

        vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
                                VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
                                &area->xol_mapping);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto unlock;
        }

        /* pairs with get_xol_area() */
        smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
        ret = 0;
 unlock:
        mmap_write_unlock(mm);

        return ret;
}

static struct xol_area *__create_xol_area(unsigned long vaddr)
{
        struct mm_struct *mm = current->mm;
        uprobe_opcode_t insn = UPROBE_SWBP_INSN;
        struct xol_area *area;

        area = kmalloc(sizeof(*area), GFP_KERNEL);
        if (unlikely(!area))
                goto out;

        area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
                               GFP_KERNEL);
        if (!area->bitmap)
                goto free_area;

        area->xol_mapping.name = "[uprobes]";
        area->xol_mapping.fault = NULL;
        area->xol_mapping.pages = area->pages;
        area->pages[0] = alloc_page(GFP_HIGHUSER);
        if (!area->pages[0])
                goto free_bitmap;
        area->pages[1] = NULL;

        area->vaddr = vaddr;
        init_waitqueue_head(&area->wq);
        /* Reserve the 1st slot for get_trampoline_vaddr() */
        set_bit(0, area->bitmap);
        atomic_set(&area->slot_count, 1);
        arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);

        if (!xol_add_vma(mm, area))
                return area;

        __free_page(area->pages[0]);
 free_bitmap:
        kfree(area->bitmap);
 free_area:
        kfree(area);
 out:
        return NULL;
}

/*
 * get_xol_area - Allocate process's xol_area if necessary.
 * This area will be used for storing instructions for execution out of line.
 *
 * The result of __create_xol_area() is not used directly; the pointer
 * published in mm->uprobes_state.xol_area is re-read afterwards.
 *
 * Returns the allocated area or NULL.
 */
static struct xol_area *get_xol_area(void)
{
        struct mm_struct *mm = current->mm;

        if (!mm->uprobes_state.xol_area)
                __create_xol_area(0);

        /* Pairs with xol_add_vma() smp_store_release() */
        return READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
}

/*
 * uprobe_clear_state - Free the area allocated for slots.
 *
 * Also removes the delayed uprobes recorded for @mm, under
 * delayed_uprobe_lock.
 */
void uprobe_clear_state(struct mm_struct *mm)
{
        struct xol_area *area = mm->uprobes_state.xol_area;

        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(NULL, mm);
        mutex_unlock(&delayed_uprobe_lock);

        if (area) {
                put_page(area->pages[0]);
                kfree(area->bitmap);
                kfree(area);
        }
}

/* Take dup_mmap_sem for reading; uprobe_end_dup_mmap() releases it. */
void uprobe_start_dup_mmap(void)
{
        percpu_down_read(&dup_mmap_sem);
}

/* Release the read side of dup_mmap_sem taken by uprobe_start_dup_mmap(). */
void uprobe_end_dup_mmap(void)
{
        percpu_up_read(&dup_mmap_sem);
}

/*
 * Propagate the uprobes state flags from @oldmm to @newmm: when @oldmm has
 * MMF_HAS_UPROBES, @newmm gets both MMF_HAS_UPROBES and MMF_RECALC_UPROBES.
 */
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
        if (!test_bit(MMF_HAS_UPROBES, &oldmm->flags))
                return;

        set_bit(MMF_HAS_UPROBES, &newmm->flags);
        /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
        set_bit(MMF_RECALC_UPROBES, &newmm->flags);
}

/*
 * Claim a free slot in @area and return its user-space address.
 *
 * Searches the bitmap for a clear bit and tries to set it atomically.  If
 * the bit was taken in the meantime the search is repeated; if no bit is
 * free, wait on area->wq until slot_count drops below UINSNS_PER_PAGE and
 * then search again.  The function only returns once a slot is claimed.
 */
static unsigned long xol_take_insn_slot(struct xol_area *area)
{
        unsigned long slot_addr;
        int slot_nr;

        for (;;) {
                slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
                if (slot_nr >= UINSNS_PER_PAGE) {
                        /* No free bit: wait for a slot to be released. */
                        wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
                        continue;
                }

                /* The bit may have been taken since the search; retry then. */
                if (!test_and_set_bit(slot_nr, area->bitmap))
                        break;
        }

        slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
        atomic_inc(&area->slot_count);

        return slot_addr;
}

/*
 * xol_get_insn_slot - allocate a slot for xol.
 *
 * Obtains the xol area, takes a slot in it and copies @uprobe's ixol
 * instruction bytes into that slot.
 *
 * Returns the allocated slot address or 0.
 */
static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
        struct xol_area *area = get_xol_area();
        unsigned long slot;

        if (!area)
                return 0;

        slot = xol_take_insn_slot(area);
        if (unlikely(!slot))
                return 0;

        arch_uprobe_copy_ixol(area->pages[0], slot,
                              &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));

        return slot;
}

/*
 * xol_free_insn_slot - If slot was earlier allocated by
 * @xol_get_insn_slot(), make the slot available for
 * subsequent requests.
 *
 * Does nothing when @tsk has no mm, no xol area, no utask or no recorded
 * slot address, or when that address does not lie inside the XOL page.
 */
static void xol_free_insn_slot(struct task_struct *tsk)
{
        struct xol_area *area;
        unsigned long slot_addr, vma_end, offset;
        int slot_nr;

        if (!tsk->mm)
                return;
        if (!tsk->mm->uprobes_state.xol_area)
                return;
        if (!tsk->utask)
                return;

        slot_addr = tsk->utask->xol_vaddr;
        if (unlikely(!slot_addr))
                return;

        area = tsk->mm->uprobes_state.xol_area;
        vma_end = area->vaddr + PAGE_SIZE;
        /* Address outside [area->vaddr, vma_end): not one of our slots. */
        if (slot_addr < area->vaddr || slot_addr >= vma_end)
                return;

        offset = slot_addr - area->vaddr;
        slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
        if (slot_nr >= UINSNS_PER_PAGE)
                return;

        clear_bit(slot_nr, area->bitmap);
        atomic_dec(&area->slot_count);
        smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
        if (waitqueue_active(&area->wq))
                wake_up(&area->wq);

        tsk->utask->xol_vaddr = 0;
}

/*
 * Default (__weak) way of writing an instruction into the XOL page:
 * copy @len bytes from @src into @page at @vaddr with copy_to_page(),
 * then flush the data cache for that page.
 */
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
                                  void *src, unsigned long len)
{
        /* Initialize the slot */
        copy_to_page(page, vaddr, src, len);

        /*
         * We probably need flush_icache_user_page() but it needs vma.
         * This should work on most of architectures by default. If
         * architecture needs to do something different it can define
         * its own version of the function.
         */
        flush_dcache_page(page);
}

/**
 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
 * @regs: Reflects the saved state of the task after it has hit a breakpoint
 * instruction.
 *
 * This __weak default subtracts UPROBE_SWBP_INSN_SIZE from the instruction
 * pointer saved in @regs.
 *
 * Return: the address of the breakpoint instruction.
 */
unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
{
        return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}

/*
 * Report the trap address for the current task.  While current has an
 * active uprobe (utask->active_uprobe is set) this is the recorded
 * utask->vaddr; otherwise it is the instruction pointer saved in @regs.
 */
unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        if (unlikely(utask && utask->active_uprobe))
                return utask->vaddr;

        return instruction_pointer(regs);
}

/*
 * Drop the uprobe reference held by @ri and free @ri.
 *
 * Returns the instance that followed @ri in the list (possibly NULL).
 */
static struct return_instance *free_ret_instance(struct return_instance *ri)
{
        struct return_instance *successor;

        /* Save the link first; @ri must not be touched after kfree(). */
        successor = ri->next;

        put_uprobe(ri->uprobe);
        kfree(ri);

        return successor;
}

/*
 * Called with no locks held.
 * Called in context of an exiting or an exec-ing thread.
 *
 * Releases everything hanging off @t->utask: the active uprobe reference,
 * all pending return instances, the XOL slot and the utask itself.
 */
void uprobe_free_utask(struct task_struct *t)
{
        struct uprobe_task *utask = t->utask;
        struct return_instance *ri;

        if (!utask)
                return;

        if (utask->active_uprobe)
                put_uprobe(utask->active_uprobe);

        /* free_ret_instance() returns the next entry, so this walks the list. */
        for (ri = utask->return_instances; ri; )
                ri = free_ret_instance(ri);

        xol_free_insn_slot(t);
        kfree(utask);
        t->utask = NULL;
}

/*
 * Allocate a uprobe_task object for the task if necessary.
 * Called when the thread hits a breakpoint.
 *
 * Returns:
 * - pointer to current's uprobe_task (existing or newly zero-allocated)
 * - NULL if the allocation failed
 */
static struct uprobe_task *get_utask(void)
{
        if (current->utask)
                return current->utask;

        current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);

        return current->utask;
}

/*
 * Give @t a fresh uprobe_task carrying a copy of @o_utask's list of
 * return instances, in the same order.  Each copy takes its own reference
 * on the uprobe, and depth counts the copied entries.
 *
 * Returns 0 on success or -ENOMEM.  If an instance allocation fails,
 * @t->utask is left pointing at the partially filled utask.
 */
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
        struct return_instance **tail, *src, *copy;
        struct uprobe_task *n_utask;

        n_utask = kzalloc(sizeof(*n_utask), GFP_KERNEL);
        if (!n_utask)
                return -ENOMEM;
        t->utask = n_utask;

        /* @tail always points at the link where the next copy is attached. */
        tail = &n_utask->return_instances;
        src = o_utask->return_instances;
        while (src) {
                copy = kmalloc(sizeof(*copy), GFP_KERNEL);
                if (!copy)
                        return -ENOMEM;

                *copy = *src;
                get_uprobe(copy->uprobe);
                copy->next = NULL;

                *tail = copy;
                tail = &copy->next;
                n_utask->depth++;

                src = src->next;
        }

        return 0;
}

/*
 * Log a "failed to @msg" warning tagged with the comm and pid of the
 * current task.
 *
 * NOTE(review): @t is not used; the message always identifies current,
 * even when a caller passes a different task - confirm this is intended.
 */
static void uprobe_warn(struct task_struct *t, const char *msg)
{
        pr_warn("uprobe: %s:%d failed to %s\n",
                        current->comm, current->pid, msg);
}

/*
 * Task-work callback queued by uprobe_copy_process(): create the xol area
 * for current at the address stored in utask->dup_xol_addr.
 *
 * Skipped entirely for an exiting task.  A failure is only warned about
 * when no fatal signal is pending.
 */
static void dup_xol_work(struct callback_head *work)
{
        if (current->flags & PF_EXITING)
                return;

        if (__create_xol_area(current->utask->dup_xol_addr))
                return;

        if (!fatal_signal_pending(current))
                uprobe_warn(current, "dup xol area");
}

/*
 * Called in context of a new clone/fork from copy_process.
 *
 * If current has pending return instances, duplicate them for @t and, when
 * @t has its own mm, queue task work so that @t creates an xol area at the
 * same address as the parent's.
 */
void uprobe_copy_process(struct task_struct *t, unsigned long flags)
{
        struct uprobe_task *utask = current->utask;
        struct mm_struct *mm = current->mm;
        struct xol_area *area;

        t->utask = NULL;

        /* Nothing to duplicate without pending return instances. */
        if (!utask)
                return;
        if (!utask->return_instances)
                return;

        /* Shared mm without CLONE_VFORK: nothing is duplicated. */
        if (mm == t->mm && !(flags & CLONE_VFORK))
                return;

        if (dup_utask(t, utask)) {
                uprobe_warn(t, "dup ret instances");
                return;
        }

        /* The task can fork() after dup_xol_work() fails */
        area = mm->uprobes_state.xol_area;
        if (!area) {
                uprobe_warn(t, "dup xol area");
                return;
        }

        /* Shared mm: the existing xol area is already in place. */
        if (mm == t->mm)
                return;

        t->utask->dup_xol_addr = area->vaddr;
        init_task_work(&t->utask->dup_xol_work, dup_xol_work);
        task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
}

/*
 * The current area->vaddr notion assumes that the trampoline address is
 * always equal to area->vaddr.
 *
 * Returns -1 in case the xol_area is not allocated.
 */
static unsigned long get_trampoline_vaddr(void)
{
        struct xol_area *area;

        /* Pairs with xol_add_vma() smp_store_release() */
        area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
        if (!area)
                return -1;

        return area->vaddr;
}

/*
 * Free return instances from the head of @utask's list for as long as
 * arch_uretprobe_is_alive() reports them dead, decrementing depth for each
 * one.  @chained selects RP_CHECK_CHAIN_CALL over RP_CHECK_CALL as the
 * check context.
 */
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
                                        struct pt_regs *regs)
{
        enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
        struct return_instance *ri = utask->return_instances;

        while (ri) {
                /* The first instance that is still alive ends the cleanup. */
                if (arch_uretprobe_is_alive(ri, ctx, regs))
                        break;

                ri = free_ret_instance(ri);
                utask->depth--;
        }

        utask->return_instances = ri;
}

/*
 * Make the function entered at @uprobe's breakpoint return through the
 * trampoline, and push a return_instance describing this call onto
 * current's utask.
 *
 * Returns without doing anything if the xol area or the utask cannot be
 * obtained, if utask->depth has reached MAX_URETPROBE_DEPTH, or if the
 * return_instance allocation fails.  The instance is discarded again if the
 * architecture code reports -1 from the hijack, or if a chained call is
 * seen with no pending return instance.
 */
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
{
        struct return_instance *ri;
        struct uprobe_task *utask;
        unsigned long orig_ret_vaddr, trampoline_vaddr;
        bool chained;

        if (!get_xol_area())
                return;

        utask = get_utask();
        if (!utask)
                return;

        if (utask->depth >= MAX_URETPROBE_DEPTH) {
                printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
                                " nestedness limit pid/tgid=%d/%d\n",
                                current->pid, current->tgid);
                return;
        }

        ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
        if (!ri)
                return;

        trampoline_vaddr = get_trampoline_vaddr();
        orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
        if (orig_ret_vaddr == -1)
                goto fail;

        /* drop the entries invalidated by longjmp() */
        /* chained: the return address we replaced was already the trampoline */
        chained = (orig_ret_vaddr == trampoline_vaddr);
        cleanup_return_instances(utask, chained, regs);

        /*
         * We don't want to keep trampoline address in stack, rather keep the
         * original return address of first caller thru all the consequent
         * instances. This also makes breakpoint unwrapping easier.
         */
        if (chained) {
                if (!utask->return_instances) {
                        /*
                         * This situation is not possible. Likely we have an
                         * attack from user-space.
                         */
                        uprobe_warn(current, "handle tail call");
                        goto fail;
                }
                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
        }

        /* The instance holds its own uprobe reference until it is freed. */
        ri->uprobe = get_uprobe(uprobe);
        ri->func = instruction_pointer(regs);
        ri->stack = user_stack_pointer(regs);
        ri->orig_ret_vaddr = orig_ret_vaddr;
        ri->chained = chained;

        /* Push the new instance at the head of the list. */
        utask->depth++;
        ri->next = utask->return_instances;
        utask->return_instances = ri;

        return;
 fail:
        kfree(ri);
}

/*
 * Prepare to single-step probed instruction out of line.
 *
 * Returns 0 with utask->active_uprobe set and the state at UTASK_SSTEP,
 * -ENOMEM if no utask or XOL slot is available, or the error reported by
 * arch_uprobe_pre_xol() (the slot is released again in that case).
 */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
        struct uprobe_task *utask = get_utask();
        unsigned long slot;
        int err;

        if (!utask)
                return -ENOMEM;

        slot = xol_get_insn_slot(uprobe);
        if (!slot)
                return -ENOMEM;

        /* Record the slot first: xol_free_insn_slot() reads it from utask. */
        utask->xol_vaddr = slot;
        utask->vaddr = bp_vaddr;

        err = arch_uprobe_pre_xol(&uprobe->arch, regs);
        if (unlikely(err)) {
                xol_free_insn_slot(current);
                return err;
        }

        utask->active_uprobe = uprobe;
        utask->state = UTASK_SSTEP;

        return 0;
}

/*
 * If we are singlestepping, then ensure this thread is not connected to
 * non-fatal signals until completion of singlestep.  When xol insn itself
 * triggers the signal,  restart the original insn even if the task is
 * already SIGKILL'ed (since coredump should report the correct ip).  This
 * is even more important if the task has a handler for SIGSEGV/etc, The
 * _same_ instruction should be repeated again after return from the signal
 * handler, and SSTEP can never finish in this case.
 *
 * Returns false when current has no active uprobe, true otherwise.
 */
bool uprobe_deny_signal(void)
{
        struct task_struct *t = current;
        struct uprobe_task *utask = t->utask;

        if (likely(!utask || !utask->active_uprobe))
                return false;

        WARN_ON_ONCE(utask->state != UTASK_SSTEP);

        if (task_sigpending(t)) {
                /* Hide the pending signal; handle_singlestep() recalculates it. */
                spin_lock_irq(&t->sighand->siglock);
                clear_tsk_thread_flag(t, TIF_SIGPENDING);
                spin_unlock_irq(&t->sighand->siglock);

                /* Fatal signal or the xol insn trapped: abort via TIF_UPROBE. */
                if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
                        utask->state = UTASK_SSTEP_TRAPPED;
                        set_tsk_thread_flag(t, TIF_UPROBE);
                }
        }

        return true;
}

/*
 * Clear MMF_HAS_UPROBES on @mm unless some valid vma of it still appears
 * to contain a uprobe.
 */
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
        struct vm_area_struct *vma = mm->mmap;

        while (vma) {
                /*
                 * The check is not strictly accurate: we can race with
                 * uprobe_unregister() and still see an already removed
                 * uprobe if delete_uprobe() was not yet called, or the
                 * uprobe can be filtered out.
                 */
                if (valid_vma(vma, false) &&
                    vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
                        return;

                vma = vma->vm_next;
        }

        clear_bit(MMF_HAS_UPROBES, &mm->flags);
}

/*
 * Check whether the instruction at @vaddr in @mm is a trap instruction.
 *
 * Returns the result of is_trap_insn() on the opcode read from @vaddr,
 * -EINVAL if @vaddr is not aligned to UPROBE_SWBP_INSN_SIZE, or the
 * negative error from get_user_pages_remote().
 */
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
        uprobe_opcode_t opcode;
        struct page *page;
        int result;

        if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
                return -EINVAL;

        /* First try a direct user read with page faults disabled. */
        pagefault_disable();
        result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
        pagefault_enable();

        /* This needs to return true for any variant of the trap insn */
        if (likely(result == 0))
                return is_trap_insn(&opcode);

        /*
         * The direct read failed, so go through the page instead.  'mm'
         * *is* current->mm, but this is treated as a 'remote' access since
         * it is essentially a kernel access to the memory.
         */
        result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
                        NULL, NULL);
        if (result < 0)
                return result;

        copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
        put_page(page);

        /* This needs to return true for any variant of the trap insn */
        return is_trap_insn(&opcode);
}

/*
 * Look up the uprobe registered for @bp_vaddr in current->mm, under
 * mmap_lock held for reading.
 *
 * When no uprobe is found, *@is_swbp is set to -EFAULT if no vma covers
 * @bp_vaddr, or otherwise to the result of is_trap_at_addr().  It is left
 * untouched when a uprobe is returned.  A pending MMF_RECALC_UPROBES is
 * consumed here if the lookup came back empty.
 */
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct uprobe *uprobe = NULL;

        mmap_read_lock(mm);
        vma = find_vma(mm, bp_vaddr);
        if (!vma || vma->vm_start > bp_vaddr) {
                /* @bp_vaddr is not inside any mapping. */
                *is_swbp = -EFAULT;
        } else {
                if (valid_vma(vma, false)) {
                        struct inode *inode = file_inode(vma->vm_file);
                        loff_t offset = vaddr_to_offset(vma, bp_vaddr);

                        uprobe = find_uprobe(inode, offset);
                }

                if (!uprobe)
                        *is_swbp = is_trap_at_addr(mm, bp_vaddr);
        }

        if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
                mmf_recalc_uprobes(mm);
        mmap_read_unlock(mm);

        return uprobe;
}

/*
 * Run the ->handler() of every consumer of @uprobe, with register_rwsem
 * held for reading.
 *
 * 'remove' starts out as UPROBE_HANDLER_REMOVE and is ANDed with each
 * consumer's return code (0 for a consumer without a handler), so it stays
 * set only if every consumer asked for removal.  If any consumer has a
 * ret_handler and removal was not requested, the return probe is prepared.
 * If removal was requested and consumers exist, the uprobe is unapplied
 * from current->mm.
 */
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
        struct uprobe_consumer *uc;
        int remove = UPROBE_HANDLER_REMOVE;
        bool need_prep = false; /* prepare return uprobe, when needed */

        down_read(&uprobe->register_rwsem);
        for (uc = uprobe->consumers; uc; uc = uc->next) {
                int rc = 0;

                if (uc->handler) {
                        rc = uc->handler(uc, regs);
                        /* Complain about bits outside UPROBE_HANDLER_MASK. */
                        WARN(rc & ~UPROBE_HANDLER_MASK,
                                "bad rc=0x%x from %ps()\n", rc, uc->handler);
                }

                if (uc->ret_handler)
                        need_prep = true;

                remove &= rc;
        }

        if (need_prep && !remove)
                prepare_uretprobe(uprobe, regs); /* put bp at return */

        if (remove && uprobe->consumers) {
                WARN_ON(!uprobe_is_active(uprobe));
                unapply_uprobe(uprobe, current->mm);
        }
        up_read(&uprobe->register_rwsem);
}

/*
 * Invoke the ->ret_handler() of every consumer of @ri's uprobe that has
 * one, passing the probed function address ri->func.  Runs with the
 * uprobe's register_rwsem held for reading.
 */
static void
handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
        struct uprobe *uprobe = ri->uprobe;
        struct uprobe_consumer *uc;

        down_read(&uprobe->register_rwsem);
        uc = uprobe->consumers;
        while (uc) {
                if (uc->ret_handler)
                        uc->ret_handler(uc, ri->func, regs);
                uc = uc->next;
        }
        up_read(&uprobe->register_rwsem);
}

/*
 * Skip over the chain that starts at @ri: advance past @ri and every
 * following instance for as long as the instance just left was marked
 * chained.  Returns the first instance after the chain (possibly NULL).
 */
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
        for (;;) {
                bool chained = ri->chained;

                ri = ri->next;        /* can't be NULL if chained */
                if (!chained)
                        return ri;
        }
}

/*
 * Handle a hit on the return trampoline: pop the pending return instances
 * of the returning chain, run their return handlers and redirect the task
 * to the original return address.
 *
 * Sends SIGILL to current if there is no utask or no pending return
 * instance.
 */
static void handle_trampoline(struct pt_regs *regs)
{
        struct uprobe_task *utask;
        struct return_instance *ri, *next;
        bool valid;

        utask = current->utask;
        if (!utask)
                goto sigill;

        ri = utask->return_instances;
        if (!ri)
                goto sigill;

        do {
                /*
                 * We should throw out the frames invalidated by longjmp().
                 * If this chain is valid, then the next one should be alive
                 * or NULL; the latter case means that nobody but ri->func
                 * could hit this trampoline on return. TODO: sigaltstack().
                 */
                next = find_next_ret_chain(ri);
                valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);

                instruction_pointer_set(regs, ri->orig_ret_vaddr);
                /* Free the whole chain; handlers run only if it is valid. */
                do {
                        if (valid)
                                handle_uretprobe_chain(ri, regs);
                        ri = free_ret_instance(ri);
                        utask->depth--;
                } while (ri != next);
        } while (!valid);

        utask->return_instances = ri;
        return;

 sigill:
        uprobe_warn(current, "handle uretprobe, sending SIGILL.");
        force_sig(SIGILL);

}

/*
 * Default (__weak) implementation: never ignore a probe hit.  handle_swbp()
 * skips the handlers and single-stepping when this returns true.
 */
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
{
        return false;
}

/*
 * Default (__weak) implementation: every return instance is reported as
 * alive, regardless of @ctx and @regs.
 */
bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
                                        struct pt_regs *regs)
{
        return true;
}

/*
 * Run handler and ask thread to singlestep.
 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
 *
 * A hit on the return trampoline is passed to handle_trampoline().  If no
 * uprobe matches the breakpoint address, the task either gets SIGTRAP (a
 * trap instruction really is there) or is restarted at that address.
 */
static void handle_swbp(struct pt_regs *regs)
{
        struct uprobe *uprobe;
        unsigned long bp_vaddr;
        int is_swbp;

        bp_vaddr = uprobe_get_swbp_addr(regs);
        if (bp_vaddr == get_trampoline_vaddr())
                return handle_trampoline(regs);

        uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
        if (!uprobe) {
                if (is_swbp > 0) {
                        /* No matching uprobe; signal SIGTRAP. */
                        force_sig(SIGTRAP);
                } else {
                        /*
                         * Either we raced with uprobe_unregister() or we can't
                         * access this memory. The latter is only possible if
                         * another thread plays with our ->mm. In both cases
                         * we can simply restart. If this vma was unmapped we
                         * can pretend this insn was not executed yet and get
                         * the (correct) SIGSEGV after restart.
                         */
                        instruction_pointer_set(regs, bp_vaddr);
                }
                return;
        }

        /* change it in advance for ->handler() and restart */
        instruction_pointer_set(regs, bp_vaddr);

        /*
         * TODO: move copy_insn/etc into _register and remove this hack.
         * After we hit the bp, _unregister + _register can install the
         * new and not-yet-analyzed uprobe at the same address, restart.
         */
        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
                goto out;

        /*
         * Pairs with the smp_wmb() in prepare_uprobe().
         *
         * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
         * we must also see the stores to &uprobe->arch performed by the
         * prepare_uprobe() call.
         */
        smp_rmb();

        /* Tracing handlers use ->utask to communicate with fetch methods */
        if (!get_utask())
                goto out;

        if (arch_uprobe_ignore(&uprobe->arch, regs))
                goto out;

        handler_chain(uprobe, regs);

        if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
                goto out;

        /* On success the uprobe reference is kept as utask->active_uprobe. */
        if (!pre_ssout(uprobe, regs, bp_vaddr))
                return;

        /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
        put_uprobe(uprobe);
}

/*
 * Perform required fix-ups and disable singlestep.
 * Allow pending signals to take effect.
 *
 * Depending on utask->state the arch post-xol or abort-xol hook is run;
 * any other state triggers a one-time warning.  Afterwards the active
 * uprobe reference and the XOL slot are released, and SIGILL is sent if
 * the post-xol hook reported an error.
 */
static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
{
        struct uprobe *uprobe = utask->active_uprobe;
        int err = 0;

        switch (utask->state) {
        case UTASK_SSTEP_ACK:
                err = arch_uprobe_post_xol(&uprobe->arch, regs);
                break;
        case UTASK_SSTEP_TRAPPED:
                arch_uprobe_abort_xol(&uprobe->arch, regs);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        put_uprobe(uprobe);
        utask->active_uprobe = NULL;
        utask->state = UTASK_RUNNING;
        xol_free_insn_slot(current);

        /* see uprobe_deny_signal() */
        spin_lock_irq(&current->sighand->siglock);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);

        if (unlikely(err)) {
                uprobe_warn(current, "execute the probed insn, sending SIGILL.");
                force_sig(SIGILL);
        }
}

/*
 * On breakpoint hit, the breakpoint notifier sets the TIF_UPROBE flag and
 * lets the thread return from the interrupt.  After that handle_swbp()
 * sets utask->active_uprobe.
 *
 * On singlestep exception, the singlestep notifier sets the TIF_UPROBE flag
 * and lets the thread return from the interrupt.
 *
 * While returning to userspace, the thread notices the TIF_UPROBE flag and
 * calls uprobe_notify_resume(), which clears the flag and dispatches on
 * whether an active uprobe is recorded.
 */
void uprobe_notify_resume(struct pt_regs *regs)
{
        struct uprobe_task *utask;

        clear_thread_flag(TIF_UPROBE);

        utask = current->utask;
        if (!utask || !utask->active_uprobe) {
                /* No single-step in flight: this is a breakpoint hit. */
                handle_swbp(regs);
                return;
        }

        handle_singlestep(utask, regs);
}

/*
 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
 * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
 *
 * Returns 0 without setting the flag when current has no mm, or when its
 * mm is not flagged MMF_HAS_UPROBES and there are no pending return
 * instances; returns 1 otherwise.
 */
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
        if (!current->mm)
                return 0;

        if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags)) {
                /* Without uprobes only a pending uretprobe can be the cause. */
                if (!current->utask || !current->utask->return_instances)
                        return 0;
        }

        set_thread_flag(TIF_UPROBE);
        return 1;
}

/*
 * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
 * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
 *
 * Returns 0 if the task is not currently uprobed, otherwise moves the utask
 * to UTASK_SSTEP_ACK, sets the flag and returns 1.
 */
int uprobe_post_sstep_notifier(struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        /* task is currently not uprobed */
        if (!current->mm)
                return 0;
        if (!utask || !utask->active_uprobe)
                return 0;

        utask->state = UTASK_SSTEP_ACK;
        set_thread_flag(TIF_UPROBE);

        return 1;
}

/*
 * Notifier block that uprobes_init() registers with register_die_notifier();
 * events are dispatched to arch_uprobe_exception_notify().
 */
static struct notifier_block uprobe_exception_nb = {
        .notifier_call                = arch_uprobe_exception_notify,
        .priority                = INT_MAX-1,        /* notified after kprobes, kgdb */
};

/*
 * Boot-time setup: initialise all UPROBES_HASH_SZ entries of
 * uprobes_mmap_mutex and register the die notifier.  A failure to register
 * the notifier is treated as a fatal bug.
 */
void __init uprobes_init(void)
{
        int i;

        for (i = 0; i < UPROBES_HASH_SZ; i++)
                mutex_init(&uprobes_mmap_mutex[i]);

        BUG_ON(register_die_notifier(&uprobe_exception_nb));
}




































    1 





























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002        Andrew Morton
 *                Initial version.
 */

#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
        ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
        ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * Drop one page that read_cache_pages() could not use.
 *
 * The caller of read_cache_pages() may have set PG_private or PG_fscache
 * on the page beforehand (e.g. NFS marking pages that are cached locally
 * on disk), so the filesystem gets a chance to clean up: a page with
 * private state is locked, temporarily attached to @mapping, invalidated
 * and detached again before the reference is dropped.
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
                                             struct page *page)
{
        /* No private state: just drop the reference. */
        if (!page_has_private(page)) {
                put_page(page);
                return;
        }

        /* Failing to take the page lock here is treated as fatal. */
        if (!trylock_page(page))
                BUG();

        page->mapping = mapping;
        do_invalidatepage(page, 0, PAGE_SIZE);
        page->mapping = NULL;

        unlock_page(page);
        put_page(page);
}

/*
 * Empty @pages: unlink each page from the list and hand it to
 * read_cache_pages_invalidate_page(), which invalidates it if needed
 * and drops the reference.
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
                                              struct list_head *pages)
{
        for (;;) {
                struct page *cur;

                if (list_empty(pages))
                        break;

                cur = lru_to_page(pages);
                list_del(&cur->lru);
                read_cache_pages_invalidate_page(mapping, cur);
        }
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Each page is taken off @pages and inserted into the page cache of
 * @mapping; a page that cannot be inserted is released and skipped.
 * @filler is then called on the inserted page.  The first non-zero return
 * from @filler stops the walk, and every page still on @pages is released.
 *
 * Returns: %0 on success, error return by @filler otherwise
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
                        int (*filler)(void *, struct page *), void *data)
{
        while (!list_empty(pages)) {
                struct page *cur = lru_to_page(pages);
                int err;

                list_del(&cur->lru);

                /* Could not insert into the page cache: drop it, move on. */
                if (add_to_page_cache_lru(cur, mapping, cur->index,
                                          readahead_gfp_mask(mapping))) {
                        read_cache_pages_invalidate_page(mapping, cur);
                        continue;
                }
                put_page(cur);

                err = filler(data, cur);
                if (unlikely(err)) {
                        /* Give up on everything still queued. */
                        read_cache_pages_invalidate_pages(mapping, pages);
                        return err;
                }
                task_io_account_read(PAGE_SIZE);
        }

        return 0;
}

EXPORT_SYMBOL(read_cache_pages);

/*
 * Submit the current readahead batch described by @rac for I/O.
 *
 * @rac:       readahead control describing the batch (start index and count).
 * @pages:     private list of pages, used only when the mapping implements
 *             ->readpages; must be empty again by the time I/O was submitted.
 * @skip_page: if true, advance rac->_index by one more page on the way out,
 *             whether or not any I/O was submitted.
 *
 * The mapping's methods are tried in order of preference: ->readahead,
 * then ->readpages, then ->readpage once per page.  On return the batch
 * is empty (readahead_count(rac) == 0).
 */
static void read_pages(struct readahead_control *rac, struct list_head *pages,
                bool skip_page)
{
        const struct address_space_operations *aops = rac->mapping->a_ops;
        struct page *page;
        struct blk_plug plug;

        /* Empty batch: nothing to submit, but still honour skip_page. */
        if (!readahead_count(rac))
                goto out;

        blk_start_plug(&plug);

        if (aops->readahead) {
                aops->readahead(rac);
                /* Clean up the remaining pages */
                while ((page = readahead_page(rac))) {
                        unlock_page(page);
                        put_page(page);
                }
        } else if (aops->readpages) {
                aops->readpages(rac->file, rac->mapping, pages,
                                readahead_count(rac));
                /* Clean up the remaining pages */
                put_pages_list(pages);
                /* Mark the whole batch as consumed by hand. */
                rac->_index += rac->_nr_pages;
                rac->_nr_pages = 0;
        } else {
                /*
                 * Per-page fallback.  The page is not unlocked here, unlike
                 * in the ->readahead branch -- presumably ->readpage takes
                 * care of that itself; NOTE(review): confirm.
                 */
                while ((page = readahead_page(rac))) {
                        aops->readpage(rac->file, page);
                        put_page(page);
                }
        }

        blk_finish_plug(&plug);

        /* Every path above must have drained both the list and the batch. */
        BUG_ON(!list_empty(pages));
        BUG_ON(readahead_count(rac));

out:
        if (skip_page)
                rac->_index++;
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.  The page allocated
 *   at offset @nr_to_read - @lookahead_size from the starting index gets
 *   the Readahead flag set.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct address_space *mapping = ractl->mapping;
        unsigned long index = readahead_index(ractl);
        LIST_HEAD(page_pool);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
        unsigned long i;

        /*
         * Partway through the readahead operation, we will have added
         * locked pages to the page cache, but will not yet have submitted
         * them for I/O.  Adding another page may need to allocate memory,
         * which can trigger memory reclaim.  Telling the VM we're in
         * the middle of a filesystem operation will cause it to not
         * touch file-backed pages, preventing a deadlock.  Most (all?)
         * filesystems already specify __GFP_NOFS in their mapping's
         * gfp_mask, but let's be explicit here.
         */
        unsigned int nofs = memalloc_nofs_save();

        /*
         * Preallocate as many pages as we will need.
         */
        for (i = 0; i < nr_to_read; i++) {
                struct page *page = xa_load(&mapping->i_pages, index + i);

                /*
                 * The index examined now must be exactly one past the end
                 * of the batch accumulated so far in ractl.
                 */
                BUG_ON(index + i != ractl->_index + ractl->_nr_pages);

                if (page && !xa_is_value(page)) {
                        /*
                         * Page already present?  Kick off the current batch
                         * of contiguous pages before continuing with the
                         * next batch.  This page may be the one we would
                         * have intended to mark as Readahead, but we don't
                         * have a stable reference to this page, and it's
                         * not worth getting one just for that.
                         */
                        read_pages(ractl, &page_pool, true);
                        continue;
                }

                page = __page_cache_alloc(gfp_mask);
                /* Allocation failed: stop here and submit what we have. */
                if (!page)
                        break;
                if (mapping->a_ops->readpages) {
                        /*
                         * ->readpages path: the page is only queued on the
                         * private list here, not added to the page cache.
                         */
                        page->index = index + i;
                        list_add(&page->lru, &page_pool);
                } else if (add_to_page_cache_lru(page, mapping, index + i,
                                        gfp_mask) < 0) {
                        /*
                         * Insertion failed: drop the new page, flush the
                         * current batch and skip over this index.
                         */
                        put_page(page);
                        read_pages(ractl, &page_pool, true);
                        continue;
                }
                if (i == nr_to_read - lookahead_size)
                        SetPageReadahead(page);
                ractl->_nr_pages++;
        }

        /*
         * Now start the IO.  We ignore I/O errors - if the page is not
         * uptodate then the caller will launch readpage again, and
         * will then handle the error.
         */
        read_pages(ractl, &page_pool, false);
        memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);

/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * The request is clamped against the inode size first: nothing is read for
 * an empty file or when the starting index lies beyond the last page, and
 * @nr_to_read is trimmed so the read ends on the page holding the last byte.
 */
void do_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct inode *host = ractl->mapping->host;
        unsigned long first = readahead_index(ractl);
        loff_t isize = i_size_read(host);
        pgoff_t last;        /* index of the page holding the file's last byte */
        pgoff_t room;        /* number of pages between first and last */

        if (!isize)
                return;

        last = (isize - 1) >> PAGE_SHIFT;
        if (first > last)
                return;

        /* Don't read past the page containing the last byte of the file */
        room = last - first;
        if (nr_to_read > room)
                nr_to_read = room + 1;

        page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * Read @nr_to_read pages starting at the index in @ractl, regardless of
 * the readahead heuristics.  The work is chunked into 2 megabyte units,
 * so that we don't pin too much memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned long nr_to_read)
{
        struct address_space *mapping = ractl->mapping;
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        const unsigned long chunk_pages = (2 * 1024 * 1024) / PAGE_SIZE;
        unsigned long limit, pos, step;

        /* Nothing can be done for a mapping without any read method. */
        if (unlikely(!(mapping->a_ops->readpage || mapping->a_ops->readpages ||
                       mapping->a_ops->readahead)))
                return;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        pos = readahead_index(ractl);
        limit = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
        nr_to_read = min_t(unsigned long, nr_to_read, limit);

        for (; nr_to_read; nr_to_read -= step, pos += step) {
                step = min_t(unsigned long, chunk_pages, nr_to_read);

                ractl->_index = pos;
                do_page_cache_ra(ractl, step, 0);
        }
}

/*
 * Pick the initial readahead window for a request of @size pages, given
 * a maximum window of @max pages.
 *
 * @size is first rounded up to a power of two.  The result is then
 *   - multiplied by 4 if it is at most @max / 32,
 *   - multiplied by 2 if it is at most @max / 4,
 *   - replaced by @max otherwise.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long rounded = roundup_pow_of_two(size);

        if (rounded <= max / 32)
                return rounded * 4;
        if (rounded <= max / 4)
                return rounded * 2;
        return max;
}

/*
 * Compute the next window size from the previous one stored in @ra:
 * quadruple it while it is below @max / 16, double it while it is at
 * most @max / 2, and otherwise settle on @max.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
                                      unsigned long max)
{
        const unsigned long prev = ra->size;
        unsigned long next = max;

        if (prev < max / 16)
                next = 4 * prev;
        else if (prev <= max / 2)
                next = 2 * prev;

        return next;
}

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * Count contiguously cached pages from @index-1 to @index-@max.
 * The count is a conservative estimate of
 *         - the length of the sequential read sequence, or
 *         - the thrashing threshold on memory-tight systems.
 *
 * The page cache is searched backwards from @index-1 for the first miss;
 * the distance between the two is returned.
 */
static pgoff_t count_history_pages(struct address_space *mapping,
                                   pgoff_t index, unsigned long max)
{
        const pgoff_t newest = index - 1;
        pgoff_t miss;

        rcu_read_lock();
        miss = page_cache_prev_miss(mapping, newest, max);
        rcu_read_unlock();

        return newest - miss;
}

/*
 * Page cache context based readahead.
 *
 * Looks at how many pages immediately before @index are already cached.
 * If that history is longer than the request, @ra is set up with a window
 * starting at @index and 1 is returned; otherwise @ra is left untouched
 * and 0 is returned.
 */
static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
                                 pgoff_t index,
                                 unsigned long req_size,
                                 unsigned long max)
{
        pgoff_t history = count_history_pages(mapping, index, max);

        /* Not enough history pages: this could be a random read. */
        if (history <= req_size)
                return 0;

        /*
         * History reaches back to the beginning of the file: a strong
         * indication of a long-run stream (or whole-file read).
         */
        if (history >= index)
                history *= 2;

        ra->start = index;
        ra->size = min(history + req_size, max);
        ra->async_size = 1;

        return 1;
}

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 *
 * @ractl: readahead control; its index is the page that triggered this
 *         call, and ractl->_index is overwritten with ra->start before
 *         the window is submitted.
 * @ra: per-file readahead state, updated in place (except on the
 *      standalone random read path, which leaves it alone).
 * @hit_readahead_marker: true when called from page_cache_async_ra(),
 *      false when called from page_cache_sync_ra().
 * @req_size: size of the triggering request, compared against window
 *      sizes counted in pages.
 *
 * The cases are tried in order: start of file, expected sequential
 * index, marker hit without matching state, oversize read, sequential
 * cache miss, context readahead, and finally a plain random read.
 */
static void ondemand_readahead(struct readahead_control *ractl,
                struct file_ra_state *ra, bool hit_readahead_marker,
                unsigned long req_size)
{
        struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
        unsigned long max_pages = ra->ra_pages;
        unsigned long add_pages;
        unsigned long index = readahead_index(ractl);
        pgoff_t prev_index;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        if (req_size > max_pages && bdi->io_pages > max_pages)
                max_pages = min(req_size, bdi->io_pages);

        /*
         * start of file
         */
        if (!index)
                goto initial_readahead;

        /*
         * It's the expected callback index, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
        if ((index == (ra->start + ra->size - ra->async_size) ||
             index == (ra->start + ra->size))) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * Hit a marked page without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals to
         * readahead size. Ramp it up and use it as the new readahead size.
         */
        if (hit_readahead_marker) {
                pgoff_t start;

                rcu_read_lock();
                start = page_cache_next_miss(ractl->mapping, index + 1,
                                max_pages);
                rcu_read_unlock();

                /*
                 * Give up if the lookup returned 0 or the first miss lies
                 * more than max_pages beyond index.
                 */
                if (!start || start - index > max_pages)
                        return;

                ra->start = start;
                ra->size = start - index;        /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * oversize read
         */
        if (req_size > max_pages)
                goto initial_readahead;

        /*
         * sequential cache miss
         * trivial case: (index - prev_index) == 1
         * unaligned reads: (index - prev_index) == 0
         */
        /* prev_pos is cast to an unsigned type before being shifted. */
        prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
        if (index - prev_index <= 1UL)
                goto initial_readahead;

        /*
         * Query the page cache and look for the traces(cached history pages)
         * that a sequential stream would leave behind.
         */
        if (try_context_readahead(ractl->mapping, ra, index, req_size,
                        max_pages))
                goto readit;

        /*
         * standalone, small random read
         * Read as is, and do not pollute the readahead state.
         */
        do_page_cache_ra(ractl, req_size, 0);
        return;

initial_readahead:
        ra->start = index;
        ra->size = get_init_ra_size(req_size, max_pages);
        /*
         * The async part is whatever the window holds beyond the request;
         * if the window is no larger than the request, it is the whole
         * window.
         */
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
        /*
         * Will this read hit the readahead marker made by itself?
         * If so, trigger the readahead marker hit now, and merge
         * the resulted next readahead window into the current one.
         * Take care of maximum IO pages as above.
         */
        if (index == ra->start && ra->size == ra->async_size) {
                add_pages = get_next_ra_size(ra, max_pages);
                if (ra->size + add_pages <= max_pages) {
                        ra->async_size = add_pages;
                        ra->size += add_pages;
                } else {
                        ra->size = max_pages;
                        ra->async_size = max_pages >> 1;
                }
        }

        /* Submit the whole window, starting at its first page. */
        ractl->_index = ra->start;
        do_page_cache_ra(ractl, ra->size, ra->async_size);
}

void page_cache_sync_ra(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned long req_count)
{
        bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

        /*
         * Even if read-ahead is disabled, issue this request as read-ahead
         * as we'll need it to satisfy the requested range. The forced
         * read-ahead will do the right thing and limit the read to just the
         * requested range, which we'll set to 1 page for this case.
         */
        if (!ra->ra_pages || blk_cgroup_congested()) {
                if (!ractl->file)
                        return;
                req_count = 1;
                do_forced_ra = true;
        }

        /* be dumb */
        if (do_forced_ra) {
                force_page_cache_ra(ractl, ra, req_count);
                return;
        }

        /* do read-ahead */
        ondemand_readahead(ractl, ra, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);

/*
 * Asynchronous readahead entry point, called for a @page that carries
 * the readahead marker.  Clears the marker and, unless readahead is
 * disabled or the device / block cgroup is congested, runs the on-demand
 * readahead algorithm with the marker-hit flag set.
 */
void page_cache_async_ra(struct readahead_control *ractl,
                struct file_ra_state *ra, struct page *page,
                unsigned long req_count)
{
        /* Readahead is switched off for this file. */
        if (ra->ra_pages == 0)
                return;

        /*
         * PG_readahead and PG_reclaim share one bit, so the bit is left
         * alone on a page under writeback.
         */
        if (PageWriteback(page))
                return;

        ClearPageReadahead(page);

        /* Defer asynchronous read-ahead on IO congestion. */
        if (inode_read_congested(ractl->mapping->host) ||
            blk_cgroup_congested())
                return;

        ondemand_readahead(ractl, ra, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

/*
 * Kernel-side implementation of readahead(2).
 *
 * Returns -EBADF when @fd does not refer to a file open for reading,
 * -EINVAL when the file cannot do readahead (no mapping, no address
 * space operations, or neither a regular file nor a block device), and
 * otherwise the result of vfs_fadvise() with POSIX_FADV_WILLNEED.
 */
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
        struct fd f = fdget(fd);
        ssize_t ret;

        if (!f.file || !(f.file->f_mode & FMODE_READ)) {
                ret = -EBADF;
        } else if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
                   (!S_ISREG(file_inode(f.file)->i_mode) &&
                    !S_ISBLK(file_inode(f.file)->i_mode))) {
                /*
                 * The readahead() syscall is intended to run only on files
                 * that can execute readahead. If readahead is not possible
                 * on this file, then we must return -EINVAL.
                 */
                ret = -EINVAL;
        } else {
                ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
        }

        fdput(f);
        return ret;
}

/*
 * readahead system call entry point: forwards its three arguments
 * unchanged to ksys_readahead() and returns that function's result.
 */
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
        return ksys_readahead(fd, offset, count);
}
































































































































    1 
































    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_H
#define _LINUX_WAIT_H
/*
 * Linux wait queue related types and methods
 */
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>

#include <asm/current.h>
#include <uapi/linux/wait.h>

typedef struct wait_queue_entry wait_queue_entry_t;

/* Signature of the callback stored in wait_queue_entry::func. */
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
/*
 * The callback installed by init_waitqueue_entry() and
 * __WAITQUEUE_INITIALIZER().
 */
int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);

/* wait_queue_entry::flags */
/* Bit values; WQ_FLAG_EXCLUSIVE is set by __add_wait_queue_exclusive(). */
#define WQ_FLAG_EXCLUSIVE        0x01
#define WQ_FLAG_WOKEN                0x02
#define WQ_FLAG_BOOKMARK        0x04
#define WQ_FLAG_CUSTOM                0x08
#define WQ_FLAG_DONE                0x10

/*
 * A single wait-queue entry structure:
 */
struct wait_queue_entry {
        /* Bitmask of WQ_FLAG_* values. */
        unsigned int                flags;
        /*
         * Opaque pointer; init_waitqueue_entry() stores the task here,
         * init_waitqueue_func_entry() stores NULL.
         */
        void                        *private;
        /* Callback for this entry; default_wake_function unless overridden. */
        wait_queue_func_t        func;
        /* Links this entry into wait_queue_head::head. */
        struct list_head        entry;
};

/*
 * Head of a wait queue: a list of wait_queue_entry items plus a spinlock.
 */
struct wait_queue_head {
        /* Spinlock of the queue; see the locking note on waitqueue_active(). */
        spinlock_t                lock;
        /* List of queued wait_queue_entry::entry nodes; empty when idle. */
        struct list_head        head;
};
typedef struct wait_queue_head wait_queue_head_t;

struct task_struct;

/*
 * Macros for declaration and initialisation of the datatypes
 */

/*
 * Static initializer for a struct wait_queue_entry: @tsk becomes the
 * private pointer, default_wake_function the callback, and the list
 * node pointers start out NULL.  @name is not used by the expansion.
 */
#define __WAITQUEUE_INITIALIZER(name, tsk) {                                        \
        .private        = tsk,                                                        \
        .func                = default_wake_function,                                \
        .entry                = { NULL, NULL } }

/* Define a wait_queue_entry variable @name initialised for task @tsk. */
#define DECLARE_WAITQUEUE(name, tsk)                                                \
        struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)

/*
 * Static initializer for a struct wait_queue_head called @name: the lock
 * starts unlocked and the list head points at itself (empty list).
 */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                                        \
        .lock                = __SPIN_LOCK_UNLOCKED(name.lock),                        \
        .head                = { &(name).head, &(name).head } }

/* Define a statically initialised, empty wait_queue_head variable @name. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);

/*
 * Run-time initialisation of a wait queue head.  Every expansion site
 * gets its own static lock_class_key, and the stringified argument is
 * passed along as the name.
 */
#define init_waitqueue_head(wq_head)                                                \
        do {                                                                        \
                static struct lock_class_key __key;                                \
                                                                                \
                __init_waitqueue_head((wq_head), #wq_head, &__key);                \
        } while (0)

/*
 * On-stack variant of DECLARE_WAIT_QUEUE_HEAD().  With CONFIG_LOCKDEP the
 * head is initialised through init_waitqueue_head() inside a statement
 * expression; without it the static initializer is used.
 */
#ifdef CONFIG_LOCKDEP
# define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
        ({ init_waitqueue_head(&name); name; })
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
#else
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
#endif

/*
 * Initialise @wq_entry for task @p: flags cleared, @p stored as the
 * private pointer, default_wake_function installed as the callback.
 * The list node is not touched.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
        wq_entry->func = default_wake_function;
        wq_entry->private = p;
        wq_entry->flags = 0;
}

/*
 * Initialise @wq_entry with a caller-supplied callback @func: flags
 * cleared and the private pointer set to NULL.  The list node is not
 * touched.
 */
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
        wq_entry->func = func;
        wq_entry->private = NULL;
        wq_entry->flags = 0;
}

/**
 * waitqueue_active -- locklessly test for waiters on the queue
 * @wq_head: the waitqueue to test for waiters
 *
 * returns true if the wait list is not empty
 *
 * NOTE: this function is lockless and requires care, incorrect usage _will_
 * lead to sporadic and non-obvious failure.
 *
 * Use either while holding wait_queue_head::lock or when used for wakeups
 * with an extra smp_mb() like::
 *
 *      CPU0 - waker                    CPU1 - waiter
 *
 *                                      for (;;) {
 *      @cond = true;                     prepare_to_wait(&wq_head, &wait, state);
 *      smp_mb();                         // smp_mb() from set_current_state()
 *      if (waitqueue_active(wq_head))         if (@cond)
 *        wake_up(wq_head);                      break;
 *                                        schedule();
 *                                      }
 *                                      finish_wait(&wq_head, &wait);
 *
 * Because without the explicit smp_mb() it's possible for the
 * waitqueue_active() load to get hoisted over the @cond store such that we'll
 * observe an empty wait list while the waiter might not observe @cond.
 *
 * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
 * which (when the lock is uncontended) are of roughly equal cost.
 */
static inline int waitqueue_active(struct wait_queue_head *wq_head)
{
        /* Lockless peek at the list: no lock is taken and no barrier issued. */
        if (list_empty(&wq_head->head))
                return 0;

        return 1;
}

/**
 * wq_has_single_sleeper - check if there is only one sleeper
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has only one sleeper on the list.
 *
 * Please refer to the comment for waitqueue_active.
 */
static inline bool
wq_has_single_sleeper(struct wait_queue_head *wq_head)
{
        /* Lockless check, same caveats as waitqueue_active(). */
        return list_is_singular(&wq_head->head) != 0;
}

/**
 * wq_has_sleeper - check if there are any waiting processes
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has waiting processes
 *
 * Please refer to the comment for waitqueue_active.
 */
static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
{
        bool has_waiters;

        /*
         * Issue a full memory barrier before the lockless list check so
         * that we are in sync with add_wait_queue modifications to the
         * wait queue. It is meant to be paired with a barrier on the
         * waiting side.
         */
        smp_mb();
        has_waiters = waitqueue_active(wq_head);

        return has_waiters;
}

/*
 * Out-of-line wait-queue entry management; only the declarations are
 * visible here.
 * NOTE(review): presumably these are the locking counterparts of the
 * __add_wait_queue*() / __remove_wait_queue() inline helpers below --
 * confirm against the implementation.
 */
extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);

/*
 * Link @wq_entry into @wq_head's list with list_add(). No locking is done
 * in this helper.
 */
static inline void
__add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        list_add(&wq_entry->entry, &wq_head->head);
}

/*
 * Used for wake-one threads: mark @wq_entry with WQ_FLAG_EXCLUSIVE, then
 * queue it through __add_wait_queue().
 */
static inline void __add_wait_queue_exclusive(struct wait_queue_head *wq_head,
                                              struct wait_queue_entry *wq_entry)
{
        wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq_head, wq_entry);
}

/*
 * Link @wq_entry into @wq_head's list with list_add_tail(). No locking is
 * done in this helper.
 */
static inline void
__add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        list_add_tail(&wq_entry->entry, &wq_head->head);
}

/*
 * Mark @wq_entry with WQ_FLAG_EXCLUSIVE, then queue it through
 * __add_wait_queue_entry_tail().
 */
static inline void __add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head,
                                                         struct wait_queue_entry *wq_entry)
{
        wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue_entry_tail(wq_head, wq_entry);
}

/*
 * Unlink @wq_entry from whatever list it is on using list_del(). @wq_head
 * is accepted for symmetry with the add helpers but is not used here.
 */
static inline void __remove_wait_queue(struct wait_queue_head *wq_head,
                                       struct wait_queue_entry *wq_entry)
{
        list_del(&wq_entry->entry);
}

/*
 * Low-level wake-up entry points, implemented out of line. The wake_up*()
 * convenience macros below expand to calls of these functions with a fixed
 * task-state mode and, where applicable, a wake count and key.
 */
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
                unsigned int mode, void *key, wait_queue_entry_t *bookmark);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
void __wake_up_pollfree(struct wait_queue_head *wq_head);

/*
 * Convenience wrappers around the __wake_up*() entry points.
 *
 * The TASK_NORMAL group passes TASK_NORMAL as the mode, the
 * "interruptible" group passes TASK_INTERRUPTIBLE. The plain form passes a
 * count of 1, the _nr form passes the caller's count and the _all form
 * passes 0.
 */
#define wake_up(x)                        __wake_up((x), TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr)                __wake_up((x), TASK_NORMAL, (nr), NULL)
#define wake_up_all(x)                        __wake_up((x), TASK_NORMAL, 0, NULL)
#define wake_up_locked(x)                __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x)                __wake_up_locked((x), TASK_NORMAL, 0)

#define wake_up_interruptible(x)        __wake_up((x), TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr)        __wake_up((x), TASK_INTERRUPTIBLE, (nr), NULL)
#define wake_up_interruptible_all(x)        __wake_up((x), TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x)        __wake_up_sync((x), TASK_INTERRUPTIBLE)

/*
 * Wakeup macros to be used to report events to the targets.
 *
 * poll_to_key() packs a poll mask into the opaque 'void *' wake-up key and
 * key_to_poll() unpacks it again. The wake_up*_poll() wrappers forward the
 * packed mask @m as the key of the matching __wake_up*() call.
 */
#define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m))
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))

#define wake_up_poll(x, m)                                                        \
        __wake_up((x), TASK_NORMAL, 1, poll_to_key(m))

#define wake_up_locked_poll(x, m)                                                \
        __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))

#define wake_up_interruptible_poll(x, m)                                        \
        __wake_up((x), TASK_INTERRUPTIBLE, 1, poll_to_key(m))

#define wake_up_interruptible_sync_poll(x, m)                                        \
        __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

#define wake_up_interruptible_sync_poll_locked(x, m)                                \
        __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

/**
 * wake_up_pollfree - signal that a polled waitqueue is going away
 * @wq_head: the wait queue head
 *
 * In the very rare cases where a ->poll() implementation uses a waitqueue whose
 * lifetime is tied to a task rather than to the 'struct file' being polled,
 * this function must be called before the waitqueue is freed so that
 * non-blocking polls (e.g. epoll) are notified that the queue is going away.
 *
 * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
 * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
 */
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
{
        /*
         * The queue lock is not always taken here, for performance
         * reasons: the emptiness check below is lockless. We may
         * therefore race with someone removing the last entry from the
         * queue and carry on while they still hold the queue lock.
         * rcu_read_lock() is required to be held in such cases, so it is
         * safe to proceed with an RCU-delayed free.
         */
        if (!waitqueue_active(wq_head))
                return;

        __wake_up_pollfree(wq_head);
}

/*
 * Wrap @condition for the timeout-style waits. Relies on a variable named
 * __ret being in scope at the expansion site (the remaining timeout).
 *
 * @condition is evaluated exactly once. If it is true while __ret is 0,
 * __ret is bumped to 1. The expression is true when @condition holds or
 * when __ret is 0.
 */
#define ___wait_cond_timeout(condition)                                                \
({                                                                                \
        bool __cond = (condition);                                                \
                                                                                \
        if (!__ret && __cond)                                                        \
                __ret = 1;                                                        \
                                                                                \
        !__ret || __cond;                                                        \
})

/*
 * True when a wait in @state has to honour a non-zero return from
 * prepare_to_wait_event(): either @state is not a compile-time constant,
 * or it equals TASK_INTERRUPTIBLE or TASK_KILLABLE.
 *
 * @state is parenthesized so that a compound argument such as 'A | B' is
 * compared as a whole; '==' binds tighter than '|', so the unparenthesized
 * form would compare only the last operand.
 */
#define ___wait_is_interruptible(state)                                                \
        (!__builtin_constant_p(state) ||                                        \
                (state) == TASK_INTERRUPTIBLE || (state) == TASK_KILLABLE)

/*
 * Implemented out of line. ___wait_event() below calls it on its on-stack
 * entry, passing WQ_FLAG_EXCLUSIVE or 0 as @flags, before entering the
 * wait loop.
 */
extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);

/*
 * The below macro ___wait_event() has an explicit shadow of the __ret
 * variable when used from the wait_event_*() macros.
 *
 * This is so that both can use the ___wait_cond_timeout() construct
 * to wrap the condition.
 *
 * The type inconsistency of the wait_event_*() __ret variable is also
 * on purpose; we use long where we can return timeout values and int
 * otherwise.
 */

/*
 * ___wait_event - common wait loop behind the wait_event*() macros
 * @wq_head:   wait queue head (an lvalue; its address is taken here)
 * @condition: expression re-evaluated on every loop iteration
 * @state:     task state passed to prepare_to_wait_event()
 * @exclusive: non-zero to initialize the entry with WQ_FLAG_EXCLUSIVE
 * @ret:       initial value of the local __ret
 * @cmd:       statement(s) executed once per iteration to actually sleep;
 *             may reference and assign the local __ret
 *
 * Each iteration calls prepare_to_wait_event() and then checks @condition;
 * a true condition leaves the loop and finish_wait() is called. If the
 * state counts as interruptible (see ___wait_is_interruptible()) and
 * prepare_to_wait_event() returned non-zero, that value becomes __ret and
 * control jumps straight to __out, skipping finish_wait().
 *
 * The value of the whole expression is __ret.
 */
#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)                \
({                                                                                \
        __label__ __out;                                                        \
        struct wait_queue_entry __wq_entry;                                        \
        long __ret = ret;        /* explicit shadow */                                \
                                                                                \
        init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);        \
        for (;;) {                                                                \
                long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
                                                                                \
                if (condition)                                                        \
                        break;                                                        \
                                                                                \
                if (___wait_is_interruptible(state) && __int) {                        \
                        __ret = __int;                                                \
                        goto __out;                                                \
                }                                                                \
                                                                                \
                cmd;                                                                \
        }                                                                        \
        finish_wait(&wq_head, &__wq_entry);                                        \
__out:        __ret;                                                                        \
})

/*
 * Slow path of wait_event(): a non-exclusive TASK_UNINTERRUPTIBLE wait that
 * sleeps with schedule(). The result of ___wait_event() is discarded.
 */
#define __wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            schedule())

/**
 * wait_event - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event(wq_head, condition)                                                \
do {                                                                                \
        might_sleep();                                                                \
        /* Only enter the wait loop when @condition is not already true. */        \
        if (!(condition))                                                        \
                __wait_event(wq_head, condition);                                \
} while (0)

/*
 * Slow path of io_wait_event(): a non-exclusive TASK_UNINTERRUPTIBLE wait
 * that sleeps with io_schedule(). The result of ___wait_event() is
 * discarded.
 */
#define __io_wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            io_schedule())

/*
 * io_wait_event() -- like wait_event() but with io_schedule()
 */
#define io_wait_event(wq_head, condition)                                        \
do {                                                                                \
        might_sleep();                                                                \
        /* Only enter the wait loop when @condition is not already true. */        \
        if (!(condition))                                                        \
                __io_wait_event(wq_head, condition);                                \
} while (0)

/*
 * Slow path of wait_event_freezable(): a non-exclusive TASK_INTERRUPTIBLE
 * wait that sleeps with freezable_schedule(). Evaluates to the result of
 * ___wait_event().
 */
#define __wait_event_freezable(wq_head, condition)                                \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                            freezable_schedule())

/**
 * wait_event_freezable - sleep (or freeze) until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
 * to system load) until the @condition evaluates to true. The
 * @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event_freezable(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_freezable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_timeout(): a non-exclusive TASK_UNINTERRUPTIBLE
 * wait. @timeout seeds the __ret local of ___wait_event(); the sleep
 * command stores the return value of schedule_timeout() back into that
 * same __ret, and ___wait_cond_timeout() inspects it on every iteration.
 */
#define __wait_event_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_UNINTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_timeout(wq_head, condition, timeout)                                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

#define __wait_event_freezable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_INTERRUPTIBLE, 0, timeout,                                \
                      __ret = freezable_schedule_timeout(__ret))

/*
 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
 * increasing load and is freezable.
 */
#define wait_event_freezable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_exclusive_cmd(): an exclusive
 * TASK_UNINTERRUPTIBLE wait whose per-iteration sleep step is
 * "cmd1; schedule(); cmd2". The result of ___wait_event() is discarded.
 */
#define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)                \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0,        \
                            cmd1; schedule(); cmd2)

/*
 * Just like wait_event_cmd(), except it sets exclusive flag
 */
#define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)                \
do {                                                                                \
        /* Only enter the wait loop when @condition is not already true. */        \
        if (!(condition))                                                        \
                __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2);        \
} while (0)

/*
 * Slow path of wait_event_cmd(): a non-exclusive TASK_UNINTERRUPTIBLE wait
 * whose per-iteration sleep step is "cmd1; schedule(); cmd2". The result of
 * ___wait_event() is discarded.
 */
#define __wait_event_cmd(wq_head, condition, cmd1, cmd2)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            cmd1; schedule(); cmd2)

/**
 * wait_event_cmd - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @cmd1: the command will be executed before sleep
 * @cmd2: the command will be executed after sleep
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event_cmd(wq_head, condition, cmd1, cmd2)                                \
do {                                                                                \
        /* Only enter the wait loop when @condition is not already true. */        \
        if (!(condition))                                                        \
                __wait_event_cmd(wq_head, condition, cmd1, cmd2);                \
} while (0)

/*
 * Slow path of wait_event_interruptible(): a non-exclusive
 * TASK_INTERRUPTIBLE wait that sleeps with schedule(). Evaluates to the
 * result of ___wait_event().
 */
#define __wait_event_interruptible(wq_head, condition)                                \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      schedule())

/**
 * wait_event_interruptible - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_interruptible_timeout(): a non-exclusive
 * TASK_INTERRUPTIBLE wait. @timeout seeds the __ret local of
 * ___wait_event(); the sleep command stores the return value of
 * schedule_timeout() back into that same __ret.
 */
#define __wait_event_interruptible_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_INTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a signal.
 */
#define wait_event_interruptible_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_interruptible_timeout(wq_head,                \
                                                condition, timeout);                \
        __ret;                                                                        \
})

/*
 * Slow path shared by wait_event_hrtimeout() and
 * wait_event_interruptible_hrtimeout().
 *
 * An on-stack hrtimer_sleeper (CLOCK_MONOTONIC, relative mode) is set up.
 * It is armed only when @timeout is not KTIME_MAX, using the current
 * task's timer_slack_ns as the expiry range. Note that @timeout is
 * expanded twice.
 *
 * The sleep step handed to ___wait_event() first tests __t.task: when it is
 * NULL, the __ret local of ___wait_event() (the step expands inside that
 * macro) is set to -ETIME and the wait loop is left with 'break';
 * otherwise schedule() is called.
 * NOTE(review): __t.task is presumably cleared by the timer expiry
 * callback, which is not visible here -- confirm.
 *
 * The timer is cancelled and destroyed before the result is produced.
 */
#define __wait_event_hrtimeout(wq_head, condition, timeout, state)                \
({                                                                                \
        int __ret = 0;                                                                \
        struct hrtimer_sleeper __t;                                                \
                                                                                \
        hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC,                        \
                                      HRTIMER_MODE_REL);                        \
        if ((timeout) != KTIME_MAX) {                                                \
                hrtimer_set_expires_range_ns(&__t.timer, timeout,                \
                                        current->timer_slack_ns);                \
                hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL);                \
        }                                                                        \
                                                                                \
        __ret = ___wait_event(wq_head, condition, state, 0, 0,                        \
                if (!__t.task) {                                                \
                        __ret = -ETIME;                                                \
                        break;                                                        \
                }                                                                \
                schedule());                                                        \
                                                                                \
        hrtimer_cancel(&__t.timer);                                                \
        destroy_hrtimer_on_stack(&__t.timer);                                        \
        __ret;                                                                        \
})

/**
 * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true or the @timeout elapses.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, or -ETIME if the timeout
 * elapsed.
 */
#define wait_event_hrtimeout(wq_head, condition, timeout)                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_hrtimeout(wq_head, condition, timeout,        \
                                               TASK_UNINTERRUPTIBLE);                \
        __ret;                                                                        \
})

/**
 * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, -ERESTARTSYS if it was
 * interrupted by a signal, or -ETIME if the timeout elapsed.
 */
#define wait_event_interruptible_hrtimeout(wq, condition, timeout)                \
({                                                                                \
        long __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_hrtimeout(wq, condition, timeout,                \
                                               TASK_INTERRUPTIBLE);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_interruptible_exclusive(): an exclusive
 * TASK_INTERRUPTIBLE wait that sleeps with schedule().
 */
#define __wait_event_interruptible_exclusive(wq, condition)                        \
        ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,                        \
                      schedule())

/*
 * Like wait_event_interruptible(), but the wait entry is initialized with
 * WQ_FLAG_EXCLUSIVE. Evaluates to 0 when @condition is already true on
 * entry, otherwise to the result of the wait loop.
 */
#define wait_event_interruptible_exclusive(wq, condition)                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_exclusive(wq, condition);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_killable_exclusive(): an exclusive TASK_KILLABLE
 * wait that sleeps with schedule().
 */
#define __wait_event_killable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, TASK_KILLABLE, 1, 0,                        \
                      schedule())

/*
 * Exclusive wait in TASK_KILLABLE state. Evaluates to 0 when @condition is
 * already true on entry, otherwise to the result of the wait loop.
 */
#define wait_event_killable_exclusive(wq, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_killable_exclusive(wq, condition);                \
        __ret;                                                                        \
})


/*
 * Slow path of wait_event_freezable_exclusive(): an exclusive
 * TASK_INTERRUPTIBLE wait that sleeps with freezable_schedule().
 */
#define __wait_event_freezable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,                        \
                        freezable_schedule())

/*
 * Like wait_event_freezable(), but the wait entry is initialized with
 * WQ_FLAG_EXCLUSIVE. Evaluates to 0 when @condition is already true on
 * entry, otherwise to the result of the wait loop.
 */
#define wait_event_freezable_exclusive(wq, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_freezable_exclusive(wq, condition);        \
        __ret;                                                                        \
})

/**
 * wait_event_idle - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 */
#define wait_event_idle(wq_head, condition)                                        \
do {                                                                                \
        might_sleep();                                                                \
        /* Nothing to wait for when @condition already holds. */                \
        if (condition)                                                                \
                break;                                                                \
        ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule());                \
} while (0)

/**
 * wait_event_idle_exclusive - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
 * set thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 */
#define wait_event_idle_exclusive(wq_head, condition)                                \
do {                                                                                \
        might_sleep();                                                                \
        /* Nothing to wait for when @condition already holds. */                \
        if (condition)                                                                \
                break;                                                                \
        ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule());                \
} while (0)

/*
 * Slow path of wait_event_idle_timeout(): a non-exclusive TASK_IDLE wait.
 * @timeout seeds the __ret local of ___wait_event(); the sleep command
 * stores the return value of schedule_timeout() back into that same __ret.
 */
#define __wait_event_idle_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 0, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_timeout(wq_head, condition, timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_idle_exclusive_timeout(): an exclusive TASK_IDLE
 * wait. @timeout seeds the __ret local of ___wait_event(); the sleep
 * command stores the return value of schedule_timeout() back into that
 * same __ret.
 */
#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 1, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Both @condition and @timeout are expanded more than once by this macro,
 * so they should be free of side effects.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\
        __ret;                                                                        \
})

/*
 * Sleep helpers passed as @fn to __wait_event_interruptible_locked():
 * do_wait_intr() by the *_locked() wrappers and do_wait_intr_irq() by the
 * *_locked_irq() wrappers below. Their definitions live outside this header.
 */
extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *);
extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);

/*
 * Common body of the wait_event_interruptible*_locked*() macros.
 *
 * Declares an on-stack wait entry with DEFINE_WAIT(), marks it
 * WQ_FLAG_EXCLUSIVE when @exclusive is non-zero, and then keeps calling
 * @fn(&wq, &__wait) until either @fn returns non-zero (that value becomes
 * the result) or @condition evaluates to true (the result is the 0 last
 * returned by @fn). Before the value is produced the entry is taken off the
 * queue with __remove_wait_queue() and the task state is set back to
 * TASK_RUNNING.
 *
 * @wq is used as an lvalue (its address is taken) and @condition is
 * re-evaluated after every successful @fn call.
 */
#define __wait_event_interruptible_locked(wq, condition, exclusive, fn)                \
({                                                                                \
        int __ret;                                                                \
        DEFINE_WAIT(__wait);                                                        \
        if (exclusive)                                                                \
                __wait.flags |= WQ_FLAG_EXCLUSIVE;                                \
        do {                                                                        \
                __ret = fn(&(wq), &__wait);                                        \
                if (__ret)                                                        \
                        break;                                                        \
        } while (!(condition));                                                        \
        __remove_wait_queue(&(wq), &__wait);                                        \
        __set_current_state(TASK_RUNNING);                                        \
        __ret;                                                                        \
})


/**
 * wait_event_interruptible_locked - sleep until a condition gets true
 * @wq: the waitqueue to wait on (an lvalue; the macro takes its address)
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock()/spin_unlock()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The @condition is tested once up front and again after every wakeup,
 * so it should be free of side effects.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked(wq, condition)                                \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr))

/**
 * wait_event_interruptible_locked_irq - sleep until a condition gets true
 * @wq: the waitqueue to wait on (an lvalue; the macro takes its address)
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The @condition is tested once up front and again after every wakeup,
 * so it should be free of side effects.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked_irq(wq, condition)                        \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq))

/**
 * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on (an lvalue; the macro takes its address)
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock()/spin_unlock()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The @condition is tested once up front and again after every wakeup,
 * so it should be free of side effects.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked(wq, condition)                \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr))

/**
 * wait_event_interruptible_exclusive_locked_irq - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on (an lvalue; the macro takes its address)
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The @condition is tested once up front and again after every wakeup,
 * so it should be free of side effects.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked_irq(wq, condition)                \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq))


/*
 * Slow path of wait_event_killable(): expands to ___wait_event() with
 * TASK_KILLABLE as the sleep state, 0 for both remaining numeric arguments
 * and a plain schedule() as the sleep command. Not meant to be used
 * directly; call wait_event_killable() instead.
 */
#define __wait_event_killable(wq, condition)                                        \
        ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())

/**
 * wait_event_killable - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a kill signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The @condition is tested once before sleeping and again by the wait
 * loop, so it should be free of side effects.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a
 * kill signal and 0 if @condition evaluated to true.
 */
#define wait_event_killable(wq_head, condition)                                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_killable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_killable_timeout(): expands to ___wait_event()
 * with the timeout-aware condition, TASK_KILLABLE as the sleep state and
 * @timeout as the initial value of __ret. Each sleep is a
 * schedule_timeout(__ret) whose result is stored back into __ret.
 */
#define __wait_event_killable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_KILLABLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a kill signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Both @condition and @timeout are expanded more than once by this macro,
 * so they should be free of side effects.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a kill signal.
 *
 * Only kill signals interrupt this process.
 */
#define wait_event_killable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_killable_timeout(wq_head,                        \
                                                condition, timeout);                \
        __ret;                                                                        \
})


/*
 * Slow path shared by wait_event_lock_irq() and wait_event_lock_irq_cmd().
 *
 * Expands to ___wait_event() in TASK_UNINTERRUPTIBLE state. The sleep
 * command drops @lock with spin_unlock_irq(), runs @cmd (which may be
 * empty), calls schedule() and then retakes @lock with spin_lock_irq().
 * The value produced by ___wait_event() is discarded by the (void) cast.
 * @lock is used as an lvalue; the macro takes its address.
 */
#define __wait_event_lock_irq(wq_head, condition, lock, cmd)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            spin_unlock_irq(&lock);                                \
                            cmd;                                                \
                            schedule();                                                \
                            spin_lock_irq(&lock))

/**
 * wait_event_lock_irq_cmd - sleep until a condition gets true. The
 *                             condition is checked under the lock. This
 *                             is expected to be called with the lock
 *                             taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd
 *          and schedule() and reacquired afterwards. Pass the lock
 *          itself, not a pointer: the macro takes its address.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 *
 * The @condition is tested once before sleeping and again by the wait
 * loop, so it should be free of side effects. If it is already true on
 * entry, neither @cmd nor the unlock/lock pair is executed.
 */
#define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd)                        \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_lock_irq(wq_head, condition, lock, cmd);                        \
} while (0)

/**
 * wait_event_lock_irq - sleep until a condition gets true. The
 *                         condition is checked under the lock. This
 *                         is expected to be called with the lock
 *                         taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards. Pass the lock itself, not a
 *          pointer: the macro takes its address.
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The @condition is tested once before sleeping and again by the wait
 * loop, so it should be free of side effects. This is the same as
 * wait_event_lock_irq_cmd() with an empty command.
 */
#define wait_event_lock_irq(wq_head, condition, lock)                                \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_lock_irq(wq_head, condition, lock, );                        \
} while (0)


/*
 * Slow path shared by wait_event_interruptible_lock_irq() and
 * wait_event_interruptible_lock_irq_cmd().
 *
 * Expands to ___wait_event() in TASK_INTERRUPTIBLE state. The sleep command
 * drops @lock with spin_unlock_irq(), runs @cmd (which may be empty), calls
 * schedule() and then retakes @lock with spin_lock_irq(). Unlike
 * __wait_event_lock_irq(), the value of ___wait_event() is kept so the
 * callers can return it. @lock is used as an lvalue; the macro takes its
 * address.
 */
#define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd)        \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      spin_unlock_irq(&lock);                                        \
                      cmd;                                                        \
                      schedule();                                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected to
 *                be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd and
 *          schedule() and reacquired afterwards. Pass the lock itself,
 *          not a pointer: the macro takes its address.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 *
 * The @condition is tested once before sleeping and again by the wait
 * loop, so it should be free of side effects.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd)        \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock, cmd);                \
        __ret;                                                                        \
})

/**
 * wait_event_interruptible_lock_irq - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected
 *                to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards. Pass the lock itself, not a
 *          pointer: the macro takes its address.
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The @condition is tested once before sleeping and again by the wait
 * loop, so it should be free of side effects.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq(wq_head, condition, lock)                \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock,);                \
        __ret;                                                                        \
})

/*
 * Slow path shared by wait_event_lock_irq_timeout() and
 * wait_event_interruptible_lock_irq_timeout().
 *
 * Expands to ___wait_event() with the timeout-aware condition, the caller
 * supplied task @state and @timeout as the initial value of __ret. The
 * sleep command drops @lock with spin_unlock_irq(), sleeps in
 * schedule_timeout(__ret) (storing the result back into __ret) and then
 * retakes @lock with spin_lock_irq(). @lock is used as an lvalue; the macro
 * takes its address.
 *
 * The expansion deliberately carries no trailing semicolon so that the
 * macro is a plain expression: it can be assigned, returned or used as an
 * operand, and the caller supplies the statement terminator.
 */
#define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      state, 0, timeout,                                        \
                      spin_unlock_irq(&lock);                                        \
                      __ret = schedule_timeout(__ret);                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
 *                true or a timeout elapses. The condition is checked under
 *                the lock. This is expected to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards. Pass the lock itself, not a
 *          pointer: the macro takes its address.
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * Both @condition and @timeout are expanded more than once by this macro,
 * so they should be free of side effects.
 *
 * The macro returns 0 if the @timeout elapsed, -ERESTARTSYS if it
 * was interrupted by a signal, and the remaining jiffies otherwise
 * if the condition evaluated to true before the timeout elapsed.
 */
#define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock,        \
                                                  timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_INTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/**
 * wait_event_lock_irq_timeout - sleep until a condition gets true or a
 *                timeout elapses. The condition is checked under the lock.
 *                This is expected to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards. Pass the lock itself, not a
 *          pointer: the macro takes its address.
 * @timeout: timeout, in jiffies
 *
 * Same as wait_event_interruptible_lock_irq_timeout() except that the
 * sleep state handed to __wait_event_lock_irq_timeout() is
 * TASK_UNINTERRUPTIBLE instead of TASK_INTERRUPTIBLE.
 *
 * Both @condition and @timeout are expanded more than once by this macro,
 * so they should be free of side effects.
 *
 * The macro evaluates to a long: presumably 0 if the @timeout elapsed and
 * the remaining jiffies if the condition became true first, as for the
 * interruptible variant (the exact values come from ___wait_cond_timeout(),
 * which is defined outside this excerpt).
 */
#define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_UNINTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * Waitqueues which are removed from the waitqueue_head at wakeup time
 */
void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

/*
 * DEFINE_WAIT_FUNC - declare and initialise a wait queue entry
 * @name: identifier of the struct wait_queue_entry variable to declare
 * @function: wake function stored in the entry's .func member
 *
 * The entry's .private member is set to "current" and its .entry list head
 * is initialised to point at itself. Members not named in the initialiser
 * (such as .flags) are zero-initialised, as for any designated initialiser.
 */
#define DEFINE_WAIT_FUNC(name, function)                                        \
        struct wait_queue_entry name = {                                        \
                .private        = current,                                        \
                .func                = function,                                        \
                .entry                = LIST_HEAD_INIT((name).entry),                        \
        }

/* DEFINE_WAIT - DEFINE_WAIT_FUNC() with autoremove_wake_function as wake function. */
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)

/*
 * init_wait - (re)initialise an existing wait queue entry at run time
 * @wait: pointer to the entry to set up
 *
 * Run-time counterpart of DEFINE_WAIT(): stores "current" in ->private,
 * autoremove_wake_function in ->func, resets the ->entry list head and
 * clears ->flags. @wait is expanded four times, so it should be free of
 * side effects.
 */
#define init_wait(wait)                                                                \
        do {                                                                        \
                (wait)->private = current;                                        \
                (wait)->func = autoremove_wake_function;                        \
                INIT_LIST_HEAD(&(wait)->entry);                                        \
                (wait)->flags = 0;                                                \
        } while (0)

bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg);

#endif /* _LINUX_WAIT_H */










































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_RTNETLINK_H
#define __LINUX_RTNETLINK_H


#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/wait.h>
#include <linux/refcount.h>
#include <uapi/linux/rtnetlink.h>

extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
                        u32 group, struct nlmsghdr *nlh, gfp_t flags);
extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
                              u32 id, long expires, u32 error);

void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
                         gfp_t flags, int *new_nsid, int new_ifindex);
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
                                       unsigned change, u32 event,
                                       gfp_t flags, int *new_nsid,
                                       int new_ifindex);
void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
                       gfp_t flags);


/* RTNL is used as a global lock for all changes to network configuration  */
extern void rtnl_lock(void);
extern void rtnl_unlock(void);
extern int rtnl_trylock(void);
extern int rtnl_is_locked(void);
extern int rtnl_lock_killable(void);
extern bool refcount_dec_and_rtnl_lock(refcount_t *r);

extern wait_queue_head_t netdev_unregistering_wq;
extern struct rw_semaphore pernet_ops_rwsem;
extern struct rw_semaphore net_rwsem;

#ifndef CONFIG_PROVE_LOCKING
/*
 * Without CONFIG_PROVE_LOCKING there is no out-of-line checker, so this
 * stub unconditionally reports the RTNL as held.
 */
static inline bool lockdep_rtnl_is_held(void)
{
        return true;
}
#else
/* With CONFIG_PROVE_LOCKING the real implementation is provided elsewhere. */
extern bool lockdep_rtnl_is_held(void);
#endif /* #ifndef CONFIG_PROVE_LOCKING */

/**
 * rcu_dereference_rtnl - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 *
 * Do an rcu_dereference(p), but check that the caller holds either
 * rcu_read_lock() or RTNL (the latter via lockdep_rtnl_is_held()).
 * Note: please prefer rtnl_dereference() or rcu_dereference().
 */
#define rcu_dereference_rtnl(p)                                        \
        rcu_dereference_check(p, lockdep_rtnl_is_held())

/**
 * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking
 * @p: The pointer to read, prior to dereferencing
 *
 * Do an rcu_dereference_bh(p), but check that the caller holds either
 * rcu_read_lock_bh() or RTNL (the latter via lockdep_rtnl_is_held()).
 * Note: please prefer rtnl_dereference() or rcu_dereference_bh().
 */
#define rcu_dereference_bh_rtnl(p)                                \
        rcu_dereference_bh_check(p, lockdep_rtnl_is_held())

/**
 * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
 * @p: The pointer to read, prior to dereferencing
 *
 * Return the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE(), because caller holds RTNL. The "RTNL is held"
 * condition handed to rcu_dereference_protected() is
 * lockdep_rtnl_is_held().
 */
#define rtnl_dereference(p)                                        \
        rcu_dereference_protected(p, lockdep_rtnl_is_held())

/*
 * Fetch @dev's ingress queue pointer through rtnl_dereference(), i.e. for
 * callers that hold RTNL.
 */
static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
{
        struct netdev_queue *ingress;

        ingress = rtnl_dereference(dev->ingress_queue);
        return ingress;
}

/*
 * Fetch @dev's ingress queue pointer through rcu_dereference(), i.e. the
 * RCU-reader counterpart of dev_ingress_queue().
 */
static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev)
{
        struct netdev_queue *ingress;

        ingress = rcu_dereference(dev->ingress_queue);
        return ingress;
}

struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);

#ifdef CONFIG_NET_INGRESS
void net_inc_ingress_queue(void);
void net_dec_ingress_queue(void);
#endif

#ifdef CONFIG_NET_EGRESS
void net_inc_egress_queue(void);
void net_dec_egress_queue(void);
#endif

void rtnetlink_init(void);
void __rtnl_unlock(void);
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail);

/*
 * ASSERT_RTNL - complain if the RTNL is not locked
 *
 * Issues a WARN_ONCE() naming the file and line of the call site when
 * rtnl_is_locked() returns zero. Note that the check is only "locked",
 * which is all rtnl_is_locked() is asked here.
 */
#define ASSERT_RTNL() \
        WARN_ONCE(!rtnl_is_locked(), \
                  "RTNL: assertion failed at %s (%d)\n", __FILE__,  __LINE__)

extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
                             struct netlink_callback *cb,
                             struct net_device *dev,
                             struct net_device *filter_dev,
                             int *idx);
extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
                            struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr,
                            u16 vid,
                            u16 flags);
extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
                            struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr,
                            u16 vid);

extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
                                   struct net_device *dev, u16 mode,
                                   u32 flags, u32 mask, int nlflags,
                                   u32 filter_mask,
                                   int (*vlan_fill)(struct sk_buff *skb,
                                                    struct net_device *dev,
                                                    u32 filter_mask));
#endif        /* __LINUX_RTNETLINK_H */
































    1 





























































































































































































































































































































































































































































































































































    1 



























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_H
#define _ASM_X86_UACCESS_H
/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>

/*
 * Range check for a user space access: reports whether a block of 'size'
 * bytes starting at 'addr' would extend past 'limit'.
 * Returns false when the range is acceptable, true when it is not.
 */
static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
{
        unsigned long end;

        /*
         * A compile-time constant size (typically a sizeof()) is taken
         * not to exceed the limit, so 'limit - size' cannot wrap, while
         * 'addr + size' still could.  Shrink the limit rather than grow
         * the address.
         */
        if (__builtin_constant_p(size))
                return unlikely(addr > limit - size);

        /* Run-time size: form the end address and reject a wrapped sum. */
        end = addr + size;
        if (unlikely(end < size))
                return true;

        return unlikely(end > limit);
}

/*
 * Passes 'addr' through __chk_user_ptr(), then evaluates to
 * __chk_range_not_ok() on the address cast to unsigned long: true when the
 * range is rejected, false when it is accepted.  'addr' appears twice in
 * the expansion.
 */
#define __range_not_ok(addr, size, limit)                                \
({                                                                        \
        __chk_user_ptr(addr);                                                \
        __chk_range_not_ok((unsigned long __force)(addr), size, limit); \
})

/*
 * With CONFIG_DEBUG_ATOMIC_SLEEP, WARN_ON_IN_IRQ() warns once (via
 * WARN_ON_ONCE) when evaluated outside task context (!in_task()) while
 * page faults are not disabled.  pagefault_disabled() is only forward
 * declared here.  Without the option the macro expands to nothing.
 */
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline bool pagefault_disabled(void);
# define WARN_ON_IN_IRQ()        \
        WARN_ON_ONCE(!in_task() && !pagefault_disabled())
#else
# define WARN_ON_IN_IRQ()
#endif

/**
 * access_ok - Checks if a user space pointer is valid
 * @addr: User space pointer to start of block to check
 * @size: Size of block to check
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * Checks if a pointer to a block of memory in user space is valid.
 *
 * Note that, depending on architecture, this function probably just
 * checks that the pointer is in the user space range - after calling
 * this function, memory access functions may still return -EFAULT.
 *
 * Here the check is __range_not_ok() against TASK_SIZE_MAX, evaluated
 * after WARN_ON_IN_IRQ().
 *
 * Return: true (nonzero) if the memory block may be valid, false (zero)
 * if it is definitely invalid.
 */
#define access_ok(addr, size)                                        \
({                                                                        \
        WARN_ON_IN_IRQ();                                                \
        likely(!__range_not_ok(addr, size, TASK_SIZE_MAX));                \
})

/*
 * Out-of-line helpers reached through the "call" emitted by
 * do_get_user_call(); the numeric suffix is the access size in bytes that
 * the macro pastes from sizeof(*(ptr)).  The C prototypes do not describe
 * the real register-based calling convention.
 * __get_user_bad() is referenced for unsupported sizes in __get_user_size().
 * NOTE(review): presumably it is left undefined so that a bad size fails
 * at link time - confirm.
 */
extern int __get_user_1(void);
extern int __get_user_2(void);
extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_nocheck_1(void);
extern int __get_user_nocheck_2(void);
extern int __get_user_nocheck_4(void);
extern int __get_user_nocheck_8(void);
extern int __get_user_bad(void);

/*
 * Open and close a user access section by calling stac() and clac().
 * NOTE(review): stac()/clac() presumably come from <asm/smap.h> and toggle
 * SMAP enforcement - confirm there.
 * The _nospec variant additionally issues barrier_nospec() after stac().
 */
#define __uaccess_begin() stac()
#define __uaccess_end()   clac()
#define __uaccess_begin_nospec()        \
({                                        \
        stac();                                \
        barrier_nospec();                \
})

/*
 * This is the smallest unsigned integer type that can fit a value
 * (up to 'long long')
 *
 * __typefits(x, type, not) evaluates to (unsigned type)0 when
 * sizeof(x) <= sizeof(type) and to 'not' otherwise.  __inttype() nests it
 * for char, short, int and long, with 0ULL (unsigned long long) as the
 * final fallback, and takes the __typeof__ of the result.
 */
#define __inttype(x) __typeof__(                \
        __typefits(x,char,                        \
          __typefits(x,short,                        \
            __typefits(x,int,                        \
              __typefits(x,long,0ULL)))))

#define __typefits(x,type,not) \
        __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not)

/*
 * This is used for both get_user() and __get_user() to expand to
 * the proper special function call that has odd calling conventions
 * due to returning both a value and an error, and that depends on
 * the size of the pointer passed in.
 *
 * Careful: we have to cast the result to the type of the pointer
 * for sign reasons.
 *
 * The use of _ASM_DX as the register specifier is a bit of a
 * simplification, as gcc only cares about it as the starting point
 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
 * (%ecx being the next register in gcc's x86 register sequence), and
 * %rdx on 64 bits.
 *
 * Clang/LLVM cares about the size of the register, but still wants
 * the base register for something that ends up being a pair.
 *
 * Operands: the pointer goes in through the "a" register (tied to output
 * 0) and the error code comes back in it; the loaded value comes back in
 * the register variable pinned to _ASM_DX.  The helper name is built
 * from 'fn' plus the sizeof(*(ptr)) immediate.
 *
 * 'x' is assigned unconditionally, and the expression evaluates to the
 * error code with a branch hint that it is 0.
 */
#define do_get_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_gu;                                                        \
        register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);                \
        __chk_user_ptr(ptr);                                                \
        asm volatile("call __" #fn "_%P4"                                \
                     : "=a" (__ret_gu), "=r" (__val_gu),                \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (ptr), "i" (sizeof(*(ptr))));                \
        (x) = (__force __typeof__(*(ptr))) __val_gu;                        \
        __builtin_expect(__ret_gu, 0);                                        \
})

/**
 * get_user - Get a simple variable from user space.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * Implemented as might_fault() followed by do_get_user_call() on the
 * __get_user_<size> helpers.
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })

/**
 * __get_user - Get a simple variable from user space, with less checking.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Implemented as do_get_user_call() on the __get_user_nocheck_<size>
 * helpers; unlike get_user(), no might_fault() is issued here.
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr)


/*
 * __put_user_goto_u64 - store a 64-bit value to user space, jumping to
 * 'label' if the store faults.
 *
 * 32-bit: the value is held in the %edx:%eax pair ("A" constraint) and
 * written with two 32-bit moves, %eax to offset 0 and %edx to offset 4 of
 * the address.  Each move has its own exception table entry targeting
 * 'label' (%l2).
 * 64-bit: a single "q"-suffixed store through __put_user_goto().
 */
#ifdef CONFIG_X86_32
#define __put_user_goto_u64(x, addr, label)                        \
        asm_volatile_goto("\n"                                        \
                     "1:        movl %%eax,0(%1)\n"                \
                     "2:        movl %%edx,4(%1)\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                        \
                     _ASM_EXTABLE_UA(2b, %l2)                        \
                     : : "A" (x), "r" (addr)                        \
                     : : label)

#else
#define __put_user_goto_u64(x, ptr, label) \
        __put_user_goto(x, ptr, "q", "er", label)
#endif

/*
 * Referenced for unsupported sizes in __put_user_size().
 * NOTE(review): presumably left undefined so that a bad size fails at
 * link time - confirm.
 */
extern void __put_user_bad(void);

/*
 * Strange magic calling convention: pointer in %ecx,
 * value in %eax(:%edx), return value in %ecx. clobbers %rbx
 *
 * These helpers are reached through the "call" emitted by
 * do_put_user_call(); the numeric suffix is the store size in bytes.
 */
extern void __put_user_1(void);
extern void __put_user_2(void);
extern void __put_user_4(void);
extern void __put_user_8(void);
extern void __put_user_nocheck_1(void);
extern void __put_user_nocheck_2(void);
extern void __put_user_nocheck_4(void);
extern void __put_user_nocheck_8(void);

/*
 * ptr must be evaluated and assigned to the temporary __ptr_pu before
 * the assignment of x to __val_pu, to avoid any function calls
 * involved in the ptr expression (possibly implicitly generated due
 * to KASAN) from clobbering %ax.
 *
 * Expands to a call to __<fn>_<size>, where <size> is sizeof(*(ptr)).
 * The pointer is passed in, and the status returned in, the "c" register;
 * the value travels in the register variable pinned to _ASM_AX; "ebx" is
 * declared clobbered.  The expression evaluates to the returned status
 * with a branch hint that it is 0.
 */
#define do_put_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_pu;                                                        \
        void __user *__ptr_pu;                                                \
        register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX);                \
        __chk_user_ptr(ptr);                                                \
        __ptr_pu = (ptr);                                                \
        __val_pu = (x);                                                        \
        asm volatile("call __" #fn "_%P[size]"                                \
                     : "=c" (__ret_pu),                                        \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (__ptr_pu),                                        \
                       "r" (__val_pu),                                        \
                       [size] "i" (sizeof(*(ptr)))                        \
                     :"ebx");                                                \
        __builtin_expect(__ret_pu, 0);                                        \
})

/**
 * put_user - Write a simple value into user space.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * Implemented as might_fault() followed by do_put_user_call() on the
 * __put_user_<size> helpers.
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })

/**
 * __put_user - Write a simple value into user space, with less checking.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Implemented as do_put_user_call() on the __put_user_nocheck_<size>
 * helpers; unlike put_user(), no might_fault() is issued here.
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr)

/*
 * Store 'x' to the user pointer 'ptr' with an instruction width selected
 * by 'size' (1, 2, 4 or 8 bytes).  A faulting store branches to 'label'.
 * Any other size calls __put_user_bad().
 */
#define __put_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __put_user_goto(x, ptr, "b", "iq", label);                \
                break;                                                        \
        case 2:                                                                \
                __put_user_goto(x, ptr, "w", "ir", label);                \
                break;                                                        \
        case 4:                                                                \
                __put_user_goto(x, ptr, "l", "ir", label);                \
                break;                                                        \
        case 8:                                                                \
                __put_user_goto_u64(x, ptr, label);                        \
                break;                                                        \
        default:                                                        \
                __put_user_bad();                                        \
        }                                                                \
} while (0)

#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * 64-bit user load for compilers with asm goto outputs; a fault branches
 * to 'label'.
 *
 * 32-bit: 'ptr' is viewed as a pointer to unsigned int, the low word is
 * loaded from it and the high word from the following element, and the
 * two halves are combined into 'x'.
 * 64-bit: a single "q"-suffixed load through __get_user_asm().
 */
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, label) do {                                \
        unsigned int __gu_low, __gu_high;                                \
        const unsigned int __user *__gu_ptr;                                \
        __gu_ptr = (const void __user *)(ptr);                                \
        __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label);                \
        __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label);        \
        (x) = ((unsigned long long)__gu_high << 32) | __gu_low;                \
} while (0)
#else
#define __get_user_asm_u64(x, ptr, label)                                \
        __get_user_asm(x, ptr, "q", "=r", label)
#endif

/*
 * Load from the user pointer 'ptr' into 'x' with an instruction width
 * selected by 'size' (1, 2, 4 or 8 bytes).  A faulting load branches to
 * 'label'.  Any other size assigns the result of __get_user_bad().
 *
 * The 1-byte case loads into a local unsigned char and then assigns it to
 * 'x'.  NOTE(review): presumably so that the "=q" output is a byte-sized
 * object - confirm.
 */
#define __get_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:        {                                                        \
                unsigned char x_u8__;                                        \
                __get_user_asm(x_u8__, ptr, "b", "=q", label);                \
                (x) = x_u8__;                                                \
                break;                                                        \
        }                                                                \
        case 2:                                                                \
                __get_user_asm(x, ptr, "w", "=r", label);                \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, "l", "=r", label);                \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, label);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
} while (0)

/*
 * One "mov<itype>" from the user memory operand __m(addr) into 'x'
 * (output constraint 'ltype').  The exception table entry sends a fault
 * at the load to 'label', which is operand %l2 (after the output and the
 * memory input).
 */
#define __get_user_asm(x, addr, itype, ltype, label)                        \
        asm_volatile_goto("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                                \
                     : [output] ltype(x)                                \
                     : [umem] "m" (__m(addr))                                \
                     : : label)

#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * 64-bit user load for compilers without asm goto outputs; errors are
 * reported through 'retval' instead of a label.
 *
 * 32-bit: the low word is loaded into %eax and the high word into %edx
 * (the "=&A" pair).  A fault at either load runs the fixup at 4:, which
 * writes -EFAULT to 'retval', zeroes %eax and %edx, and resumes at 3:.
 * 'retval' is also an input tied to the output, so it keeps its incoming
 * value when no fault occurs.
 * 64-bit: a single "q"-suffixed load through __get_user_asm().
 */
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, retval)                                \
({                                                                        \
        __typeof__(ptr) __ptr = (ptr);                                        \
        asm volatile("\n"                                                \
                     "1:        movl %[lowbits],%%eax\n"                \
                     "2:        movl %[highbits],%%edx\n"                \
                     "3:\n"                                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "4:        mov %[efault],%[errout]\n"                \
                     "        xorl %%eax,%%eax\n"                                \
                     "        xorl %%edx,%%edx\n"                                \
                     "        jmp 3b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE_UA(1b, 4b)                                \
                     _ASM_EXTABLE_UA(2b, 4b)                                \
                     : [errout] "=r" (retval),                                \
                       [output] "=&A"(x)                                \
                     : [lowbits] "m" (__m(__ptr)),                        \
                       [highbits] "m" __m(((u32 __user *)(__ptr)) + 1),        \
                       [efault] "i" (-EFAULT), "0" (retval));                \
})

#else
#define __get_user_asm_u64(x, ptr, retval) \
         __get_user_asm(x, ptr, retval, "q", "=r")
#endif

/*
 * Load from the user pointer 'ptr' into 'x' with an instruction width
 * selected by 'size' (1, 2, 4 or 8 bytes).  'retval' is set to 0 first
 * and is overwritten by the asm helpers if the load faults.  Any other
 * size assigns the result of __get_user_bad().
 *
 * The 1-byte case loads into the local x_u8__ and then assigns it to 'x'.
 */
#define __get_user_size(x, ptr, size, retval)                                \
do {                                                                        \
        unsigned char x_u8__;                                                \
                                                                        \
        retval = 0;                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __get_user_asm(x_u8__, ptr, retval, "b", "=q");                \
                (x) = x_u8__;                                                \
                break;                                                        \
        case 2:                                                                \
                __get_user_asm(x, ptr, retval, "w", "=r");                \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, retval, "l", "=r");                \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, retval);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
} while (0)

/*
 * One "mov<itype>" from the user memory operand __m(addr) into 'x'
 * (output constraint 'ltype').  A fault at the load runs the fixup at 3:,
 * which writes -EFAULT to 'err', zeroes the output register and resumes
 * at 2:.  'err' is also an input tied to the output, so it keeps its
 * incoming value when no fault occurs.
 */
#define __get_user_asm(x, addr, err, itype, ltype)                        \
        asm volatile("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     "2:\n"                                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "3:        mov %[efault],%[errout]\n"                \
                     "        xorl %k[output],%k[output]\n"                        \
                     "        jmp 2b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE_UA(1b, 3b)                                \
                     : [errout] "=r" (err),                                \
                       [output] ltype(x)                                \
                     : [umem] "m" (__m(addr)),                                \
                       [efault] "i" (-EFAULT), "0" (err))

#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * Compare-and-exchange on user memory, asm goto variant.
 *
 * Reads the expected value from *_pold, then issues a LOCK_PREFIX
 * "cmpxchg<itype>" of _new against *_ptr.  The expected value lives in
 * the "a" register as an in/out operand, and 'success' captures the zero
 * flag.  A fault at the instruction branches to 'label'.
 *
 * If the exchange did not happen, the value left in the "a" register is
 * written back to *_pold.  The expression evaluates to likely(success).
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_volatile_goto("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : CC_OUT(z) (success),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

/*
 * 64-bit compare-and-exchange on user memory for 32-bit kernels, asm goto
 * variant, using "cmpxchg8b".  The expected value is the in/out %edx:%eax
 * pair ("+A"); the new value is split into its low half in %ebx and its
 * high half in %ecx.  A fault branches to 'label'.  If the exchange did
 * not happen, the pair is written back to *_pold.  The expression
 * evaluates to likely(success).
 */
#ifdef CONFIG_X86_32
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_volatile_goto("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : CC_OUT(z) (success),                                \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })
#endif // CONFIG_X86_32
#else  // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * Compare-and-exchange on user memory, fallback for compilers without
 * tied asm goto outputs.
 *
 * Same operation as the asm goto variant, but a fault is reported through
 * the local __err: it starts at 0 and the EX_TYPE_EFAULT_REG exception
 * table entry names its register as the place to record the error before
 * resuming at 2:.  A nonzero __err leads to "goto label".
 * NOTE(review): the value stored is presumably -EFAULT, going by the type
 * name - confirm in the extable handler.
 *
 * If the exchange did not happen, the value left in the "a" register is
 * written back to *_pold.  The expression evaluates to likely(success).
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        int __err = 0;                                                        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     CC_SET(z)                                                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,        \
                                           %[errout])                        \
                     : CC_OUT(z) (success),                                \
                       [errout] "+r" (__err),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory");                                        \
        if (unlikely(__err))                                                \
                goto label;                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

#ifdef CONFIG_X86_32
/*
 * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error.
 * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
 * hardcoded by CMPXCHG8B, leaving only ESI and EDI.  If the compiler uses
 * both ESI and EDI for the memory operand, compilation will fail if the error
 * is an input+output as there will be no register available for input.
 *
 * ECX carries the high half of the new value in, and __result out.  After
 * the cmpxchg8b, "mov $0" plus "setz" leave 1 in ECX when the exchange
 * happened and 0 when it did not.  On a fault the exception table entry
 * names ECX as the error register and resumes at 2:, and a negative
 * __result is treated as that fault ("goto label").
 * When __result is 0 the %edx:%eax pair is written back to *_pold.
 */
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        int __result;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     "mov $0, %%ecx\n\t"                                \
                     "setz %%cl\n"                                        \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \
                     : [result]"=c" (__result),                                \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory", "cc");                                        \
        if (unlikely(__result < 0))                                        \
                goto label;                                                \
        if (unlikely(!__result))                                        \
                *_old = __old;                                                \
        likely(__result);                                        })
#endif // CONFIG_X86_32
#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT

/* FIXME: this hack is definitely wrong -AK */
/*
 * __m(x) reinterprets 'x' as a user pointer to struct __large_struct (an
 * array of 100 unsigned longs) and dereferences it.  The macros in this
 * file use the result as the "m" memory operand of their inline asm.
 * NOTE(review): the oversized type is presumably there so the compiler
 * does not assume a small, exactly-sized access - confirm.
 */
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))

/*
 * Tell gcc we read from memory instead of writing: this is because
 * we do not write to any memory gcc knows about, so there are no
 * aliasing issues.
 *
 * Emits one "mov<itype>" of 'x' (%0, input constraint 'ltype') to the
 * memory operand __m(addr) (%1, also declared as an input).  The
 * exception table entry sends a fault at the store to 'label' (%l2).
 */
#define __put_user_goto(x, addr, itype, ltype, label)                        \
        asm_volatile_goto("\n"                                                \
                "1:        mov"itype" %0,%1\n"                                \
                _ASM_EXTABLE_UA(1b, %l2)                                \
                : : ltype(x), "m" (__m(addr))                                \
                : : label)

/*
 * Out-of-line user copy, string and clear helpers.  Only the prototypes
 * are visible in this header; return value semantics are defined by the
 * implementations elsewhere.  All except copy_from_user_nmi() are marked
 * __must_check.
 */
extern unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
extern __must_check long
strncpy_from_user(char *dst, const char __user *src, long count);

extern __must_check long strnlen_user(const char __user *str, long n);

unsigned long __must_check clear_user(void __user *mem, unsigned long len);
unsigned long __must_check __clear_user(void __user *mem, unsigned long len);

/*
 * Copy helpers declared only when CONFIG_ARCH_HAS_COPY_MC is set.
 * copy_mc_to_kernel is also defined as a macro naming itself.
 * NOTE(review): presumably so that generic code can detect the arch
 * version with #ifdef/#ifndef - confirm against the generic header.
 */
#ifdef CONFIG_ARCH_HAS_COPY_MC
unsigned long __must_check
copy_mc_to_kernel(void *to, const void *from, unsigned len);
#define copy_mc_to_kernel copy_mc_to_kernel

unsigned long __must_check
copy_mc_to_user(void __user *to, const void *from, unsigned len);
#endif

/*
 * movsl can be slow when source and dest are not both 8-byte aligned
 *
 * Under CONFIG_X86_INTEL_USERCOPY this declares the global 'movsl_mask',
 * a single-int structure aligned with ____cacheline_aligned_in_smp.  It
 * is defined and consumed outside this header.
 */
#ifdef CONFIG_X86_INTEL_USERCOPY
extern struct movsl_mask {
        int mask;
} ____cacheline_aligned_in_smp movsl_mask;
#endif

/*
 * Feature flag consumed outside this header.
 * NOTE(review): presumably advertises arch-provided non-caching user copy
 * helpers - confirm against its users.
 */
#define ARCH_HAS_NOCACHE_UACCESS 1

#ifdef CONFIG_X86_32
# include <asm/uaccess_32.h>
#else
# include <asm/uaccess_64.h>
#endif

/*
 * The "unsafe" user accessors are not unsafe in themselves; the name is
 * there as a loud reminder of their contract: the range must have passed
 * access_ok(), and every use must sit between user_access_begin() and
 * user_access_end().
 *
 * user_access_begin() does the access_ok() check itself.  It returns
 * false without opening the access window when the range is rejected;
 * otherwise it calls __uaccess_begin_nospec() and returns true.
 */
static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
{
        bool range_ok = access_ok(ptr, len);

        if (unlikely(!range_ok))
                return false;

        __uaccess_begin_nospec();
        return true;
}
/*
 * user_access_begin is also defined as a macro naming itself.
 * NOTE(review): presumably so that generic code can detect the arch
 * version with #ifdef/#ifndef - confirm.
 * user_access_end() closes the window with __uaccess_end().
 */
#define user_access_begin(a,b)        user_access_begin(a,b)
#define user_access_end()        __uaccess_end()

/* Save and restore wrappers around smap_save() and smap_restore(). */
#define user_access_save()        smap_save()
#define user_access_restore(x)        smap_restore(x)

/*
 * Store 'x', cast to the pointee type of 'ptr', to user space with a width
 * of sizeof(*(ptr)).  A faulting store branches to 'label'.  No range
 * check and no access window handling are done here.
 */
#define unsafe_put_user(x, ptr, label)        \
        __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)

/*
 * Load sizeof(*(ptr)) bytes from user space into a temporary of the
 * matching unsigned integer type (__inttype), then assign it to 'x' cast
 * to the pointee type.  A fault ends up at 'err_label'.
 *
 * With asm goto outputs the asm branches to the label itself, so 'x' is
 * not assigned on a fault.  Without them the error comes back in
 * __gu_err: 'x' is assigned first and the "goto err_label" follows.
 */
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
} while (0)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        int __gu_err;                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
        if (unlikely(__gu_err)) goto err_label;                                        \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * Referenced for unsupported operand sizes in unsafe_try_cmpxchg_user().
 * NOTE(review): presumably left undefined so that a bad size fails at
 * link time - confirm.
 */
extern void __try_cmpxchg_user_wrong_size(void);

/*
 * On 64-bit kernels the 8-byte exchange is the ordinary helper with the
 * "q" instruction suffix and an "r" constraint for the new value.
 */
#ifndef CONFIG_X86_32
#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label)                \
        __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
#endif

/*
 * Force the pointer to u<size> to match the size expected by the asm helper.
 * clang/LLVM compiles all cases and only discards the unused paths after
 * processing errors, which breaks i386 if the pointer is an 8-byte value.
 *
 * Dispatches on sizeof(*(_ptr)) to the 1, 2, 4 or 8 byte helper and
 * evaluates to that helper's result (true when the exchange happened).
 * A fault branches to '_label'.  For any other size the macro calls
 * __try_cmpxchg_user_wrong_size() and leaves __ret unassigned.
 */
#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({                        \
        bool __ret;                                                                \
        __chk_user_ptr(_ptr);                                                        \
        switch (sizeof(*(_ptr))) {                                                \
        case 1:        __ret = __try_cmpxchg_user_asm("b", "q",                        \
                                               (__force u8 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 2:        __ret = __try_cmpxchg_user_asm("w", "r",                        \
                                               (__force u16 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 4:        __ret = __try_cmpxchg_user_asm("l", "r",                        \
                                               (__force u32 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 8:        __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
                                                 (_nval), _label);                \
                break;                                                                \
        default: __try_cmpxchg_user_wrong_size();                                \
        }                                                                        \
        __ret;                                                })

/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label)        ({                \
        int __ret = -EFAULT;                                                \
        __uaccess_begin_nospec();                                        \
        __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label);        \
_label:                                                                        \
        __uaccess_end();                                                \
        __ret;                                                                \
                                                        })

/*
 * We want the unsafe accessors to always be inlined and use
 * the error labels - thus the macro games.
 *
 * unsafe_copy_loop() stores objects of @type from @src to the user pointer
 * @dst with unsafe_put_user() for as long as at least sizeof(type) bytes
 * remain in @len.  @dst, @src and @len are modified in place (advanced and
 * decremented by sizeof(type) per iteration), so all three must be
 * modifiable lvalues; whatever is left in @len afterwards is smaller than
 * sizeof(type).  @label is passed straight to unsafe_put_user().
 */
#define unsafe_copy_loop(dst, src, len, type, label)                                \
        while (len >= sizeof(type)) {                                                \
                unsafe_put_user(*(type *)(src),(type __user *)(dst),label);        \
                dst += sizeof(type);                                                \
                src += sizeof(type);                                                \
                len -= sizeof(type);                                                \
        }

/*
 * unsafe_copy_to_user() - copy @_len bytes from @_src to the user pointer
 * @_dst using unsafe_copy_loop().
 *
 * The three arguments are evaluated once into local cursors, which the
 * loops then advance.  The copy runs in descending unit sizes (u64, u32,
 * u16, u8): each loop consumes as many whole units as fit and leaves the
 * remainder to the next narrower one, so the final u8 loop finishes any
 * tail.  @label is forwarded to every unsafe_put_user() issued by the loops.
 */
#define unsafe_copy_to_user(_dst,_src,_len,label)                        \
do {                                                                        \
        char __user *__ucu_dst = (_dst);                                \
        const char *__ucu_src = (_src);                                        \
        size_t __ucu_len = (_len);                                        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label);        \
} while (0)

/*
 * NOTE(review): presumably tells generic code that this header supplies its
 * own __get_kernel_nofault() / __put_kernel_nofault() - confirm at the users.
 */
#define HAVE_GET_KERNEL_NOFAULT

/*
 * __get_kernel_nofault() - read one object of @type from @src into @dst,
 * leaving through @err_label on failure.
 *
 * Both variants reuse __get_user_size(); @src is force-cast to a __user
 * pointer only to satisfy that helper's argument type.
 *
 * With CONFIG_CC_HAS_ASM_GOTO_OUTPUT the label is given to __get_user_size()
 * directly.  Without it, __get_user_size() is given an int error variable
 * instead, and the macro jumps to @err_label itself when that variable is
 * non-zero.
 */
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label)                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), err_label)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label)                        \
do {                                                                        \
        int __kr_err;                                                        \
                                                                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), __kr_err);                        \
        if (unlikely(__kr_err))                                                \
                goto err_label;                                                \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * __put_kernel_nofault() - write the @type object found at @src to @dst.
 *
 * Implemented with __put_user_size(); @dst is force-cast to a __user pointer
 * only to match that helper's argument type, and @err_label is passed
 * through to it unchanged.
 */
#define __put_kernel_nofault(dst, src, type, err_label)                        \
        __put_user_size(*((type *)(src)), (__force type __user *)(dst),        \
                        sizeof(type), err_label)

#endif /* _ASM_X86_UACCESS_H */

















































    1 








    1 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// SPDX-License-Identifier: GPL-2.0
#include <linux/memblock.h>
#include <linux/mmdebug.h>
#include <linux/export.h>
#include <linux/mm.h>

#include <asm/page.h>
#include <linux/vmalloc.h>

#include "physaddr.h"

#ifdef CONFIG_X86_64

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * Debug variant of the virtual-to-physical translation.
 *
 * Addresses at or above __START_KERNEL_map are translated relative to
 * phys_base and must lie within KERNEL_IMAGE_SIZE of the map start.  All
 * other addresses are translated relative to PAGE_OFFSET and must yield a
 * result accepted by phys_addr_valid().  Violations trip VIRTUAL_BUG_ON().
 */
unsigned long __phys_addr(unsigned long x)
{
        unsigned long image_off = x - __START_KERNEL_map;
        unsigned long paddr;

        /*
         * The subtraction wraps when x is below __START_KERNEL_map, so an
         * offset smaller than x means x belongs to the kernel image mapping.
         */
        if (unlikely(x > image_off)) {
                VIRTUAL_BUG_ON(image_off >= KERNEL_IMAGE_SIZE);
                return image_off + phys_base;
        }

        paddr = image_off + (__START_KERNEL_map - PAGE_OFFSET);

        /*
         * This addition wraps back only when x was >= PAGE_OFFSET; a sum
         * larger than image_off therefore flags an address below PAGE_OFFSET.
         */
        VIRTUAL_BUG_ON((paddr > image_off) || !phys_addr_valid(paddr));

        return paddr;
}
EXPORT_SYMBOL(__phys_addr);

/*
 * Debug translation for addresses expected to lie in the kernel image
 * mapping: returns the offset from __START_KERNEL_map plus phys_base.
 */
unsigned long __phys_addr_symbol(unsigned long x)
{
        const unsigned long image_off = x - __START_KERNEL_map;

        /*
         * A single unsigned compare is enough: an x below the map start
         * wraps to a huge offset and fails the same upper-bound test.
         */
        VIRTUAL_BUG_ON(image_off >= KERNEL_IMAGE_SIZE);

        return phys_base + image_off;
}
EXPORT_SYMBOL(__phys_addr_symbol);
#endif

/*
 * Report whether x is a kernel virtual address that translates to a
 * physical page for which pfn_valid() holds.  The range checks mirror the
 * debug __phys_addr() but return false instead of triggering a BUG.
 */
bool __virt_addr_valid(unsigned long x)
{
        unsigned long off = x - __START_KERNEL_map;
        unsigned long paddr;

        /* No wrap in the subtraction: x is at or above __START_KERNEL_map */
        if (unlikely(x > off)) {
                if (off >= KERNEL_IMAGE_SIZE)
                        return false;

                paddr = off + phys_base;
        } else {
                paddr = off + (__START_KERNEL_map - PAGE_OFFSET);

                /* the sum wraps back only if x was >= PAGE_OFFSET */
                if (paddr > off)
                        return false;
                if (!phys_addr_valid(paddr))
                        return false;
        }

        return pfn_valid(paddr >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#else

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * Debug variant of the virtual-to-physical translation for the
 * non-CONFIG_X86_64 build: subtracts PAGE_OFFSET and sanity-checks the input.
 */
unsigned long __phys_addr(unsigned long x)
{
        unsigned long pa = x - PAGE_OFFSET;

        /* VMALLOC_* aren't constants, so these are runtime checks */
        VIRTUAL_BUG_ON(x < PAGE_OFFSET);
        VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));

        /* max_low_pfn is set early, but not _that_ early */
        if (!max_low_pfn)
                return pa;

        VIRTUAL_BUG_ON((pa >> PAGE_SHIFT) > max_low_pfn);
        /* cross-check the arithmetic result against slow_virt_to_phys() */
        BUG_ON(slow_virt_to_phys((void *)x) != pa);

        return pa;
}
EXPORT_SYMBOL(__phys_addr);
#endif

/*
 * Report whether x is a kernel virtual address backed by a valid page
 * frame (non-CONFIG_X86_64 build).
 */
bool __virt_addr_valid(unsigned long x)
{
        /*
         * Reject, in this order: anything below PAGE_OFFSET, vmalloc
         * addresses (once __vmalloc_start_set is set), and anything at or
         * above FIXADDR_START.
         */
        if (x < PAGE_OFFSET ||
            (__vmalloc_start_set && is_vmalloc_addr((void *) x)) ||
            x >= FIXADDR_START)
                return false;

        return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#endif        /* CONFIG_X86_64 */

















































































































































































































































    1 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Flexible mmap layout support
 *
 * Based on code by Ingo Molnar and Andi Kleen, copyrighted
 * as follows:
 *
 * Copyright 2003-2009 Red Hat Inc.
 * All Rights Reserved.
 * Copyright 2005 Andi Kleen, SUSE Labs.
 * Copyright 2007 Jiri Kosina, SUSE Labs.
 */

#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/limits.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/compat.h>
#include <linux/elf-randomize.h>
#include <asm/elf.h>
#include <asm/io.h>

#include "physaddr.h"

/*
 * Global va_alignment descriptor.  Only .flags is initialised explicitly,
 * to -1; the remaining members get the usual zero initialisation.
 * NOTE(review): -1 is presumably a placeholder that CPU setup code
 * replaces later - confirm at the writers of va_align.
 */
struct va_alignment __read_mostly va_align = {
        .flags = -1,
};

/* Task size used for 32-bit address spaces: always IA32_PAGE_OFFSET. */
unsigned long task_size_32bit(void)
{
        return IA32_PAGE_OFFSET;
}

/*
 * Task size used for 64-bit address spaces: TASK_SIZE_MAX when
 * @full_addr_space is non-zero, DEFAULT_MAP_WINDOW otherwise.
 */
unsigned long task_size_64bit(int full_addr_space)
{
        if (full_addr_space)
                return TASK_SIZE_MAX;

        return DEFAULT_MAP_WINDOW;
}

/*
 * Largest stack randomisation offset, in bytes, for the current task.
 * Returns 0 unless PF_RANDOMIZE is set in current->flags; otherwise the
 * __STACK_RND_MASK() value (selected by whether @task_size equals the
 * 32-bit task size) shifted left by PAGE_SHIFT.
 */
static unsigned long stack_maxrandom_size(unsigned long task_size)
{
        unsigned long rnd_pages;

        if (!(current->flags & PF_RANDOMIZE))
                return 0;

        rnd_pages = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());

        return rnd_pages << PAGE_SHIFT;
}

/*
 * Names for the number of mmap randomisation bits handed to arch_rnd().
 * With CONFIG_COMPAT the 32-bit name maps to mmap_rnd_compat_bits and the
 * 64-bit name to mmap_rnd_bits; without it both map to mmap_rnd_bits.
 */
#ifdef CONFIG_COMPAT
# define mmap32_rnd_bits  mmap_rnd_compat_bits
# define mmap64_rnd_bits  mmap_rnd_bits
#else
# define mmap32_rnd_bits  mmap_rnd_bits
# define mmap64_rnd_bits  mmap_rnd_bits
#endif

/* 128 MiB as an unsigned long; used as the minimum gap in mmap_base(). */
#define SIZE_128M    (128 * 1024 * 1024UL)

/*
 * Non-zero when the legacy mmap layout applies: either the current task
 * has ADDR_COMPAT_LAYOUT in its personality, or sysctl_legacy_va_layout
 * is non-zero.
 */
static int mmap_is_legacy(void)
{
        return (current->personality & ADDR_COMPAT_LAYOUT) ?
                        1 : sysctl_legacy_va_layout;
}

/*
 * Random page-aligned offset built from @rndbits random bits, or 0 when
 * the current task does not have PF_RANDOMIZE set.
 */
static unsigned long arch_rnd(unsigned int rndbits)
{
        unsigned long mask;

        if (current->flags & PF_RANDOMIZE) {
                /* keep the low rndbits bits, then scale from pages to bytes */
                mask = (1UL << rndbits) - 1;
                return (get_random_long() & mask) << PAGE_SHIFT;
        }

        return 0;
}

/*
 * Random mmap offset for the current context: uses the 32-bit bit count
 * when mmap_is_ia32() is true and the 64-bit bit count otherwise.
 */
unsigned long arch_mmap_rnd(void)
{
        unsigned int bits;

        if (mmap_is_ia32())
                bits = mmap32_rnd_bits;
        else
                bits = mmap64_rnd_bits;

        return arch_rnd(bits);
}

/*
 * Compute the top of the mmap area, just below the process stack.
 *
 * The gap reserved for the stack starts from the stack rlimit, is padded
 * with the maximum stack randomisation plus stack_guard_gap, and is then
 * limited to the range [128 MB, 5/6 of @task_size].  The result is
 * @task_size minus that gap minus @rnd, passed through PAGE_ALIGN().
 */
static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
                               struct rlimit *rlim_stack)
{
        const unsigned long lowest = SIZE_128M;
        const unsigned long highest = (task_size / 6) * 5;
        unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap;
        unsigned long stack_gap = rlim_stack->rlim_cur;
        unsigned long padded = stack_gap + pad;

        /*
         * Values close to RLIM_INFINITY can overflow: only take the padded
         * value when the addition did not wrap.
         */
        if (padded > stack_gap)
                stack_gap = padded;

        /* Leave an at least ~128 MB hole with possible stack randomization. */
        if (stack_gap < lowest)
                stack_gap = lowest;
        else if (stack_gap > highest)
                stack_gap = highest;

        return PAGE_ALIGN(task_size - stack_gap - rnd);
}

/*
 * Base for the legacy layout: __TASK_UNMAPPED_BASE() of @task_size,
 * moved up by the random offset @rnd.
 */
static unsigned long mmap_legacy_base(unsigned long rnd,
                                      unsigned long task_size)
{
        unsigned long unmapped_base = __TASK_UNMAPPED_BASE(task_size);

        return unmapped_base + rnd;
}

/*
 * Fill in one pair of mmap bases for a new process VM image.
 *
 * *legacy_base always receives the legacy base.  *base receives the same
 * value when the legacy layout is in effect, and the mmap_base() result
 * computed from @rlim_stack otherwise.
 */
static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
                unsigned long random_factor, unsigned long task_size,
                struct rlimit *rlim_stack)
{
        unsigned long legacy = mmap_legacy_base(random_factor, task_size);

        *legacy_base = legacy;
        *base = mmap_is_legacy() ?
                        legacy : mmap_base(random_factor, task_size, rlim_stack);
}

/*
 * Set up the mmap layout of @mm: select the get_unmapped_area
 * implementation according to mmap_is_legacy() and compute the mmap bases,
 * using @rlim_stack as the stack limit.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        mm->get_unmapped_area = mmap_is_legacy() ?
                        arch_get_unmapped_area : arch_get_unmapped_area_topdown;

        /* Bases for the 64-bit task size, with 64-bit randomisation bits. */
        arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
                        arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
                        rlim_stack);

#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
        /*
         * Which base an mmap syscall uses is decided purely by the syscall
         * type (64-bit or compat), for 64bit and 32bit applications alike:
         * the 64bit syscall uses mmap_base, the compat syscall uses
         * mmap_compat_base.  So compute the compat pair as well.
         */
        arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
                        arch_rnd(mmap32_rnd_bits), task_size_32bit(),
                        rlim_stack);
#endif
}

/*
 * Return the mmap base of the current task's mm: the legacy base when
 * @is_legacy is non-zero, the regular base otherwise.  With
 * CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES the compat pair is used while in a
 * 32-bit syscall.
 */
unsigned long get_mmap_base(int is_legacy)
{
        struct mm_struct *mm = current->mm;

#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
        if (in_32bit_syscall()) {
                if (is_legacy)
                        return mm->mmap_compat_legacy_base;

                return mm->mmap_compat_base;
        }
#endif
        if (is_legacy)
                return mm->mmap_legacy_base;

        return mm->mmap_base;
}

/* This implementation provides no name for any VMA: @vma is ignored, NULL is returned. */
const char *arch_vma_name(struct vm_area_struct *vma)
{
        return NULL;
}

/**
 * mmap_address_hint_valid - Validate the address hint of mmap
 * @addr:        Address hint
 * @len:        Mapping length
 *
 * Check whether @addr and @addr + @len result in a valid mapping.
 *
 * On 32bit this only checks whether @addr + @len is <= TASK_SIZE.
 *
 * On 64bit with 5-level page tables another sanity check is required
 * because mappings requested by mmap(@addr, 0) which cross the 47-bit
 * virtual address boundary can cause the following theoretical issue:
 *
 *  An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr
 *  is below the border of the 47-bit address space and @addr + @len is
 *  above the border.
 *
 *  With 4-level paging this request succeeds, but the resulting mapping
 *  address will always be within the 47-bit virtual address space, because
 *  the hint address does not result in a valid mapping and is
 *  ignored. Hence applications which are not prepared to handle virtual
 *  addresses above 47-bit work correctly.
 *
 *  With 5-level paging this request would be granted and result in a
 *  mapping which crosses the border of the 47-bit virtual address
 *  space. If the application cannot handle addresses above 47-bit this
 *  will lead to misbehaviour and hard to diagnose failures.
 *
 * Therefore ignore address hints which would result in a mapping crossing
 * the 47-bit virtual address boundary.
 *
 * Note, that in the same scenario with MAP_FIXED the behaviour is
 * different. The request with @addr < 47-bit and @addr + @len > 47-bit
 * fails on a 4-level paging machine but succeeds on a 5-level paging
 * machine. It is reasonable to expect that an application does not rely on
 * the failure of such a fixed mapping request, so the restriction is not
 * applied.
 */
bool mmap_address_hint_valid(unsigned long addr, unsigned long len)
{
        bool starts_above, ends_above;

        /* The mapping must end at or below TASK_SIZE. */
        if (addr > TASK_SIZE - len)
                return false;

        starts_above = addr > DEFAULT_MAP_WINDOW;
        ends_above = addr + len > DEFAULT_MAP_WINDOW;

        /* Accept only if both ends sit on the same side of the window. */
        return starts_above == ends_above;
}

/*
 * Can the range be accessed for direct reading/writing?  It must be RAM:
 * the last byte of [addr, addr + count) may not lie beyond the physical
 * address of the last byte below high_memory.
 */
int valid_phys_addr_range(phys_addr_t addr, size_t count)
{
        phys_addr_t last = addr + count - 1;

        return last <= __pa(high_memory - 1);
}

/*
 * Can the range be accessed through mmap?  The last byte of the @count
 * bytes starting at page frame @pfn must be a valid physical address.
 */
int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
{
        phys_addr_t start = (phys_addr_t)pfn << PAGE_SHIFT;
        phys_addr_t last = start + count - 1;

        return phys_addr_valid(last);
}

/*
 * Only root may set high MMIO mappings to PROT_NONE.  Otherwise an
 * unprivileged user could set them to PROT_NONE, have them inverted, and
 * end up pointing at valid memory for L1TF speculation.
 *
 * Note: for locked down kernels may want to disable the root override.
 */
bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
        /*
         * No restriction when the CPU lacks the L1TF bug, when this
         * protection value needs no PTE inversion, or when the pfn is
         * real memory.
         */
        if (!boot_cpu_has_bug(X86_BUG_L1TF) ||
            !__pte_needs_invert(pgprot_val(prot)) ||
            pfn_valid(pfn))
                return true;

        /* At or above the L1TF pfn limit, CAP_SYS_ADMIN is required. */
        return pfn < l1tf_pfn_limit() || capable(CAP_SYS_ADMIN);
}








































































































































































    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 





    1 
    1 









    1 
























































    1 
























































































































































































































































































































    1 
    1 

    1 



    1 





    1 



    1 
    1 

































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred       Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                  in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use the default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * The same goes for GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags, built by shifting MPOL_MF_INTERNAL */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)        /* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)                /* Invert check for nodemask */

/* Slab cache that struct mempolicy objects are allocated from and freed to */
static struct kmem_cache *policy_cache;
/* NOTE(review): presumably the cache for shared-policy nodes ("sn") - confirm against its users */
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * Run-time system-wide default policy => local allocation
 * (MPOL_PREFERRED with MPOL_F_LOCAL set).
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_PREFERRED,
        .flags = MPOL_F_LOCAL,
};

/*
 * Per-node policies, indexed by node id.  get_task_policy() returns the
 * entry for the current node once that entry's mode is non-zero.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Return: @node itself when it is NUMA_NO_NODE or already online; otherwise
 * the online node with the smallest node_distance() from @node.  If no
 * online node is found by the scan, @node is returned unchanged.
 */
int numa_map_to_online_node(int node)
{
        int best_node = node;
        int best_dist = INT_MAX;
        int candidate;

        if (node != NUMA_NO_NODE && !node_online(node)) {
                for_each_online_node(candidate) {
                        int d = node_distance(node, candidate);

                        /* Only a strictly closer node replaces the current pick. */
                        if (d >= best_dist)
                                continue;
                        best_dist = d;
                        best_node = candidate;
                }
        }

        return best_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

/*
 * Return the memory policy to use for task @p: the task's own policy when
 * it has one, else the per-node preferred policy of the current node, else
 * the system-wide default policy.
 */
struct mempolicy *get_task_policy(struct task_struct *p)
{
        struct mempolicy *task_pol = p->mempolicy;
        struct mempolicy *node_pol;
        int nid;

        if (task_pol)
                return task_pol;

        nid = numa_node_id();
        if (nid == NUMA_NO_NODE)
                return &default_policy;

        /* preferred_node_policy is not initialised early in boot */
        node_pol = &preferred_node_policy[nid];
        return node_pol->mode ? node_pol : &default_policy;
}

/*
 * Per-mode operations, indexed by policy mode:
 *   create - install the (possibly NULL) nodemask into a new policy;
 *            returns 0 or a negative errno
 *   rebind - update an existing policy for a new set of nodes
 * This is a forward declaration; the table is initialised further down.
 */
static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/*
 * Non-zero when @pol carries any of the MPOL_MODE_FLAGS bits.  Callers in
 * this file use the result to choose between pol->w.user_nodemask and
 * pol->w.cpuset_mems_allowed.
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
        const int mode_flags = pol->flags & MPOL_MODE_FLAGS;

        return mode_flags;
}

/*
 * Compute *@ret from *@orig taken relative to *@rel: *@orig is first folded
 * down to the number of nodes set in *@rel, and the folded mask is then
 * mapped onto *@rel.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                                   const nodemask_t *rel)
{
        const int nr_rel = nodes_weight(*rel);
        nodemask_t folded;

        nodes_fold(folded, *orig, nr_rel);
        nodes_onto(*ret, folded, *rel);
}

/*
 * ->create() for MPOL_INTERLEAVE: copy *@nodes into the policy.
 * Returns 0 on success, -EINVAL if *@nodes is empty.
 */
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (!nodes_empty(*nodes)) {
                pol->v.nodes = *nodes;
                return 0;
        }

        return -EINVAL;
}

/*
 * ->create() for MPOL_PREFERRED.  A NULL @nodes selects local allocation
 * (MPOL_F_LOCAL); otherwise the first node of *@nodes becomes the preferred
 * node.  Returns 0 on success, -EINVAL if *@nodes is non-NULL but empty.
 */
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (!nodes) {
                /* local allocation */
                pol->flags |= MPOL_F_LOCAL;
                return 0;
        }

        /* no allowed nodes */
        if (nodes_empty(*nodes))
                return -EINVAL;

        pol->v.preferred_node = first_node(*nodes);
        return 0;
}

/*
 * ->create() for MPOL_BIND: copy *@nodes into the policy.
 * Returns 0 on success, -EINVAL if *@nodes is empty.
 */
static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
        const int err = nodes_empty(*nodes) ? -EINVAL : 0;

        if (!err)
                pol->v.nodes = *nodes;

        return err;
}

/*
 * mpol_set_nodemask() runs after mpol_new() and installs the nodemask, if
 * any, into the new policy.  mpol_new() has already validated @nodes against
 * the policy mode and flags; what remains to be handled here is
 * MPOL_PREFERRED with an empty nodemask, i.e. explicit local allocation.
 *
 * @nsc supplies two scratch masks: mask1 receives the current cpuset's
 * allowed nodes restricted to N_MEMORY, mask2 the effective nodemask that is
 * handed to the mode's ->create().
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 *
 * Returns 0 when @pol is NULL, otherwise the result of the mode's
 * ->create() callback.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
        if (!pol)
                return 0;

        /* Check N_MEMORY */
        nodes_and(nsc->mask1,
                  cpuset_current_mems_allowed, node_states[N_MEMORY]);

        VM_BUG_ON(!nodes);

        /* explicit local allocation: ->create() is told so via a NULL mask */
        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
                return mpol_ops[pol->mode].create(pol, NULL);

        if (pol->flags & MPOL_F_RELATIVE_NODES)
                mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
        else
                nodes_and(nsc->mask2, *nodes, nsc->mask1);

        /* Record the mask that the rebind callbacks will start from. */
        if (mpol_store_user_nodemask(pol))
                pol->w.user_nodemask = *nodes;
        else
                pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

        return mpol_ops[pol->mode].create(pol, &nsc->mask2);
}

/*
 * Allocate a new policy after validating @mode, @flags and @nodes against
 * each other.  Only refcnt, mode and flags are initialised here; the caller
 * must follow up with mpol_set_nodemask() to install the nodes.
 *
 * Returns NULL for MPOL_DEFAULT, ERR_PTR(-EINVAL) for an invalid
 * combination, ERR_PTR(-ENOMEM) if the allocation fails, and otherwise the
 * new policy holding one reference.  MPOL_LOCAL is stored as MPOL_PREFERRED.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                                  nodemask_t *nodes)
{
        struct mempolicy *policy;
        bool no_nodes;
        bool node_flags;

        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

        if (mode == MPOL_DEFAULT) {
                if (!nodes || nodes_empty(*nodes))
                        return NULL;
                return ERR_PTR(-EINVAL);
        }
        VM_BUG_ON(!nodes);

        no_nodes = nodes_empty(*nodes);
        node_flags = flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);

        /*
         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
         * MPOL_LOCAL takes neither nodes nor those flags.
         * All other modes require a valid pointer to a non-empty nodemask.
         */
        switch (mode) {
        case MPOL_PREFERRED:
                if (no_nodes && node_flags)
                        return ERR_PTR(-EINVAL);
                break;
        case MPOL_LOCAL:
                if (!no_nodes || node_flags)
                        return ERR_PTR(-EINVAL);
                mode = MPOL_PREFERRED;
                break;
        default:
                if (no_nodes)
                        return ERR_PTR(-EINVAL);
                break;
        }

        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);

        atomic_set(&policy->refcnt, 1);
        policy->mode = mode;
        policy->flags = flags;

        return policy;
}

/*
 * Slow path of a mpol destructor: drop one reference on @p and return it to
 * policy_cache once the count reaches zero.
 */
void __mpol_put(struct mempolicy *p)
{
        if (atomic_dec_and_test(&p->refcnt))
                kmem_cache_free(policy_cache, p);
}

/* ->rebind() for MPOL_DEFAULT: there is nothing to update. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
        /* intentionally empty */
}

/*
 * ->rebind() for the nodemask based modes (see mpol_ops[]): recompute
 * pol->v.nodes for the new set of nodes *@nodes.
 *
 *  - MPOL_F_STATIC_NODES:   user nodemask intersected with *@nodes
 *  - MPOL_F_RELATIVE_NODES: user nodemask taken relative to *@nodes
 *  - otherwise:             current nodes remapped from the remembered
 *                           cpuset mask to *@nodes, which is then remembered
 *
 * If the result is empty, *@nodes itself is used.
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        nodemask_t rebound;

        if (pol->flags & MPOL_F_STATIC_NODES) {
                nodes_and(rebound, pol->w.user_nodemask, *nodes);
        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
                mpol_relative_nodemask(&rebound, &pol->w.user_nodemask, nodes);
        } else {
                nodes_remap(rebound, pol->v.nodes, pol->w.cpuset_mems_allowed,
                            *nodes);
                pol->w.cpuset_mems_allowed = *nodes;
        }

        if (nodes_empty(rebound))
                pol->v.nodes = *nodes;
        else
                pol->v.nodes = rebound;
}

/*
 * ->rebind() for MPOL_PREFERRED: recompute the preferred node for the new
 * set of nodes *@nodes.
 *
 *  - MPOL_F_STATIC_NODES:   keep the user's first node if it is in *@nodes,
 *                           otherwise switch to local allocation
 *  - MPOL_F_RELATIVE_NODES: first node of the user mask taken relative to
 *                           *@nodes
 *  - local allocation:      nothing to do
 *  - otherwise:             remap the preferred node from the remembered
 *                           cpuset mask to *@nodes and remember *@nodes
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
                                                const nodemask_t *nodes)
{
        if (pol->flags & MPOL_F_STATIC_NODES) {
                int user_node = first_node(pol->w.user_nodemask);

                if (!node_isset(user_node, *nodes)) {
                        pol->flags |= MPOL_F_LOCAL;
                        return;
                }
                pol->v.preferred_node = user_node;
                pol->flags &= ~MPOL_F_LOCAL;
                return;
        }

        if (pol->flags & MPOL_F_RELATIVE_NODES) {
                nodemask_t relative;

                mpol_relative_nodemask(&relative, &pol->w.user_nodemask, nodes);
                pol->v.preferred_node = first_node(relative);
                return;
        }

        if (pol->flags & MPOL_F_LOCAL)
                return;

        pol->v.preferred_node = node_remap(pol->v.preferred_node,
                                           pol->w.cpuset_mems_allowed,
                                           *nodes);
        pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Does nothing for a NULL policy or an MPOL_LOCAL one.  It also does nothing
 * when the policy keeps no user nodemask, is not a local-allocation policy,
 * and its remembered cpuset mask already equals *@newmask.  In every other
 * case the mode's ->rebind() callback is invoked.
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
        if (!pol)
                return;
        if (pol->mode == MPOL_LOCAL)
                return;

        if (mpol_store_user_nodemask(pol) ||
            (pol->flags & MPOL_F_LOCAL) ||
            !nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
                mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Rebind the mempolicy of task @tsk to the nodes in *@new.  This is a thin
 * wrapper that passes tsk->mempolicy to mpol_rebind_policy().
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
        struct mempolicy *task_pol = tsk->mempolicy;

        mpol_rebind_policy(task_pol, new);
}

/*
 * Rebind the policy of every vma in @mm to the nodes in *@new.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock for writing for the
 * duration of the walk over the vma list.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
        struct vm_area_struct *vma;

        mmap_write_lock(mm);
        vma = mm->mmap;
        while (vma) {
                mpol_rebind_policy(vma->vm_policy, new);
                vma = vma->vm_next;
        }
        mmap_write_unlock(mm);
}

/*
 * Mode -> operations table.  MPOL_DEFAULT has no ->create() callback, and
 * there is no entry for MPOL_LOCAL: mpol_new() stores that mode as
 * MPOL_PREFERRED.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_interleave,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_bind,
                .rebind = mpol_rebind_nodemask,
        },
};

/*
 * Forward declaration; the definition comes later in the file.  The callers
 * below treat a non-zero return as "the page could not be queued".
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags);

/* Walk state handed to the queue_pages_* callbacks through mm_walk::private. */
struct queue_pages {
        struct list_head *pagelist;        /* list that isolated pages are queued on */
        unsigned long flags;                /* MPOL_MF_* flags controlling the walk */
        nodemask_t *nmask;                /* nodes tested by queue_pages_required() */
        unsigned long start;                /* start of the requested range */
        unsigned long end;                /* end of the requested range */
        struct vm_area_struct *first;        /* first vma seen by the walk, NULL until then */
};

/*
 * Decide whether @page is of interest to the walk described by @qp.
 *
 * Without MPOL_MF_INVERT in qp->flags this is true when the page's node is
 * set in qp->nmask; with MPOL_MF_INVERT it is true when the node is NOT set
 * in qp->nmask.
 */
static inline bool queue_pages_required(struct page *page,
                                        struct queue_pages *qp)
{
        const bool in_mask = node_isset(page_to_nid(page), *qp->nmask);
        const bool want_in_mask = !(qp->flags & MPOL_MF_INVERT);

        return in_mask == want_in_mask;
}

/*
 * Handle a huge pmd during the queue_pages walk.  Entered with @ptl held;
 * every path below drops it before returning (see __releases(ptl)).
 * @end is not used here.
 *
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing page was already on a node that does not follow the
 *        policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
        __releases(ptl)
{
        int ret = 0;
        struct page *page;
        struct queue_pages *qp = walk->private;
        unsigned long flags;

        if (unlikely(is_pmd_migration_entry(*pmd))) {
                ret = -EIO;
                goto unlock;
        }
        page = pmd_page(*pmd);
        if (is_huge_zero_page(page)) {
                /* Drop the lock, then split so the caller can walk the ptes. */
                spin_unlock(ptl);
                __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
                ret = 2;
                goto out;
        }
        /* Page is not selected by the nodemask check: leave it, ret stays 0. */
        if (!queue_pages_required(page, qp))
                goto unlock;

        flags = qp->flags;
        /* go to thp migration */
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
                /* Either the vma cannot be migrated or the page was not queued. */
                if (!vma_migratable(walk->vma) ||
                    migrate_page_add(page, qp->pagelist, flags)) {
                        ret = 1;
                        goto unlock;
                }
        } else
                ret = -EIO;        /* no MOVE flag: just report the selected page */
unlock:
        spin_unlock(ptl);
out:
        return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * This is the .pmd_entry callback of queue_pages_walk_ops.  A huge pmd is
 * handed to queue_pages_pmd() first; the pte loop only runs when there is no
 * huge pmd or queue_pages_pmd() reports that it split one.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
                        unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct page *page;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        int ret;
        bool has_unmovable = false;
        pte_t *pte, *mapped_pte;
        spinlock_t *ptl;

        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                /* queue_pages_pmd() releases ptl on every path. */
                ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
                if (ret != 2)
                        return ret;
        }
        /* THP was split, fall through to pte walk */

        if (pmd_trans_unstable(pmd))
                return 0;

        /* mapped_pte keeps the start of the mapping for pte_unmap_unlock(). */
        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
                /*
                 * vm_normal_page() filters out zero pages, but there might
                 * still be PageReserved pages to skip, perhaps in a VDSO.
                 */
                if (PageReserved(page))
                        continue;
                if (!queue_pages_required(page, qp))
                        continue;
                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
                        /* MPOL_MF_STRICT must be specified if we get here */
                        if (!vma_migratable(vma)) {
                                has_unmovable = true;
                                break;
                        }

                        /*
                         * Do not abort immediately since there may be
                         * temporary off LRU pages in the range.  Still
                         * need migrate other LRU pages.
                         */
                        if (migrate_page_add(page, qp->pagelist, flags))
                                has_unmovable = true;
                } else
                        break;        /* no MOVE flag: stop early, addr != end => -EIO below */
        }
        pte_unmap_unlock(mapped_pte, ptl);
        cond_resched();

        if (has_unmovable)
                return 1;

        /* An early break leaves addr short of end. */
        return addr != end ? -EIO : 0;
}

/*
 * .hugetlb_entry callback of queue_pages_walk_ops: check one hugetlb pte and,
 * when a MOVE flag asks for it, isolate the page onto qp->pagelist.
 *
 * Returns:
 * 0    - page not present, not selected by the nodemask check, not eligible
 *        for the requested move, or queued successfully.
 * 1    - vma is not migratable, or isolation failed and MPOL_MF_STRICT is set.
 * -EIO - only MPOL_MF_STRICT was specified and the page is selected.
 *
 * Must not be reached without CONFIG_HUGETLB_PAGE (BUG()).
 */
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
                               unsigned long addr, unsigned long end,
                               struct mm_walk *walk)
{
        int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
        struct queue_pages *qp = walk->private;
        /* Only the MPOL_MF_VALID bits are looked at here. */
        unsigned long flags = (qp->flags & MPOL_MF_VALID);
        struct page *page;
        spinlock_t *ptl;
        pte_t entry;

        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
        entry = huge_ptep_get(pte);
        if (!pte_present(entry))
                goto unlock;
        page = pte_page(entry);
        if (!queue_pages_required(page, qp))
                goto unlock;

        if (flags == MPOL_MF_STRICT) {
                /*
                 * STRICT alone means only detecting misplaced page and no
                 * need to further check other vma.
                 */
                ret = -EIO;
                goto unlock;
        }

        if (!vma_migratable(walk->vma)) {
                /*
                 * Must be STRICT with MOVE*, otherwise .test_walk() have
                 * stopped walking current vma.
                 * Detecting misplaced page but allow migrating pages which
                 * have been queued.
                 */
                ret = 1;
                goto unlock;
        }

        /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
        if (flags & (MPOL_MF_MOVE_ALL) ||
            (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
             !hugetlb_pmd_shared(pte))) {
                /* A non-zero isolate_hugetlb() result is treated as failure. */
                if (isolate_hugetlb(page, qp->pagelist) &&
                        (flags & MPOL_MF_STRICT))
                        /*
                         * Failed to isolate page but allow migrating pages
                         * which have been queued.
                         */
                        ret = 1;
        }
unlock:
        spin_unlock(ptl);
#else
        BUG();
#endif
        return ret;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns the value reported by change_protection() for [@addr, @end) of
 * @vma; a non-zero value is also added to the NUMA_PTE_UPDATES counter.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long addr, unsigned long end)
{
        /*
         * Keep the count in the same type this function returns.  Holding it
         * in an int would narrow a large count and could sign-extend it on
         * return.
         * NOTE(review): assumes change_protection() returns unsigned long -
         * confirm against its prototype.
         */
        unsigned long nr_updated;

        nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
        if (nr_updated)
                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

        return nr_updated;
}
#else
/* Without CONFIG_NUMA_BALANCING nothing is changed and 0 is returned. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long addr, unsigned long end)
{
        return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * .test_walk callback of queue_pages_walk_ops, run for each vma the walk
 * reaches.  It records the first vma in qp->first, rejects holes in the
 * requested range unless MPOL_MF_DISCONTIG_OK is set, and decides whether the
 * vma's page tables are to be walked.
 *
 * Returns:
 * 0       - queue pages from this vma (an MPOL_MF_VALID flag is set).
 * 1       - do not queue pages from this vma: it is not migratable and
 *           MPOL_MF_STRICT is not set, or MPOL_MF_LAZY was handled here, or
 *           no MPOL_MF_VALID flag is set.
 *           NOTE(review): relies on the page walker skipping the vma on a
 *           positive return - confirm against the pagewalk code.
 * -EFAULT - hole at the head, middle or tail of the requested range.
 */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
                                struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct queue_pages *qp = walk->private;
        unsigned long endvma = vma->vm_end;
        unsigned long flags = qp->flags;

        /* range check first */
        VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);

        if (!qp->first) {
                qp->first = vma;
                if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                        (qp->start < vma->vm_start))
                        /* hole at head side of range */
                        return -EFAULT;
        }
        /* The vma ends before qp->end and the next vma does not start right after it. */
        if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                ((vma->vm_end < qp->end) &&
                (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
                /* hole at middle or tail of range */
                return -EFAULT;

        /*
         * Need check MPOL_MF_STRICT to return -EIO if possible
         * regardless of vma_migratable
         */
        if (!vma_migratable(vma) &&
            !(flags & MPOL_MF_STRICT))
                return 1;

        /* Clamp to the part of the vma inside the walked range. */
        if (endvma > end)
                endvma = end;

        if (flags & MPOL_MF_LAZY) {
                /* Similar to task_numa_work, skip inaccessible VMAs */
                if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
                        !(vma->vm_flags & VM_MIXEDMAP))
                        change_prot_numa(vma, start, endvma);
                return 1;
        }

        /* queue pages from current vma */
        if (flags & MPOL_MF_VALID)
                return 0;
        return 1;
}

/* Page-walk callbacks that queue_pages_range() passes to walk_page_range(). */
static const struct mm_walk_ops queue_pages_walk_ops = {
        .hugetlb_entry                = queue_pages_hugetlb,
        .pmd_entry                = queue_pages_pte_range,
        .test_walk                = queue_pages_test_walk,
};

/*
 * Walk the page tables of @mm over [@start, @end) and collect the pages to
 * be migrated.
 *
 * Pages selected by @nodes and @flags (see queue_pages_required()) are
 * isolated and queued on @pagelist, which reaches the walk callbacks through
 * the private struct queue_pages.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                nodemask_t *nodes, unsigned long flags,
                struct list_head *pagelist)
{
        struct queue_pages qp = {
                .first = NULL,
                .start = start,
                .end = end,
                .nmask = nodes,
                .flags = flags,
                .pagelist = pagelist,
        };
        int walk_err = walk_page_range(mm, start, end,
                                       &queue_pages_walk_ops, &qp);

        /* No vma was ever visited: whole range in hole */
        return qp.first ? walk_err : -EFAULT;
}

/*
 * Apply policy @pol to a single VMA.
 *
 * The vma receives its own duplicate of @pol (mpol_dup()).  If the vma has a
 * ->set_policy() operation, it is called with the duplicate first; on
 * failure the duplicate is dropped and the vma is left unchanged.  On
 * success the previous vma policy is released.
 *
 * This must be called with the mmap_lock held for writing.
 *
 * Returns 0 on success, or the error from mpol_dup() / ->set_policy().
 */
static int vma_replace_policy(struct vm_area_struct *vma,
                                                struct mempolicy *pol)
{
        struct mempolicy *replacement;
        struct mempolicy *previous;

        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        replacement = mpol_dup(pol);
        if (IS_ERR(replacement))
                return PTR_ERR(replacement);

        if (vma->vm_ops && vma->vm_ops->set_policy) {
                int err = vma->vm_ops->set_policy(vma, replacement);

                if (err) {
                        mpol_put(replacement);
                        return err;
                }
        }

        previous = vma->vm_policy;
        vma->vm_policy = replacement; /* protected by mmap_lock */
        mpol_put(previous);

        return 0;
}

/*
 * Step 2: apply policy to a range and do splits.
 *
 * Walks every vma of @mm overlapping [@start, @end) and installs @new_pol on
 * the overlapping part.  For each vma it first tries vma_merge() with the
 * neighbours; if that does not succeed, the vma is split at the range
 * boundaries so that only the covered part gets the new policy.
 *
 * Returns 0 on success or the first error from split_vma() or
 * vma_replace_policy(); vmas processed before the error keep the new policy.
 */
static int mbind_range(struct mm_struct *mm, unsigned long start,
                       unsigned long end, struct mempolicy *new_pol)
{
        struct vm_area_struct *prev;
        struct vm_area_struct *vma;
        int err = 0;
        pgoff_t pgoff;
        unsigned long vmstart;
        unsigned long vmend;

        vma = find_vma(mm, start);
        VM_BUG_ON(!vma);

        /* If the range starts inside the first vma, that vma is the merge predecessor. */
        prev = vma->vm_prev;
        if (start > vma->vm_start)
                prev = vma;

        for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
                /* Part of this vma that lies inside the requested range. */
                vmstart = max(start, vma->vm_start);
                vmend   = min(end, vma->vm_end);

                /* Nothing to do if the vma already has an equal policy. */
                if (mpol_equal(vma_policy(vma), new_pol))
                        continue;

                /* Page offset corresponding to vmstart within this mapping. */
                pgoff = vma->vm_pgoff +
                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
                                 vma->anon_vma, vma->vm_file, pgoff,
                                 new_pol, vma->vm_userfaultfd_ctx);
                if (prev) {
                        /* Merged: continue with the merged vma. */
                        vma = prev;
                        goto replace;
                }
                if (vma->vm_start != vmstart) {
                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
                        if (err)
                                goto out;
                }
                if (vma->vm_end != vmend) {
                        err = split_vma(vma->vm_mm, vma, vmend, 0);
                        if (err)
                                goto out;
                }
 replace:
                err = vma_replace_policy(vma, new_pol);
                if (err)
                        goto out;
        }

 out:
        return err;
}

/*
 * Set the process memory policy of the current task.
 *
 * Builds a new policy from @mode, @flags and @nodes, installs it as
 * current->mempolicy and drops the reference on the previous policy.  For
 * MPOL_INTERLEAVE the task's interleave cursor (il_prev) is reset.
 *
 * Returns 0 on success, -ENOMEM if the scratch nodemasks cannot be
 * allocated, or the error reported by mpol_new() / mpol_set_nodemask().
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                             nodemask_t *nodes)
{
        struct mempolicy *new, *old;
        NODEMASK_SCRATCH(scratch);
        int ret;

        if (!scratch)
                return -ENOMEM;

        new = mpol_new(mode, flags, nodes);
        if (IS_ERR(new)) {
                ret = PTR_ERR(new);
                goto out;
        }

        /*
         * mpol_set_nodemask() must be called holding the task's alloc_lock:
         * it reads the task's mems_allowed (cpuset_current_mems_allowed),
         * which must not change between computing the effective nodemask and
         * installing the policy.  Take the lock before the call.
         */
        task_lock(current);
        ret = mpol_set_nodemask(new, nodes, scratch);
        if (ret) {
                task_unlock(current);
                mpol_put(new);
                goto out;
        }

        old = current->mempolicy;
        current->mempolicy = new;
        if (new && new->mode == MPOL_INTERLEAVE)
                current->il_prev = MAX_NUMNODES-1;
        task_unlock(current);
        /* Drop the old policy only after the lock has been released. */
        mpol_put(old);
        ret = 0;
out:
        NODEMASK_SCRATCH_FREE(scratch);
        return ret;
}

/*
 * Fill *@nodes with the nodemask of policy @p for a get_mempolicy() query.
 *
 * The mask is left empty for the system default policy and for local
 * allocation; MPOL_BIND/MPOL_INTERLEAVE report their node set and
 * MPOL_PREFERRED its single preferred node.  Any other mode is a BUG().
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
        nodes_clear(*nodes);
        if (p == &default_policy)
                return;

        if (p->mode == MPOL_BIND || p->mode == MPOL_INTERLEAVE) {
                *nodes = p->v.nodes;
        } else if (p->mode == MPOL_PREFERRED) {
                /* return empty node mask for local allocation */
                if (p->flags & MPOL_F_LOCAL)
                        return;
                node_set(p->v.preferred_node, *nodes);
        } else {
                BUG();
        }
}

/*
 * Look up the node id of the page backing @addr.
 *
 * Pins the page containing @addr with get_user_pages_locked(), reads its
 * node id and drops the pin again.  The lookup is started with "locked" set
 * to 1; if it is still set afterwards, @mm's mmap lock is released here with
 * mmap_read_unlock().
 *
 * Returns the node id when a page was pinned, otherwise the (non-positive)
 * result of get_user_pages_locked().
 */
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *page = NULL;
        int still_locked = 1;
        int ret;

        ret = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &page,
                                    &still_locked);
        if (ret > 0) {
                ret = page_to_nid(page);
                put_page(page);
        }
        if (still_locked)
                mmap_read_unlock(mm);

        return ret;
}

/*
 * Retrieve NUMA policy.
 *
 * @policy: receives the policy mode (OR-ed with the MPOL_MODE_FLAGS bits of
 *          the policy), or a node id when MPOL_F_NODE is set
 * @nmask:  if non-NULL, receives the nodemask associated with the policy;
 *          it is written unconditionally for MPOL_F_MEMS_ALLOWED
 * @addr:   address to look up; must be 0 unless MPOL_F_ADDR is set
 * @flags:  combination of MPOL_F_NODE, MPOL_F_ADDR and MPOL_F_MEMS_ALLOWED
 *
 * Returns 0 on success, -EINVAL for an unsupported flag/argument
 * combination, -EFAULT if MPOL_F_ADDR is set and no vma covers @addr, or
 * the negative error returned by lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
                             unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

        /* Reject any flag bit outside the three supported ones. */
        if (flags &
                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
                return -EINVAL;

        /* MPOL_F_MEMS_ALLOWED is exclusive with the other two flags. */
        if (flags & MPOL_F_MEMS_ALLOWED) {
                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
                        return -EINVAL;
                *policy = 0;        /* just so it's initialized */
                task_lock(current);
                *nmask  = cpuset_current_mems_allowed;
                task_unlock(current);
                return 0;
        }

        if (flags & MPOL_F_ADDR) {
                /*
                 * Do NOT fall back to task policy if the
                 * vma/shared policy at addr is NULL.  We
                 * want to return MPOL_DEFAULT in this case.
                 */
                mmap_read_lock(mm);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        mmap_read_unlock(mm);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;        /* indicates default behavior */

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        /*
                         * Take a refcount on the mpol, lookup_node()
                         * will drop the mmap_lock, so after calling
                         * lookup_node() only "pol" remains valid, "vma"
                         * is stale.
                         */
                        pol_refcount = pol;
                        vma = NULL;
                        mpol_get(pol);
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_INTERLEAVE) {
                        /* Report the node that follows il_prev in the mask. */
                        *policy = next_node_in(current->il_prev, pol->v.nodes);
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else {
                *policy = pol == &default_policy ? MPOL_DEFAULT :
                                                pol->mode;
                /*
                 * Internal mempolicy flags must be masked off before exposing
                 * the policy to userspace.
                 */
                *policy |= (pol->flags & MPOL_MODE_FLAGS);
        }

        err = 0;
        if (nmask) {
                if (mpol_store_user_nodemask(pol)) {
                        *nmask = pol->w.user_nodemask;
                } else {
                        task_lock(current);
                        get_policy_nodemask(pol, nmask);
                        task_unlock(current);
                }
        }

 out:
        mpol_cond_put(pol);
        /* vma is still set only if the mmap lock taken above is still held */
        if (vma)
                mmap_read_unlock(mm);
        /* drop the extra reference taken before lookup_node() */
        if (pol_refcount)
                mpol_put(pol_refcount);
        return err;
}

#ifdef CONFIG_MIGRATION
/*
 * Isolate @page (or rather its compound head, so thp tail pages can be
 * passed) and queue it on @pagelist for migration.
 *
 * Returns 0, or -EIO when the page could not be isolated and
 * MPOL_MF_STRICT is set in @flags.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
        struct page *head = compound_head(page);

        /*
         * Avoid migrating a page that is shared with others, unless
         * MPOL_MF_MOVE_ALL was requested.
         */
        if (!(flags & MPOL_MF_MOVE_ALL) && page_mapcount(head) != 1)
                return 0;

        if (isolate_lru_page(head)) {
                /*
                 * Non-movable page may reach here.  And, there may be
                 * temporary off LRU pages or non-LRU movable pages.
                 * Treat them as unmovable pages since they can't be
                 * isolated, so they can't be moved at the moment.  That
                 * is reported as -EIO in strict mode and ignored otherwise.
                 */
                if (flags & MPOL_MF_STRICT)
                        return -EIO;
                return 0;
        }

        /* Isolation succeeded: queue the page and account for it. */
        list_add_tail(&head->lru, pagelist);
        mod_node_page_state(page_pgdat(head),
                NR_ISOLATED_ANON + page_is_file_lru(head),
                thp_nr_pages(head));
        return 0;
}

/*
 * Migrate the pages of @mm that sit on node @source over to node @dest.
 * Returns the result of migrate_pages() (an error or the number of pages
 * not migrated), or 0 when no page was queued.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
                           int flags)
{
        struct migration_target_control mtc = {
                .nid = dest,
                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
        };
        LIST_HEAD(pagelist);
        nodemask_t nmask;
        int err;

        /* Build a mask that contains the source node only. */
        nodes_clear(nmask);
        node_set(source, nmask);

        /*
         * The call below does not "check" the range, it isolates all pages
         * that need migration.  Because the full user address space range
         * is passed together with MPOL_MF_DISCONTIG_OK, it can not fail.
         */
        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
        queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);

        if (list_empty(&pagelist))
                return 0;

        err = migrate_pages(&pagelist, alloc_migration_target, NULL,
                        (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
        /* Hand back whatever is still on the list after a failure. */
        if (err)
                putback_movable_pages(&pagelist);

        return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the accumulated number of pages that could not be moved, the
 * negative error of a failed migrate_to_node() call, or the non-zero
 * result of migrate_prep().
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        int busy = 0;
        int err;
        nodemask_t tmp;

        err = migrate_prep();
        if (err)
                return err;

        mmap_read_lock(mm);

        /*
         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
         * bit in 'tmp', and return that <source, dest> pair for migration.
         * The pair of nodemasks 'to' and 'from' define the map.
         *
         * If no pair of bits is found that way, fallback to picking some
         * pair of 'source' and 'dest' bits that are not the same.  If the
         * 'source' and 'dest' bits are the same, this represents a node
         * that will be migrating to itself, so no pages need move.
         *
         * If no bits are left in 'tmp', or if all remaining bits left
         * in 'tmp' correspond to the same bit in 'to', return false
         * (nothing left to migrate).
         *
         * This lets us pick a pair of nodes to migrate between, such that
         * if possible the dest node is not already occupied by some other
         * source node, minimizing the risk of overloading the memory on a
         * node that would happen if we migrated incoming memory to a node
         * before migrating outgoing memory source that same node.
         *
         * A single scan of tmp is sufficient.  As we go, we remember the
         * most recent <s, d> pair that moved (s != d).  If we find a pair
         * that not only moved, but what's better, moved to an empty slot
         * (d is not set in tmp), then we break out then, with that pair.
         * Otherwise when we finish scanning tmp, we at least have the
         * most recent <s, d> pair that moved.  If we get all the way through
         * the scan of tmp without finding any node that moved, much less
         * moved to an empty node, then there is nothing left worth migrating.
         */

        tmp = *from;
        while (!nodes_empty(tmp)) {
                int s,d;
                int source = NUMA_NO_NODE;
                int dest = 0;

                for_each_node_mask(s, tmp) {

                        /*
                         * do_migrate_pages() tries to maintain the relative
                         * node relationship of the pages established between
                         * threads and memory areas.
                         *
                         * However if the number of source nodes is not equal to
                         * the number of destination nodes we can not preserve
                         * this node relative relationship.  In that case, skip
                         * copying memory from a node that is in the destination
                         * mask.
                         *
                         * Example: [2,3,4] -> [3,4,5] moves everything.
                         *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
                         */

                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
                                                (node_isset(s, *to)))
                                continue;

                        d = node_remap(s, *from, *to);
                        if (s == d)
                                continue;

                        source = s;        /* Node moved. Memorize */
                        dest = d;

                        /* dest not in remaining from nodes? */
                        if (!node_isset(dest, tmp))
                                break;
                }
                /* The scan found no node that moves: nothing left to do. */
                if (source == NUMA_NO_NODE)
                        break;

                node_clear(source, tmp);
                err = migrate_to_node(mm, source, dest, flags);
                /* A positive result counts pages that were not migrated. */
                if (err > 0)
                        busy += err;
                if (err < 0)
                        break;
        }
        mmap_read_unlock(mm);
        if (err < 0)
                return err;
        return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 *
 * Returns the newly allocated page, or NULL for a transparent huge page
 * whose allocation failed; the other paths return whatever their allocator
 * returns.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
        struct vm_area_struct *vma;
        /*
         * Start from -EFAULT, the value the loop below leaves behind when no
         * vma maps the page, so that "address" is never read uninitialized
         * when find_vma() returns NULL right away.
         */
        unsigned long address = -EFAULT;

        vma = find_vma(current->mm, start);
        while (vma) {
                address = page_address_in_vma(page, vma);
                if (address != -EFAULT)
                        break;
                vma = vma->vm_next;
        }

        if (PageHuge(page)) {
                return alloc_huge_page_vma(page_hstate(compound_head(page)),
                                vma, address);
        } else if (PageTransHuge(page)) {
                struct page *thp;

                thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
                                         HPAGE_PMD_ORDER);
                if (!thp)
                        return NULL;
                prep_transhuge_page(thp);
                return thp;
        }
        /*
         * if !vma, alloc_page_vma() will use task or system default policy
         */
        return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
                        vma, address);
}
#else

/* !CONFIG_MIGRATION stub: no page can be queued, always fails with -EIO. */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
        return -EIO;
}

/* !CONFIG_MIGRATION stub: page migration is unavailable, returns -ENOSYS. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        return -ENOSYS;
}

/* !CONFIG_MIGRATION stub: never allocates a migration target page. */
static struct page *new_page(struct page *page, unsigned long start)
{
        return NULL;
}
#endif

/*
 * Apply a memory policy to the address range [start, start + len) of the
 * current mm and, if MPOL_MF_MOVE / MPOL_MF_MOVE_ALL is set, migrate the
 * pages queued by queue_pages_range().
 *
 * @start:      start of the range; must be page aligned
 * @len:        length in bytes; rounded up to a multiple of PAGE_SIZE
 * @mode:       policy mode passed to mpol_new()
 * @mode_flags: policy mode flags passed to mpol_new()
 * @nmask:      nodemask for the new policy
 * @flags:      MPOL_MF_* flags; bits outside MPOL_MF_VALID are rejected
 *
 * Returns 0 on success (also for an empty range), -EINVAL for bad flags, a
 * misaligned @start or a wrapping range, -EPERM for MPOL_MF_MOVE_ALL
 * without CAP_SYS_NICE, -ENOMEM if no nodemask scratch space is available,
 * -EIO if queue_pages_range() returned a positive value or pages failed to
 * migrate under MPOL_MF_STRICT, or an error propagated from mpol_new(),
 * migrate_prep(), mpol_set_nodemask(), queue_pages_range() or
 * mbind_range().
 */
static long do_mbind(unsigned long start, unsigned long len,
                     unsigned short mode, unsigned short mode_flags,
                     nodemask_t *nmask, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        int err;
        int ret;
        LIST_HEAD(pagelist);

        if (flags & ~(unsigned long)MPOL_MF_VALID)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;

        if (start & ~PAGE_MASK)
                return -EINVAL;

        /* MPOL_MF_STRICT is ignored for the default policy. */
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;

        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;

        /* end < start means the range wrapped around. */
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        new = mpol_new(mode, mode_flags, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /*
         * NOTE(review): "new" is dereferenced here without a NULL check,
         * while the code just below handles new == NULL.  Presumably
         * MPOL_MF_LAZY can not pass the MPOL_MF_VALID check above; confirm.
         */
        if (flags & MPOL_MF_LAZY)
                new->flags |= MPOL_F_MOF;

        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
         */
        if (!new)
                flags |= MPOL_MF_DISCONTIG_OK;

        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
                 start, start + len, mode, mode_flags,
                 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

                err = migrate_prep();
                if (err)
                        goto mpol_out;
        }
        {
                /*
                 * On success this scope is left with the mmap write lock
                 * held; on failure the lock has been dropped again (or was
                 * never taken when the scratch allocation failed).
                 */
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
                        mmap_write_lock(mm);
                        err = mpol_set_nodemask(new, nmask, scratch);
                        if (err)
                                mmap_write_unlock(mm);
                } else
                        err = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
        }
        if (err)
                goto mpol_out;

        ret = queue_pages_range(mm, start, end, nmask,
                          flags | MPOL_MF_INVERT, &pagelist);

        if (ret < 0) {
                err = ret;
                goto up_out;
        }

        err = mbind_range(mm, start, end, new);

        if (!err) {
                int nr_failed = 0;

                if (!list_empty(&pagelist)) {
                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
                                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
                        if (nr_failed)
                                putback_movable_pages(&pagelist);
                }

                /*
                 * A positive queue_pages_range() result is always reported
                 * as -EIO; migration failures only under MPOL_MF_STRICT.
                 */
                if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
                        err = -EIO;
        } else {
                /* also the jump target for a failed queue_pages_range() */
up_out:
                if (!list_empty(&pagelist))
                        putback_movable_pages(&pagelist);
        }

        mmap_write_unlock(mm);
mpol_out:
        mpol_put(new);
        return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/*
 * Copy a node mask from user space.
 *
 * @nodes:   kernel nodemask to fill; it is cleared first
 * @nmask:   user-space bitmap, may be NULL
 * @maxnode: size of the user bitmap as passed by the caller; it is
 *           decremented before use, so maxnode - 1 bits are considered
 *
 * Returns 0 on success (also when the decremented @maxnode is 0 or @nmask
 * is NULL, leaving @nodes empty), -EINVAL if the bitmap is larger than
 * PAGE_SIZE * BITS_PER_BYTE bits or has a bit set at or above
 * MAX_NUMNODES, and -EFAULT if user memory can not be read.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        unsigned long k;
        unsigned long t;
        unsigned long nlongs;
        unsigned long endmask;

        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
                return -EINVAL;

        /* endmask selects the valid bits of the last user-supplied long */
        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /*
         * When the user specified more nodes than supported just check
         * if the non supported part is all zero.
         *
         * If maxnode have more longs than MAX_NUMNODES, check
         * the bits in that area first. And then go through to
         * check the rest bits which equal or bigger than MAX_NUMNODES.
         * Otherwise, just check bits [MAX_NUMNODES, maxnode).
         */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                /* From here on only the longs backing MAX_NUMNODES matter. */
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        /*
         * MAX_NUMNODES does not end on a long boundary: the last remaining
         * long may still carry bits at or above MAX_NUMNODES; reject them.
         */
        if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
                unsigned long valid_mask = endmask;

                valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
                if (get_user(t, nmask + nlongs - 1))
                        return -EFAULT;
                if (t & valid_mask)
                        return -EINVAL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        /* Mask off bits beyond the size requested by the caller. */
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        return 0;
}

/*
 * Copy a kernel node mask to user space.
 *
 * The user buffer is taken to be ALIGN(maxnode - 1, 64) / 8 bytes long.  If
 * that is more than the bytes needed for nr_node_ids bits, the excess is
 * zeroed in user space and only the kernel-sized part is copied.
 *
 * Returns 0 on success, -EINVAL if the user buffer exceeds both the
 * kernel-sized part and PAGE_SIZE, and -EFAULT if user memory can not be
 * written.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        unsigned long user_bytes = ALIGN(maxnode-1, 64) / 8;
        unsigned int kernel_bytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);

        if (user_bytes > kernel_bytes) {
                if (user_bytes > PAGE_SIZE)
                        return -EINVAL;
                /* Zero the tail the kernel mask does not cover. */
                if (clear_user((char __user *)mask + kernel_bytes,
                               user_bytes - kernel_bytes))
                        return -EFAULT;
                user_bytes = kernel_bytes;
        }

        if (copy_to_user(mask, nodes_addr(*nodes), user_bytes))
                return -EFAULT;
        return 0;
}

/*
 * Common entry point of the mbind() system call: split @mode into the mode
 * proper and its MPOL_MODE_FLAGS bits, validate both, fetch the user
 * nodemask and hand everything to do_mbind().
 *
 * Returns -EINVAL for an out-of-range mode or when both
 * MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are set, the error from
 * get_nodes(), or the result of do_mbind().
 */
static long kernel_mbind(unsigned long start, unsigned long len,
                         unsigned long mode, const unsigned long __user *nmask,
                         unsigned long maxnode, unsigned int flags)
{
        unsigned short mode_flags = mode & MPOL_MODE_FLAGS;
        nodemask_t nodes;
        int err;

        start = untagged_addr(start);
        mode &= ~MPOL_MODE_FLAGS;

        if (mode >= MPOL_MAX ||
            ((mode_flags & MPOL_F_STATIC_NODES) &&
             (mode_flags & MPOL_F_RELATIVE_NODES)))
                return -EINVAL;

        err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;

        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* mbind() system call: forwards all arguments unchanged to kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
                unsigned long, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode, unsigned int, flags)
{
        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/*
 * Set the process memory policy.
 *
 * Splits @mode into the mode proper and its MPOL_MODE_FLAGS bits, validates
 * both, reads the user nodemask and calls do_set_mempolicy().
 *
 * Returns -EINVAL for an out-of-range mode or when both
 * MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are set, the error from
 * get_nodes(), or the result of do_set_mempolicy().
 */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
                                 unsigned long maxnode)
{
        unsigned short mode_flags = mode & MPOL_MODE_FLAGS;
        nodemask_t mask;
        int err;

        mode &= ~MPOL_MODE_FLAGS;

        if ((unsigned int)mode >= MPOL_MAX ||
            ((mode_flags & MPOL_F_STATIC_NODES) &&
             (mode_flags & MPOL_F_RELATIVE_NODES)))
                return -EINVAL;

        err = get_nodes(&mask, nmask, maxnode);
        if (err)
                return err;

        return do_set_mempolicy(mode, mode_flags, &mask);
}

/* set_mempolicy() system call: forwards to kernel_set_mempolicy(). */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode)
{
        return kernel_set_mempolicy(mode, nmask, maxnode);
}

/*
 * Common entry point of the migrate_pages() system call.
 *
 * @pid:       target process, 0 selects the calling task
 * @maxnode:   size of both user bitmaps, as interpreted by get_nodes()
 * @old_nodes: user bitmap of the nodes to migrate from
 * @new_nodes: user bitmap of the nodes to migrate to
 *
 * Returns the result of do_migrate_pages() on success.  Errors: -ENOMEM
 * (no nodemask scratch space), the error from get_nodes(), -ESRCH (no such
 * task), -EPERM (ptrace access check failed, or target nodes are outside
 * the task's allowed mems without CAP_SYS_NICE), -EINVAL (no target node
 * left after masking with the caller's allowed mems, or the task has no
 * mm), or the error from security_task_movememory().
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
                                const unsigned long __user *old_nodes,
                                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        nodemask_t task_nodes;
        int err;
        nodemask_t *old;
        nodemask_t *new;
        NODEMASK_SCRATCH(scratch);

        if (!scratch)
                return -ENOMEM;

        /* Both masks live in the scratch area. */
        old = &scratch->mask1;
        new = &scratch->mask2;

        err = get_nodes(old, old_nodes, maxnode);
        if (err)
                goto out;

        err = get_nodes(new, new_nodes, maxnode);
        if (err)
                goto out;

        /* Find the mm_struct */
        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                rcu_read_unlock();
                err = -ESRCH;
                goto out;
        }
        /* Pin the task; every exit below goes through put_task_struct(). */
        get_task_struct(task);

        /* Default error for the paths below that do not set their own. */
        err = -EINVAL;

        /*
         * Check if this process has the right to modify the specified process.
         * Use the regular "ptrace_may_access()" checks.
         */
        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                err = -EPERM;
                goto out_put;
        }
        rcu_read_unlock();

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out_put;
        }

        /* Restrict the targets to the nodes the caller itself may use. */
        task_nodes = cpuset_mems_allowed(current);
        nodes_and(*new, *new, task_nodes);
        if (nodes_empty(*new))
                goto out_put;

        err = security_task_movememory(task);
        if (err)
                goto out_put;

        /* The mm holds its own reference, the task can be released now. */
        mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm) {
                err = -EINVAL;
                goto out;
        }

        err = do_migrate_pages(mm, old, new,
                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

        mmput(mm);
out:
        NODEMASK_SCRATCH_FREE(scratch);

        return err;

out_put:
        put_task_struct(task);
        goto out;

}

/* migrate_pages() system call: forwards to kernel_migrate_pages(). */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                const unsigned long __user *, old_nodes,
                const unsigned long __user *, new_nodes)
{
        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}


/*
 * Retrieve NUMA policy on behalf of the get_mempolicy() system call.
 *
 * Queries do_get_mempolicy() and copies the results out: the policy value
 * to @policy (if non-NULL) and the nodemask to @nmask (if non-NULL).
 *
 * Returns 0 on success, -EINVAL if @nmask is given but @maxnode is smaller
 * than nr_node_ids, -EFAULT if @policy can not be written, or the error of
 * do_get_mempolicy() / copy_nodes_to_user().
 */
static int kernel_get_mempolicy(int __user *policy,
                                unsigned long __user *nmask,
                                unsigned long maxnode,
                                unsigned long addr,
                                unsigned long flags)
{
        nodemask_t nodes;
        int pval;
        int err;

        if (nmask != NULL && maxnode < nr_node_ids)
                return -EINVAL;

        err = do_get_mempolicy(&pval, &nodes, untagged_addr(addr), flags);
        if (err)
                return err;

        if (policy && put_user(pval, policy))
                return -EFAULT;

        /* Without a user mask there is nothing left to copy out. */
        if (!nmask)
                return 0;

        return copy_nodes_to_user(nmask, maxnode, &nodes);
}

/* get_mempolicy() system call: forwards to kernel_get_mempolicy(). */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                unsigned long __user *, nmask, unsigned long, maxnode,
                unsigned long, addr, unsigned long, flags)
{
        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

#ifdef CONFIG_COMPAT

/*
 * Compat get_mempolicy(): let kernel_get_mempolicy() write a native bitmap
 * into a staging area obtained from compat_alloc_user_space(), then convert
 * it into the caller's compat bitmap.
 *
 * Returns the result of kernel_get_mempolicy(), or -EFAULT if reading the
 * staging area or writing the compat bitmap fails.  The user-copy helpers
 * are only tested for failure: copy_from_user() and clear_user() report a
 * count of bytes rather than an errno, so their results must not be merged
 * into the return value.
 */
COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                       compat_ulong_t __user *, nmask,
                       compat_ulong_t, maxnode,
                       compat_ulong_t, addr, compat_ulong_t, flags)
{
        long err;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask)
                nm = compat_alloc_user_space(alloc_size);

        err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

        if (!err && nmask) {
                unsigned long copy_size;

                copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
                /*
                 * Read back the native bitmap, zero the entire user bitmap,
                 * then store the converted bits.  Stop at the first failure
                 * so a partially read bitmap is never written to user space.
                 */
                if (copy_from_user(bm, nm, copy_size) ||
                    clear_user(nmask, ALIGN(maxnode-1, 8) / 8) ||
                    compat_put_bitmap(nmask, bm, nr_bits))
                        err = -EFAULT;
        }

        return err;
}

/*
 * Compat set_mempolicy(): convert the caller's compat bitmap into a native
 * bitmap placed in a compat_alloc_user_space() staging area and pass that
 * on to kernel_set_mempolicy().  At most MAX_NUMNODES bits are converted.
 *
 * Returns -EFAULT if the bitmap can not be read or staged, otherwise the
 * result of kernel_set_mempolicy().
 */
COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
                       compat_ulong_t, maxnode)
{
        unsigned long nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        unsigned long staged_bytes = ALIGN(nr_bits, BITS_PER_LONG) / 8;
        unsigned long __user *staged = NULL;
        DECLARE_BITMAP(native, MAX_NUMNODES);

        if (nmask) {
                if (compat_get_bitmap(native, nmask, nr_bits))
                        return -EFAULT;

                staged = compat_alloc_user_space(staged_bytes);
                if (copy_to_user(staged, native, staged_bytes))
                        return -EFAULT;
        }

        return kernel_set_mempolicy(mode, staged, nr_bits+1);
}

/*
 * Compat mbind(): convert the caller's compat bitmap into a native bitmap
 * placed in a compat_alloc_user_space() staging area and pass that on to
 * kernel_mbind().  At most MAX_NUMNODES bits are converted.
 *
 * Returns -EFAULT if the bitmap can not be read or staged, otherwise the
 * result of kernel_mbind().
 */
COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
                       compat_ulong_t, maxnode, compat_ulong_t, flags)
{
        unsigned long nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        unsigned long staged_bytes = ALIGN(nr_bits, BITS_PER_LONG) / 8;
        unsigned long __user *staged = NULL;
        nodemask_t native;

        if (nmask) {
                if (compat_get_bitmap(nodes_addr(native), nmask, nr_bits))
                        return -EFAULT;

                staged = compat_alloc_user_space(staged_bytes);
                if (copy_to_user(staged, nodes_addr(native), staged_bytes))
                        return -EFAULT;
        }

        return kernel_mbind(start, len, mode, staged, nr_bits+1, flags);
}

/*
 * Compat migrate_pages(): convert both compat bitmaps into native bitmaps
 * staged via compat_alloc_user_space() and call kernel_migrate_pages().
 * A NULL @old_nodes / @new_nodes is passed through as NULL.
 *
 * Returns -EFAULT if a bitmap can not be read or staged, otherwise the
 * result of kernel_migrate_pages().
 */
COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
                       compat_ulong_t, maxnode,
                       const compat_ulong_t __user *, old_nodes,
                       const compat_ulong_t __user *, new_nodes)
{
        unsigned long __user *old = NULL;
        unsigned long __user *new = NULL;
        nodemask_t tmp_mask;
        unsigned long nr_bits;
        unsigned long size;

        nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
        size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
        if (old_nodes) {
                if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
                        return -EFAULT;
                /*
                 * When both masks are given, one allocation of twice the
                 * size holds them back to back: "old" first, "new" after it.
                 */
                old = compat_alloc_user_space(new_nodes ? size * 2 : size);
                if (new_nodes)
                        new = old + size / sizeof(unsigned long);
                if (copy_to_user(old, nodes_addr(tmp_mask), size))
                        return -EFAULT;
        }
        if (new_nodes) {
                if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
                        return -EFAULT;
                /* No "old" mask was staged, so "new" needs its own area. */
                if (new == NULL)
                        new = compat_alloc_user_space(size);
                if (copy_to_user(new, nodes_addr(tmp_mask), size))
                        return -EFAULT;
        }
        return kernel_migrate_pages(pid, nr_bits + 1, old, new);
}

#endif /* CONFIG_COMPAT */

/*
 * Tell whether the pages of @vma are candidates for migration.
 * Returns false as soon as one of the checks below rules it out.
 */
bool vma_migratable(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                return false;

        /*
         * DAX device mappings require predictable access latency, so avoid
         * incurring periodic faults.
         */
        if (vma_is_dax(vma))
                return false;

        /* hugetlb mappings need migration support for their hstate */
        if (is_vm_hugetlb_page(vma) &&
                !hugepage_migration_supported(hstate_vma(vma)))
                return false;

        /* Mappings without a backing file pass at this point. */
        if (!vma->vm_file)
                return true;

        /*
         * Migration allocates pages in the highest zone. If we cannot
         * do so then migration (at least from node to node) is not
         * possible.
         */
        return gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
                        >= policy_zone;
}

/*
 * Return the policy attached to @vma for @addr, or NULL when @vma is NULL
 * or carries no policy.  A get_policy() vm_op takes precedence over
 * vma->vm_policy.  No fallback to the task policy is done here.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                                                unsigned long addr)
{
        struct mempolicy *pol;

        if (!vma)
                return NULL;

        if (vma->vm_ops && vma->vm_ops->get_policy)
                return vma->vm_ops->get_policy(vma, addr);

        pol = vma->vm_policy;

        /*
         * shmem_alloc_page() passes MPOL_F_SHARED policy with
         * a pseudo vma whose vma->vm_ops=NULL. Take a reference
         * count on these policies which will be dropped by
         * mpol_cond_put() later
         */
        if (pol && mpol_needs_cond_ref(pol))
                mpol_get(pol);

        return pol;
}

/*
 * get_vma_policy(@vma, @addr)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 *
 * Returns the policy found by __get_vma_policy() for @vma at @addr and, if
 * there is none, the result of get_task_policy(current).
 * Shared policies [those marked as MPOL_F_SHARED] carry an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  The caller is responsible for dropping that
 * extra reference.
 */
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                                unsigned long addr)
{
        struct mempolicy *vma_pol = __get_vma_policy(vma, addr);

        return vma_pol ? vma_pol : get_task_policy(current);
}

/*
 * Report whether the policy that applies to @vma has MPOL_F_MOF set.
 *
 * With a get_policy() vm_op the policy at vma->vm_start is examined (a NULL
 * result counts as "not set") and then released with mpol_cond_put().
 * Otherwise vma->vm_policy is used, falling back to the task policy.
 */
bool vma_policy_mof(struct vm_area_struct *vma)
{
        struct mempolicy *pol;

        if (vma->vm_ops && vma->vm_ops->get_policy) {
                bool mof;

                pol = vma->vm_ops->get_policy(vma, vma->vm_start);
                mof = pol && (pol->flags & MPOL_F_MOF);
                mpol_cond_put(pol);

                return mof;
        }

        pol = vma->vm_policy ? vma->vm_policy : get_task_policy(current);

        return (pol->flags & MPOL_F_MOF) != 0;
}

static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
        enum zone_type dynamic_policy_zone = policy_zone;

        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

        /*
         * if policy->v.nodes has movable memory only,
         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
         *
         * policy->v.nodes is intersect with node_states[N_MEMORY].
         * so if the following test faile, it implies
         * policy->v.nodes has movable memory only.
         */
        if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
                dynamic_policy_zone = ZONE_MOVABLE;

        return zone >= dynamic_policy_zone;
}

/*
 * Return the nodemask that @policy imposes on a page allocation with the
 * given @gfp flags, or NULL when no mask is to be applied.  Only MPOL_BIND
 * policies ever yield a mask.
 */
nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
        if (unlikely(policy->mode == MPOL_BIND)) {
                /* Lower zones don't get a nodemask applied for MPOL_BIND */
                if (apply_policy_zone(policy, gfp_zone(gfp)) &&
                    cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
                        return &policy->v.nodes;
        }

        return NULL;
}

/*
 * Return the node id preferred by the given mempolicy, or the given id.
 *
 * Only an MPOL_PREFERRED policy without MPOL_F_LOCAL overrides @nd.
 */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
        if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
                return policy->v.preferred_node;

        /*
         * __GFP_THISNODE shouldn't even be used with the bind policy
         * because we might easily break the expectation to stay on the
         * requested node and not break the policy.
         */
        WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));

        return nd;
}

/*
 * Dynamic interleaving for the current task: return the node following
 * current->il_prev in policy->v.nodes and, when that node id is below
 * MAX_NUMNODES, remember it in current->il_prev for the next call.
 */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        struct task_struct *tsk = current;
        unsigned nid = next_node_in(tsk->il_prev, policy->v.nodes);

        if (nid >= MAX_NUMNODES)
                return nid;

        tsk->il_prev = nid;
        return nid;
}

/*
 * Choose the node from which the next slab entry should be allocated,
 * according to the current task's memory policy.
 *
 * The local memory node (numa_mem_id()) is returned when called from
 * interrupt context, when the task has no policy, or when the policy carries
 * MPOL_F_LOCAL.
 */
unsigned int mempolicy_slab_node(void)
{
        int local_nid = numa_mem_id();
        struct mempolicy *pol;
        struct zonelist *zl;
        struct zoneref *zref;

        if (in_interrupt())
                return local_nid;

        pol = current->mempolicy;
        if (!pol)
                return local_nid;
        if (pol->flags & MPOL_F_LOCAL)
                return local_nid;

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);

        case MPOL_PREFERRED:
                /* MPOL_F_LOCAL was already handled above */
                return pol->v.preferred_node;

        case MPOL_BIND:
                /*
                 * Follow bind policy behavior and start allocation at the
                 * first node: take the first zone of the local fallback
                 * zonelist that lies in the policy's nodemask.
                 */
                zl = &NODE_DATA(local_nid)->node_zonelists[ZONELIST_FALLBACK];
                zref = first_zones_zonelist(zl, gfp_zone(GFP_KERNEL),
                                            &pol->v.nodes);
                if (!zref->zone)
                        return local_nid;
                return zone_to_nid(zref->zone);

        default:
                BUG();
        }
}

/*
 * Static interleaving for a VMA with known offset @n.  Returns the n'th node
 * of pol->v.nodes (counting from n=0); @n is truncated to unsigned int and
 * reduced modulo the number of nodes in the mask, so it wraps around.  With
 * an empty mask the current node id is returned instead.
 */
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
        unsigned weight = nodes_weight(pol->v.nodes);
        unsigned hops;
        int node;

        if (weight == 0)
                return numa_node_id();

        hops = (unsigned int)n % weight;

        /* Start at the first node in the mask and step forward 'hops' times. */
        node = first_node(pol->v.nodes);
        while (hops--)
                node = next_node(node, pol->v.nodes);

        return node;
}

/*
 * Determine a node number for interleave.
 *
 * Without a VMA the task's dynamic interleave state is used.  With a VMA the
 * node is derived statically from the offset of @addr, measured in units of
 * (1 << @shift) bytes.
 */
static inline unsigned interleave_nid(struct mempolicy *pol,
                 struct vm_area_struct *vma, unsigned long addr, int shift)
{
        unsigned long index;

        if (!vma)
                return interleave_nodes(pol);

        /*
         * For small pages shift == PAGE_SHIFT, so the vm_pgoff shift below is
         * by zero.  For huge pages vm_pgoff is still in units of small pages,
         * so the always-zero low bits are shifted off to get a useful offset.
         */
        BUG_ON(shift < PAGE_SHIFT);
        index = (vma->vm_pgoff >> (shift - PAGE_SHIFT)) +
                ((addr - vma->vm_start) >> shift);

        return offset_il_node(pol, index);
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist.
 *
 * Must be protected by read_mems_allowed_begin()
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask)
{
        struct mempolicy *pol = get_vma_policy(vma, addr);
        int nid;

        /* Hand the policy back to the caller, who drops it after allocating. */
        *mpol = pol;
        *nodemask = NULL;        /* only MPOL_BIND overrides this below */

        if (unlikely(pol->mode == MPOL_INTERLEAVE))
                return interleave_nid(pol, vma, addr,
                                      huge_page_shift(hstate_vma(vma)));

        nid = policy_node(gfp_flags, pol, numa_node_id());
        if (pol->mode == MPOL_BIND)
                *nodemask = &pol->v.nodes;

        return nid;
}

/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
        struct mempolicy *pol;
        int nid;

        /* Nothing to fill in, or the task runs with the default policy. */
        if (!mask || !current->mempolicy)
                return false;

        task_lock(current);
        pol = current->mempolicy;
        switch (pol->mode) {
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* Copy the policy's whole nodemask. */
                *mask = pol->v.nodes;
                break;

        case MPOL_PREFERRED:
                /* Single-node mask: the local node or the preferred one. */
                nid = (pol->flags & MPOL_F_LOCAL) ? numa_node_id()
                                                  : pol->v.preferred_node;
                init_nodemask_of_node(mask, nid);
                break;

        default:
                BUG();
        }
        task_unlock(current);

        return true;
}
#endif

/*
 * mempolicy_nodemask_intersects
 *
 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
 * policy.  Otherwise, check for intersection between mask and the policy
 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
 * policy, always return true since it may allocate elsewhere on fallback.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
                                        const nodemask_t *mask)
{
        struct mempolicy *pol;
        bool intersects = true;

        if (!mask)
                return true;

        task_lock(tsk);
        pol = tsk->mempolicy;
        if (pol) {
                switch (pol->mode) {
                case MPOL_BIND:
                case MPOL_INTERLEAVE:
                        intersects = nodes_intersects(pol->v.nodes, *mask);
                        break;
                case MPOL_PREFERRED:
                        /*
                         * MPOL_PREFERRED and MPOL_F_LOCAL only name a
                         * preferred node; allocation may fall back to other
                         * nodes when oom, so tsk may well have memory on
                         * nodes in mask.  Keep the default answer of true.
                         */
                        break;
                default:
                        BUG();
                }
        }
        task_unlock(tsk);

        return intersects;
}

/*
 * Allocate a page for an interleave policy on node @nid.  This has its own
 * path because it does special accounting: when NUMA stats are enabled and
 * the page really came from @nid, NUMA_INTERLEAVE_HIT is bumped for the
 * page's zone.
 */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                        unsigned nid)
{
        struct page *pg = __alloc_pages(gfp, order, nid);

        /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
        if (!static_branch_likely(&vm_numa_stat_key))
                return pg;

        /* Allocation failed: nothing to account. */
        if (!pg)
                return pg;

        /* The page landed on a different node: not a hit. */
        if (page_to_nid(pg) != nid)
                return pg;

        preempt_disable();
        __inc_numa_state(page_zone(pg), NUMA_INTERLEAVE_HIT);
        preempt_enable();

        return pg;
}

/**
 *         alloc_pages_vma        - Allocate a page for a VMA.
 *
 *         @gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocations,
 *      %GFP_HIGHMEM highmem/user allocations,
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 *        @order:Order of the GFP allocation.
 *         @vma:  Pointer to VMA or NULL if not available.
 *        @addr: Virtual Address of the allocation. Must be inside the VMA.
 *        @node: Which node to prefer for allocation (modulo policy).
 *        @hugepage: for hugepages try only the preferred node if possible
 *
 *         This function allocates a page from the kernel page pool and applies
 *        a NUMA policy associated with the VMA or the current process.
 *        When VMA is not NULL caller must read-lock the mmap_lock of the
 *        mm_struct of the VMA to prevent it from going away. Should be used for
 *        all allocations for pages that will be mapped into user space. Returns
 *        NULL when no page can be allocated.
 */
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr, int node, bool hugepage)
{
        struct mempolicy *pol;
        struct page *page;
        int preferred_nid;
        nodemask_t *nmask;

        /*
         * Effective policy for (vma, addr).  Every path below releases it
         * with mpol_cond_put() exactly once.
         */
        pol = get_vma_policy(vma, addr);

        if (pol->mode == MPOL_INTERLEAVE) {
                unsigned nid;

                /* Interleave in units of the allocation size (PAGE_SHIFT + order). */
                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
                mpol_cond_put(pol);
                page = alloc_page_interleave(gfp, order, nid);
                goto out;
        }

        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
                int hpage_node = node;

                /*
                 * For hugepage allocation and non-interleave policy which
                 * allows the current node (or other explicitly preferred
                 * node) we only try to allocate from the current/preferred
                 * node and don't fall back to other nodes, as the cost of
                 * remote accesses would likely offset THP benefits.
                 *
                 * If the policy is interleave, or does not allow the current
                 * node in its nodemask, we allocate the standard way.
                 */
                if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
                        hpage_node = pol->v.preferred_node;

                nmask = policy_nodemask(gfp, pol);
                if (!nmask || node_isset(hpage_node, *nmask)) {
                        /*
                         * NOTE(review): when nmask is non-NULL it points at
                         * pol->v.nodes (see policy_nodemask()), and it is
                         * still passed to __alloc_pages_nodemask() below
                         * after this put -- confirm the policy cannot be
                         * freed in between.
                         */
                        mpol_cond_put(pol);
                        /*
                         * First, try to allocate THP only on local node, but
                         * don't reclaim unnecessarily, just compact.
                         */
                        page = __alloc_pages_node(hpage_node,
                                gfp | __GFP_THISNODE | __GFP_NORETRY, order);

                        /*
                         * If hugepage allocations are configured to always
                         * synchronous compact or the vma has been madvised
                         * to prefer hugepage backing, retry allowing remote
                         * memory with both reclaim and compact as well.
                         */
                        if (!page && (gfp & __GFP_DIRECT_RECLAIM))
                                page = __alloc_pages_nodemask(gfp, order,
                                                        hpage_node, nmask);

                        goto out;
                }
                /* Otherwise fall through to the standard allocation below. */
        }

        /* Standard path: policy nodemask (if any) plus policy-chosen start node. */
        nmask = policy_nodemask(gfp, pol);
        preferred_nid = policy_node(gfp, pol, node);
        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
        mpol_cond_put(pol);
out:
        return page;
}
EXPORT_SYMBOL(alloc_pages_vma);

/**
 *         alloc_pages_current - Allocate pages.
 *
 *        @gfp:
 *                %GFP_USER   user allocation,
 *              %GFP_KERNEL kernel allocation,
 *              %GFP_HIGHMEM highmem allocation,
 *              %GFP_FS     don't call back into a file system.
 *              %GFP_ATOMIC don't sleep.
 *        @order: Power of two of allocation size in pages. 0 is a single page.
 *
 *        Allocate a page from the kernel page pool.  When not in
 *        interrupt context, apply the current process NUMA policy.
 *        Returns NULL when no page can be allocated.
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol;

        /*
         * The system default policy is used in interrupt context and for
         * __GFP_THISNODE requests; everything else follows the task policy.
         * Neither current->mempolicy nor default_policy needs reference
         * counting here.
         */
        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        if (pol->mode == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));

        return __alloc_pages_nodemask(gfp, order,
                        policy_node(gfp, pol, numa_node_id()),
                        policy_nodemask(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/*
 * Give @dst its own duplicate of @src's memory policy.
 * Returns 0 on success, or the error encoded by mpol_dup() on failure, in
 * which case @dst->vm_policy is left untouched.
 */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        struct mempolicy *copy;

        copy = mpol_dup(vma_policy(src));
        if (!IS_ERR(copy)) {
                dst->vm_policy = copy;
                return 0;
        }

        return PTR_ERR(copy);
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);

        /* task's mempolicy is protected by alloc_lock */
        if (old == current->mempolicy) {
                task_lock(current);
                *new = *old;
                task_unlock(current);
        } else
                *new = *old;

        if (current_cpuset_is_being_rebound()) {
                nodemask_t mems = cpuset_mems_allowed(current);
                mpol_rebind_policy(new, &mems);
        }
        atomic_set(&new->refcnt, 1);
        return new;
}

/*
 * Slow path of a mempolicy comparison.  Two policies are equal when mode and
 * flags match, the user nodemasks match (only compared when
 * mpol_store_user_nodemask() says @a stores one), and the mode-specific
 * payload matches.  A NULL policy never compares equal here.
 */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b || a->mode != b->mode || a->flags != b->flags)
                return false;

        if (mpol_store_user_nodemask(a) &&
            !nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
                return false;

        switch (a->mode) {
        case MPOL_PREFERRED:
                /* flags are identical, so "local" on a means "local" on b */
                return (a->flags & MPOL_F_LOCAL) ||
                       a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                return !!nodes_equal(a->v.nodes, b->v.nodes);
        default:
                BUG();
                return false;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * Find the first (lowest) element whose range intersects [start, end).
 * Returns NULL when nothing intersects.  Caller holds sp->lock for reading
 * or for writing.
 */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *cur = sp->root.rb_node;
        struct rb_node *before;

        /* Descend until some node overlapping the range is hit. */
        while (cur) {
                struct sp_node *here = rb_entry(cur, struct sp_node, nd);

                if (start >= here->end) {
                        cur = cur->rb_right;
                        continue;
                }
                if (end <= here->start) {
                        cur = cur->rb_left;
                        continue;
                }
                break;
        }
        if (!cur)
                return NULL;

        /* Walk back through predecessors that still reach past 'start'. */
        for (before = rb_prev(cur); before; before = rb_prev(cur)) {
                if (rb_entry(before, struct sp_node, nd)->end <= start)
                        break;
                cur = before;
        }

        return rb_entry(cur, struct sp_node, nd);
}

/*
 * Link @new into the shared policy rb-tree.  Caller holds sp->lock for
 * writing.  A node that neither starts before nor ends after an existing
 * node on the descent path is a bug.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **link = &sp->root.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct sp_node *cur;

                parent = *link;
                cur = rb_entry(parent, struct sp_node, nd);

                if (new->start < cur->start) {
                        link = &parent->rb_left;
                        continue;
                }
                if (new->end > cur->end) {
                        link = &parent->rb_right;
                        continue;
                }
                BUG();
        }

        rb_link_node(&new->nd, parent, link);
        rb_insert_color(&new->nd, &sp->root);
        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->mode : 0);
}

/*
 * Find the shared policy covering index @idx.  Returns the policy with an
 * extra reference taken (mpol_get()), or NULL when the tree is empty or no
 * entry covers @idx.
 */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *found = NULL;
        struct sp_node *node;

        /* Empty tree: skip taking the lock altogether. */
        if (!sp->root.rb_node)
                return NULL;

        read_lock(&sp->lock);
        node = sp_lookup(sp, idx, idx + 1);
        if (node) {
                found = node->policy;
                mpol_get(found);
        }
        read_unlock(&sp->lock);

        return found;
}

/*
 * Release a shared policy node: drop its reference on the attached policy,
 * then return the node itself to sn_cache.
 */
static void sp_free(struct sp_node *n)
{
        struct mempolicy *attached = n->policy;

        mpol_put(attached);
        kmem_cache_free(sn_cache, n);
}

/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where page mapped
 * @addr: virtual address where page mapped
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.
 *
 * Returns:
 *        -1        - not misplaced, page is in the right node
 *        node        - node id where the page should be
 *
 * Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 */
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol;
        struct zoneref *z;
        int curnid = page_to_nid(page);         /* node the page lives on now */
        unsigned long pgoff;
        int thiscpu = raw_smp_processor_id();
        int thisnid = cpu_to_node(thiscpu);     /* node of the CPU running us */
        int polnid = NUMA_NO_NODE;              /* node the policy asks for */
        int ret = -1;                           /* -1: not misplaced */

        pol = get_vma_policy(vma, addr);
        /* Policies without MPOL_F_MOF never report a page as misplaced. */
        if (!(pol->flags & MPOL_F_MOF))
                goto out;

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                /* Static interleave on the small-page offset of addr. */
                pgoff = vma->vm_pgoff;
                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
                polnid = offset_il_node(pol, pgoff);
                break;

        case MPOL_PREFERRED:
                if (pol->flags & MPOL_F_LOCAL)
                        polnid = numa_node_id();
                else
                        polnid = pol->v.preferred_node;
                break;

        case MPOL_BIND:

                /*
                 * allows binding to multiple nodes.
                 * use current page if in policy nodemask,
                 * else select nearest allowed node, if any.
                 * If no allowed nodes, use current [!misplaced].
                 */
                if (node_isset(curnid, pol->v.nodes))
                        goto out;
                z = first_zones_zonelist(
                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
                                gfp_zone(GFP_HIGHUSER),
                                &pol->v.nodes);
                /*
                 * NOTE(review): z->zone is used without a NULL check here,
                 * whereas mempolicy_slab_node() tests z->zone after the same
                 * lookup -- confirm a NULL zone is impossible on this path,
                 * since the comment above promises a "no allowed nodes" case.
                 */
                polnid = zone_to_nid(z->zone);
                break;

        default:
                BUG();
        }

        /* Migrate the page towards the node whose CPU is referencing it */
        if (pol->flags & MPOL_F_MORON) {
                /* Overrides whatever node the mode switch above selected. */
                polnid = thisnid;

                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
                        goto out;
        }

        if (curnid != polnid)
                ret = polnid;
out:
        mpol_cond_put(pol);

        return ret;
}

/*
 * Detach @task's mempolicy and drop the (possibly final) reference to it.
 *
 * The reference is dropped only after task->mempolicy has been set to NULL,
 * so that any allocation done as part of the policy's kmem_cache_free(),
 * such as by KASAN, doesn't reference a freed policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
        struct mempolicy *detached;

        task_lock(task);
        detached = task->mempolicy;
        task->mempolicy = NULL;
        task_unlock(task);

        mpol_put(detached);
}

/*
 * Unlink @n from @sp's rb-tree and free it (node and policy reference) via
 * sp_free().  Both callers in this file invoke it with sp->lock held for
 * writing.
 */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        /* Print the range as "start-end", the same shape sp_insert() uses. */
        pr_debug("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        sp_free(n);
}

/*
 * Fill in the range [start, end) and the policy of a shared policy node.
 * The node's rb-tree linkage is not touched.
 */
static void sp_node_init(struct sp_node *node, unsigned long start,
                        unsigned long end, struct mempolicy *pol)
{
        node->policy = pol;
        node->start = start;
        node->end = end;
}

/*
 * Allocate a shared policy node for [start, end) carrying a private
 * duplicate of @pol marked MPOL_F_SHARED.  Returns NULL when either the node
 * allocation or the policy duplication fails.
 */
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *node = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        struct mempolicy *copy;

        if (!node)
                return NULL;

        copy = mpol_dup(pol);
        if (IS_ERR(copy))
                goto free_node;

        copy->flags |= MPOL_F_SHARED;
        sp_node_init(node, start, end, copy);
        return node;

free_node:
        kmem_cache_free(sn_cache, node);
        return NULL;
}

/*
 * Replace a policy range.
 *
 * Removes or trims every existing node overlapping [start, end) and then, if
 * @new is non-NULL, inserts it.  Returns 0 on success or -ENOMEM when the
 * extra node/policy needed to split an old range cannot be allocated.
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n;
        struct sp_node *n_new = NULL;           /* spare node for a split */
        struct mempolicy *mpol_new = NULL;      /* spare policy for a split */
        int ret = 0;

restart:
        write_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* Fetch the successor first: n may be deleted below. */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* Old node begins inside the new range. */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                /*
                                 * Splitting needs a second node; it cannot be
                                 * allocated here, so drop the lock, allocate,
                                 * and start over.
                                 */
                                if (!n_new)
                                        goto alloc_new;

                                /* Tail piece [end, n->end) gets a copy of the old policy. */
                                *mpol_new = *n->policy;
                                atomic_set(&mpol_new->refcnt, 1);
                                sp_node_init(n_new, end, n->end, mpol_new);
                                n->end = start;
                                sp_insert(sp, n_new);
                                /* Ownership moved into the tree; don't free below. */
                                n_new = NULL;
                                mpol_new = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        write_unlock(&sp->lock);
        ret = 0;

err_out:
        /* Release spares that were allocated but ended up unused. */
        if (mpol_new)
                mpol_put(mpol_new);
        if (n_new)
                kmem_cache_free(sn_cache, n_new);

        return ret;

alloc_new:
        /* Allocate the spares with the lock dropped, then retry from scratch. */
        write_unlock(&sp->lock);
        ret = -ENOMEM;
        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n_new)
                goto err_out;
        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!mpol_new)
                goto err_out;
        atomic_set(&mpol_new->refcnt, 1);
        goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
        int ret;

        sp->root = RB_ROOT;                /* empty tree == default mempolicy */
        rwlock_init(&sp->lock);

        if (mpol) {
                struct vm_area_struct pvma;
                struct mempolicy *new;
                NODEMASK_SCRATCH(scratch);

                /* No scratch space: leave the tree empty, just drop @mpol. */
                if (!scratch)
                        goto put_mpol;
                /* contextualize the tmpfs mount point mempolicy */
                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(new))
                        goto free_scratch; /* no valid nodemask intersection */

                /* mpol_set_nodemask() is called under task_lock(current). */
                task_lock(current);
                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                if (ret)
                        goto put_new;

                /* Create pseudo-vma that contains just the policy */
                vma_init(&pvma, NULL);
                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
                /* Return value is ignored: on failure the tree stays empty. */
                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

                /* Cleanup ladder: each label also runs the ones below it. */
put_new:
                mpol_put(new);                        /* drop initial ref */
free_scratch:
                NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
                mpol_put(mpol);        /* drop our incoming ref on sb mpol */
        }
}

/*
 * Install @npol for the page-offset range covered by @vma in the shared
 * policy tree @info, or just clear that range when @npol is NULL.
 * Returns 0 on success or -ENOMEM.
 */
int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        unsigned long npages = vma_pages(vma);
        unsigned long first = vma->vm_pgoff;
        unsigned long last = first + npages;
        struct sp_node *node = NULL;
        int err;

        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
                 first,
                 npages, npol ? npol->mode : -1,
                 npol ? npol->flags : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);

        if (npol) {
                node = sp_alloc(first, last, npol);
                if (!node)
                        return -ENOMEM;
        }

        err = shared_policy_replace(info, first, last, node);
        /* On failure the node was never inserted, so it is still ours. */
        if (err && node)
                sp_free(node);

        return err;
}

/*
 * Free a backing policy store on inode delete: remove and free every node
 * of the shared policy tree.  An empty tree is left alone without taking
 * the lock.
 */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct rb_node *cursor;

        if (!p->root.rb_node)
                return;

        write_lock(&p->lock);
        for (cursor = rb_first(&p->root); cursor; ) {
                struct sp_node *victim = rb_entry(cursor, struct sp_node, nd);

                /* Step forward before the node is erased and freed. */
                cursor = rb_next(cursor);
                sp_delete(p, victim);
        }
        write_unlock(&p->lock);
}

#ifdef CONFIG_NUMA_BALANCING
/* Set by setup_numabalancing(): 1 enables, -1 disables, 0 means no override. */
static int __initdata numabalancing_override;

/*
 * Apply the boot-time NUMA balancing state.  An explicit numa_balancing=
 * override always wins; otherwise, on systems with more than one online
 * node, the build-time default is announced and applied.
 */
static void __init check_numabalancing_enable(void)
{
        bool numabalancing_default =
                IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED);

        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
        if (numabalancing_override) {
                set_numabalancing_state(numabalancing_override == 1);
                return;
        }

        if (num_online_nodes() > 1) {
                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
                        numabalancing_default ? "Enabling" : "Disabling");
                set_numabalancing_state(numabalancing_default);
        }
}

/*
 * Handler for the "numa_balancing=" boot parameter.  Accepts "enable" or
 * "disable" and records the choice in numabalancing_override.  Returns 1
 * when the value was recognised; otherwise warns and returns 0.
 */
static int __init setup_numabalancing(char *str)
{
        if (str && !strcmp(str, "enable")) {
                numabalancing_override = 1;
                return 1;
        }

        if (str && !strcmp(str, "disable")) {
                numabalancing_override = -1;
                return 1;
        }

        pr_warn("Unable to parse numa_balancing=\n");
        return 0;
}
__setup("numa_balancing=", setup_numabalancing);
#else
/* Without CONFIG_NUMA_BALANCING there is nothing to set up: no-op stub. */
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Boot-time initialisation of the memory policy code: create the slab caches
 * for policies and shared policy nodes, fill in the per-node
 * preferred_node_policy[] table, install an interleave policy for system
 * init, and finally settle the NUMA balancing state.
 *
 * The original note on this function says it assumes fs == KERNEL_DS.
 */
void __init numa_policy_init(void)
{
        nodemask_t il_mask;
        unsigned long biggest = 0;
        int fallback_nid = 0;
        int nid;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        for_each_node(nid) {
                struct mempolicy *pref = &preferred_node_policy[nid];

                *pref = (struct mempolicy) {
                        .refcnt = ATOMIC_INIT(1),
                        .mode = MPOL_PREFERRED,
                        .flags = MPOL_F_MOF | MPOL_F_MORON,
                        .v = { .preferred_node = nid, },
                };
        }

        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(il_mask);
        for_each_node_state(nid, N_MEMORY) {
                unsigned long pages = node_present_pages(nid);

                /* Track the largest node as the fallback choice. */
                if (pages > biggest) {
                        biggest = pages;
                        fallback_nid = nid;
                }

                /* Too small to take part in interleaving. */
                if ((pages << PAGE_SHIFT) < (16 << 20))
                        continue;

                node_set(nid, il_mask);
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(il_mask)))
                node_set(fallback_nid, il_mask);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &il_mask))
                pr_err("%s: interleaving failed\n", __func__);

        check_numabalancing_enable();
}

/*
 * Reset policy of current process to default, by asking do_set_mempolicy()
 * for MPOL_DEFAULT with no flags and no nodemask.  The result of that call
 * is not checked.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */

/*
 * Printable names of the mempolicy modes, indexed by MPOL_* mode value.
 * "local" has its own entry here although internally it is implemented as
 * MPOL_PREFERRED with the MPOL_F_LOCAL flag.
 */
static const char * const policy_modes[] = {
        [MPOL_LOCAL]      = "local",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_BIND]       = "bind",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_DEFAULT]    = "default",
};


#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *        <mode>[=<flags>][:<nodelist>]
 *
 * On success, returns 0, else 1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        struct mempolicy *new = NULL;
        unsigned short mode_flags;
        nodemask_t nodes;
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int err = 1, mode;

        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                if (!nodes_subset(nodes, node_states[N_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        mode = match_string(policy_modes, MPOL_MAX, str);
        if (mode < 0)
                goto out;

        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only, although later
                 * we use first_node(nodes) to grab a single node, so here
                 * nodelist (or nodes) cannot be empty.
                 */
                if (nodelist) {
                        char *rest = nodelist;
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                        if (nodes_empty(nodes))
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist;  mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                mode = MPOL_PREFERRED;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on a empty nodelist
                 */
                if (!nodelist)
                        err = 0;
                goto out;
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        /*
         * Save nodes for mpol_to_str() to show the tmpfs mount options
         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
         */
        if (mode != MPOL_PREFERRED)
                new->v.nodes = nodes;
        else if (nodelist)
                new->v.preferred_node = first_node(nodes);
        else
                new->flags |= MPOL_F_LOCAL;

        /*
         * Save nodes for contextualization: this will be used to "clone"
         * the mempolicy in a specific context [cpuset] at a later time.
         */
        new->w.user_nodemask = nodes;

        err = 0;

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        if (!err)
                *mpol = new;
        return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string of the form <mode>[=<flags>][:<nodelist>].
 * A NULL @pol, &default_policy, or a policy carrying MPOL_F_MORON is
 * printed as "default".  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
 * longest flag, "relative", and to display at least a few node ids.
 *
 * All pieces are appended with scnprintf(), which returns the number of
 * characters actually stored rather than the length the output would have
 * had.  The write cursor therefore never moves past the end of @buffer and
 * the remaining-size argument of the following calls never goes negative
 * when the output is truncated.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        char *end = buffer + maxlen;
        nodemask_t nodes = NODE_MASK_NONE;
        unsigned short mode = MPOL_DEFAULT;
        unsigned short flags = 0;

        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
                mode = pol->mode;
                flags = pol->flags;
        }

        switch (mode) {
        case MPOL_DEFAULT:
                break;
        case MPOL_PREFERRED:
                /* A preferred policy without a node is shown as "local". */
                if (flags & MPOL_F_LOCAL)
                        mode = MPOL_LOCAL;
                else
                        node_set(pol->v.preferred_node, nodes);
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                nodes = pol->v.nodes;
                break;
        default:
                WARN_ON_ONCE(1);
                scnprintf(p, maxlen, "unknown");
                return;
        }

        p += scnprintf(p, end - p, "%s", policy_modes[mode]);

        if (flags & MPOL_MODE_FLAGS) {
                p += scnprintf(p, end - p, "=");

                /*
                 * Currently, the only defined flags are mutually exclusive
                 */
                if (flags & MPOL_F_STATIC_NODES)
                        p += scnprintf(p, end - p, "static");
                else if (flags & MPOL_F_RELATIVE_NODES)
                        p += scnprintf(p, end - p, "relative");
        }

        if (!nodes_empty(nodes))
                p += scnprintf(p, end - p, ":%*pbl",
                               nodemask_pr_args(&nodes));
}


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 


    1 


    1 



    1 
    1 













    1 






















    1 


































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
// SPDX-License-Identifier: GPL-2.0
/*
 * Basic worker thread pool for io_uring
 *
 * Copyright (C) 2019 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
#include <linux/tracehook.h>
#include <uapi/linux/io_uring.h>

#include "io-wq.h"

/* How long an idle worker sleeps in io_wqe_worker() before it times out */
#define WORKER_IDLE_TIMEOUT        (5 * HZ)

/* Bit flags stored in io_worker->flags */
enum {
        IO_WORKER_F_UP                = 1,        /* up and active */
        IO_WORKER_F_RUNNING        = 2,        /* account as running */
        IO_WORKER_F_FREE        = 4,        /* worker on free list */
        IO_WORKER_F_BOUND        = 8,        /* is doing bounded work */
};

/* Bit numbers in io_wq->state */
enum {
        IO_WQ_BIT_EXIT                = 0,        /* wq exiting */
};

/* Bit numbers in io_wqe_acct->flags */
enum {
        IO_ACCT_STALLED_BIT        = 0,        /* stalled on hash */
};

/*
 * One for each thread in a wqe pool
 */
struct io_worker {
        /* reference count; ref_done is completed when it drops to zero */
        refcount_t ref;
        /* IO_WORKER_F_* bits */
        unsigned flags;
        /* linkage on wqe->free_list while IO_WORKER_F_FREE is set */
        struct hlist_nulls_node nulls_node;
        /* list linkage, removed in io_worker_exit(); presumably on wqe->all_list */
        struct list_head all_list;
        /* the worker thread; woken with wake_up_process() */
        struct task_struct *task;
        /* pool this worker belongs to */
        struct io_wqe *wqe;

        /* work item currently assigned, written under ->lock */
        struct io_wq_work *cur_work;
        spinlock_t lock;

        /* signalled by io_worker_release() once ->ref reaches zero */
        struct completion ref_done;

        /* bit 0 guards ownership of create_work/create_index */
        unsigned long create_state;
        /* task_work used to ask wq->task to create another worker */
        struct callback_head create_work;
        /* index into wqe->acct[] the queued creation request is for */
        int create_index;

        union {
                /* used by kfree_rcu() in io_worker_exit() */
                struct rcu_head rcu;
                struct work_struct work;
        };
};

/*
 * Number of hash buckets is 1 << IO_WQ_HASH_ORDER, i.e. 64 when
 * BITS_PER_LONG is 64 and 32 otherwise.
 */
#if BITS_PER_LONG == 64
#define IO_WQ_HASH_ORDER        6
#else
#define IO_WQ_HASH_ORDER        5
#endif

#define IO_WQ_NR_HASH_BUCKETS        (1u << IO_WQ_HASH_ORDER)

/*
 * Accounting for one class of workers (see IO_WQ_ACCT_*) within a wqe.
 */
struct io_wqe_acct {
        /* workers accounted to this class; modified under wqe->lock */
        unsigned nr_workers;
        /* upper limit for nr_workers */
        unsigned max_workers;
        /* position of this entry in wqe->acct[] */
        int index;
        /* workers currently accounted as running */
        atomic_t nr_running;
        /* pending work items of this class */
        struct io_wq_work_list work_list;
        /* IO_ACCT_*_BIT bits */
        unsigned long flags;
};

/* Indices into io_wqe->acct[], see io_get_acct() */
enum {
        IO_WQ_ACCT_BOUND,
        IO_WQ_ACCT_UNBOUND,
        IO_WQ_ACCT_NR,        /* number of accounting classes */
};

/*
 * Per-node worker thread pool
 */
struct io_wqe {
        raw_spinlock_t lock;
        struct io_wqe_acct acct[2];

        int node;

        struct hlist_nulls_head free_list;
        struct list_head all_list;

        struct wait_queue_entry wait;

        struct io_wq *wq;
        struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];

        cpumask_var_t cpu_mask;
};

/*
 * Per io_wq state
 */
struct io_wq {
        /* IO_WQ_BIT_* bits */
        unsigned long state;

        /* callbacks invoked from io_worker_handle_work() for each item */
        free_work_fn *free_work;
        io_wq_work_fn *do_work;

        /* hash map and wait queue used to serialize hashed work */
        struct io_wq_hash *hash;

        /* reference count; worker_done is completed when it drops to zero */
        atomic_t worker_refs;
        struct completion worker_done;

        struct hlist_node cpuhp_node;

        /* task that worker-creation task_work is queued to */
        struct task_struct *task;

        /* flexible array of pools; presumably one per NUMA node - TODO confirm */
        struct io_wqe *wqes[];
};

/* NOTE(review): presumably the CPU hotplug state id registered for io-wq; it is set outside this part of the file - confirm */
static enum cpuhp_state io_wq_online;

/*
 * Argument bundle for cancellation, passed e.g. to
 * io_acct_cancel_pending_work().
 * NOTE(review): the field notes below are inferred from the names; the
 * users are outside this part of the file - confirm.
 */
struct io_cb_cancel_data {
        work_cancel_fn *fn;        /* presumably the match callback */
        void *data;                /* presumably the opaque argument for fn */
        int nr_running;                /* presumably count of running matches */
        int nr_pending;                /* presumably count of pending matches */
        bool cancel_all;        /* presumably: keep going after first match */
};

/* Forward declarations of static functions that are used before their definition */
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
                                        struct io_wqe_acct *acct,
                                        struct io_cb_cancel_data *match);
static void create_worker_cb(struct callback_head *cb);
static void io_wq_cancel_tw_create(struct io_wq *wq);

/*
 * Try to take a reference on @worker.  Returns false if the reference
 * count was already zero, in which case no reference was taken.
 */
static bool io_worker_get(struct io_worker *worker)
{
        return refcount_inc_not_zero(&worker->ref);
}

/*
 * Drop one reference on @worker.  Whoever drops the last one signals
 * worker->ref_done, which io_worker_exit() waits on.
 */
static void io_worker_release(struct io_worker *worker)
{
        if (!refcount_dec_and_test(&worker->ref))
                return;

        complete(&worker->ref_done);
}

/*
 * Return the accounting entry of @wqe for bound (@bound == true) or
 * unbound (@bound == false) work.
 */
static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound)
{
        if (bound)
                return &wqe->acct[IO_WQ_ACCT_BOUND];

        return &wqe->acct[IO_WQ_ACCT_UNBOUND];
}

/*
 * Return the accounting entry @work belongs to: the unbound one if
 * IO_WQ_WORK_UNBOUND is set in work->flags, the bound one otherwise.
 */
static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
                                                   struct io_wq_work *work)
{
        return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND));
}

/*
 * Return the accounting entry @worker is charged to, selected by its
 * IO_WORKER_F_BOUND flag (the masked flag value is converted to bool).
 */
static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker)
{
        return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND);
}

/*
 * Drop one wq->worker_refs reference; the final put completes
 * wq->worker_done.
 */
static void io_worker_ref_put(struct io_wq *wq)
{
        if (!atomic_dec_and_test(&wq->worker_refs))
                return;

        complete(&wq->worker_done);
}

bool io_wq_worker_stopped(void)
{
        struct io_worker *worker = current->pf_io_worker;

        if (WARN_ON_ONCE(!io_wq_current_is_worker()))
                return true;

        return test_bit(IO_WQ_BIT_EXIT, &worker->wqe->wq->state);
}

/*
 * Undo the bookkeeping of a worker-creation task_work that has been
 * cancelled before it ran: give back the running and worker counts and
 * the wq reference, release the create_state bit lock and drop the
 * worker reference.
 */
static void io_worker_cancel_cb(struct io_worker *worker)
{
        struct io_wqe *wqe = worker->wqe;
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);

        atomic_dec(&acct->nr_running);

        raw_spin_lock(&wqe->lock);
        acct->nr_workers--;
        raw_spin_unlock(&wqe->lock);

        io_worker_ref_put(wqe->wq);
        clear_bit_unlock(0, &worker->create_state);
        io_worker_release(worker);
}

/*
 * task_work match callback: true if @cb is a create_worker_cb request
 * that is embedded in the io_worker passed as @data.
 */
static bool io_task_worker_match(struct callback_head *cb, void *data)
{
        if (cb->func == create_worker_cb)
                return container_of(cb, struct io_worker, create_work) == data;

        return false;
}

/*
 * Tear down the calling worker thread.  Does not return: ends in do_exit().
 */
static void io_worker_exit(struct io_worker *worker)
{
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;

        /* Cancel every queued creation request that is embedded in us */
        while (task_work_cancel_match(wq->task, io_task_worker_match, worker))
                io_worker_cancel_cb(worker);

        /* Drop a reference, then wait until the count has reached zero */
        io_worker_release(worker);
        wait_for_completion(&worker->ref_done);

        raw_spin_lock(&wqe->lock);
        if (worker->flags & IO_WORKER_F_FREE)
                hlist_nulls_del_rcu(&worker->nulls_node);
        list_del_rcu(&worker->all_list);
        preempt_disable();
        io_wqe_dec_running(worker);
        worker->flags = 0;
        current->flags &= ~PF_IO_WORKER;
        preempt_enable();
        raw_spin_unlock(&wqe->lock);

        /* The worker was unlinked with RCU list primitives; free it after a grace period */
        kfree_rcu(worker, rcu);
        io_worker_ref_put(wq);
        do_exit(0);
}

/*
 * True if @acct has pending work and is not marked as stalled on a hash.
 */
static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
        return !wq_list_empty(&acct->work_list) &&
               !test_bit(IO_ACCT_STALLED_BIT, &acct->flags);
}

/*
 * Try to wake an idle worker of class @acct from the free list.  Returns
 * true if one was woken; on false the caller has to create a worker.
 */
static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
                                        struct io_wqe_acct *acct)
        __must_hold(RCU)
{
        struct hlist_nulls_node *pos;
        struct io_worker *candidate;

        /*
         * Walk the free_list looking for a worker to wake.  Entries whose
         * reference count already hit zero, that belong to the other
         * class, or that could not be woken are skipped.
         */
        hlist_nulls_for_each_entry_rcu(candidate, pos, &wqe->free_list, nulls_node) {
                bool woken;

                if (!io_worker_get(candidate))
                        continue;

                woken = io_wqe_get_acct(candidate) == acct &&
                        wake_up_process(candidate->task);
                io_worker_release(candidate);

                if (woken)
                        return true;
        }

        return false;
}

/*
 * We need a worker. If we're below the max number of workers for @acct,
 * account for a new one and create it.
 *
 * Returns true without creating anything when the limit has been reached,
 * otherwise the result of create_io_worker().
 */
static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
        /*
         * Most likely an attempt to queue unbounded work on an io_wq that
         * wasn't setup with any unbounded workers.
         */
        if (unlikely(!acct->max_workers))
                pr_warn_once("io-wq is not configured for unbound workers\n");

        raw_spin_lock(&wqe->lock);
        if (acct->nr_workers >= acct->max_workers) {
                raw_spin_unlock(&wqe->lock);
                return true;
        }
        acct->nr_workers++;
        raw_spin_unlock(&wqe->lock);
        /* The new worker is accounted as running and holds a wq reference */
        atomic_inc(&acct->nr_running);
        atomic_inc(&wqe->wq->worker_refs);
        return create_io_worker(wqe->wq, wqe, acct->index);
}

static void io_wqe_inc_running(struct io_worker *worker)
{
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);

        atomic_inc(&acct->nr_running);
}

/*
 * task_work callback queued by io_queue_worker_create().  Creates a new
 * worker for the class recorded in create_index if that class is still
 * below its limit; otherwise gives back the running count and the wq
 * reference.  Either way the create_state bit lock and the worker
 * reference are released at the end.
 */
static void create_worker_cb(struct callback_head *cb)
{
        struct io_worker *worker = container_of(cb, struct io_worker, create_work);
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;
        struct io_wqe_acct *acct = &wqe->acct[worker->create_index];
        bool below_limit;

        raw_spin_lock(&wqe->lock);
        below_limit = acct->nr_workers < acct->max_workers;
        if (below_limit)
                acct->nr_workers++;
        raw_spin_unlock(&wqe->lock);

        if (!below_limit) {
                atomic_dec(&acct->nr_running);
                io_worker_ref_put(wq);
        } else {
                create_io_worker(wq, wqe, worker->create_index);
        }

        clear_bit_unlock(0, &worker->create_state);
        io_worker_release(worker);
}

/*
 * Queue @func as task_work on wq->task to have a new worker created for
 * @acct, using the create_work member embedded in @worker.
 *
 * Returns true if the task_work was queued.  On any failure it returns
 * false after dropping one acct->nr_running count and one wq->worker_refs
 * reference; io_wqe_dec_running() takes both before calling here.
 */
static bool io_queue_worker_create(struct io_worker *worker,
                                   struct io_wqe_acct *acct,
                                   task_work_func_t func)
{
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;

        /* raced with exit, just ignore create call */
        if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
                goto fail;
        /* the reference is held until the task_work ran or was cancelled */
        if (!io_worker_get(worker))
                goto fail;
        /*
         * create_state manages ownership of create_work/index. We should
         * only need one entry per worker, as the worker going to sleep
         * will trigger the condition, and waking will clear it once it
         * runs the task_work.
         */
        if (test_bit(0, &worker->create_state) ||
            test_and_set_bit_lock(0, &worker->create_state))
                goto fail_release;

        /* extra wq reference held across task_work_add(), dropped below */
        atomic_inc(&wq->worker_refs);
        init_task_work(&worker->create_work, func);
        worker->create_index = acct->index;
        if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
                /*
                 * EXIT may have been set after checking it above, check after
                 * adding the task_work and remove any creation item if it is
                 * now set. wq exit does that too, but we can have added this
                 * work item after we canceled in io_wq_exit_workers().
                 */
                if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
                        io_wq_cancel_tw_create(wq);
                io_worker_ref_put(wq);
                return true;
        }
        /* task_work_add() failed: unwind in reverse order */
        io_worker_ref_put(wq);
        clear_bit_unlock(0, &worker->create_state);
fail_release:
        io_worker_release(worker);
fail:
        atomic_dec(&acct->nr_running);
        io_worker_ref_put(wq);
        return false;
}

/*
 * Account @worker as no longer running.  If it was the last running
 * worker of its class and that class still has runnable work, request
 * creation of another worker.  wqe->lock is dropped around the request
 * and re-taken before returning.
 */
static void io_wqe_dec_running(struct io_worker *worker)
        __must_hold(wqe->lock)
{
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
        struct io_wqe *wqe = worker->wqe;

        if (!(worker->flags & IO_WORKER_F_UP))
                return;

        if (!atomic_dec_and_test(&acct->nr_running))
                return;

        if (!io_acct_run_queue(acct))
                return;

        /* Counts taken on behalf of the worker that is to be created */
        atomic_inc(&acct->nr_running);
        atomic_inc(&wqe->wq->worker_refs);

        raw_spin_unlock(&wqe->lock);
        io_queue_worker_create(worker, acct, create_worker_cb);
        raw_spin_lock(&wqe->lock);
}

/*
 * Worker is about to process work: take it off the free list if it is
 * currently on it.  @work is not used here.
 */
static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
                             struct io_wq_work *work)
        __must_hold(wqe->lock)
{
        if (!(worker->flags & IO_WORKER_F_FREE))
                return;

        worker->flags &= ~IO_WORKER_F_FREE;
        hlist_nulls_del_init_rcu(&worker->nulls_node);
}

/*
 * No work, worker going to sleep: put it at the head of the free list
 * unless it is already on it.
 */
static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
        __must_hold(wqe->lock)
{
        if (worker->flags & IO_WORKER_F_FREE)
                return;

        worker->flags |= IO_WORKER_F_FREE;
        hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
}

/*
 * Extract the hash value of @work, which is stored in the bits of
 * work->flags from IO_WQ_HASH_SHIFT upwards.
 */
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
        return work->flags >> IO_WQ_HASH_SHIFT;
}

/*
 * Queue @wqe on the wait queue of the shared hash map, unless it is
 * already queued.
 *
 * Returns true if @wqe was just queued but the bit for @hash turned out
 * to be clear already; in that case @wqe is removed from the wait queue
 * again and the task state is set to TASK_RUNNING.  Returns false
 * otherwise.
 */
static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
{
        struct io_wq *wq = wqe->wq;
        bool ret = false;

        spin_lock_irq(&wq->hash->wait.lock);
        if (list_empty(&wqe->wait.entry)) {
                __add_wait_queue(&wq->hash->wait, &wqe->wait);
                /* re-check the bit after queueing, under the waitqueue lock */
                if (!test_bit(hash, &wq->hash->map)) {
                        __set_current_state(TASK_RUNNING);
                        list_del_init(&wqe->wait.entry);
                        ret = true;
                }
        }
        spin_unlock_irq(&wq->hash->wait.lock);
        return ret;
}

/*
 * Pick the next runnable work item from @acct's work list.
 *
 * A non-hashed item is removed from the list and returned on its own.
 * For a hashed item whose hash bit could be claimed, the whole run of
 * items with that hash is cut out of the list and its first item
 * returned.  If nothing can run but at least one hash was busy, the acct
 * is marked stalled and the wqe is queued on the hash wait queue;
 * wqe->lock is dropped and re-taken while doing so.
 *
 * Returns the work item, or NULL if nothing can be run right now.
 */
static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
                                           struct io_worker *worker)
        __must_hold(wqe->lock)
{
        struct io_wq_work_node *node, *prev;
        struct io_wq_work *work, *tail;
        /* first hash found busy, -1U while there is none */
        unsigned int stall_hash = -1U;
        struct io_wqe *wqe = worker->wqe;

        wq_list_for_each(node, prev, &acct->work_list) {
                unsigned int hash;

                work = container_of(node, struct io_wq_work, list);

                /* not hashed, can run anytime */
                if (!io_wq_is_hashed(work)) {
                        wq_list_del(&acct->work_list, node, prev);
                        return work;
                }

                hash = io_get_work_hash(work);
                /* all items with this hash lie in [work, tail] */
                tail = wqe->hash_tail[hash];

                /* hashed, can run if not already running */
                if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
                        wqe->hash_tail[hash] = NULL;
                        wq_list_cut(&acct->work_list, &tail->list, prev);
                        return work;
                }
                if (stall_hash == -1U)
                        stall_hash = hash;
                /* fast forward to a next hash, for-each will fix up @prev */
                node = &tail->list;
        }

        if (stall_hash != -1U) {
                bool unstalled;

                /*
                 * Set this before dropping the lock to avoid racing with new
                 * work being added and clearing the stalled bit.
                 */
                set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
                raw_spin_unlock(&wqe->lock);
                unstalled = io_wait_on_hash(wqe, stall_hash);
                raw_spin_lock(&wqe->lock);
                /* the hash was released in the meantime: un-stall, wake waiters */
                if (unstalled) {
                        clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
                        if (wq_has_sleeper(&wqe->wq->hash->wait))
                                wake_up(&wqe->wq->hash->wait);
                }
        }

        return NULL;
}

/*
 * If TIF_NOTIFY_SIGNAL is set or the current task has task_work pending,
 * set the task state to TASK_RUNNING, call tracehook_notify_signal() and
 * return true.  Otherwise return false without side effects.
 */
static bool io_flush_signals(void)
{
        if (!test_thread_flag(TIF_NOTIFY_SIGNAL) && !current->task_works)
                return false;

        __set_current_state(TASK_RUNNING);
        tracehook_notify_signal();
        return true;
}

/*
 * Record @work (may be NULL) as the item @worker is currently processing.
 * When a new item is assigned, pending signal notifications/task_work are
 * flushed and the CPU may be yielded first.
 */
static void io_assign_current_work(struct io_worker *worker,
                                   struct io_wq_work *work)
{
        if (work) {
                io_flush_signals();
                cond_resched();
        }

        /* cur_work is only written with worker->lock held */
        spin_lock(&worker->lock);
        worker->cur_work = work;
        spin_unlock(&worker->lock);
}

static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);

/*
 * Process work of @worker's accounting class until io_get_next_work()
 * returns nothing.  Called with wqe->lock held, returns with it released.
 */
static void io_worker_handle_work(struct io_worker *worker)
        __releases(wqe->lock)
{
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;
        /* sampled once on entry: is the wq exiting? */
        bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);

        do {
                struct io_wq_work *work;
get_next:
                /*
                 * If we got some work, mark us as busy. If we didn't, but
                 * the list isn't empty, it means we stalled on hashed work.
                 * Mark us stalled so we don't keep looking for work when we
                 * can't make progress, any work completion or insertion will
                 * clear the stalled flag.
                 */
                work = io_get_next_work(acct, worker);
                if (work)
                        __io_worker_busy(wqe, worker, work);

                raw_spin_unlock(&wqe->lock);
                if (!work)
                        break;
                io_assign_current_work(worker, work);
                __set_current_state(TASK_RUNNING);

                /* handle a whole dependent link */
                do {
                        struct io_wq_work *next_hashed, *linked;
                        /*
                         * NOTE(review): hash is compared against -1U below,
                         * but nothing visible here sets it to -1U for
                         * non-hashed work - confirm against
                         * io_wq_is_hashed()/IO_WQ_HASH_SHIFT.
                         */
                        unsigned int hash = io_get_work_hash(work);

                        /* must be fetched before free_work() is called on @work */
                        next_hashed = wq_next_work(work);

                        /* wq is exiting: have unbound work cancelled */
                        if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
                                work->flags |= IO_WQ_WORK_CANCEL;
                        wq->do_work(work);
                        io_assign_current_work(worker, NULL);

                        linked = wq->free_work(work);
                        work = next_hashed;
                        /* run a non-hashed linked item directly, queue others */
                        if (!work && linked && !io_wq_is_hashed(linked)) {
                                work = linked;
                                linked = NULL;
                        }
                        io_assign_current_work(worker, work);
                        if (linked)
                                io_wqe_enqueue(wqe, linked);

                        if (hash != -1U && !next_hashed) {
                                /* serialize hash clear with wake_up() */
                                spin_lock_irq(&wq->hash->wait.lock);
                                clear_bit(hash, &wq->hash->map);
                                clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
                                spin_unlock_irq(&wq->hash->wait.lock);
                                if (wq_has_sleeper(&wq->hash->wait))
                                        wake_up(&wq->hash->wait);
                                raw_spin_lock(&wqe->lock);
                                /* skip unnecessary unlock-lock wqe->lock */
                                if (!work)
                                        goto get_next;
                                raw_spin_unlock(&wqe->lock);
                        }
                } while (work);

                raw_spin_lock(&wqe->lock);
        } while (1);
}

/*
 * Thread function of an io-wq worker; @data is the struct io_worker that
 * was handed to create_io_thread(). Runs queued work until the owning
 * io_wq has IO_WQ_BIT_EXIT set, the worker idles out, or a signal is
 * delivered through get_signal(). Always returns 0.
 */
static int io_wqe_worker(void *data)
{
        struct io_worker *worker = data;
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);
        struct io_wqe *wqe = worker->wqe;
        struct io_wq *wq = wqe->wq;
        bool last_timeout = false;
        char buf[TASK_COMM_LEN];

        worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);

        /* name the thread after the pid of the task that owns the io_wq */
        snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
        set_task_comm(current, buf);

        while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
                long ret;

                set_current_state(TASK_INTERRUPTIBLE);
loop:
                raw_spin_lock(&wqe->lock);
                if (io_acct_run_queue(acct)) {
                        /*
                         * Entered with wqe->lock held. The lock is re-taken
                         * at 'loop', so the callee presumably returns with
                         * it dropped -- TODO confirm against
                         * io_worker_handle_work().
                         */
                        io_worker_handle_work(worker);
                        goto loop;
                }
                /* timed out, exit unless we're the last worker */
                if (last_timeout && acct->nr_workers > 1) {
                        acct->nr_workers--;
                        raw_spin_unlock(&wqe->lock);
                        __set_current_state(TASK_RUNNING);
                        break;
                }
                last_timeout = false;
                __io_worker_idle(wqe, worker);
                raw_spin_unlock(&wqe->lock);
                if (io_flush_signals())
                        continue;
                ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
                if (signal_pending(current)) {
                        struct ksignal ksig;

                        /* a zero return from get_signal() keeps us looping */
                        if (!get_signal(&ksig))
                                continue;
                        break;
                }
                /* ret == 0: the whole idle timeout elapsed without a wakeup */
                last_timeout = !ret;
        }

        /* on exit, make one more pass over the queue with the lock held */
        if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
                raw_spin_lock(&wqe->lock);
                io_worker_handle_work(worker);
        }

        io_worker_exit(worker);
        return 0;
}

/*
 * Called when a worker is scheduled in. If @tsk is an io-wq worker that is
 * up but not currently flagged as running, flag it and bump the running
 * count; otherwise do nothing.
 */
void io_wq_worker_running(struct task_struct *tsk)
{
        struct io_worker *worker = tsk->pf_io_worker;

        /* not a worker, not fully up yet, or already marked running */
        if (!worker || !(worker->flags & IO_WORKER_F_UP) ||
            (worker->flags & IO_WORKER_F_RUNNING))
                return;

        worker->flags |= IO_WORKER_F_RUNNING;
        io_wqe_inc_running(worker);
}

/*
 * Called when worker is going to sleep. If there are no workers currently
 * running and we have work pending, wake up a free one or create a new one.
 */
void io_wq_worker_sleeping(struct task_struct *tsk)
{
        struct io_worker *worker = tsk->pf_io_worker;

        if (!worker)
                return;
        if (!(worker->flags & IO_WORKER_F_UP))
                return;
        if (!(worker->flags & IO_WORKER_F_RUNNING))
                return;

        worker->flags &= ~IO_WORKER_F_RUNNING;

        raw_spin_lock(&worker->wqe->lock);
        io_wqe_dec_running(worker);
        raw_spin_unlock(&worker->wqe->lock);
}

/*
 * Finish setting up a freshly created worker thread @tsk for @worker on
 * @wqe: link task and worker to each other, apply the wqe's CPU mask,
 * publish the worker on the wqe's free and all lists, then start the task.
 */
static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
                               struct task_struct *tsk)
{
        tsk->pf_io_worker = worker;
        worker->task = tsk;
        set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
        tsk->flags |= PF_NO_SETAFFINITY;

        /* list insertion and the FREE flag are done under wqe->lock */
        raw_spin_lock(&wqe->lock);
        hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
        list_add_tail_rcu(&worker->all_list, &wqe->all_list);
        worker->flags |= IO_WORKER_F_FREE;
        raw_spin_unlock(&wqe->lock);
        /* the task only starts running once the worker has been published */
        wake_up_new_task(tsk);
}

/*
 * Match callback that accepts every work item; both arguments are ignored.
 * Used in this file as the .fn of a cancel-all io_cb_cancel_data.
 */
static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
{
        return true;
}

/*
 * Decide whether a failed worker-thread creation with error @err should be
 * retried. Returns true only for -EAGAIN and the -ERESTART* codes listed
 * below, and never when the current task has a fatal signal pending.
 */
static inline bool io_should_retry_thread(long err)
{
        /*
         * Prevent perpetual task_work retry, if the task (or its group) is
         * exiting.
         */
        if (fatal_signal_pending(current))
                return false;

        return err == -EAGAIN ||
               err == -ERESTARTSYS ||
               err == -ERESTARTNOINTR ||
               err == -ERESTARTNOHAND;
}

/*
 * Callback that retries thread creation for an already allocated worker
 * (the io_worker is recovered from its embedded create_work member).
 *
 * On success the worker is initialised and the reference is released. On a
 * non-retryable error the accounting is decremented, pending work is
 * cancelled if this was the last worker of the class, and the worker is
 * freed. On a retryable error the attempt is pushed to a workqueue.
 */
static void create_worker_cont(struct callback_head *cb)
{
        struct io_worker *worker;
        struct task_struct *tsk;
        struct io_wqe *wqe;

        worker = container_of(cb, struct io_worker, create_work);
        /* presumably lets create_work be queued again -- TODO confirm */
        clear_bit_unlock(0, &worker->create_state);
        wqe = worker->wqe;
        tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
        if (!IS_ERR(tsk)) {
                io_init_new_worker(wqe, worker, tsk);
                io_worker_release(worker);
                return;
        } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
                struct io_wqe_acct *acct = io_wqe_get_acct(worker);

                atomic_dec(&acct->nr_running);
                raw_spin_lock(&wqe->lock);
                acct->nr_workers--;
                if (!acct->nr_workers) {
                        struct io_cb_cancel_data match = {
                                .fn                = io_wq_work_match_all,
                                .cancel_all        = true,
                        };

                        /*
                         * io_acct_cancel_pending_work() drops wqe->lock when
                         * it returns true, so re-take it before the next
                         * pass; on false the lock is still held.
                         */
                        while (io_acct_cancel_pending_work(wqe, acct, &match))
                                raw_spin_lock(&wqe->lock);
                }
                raw_spin_unlock(&wqe->lock);
                io_worker_ref_put(wqe->wq);
                kfree(worker);
                return;
        }

        /* re-create attempts grab a new worker ref, drop the existing one */
        io_worker_release(worker);
        schedule_work(&worker->work);
}

/*
 * Workqueue handler for deferred worker creation: queue create_worker_cont
 * for the worker embedding @work, and free the worker if queueing fails.
 */
static void io_workqueue_create(struct work_struct *work)
{
        struct io_worker *worker = container_of(work, struct io_worker, work);
        struct io_wqe_acct *acct = io_wqe_get_acct(worker);

        if (io_queue_worker_create(worker, acct, create_worker_cont))
                return;

        /* could not queue the creation, nobody else will free the worker */
        kfree(worker);
}

/*
 * Allocate a worker for accounting class @index of @wqe and try to start
 * its thread.
 *
 * Returns true if the thread was started or a retry was scheduled on a
 * workqueue. Returns false on allocation failure or a non-retryable thread
 * creation error; in that case nr_running and nr_workers are decremented
 * and a worker ref on @wq is put.
 */
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
        struct io_wqe_acct *acct = &wqe->acct[index];
        struct io_worker *worker;
        struct task_struct *tsk;

        __set_current_state(TASK_RUNNING);

        worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
        if (!worker) {
                /* also jumped to from below after the worker has been freed */
fail:
                atomic_dec(&acct->nr_running);
                raw_spin_lock(&wqe->lock);
                acct->nr_workers--;
                raw_spin_unlock(&wqe->lock);
                io_worker_ref_put(wq);
                return false;
        }

        refcount_set(&worker->ref, 1);
        worker->wqe = wqe;
        spin_lock_init(&worker->lock);
        init_completion(&worker->ref_done);

        if (index == IO_WQ_ACCT_BOUND)
                worker->flags |= IO_WORKER_F_BOUND;

        tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
        if (!IS_ERR(tsk)) {
                io_init_new_worker(wqe, worker, tsk);
        } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
                kfree(worker);
                goto fail;
        } else {
                /* retryable error: try again later from a workqueue */
                INIT_WORK(&worker->work, io_workqueue_create);
                schedule_work(&worker->work);
        }

        return true;
}

/*
 * Walk every worker on @wqe's all_list and invoke @func(worker, @data) for
 * each one a reference could be taken on (i.e. one that isn't exiting).
 * The walk stops as soon as @func returns true; that value is returned.
 */
static bool io_wq_for_each_worker(struct io_wqe *wqe,
                                  bool (*func)(struct io_worker *, void *),
                                  void *data)
{
        struct io_worker *cur;
        bool stop = false;

        list_for_each_entry_rcu(cur, &wqe->all_list, all_list) {
                if (!io_worker_get(cur))
                        continue;

                /* no task if node is/was offline */
                if (cur->task)
                        stop = func(cur, data);
                io_worker_release(cur);
                if (stop)
                        break;
        }

        return stop;
}

/*
 * io_wq_for_each_worker() callback: signal-notify and wake the worker's
 * task. @data is unused. Returns false so the iteration continues.
 */
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
        struct task_struct *tsk = worker->task;

        set_notify_signal(tsk);
        wake_up_process(tsk);
        return false;
}

/*
 * Run @work and everything free_work() hands back after it with
 * IO_WQ_WORK_CANCEL set, so each item goes through do_work() as cancelled.
 */
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
        struct io_wq *owner = wqe->wq;

        for (;;) {
                work->flags |= IO_WQ_WORK_CANCEL;
                owner->do_work(work);

                /* free_work() returns the next item to process, if any */
                work = owner->free_work(work);
                if (!work)
                        break;
        }
}

/*
 * Add @work to the work list of its accounting class. Hashed work is
 * inserted right after the current tail for its hash (and becomes the new
 * tail); unhashed work, or the first item of a hash, goes to the list end.
 */
static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
{
        struct io_wqe_acct *acct = io_work_get_acct(wqe, work);

        if (io_wq_is_hashed(work)) {
                unsigned int bucket = io_get_work_hash(work);
                struct io_wq_work *old_tail = wqe->hash_tail[bucket];

                wqe->hash_tail[bucket] = work;
                if (old_tail) {
                        wq_list_add_after(&work->list, &old_tail->list,
                                          &acct->work_list);
                        return;
                }
        }

        wq_list_add_tail(&work->list, &acct->work_list);
}

/*
 * Match callback that selects exactly one work item: true when @work is the
 * very pointer passed as @data.
 */
static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
{
        return data == work;
}

/*
 * Queue @work on @wqe, or run it as cancelled if the io_wq is exiting or
 * the work is already flagged IO_WQ_WORK_CANCEL. After queueing, a free
 * worker is activated if there is one; otherwise a new worker may be
 * created. If that creation fails and the class has no workers at all,
 * @work is pulled back off the queue and cancelled.
 */
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
        struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
        /* snapshot taken before queueing; used for the check after unlock */
        unsigned work_flags = work->flags;
        bool do_create;

        /*
         * If io-wq is exiting for this task, or if the request has explicitly
         * been marked as one that should not get executed, cancel it here.
         */
        if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) ||
            (work->flags & IO_WQ_WORK_CANCEL)) {
                io_run_cancel(work, wqe);
                return;
        }

        raw_spin_lock(&wqe->lock);
        io_wqe_insert_work(wqe, work);
        clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);

        rcu_read_lock();
        do_create = !io_wqe_activate_free_worker(wqe, acct);
        rcu_read_unlock();

        raw_spin_unlock(&wqe->lock);

        /*
         * No free worker was activated: create one if the work is flagged
         * concurrent or nothing in this class is currently running.
         */
        if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
            !atomic_read(&acct->nr_running))) {
                bool did_create;

                did_create = io_wqe_create_worker(wqe, acct);
                if (likely(did_create))
                        return;

                raw_spin_lock(&wqe->lock);
                /* fatal condition, failed to create the first worker */
                if (!acct->nr_workers) {
                        struct io_cb_cancel_data match = {
                                .fn                = io_wq_work_match_item,
                                .data                = work,
                                .cancel_all        = false,
                        };

                        /*
                         * A true return means wqe->lock was dropped by the
                         * callee; re-take it so the unlock below balances.
                         */
                        if (io_acct_cancel_pending_work(wqe, acct, &match))
                                raw_spin_lock(&wqe->lock);
                }
                raw_spin_unlock(&wqe->lock);
        }
}

/*
 * Queue @work on @wq, using the per-node queue of the NUMA node the caller
 * is currently running on.
 */
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{
        io_wqe_enqueue(wq->wqes[numa_node_id()], work);
}

/*
 * Work items that hash to the same value will not be done in parallel.
 * Used to limit concurrent writes, generally hashed by inode.
 *
 * Marks @work as hashed and stores the hash of @val (IO_WQ_HASH_ORDER bits)
 * in the flag bits starting at IO_WQ_HASH_SHIFT.
 */
void io_wq_hash_work(struct io_wq_work *work, void *val)
{
        unsigned int bucket = hash_ptr(val, IO_WQ_HASH_ORDER);
        unsigned int hashed = IO_WQ_WORK_HASHED | (bucket << IO_WQ_HASH_SHIFT);

        work->flags |= hashed;
}

/*
 * io_wq_for_each_worker() callback; @data is a struct io_cb_cancel_data.
 * If the worker's current work matches, signal-notify its task and count it
 * in nr_running. Returns true (stop iterating) once something has matched,
 * unless cancel_all was requested.
 */
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
        struct io_cb_cancel_data *cd = data;

        /*
         * Hold the lock to avoid ->cur_work going out of scope, caller
         * may dereference the passed in work.
         */
        spin_lock(&worker->lock);
        if (worker->cur_work && cd->fn(worker->cur_work, cd->data)) {
                set_notify_signal(worker->task);
                cd->nr_running++;
        }
        spin_unlock(&worker->lock);

        if (!cd->nr_running)
                return false;
        return !cd->cancel_all;
}

/*
 * Unlink pending @work from its accounting class's list; @prev is the list
 * node preceding it (may be NULL). If @work was the recorded tail for its
 * hash, the tail moves back to @prev's work when that has the same hash,
 * and is cleared otherwise.
 */
static inline void io_wqe_remove_pending(struct io_wqe *wqe,
                                         struct io_wq_work *work,
                                         struct io_wq_work_node *prev)
{
        struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
        unsigned int bucket = io_get_work_hash(work);

        if (io_wq_is_hashed(work) && work == wqe->hash_tail[bucket]) {
                struct io_wq_work *new_tail = NULL;

                if (prev) {
                        struct io_wq_work *before;

                        before = container_of(prev, struct io_wq_work, list);
                        if (before && io_get_work_hash(before) == bucket)
                                new_tail = before;
                }
                wqe->hash_tail[bucket] = new_tail;
        }

        wq_list_del(&acct->work_list, &work->list, prev);
}

/*
 * Cancel the first pending item on @acct's work list accepted by
 * @match->fn. Must be entered with wqe->lock held.
 *
 * Returns true if an item was removed and cancelled; wqe->lock has then
 * been DROPPED and nr_pending incremented. Returns false, with the lock
 * still held, if nothing matched.
 */
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
                                        struct io_wqe_acct *acct,
                                        struct io_cb_cancel_data *match)
        __releases(wqe->lock)
{
        struct io_wq_work_node *node, *prev;

        wq_list_for_each(node, prev, &acct->work_list) {
                struct io_wq_work *item;

                item = container_of(node, struct io_wq_work, list);
                if (match->fn(item, match->data)) {
                        io_wqe_remove_pending(wqe, item, prev);
                        raw_spin_unlock(&wqe->lock);
                        io_run_cancel(item, wqe);
                        match->nr_pending++;
                        /* not safe to continue after unlock */
                        return true;
                }
        }

        return false;
}

/*
 * Cancel pending work on @wqe that @match accepts, across all accounting
 * classes. Without cancel_all, stops after the first cancelled item; with
 * cancel_all, rescans from the start until nothing matches any more.
 */
static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
                                       struct io_cb_cancel_data *match)
{
        int i;
retry:
        raw_spin_lock(&wqe->lock);
        for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);

                /*
                 * A true return means wqe->lock was already dropped by the
                 * callee, so neither exit below unlocks again.
                 */
                if (io_acct_cancel_pending_work(wqe, acct, match)) {
                        if (match->cancel_all)
                                goto retry;
                        return;
                }
        }
        raw_spin_unlock(&wqe->lock);
}

/*
 * Run io_wq_worker_cancel() with @match over the workers of @wqe, inside an
 * RCU read-side section, to flag work that is currently being executed.
 */
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
                                       struct io_cb_cancel_data *match)
{
        rcu_read_lock();
        /* return value is irrelevant here; results are left in @match */
        (void)io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
        rcu_read_unlock();
}

/*
 * Cancel work on @wq for which @cancel(work, @data) returns true.
 *
 * With @cancel_all false, returns as soon as one item has been dealt with;
 * with @cancel_all true, every node is scanned for pending and running
 * work. Returns IO_WQ_CANCEL_RUNNING if any running work was signalled,
 * else IO_WQ_CANCEL_OK if pending work was removed, else
 * IO_WQ_CANCEL_NOTFOUND.
 */
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
                                  void *data, bool cancel_all)
{
        struct io_cb_cancel_data cd = {
                .fn                = cancel,
                .data                = data,
                .cancel_all        = cancel_all,
        };
        int nid;

        /*
         * First check pending list, if we're lucky we can just remove it
         * from there. CANCEL_OK means that the work is returned as-new,
         * no completion will be posted for it.
         */
        for_each_node(nid) {
                io_wqe_cancel_pending_work(wq->wqes[nid], &cd);
                if (!cd.cancel_all && cd.nr_pending)
                        return IO_WQ_CANCEL_OK;
        }

        /*
         * Now check if a free (going busy) or busy worker has the work
         * currently running. If we find it there, we'll return CANCEL_RUNNING
         * as an indication that we attempt to signal cancellation. The
         * completion will run normally in this case.
         */
        for_each_node(nid) {
                io_wqe_cancel_running_work(wq->wqes[nid], &cd);
                if (!cd.cancel_all && cd.nr_running)
                        return IO_WQ_CANCEL_RUNNING;
        }

        /* cancel_all (or nothing found): summarise what was seen */
        if (cd.nr_running)
                return IO_WQ_CANCEL_RUNNING;
        if (cd.nr_pending)
                return IO_WQ_CANCEL_OK;
        return IO_WQ_CANCEL_NOTFOUND;
}

/*
 * Wait-queue callback installed as wqe->wait.func. Takes the wqe's entry
 * off the wait list and, for every accounting class that was marked
 * stalled, clears the mark and tries to activate a free worker.
 * @mode, @sync and @key are unused. Always returns 1.
 */
static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
                            int sync, void *key)
{
        struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
        int idx;

        list_del_init(&wait->entry);

        rcu_read_lock();
        for (idx = 0; idx < IO_WQ_ACCT_NR; idx++) {
                struct io_wqe_acct *acct = &wqe->acct[idx];

                if (!test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
                        continue;
                io_wqe_activate_free_worker(wqe, acct);
        }
        rcu_read_unlock();

        return 1;
}

/*
 * Allocate and initialise an io_wq with one io_wqe per possible NUMA node.
 *
 * @bounded: max_workers for the bound class of every node; must be non-zero.
 * @data:    supplies the free_work/do_work callbacks (both required), the
 *           shared hash (a reference is taken) and the owning task (a
 *           reference is taken).
 *
 * Returns the new io_wq, or an ERR_PTR: -EINVAL for invalid arguments,
 * -ENOMEM on allocation failure, or the error from registering the CPU
 * hotplug instance.
 */
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
        int ret, node, i;
        struct io_wq *wq;

        if (WARN_ON_ONCE(!data->free_work || !data->do_work))
                return ERR_PTR(-EINVAL);
        if (WARN_ON_ONCE(!bounded))
                return ERR_PTR(-EINVAL);

        wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
        if (!wq)
                return ERR_PTR(-ENOMEM);
        ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
        if (ret)
                goto err_wq;

        refcount_inc(&data->hash->refs);
        wq->hash = data->hash;
        wq->free_work = data->free_work;
        wq->do_work = data->do_work;

        /* every failure inside the node loop is an allocation failure */
        ret = -ENOMEM;
        for_each_node(node) {
                struct io_wqe *wqe;
                int alloc_node = node;

                /* offline nodes get their wqe allocated without a node hint */
                if (!node_online(alloc_node))
                        alloc_node = NUMA_NO_NODE;
                wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
                if (!wqe)
                        goto err;
                /* stored before the cpumask allocation so 'err' can free it */
                wq->wqes[node] = wqe;
                if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL))
                        goto err;
                cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
                wqe->node = alloc_node;
                wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
                wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
                                        task_rlimit(current, RLIMIT_NPROC);
                INIT_LIST_HEAD(&wqe->wait.entry);
                wqe->wait.func = io_wqe_hash_wake;
                for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                        struct io_wqe_acct *acct = &wqe->acct[i];

                        acct->index = i;
                        atomic_set(&acct->nr_running, 0);
                        INIT_WQ_LIST(&acct->work_list);
                }
                wqe->wq = wq;
                raw_spin_lock_init(&wqe->lock);
                INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
                INIT_LIST_HEAD(&wqe->all_list);
        }

        wq->task = get_task_struct(data->task);
        atomic_set(&wq->worker_refs, 1);
        init_completion(&wq->worker_done);
        return wq;
err:
        /* undo the hash reference and hotplug registration taken above */
        io_wq_put_hash(data->hash);
        cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
        for_each_node(node) {
                if (!wq->wqes[node])
                        continue;
                free_cpumask_var(wq->wqes[node]->cpu_mask);
                kfree(wq->wqes[node]);
        }
err_wq:
        kfree(wq);
        return ERR_PTR(ret);
}

/*
 * task_work match callback: true when @cb is one of the worker-creation
 * callbacks (create_worker_cb or create_worker_cont) and its worker belongs
 * to the io_wq passed as @data.
 */
static bool io_task_work_match(struct callback_head *cb, void *data)
{
        bool is_create = cb->func == create_worker_cb ||
                         cb->func == create_worker_cont;
        struct io_worker *worker;

        if (!is_create)
                return false;

        worker = container_of(cb, struct io_worker, create_work);
        return data == worker->wqe->wq;
}

/*
 * Flag @wq as exiting by setting IO_WQ_BIT_EXIT in its state word. Worker
 * loops and the enqueue path in this file test this bit.
 */
void io_wq_exit_start(struct io_wq *wq)
{
        set_bit(IO_WQ_BIT_EXIT, &wq->state);
}

/*
 * Cancel every worker-creation task_work still queued on @wq's task that
 * belongs to @wq, undoing each one via io_worker_cancel_cb().
 */
static void io_wq_cancel_tw_create(struct io_wq *wq)
{
        for (;;) {
                struct callback_head *cb;
                struct io_worker *worker;

                cb = task_work_cancel_match(wq->task, io_task_work_match, wq);
                if (cb == NULL)
                        break;

                worker = container_of(cb, struct io_worker, create_work);
                io_worker_cancel_cb(worker);
                /*
                 * Only the worker continuation helper has worker allocated and
                 * hence needs freeing.
                 */
                if (cb->func == create_worker_cont)
                        kfree(worker);
        }
}

/*
 * Stop all workers of @wq and wait until they are gone. Does nothing if
 * wq->task is already NULL (which this function sets on completion).
 */
static void io_wq_exit_workers(struct io_wq *wq)
{
        int node;

        if (!wq->task)
                return;

        /* drop worker creations that are still queued as task_work */
        io_wq_cancel_tw_create(wq);

        /* signal and wake every worker on every node */
        rcu_read_lock();
        for_each_node(node) {
                struct io_wqe *wqe = wq->wqes[node];

                io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
        }
        rcu_read_unlock();
        /*
         * Put a worker ref (worker_refs is initialised to 1 in
         * io_wq_create()), then wait for worker_done.
         */
        io_worker_ref_put(wq);
        wait_for_completion(&wq->worker_done);

        /* take each wqe's wait entry off the shared hash wait queue */
        for_each_node(node) {
                spin_lock_irq(&wq->hash->wait.lock);
                list_del_init(&wq->wqes[node]->wait.entry);
                spin_unlock_irq(&wq->hash->wait.lock);
        }
        put_task_struct(wq->task);
        wq->task = NULL;
}

/*
 * Tear down @wq: unregister its CPU hotplug instance, cancel whatever is
 * still pending on each node, free the per-node state, drop the hash
 * reference and free the io_wq itself.
 */
static void io_wq_destroy(struct io_wq *wq)
{
        int nid;

        cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);

        for_each_node(nid) {
                /* fresh match data (and counters) for every node */
                struct io_cb_cancel_data cancel_everything = {
                        .fn                = io_wq_work_match_all,
                        .cancel_all        = true,
                };
                struct io_wqe *wqe = wq->wqes[nid];

                io_wqe_cancel_pending_work(wqe, &cancel_everything);
                free_cpumask_var(wqe->cpu_mask);
                kfree(wqe);
        }

        io_wq_put_hash(wq->hash);
        kfree(wq);
}

/*
 * Shut down and free @wq. Warns once if IO_WQ_BIT_EXIT has not been set
 * beforehand (see io_wq_exit_start()), then stops the workers and destroys
 * the io_wq.
 */
void io_wq_put_and_exit(struct io_wq *wq)
{
        bool exit_started = test_bit(IO_WQ_BIT_EXIT, &wq->state);

        WARN_ON_ONCE(!exit_started);

        io_wq_exit_workers(wq);
        io_wq_destroy(wq);
}

/* Argument bundle passed to io_wq_worker_affinity() on CPU hotplug events. */
struct online_data {
        unsigned int cpu;        /* CPU whose state changed */
        bool online;                /* true: set the CPU in the mask, false: clear it */
};

static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
{
        struct online_data *od = data;

        if (od->online)
                cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask);
        else
                cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask);
        return false;
}

static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online)
{
        struct online_data od = {
                .cpu = cpu,
                .online = online
        };
        int i;

        rcu_read_lock();
        for_each_node(i)
                io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od);
        rcu_read_unlock();
        return 0;
}

/*
 * CPU hotplug "online" callback registered in io_wq_init(); @node is the
 * cpuhp_node embedded in the io_wq.
 */
static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
{
        return __io_wq_cpu_online(hlist_entry_safe(node, struct io_wq,
                                                   cpuhp_node),
                                  cpu, true);
}

/*
 * CPU hotplug "offline" callback registered in io_wq_init(); @node is the
 * cpuhp_node embedded in the io_wq.
 */
static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
        return __io_wq_cpu_online(hlist_entry_safe(node, struct io_wq,
                                                   cpuhp_node),
                                  cpu, false);
}

/*
 * Set the CPU mask of every per-node queue of @wq. With a non-NULL @mask
 * each node gets a copy of it; with NULL each node is reset to the CPUs of
 * that node. Always returns 0.
 */
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
{
        int nid;

        rcu_read_lock();
        for_each_node(nid) {
                struct io_wqe *wqe = wq->wqes[nid];
                const struct cpumask *src = mask;

                /* no explicit mask: fall back to the node's own CPUs */
                if (!src)
                        src = cpumask_of_node(nid);
                cpumask_copy(wqe->cpu_mask, src);
        }
        rcu_read_unlock();

        return 0;
}

/*
 * Set the max number of workers for each accounting class (bound and
 * unbound) and report the previous limits.
 *
 * @new_count is an array of IO_WQ_ACCT_NR entries, indexed by accounting
 * class. On entry each entry holds the requested limit; a value of 0 leaves
 * that class unchanged, and values above the caller's RLIMIT_NPROC are
 * clamped to it. On return each entry holds the previous limit, as read
 * from the first node.
 *
 * Always returns 0.
 */
int io_wq_max_workers(struct io_wq *wq, int *new_count)
{
        /* read once so the comparison and the clamp use the same value */
        unsigned long limit = task_rlimit(current, RLIMIT_NPROC);
        int prev[IO_WQ_ACCT_NR];
        bool first_node = true;
        int i, node;

        BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND   != (int) IO_WQ_BOUND);
        BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
        BUILD_BUG_ON((int) IO_WQ_ACCT_NR      != 2);

        for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                if (new_count[i] > limit)
                        new_count[i] = limit;
        }

        for (i = 0; i < IO_WQ_ACCT_NR; i++)
                prev[i] = 0;

        rcu_read_lock();
        for_each_node(node) {
                struct io_wqe *wqe = wq->wqes[node];
                struct io_wqe_acct *acct;

                raw_spin_lock(&wqe->lock);
                for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                        acct = &wqe->acct[i];
                        /* only the first node contributes the old value */
                        if (first_node)
                                prev[i] = max_t(int, acct->max_workers, prev[i]);
                        if (new_count[i])
                                acct->max_workers = new_count[i];
                }
                raw_spin_unlock(&wqe->lock);
                first_node = false;
        }
        rcu_read_unlock();

        /* hand the previous limits back through the in/out array */
        for (i = 0; i < IO_WQ_ACCT_NR; i++)
                new_count[i] = prev[i];

        return 0;
}

/*
 * Boot-time setup: register a dynamic multi-instance CPU hotplug state for
 * io-wq and remember its id in io_wq_online. Returns 0 on success or the
 * negative error from the registration.
 */
static __init int io_wq_init(void)
{
        int state;

        state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
                                        io_wq_cpu_online, io_wq_cpu_offline);
        if (state < 0)
                return state;

        /* a non-negative return is the allocated state id */
        io_wq_online = state;
        return 0;
}
subsys_initcall(io_wq_init);
































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM writeback

#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WRITEBACK_H

#include <linux/tracepoint.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>

/*
 * Format an inode state bitmask for trace output: expands to
 * __print_flags() with "|" as the delimiter and a table mapping each I_*
 * flag to its name.
 */
#define show_inode_state(state)                                        \
        __print_flags(state, "|",                                \
                {I_DIRTY_SYNC,                "I_DIRTY_SYNC"},        \
                {I_DIRTY_DATASYNC,        "I_DIRTY_DATASYNC"},        \
                {I_DIRTY_PAGES,                "I_DIRTY_PAGES"},        \
                {I_NEW,                        "I_NEW"},                \
                {I_WILL_FREE,                "I_WILL_FREE"},                \
                {I_FREEING,                "I_FREEING"},                \
                {I_CLEAR,                "I_CLEAR"},                \
                {I_SYNC,                "I_SYNC"},                \
                {I_DIRTY_TIME,                "I_DIRTY_TIME"},        \
                {I_REFERENCED,                "I_REFERENCED"}                \
        )

/* enums need to be exported to user space */
#undef EM
#undef EMe
#define EM(a,b)         TRACE_DEFINE_ENUM(a);
#define EMe(a,b)        TRACE_DEFINE_ENUM(a);

#define WB_WORK_REASON                                                        \
        EM( WB_REASON_BACKGROUND,                "background")                \
        EM( WB_REASON_VMSCAN,                        "vmscan")                \
        EM( WB_REASON_SYNC,                        "sync")                        \
        EM( WB_REASON_PERIODIC,                        "periodic")                \
        EM( WB_REASON_LAPTOP_TIMER,                "laptop_timer")                \
        EM( WB_REASON_FS_FREE_SPACE,                "fs_free_space")        \
        EMe(WB_REASON_FORKER_THREAD,                "forker_thread")

WB_WORK_REASON

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output.
 */
#undef EM
#undef EMe
#define EM(a,b)                { a, b },
#define EMe(a,b)        { a, b }

struct wb_writeback_work;

/*
 * writeback_page_template - event class for per-page writeback events.
 *
 * Takes a page and its address_space. Records the bdi device name (32-byte
 * padded copy), the inode number of the mapping's host, and the page index.
 */
DECLARE_EVENT_CLASS(writeback_page_template,

        TP_PROTO(struct page *page, struct address_space *mapping),

        TP_ARGS(page, mapping),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(pgoff_t, index)
        ),

        TP_fast_assign(
                /* a NULL mapping hands a NULL bdi to bdi_dev_name() */
                strscpy_pad(__entry->name,
                            bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
                                         NULL), 32);
                /* ino is recorded as 0 when there is no mapping or no host inode */
                __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0;
                __entry->index = page->index;
        ),

        TP_printk("bdi %s: ino=%lu index=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->index
        )
);

/* writeback_dirty_page: instance of writeback_page_template (page, mapping). */
DEFINE_EVENT(writeback_page_template, writeback_dirty_page,

        TP_PROTO(struct page *page, struct address_space *mapping),

        TP_ARGS(page, mapping)
);

/* wait_on_page_writeback: instance of writeback_page_template (page, mapping). */
DEFINE_EVENT(writeback_page_template, wait_on_page_writeback,

        TP_PROTO(struct page *page, struct address_space *mapping),

        TP_ARGS(page, mapping)
);

/*
 * writeback_dirty_inode_template - event class for inode-dirtying events.
 *
 * Takes an inode and a flags value. Records the bdi device name, the inode
 * number, a snapshot of inode->i_state and the flags argument. Both state
 * and flags are printed symbolically through show_inode_state().
 */
DECLARE_EVENT_CLASS(writeback_dirty_inode_template,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, flags)
        ),

        TP_fast_assign(
                struct backing_dev_info *bdi = inode_to_bdi(inode);

                /* may be called for files on pseudo FSes w/ unregistered bdi */
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->flags                = flags;
        ),

        TP_printk("bdi %s: ino=%lu state=%s flags=%s",
                __entry->name,
                (unsigned long)__entry->ino,
                show_inode_state(__entry->state),
                show_inode_state(__entry->flags)
        )
);

/*
 * The three events below are instances of writeback_dirty_inode_template;
 * they share its (inode, flags) prototype and output format.
 */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

/*
 * Helpers used by the TP_fast_assign() sections in this file to fill in the
 * cgroup_ino fields. They are only compiled when CREATE_TRACE_POINTS is
 * defined.
 */
#ifdef CREATE_TRACE_POINTS
#ifdef CONFIG_CGROUP_WRITEBACK

/* Return the cgroup inode number reached through @wb's memcg css. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return cgroup_ino(wb->memcg_css->cgroup);
}

/*
 * Return the cgroup inode number for the wb attached to @wbc, or 1 when
 * @wbc has no wb attached.
 */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        struct bdi_writeback *attached = wbc->wb;

        return attached ? __trace_wb_assign_cgroup(attached) : 1;
}
#else        /* CONFIG_CGROUP_WRITEBACK */

/* Without cgroup writeback every wb reports the constant 1. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return 1;
}

/* Without cgroup writeback every wbc reports the constant 1. */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        return 1;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */
#endif        /* CREATE_TRACE_POINTS */

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * inode_foreign_history - records an inode's foreign-history value.
 *
 * Stores the bdi device name for @inode, its inode number, the cgroup inode
 * number derived from @wbc, and @history (printed in hex).
 */
TRACE_EVENT(inode_foreign_history,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                 unsigned int history),

        TP_ARGS(inode, wbc, history),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        history)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
                __entry->history        = history;
        ),

        TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->cgroup_ino,
                __entry->history
        )
);

/*
 * inode_switch_wbs - records an inode together with an old and a new wb.
 *
 * The bdi name is taken from @old_wb->bdi. The cgroup inode numbers of both
 * @old_wb and @new_wb are stored alongside the inode number.
 */
TRACE_EVENT(inode_switch_wbs,

        TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
                 struct bdi_writeback *new_wb),

        TP_ARGS(inode, old_wb, new_wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                old_cgroup_ino)
                __field(ino_t,                new_cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->old_cgroup_ino        = __trace_wb_assign_cgroup(old_wb);
                __entry->new_cgroup_ino        = __trace_wb_assign_cgroup(new_wb);
        ),

        TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->old_cgroup_ino,
                (unsigned long)__entry->new_cgroup_ino
        )
);

/*
 * track_foreign_dirty - records a page together with a wb.
 *
 * Stores the wb's bdi name and id, the inode number behind the page's
 * mapping, the wb's memcg css id and cgroup inode number, and the cgroup
 * inode number reached through page->mem_cgroup.
 */
TRACE_EVENT(track_foreign_dirty,

        TP_PROTO(struct page *page, struct bdi_writeback *wb),

        TP_ARGS(page, wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(u64,                bdi_id)
                __field(ino_t,                ino)
                __field(unsigned int,        memcg_id)
                __field(ino_t,                cgroup_ino)
                __field(ino_t,                page_cgroup_ino)
        ),

        TP_fast_assign(
                struct address_space *mapping = page_mapping(page);
                struct inode *inode = mapping ? mapping->host : NULL;

                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->bdi_id                = wb->bdi->id;
                /* ino is recorded as 0 when the page has no mapping/host inode */
                __entry->ino                = inode ? inode->i_ino : 0;
                __entry->memcg_id        = wb->memcg_css->id;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                /* NOTE(review): page->mem_cgroup is dereferenced unchecked — confirm callers guarantee it is set */
                __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup);
        ),

        TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
                __entry->name,
                __entry->bdi_id,
                (unsigned long)__entry->ino,
                __entry->memcg_id,
                (unsigned long)__entry->cgroup_ino,
                (unsigned long)__entry->page_cgroup_ino
        )
);

/*
 * flush_foreign - records a wb together with a foreign bdi id and memcg id.
 *
 * Stores the wb's bdi name and cgroup inode number plus the two ids exactly
 * as passed in by the caller.
 */
TRACE_EVENT(flush_foreign,

        TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id,
                 unsigned int frn_memcg_id),

        TP_ARGS(wb, frn_bdi_id, frn_memcg_id),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        frn_bdi_id)
                __field(unsigned int,        frn_memcg_id)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->frn_bdi_id        = frn_bdi_id;
                __entry->frn_memcg_id        = frn_memcg_id;
        ),

        TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u",
                __entry->name,
                (unsigned long)__entry->cgroup_ino,
                __entry->frn_bdi_id,
                __entry->frn_memcg_id
        )
);
#endif

/*
 * writeback_write_inode_template - event class for inode write events.
 *
 * Takes an inode and a writeback_control. Records the bdi device name, the
 * inode number, wbc->sync_mode and the cgroup inode number derived from
 * the wbc.
 */
DECLARE_EVENT_CLASS(writeback_write_inode_template,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(int, sync_mode)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->sync_mode,
                (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_write_inode_start and writeback_write_inode are instances of
 * writeback_write_inode_template with the same (inode, wbc) prototype.
 */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/*
 * writeback_work_class - event class describing a wb_writeback_work item.
 *
 * Takes a wb and a work item. Copies the work's nr_pages, sync_mode,
 * for_kupdate, range_cyclic, for_background and reason, together with the
 * wb's bdi name and cgroup inode number. The reason is printed by name via
 * the WB_WORK_REASON table.
 */
DECLARE_EVENT_CLASS(writeback_work_class,
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
        TP_ARGS(wb, work),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_pages)
                __field(dev_t, sb_dev)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, range_cyclic)
                __field(int, for_background)
                __field(int, reason)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->nr_pages = work->nr_pages;
                /* sb_dev is 0 when the work item has no superblock */
                __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
                __entry->sync_mode = work->sync_mode;
                __entry->for_kupdate = work->for_kupdate;
                __entry->range_cyclic = work->range_cyclic;
                __entry->for_background        = work->for_background;
                __entry->reason = work->reason;
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
                  "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu",
                  __entry->name,
                  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
                  __entry->nr_pages,
                  __entry->sync_mode,
                  __entry->for_kupdate,
                  __entry->range_cyclic,
                  __entry->for_background,
                  __print_symbolic(__entry->reason, WB_WORK_REASON),
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * DEFINE_WRITEBACK_WORK_EVENT - shorthand that defines event @name as an
 * instance of writeback_work_class with the (wb, work) prototype.
 */
#define DEFINE_WRITEBACK_WORK_EVENT(name) \
DEFINE_EVENT(writeback_work_class, name, \
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
        TP_ARGS(wb, work))
DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);

/*
 * writeback_pages_written - records the single page count passed by the
 * caller and prints it as a bare signed number.
 */
TRACE_EVENT(writeback_pages_written,
        TP_PROTO(long pages_written),
        TP_ARGS(pages_written),
        TP_STRUCT__entry(
                __field(long,                pages)
        ),
        TP_fast_assign(
                __entry->pages                = pages_written;
        ),
        TP_printk("%ld", __entry->pages)
);

/*
 * writeback_class - minimal event class identifying a wb.
 *
 * Records only the wb's bdi device name and its cgroup inode number.
 */
DECLARE_EVENT_CLASS(writeback_class,
        TP_PROTO(struct bdi_writeback *wb),
        TP_ARGS(wb),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * DEFINE_WRITEBACK_EVENT - shorthand that defines event @name as an
 * instance of writeback_class with the (wb) prototype.
 */
#define DEFINE_WRITEBACK_EVENT(name) \
DEFINE_EVENT(writeback_class, name, \
        TP_PROTO(struct bdi_writeback *wb), \
        TP_ARGS(wb))

DEFINE_WRITEBACK_EVENT(writeback_wake_background);

/*
 * writeback_bdi_register - records only the device name of the given bdi.
 */
TRACE_EVENT(writeback_bdi_register,
        TP_PROTO(struct backing_dev_info *bdi),
        TP_ARGS(bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
        ),
        TP_printk("bdi %s",
                __entry->name
        )
);

/*
 * wbc_class - event class that snapshots a writeback_control.
 *
 * Takes a wbc and a bdi. Records the bdi device name, the wbc's
 * nr_to_write, pages_skipped, sync_mode, for_kupdate, for_background,
 * for_reclaim, range_cyclic, range_start and range_end, and the cgroup
 * inode number derived from the wbc.
 */
DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
        TP_ARGS(wbc, bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_to_write)
                __field(long, pages_skipped)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, for_background)
                __field(int, for_reclaim)
                __field(int, range_cyclic)
                __field(long, range_start)
                __field(long, range_end)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->for_background        = wbc->for_background;
                __entry->for_reclaim        = wbc->for_reclaim;
                __entry->range_cyclic        = wbc->range_cyclic;
                /* the range bounds are stored narrowed to long */
                __entry->range_start        = (long)wbc->range_start;
                __entry->range_end        = (long)wbc->range_end;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
                "bgrd=%d reclm=%d cyclic=%d "
                "start=0x%lx end=0x%lx cgroup_ino=%lu",
                __entry->name,
                __entry->nr_to_write,
                __entry->pages_skipped,
                __entry->sync_mode,
                __entry->for_kupdate,
                __entry->for_background,
                __entry->for_reclaim,
                __entry->range_cyclic,
                __entry->range_start,
                __entry->range_end,
                (unsigned long)__entry->cgroup_ino
        )
);

/*
 * DEFINE_WBC_EVENT - shorthand that defines event @name as an instance of
 * wbc_class with the (wbc, bdi) prototype.
 */
#define DEFINE_WBC_EVENT(name) \
DEFINE_EVENT(wbc_class, name, \
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
        TP_ARGS(wbc, bdi))
DEFINE_WBC_EVENT(wbc_writepage);

/*
 * writeback_queue_io - records a queue_io pass on a wb.
 *
 * Stores @dirtied_before both raw ("older", in jiffies) and as a relative
 * age in milliseconds computed against the current jiffies, the @moved
 * count (printed as "enqueue"), the work's reason and the wb's cgroup
 * inode number.
 */
TRACE_EVENT(writeback_queue_io,
        TP_PROTO(struct bdi_writeback *wb,
                 struct wb_writeback_work *work,
                 unsigned long dirtied_before,
                 int moved),
        TP_ARGS(wb, work, dirtied_before, moved),
        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(unsigned long,        older)
                __field(long,                age)
                __field(int,                moved)
                __field(int,                reason)
                __field(ino_t,                cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->older        = dirtied_before;
                /* jiffies delta scaled to milliseconds */
                __entry->age        = (jiffies - dirtied_before) * 1000 / HZ;
                __entry->moved        = moved;
                __entry->reason        = work->reason;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu",
                __entry->name,
                __entry->older,        /* dirtied_before in jiffies */
                __entry->age,        /* dirtied_before in relative milliseconds */
                __entry->moved,
                __print_symbolic(__entry->reason, WB_WORK_REASON),
                (unsigned long)__entry->cgroup_ino
        )
);

/*
 * global_dirty_state - snapshot of global dirty/writeback counters.
 *
 * The two thresholds come from the caller. The remaining fields are read
 * at event time from global_node_page_state() (NR_FILE_DIRTY, NR_WRITEBACK,
 * NR_DIRTIED, NR_WRITTEN) and from global_wb_domain.dirty_limit.
 */
TRACE_EVENT(global_dirty_state,

        TP_PROTO(unsigned long background_thresh,
                 unsigned long dirty_thresh
        ),

        TP_ARGS(background_thresh,
                dirty_thresh
        ),

        TP_STRUCT__entry(
                __field(unsigned long,        nr_dirty)
                __field(unsigned long,        nr_writeback)
                __field(unsigned long,        background_thresh)
                __field(unsigned long,        dirty_thresh)
                __field(unsigned long,        dirty_limit)
                __field(unsigned long,        nr_dirtied)
                __field(unsigned long,        nr_written)
        ),

        TP_fast_assign(
                __entry->nr_dirty        = global_node_page_state(NR_FILE_DIRTY);
                __entry->nr_writeback        = global_node_page_state(NR_WRITEBACK);
                __entry->nr_dirtied        = global_node_page_state(NR_DIRTIED);
                __entry->nr_written        = global_node_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh        = dirty_thresh;
                __entry->dirty_limit        = global_wb_domain.dirty_limit;
        ),

        TP_printk("dirty=%lu writeback=%lu "
                  "bg_thresh=%lu thresh=%lu limit=%lu "
                  "dirtied=%lu written=%lu",
                  __entry->nr_dirty,
                  __entry->nr_writeback,
                  __entry->background_thresh,
                  __entry->dirty_thresh,
                  __entry->dirty_limit,
                  __entry->nr_dirtied,
                  __entry->nr_written
        )
);

/*
 * KBps - scale @x by 2^(PAGE_SHIFT - 10), i.e. multiply by page size / 1024.
 * Presumably converts a pages-per-second rate to KB/s, as the name suggests;
 * the inputs' units are not visible here.
 */
#define KBps(x)                        ((x) << (PAGE_SHIFT - 10))

/*
 * bdi_dirty_ratelimit - records bandwidth and ratelimit values of a wb.
 *
 * Every numeric field is passed through KBps() before being stored:
 * the wb's write_bandwidth, avg_write_bandwidth, dirty_ratelimit and
 * balanced_dirty_ratelimit, plus the caller-supplied @dirty_rate and
 * @task_ratelimit. The wb's bdi name and cgroup inode number are recorded
 * as well.
 */
TRACE_EVENT(bdi_dirty_ratelimit,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long dirty_rate,
                 unsigned long task_ratelimit),

        TP_ARGS(wb, dirty_rate, task_ratelimit),

        TP_STRUCT__entry(
                __array(char,                bdi, 32)
                __field(unsigned long,        write_bw)
                __field(unsigned long,        avg_write_bw)
                __field(unsigned long,        dirty_rate)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned long,        balanced_dirty_ratelimit)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
                __entry->write_bw        = KBps(wb->write_bandwidth);
                __entry->avg_write_bw        = KBps(wb->avg_write_bandwidth);
                __entry->dirty_rate        = KBps(dirty_rate);
                __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
                                        KBps(wb->balanced_dirty_ratelimit);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),

        TP_printk("bdi %s: "
                  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "balanced_dirty_ratelimit=%lu cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->write_bw,                /* write bandwidth */
                  __entry->avg_write_bw,        /* avg write bandwidth */
                  __entry->dirty_rate,                /* bdi dirty rate */
                  __entry->dirty_ratelimit,        /* base ratelimit */
                  __entry->task_ratelimit, /* ratelimit with position control */
                  __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * balance_dirty_pages - records the inputs and derived values of one
 * dirty-throttling decision.
 *
 * Derived fields computed here rather than passed in:
 *   limit        = global_wb_domain.dirty_limit
 *   setpoint     = (limit + freerun) / 2, freerun = (thresh + bg_thresh) / 2
 *   bdi_setpoint = setpoint * bdi_thresh / (thresh + 1)
 *   paused/pause/period/think are converted from jiffies to milliseconds
 *   dirtied_pause and think are read from the current task
 * The two ratelimits are stored after KBps() scaling.
 */
TRACE_EVENT(balance_dirty_pages,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long thresh,
                 unsigned long bg_thresh,
                 unsigned long dirty,
                 unsigned long bdi_thresh,
                 unsigned long bdi_dirty,
                 unsigned long dirty_ratelimit,
                 unsigned long task_ratelimit,
                 unsigned long dirtied,
                 unsigned long period,
                 long pause,
                 unsigned long start_time),

        TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
                dirty_ratelimit, task_ratelimit,
                dirtied, period, pause, start_time),

        TP_STRUCT__entry(
                __array(         char,        bdi, 32)
                __field(unsigned long,        limit)
                __field(unsigned long,        setpoint)
                __field(unsigned long,        dirty)
                __field(unsigned long,        bdi_setpoint)
                __field(unsigned long,        bdi_dirty)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned int,        dirtied)
                __field(unsigned int,        dirtied_pause)
                __field(unsigned long,        paused)
                __field(         long,        pause)
                __field(unsigned long,        period)
                __field(         long,        think)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                unsigned long freerun = (thresh + bg_thresh) / 2;
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);

                __entry->limit                = global_wb_domain.dirty_limit;
                __entry->setpoint        = (global_wb_domain.dirty_limit +
                                                freerun) / 2;
                __entry->dirty                = dirty;
                /* thresh + 1 keeps the divisor non-zero when thresh is 0 */
                __entry->bdi_setpoint        = __entry->setpoint *
                                                bdi_thresh / (thresh + 1);
                __entry->bdi_dirty        = bdi_dirty;
                __entry->dirty_ratelimit = KBps(dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                /* unsigned long argument stored in an unsigned int field */
                __entry->dirtied        = dirtied;
                __entry->dirtied_pause        = current->nr_dirtied_pause;
                /* think is 0 when the task has no recorded dirty_paused_when */
                __entry->think                = current->dirty_paused_when == 0 ? 0 :
                         (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
                __entry->period                = period * 1000 / HZ;
                __entry->pause                = pause * 1000 / HZ;
                __entry->paused                = (jiffies - start_time) * 1000 / HZ;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),


        TP_printk("bdi %s: "
                  "limit=%lu setpoint=%lu dirty=%lu "
                  "bdi_setpoint=%lu bdi_dirty=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "dirtied=%u dirtied_pause=%u "
                  "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->limit,
                  __entry->setpoint,
                  __entry->dirty,
                  __entry->bdi_setpoint,
                  __entry->bdi_dirty,
                  __entry->dirty_ratelimit,
                  __entry->task_ratelimit,
                  __entry->dirtied,
                  __entry->dirtied_pause,
                  __entry->paused,        /* ms */
                  __entry->pause,        /* ms */
                  __entry->period,        /* ms */
                  __entry->think,        /* ms */
                  (unsigned long)__entry->cgroup_ino
          )
);

/*
 * writeback_sb_inodes_requeue - records an inode being requeued.
 *
 * Stores the bdi device name, inode number, inode->i_state,
 * inode->dirtied_when and the cgroup inode number of inode_to_wb(inode).
 *
 * NOTE(review): "age" is not stored in the entry; it is computed from
 * jiffies inside TP_printk, so it presumably reflects the time the event is
 * formatted rather than the time it was recorded — confirm.
 */
TRACE_EVENT(writeback_sb_inodes_requeue,

        TP_PROTO(struct inode *inode),
        TP_ARGS(inode),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(inode_to_wb(inode));
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_congest_waited_template - event class recording a timeout and
 * a delay, both passed in by the caller and printed as microsecond counts.
 */
DECLARE_EVENT_CLASS(writeback_congest_waited_template,

        TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),

        TP_ARGS(usec_timeout, usec_delayed),

        TP_STRUCT__entry(
                __field(        unsigned int,        usec_timeout        )
                __field(        unsigned int,        usec_delayed        )
        ),

        TP_fast_assign(
                __entry->usec_timeout        = usec_timeout;
                __entry->usec_delayed        = usec_delayed;
        ),

        TP_printk("usec_timeout=%u usec_delayed=%u",
                        __entry->usec_timeout,
                        __entry->usec_delayed)
);

/*
 * writeback_congestion_wait and writeback_wait_iff_congested are instances
 * of writeback_congest_waited_template (usec_timeout, usec_delayed).
 */
DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,

        TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),

        TP_ARGS(usec_timeout, usec_delayed)
);

DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,

        TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),

        TP_ARGS(usec_timeout, usec_delayed)
);

/*
 * writeback_single_inode_template - event class for writeback of one inode.
 *
 * Takes the inode, the wbc and the caller's @nr_to_write. Records the bdi
 * name, inode number, i_state, dirtied_when, the mapping's writeback_index,
 * @nr_to_write, and "wrote" computed as @nr_to_write - wbc->nr_to_write.
 *
 * NOTE(review): "age" is computed from jiffies inside TP_printk rather than
 * stored, so it presumably reflects formatting time — confirm.
 */
DECLARE_EVENT_CLASS(writeback_single_inode_template,

        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write
        ),

        TP_ARGS(inode, wbc, nr_to_write),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(unsigned long, writeback_index)
                __field(long, nr_to_write)
                __field(unsigned long, wrote)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->nr_to_write        = nr_to_write;
                /* difference between the caller's budget and what the wbc has left */
                __entry->wrote                = nr_to_write - wbc->nr_to_write;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
                  "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,
                  __entry->writeback_index,
                  __entry->nr_to_write,
                  __entry->wrote,
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_single_inode_start and writeback_single_inode are instances of
 * writeback_single_inode_template (inode, wbc, nr_to_write).
 */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/*
 * writeback_inode_template - event class identifying an inode by device.
 *
 * Unlike the bdi-named classes in this file, this one records the
 * superblock's s_dev (printed as major,minor), together with the inode
 * number, i_state, i_mode (printed in octal) and dirtied_when.
 */
DECLARE_EVENT_CLASS(writeback_inode_template,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(unsigned long,        state                        )
                __field(        __u16, mode                        )
                __field(unsigned long, dirtied_when                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->state        = inode->i_state;
                __entry->mode        = inode->i_mode;
                __entry->dirtied_when = inode->dirtied_when;
        ),

        TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long)__entry->ino, __entry->dirtied_when,
                  show_inode_state(__entry->state), __entry->mode)
);

/*
 * All events below are instances of writeback_inode_template and take a
 * single inode argument.
 */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * Inode writeback list tracking.
 */

DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

#endif /* _TRACE_WRITEBACK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_H
#define _LINUX_SCHED_TASK_H

/*
 * Interface between the scheduler and various task lifetime (fork()/exit())
 * functionality:
 */

#include <linux/sched.h>
#include <linux/uaccess.h>

struct task_struct;
struct rusage;
union thread_union;
struct css_set;

/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL

/*
 * Arguments describing a task to be created. A pointer to this structure
 * is what kernel_clone() and sched_post_fork() (both declared in this
 * header) take.
 */
struct kernel_clone_args {
        /* 64-bit flag word; presumably the CLONE_* flags - TODO confirm. */
        u64 flags;
        /* The three pointers below are annotated __user. */
        int __user *pidfd;
        int __user *child_tid;
        int __user *parent_tid;
        int exit_signal;
        unsigned long stack;
        unsigned long stack_size;
        unsigned long tls;
        pid_t *set_tid;
        /* Number of elements in *set_tid */
        size_t set_tid_size;
        int cgroup;
        /* NOTE(review): presumably set for create_io_thread() tasks - confirm. */
        int io_thread;
        struct cgroup *cgrp;
        struct css_set *cset;
};

/*
 * This serializes "schedule()" and also protects
 * the run-queue from deletions/modifications (but
 * _adding_ to the beginning of the run-queue has
 * a separate lock).
 */
extern rwlock_t tasklist_lock;
extern spinlock_t mmlist_lock;

extern union thread_union init_thread_union;
extern struct task_struct init_task;

#ifdef CONFIG_PROVE_RCU
extern int lockdep_tasklist_lock_is_held(void);
#endif /* #ifdef CONFIG_PROVE_RCU */

extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_post_fork(struct task_struct *p,
                            struct kernel_clone_args *kargs);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
void __noreturn make_task_dead(int signr);

extern void mm_cache_init(void);
extern void proc_caches_init(void);

extern void fork_init(void);

extern void release_task(struct task_struct * p);

extern int copy_thread(unsigned long, unsigned long, unsigned long,
                       struct task_struct *, unsigned long);

extern void flush_thread(void);

/*
 * exit_thread() is an out-of-line function when CONFIG_HAVE_EXIT_THREAD
 * is set; otherwise it is an empty inline stub, so callers need no #ifdef.
 */
#ifdef CONFIG_HAVE_EXIT_THREAD
extern void exit_thread(struct task_struct *tsk);
#else
static inline void exit_thread(struct task_struct *tsk)
{
}
#endif
extern void do_group_exit(int);

extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);

extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);

extern void free_task(struct task_struct *tsk);

/*
 * sched_exec is called by processes performing an exec.
 *
 * Without CONFIG_SMP it expands to an empty statement. The do/while(0)
 * form (rather than a bare "{}") keeps "sched_exec();" a single
 * statement, so it stays valid in an unbraced if/else.
 */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
#define sched_exec()   do { } while (0)
#endif

/*
 * get_task_struct - take a reference on a task
 * @t: the task; its ->usage refcount is incremented
 *
 * Returns @t itself, so the call can be used inside an expression.
 */
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
        refcount_inc(&t->usage);
        return t;
}

extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);

/*
 * put_task_struct - drop one reference on a task
 * @t: the task whose ->usage refcount is decremented
 *
 * Nothing more happens unless refcount_dec_and_test() reports that this
 * was the last reference; in that case the task is released through
 * __put_task_struct(), either directly or from an RCU callback.
 */
static inline void put_task_struct(struct task_struct *t)
{
        if (refcount_dec_and_test(&t->usage)) {
                /*
                 * Under PREEMPT_RT the final put cannot be done in atomic
                 * context, because it would indirectly acquire sleeping
                 * locks. In that case call_rcu() is used so that
                 * __put_task_struct_rcu_cb() gets called in process
                 * context instead.
                 *
                 * Sharing ->rcu is safe here:
                 *
                 * - We only get this far once refcount_dec_and_test() on
                 *   ->usage has succeeded. put_task_struct_rcu_user()
                 *   abuses ->rcu the same way, but rcu_users itself holds
                 *   a reference, so ->usage can't be zero after the
                 *   rcu_users 1 -> 0 transition; the two can't "conflict".
                 *
                 * - delayed_free_task() also uses ->rcu, but it is only
                 *   called when forking a process fails, so it cannot
                 *   conflict with put_task_struct() either.
                 */
                if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible())
                        __put_task_struct(t);
                else
                        call_rcu(&t->rcu, __put_task_struct_rcu_cb);
        }
}

/*
 * put_task_struct_many - drop several references on a task at once
 * @t: the task
 * @nr: how many references to subtract from t->usage
 *
 * Calls __put_task_struct() when refcount_sub_and_test() returns true.
 *
 * NOTE(review): unlike put_task_struct(), the release here is never
 * deferred through call_rcu(); confirm whether callers can run in atomic
 * context on PREEMPT_RT.
 */
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
        if (!refcount_sub_and_test(nr, &t->usage))
                return;

        __put_task_struct(t);
}

void put_task_struct_rcu_user(struct task_struct *task);

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
# define arch_task_struct_size (sizeof(struct task_struct))
#endif

#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
/*
 * Fallback used when the architecture declares no thread_struct
 * whitelist: we must then assume that anything in there may need to be
 * copied to userspace, so the reported window starts at offset 0 and
 * spans everything from ->thread to the end of the task_struct.
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                unsigned long *size)
{
        const unsigned long thread_start = offsetof(struct task_struct, thread);

        *offset = 0;
        /* arch_task_struct_size copes with a dynamically sized thread_struct. */
        *size = arch_task_struct_size - thread_start;
}
#endif

/*
 * task_stack_vm_area - vm_struct recorded for a task's stack
 * @t: the task
 *
 * With CONFIG_VMAP_STACK this returns t->stack_vm_area; without it the
 * function always returns NULL.
 */
#ifdef CONFIG_VMAP_STACK
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
        return t->stack_vm_area;
}
#else
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
        return NULL;
}
#endif

/*
 * task_lock - take p->alloc_lock (a spinlock, acquired with spin_lock()).
 *
 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
 * subscriptions and synchronises with wait4().  Also used in procfs.  Also
 * pins the final release of task.io_context.  Also protects ->cpuset and
 * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
 *
 * Nests both inside and outside of read_lock(&tasklist_lock).
 * It must not be nested with write_lock_irq(&tasklist_lock),
 * neither inside nor outside.
 */
static inline void task_lock(struct task_struct *p)
{
        spin_lock(&p->alloc_lock);
}

/* task_unlock - release p->alloc_lock taken by task_lock(). */
static inline void task_unlock(struct task_struct *p)
{
        spin_unlock(&p->alloc_lock);
}

#endif /* _LINUX_SCHED_TASK_H */





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_CPUMASK_H
#define __LINUX_CPUMASK_H

/*
 * Cpumasks provide a bitmap suitable for representing the
 * set of CPU's in a system, one bit position per CPU number.  In general,
 * only nr_cpu_ids (<= NR_CPUS) bits are valid.
 */
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/* Don't assign or return these: may not be this big! */
typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;

/**
 * cpumask_bits - get the bits in a cpumask
 * @maskp: the struct cpumask *
 *
 * You should only assume nr_cpu_ids bits of this mask are valid.  This is
 * a macro so it's const-correct.
 */
#define cpumask_bits(maskp) ((maskp)->bits)

/**
 * cpumask_pr_args - printf args to output a cpumask
 * @maskp: cpumask to be printed
 *
 * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
 */
#define cpumask_pr_args(maskp)                nr_cpu_ids, cpumask_bits(maskp)

#if NR_CPUS == 1
#define nr_cpu_ids                1U
#else
extern unsigned int nr_cpu_ids;
#endif

#ifdef CONFIG_CPUMASK_OFFSTACK
/* Assuming NR_CPUS is huge, a runtime limit is more efficient.  Also,
 * not all bits may be allocated. */
#define nr_cpumask_bits        nr_cpu_ids
#else
#define nr_cpumask_bits        ((unsigned int)NR_CPUS)
#endif

/*
 * The following particular system cpumasks and operations manage
 * possible, present, active and online cpus.
 *
 *     cpu_possible_mask- has bit 'cpu' set iff cpu is populatable
 *     cpu_present_mask - has bit 'cpu' set iff cpu is populated
 *     cpu_online_mask  - has bit 'cpu' set iff cpu available to scheduler
 *     cpu_active_mask  - has bit 'cpu' set iff cpu available to migration
 *
 *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
 *
 *  The cpu_possible_mask is fixed at boot time, as the set of CPU id's
 *  that it is possible might ever be plugged in at anytime during the
 *  life of that system boot.  The cpu_present_mask is dynamic(*),
 *  representing which CPUs are currently plugged in.  And
 *  cpu_online_mask is the dynamic subset of cpu_present_mask,
 *  indicating those CPUs available for scheduling.
 *
 *  If HOTPLUG is enabled, then cpu_possible_mask is forced to have
 *  all NR_CPUS bits set, otherwise it is just the set of CPUs that
 *  ACPI reports present at boot.
 *
 *  If HOTPLUG is enabled, then cpu_present_mask varies dynamically,
 *  depending on what ACPI reports as currently plugged in, otherwise
 *  cpu_present_mask is just a copy of cpu_possible_mask.
 *
 *  (*) Well, cpu_present_mask is dynamic in the hotplug case.  If not
 *      hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
 *
 * Subtleties:
 * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
 *    assumption that their single CPU is online.  The UP
 *    cpu_{online,possible,present}_masks are placebos.  Changing them
 *    will have no useful effect on the following num_*_cpus()
 *    and cpu_*() macros in the UP case.  This ugliness is a UP
 *    optimization - don't waste any instructions or memory references
 *    asking if you're online or how many CPUs there are if there is
 *    only one CPU.
 */

extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
#define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)

extern atomic_t __num_online_cpus;

#if NR_CPUS > 1
/**
 * num_online_cpus() - Read the number of online CPUs
 *
 * Despite the fact that __num_online_cpus is of type atomic_t, this
 * interface gives only a momentary snapshot and is not protected against
 * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
 * region.
 */
static inline unsigned int num_online_cpus(void)
{
        return atomic_read(&__num_online_cpus);
}
#define num_possible_cpus()        cpumask_weight(cpu_possible_mask)
#define num_present_cpus()        cpumask_weight(cpu_present_mask)
#define num_active_cpus()        cpumask_weight(cpu_active_mask)
#define cpu_online(cpu)                cpumask_test_cpu((cpu), cpu_online_mask)
#define cpu_possible(cpu)        cpumask_test_cpu((cpu), cpu_possible_mask)
#define cpu_present(cpu)        cpumask_test_cpu((cpu), cpu_present_mask)
#define cpu_active(cpu)                cpumask_test_cpu((cpu), cpu_active_mask)
#else
#define num_online_cpus()        1U
#define num_possible_cpus()        1U
#define num_present_cpus()        1U
#define num_active_cpus()        1U
#define cpu_online(cpu)                ((cpu) == 0)
#define cpu_possible(cpu)        ((cpu) == 0)
#define cpu_present(cpu)        ((cpu) == 0)
#define cpu_active(cpu)                ((cpu) == 0)
#endif

extern cpumask_t cpus_booted_once_mask;

/*
 * cpu_max_bits_warn - debug check of a cpu index against a bit count
 * @cpu: cpu index being used
 * @bits: number of bits the index must stay below
 *
 * With CONFIG_DEBUG_PER_CPU_MAPS this issues WARN_ON_ONCE() when
 * @cpu >= @bits; without that option the body is empty.
 */
static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
{
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
        WARN_ON_ONCE(cpu >= bits);
#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
}

/*
 * Verify the cpu argument to cpumask_* operators: @cpu is checked against
 * nr_cpumask_bits via cpu_max_bits_warn() and then returned unchanged, so
 * the call can wrap an argument in place.
 */
static inline unsigned int cpumask_check(unsigned int cpu)
{
        cpu_max_bits_warn(cpu, nr_cpumask_bits);
        return cpu;
}

#if NR_CPUS == 1
/* Uniprocessor.  Assume all masks are "1". */
/* UP variant: @srcp is not inspected, the answer is always cpu 0. */
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
        (void)srcp;
        return 0U;
}

/* UP variant: @srcp is not inspected, the answer is always cpu 0. */
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
        (void)srcp;
        return 0U;
}

/*
 * UP variant: the mask is ignored and the result is simply @n + 1.
 * Valid inputs for n are -1 and 0.
 */
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
        const int following = n + 1;

        return following;
}

/* UP variant: the mask is ignored and the result is simply @n + 1. */
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
        const int following = n + 1;

        return following;
}

/* UP variant: both masks are ignored and the result is simply @n + 1. */
static inline unsigned int cpumask_next_and(int n,
                                            const struct cpumask *srcp,
                                            const struct cpumask *andp)
{
        const int following = n + 1;

        return following;
}

/*
 * UP variant: yields cpu 0, except in the stop condition - wrapping is
 * allowed and the walk is already at cpu 0 - where 1 is returned instead.
 * @mask and @start are not used.
 */
static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
                                             int start, bool wrap)
{
        if (wrap && n == 0)
                return 1;

        return 0;
}

/*
 * UP variant: @cpu must be a valid cpu, i.e. 0, so no other cpu is left
 * to pick; neither argument is inspected and 1 is always returned.
 */
static inline unsigned int cpumask_any_but(const struct cpumask *mask,
                                           unsigned int cpu)
{
        (void)mask;
        (void)cpu;
        return 1U;
}

/* UP variant: both arguments are ignored; the answer is always cpu 0. */
static inline unsigned int cpumask_local_spread(unsigned int i, int node)
{
        (void)i;
        (void)node;
        return 0U;
}

/* UP variant: same as cpumask_next_and() started from -1. */
static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
                                             const struct cpumask *src2p)
{
        const int before_first = -1;

        return cpumask_next_and(before_first, src1p, src2p);
}

/*
 * UP iterators: each loop body runs exactly once, with @cpu == 0, and
 * @cpu is 1 afterwards. The mask (and start) arguments are only evaluated
 * and discarded through a (void) cast; every argument is parenthesised so
 * that an expression such as "cond ? a : b" can be passed as a mask.
 */
#define for_each_cpu(cpu, mask)                        \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask))
#define for_each_cpu_not(cpu, mask)                \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask))
#define for_each_cpu_wrap(cpu, mask, start)        \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask), (void)(start))
#define for_each_cpu_and(cpu, mask1, mask2)        \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask1), (void)(mask2))
#else
/**
 * cpumask_first - lowest-numbered cpu set in a cpumask
 * @srcp: cpumask to search
 *
 * The search covers nr_cpumask_bits bits.
 *
 * Returns >= nr_cpu_ids if no cpus set.
 */
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
        return find_first_bit(cpumask_bits(srcp),
                              nr_cpumask_bits);
}

/**
 * cpumask_last - highest-numbered cpu set in a cpumask
 * @srcp: cpumask to search
 *
 * The search covers nr_cpumask_bits bits.
 *
 * Returns >= nr_cpumask_bits if no CPUs set.
 */
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
        return find_last_bit(cpumask_bits(srcp),
                             nr_cpumask_bits);
}

unsigned int cpumask_next(int n, const struct cpumask *srcp);

/**
 * cpumask_next_zero - find the next cpu that is clear in a cpumask
 * @n: cpu just before the search start (the result will be > @n)
 * @srcp: cpumask to search
 *
 * Returns >= nr_cpu_ids if no further cpus unset.
 */
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
        /* Only real cpu numbers are range-checked; -1 is a legal start. */
        if (n != -1)
                cpumask_check(n);

        return find_next_zero_bit(cpumask_bits(srcp),
                                  nr_cpumask_bits,
                                  n + 1);
}

int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
unsigned int cpumask_local_spread(unsigned int i, int node);
int cpumask_any_and_distribute(const struct cpumask *src1p,
                               const struct cpumask *src2p);

/**
 * for_each_cpu - iterate over every cpu in a mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu(cpu, mask)                                \
        for ((cpu) = -1;                                \
                (cpu) = cpumask_next((cpu), (mask)),        \
                (cpu) < nr_cpu_ids;)

/**
 * for_each_cpu_not - iterate over every cpu in a complemented mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_not(cpu, mask)                                \
        for ((cpu) = -1;                                        \
                (cpu) = cpumask_next_zero((cpu), (mask)),        \
                (cpu) < nr_cpu_ids;)

extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);

/**
 * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 * @start: the start location
 *
 * The implementation does not assume any bit in @mask is set (including @start).
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_wrap(cpu, mask, start)                                        \
        for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false);        \
             (cpu) < nr_cpumask_bits;                                                \
             (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))

/**
 * for_each_cpu_and - iterate over every cpu in both masks
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_and(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_and(cpu, mask1, mask2)                                \
        for ((cpu) = -1;                                                \
                (cpu) = cpumask_next_and((cpu), (mask1), (mask2)),        \
                (cpu) < nr_cpu_ids;)
#endif /* SMP */

/*
 * CPU_BITS_NONE: array initializer that sets every one of the
 * BITS_TO_LONGS(NR_CPUS) words to zero.
 */
#define CPU_BITS_NONE                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL                        \
}

/* CPU_BITS_CPU0: array initializer whose word 0 is 1UL, i.e. only bit 0 set. */
#define CPU_BITS_CPU0                                                \
{                                                                \
        [0] =  1UL                                                \
}

/**
 * cpumask_set_cpu - mark one cpu as set in a cpumask
 * @cpu: index of the cpu to set (< nr_cpu_ids)
 * @dstp: cpumask to modify
 *
 * The index is passed through cpumask_check() before set_bit() is applied.
 */
static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        const unsigned int bit = cpumask_check(cpu);

        set_bit(bit, cpumask_bits(dstp));
}

/*
 * __cpumask_set_cpu - like cpumask_set_cpu(), but built on __set_bit()
 * instead of set_bit().
 * NOTE(review): presumably the non-atomic flavour - confirm against the
 * bitops documentation.
 */
static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        __set_bit(cpumask_check(cpu), cpumask_bits(dstp));
}


/**
 * cpumask_clear_cpu - mark one cpu as clear in a cpumask
 * @cpu: index of the cpu to clear (< nr_cpu_ids)
 * @dstp: cpumask to modify
 *
 * The index is passed through cpumask_check() before clear_bit() is applied.
 */
static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        const unsigned int bit = cpumask_check(cpu);

        clear_bit(bit, cpumask_bits(dstp));
}

/*
 * __cpumask_clear_cpu - like cpumask_clear_cpu(), but built on
 * __clear_bit() instead of clear_bit().
 * NOTE(review): presumably the non-atomic flavour - confirm against the
 * bitops documentation.
 */
static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        __clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
}

/**
 * cpumask_test_cpu - query whether a cpu is set in a cpumask
 * @cpu: index of the cpu to look up (< nr_cpu_ids)
 * @cpumask: cpumask to inspect
 *
 * Returns 1 if @cpu is set in @cpumask, else returns 0
 */
static inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
{
        const unsigned int bit = cpumask_check(cpu);

        return test_bit(bit, cpumask_bits(cpumask));
}

/**
 * cpumask_test_and_set_cpu - atomically set a cpu and report its old state
 * @cpu: index of the cpu (< nr_cpu_ids)
 * @cpumask: cpumask to modify
 *
 * Thin wrapper around test_and_set_bit() for cpumasks.
 *
 * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0
 */
static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
{
        const unsigned int bit = cpumask_check(cpu);

        return test_and_set_bit(bit, cpumask_bits(cpumask));
}

/**
 * cpumask_test_and_clear_cpu - atomically clear a cpu and report its old state
 * @cpu: index of the cpu (< nr_cpu_ids)
 * @cpumask: cpumask to modify
 *
 * Thin wrapper around test_and_clear_bit() for cpumasks.
 *
 * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0
 */
static inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
{
        const unsigned int bit = cpumask_check(cpu);

        return test_and_clear_bit(bit, cpumask_bits(cpumask));
}

/**
 * cpumask_setall - turn on every cpu (< nr_cpu_ids) in a cpumask
 * @dstp: cpumask to fill
 *
 * Fills nr_cpumask_bits bits through bitmap_fill().
 */
static inline void cpumask_setall(struct cpumask *dstp)
{
        bitmap_fill(cpumask_bits(dstp),
                    nr_cpumask_bits);
}

/**
 * cpumask_clear - turn off every cpu (< nr_cpu_ids) in a cpumask
 * @dstp: cpumask to empty
 *
 * Zeroes nr_cpumask_bits bits through bitmap_zero().
 */
static inline void cpumask_clear(struct cpumask *dstp)
{
        bitmap_zero(cpumask_bits(dstp),
                    nr_cpumask_bits);
}

/**
 * cpumask_and - intersect two cpumasks: *dstp = *src1p & *src2p
 * @dstp: where the result is stored
 * @src1p: first operand
 * @src2p: second operand
 *
 * If *@dstp is empty, returns 0, else returns 1
 */
static inline int cpumask_and(struct cpumask *dstp,
                              const struct cpumask *src1p,
                              const struct cpumask *src2p)
{
        return bitmap_and(cpumask_bits(dstp),
                          cpumask_bits(src1p),
                          cpumask_bits(src2p),
                          nr_cpumask_bits);
}

/**
 * cpumask_or - unite two cpumasks: *dstp = *src1p | *src2p
 * @dstp: where the result is stored
 * @src1p: first operand
 * @src2p: second operand
 */
static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
                              const struct cpumask *src2p)
{
        bitmap_or(cpumask_bits(dstp),
                  cpumask_bits(src1p),
                  cpumask_bits(src2p),
                  nr_cpumask_bits);
}

/**
 * cpumask_xor - exclusive-or of two cpumasks: *dstp = *src1p ^ *src2p
 * @dstp: where the result is stored
 * @src1p: first operand
 * @src2p: second operand
 */
static inline void cpumask_xor(struct cpumask *dstp,
                               const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        bitmap_xor(cpumask_bits(dstp),
                   cpumask_bits(src1p),
                   cpumask_bits(src2p),
                   nr_cpumask_bits);
}

/**
 * cpumask_andnot - remove one cpumask from another: *dstp = *src1p & ~*src2p
 * @dstp: where the result is stored
 * @src1p: mask to start from
 * @src2p: mask whose cpus are removed
 *
 * If *@dstp is empty, returns 0, else returns 1
 */
static inline int cpumask_andnot(struct cpumask *dstp,
                                 const struct cpumask *src1p,
                                 const struct cpumask *src2p)
{
        return bitmap_andnot(cpumask_bits(dstp),
                             cpumask_bits(src1p),
                             cpumask_bits(src2p),
                             nr_cpumask_bits);
}

/**
 * cpumask_complement - invert a cpumask: *dstp = ~*srcp
 * @dstp: where the result is stored
 * @srcp: mask to invert
 */
static inline void cpumask_complement(struct cpumask *dstp,
                                      const struct cpumask *srcp)
{
        bitmap_complement(cpumask_bits(dstp),
                          cpumask_bits(srcp),
                          nr_cpumask_bits);
}

/**
 * cpumask_equal - compare two cpumasks: *src1p == *src2p
 * @src1p: first mask
 * @src2p: second mask
 *
 * Returns the result of bitmap_equal() over nr_cpumask_bits bits.
 */
static inline bool cpumask_equal(const struct cpumask *src1p,
                                 const struct cpumask *src2p)
{
        const bool same = bitmap_equal(cpumask_bits(src1p),
                                       cpumask_bits(src2p),
                                       nr_cpumask_bits);

        return same;
}

/**
 * cpumask_or_equal - test whether (*src1p | *src2p) == *src3p
 * @src1p: first mask of the union
 * @src2p: second mask of the union
 * @src3p: mask the union is compared with
 *
 * Returns the result of bitmap_or_equal() over nr_cpumask_bits bits.
 */
static inline bool cpumask_or_equal(const struct cpumask *src1p,
                                    const struct cpumask *src2p,
                                    const struct cpumask *src3p)
{
        const bool same = bitmap_or_equal(cpumask_bits(src1p),
                                          cpumask_bits(src2p),
                                          cpumask_bits(src3p),
                                          nr_cpumask_bits);

        return same;
}

/**
 * cpumask_intersects - test whether two cpumasks overlap: (*src1p & *src2p) != 0
 * @src1p: first mask
 * @src2p: second mask
 *
 * Returns the result of bitmap_intersects() over nr_cpumask_bits bits.
 */
static inline bool cpumask_intersects(const struct cpumask *src1p,
                                      const struct cpumask *src2p)
{
        const bool overlap = bitmap_intersects(cpumask_bits(src1p),
                                               cpumask_bits(src2p),
                                               nr_cpumask_bits);

        return overlap;
}

/**
 * cpumask_subset - test for containment: (*src1p & ~*src2p) == 0
 * @src1p: candidate subset
 * @src2p: candidate superset
 *
 * Returns 1 if *@src1p is a subset of *@src2p, else returns 0
 */
static inline int cpumask_subset(const struct cpumask *src1p,
                                 const struct cpumask *src2p)
{
        return bitmap_subset(cpumask_bits(src1p),
                             cpumask_bits(src2p),
                             nr_cpumask_bits);
}

/**
 * cpumask_empty - test whether a cpumask has no cpus set: *srcp == 0
 * @srcp: the cpumask in which all cpus < nr_cpu_ids are checked to be clear
 *
 * Returns the result of bitmap_empty() over nr_cpumask_bits bits.
 */
static inline bool cpumask_empty(const struct cpumask *srcp)
{
        const bool none_set = bitmap_empty(cpumask_bits(srcp),
                                           nr_cpumask_bits);

        return none_set;
}

/**
 * cpumask_full - test whether a cpumask has all cpus set: *srcp == 0xFFFFFFFF...
 * @srcp: the cpumask in which all cpus < nr_cpu_ids are checked to be set
 *
 * Returns the result of bitmap_full() over nr_cpumask_bits bits.
 */
static inline bool cpumask_full(const struct cpumask *srcp)
{
        const bool all_set = bitmap_full(cpumask_bits(srcp),
                                         nr_cpumask_bits);

        return all_set;
}

/**
 * cpumask_weight - number of cpus set in *srcp
 * @srcp: cpumask whose set bits (< nr_cpu_ids) are counted
 *
 * Returns the result of bitmap_weight() over nr_cpumask_bits bits.
 */
static inline unsigned int cpumask_weight(const struct cpumask *srcp)
{
        return bitmap_weight(cpumask_bits(srcp),
                             nr_cpumask_bits);
}

/**
 * cpumask_shift_right - shift a cpumask down: *dstp = *srcp >> n
 * @dstp: where the result is stored
 * @srcp: mask to shift
 * @n: shift distance in bits
 */
static inline void cpumask_shift_right(struct cpumask *dstp,
                                       const struct cpumask *srcp, int n)
{
        bitmap_shift_right(cpumask_bits(dstp),
                           cpumask_bits(srcp),
                           n,
                           nr_cpumask_bits);
}

/**
 * cpumask_shift_left - shift a cpumask up: *dstp = *srcp << n
 * @dstp: where the result is stored
 * @srcp: mask to shift
 * @n: shift distance in bits
 */
static inline void cpumask_shift_left(struct cpumask *dstp,
                                      const struct cpumask *srcp, int n)
{
        bitmap_shift_left(cpumask_bits(dstp),
                          cpumask_bits(srcp),
                          n,
                          nr_cpumask_bits);
}

/**
 * cpumask_copy - duplicate a cpumask: *dstp = *srcp
 * @dstp: destination mask
 * @srcp: mask to copy from
 *
 * Copies nr_cpumask_bits bits through bitmap_copy().
 */
static inline void cpumask_copy(struct cpumask *dstp,
                                const struct cpumask *srcp)
{
        bitmap_copy(cpumask_bits(dstp),
                    cpumask_bits(srcp),
                    nr_cpumask_bits);
}

/**
 * cpumask_any - pick a "random" cpu from *srcp
 * @srcp: the input cpumask
 *
 * Returns >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any(srcp) cpumask_first(srcp)

/**
 * cpumask_first_and - return the first cpu from *srcp1 & *srcp2
 * @src1p: the first input
 * @src2p: the second input
 *
 * Returns >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
 */
#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p))

/**
 * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 *
 * Returns >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))

/**
 * cpumask_of - the cpumask containing just a given cpu
 * @cpu: the cpu (<= nr_cpu_ids)
 */
#define cpumask_of(cpu) (get_cpu_mask(cpu))

/**
 * cpumask_parse_user - build a cpumask from a string held in a __user buffer
 * @buf: buffer to read the string from
 * @len: length of that buffer
 * @dstp: cpumask that receives the result
 *
 * Forwards to bitmap_parse_user() with nr_cpumask_bits bits.
 *
 * Returns -errno, or 0 for success.
 */
static inline int cpumask_parse_user(const char __user *buf, int len,
                                     struct cpumask *dstp)
{
        return bitmap_parse_user(buf, len,
                                 cpumask_bits(dstp),
                                 nr_cpumask_bits);
}

/**
 * cpumask_parselist_user - build a cpumask from a string held in a __user buffer
 * @buf: buffer to read the string from
 * @len: length of that buffer
 * @dstp: cpumask that receives the result
 *
 * Forwards to bitmap_parselist_user() with nr_cpumask_bits bits.
 *
 * Returns -errno, or 0 for success.
 */
static inline int cpumask_parselist_user(const char __user *buf, int len,
                                         struct cpumask *dstp)
{
        return bitmap_parselist_user(buf, len,
                                     cpumask_bits(dstp),
                                     nr_cpumask_bits);
}

/**
 * cpumask_parse - extract a cpumask from a string
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * Thin wrapper around bitmap_parse(); UINT_MAX is passed as the buffer
 * length.
 *
 * Returns -errno, or 0 for success.
 */
static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parse(buf, UINT_MAX, bits, nr_cpumask_bits);
}

/**
 * cpulist_parse - extract a cpumask from a string of ranges
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * Thin wrapper around bitmap_parselist() covering nr_cpumask_bits bits.
 *
 * Returns -errno, or 0 for success.
 */
static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parselist(buf, bits, nr_cpumask_bits);
}

/**
 * cpumask_size - size to allocate for a 'struct cpumask' in bytes
 *
 * The size is nr_cpumask_bits rounded up to whole longs, expressed in bytes.
 */
static inline unsigned int cpumask_size(void)
{
        size_t bytes = BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long);

        return bytes;
}

/*
 * cpumask_var_t: struct cpumask for stack usage.
 *
 * Oh, the wicked games we play!  In order to make kernel coding a
 * little more difficult, we typedef cpumask_var_t to an array or a
 * pointer: doing &mask on an array is a noop, so it still works.
 *
 * ie.
 *        cpumask_var_t tmpmask;
 *        if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 *                return -ENOMEM;
 *
 *          ... use 'tmpmask' like a normal struct cpumask * ...
 *
 *        free_cpumask_var(tmpmask);
 *
 *
 * However, there is one notable exception. alloc_cpumask_var() allocates
 * only nr_cpumask_bits bits (on the other hand, a real cpumask_t always has
 * NR_CPUS bits). Therefore you must not dereference a cpumask_var_t.
 *
 *        cpumask_var_t tmpmask;
 *        if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 *                return -ENOMEM;
 *
 *        var = *tmpmask;
 *
 * This code does an NR_CPUS-length memcpy and leads to memory corruption.
 * cpumask_copy() provides safe copy functionality.
 *
 * Note that there is another evil here: If you define a cpumask_var_t
 * as a percpu variable then the way to obtain the address of the cpumask
 * structure differently influences what this_cpu_* operation needs to be
 * used. Please use this_cpu_cpumask_var_ptr() in those cases. The direct use
 * of this_cpu_ptr() or this_cpu_read() will lead to failures when the
 * other type of cpumask_var_t implementation is configured.
 *
 * Please also note that __cpumask_var_read_mostly can be used to declare
 * a cpumask_var_t variable itself (not its content) as read mostly.
 */
#ifdef CONFIG_CPUMASK_OFFSTACK
/*
 * CONFIG_CPUMASK_OFFSTACK variant: cpumask_var_t is a plain pointer to a
 * struct cpumask. The alloc/free helpers below are only declared in this
 * header; their definitions live elsewhere.
 */
typedef struct cpumask *cpumask_var_t;

/* The per-cpu variable holds the pointer itself, so it is read by value. */
#define this_cpu_cpumask_var_ptr(x)        this_cpu_read(x)
#define __cpumask_var_read_mostly        __read_mostly

bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
void free_cpumask_var(cpumask_var_t mask);
void free_bootmem_cpumask_var(cpumask_var_t mask);

/* Off-stack masks are pointers: available means the pointer is non-NULL. */
static inline bool cpumask_available(cpumask_var_t mask)
{
        return mask ? true : false;
}

#else
/*
 * !CONFIG_CPUMASK_OFFSTACK variant: cpumask_var_t is a one-element array,
 * so the mask storage is embedded in the variable itself.
 */
typedef struct cpumask cpumask_var_t[1];

/* The per-cpu variable is the mask storage, so take its per-cpu address. */
#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
/* Expands to nothing in this configuration. */
#define __cpumask_var_read_mostly

/*
 * !CONFIG_CPUMASK_OFFSTACK stub: does nothing with @mask or @flags and
 * always reports success.
 */
static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        return true;
}

/*
 * !CONFIG_CPUMASK_OFFSTACK stub: @mask, @flags and @node are ignored and
 * success is always reported.
 */
static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
                                          int node)
{
        return true;
}

/*
 * !CONFIG_CPUMASK_OFFSTACK variant: there is nothing to allocate, so just
 * clear the embedded mask and report success. @flags is unused.
 */
static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        struct cpumask *embedded = *mask;

        cpumask_clear(embedded);
        return true;
}

/*
 * !CONFIG_CPUMASK_OFFSTACK variant: there is nothing to allocate, so just
 * clear the embedded mask and report success. @flags and @node are unused.
 */
static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
                                          int node)
{
        struct cpumask *embedded = *mask;

        cpumask_clear(embedded);
        return true;
}

/* !CONFIG_CPUMASK_OFFSTACK stub: intentionally empty. */
static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
}

/* !CONFIG_CPUMASK_OFFSTACK stub: intentionally empty. */
static inline void free_cpumask_var(cpumask_var_t mask)
{
}

/* !CONFIG_CPUMASK_OFFSTACK stub: intentionally empty. */
static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
{
}

/*
 * !CONFIG_CPUMASK_OFFSTACK variant: the mask is an embedded array, so it is
 * unconditionally reported as available.
 */
static inline bool cpumask_available(cpumask_var_t mask)
{
        return true;
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

/*
 * It's common to want to use cpu_all_mask in struct member initializers,
 * so it has to refer to an address rather than a pointer. It is therefore
 * built by applying to_cpumask() to the cpu_all_bits bitmap declared here.
 */
extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
#define cpu_all_mask to_cpumask(cpu_all_bits)

/*
 * First bits of cpu_bit_bitmap are in fact unset, so row 0 is used as the
 * empty mask.
 */
#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])

/*
 * Convenience iterators: each one is for_each_cpu() applied to the
 * corresponding global mask (possible, online, present).
 */
#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
#define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
#define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)

/*
 * Wrappers for arch boot code to manipulate normally-constant masks.
 * Only declared here; each takes the source mask as @src.
 * NOTE(review): presumably each copies @src into the matching global mask --
 * confirm against the definitions.
 */
void init_cpu_present(const struct cpumask *src);
void init_cpu_possible(const struct cpumask *src);
void init_cpu_online(const struct cpumask *src);

/* Clear all NR_CPUS bits of __cpu_possible_mask. */
static inline void reset_cpu_possible_mask(void)
{
        unsigned long *possible_bits = cpumask_bits(&__cpu_possible_mask);

        bitmap_zero(possible_bits, NR_CPUS);
}

/* Set (@possible true) or clear (@possible false) @cpu in __cpu_possible_mask. */
static inline void
set_cpu_possible(unsigned int cpu, bool possible)
{
        if (!possible) {
                cpumask_clear_cpu(cpu, &__cpu_possible_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_possible_mask);
}

/* Set (@present true) or clear (@present false) @cpu in __cpu_present_mask. */
static inline void
set_cpu_present(unsigned int cpu, bool present)
{
        if (!present) {
                cpumask_clear_cpu(cpu, &__cpu_present_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_present_mask);
}

/* Unlike the sibling setters, this one is only declared here, not inlined. */
void set_cpu_online(unsigned int cpu, bool online);

/* Set (@active true) or clear (@active false) @cpu in __cpu_active_mask. */
static inline void
set_cpu_active(unsigned int cpu, bool active)
{
        if (!active) {
                cpumask_clear_cpu(cpu, &__cpu_active_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_active_mask);
}


/**
 * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
 * @bitmap: the bitmap
 *
 * There are a few places where cpumask_var_t isn't appropriate and
 * static cpumasks must be used (eg. very early boot), yet we don't
 * expose the definition of 'struct cpumask'.
 *
 * This does the conversion, and can be used as a constant initializer.
 *
 * The "1 ? ... : ..." form always selects @bitmap; the other arm exists only
 * so that @bitmap is passed to __check_is_bitmap() inside sizeof(), which
 * type-checks the argument without evaluating a call.
 */
#define to_cpumask(bitmap)                                                \
        ((struct cpumask *)(1 ? (bitmap)                                \
                            : (void *)sizeof(__check_is_bitmap(bitmap))))

/*
 * Type-checking helper for to_cpumask(): its only job is to accept a
 * 'const unsigned long *'. to_cpumask() references it inside sizeof(), and
 * the return value (always 1) carries no meaning.
 */
static inline int __check_is_bitmap(const unsigned long *bitmap)
{
        return 1;
}

/*
 * Special-case data structure for "single bit set only" constant CPU masks.
 *
 * We pre-generate all the 64 (or 32) possible bit positions, with enough
 * padding to the left and the right, and return the constant pointer
 * appropriately offset.
 *
 * The table has BITS_PER_LONG+1 rows of BITS_TO_LONGS(NR_CPUS) longs each;
 * get_cpu_mask() selects row 1 + cpu % BITS_PER_LONG and then steps the
 * pointer back by cpu / BITS_PER_LONG longs.
 */
extern const unsigned long
        cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];

/*
 * Return a constant cpumask with only @cpu set, carved out of
 * cpu_bit_bitmap: pick the row for the bit position within a long, then
 * move the start of the mask back by the number of whole longs below @cpu.
 */
static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
{
        unsigned int row = 1 + cpu % BITS_PER_LONG;
        unsigned int longs_back = cpu / BITS_PER_LONG;

        return to_cpumask(cpu_bit_bitmap[row] - longs_back);
}

/* True when @cpu is not online; annotated unlikely() as a branch hint. */
#define cpu_is_offline(cpu)        unlikely(!cpu_online(cpu))

/*
 * CPU_BITS_ALL: array initializer for an NR_CPUS-bit bitmap with every bit
 * set. The last word is BITMAP_LAST_WORD_MASK(NR_CPUS); when more than one
 * word is needed, all preceding words are ~0UL.
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_BITS_ALL                                                \
{                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}

#else /* NR_CPUS > BITS_PER_LONG */

#define CPU_BITS_ALL                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}
#endif /* NR_CPUS > BITS_PER_LONG */

/**
 * cpumap_print_to_pagebuf  - copies the cpumask into the buffer either
 *        as comma-separated list of cpus or hex values of cpumask
 * @list: indicates whether the cpumap must be list
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 *
 * Forwards to bitmap_print_to_pagebuf() with nr_cpu_ids as the bit count.
 *
 * Returns the length of the (null-terminated) @buf string, zero if
 * nothing is copied.
 */
static inline ssize_t
cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_to_pagebuf(list, buf, bits, nr_cpu_ids);
}

/*
 * CPU_MASK_ALL: cpumask_t compound literal with all NR_CPUS bits set. The
 * word layout is the same as CPU_BITS_ALL.
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#else /* NR_CPUS > BITS_PER_LONG */
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                        \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#endif /* NR_CPUS > BITS_PER_LONG */

/* CPU_MASK_NONE: cpumask_t compound literal with every word set to 0UL. */
#define CPU_MASK_NONE                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL                                \
} }

/*
 * CPU_MASK_CPU0: cpumask_t compound literal with only bit 0 (CPU 0) set;
 * words not named in the initializer are implicitly zero.
 */
#define CPU_MASK_CPU0                                                        \
(cpumask_t) { {                                                                \
        [0] =  1UL                                                        \
} }

#endif /* __LINUX_CPUMASK_H */



















































































    1 
    1 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
 *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
 *   Copyright (C) 2009 Red Hat, Inc.
 *
 * Creation is done via kthreadd, so that we get a clean environment
 * even if we're invoked from userspace (think modprobe, hotplug cpu,
 * etc.).
 */
#include <uapi/linux/sched/types.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>


/*
 * Pending creation requests: __kthread_create_on_node() appends a
 * struct kthread_create_info to kthread_create_list under
 * kthread_create_lock and then wakes kthreadd_task.
 */
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;

/*
 * One kthread creation request. Allocated by __kthread_create_on_node(),
 * queued on kthread_create_list and handed to the new thread's kthread()
 * entry point by create_kthread().
 */
struct kthread_create_info
{
        /* Information passed to kthread() from kthreadd. */
        int (*threadfn)(void *data);
        void *data;
        int node;

        /* Result passed back to kthread_create() from kthreadd. */
        struct task_struct *result;
        /*
         * Claimed with xchg(&done, NULL) by both sides; whoever finds it
         * already NULL knows the other side has let go of the request.
         */
        struct completion *done;

        /* Link in kthread_create_list. */
        struct list_head list;
};

/*
 * Per-kthread state, allocated in kthread() and reachable from the task
 * through to_kthread() (it is stashed in task->set_child_tid).
 */
struct kthread {
        /* Bit flags indexed by enum KTHREAD_BITS. */
        unsigned long flags;
        /* CPU recorded by kthread_create_on_cpu(). */
        unsigned int cpu;
        /* Thread function and its argument, as given at creation time. */
        int (*threadfn)(void *);
        void *data;
        mm_segment_t oldfs;
        /* Completed by __kthread_parkme() once the thread is about to park. */
        struct completion parked;
        /* Installed as current->vfork_done in kthread(). */
        struct completion exited;
#ifdef CONFIG_BLK_CGROUP
        /* Expected to be NULL by the time free_kthread_struct() runs. */
        struct cgroup_subsys_state *blkcg_css;
#endif
};

/* Bit numbers within struct kthread::flags (used with test_bit() etc.). */
enum KTHREAD_BITS {
        KTHREAD_IS_PER_CPU = 0,
        KTHREAD_SHOULD_STOP,
        KTHREAD_SHOULD_PARK,
};

/*
 * Attach @kthread (a struct kthread, possibly NULL) to the current task so
 * that to_kthread() can find it later.
 */
static inline void set_kthread_struct(void *kthread)
{
        /*
         * We abuse ->set_child_tid to avoid the new member and because it
         * can't be wrongly copied by copy_process(). We also rely on fact
         * that the caller can't exec, so PF_KTHREAD can't be cleared.
         */
        current->set_child_tid = (__force void __user *)kthread;
}

static inline struct kthread *to_kthread(struct task_struct *k)
{
        WARN_ON(!(k->flags & PF_KTHREAD));
        return (__force void *)k->set_child_tid;
}

/*
 * Variant of to_kthread() that doesn't assume @p is a kthread.
 *
 * Per construction; when:
 *
 *   (p->flags & PF_KTHREAD) && p->set_child_tid
 *
 * the task is both a kthread and struct kthread is persistent. However
 * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
 * begin_new_exec()).
 */
static inline struct kthread *__to_kthread(struct task_struct *p)
{
        void *kthread = (__force void *)p->set_child_tid;
        if (kthread && !(p->flags & PF_KTHREAD))
                kthread = NULL;
        return kthread;
}

/* Release the struct kthread attached to @k, if there is one. */
void free_kthread_struct(struct task_struct *k)
{
        /*
         * The pointer may legitimately be NULL: the task may have been
         * created by kernel_thread(), or the kzalloc() in kthread() may
         * have failed. kfree(NULL) is fine.
         */
        struct kthread *kthread = to_kthread(k);

#ifdef CONFIG_BLK_CGROUP
        WARN_ON_ONCE(kthread && kthread->blkcg_css);
#endif
        kfree(kthread);
}

/**
 * kthread_should_stop - should this kthread return now?
 *
 * When someone calls kthread_stop() on your kthread, it will be woken
 * and this will return true.  You should then return, and your return
 * value will be passed through to kthread_stop().
 */
bool kthread_should_stop(void)
{
        return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);

bool __kthread_should_park(struct task_struct *k)
{
        return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}
EXPORT_SYMBOL_GPL(__kthread_should_park);

/**
 * kthread_should_park - should this kthread park now?
 *
 * Once someone has called kthread_park() on your kthread, it is woken up
 * and this returns true.  The thread should then do whatever cleanup is
 * necessary and call kthread_parkme().
 *
 * This is the parking counterpart of kthread_should_stop(): the thread is
 * kept alive in a parked position, and kthread_unpark() "restarts" it and
 * calls the thread function again.
 */
bool kthread_should_park(void)
{
        struct task_struct *self = current;

        return __kthread_should_park(self);
}
EXPORT_SYMBOL_GPL(kthread_should_park);

/**
 * kthread_freezable_should_stop - should this freezable kthread return now?
 * @was_frozen: optional out parameter, indicates whether %current was frozen
 *
 * kthread_should_stop() for freezable kthreads, which will enter
 * refrigerator if necessary.  This function is safe from kthread_stop() /
 * freezer deadlock and freezable kthreads should use this function instead
 * of calling try_to_freeze() directly.
 *
 * May sleep.  Returns the result of kthread_should_stop(), evaluated after
 * any freezing has taken place.
 */
bool kthread_freezable_should_stop(bool *was_frozen)
{
        bool frozen = false;

        might_sleep();

        /* Enter the refrigerator first if a freeze is in progress. */
        if (unlikely(freezing(current)))
                frozen = __refrigerator(true);

        /* @was_frozen is optional; only write it when the caller asked. */
        if (was_frozen)
                *was_frozen = frozen;

        return kthread_should_stop();
}
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);

/**
 * kthread_func - return the function specified on kthread creation
 * @task: kthread task in question
 *
 * Uses __to_kthread(), so @task does not have to be a kthread.
 *
 * Returns NULL if the task is not a kthread.
 */
void *kthread_func(struct task_struct *task)
{
        struct kthread *kthread = __to_kthread(task);

        if (!kthread)
                return NULL;

        return kthread->threadfn;
}
EXPORT_SYMBOL_GPL(kthread_func);

/**
 * kthread_data - return data value specified on kthread creation
 * @task: kthread task in question
 *
 * Return the data value specified when kthread @task was created.
 * The caller is responsible for ensuring the validity of @task when
 * calling this function.
 */
void *kthread_data(struct task_struct *task)
{
        return to_kthread(task)->data;
}
EXPORT_SYMBOL_GPL(kthread_data);

/**
 * kthread_probe_data - speculative version of kthread_data()
 * @task: possible kthread task in question
 *
 * @task may or may not be a kthread.  If it is, try to read the data value
 * it was created with using copy_from_kernel_nofault().  If @task is not a
 * kthread, or the data cannot be read for any reason, %NULL is returned.
 * @task itself must be safe to dereference.
 */
void *kthread_probe_data(struct task_struct *task)
{
        void *data = NULL;
        struct kthread *kthread = __to_kthread(task);

        if (!kthread)
                return NULL;

        /* On a faulting read @data keeps its NULL initializer. */
        copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
        return data;
}

/*
 * Park the calling kthread for as long as KTHREAD_SHOULD_PARK is set in
 * @self->flags. Each time round the loop it signals @self->parked and
 * schedules away in TASK_PARKED; once the flag is clear it returns in
 * TASK_RUNNING.
 */
static void __kthread_parkme(struct kthread *self)
{
        for (;;) {
                /*
                 * TASK_PARKED is a special state; we must serialize against
                 * possible pending wakeups to avoid store-store collisions on
                 * task->state.
                 *
                 * Such a collision might possibly result in the task state
                 * changing from TASK_PARKED and us failing the
                 * wait_task_inactive() in kthread_park().
                 */
                set_special_state(TASK_PARKED);
                /* The state is set before the flag is re-checked. */
                if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                        break;

                /*
                 * Thread is going to call schedule(), do not preempt it,
                 * or the caller of kthread_park() may spend more time in
                 * wait_task_inactive().
                 */
                preempt_disable();
                complete(&self->parked);
                schedule_preempt_disabled();
                preempt_enable();
        }
        __set_current_state(TASK_RUNNING);
}

/* Park the calling kthread; see __kthread_parkme(). */
void kthread_parkme(void)
{
        struct kthread *self = to_kthread(current);

        __kthread_parkme(self);
}
EXPORT_SYMBOL_GPL(kthread_parkme);

/*
 * Entry point of every thread started by create_kthread().
 *
 * @_create is the struct kthread_create_info of the request. The function
 * allocates and attaches the struct kthread, reports the new task (or an
 * error) back through create->result / create->done, sleeps until woken,
 * and then runs threadfn(data) unless KTHREAD_SHOULD_STOP was set in the
 * meantime. It never returns normally; every path ends in do_exit().
 */
static int kthread(void *_create)
{
        /*
         * Copy threadfn/data out of @create now: once @done has been
         * completed, the creator goes on to kfree() the request.
         */
        struct kthread_create_info *create = _create;
        int (*threadfn)(void *data) = create->threadfn;
        void *data = create->data;
        struct completion *done;
        struct kthread *self;
        int ret;

        /* @self may be NULL here; that is checked after claiming @done. */
        self = kzalloc(sizeof(*self), GFP_KERNEL);
        set_kthread_struct(self);

        /* If user was SIGKILLed, I release the structure. */
        done = xchg(&create->done, NULL);
        if (!done) {
                kfree(create);
                do_exit(-EINTR);
        }

        /* Allocation failed: hand the error to the waiting creator. */
        if (!self) {
                create->result = ERR_PTR(-ENOMEM);
                complete(done);
                do_exit(-ENOMEM);
        }

        self->threadfn = threadfn;
        self->data = data;
        init_completion(&self->exited);
        init_completion(&self->parked);
        current->vfork_done = &self->exited;

        /* OK, tell user we're spawned, wait for stop or wakeup */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        create->result = current;
        /*
         * Thread is going to call schedule(), do not preempt it,
         * or the creator may spend more time in wait_task_inactive().
         */
        preempt_disable();
        complete(done);
        schedule_preempt_disabled();
        preempt_enable();

        /* If a stop was requested before we ever ran, skip threadfn(). */
        ret = -EINTR;
        if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
                cgroup_kthread_ready();
                __kthread_parkme(self);
                ret = threadfn(data);
        }
        do_exit(ret);
}

/*
 * Called from do_fork() to get the node for the task that is about to be
 * created. Only kthreadd (under CONFIG_NUMA) carries a preference, in
 * ->pref_node_fork; everything else gets NUMA_NO_NODE.
 */
int tsk_fork_get_node(struct task_struct *tsk)
{
        int node = NUMA_NO_NODE;

#ifdef CONFIG_NUMA
        if (tsk == kthreadd_task)
                node = tsk->pref_node_fork;
#endif
        return node;
}

/*
 * Spawn the thread for request @create via kernel_thread(). On success the
 * new thread (kthread()) takes over the request. On failure the error is
 * reported to the waiting creator, or the request is freed if the creator
 * has already given up.
 */
static void create_kthread(struct kthread_create_info *create)
{
        struct completion *done;
        int pid;

#ifdef CONFIG_NUMA
        current->pref_node_fork = create->node;
#endif
        /* We want our own signal handler (we take no signals by default). */
        pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
        if (pid >= 0)
                return;

        /* If user was SIGKILLed, I release the structure. */
        done = xchg(&create->done, NULL);
        if (!done) {
                kfree(create);
                return;
        }

        create->result = ERR_PTR(pid);
        complete(done);
}

/*
 * Worker behind kthread_create_on_node(): queue a creation request, wake
 * kthreadd, wait for the outcome and, on success, name the new task and
 * reset its scheduling policy and CPU mask.
 *
 * Returns the new task, ERR_PTR(-ENOMEM) if the request cannot be
 * allocated, ERR_PTR(-EINTR) if the caller was killed before the request
 * was picked up, or whatever error pointer was stored in create->result.
 */
static __printf(4, 0)
struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
                                                    void *data, int node,
                                                    const char namefmt[],
                                                    va_list args)
{
        DECLARE_COMPLETION_ONSTACK(done);
        struct task_struct *task;
        struct kthread_create_info *create = kmalloc(sizeof(*create),
                                                     GFP_KERNEL);

        if (!create)
                return ERR_PTR(-ENOMEM);
        create->threadfn = threadfn;
        create->data = data;
        create->node = node;
        create->done = &done;

        /* Queue the request and kick kthreadd to process it. */
        spin_lock(&kthread_create_lock);
        list_add_tail(&create->list, &kthread_create_list);
        spin_unlock(&kthread_create_lock);

        wake_up_process(kthreadd_task);
        /*
         * Wait for completion in killable state, for I might be chosen by
         * the OOM killer while kthreadd is trying to allocate memory for
         * new kernel thread.
         */
        if (unlikely(wait_for_completion_killable(&done))) {
                /*
                 * If I was SIGKILLed before kthreadd (or new kernel thread)
                 * calls complete(), leave the cleanup of this structure to
                 * that thread.
                 */
                if (xchg(&create->done, NULL))
                        return ERR_PTR(-EINTR);
                /*
                 * kthreadd (or new kernel thread) will call complete()
                 * shortly.
                 */
                wait_for_completion(&done);
        }
        task = create->result;
        if (!IS_ERR(task)) {
                static const struct sched_param param = { .sched_priority = 0 };
                char name[TASK_COMM_LEN];

                /*
                 * task is already visible to other tasks, so updating
                 * COMM must be protected.
                 */
                vsnprintf(name, sizeof(name), namefmt, args);
                set_task_comm(task, name);
                /*
                 * root may have changed our (kthreadd's) priority or CPU mask.
                 * The kernel thread should not inherit these properties.
                 */
                sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
                set_cpus_allowed_ptr(task,
                                     housekeeping_cpumask(HK_FLAG_KTHREAD));
        }
        /* From here on the request is ours to free. */
        kfree(create);
        return task;
}

/**
 * kthread_create_on_node - create a kthread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @node: task and thread structures for the thread are allocated on this node
 * @namefmt: printf-style name for the thread.
 *
 * Description: This helper function creates and names a kernel
 * thread.  The thread will be stopped: use wake_up_process() to start
 * it.  See also kthread_run().  The new thread has SCHED_NORMAL policy and
 * its allowed CPUs are set to housekeeping_cpumask(HK_FLAG_KTHREAD).
 *
 * If thread is going to be bound on a particular cpu, give its node
 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
 * When woken, the thread will run @threadfn() with @data as its
 * argument. @threadfn() can either call do_exit() directly if it is a
 * standalone thread for which no one will call kthread_stop(), or
 * return when 'kthread_should_stop()' is true (which means
 * kthread_stop() has been called).  The return value should be zero
 * or a negative error number; it will be passed to kthread_stop().
 *
 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
 */
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                                           void *data, int node,
                                           const char namefmt[],
                                           ...)
{
        struct task_struct *task;
        va_list args;

        /* Bundle the variadic name arguments and defer to the va_list worker. */
        va_start(args, namefmt);
        task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
        va_end(args);

        return task;
}
EXPORT_SYMBOL(kthread_create_on_node);

/*
 * Restrict inactive task @p to @mask and set PF_NO_SETAFFINITY on it.
 *
 * @state is handed to wait_task_inactive(); if that fails, a warning is
 * emitted and @p is left untouched.
 */
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
{
        unsigned long flags;

        if (!wait_task_inactive(p, state)) {
                WARN_ON(1);
                return;
        }

        /* It's safe because the task is inactive. */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        do_set_cpus_allowed(p, mask);
        p->flags |= PF_NO_SETAFFINITY;
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

/*
 * Bind @p to the single CPU @cpu. This is __kthread_bind_mask() with the
 * one-CPU mask returned by cpumask_of(@cpu); @state is passed through
 * unchanged.
 */
static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
{
        __kthread_bind_mask(p, cpumask_of(cpu), state);
}

/*
 * Restrict @p to the CPUs in @mask. Delegates to __kthread_bind_mask()
 * with TASK_UNINTERRUPTIBLE as the state used for its inactivity check.
 */
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
        __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
}

/**
 * kthread_bind - bind a just-created kthread to a cpu.
 * @p: thread created by kthread_create().
 * @cpu: cpu (might not be online, must be possible) for @p to run on.
 *
 * Description: This function is equivalent to set_cpus_allowed(),
 * except that @cpu doesn't need to be online, and the thread must be
 * stopped (i.e., just returned from kthread_create()).
 */
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
        /* Single-CPU bind; TASK_UNINTERRUPTIBLE is the state passed down. */
        __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);

/**
 * kthread_create_on_cpu - Create a cpu bound kthread
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @cpu: The cpu on which the thread should be bound,
 * @namefmt: printf-style name for the thread. Format is restricted
 *             to "name.*%u". Code fills in cpu number.
 *
 * Description: This helper function creates and names a kernel thread
 */
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                                          void *data, unsigned int cpu,
                                          const char *namefmt)
{
        struct task_struct *task;
        int node = cpu_to_node(cpu);

        /* @cpu doubles as the single argument consumed by @namefmt. */
        task = kthread_create_on_node(threadfn, data, node, namefmt, cpu);
        if (!IS_ERR(task)) {
                kthread_bind(task, cpu);
                /* CPU hotplug need to bind once again when unparking the thread. */
                to_kthread(task)->cpu = cpu;
        }

        /* Either the new, bound task or the ERR_PTR from creation. */
        return task;
}

/*
 * Mark or unmark kthread @k as per-CPU.
 *
 * A negative @cpu clears KTHREAD_IS_PER_CPU; otherwise @cpu is recorded in
 * the kthread and the flag is set. Nothing happens when @k has no
 * struct kthread attached.
 */
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
        struct kthread *kt = to_kthread(k);

        if (!kt)
                return;

        /* Warn (once) if the task is not flagged PF_NO_SETAFFINITY. */
        WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));

        if (cpu >= 0) {
                kt->cpu = cpu;
                set_bit(KTHREAD_IS_PER_CPU, &kt->flags);
        } else {
                clear_bit(KTHREAD_IS_PER_CPU, &kt->flags);
        }
}

/*
 * Report whether @p carries the KTHREAD_IS_PER_CPU flag. Returns false
 * when __to_kthread() yields no struct kthread for @p.
 */
bool kthread_is_per_cpu(struct task_struct *p)
{
        struct kthread *kt = __to_kthread(p);

        return kt && test_bit(KTHREAD_IS_PER_CPU, &kt->flags);
}

/**
 * kthread_unpark - unpark a thread created by kthread_create().
 * @k:                thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return false and wakes it.
 * If the thread is marked percpu then it is bound to the cpu again
 * before being woken.
 */
void kthread_unpark(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /*
         * Newly created kthread was parked when the CPU was offline.
         * The binding was lost and we need to set it again.
         */
        if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
                __kthread_bind(k, kthread->cpu, TASK_PARKED);

        /* Withdraw the park request before issuing the wakeup below. */
        clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        /*
         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
         */
        wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);

/**
 * kthread_park - park a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return true, wakes it, and
 * waits for it to return. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will park without
 * calling threadfn().
 *
 * Returns 0 if the thread is parked, -ENOSYS if the thread exited,
 * -EBUSY if a park request is already pending for the thread.
 * If called by the kthread itself just the park bit is set.
 */
int kthread_park(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /* Parking a task that is already exiting is a caller bug: warn. */
        if (WARN_ON(k->flags & PF_EXITING))
                return -ENOSYS;

        /* A park request is already pending; do not stack another one. */
        if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
                return -EBUSY;

        set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        /* When called by the kthread itself only the bit is set; no waiting. */
        if (k != current) {
                wake_up_process(k);
                /*
                 * Wait for __kthread_parkme() to complete(), this means we
                 * _will_ have TASK_PARKED and are about to call schedule().
                 */
                wait_for_completion(&kthread->parked);
                /*
                 * Now wait for that schedule() to complete and the task to
                 * get scheduled out.
                 */
                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kthread_park);

/**
 * kthread_stop - stop a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_stop() for @k to return true, wakes it, and
 * waits for it to exit. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will exit without
 * calling threadfn().
 *
 * If threadfn() may call do_exit() itself, the caller must ensure
 * task_struct can't go away.
 *
 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
 * was never called.
 */
int kthread_stop(struct task_struct *k)
{
        struct kthread *kthread;
        int ret;

        trace_sched_kthread_stop(k);

        /* Hold a reference on @k until its exit code has been read below. */
        get_task_struct(k);
        kthread = to_kthread(k);
        set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
        /* Clear any park request and wake a parked thread, then wake it normally. */
        kthread_unpark(k);
        wake_up_process(k);
        /* Block until the thread signals the 'exited' completion. */
        wait_for_completion(&kthread->exited);
        ret = k->exit_code;
        put_task_struct(k);

        trace_sched_kthread_stop_ret(ret);
        return ret;
}
EXPORT_SYMBOL(kthread_stop);

/*
 * Main loop of the "kthreadd" task: sleeps until kthread_create_list is
 * non-empty, then pops each queued kthread_create_info and passes it to
 * create_kthread(). The loop never terminates; @unused is ignored.
 */
int kthreadd(void *unused)
{
        struct task_struct *tsk = current;

        /* Setup a clean context for our children to inherit. */
        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
        set_mems_allowed(node_states[N_MEMORY]);

        current->flags |= PF_NOFREEZE;
        cgroup_init_kthreadd();

        for (;;) {
                /*
                 * Set the sleeping state before testing the list and only
                 * then call schedule().
                 */
                set_current_state(TASK_INTERRUPTIBLE);
                if (list_empty(&kthread_create_list))
                        schedule();
                __set_current_state(TASK_RUNNING);

                spin_lock(&kthread_create_lock);
                while (!list_empty(&kthread_create_list)) {
                        struct kthread_create_info *create;

                        create = list_entry(kthread_create_list.next,
                                            struct kthread_create_info, list);
                        list_del_init(&create->list);
                        /* The list lock is dropped around the actual creation. */
                        spin_unlock(&kthread_create_lock);

                        create_kthread(create);

                        spin_lock(&kthread_create_lock);
                }
                spin_unlock(&kthread_create_lock);
        }

        /* Not reached: the loop above has no exit. */
        return 0;
}

/*
 * Initialize @worker from scratch: zero the whole structure, set up both
 * work lists and initialize the lock, assigning it the lockdep class @key
 * under the name @name.
 */
void __kthread_init_worker(struct kthread_worker *worker,
                                const char *name,
                                struct lock_class_key *key)
{
        memset(worker, 0, sizeof(*worker));

        /* Empty queues for immediate and delayed works. */
        INIT_LIST_HEAD(&worker->work_list);
        INIT_LIST_HEAD(&worker->delayed_work_list);

        raw_spin_lock_init(&worker->lock);
        lockdep_set_class_and_name(&worker->lock, key, name);
}
EXPORT_SYMBOL_GPL(__kthread_init_worker);

/**
 * kthread_worker_fn - kthread function to process kthread_worker
 * @worker_ptr: pointer to initialized kthread_worker
 *
 * This function implements the main cycle of kthread worker. It processes
 * work_list until it is stopped with kthread_stop(). It sleeps when the queue
 * is empty.
 *
 * The works are not allowed to keep any locks, disable preemption or interrupts
 * when they finish. There is defined a safe point for freezing when one work
 * finishes and before a new one is started.
 *
 * Also the works must not be handled by more than one worker at the same time,
 * see also kthread_queue_work().
 */
int kthread_worker_fn(void *worker_ptr)
{
        struct kthread_worker *worker = worker_ptr;
        struct kthread_work *work;

        /*
         * FIXME: Update the check and remove the assignment when all kthread
         * worker users are created using kthread_create_worker*() functions.
         */
        WARN_ON(worker->task && worker->task != current);
        worker->task = current;

        if (worker->flags & KTW_FREEZABLE)
                set_freezable();

repeat:
        set_current_state(TASK_INTERRUPTIBLE);        /* mb paired w/ kthread_stop */

        /* Stop requested: detach the task from the worker under the lock and exit. */
        if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
                raw_spin_lock_irq(&worker->lock);
                worker->task = NULL;
                raw_spin_unlock_irq(&worker->lock);
                return 0;
        }

        /* Dequeue the first pending work, if any, and publish it as current_work. */
        work = NULL;
        raw_spin_lock_irq(&worker->lock);
        if (!list_empty(&worker->work_list)) {
                work = list_first_entry(&worker->work_list,
                                        struct kthread_work, node);
                list_del_init(&work->node);
        }
        worker->current_work = work;
        raw_spin_unlock_irq(&worker->lock);

        /* Run the work outside the lock; with nothing to do, sleep unless freezing. */
        if (work) {
                __set_current_state(TASK_RUNNING);
                work->func(work);
        } else if (!freezing(current))
                schedule();

        /* Freezing point between two works, then loop for the next one. */
        try_to_freeze();
        cond_resched();
        goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);

/*
 * Allocate a kthread_worker, create the kthread running kthread_worker_fn()
 * for it and wake it up.
 *
 * A non-negative @cpu selects the NUMA node for the thread and binds the
 * thread to that CPU; a negative @cpu leaves it unbound on NUMA_NO_NODE.
 * Returns the worker, ERR_PTR(-ENOMEM) when the allocation fails, or the
 * error returned by the thread creation.
 */
static __printf(3, 0) struct kthread_worker *
__kthread_create_worker(int cpu, unsigned int flags,
                        const char namefmt[], va_list args)
{
        struct kthread_worker *worker;
        struct task_struct *task;
        int node;

        worker = kzalloc(sizeof(*worker), GFP_KERNEL);
        if (!worker)
                return ERR_PTR(-ENOMEM);

        kthread_init_worker(worker);

        node = (cpu >= 0) ? cpu_to_node(cpu) : NUMA_NO_NODE;

        task = __kthread_create_on_node(kthread_worker_fn, worker,
                                        node, namefmt, args);
        if (IS_ERR(task)) {
                /* No thread was created: release the worker, pass the error on. */
                kfree(worker);
                return ERR_CAST(task);
        }

        if (cpu >= 0)
                kthread_bind(task, cpu);

        /* Publish flags and task before the thread is allowed to run. */
        worker->flags = flags;
        worker->task = task;
        wake_up_process(task);

        return worker;
}

/**
 * kthread_create_worker - create a kthread worker
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the kthread worker (task).
 *
 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the worker was SIGKILLed.
 */
struct kthread_worker *
kthread_create_worker(unsigned int flags, const char namefmt[], ...)
{
        struct kthread_worker *created;
        va_list ap;

        /* -1 as the CPU argument requests a worker not bound to any CPU. */
        va_start(ap, namefmt);
        created = __kthread_create_worker(-1, flags, namefmt, ap);
        va_end(ap);

        return created;
}
EXPORT_SYMBOL(kthread_create_worker);

/**
 * kthread_create_worker_on_cpu - create a kthread worker and bind it
 *        to a given CPU and the associated NUMA node.
 * @cpu: CPU number
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the kthread worker (task).
 *
 * Use a valid CPU number if you want to bind the kthread worker
 * to the given CPU and the associated NUMA node.
 *
 * A good practice is to add the cpu number also into the worker name.
 * For example, use kthread_create_worker_on_cpu(cpu, 0, "helper/%d", cpu).
 *
 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the worker was SIGKILLed.
 */
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
                             const char namefmt[], ...)
{
        struct kthread_worker *created;
        va_list ap;

        /* Forward @cpu and the variadic name arguments to the common helper. */
        va_start(ap, namefmt);
        created = __kthread_create_worker(cpu, flags, namefmt, ap);
        va_end(ap);

        return created;
}
EXPORT_SYMBOL(kthread_create_worker_on_cpu);

/*
 * Returns true when the work could not be queued at the moment.
 * It happens when it is already pending in a worker list
 * or when it is being cancelled.
 */
static inline bool queuing_blocked(struct kthread_worker *worker,
                                   struct kthread_work *work)
{
        lockdep_assert_held(&worker->lock);

        /* A cancel operation in flight blocks any queuing. */
        if (work->canceling)
                return true;

        /* Otherwise it is blocked only while already sitting on a list. */
        return !list_empty(&work->node);
}

/*
 * Debug checks performed before @work is put on one of @worker's lists.
 * Must be called with worker->lock held. Only warns; changes nothing.
 */
static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
                                             struct kthread_work *work)
{
        struct kthread_worker *owner;

        lockdep_assert_held(&worker->lock);

        /* The work must not be linked into any list yet. */
        WARN_ON_ONCE(!list_empty(&work->node));

        /* Do not use a work with >1 worker, see kthread_queue_work() */
        owner = work->worker;
        WARN_ON_ONCE(owner && owner != worker);
}

/* insert @work before @pos in @worker */
static void kthread_insert_work(struct kthread_worker *worker,
                                struct kthread_work *work,
                                struct list_head *pos)
{
        kthread_insert_work_sanity_check(worker, work);

        list_add_tail(&work->node, pos);
        work->worker = worker;

        /* No wakeup is issued while the worker has a current work. */
        if (worker->current_work)
                return;

        if (likely(worker->task))
                wake_up_process(worker->task);
}

/**
 * kthread_queue_work - queue a kthread_work
 * @worker: target kthread_worker
 * @work: kthread_work to queue
 *
 * Queue @work to work processor @worker for async execution.  @worker
 * must have been created with kthread_create_worker().  Returns %true
 * if @work was successfully queued, %false if it was already pending.
 *
 * Reinitialize the work if it needs to be used by another worker.
 * For example, when the worker was stopped and started again.
 */
bool kthread_queue_work(struct kthread_worker *worker,
                        struct kthread_work *work)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);

        /* Append to the tail of the work list unless queuing is blocked. */
        queued = !queuing_blocked(worker, work);
        if (queued)
                kthread_insert_work(worker, work, &worker->work_list);

        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);

/**
 * kthread_delayed_work_timer_fn - callback that queues the associated kthread
 *        delayed work when the timer expires.
 * @t: pointer to the expired timer
 *
 * The format of the function is defined by struct timer_list.
 * It should have been called from irqsafe timer with irq already off.
 */
void kthread_delayed_work_timer_fn(struct timer_list *t)
{
        /* Recover the delayed work that embeds the expired timer. */
        struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
        struct kthread_work *work = &dwork->work;
        struct kthread_worker *worker = work->worker;
        unsigned long flags;

        /*
         * This might happen when a pending work is reinitialized.
         * It means that it is used a wrong way.
         */
        if (WARN_ON_ONCE(!worker))
                return;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /* Move the work from worker->delayed_work_list. */
        WARN_ON_ONCE(list_empty(&work->node));
        list_del_init(&work->node);
        /* While a cancel is in flight the work is only unlinked, not queued. */
        if (!work->canceling)
                kthread_insert_work(worker, work, &worker->work_list);

        raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);

/*
 * Queue @dwork on @worker after @delay jiffies, or immediately when @delay
 * is zero. Both paths run the insert sanity check, which asserts that
 * worker->lock is held, so the caller must hold that lock.
 */
static void __kthread_queue_delayed_work(struct kthread_worker *worker,
                                         struct kthread_delayed_work *dwork,
                                         unsigned long delay)
{
        struct timer_list *timer = &dwork->timer;
        struct kthread_work *work = &dwork->work;

        /* The timer must have been set up with the kthread delayed-work callback. */
        WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);

        /*
         * If @delay is 0, queue @dwork->work immediately.  This is for
         * both optimization and correctness.  The earliest @timer can
         * expire is on the closest next tick and delayed_work users depend
         * on that there's no such delay when @delay is 0.
         */
        if (!delay) {
                kthread_insert_work(worker, work, &worker->work_list);
                return;
        }

        /* Be paranoid and try to detect possible races already now. */
        kthread_insert_work_sanity_check(worker, work);

        /* Park the work on the delayed list and arm the timer for jiffies + @delay. */
        list_add(&work->node, &worker->delayed_work_list);
        work->worker = worker;
        timer->expires = jiffies + delay;
        add_timer(timer);
}

/**
 * kthread_queue_delayed_work - queue the associated kthread work
 *        after a delay.
 * @worker: target kthread_worker
 * @dwork: kthread_delayed_work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * If the work has not been pending it starts a timer that will queue
 * the work after the given @delay. If @delay is zero, it queues the
 * work immediately.
 *
 * Return: %false if the @work has already been pending. It means that
 * either the timer was running or the work was queued. It returns %true
 * otherwise.
 */
bool kthread_queue_delayed_work(struct kthread_worker *worker,
                                struct kthread_delayed_work *dwork,
                                unsigned long delay)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);

        /* Only an idle work (not pending, not being canceled) gets queued. */
        queued = !queuing_blocked(worker, &dwork->work);
        if (queued)
                __kthread_queue_delayed_work(worker, dwork, delay);

        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);

/*
 * Marker work used by the flush helpers: it is queued on a worker and its
 * handler, kthread_flush_work_fn(), completes @done when it runs.
 */
struct kthread_flush_work {
        struct kthread_work        work;        /* the work item that gets queued */
        struct completion        done;        /* completed by kthread_flush_work_fn() */
};

/* Work handler of a flush marker: signal the completion embedded next to @work. */
static void kthread_flush_work_fn(struct kthread_work *work)
{
        struct kthread_flush_work *flush;

        flush = container_of(work, struct kthread_flush_work, work);
        complete(&flush->done);
}

/**
 * kthread_flush_work - flush a kthread_work
 * @work: work to flush
 *
 * If @work is queued or executing, wait for it to finish execution.
 */
void kthread_flush_work(struct kthread_work *work)
{
        /* On-stack marker work; its handler completes fwork.done. */
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };
        struct kthread_worker *worker;
        bool noop = false;

        /* A work with no worker recorded has nothing to flush. */
        worker = work->worker;
        if (!worker)
                return;

        raw_spin_lock_irq(&worker->lock);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /*
         * Pending: place the marker right behind @work in its list.
         * Executing: place the marker at the head of the work list.
         * Neither: nothing to wait for.
         */
        if (!list_empty(&work->node))
                kthread_insert_work(worker, &fwork.work, work->node.next);
        else if (worker->current_work == work)
                kthread_insert_work(worker, &fwork.work,
                                    worker->work_list.next);
        else
                noop = true;

        raw_spin_unlock_irq(&worker->lock);

        /* Wait for the marker to run, but only if one was queued. */
        if (!noop)
                wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_work);

/*
 * Make sure that the timer is neither set nor running and could
 * not manipulate the work list_head any longer.
 *
 * The function is called under worker->lock. The lock is temporarily
 * released but the timer can't be set again in the meantime.
 */
static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
                                              unsigned long *flags)
{
        /* @work is expected to be embedded in a struct kthread_delayed_work. */
        struct kthread_delayed_work *dwork =
                container_of(work, struct kthread_delayed_work, work);
        struct kthread_worker *worker = work->worker;

        /*
         * del_timer_sync() must be called to make sure that the timer
         * callback is not running. The lock must be temporarily released
         * to avoid a deadlock with the callback. In the meantime,
         * any queuing is blocked by setting the canceling counter.
         */
        work->canceling++;
        /* @flags is the caller's saved IRQ state; it is reused and updated here. */
        raw_spin_unlock_irqrestore(&worker->lock, *flags);
        del_timer_sync(&dwork->timer);
        raw_spin_lock_irqsave(&worker->lock, *flags);
        work->canceling--;
}

/*
 * This function removes the work from the worker queue.
 *
 * It is called under worker->lock. The caller must make sure that
 * the timer used by delayed work is not running, e.g. by calling
 * kthread_cancel_delayed_work_timer().
 *
 * The work might still be in use when this function finishes. See the
 * current_work processed by the worker.
 *
 * Return: %true if @work was pending and successfully canceled,
 *        %false if @work was not pending
 */
static bool __kthread_cancel_work(struct kthread_work *work)
{
        /* Not linked into any list: the work was not pending. */
        if (list_empty(&work->node))
                return false;

        /*
         * Unlink the work from whichever worker list it sits on, either
         * worker->work_list or worker->delayed_work_list.
         */
        list_del_init(&work->node);
        return true;
}

/**
 * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
 * @worker: kthread worker to use
 * @dwork: kthread delayed work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
 * modify @dwork's timer so that it expires after @delay. If @delay is zero,
 * @dwork is guaranteed to be queued immediately.
 *
 * Return: %false if @dwork was idle and queued, %true otherwise.
 *
 * A special case is when the work is being canceled in parallel.
 * It might be caused either by the real kthread_cancel_delayed_work_sync()
 * or yet another kthread_mod_delayed_work() call. We let the other command
 * win and return %true here. The return value can be used for reference
 * counting and the number of queued works stays the same. Anyway, the caller
 * is supposed to synchronize these operations a reasonable way.
 *
 * This function is safe to call from any context including IRQ handler.
 * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
 * for details.
 */
bool kthread_mod_delayed_work(struct kthread_worker *worker,
                              struct kthread_delayed_work *dwork,
                              unsigned long delay)
{
        struct kthread_work *work = &dwork->work;
        unsigned long flags;
        /* bool, matching the return type and the sibling queue helpers. */
        bool ret;

        raw_spin_lock_irqsave(&worker->lock, flags);

        /* Do not bother with canceling when never queued. */
        if (!work->worker) {
                ret = false;
                goto fast_queue;
        }

        /* Work must not be used with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(work->worker != worker);

        /*
         * Temporarily cancel the work but do not fight with another command
         * that is canceling the work as well.
         *
         * It is a bit tricky because of possible races with another
         * mod_delayed_work() and cancel_delayed_work() callers.
         *
         * The timer must be canceled first because worker->lock is released
         * when doing so. But the work can be removed from the queue (list)
         * only when it can be queued again so that the return value can
         * be used for reference counting.
         */
        kthread_cancel_delayed_work_timer(work, &flags);
        if (work->canceling) {
                /* The number of works in the queue does not change. */
                ret = true;
                goto out;
        }
        /* %true when the work was pending and has just been unlinked. */
        ret = __kthread_cancel_work(work);

fast_queue:
        /* (Re)queue with the new delay; the lock is still held here. */
        __kthread_queue_delayed_work(worker, dwork, delay);
out:
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);

/*
 * Common implementation of kthread_cancel_work_sync() and
 * kthread_cancel_delayed_work_sync().
 *
 * Removes @work from the worker's list and, when the worker is currently
 * executing it, waits for that execution to finish. With @is_dwork set the
 * delayed-work timer is shut down first.
 *
 * Return: %true if @work was pending, %false otherwise (including when the
 * work has no worker recorded).
 */
static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
{
        struct kthread_worker *worker = work->worker;
        unsigned long flags;
        /* bool, matching the return type of this function. */
        bool ret = false;

        /* No worker recorded for this work: nothing to cancel. */
        if (!worker)
                goto out;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /* Drops and re-takes worker->lock internally. */
        if (is_dwork)
                kthread_cancel_delayed_work_timer(work, &flags);

        ret = __kthread_cancel_work(work);

        /* Not being executed right now: no need to wait. */
        if (worker->current_work != work)
                goto out_fast;

        /*
         * The work is in progress and we need to wait with the lock released.
         * In the meantime, block any queuing by setting the canceling counter.
         */
        work->canceling++;
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        kthread_flush_work(work);
        raw_spin_lock_irqsave(&worker->lock, flags);
        work->canceling--;

out_fast:
        raw_spin_unlock_irqrestore(&worker->lock, flags);
out:
        return ret;
}

/**
 * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
 * @work: the kthread work to cancel
 *
 * Cancel @work and wait for its execution to finish.  This function
 * can be used even if the work re-queues itself. On return from this
 * function, @work is guaranteed to be not pending or executing on any CPU.
 *
 * kthread_cancel_work_sync(&delayed_work->work) must not be used for
 * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
 *
 * The caller must ensure that the worker on which @work was last
 * queued can't be destroyed before this function returns.
 *
 * Return: %true if @work was pending, %false otherwise.
 */
bool kthread_cancel_work_sync(struct kthread_work *work)
{
        /* is_dwork == false: plain work, there is no timer to shut down. */
        return __kthread_cancel_work_sync(work, false);
}
EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);

/**
 * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
 *        wait for it to finish.
 * @dwork: the kthread delayed work to cancel
 *
 * This is kthread_cancel_work_sync() for delayed works.
 *
 * Return: %true if @dwork was pending, %false otherwise.
 */
bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
{
        /* is_dwork == true: the embedded timer is canceled before the work. */
        return __kthread_cancel_work_sync(&dwork->work, true);
}
EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);

/**
 * kthread_flush_worker - flush all current works on a kthread_worker
 * @worker: worker to flush
 *
 * Wait until all currently executing or pending works on @worker are
 * finished.
 */
void kthread_flush_worker(struct kthread_worker *worker)
{
        /* On-stack marker work; its handler completes fwork.done. */
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };

        /* Queue the marker at the tail of the work list and wait until it has run. */
        kthread_queue_work(worker, &fwork.work);
        wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_worker);

/**
 * kthread_destroy_worker - destroy a kthread worker
 * @worker: worker to be destroyed
 *
 * Flush and destroy @worker.  The simple flush is enough because the kthread
 * worker API is used only in trivial scenarios.  There are no multi-step state
 * machines needed.
 */
void kthread_destroy_worker(struct kthread_worker *worker)
{
        struct task_struct *worker_task = worker->task;

        /* Warn and leave the worker alone when it has no task attached. */
        if (WARN_ON(!worker_task))
                return;

        /* Drain what has been queued so far, then stop the thread. */
        kthread_flush_worker(worker);
        kthread_stop(worker_task);

        /* Warn if anything is still on the work list before freeing. */
        WARN_ON(!list_empty(&worker->work_list));
        kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);

/**
 * kthread_use_mm - make the calling kthread operate on an address space
 * @mm: address space to operate on
 */
void kthread_use_mm(struct mm_struct *mm)
{
        struct mm_struct *active_mm;
        struct task_struct *tsk = current;

        /* Expected caller: a kernel thread that has no mm of its own yet. */
        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(tsk->mm);

        task_lock(tsk);
        /* Hold off tlb flush IPIs while switching mm's */
        local_irq_disable();
        active_mm = tsk->active_mm;
        /* Take a reference on @mm only when it replaces the current active_mm. */
        if (active_mm != mm) {
                mmgrab(mm);
                tsk->active_mm = mm;
        }
        tsk->mm = mm;
        switch_mm_irqs_off(active_mm, mm, tsk);
        local_irq_enable();
        task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
        finish_arch_post_lock_switch();
#endif

        /* Drop the reference on the previous active_mm if it was replaced. */
        if (active_mm != mm)
                mmdrop(active_mm);

        /* Save the value returned here; kthread_unuse_mm() hands it back. */
        to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);

/**
 * kthread_unuse_mm - reverse the effect of kthread_use_mm()
 * @mm: address space to operate on
 */
void kthread_unuse_mm(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        /* Expected caller: a kernel thread that currently has an mm set. */
        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(!tsk->mm);

        /* Hand back the value saved in oldfs by kthread_use_mm(). */
        force_uaccess_end(to_kthread(tsk)->oldfs);

        task_lock(tsk);
        sync_mm_rss(mm);
        local_irq_disable();
        /* Only tsk->mm is cleared; tsk->active_mm is not touched here. */
        tsk->mm = NULL;
        /* active_mm is still 'mm' */
        enter_lazy_tlb(mm, tsk);
        local_irq_enable();
        task_unlock(tsk);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);

#ifdef CONFIG_BLK_CGROUP
/**
 * kthread_associate_blkcg - associate blkcg to current kthread
 * @css: the cgroup info
 *
 * Current thread must be a kthread. The thread is running jobs on behalf of
 * other threads. In some cases, we expect the jobs attach cgroup info of
 * original threads instead of that of current thread. This function stores
 * original thread's cgroup info in current kthread context for later
 * retrieval.
 */
void kthread_associate_blkcg(struct cgroup_subsys_state *css)
{
        struct kthread *kthread;
        struct cgroup_subsys_state *old_css;

        /* Silently ignore callers that are not kernel threads. */
        if (!(current->flags & PF_KTHREAD))
                return;

        kthread = to_kthread(current);
        if (!kthread)
                return;

        /* Release the reference held on a previously associated css. */
        old_css = kthread->blkcg_css;
        if (old_css) {
                css_put(old_css);
                kthread->blkcg_css = NULL;
        }

        /* A NULL @css only clears the association. */
        if (css) {
                css_get(css);
                kthread->blkcg_css = css;
        }
}
EXPORT_SYMBOL(kthread_associate_blkcg);

/**
 * kthread_blkcg - get associated blkcg css of current kthread
 *
 * Current thread must be a kthread.
 */
struct cgroup_subsys_state *kthread_blkcg(void)
{
        struct kthread *kthread;

        /* Only kernel threads can carry an associated blkcg css. */
        if (!(current->flags & PF_KTHREAD))
                return NULL;

        kthread = to_kthread(current);

        return kthread ? kthread->blkcg_css : NULL;
}
EXPORT_SYMBOL(kthread_blkcg);
#endif

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* memcontrol.h - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
#include <linux/page_counter.h>
#include <linux/vmpressure.h>
#include <linux/eventfd.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>

struct mem_cgroup;
struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;

/*
 * Cgroup-specific page state, on top of universal node page state.
 * The numbering continues where the node stat items end, so one index
 * space covers both enum node_stat_item and enum memcg_stat_item.
 */
enum memcg_stat_item {
        MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
        MEMCG_SOCK,
        /* reported as byte-sized by memcg_stat_item_in_bytes() */
        MEMCG_PERCPU_B,
        /* number of stat slots; sizes the memcg stat arrays */
        MEMCG_NR_STAT,
};

/*
 * Kinds of memory events counted per memcg; used as the index into
 * mem_cgroup::memory_events[] and mem_cgroup::memory_events_local[].
 */
enum memcg_memory_event {
        MEMCG_LOW,
        MEMCG_HIGH,
        MEMCG_MAX,
        MEMCG_OOM,
        MEMCG_OOM_KILL,
        MEMCG_SWAP_HIGH,
        MEMCG_SWAP_MAX,
        MEMCG_SWAP_FAIL,
        /* number of event kinds; sizes the event arrays */
        MEMCG_NR_MEMORY_EVENTS,
};

/*
 * Cookie taken by mem_cgroup_iter() as its last argument.
 * NOTE(review): presumably identifies the node and the scan generation of
 * a shared reclaim walk (compare struct mem_cgroup_reclaim_iter) - confirm
 * against the mem_cgroup_iter() implementation.
 */
struct mem_cgroup_reclaim_cookie {
        pg_data_t *pgdat;
        unsigned int generation;
};

#ifdef CONFIG_MEMCG

/* NOTE(review): presumably the number of bits a memcg ID occupies - confirm */
#define MEM_CGROUP_ID_SHIFT        16
/* Largest ID value; fits the unsigned short returned by mem_cgroup_id() */
#define MEM_CGROUP_ID_MAX        USHRT_MAX

/*
 * Private memcg ID plus a reference count; embedded in struct mem_cgroup
 * as its 'id' member.
 */
struct mem_cgroup_id {
        int id;
        refcount_t ref;
};

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg event.
 *
 * Each enumerator below names one such periodic event.
 */
enum mem_cgroup_events_target {
        MEM_CGROUP_TARGET_THRESH,
        MEM_CGROUP_TARGET_SOFTLIMIT,
        /* number of targets; sizes memcg_vmstats_percpu::targets[] */
        MEM_CGROUP_NTARGETS,
};

/*
 * Per-cpu statistics block of a memcg. struct mem_cgroup refers to two
 * per-cpu instances of it: vmstats_local and vmstats_percpu.
 */
struct memcg_vmstats_percpu {
        /* one slot per stat item (node items followed by memcg items) */
        long stat[MEMCG_NR_STAT];
        /* one slot per VM event item */
        unsigned long events[NR_VM_EVENT_ITEMS];
        unsigned long nr_page_events;
        /* one slot per enum mem_cgroup_events_target */
        unsigned long targets[MEM_CGROUP_NTARGETS];
};

/* Iteration state kept per node in struct mem_cgroup_per_node::iter. */
struct mem_cgroup_reclaim_iter {
        struct mem_cgroup *position;
        /* scan generation, increased every round-trip */
        unsigned int generation;
};

/*
 * One counter per node stat item; allocated per-cpu and referenced from
 * struct mem_cgroup_per_node (lruvec_stat_local, lruvec_stat_cpu).
 */
struct lruvec_stat {
        long count[NR_VM_NODE_STAT_ITEMS];
};

/*
 * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
 * which have elements charged to this memcg.
 */
struct memcg_shrinker_map {
        /* NOTE(review): presumably used for RCU-deferred freeing - confirm */
        struct rcu_head rcu;
        /* the bitmap words; flexible array sized at allocation time */
        unsigned long map[];
};

/*
 * Per-node information in the memory controller. One instance exists per
 * (memcg, node) pair and is reached through mem_cgroup::nodeinfo[].
 */
struct mem_cgroup_per_node {
        /* helpers recover this structure from it via container_of() */
        struct lruvec                lruvec;

        /* Legacy local VM stats */
        struct lruvec_stat __percpu *lruvec_stat_local;

        /* Subtree VM stats (batched updates) */
        struct lruvec_stat __percpu *lruvec_stat_cpu;
        atomic_long_t                lruvec_stat[NR_VM_NODE_STAT_ITEMS];

        /* LRU sizes, indexed [zone][lru list] */
        unsigned long                lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];

        struct mem_cgroup_reclaim_iter        iter;

        struct memcg_shrinker_map __rcu        *shrinker_map;

        struct rb_node                tree_node;        /* RB tree node */
        unsigned long                usage_in_excess;/* Amount by which the */
                                                /* soft limit is exceeded */
        bool                        on_tree;
        struct mem_cgroup        *memcg;                /* Back pointer, we cannot */
                                                /* use container_of           */
};

/*
 * One entry of a threshold array: a usage threshold and an eventfd context.
 * NOTE(review): presumably the eventfd is signalled when usage crosses
 * the threshold - confirm against the threshold handling code.
 */
struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        unsigned long threshold;
};

/* Variable-length array of thresholds, with a cursor into it. */
struct mem_cgroup_threshold_ary {
        /* An array index points to threshold just below or equal to usage. */
        int current_threshold;
        /* Size of entries[] */
        unsigned int size;
        /* Array of thresholds */
        struct mem_cgroup_threshold entries[];
};

/*
 * Pair of threshold arrays. struct mem_cgroup holds one pair for memory
 * usage and one for mem+swap usage.
 */
struct mem_cgroup_thresholds {
        /* Primary thresholds array */
        struct mem_cgroup_threshold_ary *primary;
        /*
         * Spare threshold array.
         * This is needed to make mem_cgroup_unregister_event() "never fail".
         * It must be able to store at least primary->size - 1 entries.
         */
        struct mem_cgroup_threshold_ary *spare;
};

/*
 * Kernel-memory accounting state stored in mem_cgroup::kmem_state.
 * NOTE(review): the names suggest a none -> allocated -> online
 * progression - confirm against the code that sets kmem_state.
 */
enum memcg_kmem_state {
        KMEM_NONE,
        KMEM_ALLOCATED,
        KMEM_ONLINE,
};

/*
 * MEMCG_PADDING(name) - declare a zero-sized separator member.
 *
 * On SMP it expands to a member of an empty struct carrying the
 * ____cacheline_internodealigned_in_smp attribute; struct mem_cgroup uses
 * it between groups of fields. Without CONFIG_SMP it expands to nothing.
 */
#if defined(CONFIG_SMP)
struct memcg_padding {
        char x[0];
} ____cacheline_internodealigned_in_smp;
#define MEMCG_PADDING(name)      struct memcg_padding name;
#else
#define MEMCG_PADDING(name)
#endif

/*
 * Remember four most recent foreign writebacks with dirty pages in this
 * cgroup.  Inode sharing is expected to be uncommon and, even if we miss
 * one in a given round, we're likely to catch it later if it keeps
 * foreign-dirtying, so a fairly low count should be enough.
 *
 * MEMCG_CGWB_FRN_CNT is the length of mem_cgroup::cgwb_frn[].
 *
 * See mem_cgroup_track_foreign_dirty_slowpath() for details.
 */
#define MEMCG_CGWB_FRN_CNT        4

/* One remembered foreign writeback; an element of mem_cgroup::cgwb_frn[]. */
struct memcg_cgwb_frn {
        u64 bdi_id;                        /* bdi->id of the foreign inode */
        int memcg_id;                        /* memcg->css.id of foreign inode */
        u64 at;                                /* jiffies_64 at the time of dirtying */
        struct wb_completion done;        /* tracks in-flight foreign writebacks */
};

/*
 * Bucket for arbitrarily byte-sized objects charged to a memory
 * cgroup. The bucket can be reparented in one piece when the cgroup
 * is destroyed, without having to round up the individual references
 * of all live memory objects in the wild.
 */
struct obj_cgroup {
        /* taken and dropped via obj_cgroup_get/tryget/put() */
        struct percpu_ref refcnt;
        /* owning memcg; read with READ_ONCE() in obj_cgroup_memcg() */
        struct mem_cgroup *memcg;
        atomic_t nr_charged_bytes;
        /* the two members share storage */
        union {
                struct list_head list; /* protected by objcg_lock */
                struct rcu_head rcu;
        };
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * The structure ends in a flexible array of per-node pointers, so it must
 * be allocated with room for the trailing nodeinfo[] entries.
 */
struct mem_cgroup {
        /* embedded css; mem_cgroup_from_css() maps it back to the memcg */
        struct cgroup_subsys_state css;

        /* Private memcg ID. Used to ID objects that outlive the cgroup */
        struct mem_cgroup_id id;

        /* Accounted resources */
        struct page_counter memory;                /* Both v1 & v2 */

        union {
                struct page_counter swap;        /* v2 only */
                struct page_counter memsw;        /* v1 only */
        };

        /* Legacy consumer-oriented counters */
        struct page_counter kmem;                /* v1 only */
        struct page_counter tcpmem;                /* v1 only */

        /* Range enforcement for interrupt charges */
        struct work_struct high_work;

        unsigned long soft_limit;

        /* vmpressure notifications */
        struct vmpressure vmpressure;

        /*
         * Should the accounting and control be hierarchical, per subtree?
         * Consulted by mem_cgroup_is_descendant().
         */
        bool use_hierarchy;

        /*
         * When the OOM killer kills one task of this cgroup, should it
         * kill all tasks belonging to the cgroup?
         */
        bool oom_group;

        /* protected by memcg_oom_lock */
        bool                oom_lock;
        int                under_oom;

        int        swappiness;
        /* OOM-Killer disable */
        int                oom_kill_disable;

        /* memory.events and memory.events.local */
        struct cgroup_file events_file;
        struct cgroup_file events_local_file;

        /* handle for "memory.swap.events" */
        struct cgroup_file swap_events_file;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;

        /*
         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
         */
        unsigned long move_charge_at_immigrate;
        /* taken only while moving_account > 0 */
        spinlock_t                move_lock;
        unsigned long                move_lock_flags;

        MEMCG_PADDING(_pad1_);

        /* aggregated statistics, read by memcg_page_state() */
        atomic_long_t                vmstats[MEMCG_NR_STAT];
        atomic_long_t                vmevents[NR_VM_EVENT_ITEMS];

        /* memory.events */
        atomic_long_t                memory_events[MEMCG_NR_MEMORY_EVENTS];
        atomic_long_t                memory_events_local[MEMCG_NR_MEMORY_EVENTS];

        unsigned long                socket_pressure;

        /* Legacy tcp memory accounting */
        bool                        tcpmem_active;
        int                        tcpmem_pressure;

#ifdef CONFIG_MEMCG_KMEM
        /* Index in the kmem_cache->memcg_params.memcg_caches array */
        int kmemcg_id;
        enum memcg_kmem_state kmem_state;
        struct obj_cgroup __rcu *objcg;
        /* list of inherited objcgs, protected by objcg_lock */
        struct list_head objcg_list;
#endif

        MEMCG_PADDING(_pad2_);

        /*
         * set > 0 if pages under this cgroup are moving to other cgroup.
         */
        atomic_t                moving_account;
        struct task_struct        *move_lock_task;

        /* Legacy local VM stats and events; summed by memcg_page_state_local() */
        struct memcg_vmstats_percpu __percpu *vmstats_local;

        /* Subtree VM stats and events (batched updates) */
        struct memcg_vmstats_percpu __percpu *vmstats_percpu;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head cgwb_list;
        struct wb_domain cgwb_domain;
        struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif

        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

        /*
         * Per-node state, indexed by node id (see mem_cgroup_nodeinfo()).
         * Declared as a C99 flexible array member rather than a zero-length
         * array, so the compiler knows it is a trailing variable-length
         * array.
         */
        struct mem_cgroup_per_node *nodeinfo[];
        /* WARNING: nodeinfo must be the last member here */
};

/*
 * Size of the first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: a larger value may be necessary on very large machines.
 */
#define MEMCG_CHARGE_BATCH 32U

extern struct mem_cgroup *root_mem_cgroup;

/*
 * Tell whether stat item @idx is counted in bytes. MEMCG_PERCPU_B always
 * is; for every other index the answer comes from vmstat_item_in_bytes().
 */
static __always_inline bool memcg_stat_item_in_bytes(int idx)
{
        return idx == MEMCG_PERCPU_B || vmstat_item_in_bytes(idx);
}

/* True when @memcg is the root memory cgroup (pointer comparison). */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return root_mem_cgroup == memcg;
}

static inline bool mem_cgroup_disabled(void)
{
        return !cgroup_subsys_enabled(memory_cgrp_subsys);
}

/*
 * mem_cgroup_protection - report the effective min/low protection of @memcg
 * @root: root of the reclaim that is being performed
 * @memcg: memcg whose protection is queried
 * @min: set to the effective min value, or 0
 * @low: set to the effective low value, or 0
 *
 * Both outputs are 0 when the memory controller is disabled or when
 * @memcg is the reclaim @root itself.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *low = 0;
        *min = 0;

        /*
         * Besides the disabled controller, the case root == memcg is
         * special: there is no reclaim protection applied to a targeted
         * reclaim. This is handled here because the mem_cgroup_protected
         * calculation is not robust enough to keep the protection
         * invariant for calculated effective values for parallel
         * reclaimers with different reclaim target. This is especially
         * a problem for tail memcgs (as they have pages on LRU) which
         * would want to have effective values 0 for targeted reclaim
         * but a different value for external reclaim.
         *
         * Example
         * Let's have global and A's reclaim in parallel:
         *  |
         *  A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
         *  |\
         *  | C (low = 1G, usage = 2.5G)
         *  B (low = 1G, usage = 0.5G)
         *
         * For the global reclaim
         * A.elow = A.low
         * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
         * C.elow = min(C.usage, C.low)
         *
         * With the effective values resetting we have A reclaim
         * A.elow = 0
         * B.elow = B.low
         * C.elow = C.low
         *
         * If the global reclaim races with A's reclaim then
         * B.elow = C.elow = 0 (because children_low_usage > A.elow)
         * is possible and reclaiming B would be violating the protection.
         */
        if (mem_cgroup_disabled() || root == memcg)
                return;

        *min = READ_ONCE(memcg->memory.emin);
        *low = READ_ONCE(memcg->memory.elow);
}

void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                     struct mem_cgroup *memcg);

static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg)
{
        /*
         * The root memcg doesn't account charges, and doesn't support
         * protection.
         */
        return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg);

}

/*
 * True when @memcg supports protection and its memory counter reading
 * does not exceed its effective low value.
 */
static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg)
{
        if (mem_cgroup_supports_protection(memcg))
                return READ_ONCE(memcg->memory.elow) >=
                        page_counter_read(&memcg->memory);

        return false;
}

/*
 * True when @memcg supports protection and its memory counter reading
 * does not exceed its effective min value.
 */
static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
{
        if (mem_cgroup_supports_protection(memcg))
                return READ_ONCE(memcg->memory.emin) >=
                        page_counter_read(&memcg->memory);

        return false;
}

int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);

void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);

void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);

/*
 * mem_cgroup_nodeinfo - per-node state of @memcg for node @nid
 *
 * Plain lookup in memcg->nodeinfo[]. @memcg is dereferenced
 * unconditionally and @nid is not range-checked.
 *
 * Marked inline like the other helpers defined in this header, so that
 * including files do not each get a stand-alone static copy.
 */
static inline struct mem_cgroup_per_node *
mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
{
        return memcg->nodeinfo[nid];
}

/**
 * mem_cgroup_lruvec - get the lru list vector for a memcg & node
 * @memcg: memcg of the wanted lruvec; NULL selects root_mem_cgroup
 * @pgdat: node of the wanted lruvec
 *
 * Returns the lru list vector holding pages for a given @memcg &
 * @pgdat combination. This is the node's own lruvec if the memory
 * controller is disabled.
 */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        struct lruvec *lruvec;

        if (mem_cgroup_disabled()) {
                lruvec = &pgdat->__lruvec;
        } else {
                struct mem_cgroup_per_node *pn;

                pn = mem_cgroup_nodeinfo(memcg ? memcg : root_mem_cgroup,
                                         pgdat->node_id);
                lruvec = &pn->lruvec;
        }

        /*
         * A node can be onlined after the mem_cgroup was created, and it
         * can be offlined and onlined again, so (re)initialize
         * lruvec->pgdat here whenever it does not match.
         */
        if (unlikely(lruvec->pgdat != pgdat))
                lruvec->pgdat = pgdat;
        return lruvec;
}

struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);

struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);

struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);

/* Map a css back to the mem_cgroup embedding it; NULL maps to NULL. */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct mem_cgroup, css);
}

/* Try to take a reference on @objcg; returns what percpu_ref_tryget() returns. */
static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
{
        struct percpu_ref *ref = &objcg->refcnt;

        return percpu_ref_tryget(ref);
}

/* Take a reference on @objcg via percpu_ref_get(). */
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
        struct percpu_ref *ref = &objcg->refcnt;

        percpu_ref_get(ref);
}

/* Drop a reference on @objcg via percpu_ref_put(). */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
        struct percpu_ref *ref = &objcg->refcnt;

        percpu_ref_put(ref);
}

/*
 * Once initialized, objcg->memcg always points at a valid memcg, but it
 * can be atomically swapped to the parent memcg.
 *
 * It is the caller's job to make sure the returned memcg is not released
 * meanwhile, e.g. by holding the rcu_read_lock or css_set_lock.
 */
static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
{
        struct mem_cgroup *memcg = READ_ONCE(objcg->memcg);

        return memcg;
}

/* Drop a css reference on @memcg; a NULL @memcg is ignored. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
        if (!memcg)
                return;

        css_put(&memcg->css);
}

/*
 * mem_cgroup_from_counter - get the mem_cgroup that embeds a page counter
 * @counter: pointer to the embedded counter
 * @member: name of the struct mem_cgroup member @counter points at
 */
#define mem_cgroup_from_counter(counter, member)        \
        container_of(counter, struct mem_cgroup, member)

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
                                   struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
int mem_cgroup_scan_tasks(struct mem_cgroup *,
                          int (*)(struct task_struct *, void *), void *);

/*
 * Return the private ID of @memcg, or 0 when the memory controller is
 * disabled (in which case @memcg is not dereferenced).
 */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() ? 0 : memcg->id.id;
}
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);

/* Resolve the memcg behind seq_file @m through the css from seq_css(). */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        struct cgroup_subsys_state *css = seq_css(m);

        return mem_cgroup_from_css(css);
}

/*
 * Return the memcg owning @lruvec, taken from the back pointer of the
 * enclosing struct mem_cgroup_per_node. Returns NULL when the memory
 * controller is disabled.
 */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        struct mem_cgroup_per_node *pn;

        if (!mem_cgroup_disabled()) {
                pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
                return pn->memcg;
        }

        return NULL;
}

/**
 * parent_mem_cgroup - find the accounting parent of a memcg
 * @memcg: memcg whose parent to find
 *
 * The parent is derived from the parent of @memcg's memory page counter.
 * Returns the parent memcg, or NULL if this is the root or the memory
 * controller is in legacy no-hierarchy mode.
 */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        if (memcg->memory.parent)
                return mem_cgroup_from_counter(memcg->memory.parent, memory);

        return NULL;
}

/*
 * Tell whether @memcg lies in the subtree of @root. A memcg counts as its
 * own descendant. Any other memcg qualifies only if @root has
 * use_hierarchy set and cgroup_is_descendant() confirms the relationship
 * of the underlying cgroups.
 */
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
                              struct mem_cgroup *root)
{
        if (memcg == root)
                return true;

        return root->use_hierarchy &&
               cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
}

/*
 * Tell whether the memcg of @mm's owner task is @memcg or one of its
 * descendants. The lookup runs under rcu_read_lock(); false is returned
 * when no memcg is found for the owner.
 */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                                   struct mem_cgroup *memcg)
{
        struct mem_cgroup *owner_memcg;
        bool matched;

        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        matched = owner_memcg && mem_cgroup_is_descendant(owner_memcg, memcg);
        rcu_read_unlock();

        return matched;
}

struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
ino_t page_cgroup_ino(struct page *page);

/*
 * With the memory controller disabled every memcg is reported online;
 * otherwise the result mirrors the CSS_ONLINE flag of @memcg's css.
 */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() || (memcg->css.flags & CSS_ONLINE);
}

/*
 * For memory reclaim.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                int zid, int nr_pages);

/*
 * Read the recorded size of LRU list @lru in zone @zone_idx from the
 * struct mem_cgroup_per_node that embeds @lruvec.
 */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        struct mem_cgroup_per_node *pn =
                container_of(lruvec, struct mem_cgroup_per_node, lruvec);

        return READ_ONCE(pn->lru_zone_size[zone_idx][lru]);
}

void mem_cgroup_handle_over_high(void);

unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);

unsigned long mem_cgroup_size(struct mem_cgroup *memcg);

void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
                                struct task_struct *p);

void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);

static inline void mem_cgroup_enter_user_fault(void)
{
        WARN_ON(current->in_user_fault);
        current->in_user_fault = 1;
}

static inline void mem_cgroup_exit_user_fault(void)
{
        WARN_ON(!current->in_user_fault);
        current->in_user_fault = 0;
}

/* True when @p has a non-zero memcg_in_oom. */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return !!p->memcg_in_oom;
}

bool mem_cgroup_oom_synchronize(bool wait);
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
                                            struct mem_cgroup *oom_domain);
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);

#ifdef CONFIG_MEMCG_SWAP
extern bool cgroup_memory_noswap;
#endif

struct mem_cgroup *lock_page_memcg(struct page *page);
void __unlock_page_memcg(struct mem_cgroup *memcg);
void unlock_page_memcg(struct page *page);

/*
 * Read one aggregated statistic of @memcg.
 * idx can be of type enum memcg_stat_item or node_stat_item.
 * Keep in sync with memcg_exact_page_state().
 */
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
        long val = atomic_long_read(&memcg->vmstats[idx]);

#ifdef CONFIG_SMP
        /* On SMP a negative reading is reported as 0. */
        if (val < 0)
                return 0;
#endif
        return val;
}

/*
 * idx can be of type enum memcg_stat_item or node_stat_item.
 * Keep in sync with memcg_exact_page_state().
 */
static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
                                                   int idx)
{
        long x = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
#ifdef CONFIG_SMP
        if (x < 0)
                x = 0;
#endif
        return x;
}

void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);

/*
 * Variant of __mod_memcg_state() that disables local interrupts around
 * the update. idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   int idx, int val)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        __mod_memcg_state(memcg, idx, val);
        local_irq_restore(irq_flags);
}

/**
 * mod_memcg_page_state - update page state statistics
 * @page: the page
 * @idx: page state item to account
 * @val: number of pages (positive or negative)
 *
 * Nothing is updated when the page has no memcg attached.
 *
 * The @page must be locked or the caller must use lock_page_memcg()
 * to prevent double accounting when the page is concurrently being
 * moved to another memcg:
 *
 *   lock_page(page) or lock_page_memcg(page)
 *   if (TestClearPageState(page))
 *     mod_memcg_page_state(page, state, -1);
 *   unlock_page(page) or unlock_page_memcg(page)
 *
 * Kernel pages are an exception to this, since they'll never move.
 *
 * __mod_memcg_page_state() forwards to __mod_memcg_state();
 * mod_memcg_page_state() forwards to mod_memcg_state().
 */
static inline void __mod_memcg_page_state(struct page *page,
                                          int idx, int val)
{
        if (!page->mem_cgroup)
                return;

        __mod_memcg_state(page->mem_cgroup, idx, val);
}

static inline void mod_memcg_page_state(struct page *page,
                                        int idx, int val)
{
        if (!page->mem_cgroup)
                return;

        mod_memcg_state(page->mem_cgroup, idx, val);
}

/*
 * Read statistic @idx of @lruvec. With the memory controller disabled
 * the value comes from the node (node_page_state()); otherwise from the
 * lruvec_stat[] counter of the enclosing struct mem_cgroup_per_node.
 */
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
{
        struct mem_cgroup_per_node *node_info;
        long val;

        if (mem_cgroup_disabled())
                return node_page_state(lruvec_pgdat(lruvec), idx);

        node_info = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        val = atomic_long_read(&node_info->lruvec_stat[idx]);

#ifdef CONFIG_SMP
        /* On SMP a negative reading is reported as 0. */
        if (val < 0)
                return 0;
#endif
        return val;
}

static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                                    enum node_stat_item idx)
{
        struct mem_cgroup_per_node *pn;
        long x = 0;
        int cpu;

        if (mem_cgroup_disabled())
                return node_page_state(lruvec_pgdat(lruvec), idx);

        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        for_each_possible_cpu(cpu)
                x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
#ifdef CONFIG_SMP
        if (x < 0)
                x = 0;
#endif
        return x;
}

/*
 * Stat update primitives that are only declared in this header.  The inline
 * wrappers without the double-underscore prefix below call them between
 * local_irq_save() and local_irq_restore().
 */
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                              int val);
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                        int val);
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val);

/* Declared here only; no inline wrapper is provided for this one. */
void mod_memcg_obj_state(void *p, int idx, int val);

/* Run __mod_lruvec_slab_state() with local interrupts saved and disabled. */
static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
                                         int val)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __mod_lruvec_slab_state(p, idx, val);
        local_irq_restore(irq_state);
}

/* Run __mod_memcg_lruvec_state() with local interrupts saved and disabled. */
static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
                                          enum node_stat_item idx, int val)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __mod_memcg_lruvec_state(lruvec, idx, val);
        local_irq_restore(irq_state);
}

/* Run __mod_lruvec_state() with local interrupts saved and disabled. */
static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __mod_lruvec_state(lruvec, idx, val);
        local_irq_restore(irq_state);
}

/*
 * Apply @val to statistic @idx for @page.  The memcg is looked up on the
 * compound head; when there is none, only the node counter is updated,
 * otherwise the update goes through the memcg's lruvec on the page's node.
 */
static inline void __mod_lruvec_page_state(struct page *page,
                                           enum node_stat_item idx, int val)
{
        /* rmap may hand in tail pages, so resolve the head first */
        struct page *head_page = compound_head(page);
        pg_data_t *node = page_pgdat(page);

        if (head_page->mem_cgroup) {
                struct lruvec *lruvec;

                lruvec = mem_cgroup_lruvec(head_page->mem_cgroup, node);
                __mod_lruvec_state(lruvec, idx, val);
        } else {
                /* Untracked pages have no memcg, no lruvec. */
                __mod_node_page_state(node, idx, val);
        }
}

/* Run __mod_lruvec_page_state() with local interrupts saved and disabled. */
static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __mod_lruvec_page_state(page, idx, val);
        local_irq_restore(irq_state);
}

/* Declared here only; the !CONFIG_MEMCG build provides a stub returning 0. */
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                                gfp_t gfp_mask,
                                                unsigned long *total_scanned);

/*
 * Declared here only.  count_memcg_events() below calls it between
 * local_irq_save() and local_irq_restore().
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                          unsigned long count);

/* Run __count_memcg_events() with local interrupts saved and disabled. */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __count_memcg_events(memcg, idx, count);
        local_irq_restore(irq_state);
}

/* Count one @idx event against the memcg attached to @page, if there is one. */
static inline void count_memcg_page_event(struct page *page,
                                          enum vm_event_item idx)
{
        struct mem_cgroup *memcg = page->mem_cgroup;

        if (!memcg)
                return;

        count_memcg_events(memcg, idx, 1);
}

/*
 * Count one @idx event against the memcg of @mm's owner task.  Does nothing
 * when memcg is disabled or no memcg is found for the owner.
 */
static inline void count_memcg_event_mm(struct mm_struct *mm,
                                        enum vm_event_item idx)
{
        struct mem_cgroup *owner_memcg;

        if (mem_cgroup_disabled())
                return;

        /* mm->owner is read with rcu_dereference(), so hold the RCU read lock */
        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (likely(owner_memcg))
                count_memcg_events(owner_memcg, idx, 1);
        rcu_read_unlock();
}

/*
 * Record memory event @event on @memcg: bump the local counter once, then
 * bump the hierarchical counter on @memcg and, where propagation applies,
 * on each ancestor short of the root memcg, notifying the matching events
 * file at every step.
 */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
        /* Swap events notify swap_events_file rather than the events files. */
        bool swap_event = (event == MEMCG_SWAP_HIGH ||
                           event == MEMCG_SWAP_MAX ||
                           event == MEMCG_SWAP_FAIL);

        /* The local counter is bumped only on the originating memcg. */
        atomic_long_inc(&memcg->memory_events_local[event]);
        if (!swap_event)
                cgroup_file_notify(&memcg->events_local_file);

        for (;;) {
                atomic_long_inc(&memcg->memory_events[event]);
                if (swap_event)
                        cgroup_file_notify(&memcg->swap_events_file);
                else
                        cgroup_file_notify(&memcg->events_file);

                /* Stop when the memory controller is off the default hierarchy */
                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                        break;
                /* ... or CGRP_ROOT_MEMORY_LOCAL_EVENTS is set on the default root. */
                if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                        break;

                /* Move up one level; the root memcg itself is never counted. */
                memcg = parent_mem_cgroup(memcg);
                if (!memcg || mem_cgroup_is_root(memcg))
                        break;
        }
}

/*
 * Record memory event @event against the memcg of @mm's owner task.  Does
 * nothing when memcg is disabled or no memcg is found for the owner.
 */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
        struct mem_cgroup *owner_memcg;

        if (mem_cgroup_disabled())
                return;

        /* mm->owner is read with rcu_dereference(), so hold the RCU read lock */
        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (likely(owner_memcg))
                memcg_memory_event(owner_memcg, event);
        rcu_read_unlock();
}

/* Declared here only; the !CONFIG_MEMCG build provides an empty stub. */
void split_page_memcg(struct page *head, unsigned int nr);

#else /* CONFIG_MEMCG */

/* !CONFIG_MEMCG: both memcg ID constants collapse to zero. */
#define MEM_CGROUP_ID_SHIFT        0
#define MEM_CGROUP_ID_MAX        0

/* Forward declaration only; the stubs below just pass pointers around. */
struct mem_cgroup;

/* !CONFIG_MEMCG stub: every memcg pointer is reported as the root. */
static inline bool
mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: memcg is always reported as disabled. */
static inline bool
mem_cgroup_disabled(void)
{
        return true;
}

/* !CONFIG_MEMCG stub: no events are recorded. */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
}

/* !CONFIG_MEMCG stub: no events are recorded. */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
}

/* !CONFIG_MEMCG stub: report zero for both the @min and @low outputs. */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *low = 0;
        *min = 0;
}

/* !CONFIG_MEMCG stub: nothing to calculate. */
static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                                   struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: always false. */
static inline bool
mem_cgroup_below_low(struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: always false. */
static inline bool
mem_cgroup_below_min(struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: nothing is charged; always returns 0. */
static inline int
mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
        return 0;
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_uncharge(struct page *page)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_migrate(struct page *old, struct page *new)
{
}

/* !CONFIG_MEMCG stub: @memcg is ignored; the node's own __lruvec is used. */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        struct lruvec *node_lruvec = &pgdat->__lruvec;

        return node_lruvec;
}

/* !CONFIG_MEMCG stub: @page is ignored; the node's own __lruvec is used. */
static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
                                                    struct pglist_data *pgdat)
{
        struct lruvec *node_lruvec = &pgdat->__lruvec;

        return node_lruvec;
}

/* !CONFIG_MEMCG stub: there is never a parent; always NULL. */
static inline struct mem_cgroup *
parent_mem_cgroup(struct mem_cgroup *memcg)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: every mm matches every memcg. */
static inline bool
mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: no memcg to hand out; always NULL. */
static inline struct mem_cgroup *
get_mem_cgroup_from_mm(struct mm_struct *mm)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no memcg to hand out; always NULL. */
static inline struct mem_cgroup *
get_mem_cgroup_from_page(struct page *page)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: the iteration is always empty; returns NULL. */
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev,
                struct mem_cgroup_reclaim_cookie *reclaim)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
                                         struct mem_cgroup *prev)
{
}

/* !CONFIG_MEMCG stub: @fn is never invoked; always returns 0. */
static inline int
mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                      int (*fn)(struct task_struct *, void *), void *arg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: the ID is always 0. */
static inline unsigned short
mem_cgroup_id(struct mem_cgroup *memcg)
{
        return 0;
}

/*
 * !CONFIG_MEMCG stub: always returns NULL, and passes @id to WARN_ON_ONCE()
 * so a non-zero ID is flagged.
 */
static inline struct mem_cgroup *
mem_cgroup_from_id(unsigned short id)
{
        WARN_ON_ONCE(id);

        /* XXX: This should always return root_mem_cgroup */
        return NULL;
}

/* !CONFIG_MEMCG stub: always NULL. */
static inline struct mem_cgroup *
mem_cgroup_from_seq(struct seq_file *m)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: an lruvec has no memcg; always NULL. */
static inline struct mem_cgroup *
lruvec_memcg(struct lruvec *lruvec)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always true. */
static inline bool
mem_cgroup_online(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: always reports a size of 0. */
static inline unsigned long
mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru,
                             int zone_idx)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always 0. */
static inline unsigned long
mem_cgroup_get_max(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always 0. */
static inline unsigned long
mem_cgroup_size(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: prints nothing. */
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
}

/* !CONFIG_MEMCG stub: prints nothing. */
static inline void
mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: takes no lock and returns NULL. */
static inline struct mem_cgroup *
lock_page_memcg(struct page *page)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void unlock_page_memcg(struct page *page)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_handle_over_high(void)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_enter_user_fault(void)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_exit_user_fault(void)
{
}

/* !CONFIG_MEMCG stub: always false. */
static inline bool
task_in_memcg_oom(struct task_struct *p)
{
        return false;
}

/* !CONFIG_MEMCG stub: always false, regardless of @wait. */
static inline bool
mem_cgroup_oom_synchronize(bool wait)
{
        return false;
}

/* !CONFIG_MEMCG stub: there is never an OOM group; always NULL. */
static inline struct mem_cgroup *
mem_cgroup_get_oom_group(struct task_struct *victim,
                         struct mem_cgroup *oom_domain)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: prints nothing. */
static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: every statistic reads as 0. */
static inline unsigned long
memcg_page_state(struct mem_cgroup *memcg, int idx)
{
        return 0;
}

/* !CONFIG_MEMCG stub: every statistic reads as 0. */
static inline unsigned long
memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
        return 0;
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
                                     int idx,
                                     int nr)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   int idx,
                                   int nr)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void __mod_memcg_page_state(struct page *page,
                                          int idx,
                                          int nr)
{
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mod_memcg_page_state(struct page *page,
                                        int idx,
                                        int nr)
{
}

/* !CONFIG_MEMCG stub: read the node-wide counter for @lruvec's node. */
static inline unsigned long
lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
{
        return node_page_state(lruvec_pgdat(lruvec), idx);
}

/* !CONFIG_MEMCG stub: identical to lruvec_page_state(); reads the node counter. */
static inline unsigned long
lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx)
{
        return node_page_state(lruvec_pgdat(lruvec), idx);
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
                                            enum node_stat_item idx, int val)
{
}

/* !CONFIG_MEMCG stub: forward to __mod_node_page_state() on @lruvec's node. */
static inline void
__mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val)
{
        __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}

/* !CONFIG_MEMCG stub: forward to mod_node_page_state() on @lruvec's node. */
static inline void
mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val)
{
        mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}

/* !CONFIG_MEMCG stub: forward to __mod_node_page_state() on @page's node. */
static inline void
__mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val)
{
        __mod_node_page_state(page_pgdat(page), idx, val);
}

/* !CONFIG_MEMCG stub: forward to mod_node_page_state() on @page's node. */
static inline void
mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val)
{
        mod_node_page_state(page_pgdat(page), idx, val);
}

/*
 * !CONFIG_MEMCG stub: resolve the head page backing @p and forward to
 * __mod_node_page_state() on that page's node.
 */
static inline void
__mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
        struct page *head_page;

        head_page = virt_to_head_page(p);
        __mod_node_page_state(page_pgdat(head_page), idx, val);
}

/*
 * !CONFIG_MEMCG stub: resolve the head page backing @p and forward to
 * mod_node_page_state() on that page's node.
 */
static inline void
mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
        struct page *head_page;

        head_page = virt_to_head_page(p);
        mod_node_page_state(page_pgdat(head_page), idx, val);
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void mod_memcg_obj_state(void *p, int idx, int val)
{
}

/*
 * !CONFIG_MEMCG stub: always returns 0 and leaves @total_scanned untouched.
 */
static inline unsigned long
mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask,
                              unsigned long *total_scanned)
{
        return 0;
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void split_page_memcg(struct page *head, unsigned int nr)
{
}

/* !CONFIG_MEMCG stub: no events are counted. */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
}

/* !CONFIG_MEMCG stub: no events are counted. */
static inline void __count_memcg_events(struct mem_cgroup *memcg,
                                        enum vm_event_item idx,
                                        unsigned long count)
{
}

/*
 * !CONFIG_MEMCG stub: no events are counted.
 *
 * NOTE(review): @idx is declared as int here, while the CONFIG_MEMCG variant
 * of this function takes enum vm_event_item; consider aligning the two.
 */
static inline void count_memcg_page_event(struct page *page,
                                          int idx)
{
}

/* !CONFIG_MEMCG stub: no events are counted. */
static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
#endif /* CONFIG_MEMCG */

/*
 * Add one to memcg statistic @idx via __mod_memcg_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void __inc_memcg_state(struct mem_cgroup *memcg, int idx)
{
        __mod_memcg_state(memcg, idx, +1);
}

/*
 * Subtract one from memcg statistic @idx via __mod_memcg_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void __dec_memcg_state(struct mem_cgroup *memcg, int idx)
{
        __mod_memcg_state(memcg, idx, -1);
}

/*
 * Add one to statistic @idx for @page via __mod_memcg_page_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void __inc_memcg_page_state(struct page *page, int idx)
{
        __mod_memcg_page_state(page, idx, +1);
}

/*
 * Subtract one from statistic @idx for @page via __mod_memcg_page_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void __dec_memcg_page_state(struct page *page, int idx)
{
        __mod_memcg_page_state(page, idx, -1);
}

/* Add one to statistic @idx of @lruvec via __mod_lruvec_state(). */
static inline void
__inc_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx)
{
        __mod_lruvec_state(lruvec, idx, +1);
}

/* Subtract one from statistic @idx of @lruvec via __mod_lruvec_state(). */
static inline void
__dec_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx)
{
        __mod_lruvec_state(lruvec, idx, -1);
}

/* Add one to statistic @idx for @page via __mod_lruvec_page_state(). */
static inline void
__inc_lruvec_page_state(struct page *page, enum node_stat_item idx)
{
        __mod_lruvec_page_state(page, idx, +1);
}

/* Subtract one from statistic @idx for @page via __mod_lruvec_page_state(). */
static inline void
__dec_lruvec_page_state(struct page *page, enum node_stat_item idx)
{
        __mod_lruvec_page_state(page, idx, -1);
}

/* Add one to statistic @idx for object @p via __mod_lruvec_slab_state(). */
static inline void
__inc_lruvec_slab_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_slab_state(p, idx, +1);
}

/* Subtract one from statistic @idx for object @p via __mod_lruvec_slab_state(). */
static inline void
__dec_lruvec_slab_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_slab_state(p, idx, -1);
}

/*
 * Add one to memcg statistic @idx via mod_memcg_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void inc_memcg_state(struct mem_cgroup *memcg, int idx)
{
        mod_memcg_state(memcg, idx, +1);
}

/*
 * Subtract one from memcg statistic @idx via mod_memcg_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void dec_memcg_state(struct mem_cgroup *memcg, int idx)
{
        mod_memcg_state(memcg, idx, -1);
}

/*
 * Add one to statistic @idx for @page via mod_memcg_page_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void inc_memcg_page_state(struct page *page, int idx)
{
        mod_memcg_page_state(page, idx, +1);
}

/*
 * Subtract one from statistic @idx for @page via mod_memcg_page_state().
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void dec_memcg_page_state(struct page *page, int idx)
{
        mod_memcg_page_state(page, idx, -1);
}

/* Add one to statistic @idx of @lruvec via mod_lruvec_state(). */
static inline void
inc_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx)
{
        mod_lruvec_state(lruvec, idx, +1);
}

/* Subtract one from statistic @idx of @lruvec via mod_lruvec_state(). */
static inline void
dec_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx)
{
        mod_lruvec_state(lruvec, idx, -1);
}

/* Add one to statistic @idx for @page via mod_lruvec_page_state(). */
static inline void
inc_lruvec_page_state(struct page *page, enum node_stat_item idx)
{
        mod_lruvec_page_state(page, idx, +1);
}

/* Subtract one from statistic @idx for @page via mod_lruvec_page_state(). */
static inline void
dec_lruvec_page_state(struct page *page, enum node_stat_item idx)
{
        mod_lruvec_page_state(page, idx, -1);
}

/*
 * Return the lruvec of the parent memcg on the same node as @lruvec, or NULL
 * when @lruvec has no memcg or that memcg has no parent.
 */
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
        struct mem_cgroup *owner = lruvec_memcg(lruvec);
        struct mem_cgroup *parent = owner ? parent_mem_cgroup(owner) : NULL;

        if (!parent)
                return NULL;

        return mem_cgroup_lruvec(parent, lruvec_pgdat(lruvec));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * CONFIG_CGROUP_WRITEBACK interfaces that are only declared here; the
 * !CONFIG_CGROUP_WRITEBACK build replaces the first two with stubs.
 */
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                         unsigned long *pheadroom, unsigned long *pdirty,
                         unsigned long *pwriteback);

/* Slow path invoked by the inline mem_cgroup_track_foreign_dirty() below. */
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
                                             struct bdi_writeback *wb);

/*
 * Hand @page to the foreign-dirty slow path when the css of the memcg
 * attached to it differs from @wb's memcg_css.
 *
 * Does nothing when memcg is disabled.  A page with no memcg attached is
 * skipped as well: without the guard, its NULL memcg would compare unequal
 * to wb->memcg_css and the page would be handed to the slow path with no
 * memcg to account against.
 */
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
                                                  struct bdi_writeback *wb)
{
        if (mem_cgroup_disabled())
                return;

        if (unlikely(page->mem_cgroup &&
                     &page->mem_cgroup->css != wb->memcg_css))
                mem_cgroup_track_foreign_dirty_slowpath(page, wb);
}

/* Declared here only; the !CONFIG_CGROUP_WRITEBACK build has an empty stub. */
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: there is no memcg wb_domain; always NULL. */
static inline struct wb_domain *
mem_cgroup_wb_domain(struct bdi_writeback *wb)
{
        return NULL;
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: does nothing; none of the output pointers
 * is written.
 */
static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
                                       unsigned long *pfilepages,
                                       unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: does nothing. */
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
                                                  struct bdi_writeback *wb)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: does nothing. */
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/* Forward declaration; only pointers to struct sock are used below. */
struct sock;
/* Declared unconditionally, outside the CONFIG_MEMCG conditional that follows. */
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
#ifdef CONFIG_MEMCG
/* Static key behind mem_cgroup_sockets_enabled; tested as an unlikely branch. */
extern struct static_key_false memcg_sockets_enabled_key;
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
/* Declared here only; the !CONFIG_MEMCG build provides empty stubs. */
void mem_cgroup_sk_alloc(struct sock *sk);
void mem_cgroup_sk_free(struct sock *sk);
/*
 * Report whether @memcg is under socket memory pressure.
 *
 * When the memory controller is not on the default hierarchy, a set
 * tcpmem_pressure flag is sufficient.  Otherwise @memcg and each of its
 * ancestors is checked for a socket_pressure time that jiffies has not yet
 * reached.
 */
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
                return true;

        for (;;) {
                if (time_before(jiffies, memcg->socket_pressure))
                        return true;

                memcg = parent_mem_cgroup(memcg);
                if (!memcg)
                        return false;
        }
}

/*
 * Shrinker map interfaces, declared here only.  The !CONFIG_MEMCG branch
 * below stubs out memcg_set_shrinker_bit() but not
 * memcg_expand_shrinker_maps().
 */
extern int memcg_expand_shrinker_maps(int new_id);

extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
                                   int nid, int shrinker_id);
#else
#define mem_cgroup_sockets_enabled 0
/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_sk_alloc(struct sock *sk) { }
/* !CONFIG_MEMCG stub: does nothing. */
static inline void mem_cgroup_sk_free(struct sock *sk) { }
/* !CONFIG_MEMCG stub: never under socket pressure. */
static inline bool
mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: does nothing. */
static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
                                          int nid, int shrinker_id)
{
}
#endif

#ifdef CONFIG_MEMCG_KMEM
/*
 * Charge/uncharge primitives, declared here only.  The inline memcg_kmem_*()
 * wrappers below call them only when memcg_kmem_enabled() is true.
 */
int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
                        unsigned int nr_pages);
void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);

struct obj_cgroup *get_obj_cgroup_from_current(void);

int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);

/* Static key tested by memcg_kmem_enabled(). */
extern struct static_key_false memcg_kmem_enabled_key;

/* Upper bound used by for_each_memcg_cache_index(). */
extern int memcg_nr_cache_ids;
void memcg_get_cache_ids(void);
void memcg_put_cache_ids(void);

/*
 * Helper macro to loop through all memcg-specific cache indices, from 0 up
 * to (but not including) memcg_nr_cache_ids. Callers must still check
 * whether the cache at each index is valid (it is either valid or NULL).
 * The slab_mutex must be held while looping through those caches.
 */
#define for_each_memcg_cache_index(_idx)        \
        for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)

/* Test the memcg_kmem_enabled_key static branch (expected to be taken). */
static inline bool
memcg_kmem_enabled(void)
{
        return static_branch_likely(&memcg_kmem_enabled_key);
}

/*
 * Forward to __memcg_kmem_charge_page() when kmem accounting is enabled;
 * otherwise report success (0) without charging.
 */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        if (!memcg_kmem_enabled())
                return 0;

        return __memcg_kmem_charge_page(page, gfp, order);
}

/* Forward to __memcg_kmem_uncharge_page() only when kmem accounting is enabled. */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
        if (!memcg_kmem_enabled())
                return;

        __memcg_kmem_uncharge_page(page, order);
}

/*
 * Forward to __memcg_kmem_charge() when kmem accounting is enabled;
 * otherwise report success (0) without charging.
 */
static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
                                    unsigned int nr_pages)
{
        if (!memcg_kmem_enabled())
                return 0;

        return __memcg_kmem_charge(memcg, gfp, nr_pages);
}

/* Forward to __memcg_kmem_uncharge() only when kmem accounting is enabled. */
static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg,
                                       unsigned int nr_pages)
{
        if (!memcg_kmem_enabled())
                return;

        __memcg_kmem_uncharge(memcg, nr_pages);
}

/*
 * Helper for reading a memcg's kmemcg_id. The value is used as an index into
 * the child cache array in kmem_cache, and also to derive its name.
 * Returns -1 when @memcg is NULL.
 */
static inline int memcg_cache_id(struct mem_cgroup *memcg)
{
        if (!memcg)
                return -1;

        return memcg->kmemcg_id;
}

/* Declared here only; the !CONFIG_MEMCG_KMEM build has a stub returning NULL. */
struct mem_cgroup *mem_cgroup_from_obj(void *p);

#else

/* !CONFIG_MEMCG_KMEM stub: nothing is charged; always returns 0. */
static inline int
memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
        return 0;
}

/* !CONFIG_MEMCG_KMEM stub: does nothing. */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/* !CONFIG_MEMCG_KMEM stub: nothing is charged; always returns 0. */
static inline int
__memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
        return 0;
}

/* !CONFIG_MEMCG_KMEM stub: does nothing. */
static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/*
 * !CONFIG_MEMCG_KMEM: the loop condition is a constant NULL, so the body
 * never executes and @_idx is neither read nor written.
 */
#define for_each_memcg_cache_index(_idx)        \
        for (; NULL; )

/* !CONFIG_MEMCG_KMEM stub: kmem accounting is never enabled. */
static inline bool
memcg_kmem_enabled(void)
{
        return false;
}

/* !CONFIG_MEMCG_KMEM stub: always -1, for any @memcg. */
static inline int
memcg_cache_id(struct mem_cgroup *memcg)
{
        return -1;
}

/* !CONFIG_MEMCG_KMEM stub: does nothing. */
static inline void memcg_get_cache_ids(void)
{
}

/* !CONFIG_MEMCG_KMEM stub: does nothing. */
static inline void memcg_put_cache_ids(void)
{
}

/* !CONFIG_MEMCG_KMEM stub: no object has a memcg; always NULL. */
static inline struct mem_cgroup *
mem_cgroup_from_obj(void *p)
{
        return NULL;
}

#endif /* CONFIG_MEMCG_KMEM */

#endif /* _LINUX_MEMCONTROL_H */





















































































    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Prevent the compiler from merging or refetching reads or writes. The
 * compiler is also forbidden from reordering successive instances of
 * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
 * particular ordering. One way to make the compiler aware of ordering is to
 * put the two invocations of READ_ONCE or WRITE_ONCE in different C
 * statements.
 *
 * These two macros will also work on aggregate data types like structs or
 * unions.
 *
 * Their two major use cases are: (1) Mediating communication between
 * process-level code and irq/NMI handlers, all running on the same CPU,
 * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
 * mutilate accesses that either do not require ordering or that interact
 * with an explicit memory barrier or atomic instruction that provides the
 * required ordering.
 */
#ifndef __ASM_GENERIC_RWONCE_H
#define __ASM_GENERIC_RWONCE_H

#ifndef __ASSEMBLY__

#include <linux/compiler_types.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>

/*
 * Build-time check on the size of @t for {READ,WRITE}_ONCE(): the access is
 * accepted when @t is a native word or is exactly sizeof(long long) bytes.
 *
 * Yes, this permits 64-bit accesses on 32-bit architectures. These will
 * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
 * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
 * (e.g. a virtual address) and a strong prevailing wind.
 */
#define compiletime_assert_rwonce_type(t)                                        \
        compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),        \
                "Unsupported access size for {READ,WRITE}_ONCE().")

/*
 * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
 * atomicity. Note that this may result in tears!
 *
 * The load goes through a const volatile pointer to the unqualified scalar
 * type of @x, with no size check. The #ifndef guard means an earlier
 * definition of __READ_ONCE takes precedence over this one.
 */
#ifndef __READ_ONCE
#define __READ_ONCE(x)        (*(const volatile __unqual_scalar_typeof(x) *)&(x))
#endif

/*
 * Size-checked read: asserts at build time that the type of @x is acceptable
 * (see compiletime_assert_rwonce_type()), then evaluates to __READ_ONCE(x).
 */
#define READ_ONCE(x)                                                        \
({                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __READ_ONCE(x);                                                        \
})

/*
 * Store @val into @x through a volatile pointer to typeof(x); no size check
 * is performed.
 */
#define __WRITE_ONCE(x, val)                                                \
do {                                                                        \
        *(volatile typeof(x) *)&(x) = (val);                                \
} while (0)

/*
 * Size-checked write: asserts at build time that the type of @x is acceptable
 * (see compiletime_assert_rwonce_type()), then performs __WRITE_ONCE(x, val).
 */
#define WRITE_ONCE(x, val)                                                \
do {                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __WRITE_ONCE(x, val);                                                \
} while (0)

/*
 * Load one unsigned long from @addr with __READ_ONCE().  Backs
 * READ_ONCE_NOCHECK(); the function is marked __no_sanitize_or_inline.
 */
static __no_sanitize_or_inline
unsigned long __read_once_word_nocheck(const void *addr)
{
        unsigned long *word = (unsigned long *)addr;

        return __READ_ONCE(*word);
}

/*
 * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
 * word from memory atomically but without telling KASAN/KCSAN. This is
 * usually used by unwinding code when walking the stack of a running process.
 *
 * @x must be exactly sizeof(unsigned long) bytes (enforced at build time);
 * the loaded word is cast back to typeof(x).
 */
#define READ_ONCE_NOCHECK(x)                                                \
({                                                                        \
        compiletime_assert(sizeof(x) == sizeof(unsigned long),                \
                "Unsupported access size for READ_ONCE_NOCHECK().");        \
        (typeof(x))__read_once_word_nocheck(&(x));                        \
})

/*
 * Load one unsigned long from @addr with a plain dereference.  Only a
 * one-byte read at @addr is reported to kasan_check_read() beforehand, and
 * the function is marked __no_kasan_or_inline.
 */
static __no_kasan_or_inline
unsigned long read_word_at_a_time(const void *addr)
{
        unsigned long *word = (unsigned long *)addr;

        kasan_check_read(addr, 1);
        return *word;
}

#endif /* __ASSEMBLY__ */
#endif        /* __ASM_GENERIC_RWONCE_H */








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct sk_buff' memory handlers.
 *
 *        Authors:
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Florian La Roche, <rzsfl@rz.uni-sb.de>
 */

#ifndef _LINUX_SKBUFF_H
#define _LINUX_SKBUFF_H

#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/time.h>
#include <linux/bug.h>
#include <linux/bvec.h>
#include <linux/cache.h>
#include <linux/rbtree.h>
#include <linux/socket.h>
#include <linux/refcount.h>

#include <linux/atomic.h>
#include <asm/types.h>
#include <linux/spinlock.h>
#include <linux/net.h>
#include <linux/textsearch.h>
#include <net/checksum.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/dma-mapping.h>
#include <linux/netdev_features.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <net/flow_dissector.h>
#include <linux/splice.h>
#include <linux/in6.h>
#include <linux/if_packet.h>
#include <net/flow.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_common.h>
#endif

/* The interface for checksum offload between the stack and networking drivers
 * is as follows...
 *
 * A. IP checksum related features
 *
 * Drivers advertise checksum offload capabilities in the features of a device.
 * From the stack's point of view these are capabilities offered by the driver.
 * A driver typically only advertises features that it is capable of offloading
 * to its device.
 *
 * The checksum related features are:
 *
 *        NETIF_F_HW_CSUM        - The driver (or its device) is able to compute one
 *                          IP (one's complement) checksum for any combination
 *                          of protocols or protocol layering. The checksum is
 *                          computed and set in a packet per the CHECKSUM_PARTIAL
 *                          interface (see below).
 *
 *        NETIF_F_IP_CSUM - Driver (device) is only able to checksum plain
 *                          TCP or UDP packets over IPv4. These are specifically
 *                          unencapsulated packets of the form IPv4|TCP or
 *                          IPv4|UDP where the Protocol field in the IPv4 header
 *                          is TCP or UDP. The IPv4 header may contain IP options.
 *                          This feature cannot be set in features for a device
 *                          with NETIF_F_HW_CSUM also set. This feature is being
 *                          DEPRECATED (see below).
 *
 *        NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain
 *                          TCP or UDP packets over IPv6. These are specifically
 *                          unencapsulated packets of the form IPv6|TCP or
 *                          IPv6|UDP where the Next Header field in the IPv6
 *                          header is either TCP or UDP. IPv6 extension headers
 *                          are not supported with this feature. This feature
 *                          cannot be set in features for a device with
 *                          NETIF_F_HW_CSUM also set. This feature is being
 *                          DEPRECATED (see below).
 *
 *        NETIF_F_RXCSUM - Driver (device) performs receive checksum offload.
 *                         This flag is only used to disable the RX checksum
 *                         feature for a device. The stack will accept receive
 *                         checksum indication in packets received on a device
 *                         regardless of whether NETIF_F_RXCSUM is set.
 *
 * B. Checksumming of received packets by device. Indication of checksum
 *    verification is set in skb->ip_summed. Possible values are:
 *
 * CHECKSUM_NONE:
 *
 *   Device did not checksum this packet e.g. due to lack of capabilities.
 *   The packet itself carries a full (though not verified) checksum, but that
 *   checksum is not in skb->csum. Thus, skb->csum is undefined in this case.
 *
 * CHECKSUM_UNNECESSARY:
 *
 *   The hardware you're dealing with doesn't calculate the full checksum
 *   (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums
 *   for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY
 *   if their checksums are okay. skb->csum is still undefined in this case
 *   though. A driver or device must never modify the checksum field in the
 *   packet even if checksum is verified.
 *
 *   CHECKSUM_UNNECESSARY is applicable to following protocols:
 *     TCP: IPv6 and IPv4.
 *     UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a
 *       zero UDP checksum for either IPv4 or IPv6, the networking stack
 *       may perform further validation in this case.
 *     GRE: only if the checksum is present in the header.
 *     SCTP: indicates the CRC in SCTP header has been validated.
 *     FCOE: indicates the CRC in FC frame has been validated.
 *
 *   skb->csum_level indicates the number of consecutive checksums found in
 *   the packet that have been verified as CHECKSUM_UNNECESSARY, minus one.
 *   For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
 *   and a device is able to verify the checksums for UDP (possibly zero),
 *   GRE (checksum flag is set) and TCP, skb->csum_level would be set to
 *   two. If the device were only able to verify the UDP checksum and not
 *   GRE, either because it doesn't support GRE checksum or because GRE
 *   checksum is bad, skb->csum_level would be set to zero (TCP checksum is
 *   not considered in this case).
 *
 * CHECKSUM_COMPLETE:
 *
 *   This is the most generic way. The device supplies the checksum of the
 *   _whole_ packet as seen by netif_rx() and fills in skb->csum. This means
 *   the hardware doesn't need to parse L3/L4 headers to implement this.
 *
 *   Notes:
 *   - Even if device supports only some protocols, but is able to produce
 *     skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
 *   - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols.
 *
 * CHECKSUM_PARTIAL:
 *
 *   A checksum is set up to be offloaded to a device as described in the
 *   output description for CHECKSUM_PARTIAL. This may occur on a packet
 *   received directly from another Linux OS, e.g., a virtualized Linux kernel
 *   on the same host, or it may be set in the input path in GRO or remote
 *   checksum offload. For the purposes of checksum verification, the checksum
 *   referred to by skb->csum_start + skb->csum_offset and any preceding
 *   checksums in the packet are considered verified. Any checksums in the
 *   packet that are after the checksum being offloaded are not considered to
 *   be verified.
 *
 * C. Checksumming on transmit for non-GSO. The stack requests checksum offload
 *    in the skb->ip_summed for a packet. Values are:
 *
 * CHECKSUM_PARTIAL:
 *
 *   The driver is required to checksum the packet as seen by hard_start_xmit()
 *   from skb->csum_start up to the end, and to record/write the checksum at
 *   offset skb->csum_start + skb->csum_offset. A driver may verify that the
 *   csum_start and csum_offset values are valid values given the length and
 *   offset of the packet, but it should not attempt to validate that the
 *   checksum refers to a legitimate transport layer checksum -- it is the
 *   purview of the stack to validate that csum_start and csum_offset are set
 *   correctly.
 *
 *   When the stack requests checksum offload for a packet, the driver MUST
 *   ensure that the checksum is set correctly. A driver can either offload the
 *   checksum calculation to the device, or call skb_checksum_help (in the case
 *   that the device does not support offload for a particular checksum).
 *
 *   NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM are being deprecated in favor of
 *   NETIF_F_HW_CSUM. New devices should use NETIF_F_HW_CSUM to indicate
 *   checksum offload capability.
 *   skb_csum_hwoffload_help() can be called to resolve CHECKSUM_PARTIAL based
 *   on network device checksumming capabilities: if a packet does not match
 *   them, skb_checksum_help or skb_crc32c_csum_help (depending on the value of
 *   csum_not_inet, see item D.) is called to resolve the checksum.
 *
 * CHECKSUM_NONE:
 *
 *   The skb was already checksummed by the protocol, or a checksum is not
 *   required.
 *
 * CHECKSUM_UNNECESSARY:
 *
 *   This has the same meaning as CHECKSUM_NONE for checksum offload on
 *   output.
 *
 * CHECKSUM_COMPLETE:
 *   Not used in checksum output. If a driver observes a packet with this value
 *   set in skbuff, it should treat the packet as if CHECKSUM_NONE were set.
 *
 * D. Non-IP checksum (CRC) offloads
 *
 *   NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
 *     offloading the SCTP CRC in a packet. To perform this offload the stack
 *     will set csum_start and csum_offset accordingly, set ip_summed to
 *     CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in
 *     the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
 *     A driver that supports both IP checksum offload and SCTP CRC32c offload
 *     must verify which offload is configured for a packet by testing the
 *     value of skb->csum_not_inet; skb_crc32c_csum_help is provided to resolve
 *     CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
 *
 *   NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
 *     offloading the FCOE CRC in a packet. To perform this offload the stack
 *     will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset
 *     accordingly. Note that there is no indication in the skbuff that the
 *     CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
 *     both IP checksum offload and FCOE CRC offload must verify which offload
 *     is configured for a packet, presumably by inspecting packet headers.
 *
 * E. Checksumming on output with GSO.
 *
 * In the case of a GSO packet (skb_is_gso(skb) is true), checksum offload
 * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the
 * gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as
 * part of the GSO operation is implied. If a checksum is being offloaded
 * with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and
 * csum_offset are set to refer to the outermost checksum being offloaded
 * (two offloaded checksums are possible with UDP encapsulation).
 */

/* Checksum states stored in skb->ip_summed. Sections B and C of the
 * checksum offload notes above describe what each value means on receive
 * and on transmit respectively.
 *
 * Don't change this without changing skb_csum_unnecessary!
 */
#define CHECKSUM_NONE                0
#define CHECKSUM_UNNECESSARY        1
#define CHECKSUM_COMPLETE        2
#define CHECKSUM_PARTIAL        3

/* Maximum value in skb->csum_level */
#define SKB_MAX_CSUM_LEVEL        3

/* Align X to SMP_CACHE_BYTES using ALIGN() */
#define SKB_DATA_ALIGN(X)        ALIGN(X, SMP_CACHE_BYTES)
/* X minus the cache-aligned size of struct skb_shared_info */
#define SKB_WITH_OVERHEAD(X)        \
        ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
/* Bytes left in (PAGE_SIZE << ORDER) after removing X and the aligned
 * skb_shared_info overhead.
 */
#define SKB_MAX_ORDER(X, ORDER) \
        SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
/* SKB_MAX_ORDER() for order 0 with X bytes removed */
#define SKB_MAX_HEAD(X)                (SKB_MAX_ORDER((X), 0))
/* SKB_MAX_ORDER() for order 2 with nothing removed */
#define SKB_MAX_ALLOC                (SKB_MAX_ORDER(0, 2))

/* return minimum truesize of one skb containing X bytes of data:
 * X plus the cache-aligned sizes of struct sk_buff and
 * struct skb_shared_info.
 */
#define SKB_TRUESIZE(X) ((X) +                                                \
                         SKB_DATA_ALIGN(sizeof(struct sk_buff)) +        \
                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* Forward declarations of types that are defined elsewhere.
 * NOTE(review): presumably these are only referenced through pointers in
 * this header, so the incomplete types suffice -- confirm.
 */
struct ahash_request;
struct net_device;
struct scatterlist;
struct pipe_inode_info;
struct iov_iter;
struct napi_struct;
struct bpf_prog;
union bpf_attr;
struct skb_ext;

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* State saved for a bridged frame; only compiled in when
 * CONFIG_BRIDGE_NETFILTER is enabled.
 */
struct nf_bridge_info {
        /* anonymous enum stored in an 8-bit bitfield */
        enum {
                BRNF_PROTO_UNCHANGED,
                BRNF_PROTO_8021Q,
                BRNF_PROTO_PPPOE
        } orig_proto:8;
        /* single-bit state flags */
        u8                        pkt_otherhost:1;
        u8                        in_prerouting:1;
        u8                        bridged_dnat:1;
        u8                        sabotage_in_done:1;
        __u16                        frag_max_size;
        struct net_device        *physindev;

        /* always valid & non-NULL from FORWARD on, for physdev match */
        struct net_device        *physoutdev;
        /* The members below share storage; which one is meaningful depends
         * on the processing stage described in the comments.
         */
        union {
                /* prerouting: detect dnat in orig/reply direction */
                __be32          ipv4_daddr;
                struct in6_addr ipv6_daddr;

                /* after prerouting + nat detected: store original source
                 * mac since neigh resolution overwrites it, only used while
                 * skb is out in neigh layer.
                 */
                char neigh_header[8];
        };
};
#endif

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
/* Chain in tc_skb_ext will be used to share the tc chain with
 * ovs recirc_id. It will be set to the current chain by tc
 * and read by ovs to recirc_id.
 *
 * Only compiled in when CONFIG_NET_TC_SKB_EXT is enabled.
 */
struct tc_skb_ext {
        __u32 chain;
        /* NOTE(review): meaning of mru is not documented here -- confirm */
        __u16 mru;
};
#endif

/* Head of an skb queue. next/prev mirror the first two members of
 * struct sk_buff (which are also required to come first there).
 * NOTE(review): presumably this lets the head be linked into the list like
 * an skb -- confirm against the queue helpers.
 */
struct sk_buff_head {
        /* These two members must be first. */
        struct sk_buff        *next;
        struct sk_buff        *prev;

        /* presumably the number of queued skbs -- TODO confirm */
        __u32                qlen;
        spinlock_t        lock;
};

/* Forward declaration; the full definition appears further down. */
struct sk_buff;

/* To allow a 64K frame to be packed as a single skb without frag_list we
 * require 64K/PAGE_SIZE pages plus 1 additional page to allow for
 * buffers which do not start on a page boundary.
 *
 * Since GRO uses frags we allocate at least 16 regardless of page
 * size.
 *
 * MAX_SKB_FRAGS also sizes the frags[] array in struct skb_shared_info.
 */
#if (65536/PAGE_SIZE + 1) < 16
#define MAX_SKB_FRAGS 16UL
#else
#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
#endif
/* Declared here, defined elsewhere.
 * NOTE(review): presumably a sysctl-controlled limit -- confirm.
 */
extern int sysctl_max_skb_frags;

/* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to
 * segment using its current segmentation instead.
 *
 * gso_size is an unsigned short in struct skb_shared_info, so this
 * sentinel is the largest value a 16-bit field can hold.
 */
#define GSO_BY_FRAGS        0xFFFF

/* An skb fragment is represented by a struct bio_vec; the skb_frag_size*()
 * accessors below operate on its bv_len member.
 */
typedef struct bio_vec skb_frag_t;

/**
 * skb_frag_size() - Query the length of a skb fragment
 * @frag: skb fragment to inspect
 *
 * Returns the value of the fragment's bv_len member.
 */
static inline unsigned int skb_frag_size(const skb_frag_t *frag)
{
        const unsigned int frag_len = frag->bv_len;

        return frag_len;
}

/**
 * skb_frag_size_set() - Overwrite the length of a skb fragment
 * @frag: skb fragment to update
 * @size: new length, stored in the fragment's bv_len member
 */
static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
{
        skb_frag_t *const target = frag;

        target->bv_len = size;
}

/**
 * skb_frag_size_add() - Grow the length of a skb fragment by @delta
 * @frag: skb fragment to update
 * @delta: amount added to the fragment's bv_len member
 */
static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
{
        frag->bv_len = frag->bv_len + delta;
}

/**
 * skb_frag_size_sub() - Shrink the length of a skb fragment by @delta
 * @frag: skb fragment to update
 * @delta: amount removed from the fragment's bv_len member
 */
static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
{
        frag->bv_len = frag->bv_len - delta;
}

/**
 * skb_frag_must_loop - Test if @p is a high memory page
 * @p: fragment's page
 *
 * Returns true only when CONFIG_HIGHMEM is defined and PageHighMem()
 * reports @p as a highmem page; otherwise returns false.
 */
static inline bool skb_frag_must_loop(struct page *p)
{
#ifdef CONFIG_HIGHMEM
        return PageHighMem(p) ? true : false;
#else
        return false;
#endif
}

/**
 *        skb_frag_foreach_page - loop over pages in a fragment
 *
 *        @f:                skb frag to operate on
 *        @f_off:                offset from start of f->bv_page
 *        @f_len:                length from f_off to loop over
 *        @p:                (temp var) current page
 *        @p_off:                (temp var) offset from start of current page,
 *                                   non-zero only on first page.
 *        @p_len:                (temp var) length in current page,
 *                                   < PAGE_SIZE only on first and last page.
 *        @copied:        (temp var) length so far, excluding current p_len.
 *
 *        A fragment can hold a compound page, in which case per-page
 *        operations, notably kmap_atomic, must be called for each
 *        regular page.
 *
 *        This expands to a for-loop header; the statement that follows the
 *        macro invocation becomes the loop body. When skb_frag_must_loop()
 *        is false for the first page, @p_len is set to the whole @f_len and
 *        the body runs at most once.
 *
 *        @f_off and @f_len are expanded several times (and @f_len is not
 *        parenthesized), so pass simple, side-effect-free expressions.
 */
#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied)        \
        for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT),                \
             p_off = (f_off) & (PAGE_SIZE - 1),                                \
             p_len = skb_frag_must_loop(p) ?                                \
             min_t(u32, f_len, PAGE_SIZE - p_off) : f_len,                \
             copied = 0;                                                \
             copied < f_len;                                                \
             copied += p_len, p++, p_off = 0,                                \
             p_len = min_t(u32, f_len - copied, PAGE_SIZE))                \

/* Defined with no value.
 * NOTE(review): presumably a feature-test macro so other code can detect
 * hardware time stamp support -- confirm.
 */
#define HAVE_HW_TIME_STAMP

/**
 * struct skb_shared_hwtstamps - hardware time stamps
 * @hwtstamp:        hardware time stamp transformed into duration
 *                since arbitrary point in time
 *
 * Software time stamps generated by ktime_get_real() are stored in
 * skb->tstamp.
 *
 * hwtstamps can only be compared against other hwtstamps from
 * the same device.
 *
 * This structure is attached to packets as part of the
 * &skb_shared_info (its hwtstamps member). Use skb_hwtstamps() to get a
 * pointer.
 */
struct skb_shared_hwtstamps {
        ktime_t        hwtstamp;
};

/* Definitions for tx_flags in struct skb_shared_info.
 * tx_flags is a __u8 there; the flags below occupy bits 0..6.
 */
enum {
        /* generate hardware time stamp */
        SKBTX_HW_TSTAMP = 1 << 0,

        /* generate software time stamp when queueing packet to NIC */
        SKBTX_SW_TSTAMP = 1 << 1,

        /* device driver is going to provide hardware time stamp */
        SKBTX_IN_PROGRESS = 1 << 2,

        /* device driver supports TX zero-copy buffers */
        SKBTX_DEV_ZEROCOPY = 1 << 3,

        /* generate wifi status information (where possible) */
        SKBTX_WIFI_STATUS = 1 << 4,

        /* This indicates at least one fragment might be overwritten
         * (as in vmsplice(), sendfile() ...)
         * If we need to compute a TX checksum, we'll need to copy
         * all frags to avoid possible bad checksum
         */
        SKBTX_SHARED_FRAG = 1 << 5,

        /* generate software time stamp when entering packet scheduling */
        SKBTX_SCHED_TSTAMP = 1 << 6,
};

/* Either of the two zerocopy-related fragment flags */
#define SKBTX_ZEROCOPY_FRAG        (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG)
/* Any software time stamp request (at NIC queueing or at scheduling) */
#define SKBTX_ANY_SW_TSTAMP        (SKBTX_SW_TSTAMP    | \
                                 SKBTX_SCHED_TSTAMP)
/* Any time stamp request, hardware or software */
#define SKBTX_ANY_TSTAMP        (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)

/*
 * The callback notifies userspace to release buffers once skb DMA is done
 * in the lower device; the skb's last reference should be 0 when it is
 * called.
 * The zerocopy_success argument is true if zero copy transmit occurred,
 * false on data copy or out of memory error caused by data copy attempt.
 * The ctx field is used to track device context.
 * The desc field is used to track userspace buffer index.
 */
struct ubuf_info {
        void (*callback)(struct ubuf_info *, bool zerocopy_success);
        /* Two alternative layouts sharing the same storage: desc/ctx as
         * described above, or the id/len/zerocopy/bytelen group.
         * NOTE(review): which users pick which layout is not visible
         * here -- confirm.
         */
        union {
                struct {
                        unsigned long desc;
                        void *ctx;
                };
                struct {
                        u32 id;
                        u16 len;
                        u16 zerocopy:1;
                        u32 bytelen;
                };
        };
        /* incremented by sock_zerocopy_get() */
        refcount_t refcnt;

        /* argument type of mm_account_pinned_pages() and
         * mm_unaccount_pinned_pages()
         */
        struct mmpin {
                struct user_struct *user;
                unsigned int num_pg;
        } mmp;
};

/* Interpret the skb's shared-info destructor_arg as a struct ubuf_info * */
#define skb_uarg(SKB)        ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))

/* Pinned-page accounting on a struct mmpin; defined out of line. */
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);

/* ubuf_info allocation helpers for sockets; defined out of line. */
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
                                        struct ubuf_info *uarg);

static inline void sock_zerocopy_get(struct ubuf_info *uarg)
{
        refcount_inc(&uarg->refcnt);
}

/* Reference release side of the ubuf_info API; defined out of line. */
void sock_zerocopy_put(struct ubuf_info *uarg);
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);

/* Has the same signature as ubuf_info->callback. */
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);

/* Zerocopy iteration helpers for datagram and stream sends; defined out
 * of line.
 */
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                             struct msghdr *msg, int len,
                             struct ubuf_info *uarg);

/* This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 */
struct skb_shared_info {
        __u8                __unused;
        __u8                meta_len;
        /* NOTE(review): presumably the number of valid frags[] entries */
        __u8                nr_frags;
        /* SKBTX_* flags */
        __u8                tx_flags;
        /* may hold the GSO_BY_FRAGS sentinel */
        unsigned short        gso_size;
        /* Warning: this field is not always filled in (UFO)! */
        unsigned short        gso_segs;
        struct sk_buff        *frag_list;
        struct skb_shared_hwtstamps hwtstamps;
        /* SKB_GSO_* flags */
        unsigned int        gso_type;
        u32                tskey;

        /*
         * Warning : all fields before dataref are cleared in __alloc_skb()
         */
        /* split into two 16-bit halves, see SKB_DATAREF_SHIFT */
        atomic_t        dataref;

        /* Intermediate layers must ensure that destructor_arg
         * remains valid until skb destructor */
        /* skb_uarg() reads this as a struct ubuf_info pointer */
        void *                destructor_arg;

        /* must be last field, see pskb_expand_head() */
        skb_frag_t        frags[MAX_SKB_FRAGS];
};

/* We divide dataref into two halves.  The higher 16 bits hold references
 * to the payload part of skb->data.  The lower 16 bits hold references to
 * the entire skb->data.  A clone of a headerless skb holds the length of
 * the header in skb->hdr_len.
 *
 * All users must obey the rule that the skb->data reference count must be
 * greater than or equal to the payload reference count.
 *
 * Holding a reference to the payload part means that the user does not
 * care about modifications to the header part of skb->data.
 *
 * SKB_DATAREF_MASK selects the lower SKB_DATAREF_SHIFT bits (0xffff).
 */
#define SKB_DATAREF_SHIFT 16
#define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)


/* Clone status values for skb->fclone, which is a 2-bit bitfield in
 * struct sk_buff (values 0..2 are used here).
 */
enum {
        SKB_FCLONE_UNAVAILABLE,        /* skb has no fclone (from head_cache) */
        SKB_FCLONE_ORIG,        /* orig skb (from fclone_cache) */
        SKB_FCLONE_CLONE,        /* companion fclone skb (from fclone_cache) */
};

/* Bit flags for the gso_type member of struct skb_shared_info
 * (an unsigned int); bits 0..18 are assigned below.
 */
enum {
        SKB_GSO_TCPV4 = 1 << 0,

        /* This indicates the skb is from an untrusted source. */
        SKB_GSO_DODGY = 1 << 1,

        /* This indicates the tcp segment has CWR set. */
        SKB_GSO_TCP_ECN = 1 << 2,

        SKB_GSO_TCP_FIXEDID = 1 << 3,

        SKB_GSO_TCPV6 = 1 << 4,

        SKB_GSO_FCOE = 1 << 5,

        SKB_GSO_GRE = 1 << 6,

        SKB_GSO_GRE_CSUM = 1 << 7,

        SKB_GSO_IPXIP4 = 1 << 8,

        SKB_GSO_IPXIP6 = 1 << 9,

        SKB_GSO_UDP_TUNNEL = 1 << 10,

        SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,

        SKB_GSO_PARTIAL = 1 << 12,

        SKB_GSO_TUNNEL_REMCSUM = 1 << 13,

        SKB_GSO_SCTP = 1 << 14,

        SKB_GSO_ESP = 1 << 15,

        SKB_GSO_UDP = 1 << 16,

        SKB_GSO_UDP_L4 = 1 << 17,

        SKB_GSO_FRAGLIST = 1 << 18,
};

/* When long is wider than 32 bits, sk_buff_data_t is an unsigned int
 * rather than a pointer. It is the type of skb->tail and skb->end.
 * NOTE(review): per the macro name the integer form is an offset,
 * presumably relative to skb->head -- confirm against the accessors.
 */
#if BITS_PER_LONG > 32
#define NET_SKBUFF_DATA_USES_OFFSET 1
#endif

#ifdef NET_SKBUFF_DATA_USES_OFFSET
typedef unsigned int sk_buff_data_t;
#else
typedef unsigned char *sk_buff_data_t;
#endif

/**
 *        struct sk_buff - socket buffer
 *        @next: Next buffer in list
 *        @prev: Previous buffer in list
 *        @tstamp: Time we arrived/left
 *        @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point
 *                for retransmit timer
 *        @rbnode: RB tree node, alternative to next/prev for netem/tcp
 *        @list: queue head
 *        @sk: Socket we are owned by
 *        @ip_defrag_offset: (aka @sk) alternate use of @sk, used in
 *                fragmentation management
 *        @dev: Device we arrived on/are leaving by
 *        @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
 *        @cb: Control buffer. Free for use by every layer. Put private vars here
 *        @_skb_refdst: destination entry (with norefcount bit)
 *        @sp: the security path, used for xfrm
 *        @len: Length of actual data
 *        @data_len: Data length
 *        @mac_len: Length of link layer header
 *        @hdr_len: writable header length of cloned skb
 *        @csum: Checksum (must include start/offset pair)
 *        @csum_start: Offset from skb->head where checksumming should start
 *        @csum_offset: Offset from csum_start where checksum should be stored
 *        @priority: Packet queueing priority
 *        @ignore_df: allow local fragmentation
 *        @cloned: Head may be cloned (check refcnt to be sure)
 *        @ip_summed: Driver fed us an IP checksum
 *        @nohdr: Payload reference only, must not modify header
 *        @pkt_type: Packet class
 *        @fclone: skbuff clone status
 *        @ipvs_property: skbuff is owned by ipvs
 *        @inner_protocol_type: whether the inner protocol is
 *                ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO
 *        @remcsum_offload: remote checksum offload is enabled
 *        @offload_fwd_mark: Packet was L2-forwarded in hardware
 *        @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
 *        @tc_skip_classify: do not classify packet. set by IFB device
 *        @tc_at_ingress: used within tc_classify to distinguish in/egress
 *        @redirected: packet was redirected by packet classifier
 *        @from_ingress: packet was redirected from the ingress path
 *        @peeked: this packet has been seen already, so stats have been
 *                done for it, don't do them again
 *        @nf_trace: netfilter packet trace flag
 *        @protocol: Packet protocol from driver
 *        @destructor: Destruct function
 *        @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
 *        @_nfct: Associated connection, if any (with nfctinfo bits)
 *        @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *        @skb_iif: ifindex of device we arrived on
 *        @tc_index: Traffic control index
 *        @hash: the packet hash
 *        @queue_mapping: Queue mapping for multiqueue devices
 *        @head_frag: skb was allocated from page fragments,
 *                not allocated by kmalloc() or vmalloc().
 *        @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
 *        @active_extensions: active extensions (skb_ext_id types)
 *        @ndisc_nodetype: router type (from link layer)
 *        @ooo_okay: allow the mapping of a socket to a queue to be changed
 *        @l4_hash: indicate hash is a canonical 4-tuple hash over transport
 *                ports.
 *        @sw_hash: indicates hash was computed in software stack
 *        @wifi_acked_valid: wifi_acked was set
 *        @wifi_acked: whether frame was acked on wifi or not
 *        @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 *        @encapsulation: indicates the inner headers in the skbuff are valid
 *        @encap_hdr_csum: software checksum is needed
 *        @csum_valid: checksum is already valid
 *        @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
 *        @csum_complete_sw: checksum was completed by software
 *        @csum_level: indicates the number of consecutive checksums found in
 *                the packet minus one that have been verified as
 *                CHECKSUM_UNNECESSARY (max 3)
 *        @scm_io_uring: SKB holds io_uring registered files
 *        @dst_pending_confirm: need to confirm neighbour
 *        @decrypted: Decrypted SKB
 *        @napi_id: id of the NAPI struct this skb came from
 *        @sender_cpu: (aka @napi_id) source CPU in XPS
 *        @secmark: security marking
 *        @mark: Generic packet mark
 *        @reserved_tailroom: (aka @mark) number of bytes of free space available
 *                at the tail of an sk_buff
 *        @vlan_present: VLAN tag is present
 *        @vlan_proto: vlan encapsulation protocol
 *        @vlan_tci: vlan tag control information
 *        @inner_protocol: Protocol (encapsulation)
 *        @inner_ipproto: (aka @inner_protocol) stores ipproto when
 *                skb->inner_protocol_type == ENCAP_TYPE_IPPROTO;
 *        @inner_transport_header: Inner transport layer header (encapsulation)
 *        @inner_network_header: Network layer header (encapsulation)
 *        @inner_mac_header: Link layer header (encapsulation)
 *        @transport_header: Transport layer header
 *        @network_header: Network layer header
 *        @mac_header: Link layer header
 *        @kcov_handle: KCOV remote handle for remote coverage collection
 *        @tail: Tail pointer
 *        @end: End pointer
 *        @head: Head of buffer
 *        @data: Data head pointer
 *        @truesize: Buffer size
 *        @users: User count - see {datagram,tcp}.c
 *        @extensions: allocated extensions, valid if active_extensions is nonzero
 */

struct sk_buff {
        /* Linkage storage: the next/prev/dev group, an rb-tree node and a
         * list_head all overlay the same bytes.
         */
        union {
                struct {
                        /* These two members must be first. */
                        struct sk_buff                *next;
                        struct sk_buff                *prev;

                        union {
                                struct net_device        *dev;
                                /* Some protocols might use this space to store information,
                                 * while device pointer would be NULL.
                                 * UDP receive path is one user.
                                 */
                                unsigned long                dev_scratch;
                        };
                };
                struct rb_node                rbnode; /* used in netem, ip4 defrag, and tcp stack */
                struct list_head        list;
        };

        /* sk and ip_defrag_offset share storage */
        union {
                struct sock                *sk;
                int                        ip_defrag_offset;
        };

        /* tstamp and skb_mstamp_ns share storage */
        union {
                ktime_t                tstamp;
                u64                skb_mstamp_ns; /* earliest departure time */
        };
        /*
         * This is the control buffer. It is free to use for every
         * layer. Please put your private variables there. If you
         * want to keep them across layers you have to do a skb_clone()
         * first. This is owned by whoever has the skb queued ATM.
         */
        char                        cb[48] __aligned(8);

        /* _skb_refdst/destructor overlay tcp_tsorted_anchor */
        union {
                struct {
                        /* dst pointer; low bit is SKB_DST_NOREF */
                        unsigned long        _skb_refdst;
                        void                (*destructor)(struct sk_buff *skb);
                };
                struct list_head        tcp_tsorted_anchor;
        };

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        unsigned long                 _nfct;
#endif
        unsigned int                len,
                                data_len;
        __u16                        mac_len,
                                hdr_len;

        /* Following fields are _not_ copied in __copy_skb_header()
         * Note that queue_mapping is here mostly to fill a hole.
         */
        __u16                        queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK        (1 << 7)
#else
#define CLONED_MASK        1
#endif
#define CLONED_OFFSET()                offsetof(struct sk_buff, __cloned_offset)

        /* private: */
        /* zero-length marker: gives CLONED_OFFSET() the offset of the byte
         * that holds the bitfields declared right after it
         */
        __u8                        __cloned_offset[0];
        /* public: */
        __u8                        cloned:1,
                                nohdr:1,
                                fclone:2,
                                peeked:1,
                                head_frag:1,
                                pfmemalloc:1;
#ifdef CONFIG_SKB_EXTENSIONS
        __u8                        active_extensions;
#endif
        /* fields enclosed in headers_start/headers_end are copied
         * using a single memcpy() in __copy_skb_header()
         */
        /* private: */
        __u32                        headers_start[0];
        /* public: */

/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX        (7 << 5)
#else
#define PKT_TYPE_MAX        7
#endif
#define PKT_TYPE_OFFSET()        offsetof(struct sk_buff, __pkt_type_offset)

        /* private: */
        /* zero-length marker used by PKT_TYPE_OFFSET() */
        __u8                        __pkt_type_offset[0];
        /* public: */
        __u8                        pkt_type:3;
        __u8                        ignore_df:1;
        __u8                        nf_trace:1;
        /* holds one of the CHECKSUM_* values */
        __u8                        ip_summed:2;
        __u8                        ooo_okay:1;

        __u8                        l4_hash:1;
        __u8                        sw_hash:1;
        __u8                        wifi_acked_valid:1;
        __u8                        wifi_acked:1;
        __u8                        no_fcs:1;
        /* Indicates the inner headers are valid in the skbuff. */
        __u8                        encapsulation:1;
        __u8                        encap_hdr_csum:1;
        __u8                        csum_valid:1;

#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_VLAN_PRESENT_BIT        7
#else
#define PKT_VLAN_PRESENT_BIT        0
#endif
#define PKT_VLAN_PRESENT_OFFSET()        offsetof(struct sk_buff, __pkt_vlan_present_offset)
        /* private: */
        /* zero-length marker used by PKT_VLAN_PRESENT_OFFSET() */
        __u8                        __pkt_vlan_present_offset[0];
        /* public: */
        __u8                        vlan_present:1;
        __u8                        csum_complete_sw:1;
        /* bounded by SKB_MAX_CSUM_LEVEL */
        __u8                        csum_level:2;
        __u8                        csum_not_inet:1;
        __u8                        dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
        __u8                        ndisc_nodetype:2;
#endif

        __u8                        ipvs_property:1;
        __u8                        inner_protocol_type:1;
        __u8                        remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
        __u8                        offload_fwd_mark:1;
        __u8                        offload_l3_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
        __u8                        tc_skip_classify:1;
        __u8                        tc_at_ingress:1;
#endif
#ifdef CONFIG_NET_REDIRECT
        __u8                        redirected:1;
        __u8                        from_ingress:1;
#endif
#ifdef CONFIG_TLS_DEVICE
        __u8                        decrypted:1;
#endif
        __u8                        scm_io_uring:1;

#ifdef CONFIG_NET_SCHED
        __u16                        tc_index;        /* traffic control index */
#endif

        /* csum overlays the csum_start/csum_offset pair */
        union {
                __wsum                csum;
                struct {
                        __u16        csum_start;
                        __u16        csum_offset;
                };
        };
        __u32                        priority;
        int                        skb_iif;
        __u32                        hash;
        __be16                        vlan_proto;
        __u16                        vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
        /* napi_id and sender_cpu share storage */
        union {
                unsigned int        napi_id;
                unsigned int        sender_cpu;
        };
#endif
#ifdef CONFIG_NETWORK_SECMARK
        __u32                secmark;
#endif

        /* mark and reserved_tailroom share storage */
        union {
                __u32                mark;
                __u32                reserved_tailroom;
        };

        /* inner_protocol and inner_ipproto share storage */
        union {
                __be16                inner_protocol;
                __u8                inner_ipproto;
        };

        __u16                        inner_transport_header;
        __u16                        inner_network_header;
        __u16                        inner_mac_header;

        __be16                        protocol;
        __u16                        transport_header;
        __u16                        network_header;
        __u16                        mac_header;

#ifdef CONFIG_KCOV
        u64                        kcov_handle;
#endif

        /* private: */
        /* zero-length marker closing the region opened by headers_start */
        __u32                        headers_end[0];
        /* public: */

        /* These elements must be at the end, see alloc_skb() for details.  */
        sk_buff_data_t                tail;
        sk_buff_data_t                end;
        unsigned char                *head,
                                *data;
        unsigned int                truesize;
        /* reference count, see skb_unref() */
        refcount_t                users;

#ifdef CONFIG_SKB_EXTENSIONS
        /* only useable after checking ->active_extensions != 0 */
        struct skb_ext                *extensions;
#endif
};

#ifdef __KERNEL__
/*
 *        Handling routines are only of interest to the kernel
 */

/* Distinct single-bit flag values. alloc_skb_fclone() passes
 * SKB_ALLOC_FCLONE as the flags argument of __alloc_skb().
 * NOTE(review): SKB_ALLOC_RX and SKB_ALLOC_NAPI are presumably flags for
 * the same argument -- confirm.
 */
#define SKB_ALLOC_FCLONE        0x01
#define SKB_ALLOC_RX                0x02
#define SKB_ALLOC_NAPI                0x04

/**
 * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves
 * @skb: buffer
 */
static inline bool skb_pfmemalloc(const struct sk_buff *skb)
{
        return unlikely(skb->pfmemalloc);
}

/*
 * skb might have a dst pointer attached, refcounted or not.
 * _skb_refdst low order bit is set if refcount was _not_ taken
 *
 * SKB_DST_PTRMASK clears that low bit to recover the pointer value.
 */
#define SKB_DST_NOREF        1UL
#define SKB_DST_PTRMASK        ~(SKB_DST_NOREF)

/**
 * skb_dst - returns skb dst_entry
 * @skb: buffer
 *
 * Returns skb dst_entry, regardless of reference taken or not. The
 * SKB_DST_NOREF bit is masked out of the returned pointer.
 */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
        const unsigned long refdst = skb->_skb_refdst;

        /* A dst that was attached without a reference must only be
         * looked at from within an rcu_read_lock section; warn otherwise.
         */
        WARN_ON((refdst & SKB_DST_NOREF) &&
                !(rcu_read_lock_held() || rcu_read_lock_bh_held()));

        return (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
}

/**
 * skb_dst_set - sets skb dst
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in the skb without the SKB_DST_NOREF bit, i.e. the caller
 * is assumed to hold a reference on @dst that skb_dst_drop() should
 * release later.
 */
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
        const unsigned long refdst = (unsigned long)dst;

        skb->_skb_refdst = refdst;
}

/**
 * skb_dst_set_noref - sets skb dst without taking a reference
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in the skb with the SKB_DST_NOREF bit set, recording that
 * no reference was taken on @dst. Warns if the caller is in neither an
 * rcu_read_lock nor an rcu_read_lock_bh section.
 */
static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
{
        unsigned long refdst;

        WARN_ON(!(rcu_read_lock_held() || rcu_read_lock_bh_held()));

        refdst = (unsigned long)dst | SKB_DST_NOREF;
        skb->_skb_refdst = refdst;
}

/**
 * skb_dst_is_noref - Test if skb dst isn't refcounted
 * @skb: buffer
 */
static inline bool skb_dst_is_noref(const struct sk_buff *skb)
{
        return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
}

/**
 * skb_rtable - Returns the skb &rtable
 * @skb: buffer
 *
 * The result is the skb_dst() pointer cast to struct rtable.
 */
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        return (struct rtable *)dst;
}

/* When user space applications such as nft, tc, etc mangle skb->pkt_type,
 * only a conservative subset of the possible pkt_types may be set:
 * values up to and including PACKET_OTHERHOST are accepted.
 */
static inline bool skb_pkt_type_ok(u32 ptype)
{
        return !(ptype > PACKET_OTHERHOST);
}

/**
 * skb_napi_id - Returns the skb's NAPI id
 * @skb: buffer
 *
 * Returns skb->napi_id when CONFIG_NET_RX_BUSY_POLL is defined, and 0
 * otherwise.
 */
static inline unsigned int skb_napi_id(const struct sk_buff *skb)
{
        unsigned int id = 0;

#ifdef CONFIG_NET_RX_BUSY_POLL
        id = skb->napi_id;
#endif
        return id;
}

/**
 * skb_unref - decrement the skb's reference count
 * @skb: buffer
 *
 * Returns true if we can free the skb, false if @skb is NULL or other
 * references remain.
 */
static inline bool skb_unref(struct sk_buff *skb)
{
        if (unlikely(!skb))
                return false;

        if (likely(refcount_read(&skb->users) == 1)) {
                /* Only one reference left: no decrement is performed,
                 * just a read barrier after observing the count.
                 */
                smp_rmb();
                return true;
        }

        /* Drop our reference; only the caller that brings the count to
         * zero may free the skb.
         */
        if (likely(!refcount_dec_and_test(&skb->users)))
                return false;

        return true;
}

/* skb release and diagnostic helpers; only declared here, defined out of
 * line.
 */
void skb_release_head_state(struct sk_buff *skb);
void kfree_skb(struct sk_buff *skb);
void kfree_skb_list(struct sk_buff *segs);
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
void skb_tx_error(struct sk_buff *skb);

#ifdef CONFIG_TRACEPOINTS
/* With tracepoints enabled consume_skb() is an out-of-line function. */
void consume_skb(struct sk_buff *skb);
#else
/* Without tracepoints consume_skb() simply forwards to kfree_skb().
 *
 * kfree_skb() returns void, so it is called as a plain statement: ISO C
 * does not allow a return statement with an expression in a function
 * whose return type is void.
 */
static inline void consume_skb(struct sk_buff *skb)
{
        kfree_skb(skb);
}
#endif

/* Low-level free helpers and the sk_buff slab cache; defined elsewhere. */
void __consume_stateless_skb(struct sk_buff *skb);
void  __kfree_skb(struct sk_buff *skb);
extern struct kmem_cache *skbuff_head_cache;

void kfree_skb_partial(struct sk_buff *skb, bool head_stolen);
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
                      bool *fragstolen, int *delta_truesize);

/* Core allocator; alloc_skb() and alloc_skb_fclone() are thin wrappers
 * around it. @flags takes SKB_ALLOC_* bits.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
                            int node);
/* Constructors that wrap an skb around caller-provided @data; defined out
 * of line.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size);

/**
 * alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Convenience wrapper: calls __alloc_skb() with no SKB_ALLOC_* flags
 * and NUMA_NO_NODE as the node.
 */
static inline struct sk_buff *alloc_skb(unsigned int size,
                                        gfp_t priority)
{
        const int no_flags = 0;

        return __alloc_skb(size, priority, no_flags, NUMA_NO_NODE);
}

/* Further allocation entry points; only declared here, defined out of
 * line.
 */
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                                     unsigned long data_len,
                                     int max_page_order,
                                     int *errcode,
                                     gfp_t gfp_mask);
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first);

/* Layout of fast clones : [skb1][skb2][fclone_ref]
 *
 * skb_fclone_busy() uses container_of() on skb1 to get from an skb back
 * to this structure.
 */
struct sk_buff_fclones {
        struct sk_buff        skb1;

        struct sk_buff        skb2;

        refcount_t        fclone_ref;
};

/**
 *        skb_fclone_busy - check if fclone is busy
 *        @sk: socket
 *        @skb: buffer
 *
 * Returns true if skb is a fast clone, and its clone is not freed.
 * Some drivers call skb_orphan() in their ndo_start_xmit(),
 * so we also check that this didnt happen.
 */
static inline bool skb_fclone_busy(const struct sock *sk,
                                   const struct sk_buff *skb)
{
        const struct sk_buff_fclones *fclones;

        fclones = container_of(skb, struct sk_buff_fclones, skb1);

        return skb->fclone == SKB_FCLONE_ORIG &&
               refcount_read(&fclones->fclone_ref) > 1 &&
               fclones->skb2.sk == sk;
}

/**
 * alloc_skb_fclone - allocate a network buffer from fclone cache
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Thin wrapper around __alloc_skb() that passes %SKB_ALLOC_FCLONE as the
 * allocation flag and expresses no NUMA node preference (%NUMA_NO_NODE).
 *
 * Returns whatever __alloc_skb() returns.
 */
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
                                               gfp_t priority)
{
        const int fclone_flags = SKB_ALLOC_FCLONE;

        return __alloc_skb(size, priority, fclone_flags, NUMA_NO_NODE);
}

/* Clone/copy helpers implemented out of line; only declared here. */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
void skb_headers_offset_update(struct sk_buff *skb, int off);
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
/* Backend of __pskb_copy(), which calls it with @fclone == false. */
struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
                                   gfp_t gfp_mask, bool fclone);
/*
 * __pskb_copy - __pskb_copy_fclone() without requesting a fast clone
 * @skb: buffer to copy
 * @headroom: passed through to __pskb_copy_fclone()
 * @gfp_mask: allocation mask
 *
 * Returns whatever __pskb_copy_fclone() returns.
 */
static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom,
                                          gfp_t gfp_mask)
{
        const bool want_fclone = false;

        return __pskb_copy_fclone(skb, headroom, gfp_mask, want_fclone);
}

/*
 * Head/tail reshaping and scatterlist helpers implemented out of line.
 * pskb_expand_head() is what skb_unclone() and skb_header_unclone() below
 * call (with zero extra head and tail room) to get a private copy.
 */
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask);
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
                                     unsigned int headroom);
struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
                                int newtailroom, gfp_t priority);
/* The sgvec results are __must_check: callers may not ignore them. */
int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
                                     int offset, int len);
int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
                              int offset, int len);
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
/* Backend of skb_pad(), which always passes @free_on_error == true. */
int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error);

/**
 *        skb_pad                        -        zero pad the tail of an skb
 *        @skb: buffer to pad
 *        @pad: space to pad
 *
 *        Ensure that a buffer is followed by a padding area that is zero
 *        filled. Used by network drivers which may DMA or transfer data
 *        beyond the buffer end onto the wire.
 *
 *        May return error in out of memory cases. The skb is freed on error.
 */
static inline int skb_pad(struct sk_buff *skb, int pad)
{
        return __skb_pad(skb, pad, true);
}
/* dev_kfree_skb() is a plain alias for consume_skb(). */
#define dev_kfree_skb(a)        consume_skb(a)

int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
                         int offset, size_t size);

/*
 * Cursor handed to skb_prepare_seq_read(), skb_seq_read() and
 * skb_abort_seq_read() (declared below).
 * NOTE(review): the per-field meanings are defined by those out-of-line
 * functions and cannot be confirmed from this header section.
 */
struct skb_seq_state {
        __u32                lower_offset;
        __u32                upper_offset;
        __u32                frag_idx;
        __u32                stepped_offset;
        struct sk_buff        *root_skb;
        struct sk_buff        *cur_skb;
        __u8                *frag_data;
};

/* Sequential-read API operating on a caller-provided struct skb_seq_state. */
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
                          unsigned int to, struct skb_seq_state *st);
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
                          struct skb_seq_state *st);
void skb_abort_seq_read(struct skb_seq_state *st);

unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config);

/*
 * Packet hash types specify the type of hash in skb_set_hash.
 *
 * Hash types refer to the protocol layer addresses which are used to
 * construct a packet's hash. The hashes are used to differentiate or identify
 * flows of the protocol layer for the hash type. Hash types are either
 * layer-2 (L2), layer-3 (L3), or layer-4 (L4).
 *
 * Properties of hashes:
 *
 * 1) Two packets in different flows have different hash values
 * 2) Two packets in the same flow should have the same hash value
 *
 * A hash at a higher layer is considered to be more specific. A driver should
 * set the most specific hash possible.
 *
 * A driver cannot indicate a more specific hash than the layer at which a hash
 * was computed. For instance an L3 hash cannot be set as an L4 hash.
 *
 * A driver may indicate a hash level which is less specific than the
 * actual layer the hash was computed on. For instance, a hash computed
 * at L4 may be considered an L3 hash. This should only be done if the
 * driver can't unambiguously determine that the HW computed the hash at
 * the higher layer. Note that the "should" in the second property above
 * permits this.
 */
/*
 * Note: skb_set_hash() below only tests for PKT_HASH_TYPE_L4; every other
 * value is recorded as a non-L4 hash.
 */
enum pkt_hash_types {
        PKT_HASH_TYPE_NONE,        /* Undefined type */
        PKT_HASH_TYPE_L2,        /* Input: src_MAC, dest_MAC */
        PKT_HASH_TYPE_L3,        /* Input: src_IP, dst_IP */
        PKT_HASH_TYPE_L4,        /* Input: src_IP, dst_IP, src_port, dst_port */
};

/*
 * skb_clear_hash - forget any hash recorded on @skb
 * @skb: buffer to operate on
 *
 * Resets the hash value together with the sw_hash and l4_hash flags.
 */
static inline void skb_clear_hash(struct sk_buff *skb)
{
        skb->l4_hash = 0;
        skb->sw_hash = 0;
        skb->hash = 0;
}

/*
 * skb_clear_hash_if_not_l4 - drop the recorded hash unless it is an L4 hash
 * @skb: buffer to operate on
 */
static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
{
        if (skb->l4_hash)
                return;

        skb_clear_hash(skb);
}

/*
 * __skb_set_hash - record a hash value and its provenance flags on @skb
 * @skb: buffer to operate on
 * @hash: hash value to store
 * @is_sw: stored in skb->sw_hash
 * @is_l4: stored in skb->l4_hash
 */
static inline void
__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4)
{
        skb->hash = hash;
        skb->sw_hash = is_sw;
        skb->l4_hash = is_l4;
}

/*
 * skb_set_hash - used by drivers to set the hash from HW
 * @skb: buffer to operate on
 * @hash: hash value to store
 * @type: layer the hash was computed at
 *
 * The hash is recorded as not software-computed; it is flagged as an L4
 * hash only when @type is PKT_HASH_TYPE_L4.
 */
static inline void
skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
{
        const bool is_l4 = (type == PKT_HASH_TYPE_L4);

        __skb_set_hash(skb, hash, false, is_l4);
}

/*
 * __skb_set_sw_hash - record a software-computed hash on @skb
 * @skb: buffer to operate on
 * @hash: hash value to store
 * @is_l4: whether to flag the hash as an L4 hash
 */
static inline void
__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
{
        const bool is_sw = true;

        __skb_set_hash(skb, hash, is_sw, is_l4);
}

/*
 * Hash and flow helpers implemented out of line.  __skb_get_hash() is the
 * slow path that skb_get_hash() below falls back to when neither the
 * l4_hash nor the sw_hash flag is set.
 */
void __skb_get_hash(struct sk_buff *skb);
u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
u32 skb_get_poff(const struct sk_buff *skb);
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
                   const struct flow_keys_basic *keys, int hlen);
/* Backend of skb_flow_get_ports(), which passes NULL data and 0 hlen_proto. */
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
                            void *data, int hlen_proto);

/*
 * skb_flow_get_ports - __skb_flow_get_ports() without an external data buffer
 * @skb: buffer to read from
 * @thoff: passed through to __skb_flow_get_ports()
 * @ip_proto: passed through to __skb_flow_get_ports()
 *
 * Calls the backend with a NULL @data pointer and a zero @hlen_proto.
 */
static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
                                        int thoff, u8 ip_proto)
{
        void *no_data = NULL;

        return __skb_flow_get_ports(skb, thoff, ip_proto, no_data, 0);
}

/* Flow dissector entry points implemented out of line. */
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
                             const struct flow_dissector_key *key,
                             unsigned int key_count);

struct bpf_flow_dissector;
bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
                      __be16 proto, int nhoff, int hlen, unsigned int flags);

/*
 * Common backend of the skb_flow_dissect*() inline wrappers below, which
 * fill in NULL/zero for the arguments they do not expose.
 */
bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container,
                        void *data, __be16 proto, int nhoff, int hlen,
                        unsigned int flags);

/*
 * skb_flow_dissect - dissect @skb with a caller-chosen dissector
 * @skb: buffer to dissect
 * @flow_dissector: dissector to use
 * @target_container: where the dissector stores its result
 * @flags: passed through to __skb_flow_dissect()
 *
 * Calls __skb_flow_dissect() with no net, no external data buffer and
 * zero proto/nhoff/hlen.
 */
static inline bool skb_flow_dissect(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
                                    void *target_container, unsigned int flags)
{
        const struct net *no_net = NULL;
        void *no_data = NULL;

        return __skb_flow_dissect(no_net, skb, flow_dissector,
                                  target_container, no_data, 0, 0, 0, flags);
}

/*
 * skb_flow_dissect_flow_keys - dissect @skb into a struct flow_keys
 * @skb: buffer to dissect
 * @flow: result; zeroed before dissection starts
 * @flags: passed through to __skb_flow_dissect()
 *
 * Uses flow_keys_dissector, with no net, no external data buffer and
 * zero proto/nhoff/hlen.
 */
static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
                                              struct flow_keys *flow,
                                              unsigned int flags)
{
        const struct net *no_net = NULL;
        void *no_data = NULL;

        memset(flow, 0, sizeof(*flow));

        return __skb_flow_dissect(no_net, skb, &flow_keys_dissector,
                                  flow, no_data, 0, 0, 0, flags);
}

/*
 * skb_flow_dissect_flow_keys_basic - dissect into a struct flow_keys_basic
 * @net: passed through to __skb_flow_dissect()
 * @skb: buffer to dissect
 * @flow: result; zeroed before dissection starts
 * @data: passed through to __skb_flow_dissect()
 * @proto: passed through to __skb_flow_dissect()
 * @nhoff: passed through to __skb_flow_dissect()
 * @hlen: passed through to __skb_flow_dissect()
 * @flags: passed through to __skb_flow_dissect()
 *
 * Uses flow_keys_basic_dissector.
 */
static inline bool
skb_flow_dissect_flow_keys_basic(const struct net *net,
                                 const struct sk_buff *skb,
                                 struct flow_keys_basic *flow, void *data,
                                 __be16 proto, int nhoff, int hlen,
                                 unsigned int flags)
{
        struct flow_dissector *dissector = &flow_keys_basic_dissector;

        memset(flow, 0, sizeof(*flow));

        return __skb_flow_dissect(net, skb, dissector, flow,
                                  data, proto, nhoff, hlen, flags);
}

/*
 * Specialised dissect helpers implemented out of line; each one takes the
 * dissector and the container its result is written to.
 */
void skb_flow_dissect_meta(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

/* Gets the connection tracking info of an skb. @ctinfo_map should be a
 * map with @mapsize entries that translates enum ip_conntrack_info
 * states to user states.
 */
void
skb_flow_dissect_ct(const struct sk_buff *skb,
                    struct flow_dissector *flow_dissector,
                    void *target_container,
                    u16 *ctinfo_map,
                    size_t mapsize);
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
                             struct flow_dissector *flow_dissector,
                             void *target_container);

void skb_flow_dissect_hash(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

static inline __u32 skb_get_hash(struct sk_buff *skb)
{
        if (!skb->l4_hash && !skb->sw_hash)
                __skb_get_hash(skb);

        return skb->hash;
}

/*
 * skb_get_hash_flowi6 - return the hash of @skb, deriving it from @fl6 if needed
 * @skb: buffer to operate on
 * @fl6: flow description used when no hash is recorded yet
 *
 * When neither the l4_hash nor the sw_hash flag is set, a hash is computed
 * from @fl6 and stored as a software hash; it is flagged L4 when the
 * extracted keys carry L4 information.  Returns skb->hash.
 */
static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
{
        struct flow_keys keys;
        __u32 hash;

        if (skb->l4_hash || skb->sw_hash)
                return skb->hash;

        hash = __get_hash_from_flowi6(fl6, &keys);
        __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));

        return skb->hash;
}

/* Implemented out of line; takes a caller-supplied siphash key as @perturb. */
__u32 skb_get_hash_perturb(const struct sk_buff *skb,
                           const siphash_key_t *perturb);

/*
 * skb_get_hash_raw - return skb->hash as stored
 * @skb: buffer to read from
 *
 * Unlike skb_get_hash(), this never computes a hash and does not look at
 * the l4_hash/sw_hash flags.
 */
static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
{
        const __u32 stored = skb->hash;

        return stored;
}

/*
 * skb_copy_hash - copy the hash and its flags from one skb to another
 * @to: destination buffer
 * @from: source buffer
 *
 * Copies the hash value together with the sw_hash and l4_hash flags.
 */
static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
{
        to->hash = from->hash;
        to->sw_hash = from->sw_hash;
        to->l4_hash = from->l4_hash;
}

/*
 * skb_copy_decrypted - propagate the decrypted flag between skbs
 * @to: destination buffer
 * @from: source buffer
 *
 * Compiles to nothing unless CONFIG_TLS_DEVICE is enabled.
 */
static inline void skb_copy_decrypted(struct sk_buff *to,
                                      const struct sk_buff *from)
{
#ifdef CONFIG_TLS_DEVICE
        const bool was_decrypted = from->decrypted;

        to->decrypted = was_decrypted;
#endif
}

/*
 * skb_end_pointer() / skb_end_offset() - locate the end of the data area.
 *
 * With NET_SKBUFF_DATA_USES_OFFSET, skb->end is used as an offset from
 * skb->head; otherwise it is used as a pointer.  Both variants expose the
 * same pair of accessors.
 */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        return base + skb->end;
}

static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
        const unsigned int off = skb->end;

        return off;
}
#else
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
        unsigned char *end = skb->end;

        return end;
}

static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        return skb->end - base;
}
#endif

/* Internal: view the address returned by skb_end_pointer() as the skb's
 * struct skb_shared_info.
 */
#define skb_shinfo(SKB)        ((struct skb_shared_info *)(skb_end_pointer(SKB)))

static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
{
        return &skb_shinfo(skb)->hwtstamps;
}

/*
 * skb_zcopy - return the zerocopy ubuf_info attached to @skb, if any
 * @skb: buffer to inspect; may be NULL
 *
 * Returns skb_uarg(@skb) when @skb is non-NULL and SKBTX_DEV_ZEROCOPY is
 * set in its shared-info tx_flags, NULL otherwise.
 */
static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
{
        if (!skb)
                return NULL;

        if (!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY))
                return NULL;

        return skb_uarg(skb);
}

/*
 * skb_zcopy_set - attach a zerocopy ubuf_info to an skb
 * @skb: buffer to tag; may be NULL
 * @uarg: zerocopy state to attach; may be NULL
 * @have_ref: optional; when it points to true, the caller's reference is
 *            taken over (and the flag cleared) instead of taking a new one
 *
 * Does nothing if @skb or @uarg is NULL, or if @skb already reports a
 * zerocopy state via skb_zcopy().
 */
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
                                 bool *have_ref)
{
        if (!skb || !uarg || skb_zcopy(skb))
                return;

        if (unlikely(have_ref && *have_ref))
                *have_ref = false;
        else
                sock_zerocopy_get(uarg);

        skb_shinfo(skb)->destructor_arg = uarg;
        skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
}

/*
 * skb_zcopy_set_nouarg - mark @skb zerocopy with an opaque tagged value
 * @skb: buffer to tag
 * @val: opaque value to remember
 *
 * Stores @val in destructor_arg with bit 0 set as a marker (see
 * skb_zcopy_is_nouarg() / skb_zcopy_get_nouarg()), so @val itself must
 * not use bit 0.
 */
static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
{
        const uintptr_t tagged = (uintptr_t) val | 0x1UL;

        skb_shinfo(skb)->destructor_arg = (void *)tagged;
        skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
}

/*
 * skb_zcopy_is_nouarg - test the marker bit set by skb_zcopy_set_nouarg()
 * @skb: buffer to inspect
 *
 * Returns true when bit 0 of destructor_arg is set.
 */
static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb)
{
        const uintptr_t raw = (uintptr_t) skb_shinfo(skb)->destructor_arg;

        return (raw & 0x1UL) != 0;
}

/*
 * skb_zcopy_get_nouarg - recover the value stored by skb_zcopy_set_nouarg()
 * @skb: buffer to inspect
 *
 * Returns destructor_arg with the bit-0 marker masked off.
 */
static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
{
        const uintptr_t raw = (uintptr_t) skb_shinfo(skb)->destructor_arg;

        return (void *)(raw & ~0x1UL);
}

/*
 * skb_zcopy_clear - release a reference on a zerocopy structure
 * @skb: buffer whose zerocopy state is dropped
 * @zerocopy: completion status handed to the notification path
 *
 * No-op when skb_zcopy() reports nothing.  Otherwise:
 *  - a "nouarg" tagged value has no notification callback and is not
 *    dereferenced;
 *  - for sock_zerocopy_callback, the status is AND-ed into uarg->zerocopy
 *    and the reference dropped with sock_zerocopy_put();
 *  - any other callback is invoked directly.
 * In all three cases SKBTX_ZEROCOPY_FRAG is cleared afterwards.
 */
static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
{
        struct ubuf_info *uarg = skb_zcopy(skb);

        if (!uarg)
                return;

        if (!skb_zcopy_is_nouarg(skb)) {
                if (uarg->callback == sock_zerocopy_callback) {
                        uarg->zerocopy = uarg->zerocopy && zerocopy;
                        sock_zerocopy_put(uarg);
                } else {
                        uarg->callback(uarg, zerocopy);
                }
        }

        skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
}

/*
 * skb_zcopy_abort - abort a zerocopy operation and revert zckey on error
 * in a send syscall
 * @skb: buffer whose zerocopy state is aborted
 *
 * No-op when skb_zcopy() reports nothing; otherwise the state is handed
 * to sock_zerocopy_put_abort() and SKBTX_ZEROCOPY_FRAG is cleared.
 */
static inline void skb_zcopy_abort(struct sk_buff *skb)
{
        struct ubuf_info *uarg = skb_zcopy(skb);

        if (!uarg)
                return;

        sock_zerocopy_put_abort(uarg, false);
        skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
}

/*
 * skb_mark_not_on_list - flag @skb as not linked on any list
 * @skb: buffer to operate on
 *
 * Only clears skb->next; skb->prev is left untouched.
 */
static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
        skb->next = NULL;
}

/* Iterate through singly-linked GSO fragments of an skb.
 *
 * @first:    first skb of the chain (may be NULL, in which case the body
 *            never runs)
 * @skb:      loop cursor
 * @next_skb: scratch variable; it is loaded with (skb)->next before the
 *            loop body runs, so the body may change (skb)->next without
 *            breaking the walk
 */
#define skb_list_walk_safe(first, skb, next_skb)                               \
        for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb);  \
             (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL)

/*
 * skb_list_del_init - unlink @skb via its list node and mark it unlisted
 * @skb: buffer to remove
 *
 * The unlink has to run before skb->next is cleared, so the statement
 * order below must be kept.
 */
static inline void skb_list_del_init(struct sk_buff *skb)
{
        __list_del_entry(&skb->list);
        skb_mark_not_on_list(skb);
}

/**
 * skb_queue_empty - check if a queue is empty
 * @list: queue head
 *
 * The queue is empty when the head's next pointer refers back to the head
 * itself.  Returns true if the queue is empty, false otherwise.
 */
static inline int skb_queue_empty(const struct sk_buff_head *list)
{
        const struct sk_buff *head = (const struct sk_buff *) list;

        return list->next == head;
}

/**
 * skb_queue_empty_lockless - check if a queue is empty
 * @list: queue head
 *
 * Same test as skb_queue_empty(), but the next pointer is sampled with
 * READ_ONCE() so this variant can be used in lockless contexts.
 * Returns true if the queue is empty, false otherwise.
 */
static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list)
{
        const struct sk_buff *head = (const struct sk_buff *) list;

        return READ_ONCE(list->next) == head;
}


/**
 * skb_queue_is_last - check if skb is the last entry in the queue
 * @list: queue head
 * @skb: buffer
 *
 * Returns true if @skb is the last buffer on the list, i.e. its next
 * pointer refers to the queue head.
 */
static inline bool skb_queue_is_last(const struct sk_buff_head *list,
                                     const struct sk_buff *skb)
{
        const struct sk_buff *head = (const struct sk_buff *) list;

        return skb->next == head;
}

/**
 * skb_queue_is_first - check if skb is the first entry in the queue
 * @list: queue head
 * @skb: buffer
 *
 * Returns true if @skb is the first buffer on the list, i.e. its prev
 * pointer refers to the queue head.
 */
static inline bool skb_queue_is_first(const struct sk_buff_head *list,
                                      const struct sk_buff *skb)
{
        const struct sk_buff *head = (const struct sk_buff *) list;

        return skb->prev == head;
}

/**
 * skb_queue_next - return the next packet in the queue
 * @list: queue head
 * @skb: current buffer
 *
 * Return the next packet in @list after @skb.  It is only valid to
 * call this if skb_queue_is_last() evaluates to false; violating that
 * triggers BUG_ON().
 */
static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        struct sk_buff *next;

        /* Severe on purpose: returning here would hand the caller the
         * queue head disguised as an skb, i.e. garbage to dereference.
         */
        BUG_ON(skb_queue_is_last(list, skb));

        next = skb->next;
        return next;
}

/**
 * skb_queue_prev - return the prev packet in the queue
 * @list: queue head
 * @skb: current buffer
 *
 * Return the prev packet in @list before @skb.  It is only valid to
 * call this if skb_queue_is_first() evaluates to false; violating that
 * triggers BUG_ON().
 */
static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        struct sk_buff *prev;

        /* Severe on purpose: returning here would hand the caller the
         * queue head disguised as an skb, i.e. garbage to dereference.
         */
        BUG_ON(skb_queue_is_first(list, skb));

        prev = skb->prev;
        return prev;
}

/**
 *        skb_get - reference buffer
 *        @skb: buffer to reference
 *
 *        Makes another reference to a socket buffer (increments
 *        @skb->users) and returns a pointer to the same buffer.
 */
static inline struct sk_buff *skb_get(struct sk_buff *skb)
{
        refcount_inc(&skb->users);
        return skb;
}

/*
 * If users == 1, we are the only owner and can avoid redundant atomic changes.
 */

/**
 * skb_cloned - is the buffer a clone
 * @skb: buffer to check
 *
 * Returns true if the buffer was generated with skb_clone() and is
 * one of multiple shared copies of the buffer. Cloned buffers are
 * shared data so must not be written to under normal circumstances.
 *
 * Concretely: the cloned flag is set and the low (SKB_DATAREF_MASK) part
 * of the shared-info dataref counter is not 1.
 */
static inline int skb_cloned(const struct sk_buff *skb)
{
        int refs;

        if (!skb->cloned)
                return 0;

        refs = atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK;
        return refs != 1;
}

/*
 * skb_unclone - make sure @skb does not share its data with a clone
 * @skb: buffer to operate on
 * @pri: allocation mask; may sleep if it allows blocking
 *
 * Returns 0 when @skb is not cloned; otherwise returns the result of
 * pskb_expand_head() called with no extra head or tail room.
 */
static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/**
 * skb_header_cloned - is the header a clone
 * @skb: buffer to check
 *
 * Returns true if modifying the header part of the buffer requires
 * the data to be copied.
 *
 * The dataref counter packs two counts: the part selected by
 * SKB_DATAREF_MASK and the part above SKB_DATAREF_SHIFT.  The header is
 * considered cloned when their difference is not 1.
 */
static inline int skb_header_cloned(const struct sk_buff *skb)
{
        int dataref, low, high;

        if (!skb->cloned)
                return 0;

        dataref = atomic_read(&skb_shinfo(skb)->dataref);
        low = dataref & SKB_DATAREF_MASK;
        high = dataref >> SKB_DATAREF_SHIFT;

        return (low - high) != 1;
}

/*
 * skb_header_unclone - make sure the header of @skb is private
 * @skb: buffer to operate on
 * @pri: allocation mask; may sleep if it allows blocking
 *
 * Returns 0 when the header is not cloned; otherwise returns the result
 * of pskb_expand_head() called with no extra head or tail room.
 */
static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_header_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/**
 * __skb_header_release - release reference to header
 * @skb: buffer to operate on
 *
 * Sets the nohdr flag and overwrites the dataref counter with one count
 * in the low part plus one count in the part above SKB_DATAREF_SHIFT.
 */
static inline void __skb_header_release(struct sk_buff *skb)
{
        const int released = 1 + (1 << SKB_DATAREF_SHIFT);

        skb->nohdr = 1;
        atomic_set(&skb_shinfo(skb)->dataref, released);
}


/**
 * skb_shared - is the buffer shared
 * @skb: buffer to check
 *
 * Returns true if more than one person has a reference to this
 * buffer, i.e. the users refcount is anything other than 1.
 */
static inline int skb_shared(const struct sk_buff *skb)
{
        const unsigned int users = refcount_read(&skb->users);

        return users != 1;
}

/**
 * skb_share_check - check if buffer is shared and if so clone it
 * @skb: buffer to check
 * @pri: priority for memory allocation
 *
 * If the buffer is shared the buffer is cloned and the old copy
 * drops a reference. A new clone with a single reference is returned.
 * If the buffer is not shared the original buffer is returned. When
 * being called from interrupt status or with spinlocks held pri must
 * be GFP_ATOMIC.
 *
 * NULL is returned on a memory allocation failure; the reference on the
 * original buffer is dropped in that case too.
 */
static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
{
        struct sk_buff *nskb;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_shared(skb))
                return skb;

        nskb = skb_clone(skb, pri);
        if (likely(nskb))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return nskb;
}

/*
 *        Copy shared buffers into a new sk_buff. We effectively do COW on
 *        packets to handle cases where we have a local reader and forward
 *        and a couple of other messy ones. The normal one is tcpdumping
 *        a packet that's being forwarded.
 */

/**
 * skb_unshare - make a copy of a shared buffer
 * @skb: buffer to check
 * @pri: priority for memory allocation
 *
 * If the socket buffer is a clone then this function creates a new
 * copy of the data, drops a reference count on the old copy and returns
 * the new copy with the reference count at 1. If the buffer is not a clone
 * the original buffer is returned. When called with a spinlock held or
 * from interrupt state @pri must be %GFP_ATOMIC
 *
 * %NULL is returned on a memory allocation failure; the reference on the
 * original buffer is dropped in that case too.
 */
static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
                                          gfp_t pri)
{
        struct sk_buff *nskb;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return skb;

        nskb = skb_copy(skb, pri);

        /* Free our shared copy */
        if (likely(nskb))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return nskb;
}

/**
 * skb_peek - peek at the head of an &sk_buff_head
 * @list_: list to peek at
 *
 * Peek an &sk_buff. Unlike most other operations you _MUST_
 * be careful with this one. A peek leaves the buffer on the
 * list and someone else may run off with it. You must hold
 * the appropriate locks or have a private queue to do this.
 *
 * Returns %NULL for an empty list or a pointer to the head element.
 * The reference count is not incremented and the reference is therefore
 * volatile. Use with caution.
 */
static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_)
{
        struct sk_buff *first = list_->next;

        return (first == (struct sk_buff *)list_) ? NULL : first;
}

/**
 * __skb_peek - peek at the head of a non-empty &sk_buff_head
 * @list_: list to peek at
 *
 * Like skb_peek(), but the caller knows that the list is not empty, so
 * no emptiness check is made and the head's next pointer is returned
 * as is.
 */
static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_)
{
        struct sk_buff *first = list_->next;

        return first;
}

/**
 * skb_peek_next - peek skb following the given one from a queue
 * @skb: skb to start from
 * @list_: list to peek at
 *
 * Returns %NULL when the end of the list is met or a pointer to the
 * next element. The reference count is not incremented and the
 * reference is therefore volatile. Use with caution.
 */
static inline struct sk_buff *skb_peek_next(struct sk_buff *skb,
                const struct sk_buff_head *list_)
{
        struct sk_buff *following = skb->next;

        return (following == (struct sk_buff *)list_) ? NULL : following;
}

/**
 * skb_peek_tail - peek at the tail of an &sk_buff_head
 * @list_: list to peek at
 *
 * Peek an &sk_buff. Unlike most other operations you _MUST_
 * be careful with this one. A peek leaves the buffer on the
 * list and someone else may run off with it. You must hold
 * the appropriate locks or have a private queue to do this.
 *
 * Returns %NULL for an empty list or a pointer to the tail element.
 * The reference count is not incremented and the reference is therefore
 * volatile. Use with caution.
 */
static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
{
        /* Sampled exactly once with READ_ONCE(). */
        struct sk_buff *last = READ_ONCE(list_->prev);

        return (last == (struct sk_buff *)list_) ? NULL : last;
}

/**
 * skb_queue_len - get queue length
 * @list_: list to measure
 *
 * Return the length of an &sk_buff queue, as recorded in its qlen field.
 */
static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
{
        const __u32 qlen = list_->qlen;

        return qlen;
}

/**
 * skb_queue_len_lockless - get queue length
 * @list_: list to measure
 *
 * Return the length of an &sk_buff queue.
 * The qlen field is sampled with READ_ONCE(), so this variant can be
 * used in lockless contexts.
 */
static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_)
{
        const __u32 qlen = READ_ONCE(list_->qlen);

        return qlen;
}

/**
 * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head
 * @list: queue to initialize
 *
 * This initializes only the list and queue length aspects of
 * an sk_buff_head object.  This allows one to initialize the list
 * aspects of an sk_buff_head without reinitializing things like
 * the spinlock.  It can also be used for on-stack sk_buff_head
 * objects where the spinlock is known to not be used.
 *
 * After the call both link pointers refer to the head itself and the
 * length is zero.
 */
static inline void __skb_queue_head_init(struct sk_buff_head *list)
{
        struct sk_buff *self = (struct sk_buff *)list;

        list->next = self;
        list->prev = self;
        list->qlen = 0;
}

/*
 * skb_queue_head_init - fully initialize an sk_buff_head
 * @list: queue to initialize
 *
 * Initializes the spinlock, then the list pointers and length.
 *
 * This function creates a split out lock class for each invocation;
 * this is needed for now since a whole lot of users of the skb-queue
 * infrastructure in drivers have different locking usage (in hardirq)
 * than the networking core (in softirq only). In the long run either the
 * network layer or drivers should be annotated to consolidate the
 * main types of usage into 3 classes.
 */
static inline void skb_queue_head_init(struct sk_buff_head *list)
{
        spin_lock_init(&list->lock);
        __skb_queue_head_init(list);
}

/*
 * skb_queue_head_init_class - initialize a queue with a caller-chosen lock class
 * @list: queue to initialize
 * @class: lockdep class key to assign to the queue's spinlock
 *
 * Performs the regular skb_queue_head_init() and then overrides the lock
 * class of @list->lock with @class.
 */
static inline void skb_queue_head_init_class(struct sk_buff_head *list,
                struct lock_class_key *class)
{
        skb_queue_head_init(list);
        lockdep_set_class(&list->lock, class);
}

/*
 *        Insert an sk_buff on a list.
 *
 *        The "__skb_xxxx()" functions are the non-atomic ones that
 *        can only be called with interrupts disabled.
 *
 *        @newsk: buffer to insert
 *        @prev:  element that will precede @newsk
 *        @next:  element that will follow @newsk
 *        @list:  queue whose length is incremented
 *
 *        @newsk's own links are written before the neighbours are pointed
 *        at it, and qlen is bumped last; keep this order.
 */
static inline void __skb_insert(struct sk_buff *newsk,
                                struct sk_buff *prev, struct sk_buff *next,
                                struct sk_buff_head *list)
{
        /* See skb_queue_empty_lockless() and skb_peek_tail()
         * for the opposite READ_ONCE()
         */
        WRITE_ONCE(newsk->next, next);
        WRITE_ONCE(newsk->prev, prev);
        WRITE_ONCE(next->prev, newsk);
        WRITE_ONCE(prev->next, newsk);
        WRITE_ONCE(list->qlen, list->qlen + 1);
}

/*
 * __skb_queue_splice - link every element of @list between @prev and @next
 * @list: source queue; the caller must ensure it is not empty
 * @prev: element that will precede the first element of @list
 * @next: element that will follow the last element of @list
 *
 * Only rewrites the four boundary links.  Neither queue length is touched
 * and @list itself is not reinitialised; the skb_queue_splice*() wrappers
 * take care of that.
 */
static inline void __skb_queue_splice(const struct sk_buff_head *list,
                                      struct sk_buff *prev,
                                      struct sk_buff *next)
{
        struct sk_buff *first = list->next;
        struct sk_buff *last = list->prev;

        /* hook the front of the spliced run onto @prev */
        WRITE_ONCE(first->prev, prev);
        WRITE_ONCE(prev->next, first);

        /* hook the back of the spliced run onto @next */
        WRITE_ONCE(last->next, next);
        WRITE_ONCE(next->prev, last);
}

/**
 * skb_queue_splice - join two skb lists, this is designed for stacks
 * @list: the new list to add
 * @head: the place to add it in the first list
 *
 * The elements of @list are inserted at the front of @head and @head's
 * length is increased accordingly.  @list itself is left untouched, so it
 * must not be used as a queue afterwards without being reinitialised.
 * An empty @list is a no-op.
 */
static inline void skb_queue_splice(const struct sk_buff_head *list,
                                    struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *) head, head->next);
        head->qlen += list->qlen;
}

/**
 * skb_queue_splice_init - join two skb lists and reinitialise the emptied list
 * @list: the new list to add
 * @head: the place to add it in the first list
 *
 * The elements of @list are inserted at the front of @head and @head's
 * length is increased accordingly.  The list at @list is reinitialised.
 * An empty @list is a no-op.
 */
static inline void skb_queue_splice_init(struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *) head, head->next);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 * skb_queue_splice_tail - join two skb lists, each list being a queue
 * @list: the new list to add
 * @head: the place to add it in the first list
 *
 * The elements of @list are appended at the tail of @head and @head's
 * length is increased accordingly.  @list itself is left untouched, so it
 * must not be used as a queue afterwards without being reinitialised.
 * An empty @list is a no-op.
 */
static inline void skb_queue_splice_tail(const struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
        head->qlen += list->qlen;
}

/**
 * skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
 * @list: the new list to add
 * @head: the place to add it in the first list
 *
 * Each of the lists is a queue.  The elements of @list are appended at
 * the tail of @head and @head's length is increased accordingly.
 * The list at @list is reinitialised.  An empty @list is a no-op.
 */
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
                                              struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 * __skb_queue_after - queue a buffer after a given list element
 * @list: list to use
 * @prev: place after this buffer
 * @newsk: buffer to queue
 *
 * Queue a buffer in the middle of a list, directly after @prev. This
 * function takes no locks and you must therefore hold required locks
 * before calling it.
 *
 * A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_after(struct sk_buff_head *list,
                                     struct sk_buff *prev,
                                     struct sk_buff *newsk)
{
        struct sk_buff *next = prev->next;

        __skb_insert(newsk, prev, next, list);
}

/* Implemented out of line; only declared here. */
void skb_append(struct sk_buff *old, struct sk_buff *newsk,
                struct sk_buff_head *list);

/*
 * __skb_queue_before - queue a buffer before a given list element
 * @list: list to use
 * @next: place before this buffer
 * @newsk: buffer to queue
 *
 * Takes no locks; the caller must hold whatever lock protects @list.
 */
static inline void __skb_queue_before(struct sk_buff_head *list,
                                      struct sk_buff *next,
                                      struct sk_buff *newsk)
{
        struct sk_buff *prev = next->prev;

        __skb_insert(newsk, prev, next, list);
}

/**
 * __skb_queue_head - queue a buffer at the list head
 * @list: list to use
 * @newsk: buffer to queue
 *
 * Queue a buffer at the start of a list, i.e. directly after the queue
 * head itself. This function takes no locks and you must therefore hold
 * required locks before calling it.
 *
 * A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_head(struct sk_buff_head *list,
                                    struct sk_buff *newsk)
{
        struct sk_buff *head = (struct sk_buff *)list;

        __skb_queue_after(list, head, newsk);
}
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);

/**
 * __skb_queue_tail - queue a buffer at the list tail
 * @list: list to use
 * @newsk: buffer to queue
 *
 * Queue a buffer at the end of a list, i.e. directly before the queue
 * head itself. This function takes no locks and you must therefore hold
 * required locks before calling it.
 *
 * A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_tail(struct sk_buff_head *list,
                                   struct sk_buff *newsk)
{
        struct sk_buff *head = (struct sk_buff *)list;

        __skb_queue_before(list, head, newsk);
}
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk);

/*
 * Remove an sk_buff from a list. _Must_ be called atomically, and with
 * the list known.
 *
 * @skb:  buffer to remove; its next/prev pointers are cleared
 * @list: queue @skb is on; its length is decremented
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list);
static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
        struct sk_buff *next, *prev;

        WRITE_ONCE(list->qlen, list->qlen - 1);
        /* remember the neighbours before the skb's own links are wiped */
        next           = skb->next;
        prev           = skb->prev;
        skb->next  = skb->prev = NULL;
        /* close the gap */
        WRITE_ONCE(next->prev, prev);
        WRITE_ONCE(prev->next, next);
}

/**
 * __skb_dequeue - remove from the head of the queue
 * @list: list to dequeue from
 *
 * Remove the head of the list. This function does not take any locks
 * so must be used with appropriate locks held only. The head item is
 * returned or %NULL if the list is empty.
 */
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *first = skb_peek(list);

        if (!first)
                return NULL;

        __skb_unlink(first, list);
        return first;
}
struct sk_buff *skb_dequeue(struct sk_buff_head *list);

/**
 *        __skb_dequeue_tail - detach and return the last buffer of a queue
 *        @list: list to dequeue from
 *
 *        Unlinks the buffer at the tail of @list and hands it back to the
 *        caller. No lock is taken, so the caller must hold the
 *        appropriate locks. Returns %NULL if the list is empty.
 */
static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
{
        struct sk_buff *last = skb_peek_tail(list);

        if (!last)
                return NULL;

        __skb_unlink(last, list);
        return last;
}
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list);


/* Returns true when @skb has a non-zero data_len. */
static inline bool skb_is_nonlinear(const struct sk_buff *skb)
{
        return skb->data_len != 0;
}

/* Returns skb->len minus skb->data_len. */
static inline unsigned int skb_headlen(const struct sk_buff *skb)
{
        return skb->len - skb->data_len;
}

/* Returns the summed size of all nr_frags page fragments of @skb. */
static inline unsigned int __skb_pagelen(const struct sk_buff *skb)
{
        unsigned int nr = skb_shinfo(skb)->nr_frags;
        unsigned int total = 0;
        unsigned int idx;

        for (idx = 0; idx < nr; idx++)
                total += skb_frag_size(&skb_shinfo(skb)->frags[idx]);

        return total;
}

/* Returns skb_headlen() plus the total size of the page fragments. */
static inline unsigned int skb_pagelen(const struct sk_buff *skb)
{
        unsigned int linear = skb_headlen(skb);
        unsigned int paged = __skb_pagelen(skb);

        return linear + paged;
}

/**
 * __skb_fill_page_desc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data within @page
 * @size: the length of the data
 *
 * Initialises the @i'th fragment of @skb to point to @size bytes at
 * offset @off within @page. If the compound head of @page is a
 * pfmemalloc page, @skb is flagged pfmemalloc as well.
 *
 * Does not take any additional reference on the fragment.
 */
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
                                        struct page *page, int off, int size)
{
        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

        frag->bv_page = page;
        frag->bv_offset = off;
        skb_frag_size_set(frag, size);

        /*
         * Propagate page pfmemalloc to the skb if we can. The problem is
         * that not all callers have unique ownership of the page but rely
         * on page_is_pfmemalloc doing the right thing(tm).
         */
        if (page_is_pfmemalloc(compound_head(page)))
                skb->pfmemalloc = true;
}

/**
 * skb_fill_page_desc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data within @page
 * @size: the length of the data
 *
 * As per __skb_fill_page_desc() -- initialises the @i'th fragment of
 * @skb to point to @size bytes at offset @off within @page. In
 * addition updates @skb such that @i is the last fragment.
 *
 * Does not take any additional reference on the fragment.
 */
static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
                                      struct page *page, int off, int size)
{
        __skb_fill_page_desc(skb, i, page, off, size);
        /* The fragment count is overwritten, not incremented. */
        skb_shinfo(skb)->nr_frags = i + 1;
}

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
                     int size, unsigned int truesize);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize);

/* BUG()s if @skb is non-linear according to skb_is_nonlinear(). */
#define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))

/*
 * Tail pointer helpers. With NET_SKBUFF_DATA_USES_OFFSET defined,
 * skb->tail is handled as an offset relative to skb->head; otherwise it
 * is handled as a pointer in its own right.
 */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
/* Returns the tail as a pointer: skb->head plus the stored tail offset. */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        return skb->head + skb->tail;
}

/* Moves the tail to the current skb->data position. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        skb->tail = skb->data - skb->head;
}

/* Places the tail @offset bytes past the current skb->data position. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        skb_reset_tail_pointer(skb);
        skb->tail += offset;
}

#else /* NET_SKBUFF_DATA_USES_OFFSET */
/* Returns the stored tail pointer as is. */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        return skb->tail;
}

/* Moves the tail to the current skb->data position. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        skb->tail = skb->data;
}

/* Places the tail @offset bytes past the current skb->data position. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        skb->tail = skb->data + offset;
}

#endif /* NET_SKBUFF_DATA_USES_OFFSET */

/*
 * Debug check for zero-length buffers. Only active when CONFIG_DEBUG_NET
 * is defined; otherwise the body is empty. Warns (once) naming the
 * calling function when skb->len is zero and then invokes skb_dump() at
 * KERN_ERR through DO_ONCE_LITE (presumably also limited to one
 * invocation - confirm against DO_ONCE_LITE).
 */
static inline void skb_assert_len(struct sk_buff *skb)
{
#ifdef CONFIG_DEBUG_NET
        if (WARN_ONCE(!skb->len, "%s\n", __func__))
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
#endif /* CONFIG_DEBUG_NET */
}

/*
 *        Add data to an sk_buff
 */
void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len);
void *skb_put(struct sk_buff *skb, unsigned int len);
/*
 * Extends the data area of a linear @skb by @len bytes at the tail and
 * returns a pointer to the start of the newly added region (the tail
 * position before the extension). No room check is made here.
 */
static inline void *__skb_put(struct sk_buff *skb, unsigned int len)
{
        void *old_tail;

        old_tail = skb_tail_pointer(skb);
        SKB_LINEAR_ASSERT(skb);

        skb->tail += len;
        skb->len += len;

        return old_tail;
}

/* __skb_put() followed by zeroing the @len bytes that were added. */
static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        void *area;

        area = __skb_put(skb, len);
        memset(area, 0, len);

        return area;
}

/* __skb_put() followed by copying @len bytes from @data into the new area. */
static inline void *__skb_put_data(struct sk_buff *skb, const void *data,
                                   unsigned int len)
{
        void *area;

        area = __skb_put(skb, len);
        memcpy(area, data, len);

        return area;
}

/* Appends the single byte @val to @skb using __skb_put(). */
static inline void __skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *slot = __skb_put(skb, 1);

        *slot = val;
}

/* skb_put() followed by zeroing the @len bytes that were added. */
static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        void *area;

        area = skb_put(skb, len);
        memset(area, 0, len);

        return area;
}

/* skb_put() followed by copying @len bytes from @data into the new area. */
static inline void *skb_put_data(struct sk_buff *skb, const void *data,
                                 unsigned int len)
{
        void *area;

        area = skb_put(skb, len);
        memcpy(area, data, len);

        return area;
}

/* Appends the single byte @val to @skb using skb_put(). */
static inline void skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *slot = skb_put(skb, 1);

        *slot = val;
}

void *skb_push(struct sk_buff *skb, unsigned int len);
/*
 * Grows the data area by @len bytes at the front: skb->data moves back
 * by @len and skb->len grows by @len. Returns the new skb->data. No
 * headroom check is made here.
 */
static inline void *__skb_push(struct sk_buff *skb, unsigned int len)
{
        skb->len += len;
        return skb->data -= len;
}

void *skb_pull(struct sk_buff *skb, unsigned int len);
/*
 * Drops @len bytes from the front of the data area: skb->len shrinks by
 * @len and skb->data advances by @len. BUG()s if the reduced length
 * falls below skb->data_len. Returns the new skb->data.
 */
static inline void *__skb_pull(struct sk_buff *skb, unsigned int len)
{
        skb->len -= len;
        BUG_ON(skb->len < skb->data_len);

        skb->data += len;
        return skb->data;
}

/*
 * Length-checked __skb_pull(): returns NULL without touching @skb when
 * @len exceeds skb->len, otherwise the result of __skb_pull().
 */
static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len)
{
        if (unlikely(len > skb->len))
                return NULL;

        return __skb_pull(skb, len);
}

void *__pskb_pull_tail(struct sk_buff *skb, int delta);

/*
 * Pulls @len bytes off the front of @skb. When @len exceeds
 * skb_headlen(), __pskb_pull_tail() is first asked for the missing
 * bytes; if that fails, NULL is returned and len/data are not adjusted
 * here. Returns the new skb->data on success.
 */
static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len)
{
        unsigned int linear = skb_headlen(skb);

        if (len > linear) {
                if (!__pskb_pull_tail(skb, len - linear))
                        return NULL;
        }

        skb->len -= len;
        skb->data += len;
        return skb->data;
}

/*
 * Length-checked __pskb_pull(): returns NULL when @len exceeds
 * skb->len, otherwise the result of __pskb_pull().
 */
static inline void *pskb_pull(struct sk_buff *skb, unsigned int len)
{
        if (unlikely(len > skb->len))
                return NULL;

        return __pskb_pull(skb, len);
}

/*
 * Checks that the first @len bytes of @skb can be accessed through the
 * linear area. Returns true right away when @len is within
 * skb_headlen(), false when @len exceeds skb->len, and otherwise
 * whether __pskb_pull_tail() succeeded for the missing bytes.
 */
static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
        unsigned int linear = skb_headlen(skb);

        if (likely(len <= linear))
                return true;

        if (unlikely(len > skb->len))
                return false;

        return __pskb_pull_tail(skb, len - linear) ? true : false;
}

void skb_condense(struct sk_buff *skb);

/**
 *        skb_headroom - bytes at buffer head
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the head of an &sk_buff,
 *        computed as the distance from skb->head to skb->data.
 */
static inline unsigned int skb_headroom(const struct sk_buff *skb)
{
        return skb->data - skb->head;
}

/**
 *        skb_tailroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff
 *        (skb->end minus skb->tail). A non-linear buffer always reports 0.
 */
static inline int skb_tailroom(const struct sk_buff *skb)
{
        if (skb_is_nonlinear(skb))
                return 0;

        return skb->end - skb->tail;
}

/**
 *        skb_availroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff
 *        allocated by sk_stream_alloc(), i.e. the tail room less
 *        skb->reserved_tailroom. A non-linear buffer always reports 0.
 */
static inline int skb_availroom(const struct sk_buff *skb)
{
        if (!skb_is_nonlinear(skb))
                return skb->end - skb->tail - skb->reserved_tailroom;

        return 0;
}

/**
 *        skb_reserve - adjust headroom
 *        @skb: buffer to alter
 *        @len: bytes to move
 *
 *        Grow the headroom of an empty &sk_buff at the expense of its tail
 *        room by advancing both skb->data and skb->tail by @len. This is
 *        only allowed for an empty buffer.
 */
static inline void skb_reserve(struct sk_buff *skb, int len)
{
        /* Both markers move together, so the data length is unaffected. */
        skb->tail += len;
        skb->data += len;
}

/**
 *        skb_tailroom_reserve - adjust reserved_tailroom
 *        @skb: buffer to alter
 *        @mtu: maximum amount of headlen permitted
 *        @needed_tailroom: minimum amount of reserved_tailroom
 *
 *        Set reserved_tailroom so that headlen can be as large as possible but
 *        not larger than mtu and tailroom cannot be smaller than
 *        needed_tailroom.
 *        The required headroom should already have been reserved before using
 *        this function.
 */
static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu,
                                        unsigned int needed_tailroom)
{
        int room;

        SKB_LINEAR_ASSERT(skb);
        room = skb_tailroom(skb);

        /*
         * If @mtu fits below the room left after @needed_tailroom, reserve
         * everything beyond @mtu (use at most mtu); otherwise reserve just
         * @needed_tailroom (use up to all available space).
         */
        skb->reserved_tailroom = (mtu < room - needed_tailroom) ?
                                 room - mtu : needed_tailroom;
}

/*
 * Values for skb->inner_protocol_type, stored by skb_set_inner_protocol()
 * (ETHER) and skb_set_inner_ipproto() (IPPROTO) respectively.
 */
#define ENCAP_TYPE_ETHER        0
#define ENCAP_TYPE_IPPROTO        1

/* Records @protocol as the inner protocol and tags it ENCAP_TYPE_ETHER. */
static inline void skb_set_inner_protocol(struct sk_buff *skb,
                                          __be16 protocol)
{
        skb->inner_protocol = protocol;
        skb->inner_protocol_type = ENCAP_TYPE_ETHER;
}

/* Records @ipproto as the inner IP protocol and tags it ENCAP_TYPE_IPPROTO. */
static inline void skb_set_inner_ipproto(struct sk_buff *skb,
                                         __u8 ipproto)
{
        skb->inner_ipproto = ipproto;
        skb->inner_protocol_type = ENCAP_TYPE_IPPROTO;
}

/*
 * Copies the current mac, network and transport header positions into
 * the corresponding inner_* header fields.
 */
static inline void skb_reset_inner_headers(struct sk_buff *skb)
{
        skb->inner_mac_header = skb->mac_header;
        skb->inner_network_header = skb->network_header;
        skb->inner_transport_header = skb->transport_header;
}

/* Sets mac_len to the distance from the mac header to the network header. */
static inline void skb_reset_mac_len(struct sk_buff *skb)
{
        skb->mac_len = skb->network_header - skb->mac_header;
}

/* Returns skb->head plus the stored inner transport header offset. */
static inline unsigned char *skb_inner_transport_header(const struct sk_buff
                                                        *skb)
{
        return skb->head + skb->inner_transport_header;
}

/* Returns the inner transport header position relative to skb->data. */
static inline int skb_inner_transport_offset(const struct sk_buff *skb)
{
        return skb_inner_transport_header(skb) - skb->data;
}

/* Points the inner transport header at the current skb->data position. */
static inline void skb_reset_inner_transport_header(struct sk_buff *skb)
{
        skb->inner_transport_header = skb->data - skb->head;
}

/* Points the inner transport header @offset bytes from skb->data. */
static inline void skb_set_inner_transport_header(struct sk_buff *skb,
                                                   const int offset)
{
        skb_reset_inner_transport_header(skb);
        skb->inner_transport_header += offset;
}

/* Returns skb->head plus the stored inner network header offset. */
static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb)
{
        return skb->head + skb->inner_network_header;
}

/* Points the inner network header at the current skb->data position. */
static inline void skb_reset_inner_network_header(struct sk_buff *skb)
{
        skb->inner_network_header = skb->data - skb->head;
}

/* Points the inner network header @offset bytes from skb->data. */
static inline void skb_set_inner_network_header(struct sk_buff *skb,
                                                const int offset)
{
        skb_reset_inner_network_header(skb);
        skb->inner_network_header += offset;
}

/* Returns skb->head plus the stored inner mac header offset. */
static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
{
        return skb->head + skb->inner_mac_header;
}

/* Points the inner mac header at the current skb->data position. */
static inline void skb_reset_inner_mac_header(struct sk_buff *skb)
{
        skb->inner_mac_header = skb->data - skb->head;
}

/* Points the inner mac header @offset bytes from skb->data. */
static inline void skb_set_inner_mac_header(struct sk_buff *skb,
                                            const int offset)
{
        skb_reset_inner_mac_header(skb);
        skb->inner_mac_header += offset;
}
/*
 * Returns true unless transport_header holds the all-ones value (~0U
 * truncated to the field's own type).
 */
static inline bool skb_transport_header_was_set(const struct sk_buff *skb)
{
        return skb->transport_header != (typeof(skb->transport_header))~0U;
}

/* Returns skb->head plus the stored transport header offset. */
static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
        return skb->head + skb->transport_header;
}

/* Points the transport header at the current skb->data position. */
static inline void skb_reset_transport_header(struct sk_buff *skb)
{
        skb->transport_header = skb->data - skb->head;
}

/* Points the transport header @offset bytes from skb->data. */
static inline void skb_set_transport_header(struct sk_buff *skb,
                                            const int offset)
{
        skb_reset_transport_header(skb);
        skb->transport_header += offset;
}

/* Returns skb->head plus the stored network header offset. */
static inline unsigned char *skb_network_header(const struct sk_buff *skb)
{
        return skb->head + skb->network_header;
}

/* Points the network header at the current skb->data position. */
static inline void skb_reset_network_header(struct sk_buff *skb)
{
        skb->network_header = skb->data - skb->head;
}

/* Points the network header @offset bytes from skb->data. */
static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
{
        skb_reset_network_header(skb);
        skb->network_header += offset;
}

/* Returns skb->head plus the stored mac header offset. */
static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
{
        return skb->head + skb->mac_header;
}

/* Returns the mac header position relative to skb->data. */
static inline int skb_mac_offset(const struct sk_buff *skb)
{
        return skb_mac_header(skb) - skb->data;
}

/* Returns the distance from the mac header to the network header. */
static inline u32 skb_mac_header_len(const struct sk_buff *skb)
{
        return skb->network_header - skb->mac_header;
}

/*
 * Returns non-zero unless mac_header holds the all-ones value that
 * skb_unset_mac_header() stores.
 */
static inline int skb_mac_header_was_set(const struct sk_buff *skb)
{
        return skb->mac_header != (typeof(skb->mac_header))~0U;
}

/*
 * Marks the mac header as unset by storing the all-ones value that
 * skb_mac_header_was_set() tests for.
 */
static inline void skb_unset_mac_header(struct sk_buff *skb)
{
        skb->mac_header = (typeof(skb->mac_header))~0U;
}

/* Points the mac header at the current skb->data position. */
static inline void skb_reset_mac_header(struct sk_buff *skb)
{
        skb->mac_header = skb->data - skb->head;
}

/*
 * Points the mac header @offset bytes from skb->data. @offset may be
 * negative, as used by skb_mac_header_rebuild().
 */
static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
{
        skb_reset_mac_header(skb);
        skb->mac_header += offset;
}

/* Makes the mac header coincide with the network header. */
static inline void skb_pop_mac_header(struct sk_buff *skb)
{
        skb->mac_header = skb->network_header;
}

/*
 * If no transport header has been recorded yet, runs the basic flow
 * dissector over @skb and, when it reports success, sets the transport
 * header to the dissected offset (keys.control.thoff). A transport
 * header that is already set is left untouched.
 */
static inline void skb_probe_transport_header(struct sk_buff *skb)
{
        struct flow_keys_basic keys;

        if (!skb_transport_header_was_set(skb) &&
            skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                             NULL, 0, 0, 0, 0))
                skb_set_transport_header(skb, keys.control.thoff);
}

/*
 * If a mac header is set, relocates it so that it ends at the current
 * skb->data: the header position becomes skb->data - mac_len and the
 * mac_len bytes are moved there from the old location. memmove() is
 * used, so overlapping old and new locations are handled.
 */
static inline void skb_mac_header_rebuild(struct sk_buff *skb)
{
        const unsigned char *old_mac;

        if (!skb_mac_header_was_set(skb))
                return;

        old_mac = skb_mac_header(skb);
        skb_set_mac_header(skb, -skb->mac_len);
        memmove(skb_mac_header(skb), old_mac, skb->mac_len);
}

/* Returns csum_start minus the headroom, i.e. relative to skb->data. */
static inline int skb_checksum_start_offset(const struct sk_buff *skb)
{
        return skb->csum_start - skb_headroom(skb);
}

/* Returns skb->head plus csum_start. */
static inline unsigned char *skb_checksum_start(const struct sk_buff *skb)
{
        return skb->head + skb->csum_start;
}

/* Returns the transport header position relative to skb->data. */
static inline int skb_transport_offset(const struct sk_buff *skb)
{
        return skb_transport_header(skb) - skb->data;
}

/* Returns the distance from the network header to the transport header. */
static inline u32 skb_network_header_len(const struct sk_buff *skb)
{
        return skb->transport_header - skb->network_header;
}

/*
 * Returns the distance from the inner network header to the inner
 * transport header.
 */
static inline u32 skb_inner_network_header_len(const struct sk_buff *skb)
{
        return skb->inner_transport_header - skb->inner_network_header;
}

/* Returns the network header position relative to skb->data. */
static inline int skb_network_offset(const struct sk_buff *skb)
{
        return skb_network_header(skb) - skb->data;
}

/* Returns the inner network header position relative to skb->data. */
static inline int skb_inner_network_offset(const struct sk_buff *skb)
{
        return skb_inner_network_header(skb) - skb->data;
}

/*
 * pskb_may_pull() for @len bytes counted from the network header rather
 * than from skb->data.
 */
static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
{
        unsigned int needed = skb_network_offset(skb) + len;

        return pskb_may_pull(skb, needed);
}

/*
 * CPUs often take a performance hit when accessing unaligned memory
 * locations. The actual performance hit varies: it can be small if the
 * hardware handles it, or large if we have to take an exception and fix
 * it in software.
 *
 * Since an ethernet header is 14 bytes, network drivers often end up with
 * the IP header at an unaligned offset. The IP header can be aligned by
 * shifting the start of the packet by 2 bytes. Drivers should do this
 * with:
 *
 * skb_reserve(skb, NET_IP_ALIGN);
 *
 * The downside to this alignment of the IP header is that the DMA is now
 * unaligned. On some architectures the cost of an unaligned DMA is high
 * and this cost outweighs the gains made by aligning the IP header.
 *
 * Since this trade-off varies between architectures, we allow NET_IP_ALIGN
 * to be overridden: the default of 2 below only applies when nothing has
 * defined it earlier.
 */
#ifndef NET_IP_ALIGN
#define NET_IP_ALIGN        2
#endif

/*
 * The networking layer reserves some headroom in skb data (via
 * dev_alloc_skb). This is used to avoid having to reallocate skb data when
 * the header has to grow. In the default case, if the header has to grow
 * 32 bytes or less we avoid the reallocation.
 *
 * Unfortunately this headroom changes the DMA alignment of the resulting
 * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive
 * on some architectures. An architecture can override this value,
 * perhaps setting it to a cacheline in size (since that will maintain
 * cacheline alignment of the DMA). It must be a power of 2.
 *
 * Various parts of the networking layer expect at least 32 bytes of
 * headroom; you should not reduce this.
 *
 * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS)
 * to reduce the average number of cache lines per packet.
 * get_rps_cpu(), for example, only accesses one 64-byte aligned block:
 * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)
 */
#ifndef NET_SKB_PAD
#define NET_SKB_PAD        max(32, L1_CACHE_BYTES)
#endif

int ___pskb_trim(struct sk_buff *skb, unsigned int len);

/*
 * Sets the length of a linear @skb to @len and places the tail @len
 * bytes after skb->data. For a non-linear buffer this only emits a
 * WARN_ON() and changes nothing.
 */
static inline void __skb_set_length(struct sk_buff *skb, unsigned int len)
{
        if (!WARN_ON(skb_is_nonlinear(skb))) {
                skb->len = len;
                skb_set_tail_pointer(skb, len);
        }
}

/*
 * Thin wrapper around __skb_set_length(): sets the length of a linear
 * @skb to @len. No check is made here that @len is smaller than the
 * current length.
 */
static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
{
        __skb_set_length(skb, len);
}

void skb_trim(struct sk_buff *skb, unsigned int len);

/*
 * Trims @skb to @len. A buffer without paged data (data_len == 0) is
 * trimmed in place via __skb_trim() and 0 is returned; otherwise the
 * work, and the return value, are delegated to ___pskb_trim().
 */
static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
{
        if (!skb->data_len) {
                __skb_trim(skb, len);
                return 0;
        }

        return ___pskb_trim(skb, len);
}

/*
 * Trims @skb to @len if it is currently longer; a buffer that is already
 * @len bytes or shorter is left alone and 0 is returned. Otherwise
 * returns the result of __pskb_trim().
 */
static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
{
        if (len >= skb->len)
                return 0;

        return __pskb_trim(skb, len);
}

/**
 *        pskb_trim_unique - remove end from a paged unique (not cloned) buffer
 *        @skb: buffer to alter
 *        @len: new length
 *
 *        Same operation as pskb_trim(), for callers that know the skb is
 *        not cloned, so that no out-of-memory error should ever be
 *        reported. A non-zero result from pskb_trim() triggers BUG().
 */
static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
{
        int rc;

        /* Keep the call outside BUG_ON() so the trim is a plain statement. */
        rc = pskb_trim(skb, len);
        BUG_ON(rc);
}

/*
 * Grows @skb to a total length of @len. If the existing tail room is
 * smaller than the increase, pskb_expand_head() is asked (GFP_ATOMIC)
 * for the missing tail room first; its error, if any, is returned
 * unchanged. Returns 0 once the new length has been set.
 */
static inline int __skb_grow(struct sk_buff *skb, unsigned int len)
{
        unsigned int extra = len - skb->len;
        int err;

        if (skb_tailroom(skb) < extra) {
                err = pskb_expand_head(skb, 0, extra - skb_tailroom(skb),
                                       GFP_ATOMIC);
                if (err)
                        return err;
        }

        __skb_set_length(skb, len);
        return 0;
}

/**
 *        skb_orphan - orphan a buffer
 *        @skb: buffer to orphan
 *
 *        If @skb has a destructor, it is called and both the destructor
 *        and the owning socket pointer are cleared, leaving @skb unowned.
 *        The buffer continues to exist but is no longer charged to its
 *        former owner. A buffer without a destructor must not have a
 *        socket attached; that combination triggers BUG().
 */
static inline void skb_orphan(struct sk_buff *skb)
{
        if (!skb->destructor) {
                BUG_ON(skb->sk);
                return;
        }

        skb->destructor(skb);
        skb->destructor = NULL;
        skb->sk = NULL;
}

/**
 *        skb_orphan_frags - orphan the frags contained in a buffer
 *        @skb: buffer to orphan frags from
 *        @gfp_mask: allocation mask for replacement pages
 *
 *        For each frag in the SKB which needs a destructor (i.e. has an
 *        owner) create a copy of that frag and release the original
 *        page by calling the destructor.
 *
 *        Nothing is copied, and 0 is returned, when skb_zcopy() reports
 *        no zerocopy state or when the uarg callback is
 *        sock_zerocopy_callback. Otherwise the result of skb_copy_ubufs()
 *        is returned.
 */
static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
        if (unlikely(skb_zcopy(skb))) {
                /* skb_uarg() is only consulted when the skb is not "nouarg". */
                if (skb_zcopy_is_nouarg(skb) ||
                    skb_uarg(skb)->callback != sock_zerocopy_callback)
                        return skb_copy_ubufs(skb, gfp_mask);
        }

        return 0;
}

/*
 * Frags must be orphaned, even if refcounted, if skb might loop to rx path.
 * Unlike skb_orphan_frags() there is no callback exemption: any skb for
 * which skb_zcopy() is true goes through skb_copy_ubufs().
 */
static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
{
        if (unlikely(skb_zcopy(skb)))
                return skb_copy_ubufs(skb, gfp_mask);

        return 0;
}

/**
 *        __skb_queue_purge - empty a list
 *        @list: list to empty
 *
 *        Dequeues every buffer on an &sk_buff list and passes each one to
 *        kfree_skb(), dropping one reference per buffer. The list lock is
 *        not taken here; the caller must hold the relevant locks.
 */
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
        struct sk_buff *victim;

        for (victim = __skb_dequeue(list); victim;
             victim = __skb_dequeue(list))
                kfree_skb(victim);
}
void skb_queue_purge(struct sk_buff_head *list);

unsigned int skb_rbtree_purge(struct rb_root *root);

void *netdev_alloc_frag(unsigned int fragsz);

struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
                                   gfp_t gfp_mask);

/**
 *        netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *        @dev: network device to receive on
 *        @length: length to allocate
 *
 *        Allocate a new &sk_buff and assign it a usage count of one. The
 *        buffer has unspecified headroom built in. Users should allocate
 *        the headroom they think they need without accounting for the
 *        built in space. The built in space is used for optimisations.
 *
 *        %NULL is returned if there is no free memory. Although this function
 *        allocates memory it can be called from an interrupt.
 *
 *        This is __netdev_alloc_skb() with a fixed GFP_ATOMIC mask.
 */
static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
                                               unsigned int length)
{
        return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
}

/*
 * Legacy helper around __netdev_alloc_skb(): same allocation, with a
 * NULL device and a caller-supplied @gfp_mask.
 */
static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
                                              gfp_t gfp_mask)
{
        return __netdev_alloc_skb(NULL, length, gfp_mask);
}

/*
 * Legacy helper around netdev_alloc_skb(): same allocation (GFP_ATOMIC),
 * with a NULL device.
 */
static inline struct sk_buff *dev_alloc_skb(unsigned int length)
{
        return netdev_alloc_skb(NULL, length);
}


/*
 * Allocates an skb with NET_IP_ALIGN extra bytes and, when NET_IP_ALIGN
 * is non-zero, reserves those bytes as headroom. Returns NULL if the
 * underlying __netdev_alloc_skb() fails.
 */
static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length, gfp_t gfp)
{
        struct sk_buff *skb;

        skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
        if (skb && NET_IP_ALIGN)
                skb_reserve(skb, NET_IP_ALIGN);

        return skb;
}

/* __netdev_alloc_skb_ip_align() with a fixed GFP_ATOMIC mask. */
static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length)
{
        return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
}

/* Thin wrapper: hands @addr to page_frag_free(). */
static inline void skb_free_frag(void *addr)
{
        page_frag_free(addr);
}

void *napi_alloc_frag(unsigned int fragsz);
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
                                 unsigned int length, gfp_t gfp_mask);
/* __napi_alloc_skb() with a fixed GFP_ATOMIC mask. */
static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
                                             unsigned int length)
{
        return __napi_alloc_skb(napi, length, GFP_ATOMIC);
}
void napi_consume_skb(struct sk_buff *skb, int budget);

void __kfree_skb_flush(void);
void __kfree_skb_defer(struct sk_buff *skb);

/**
 * __dev_alloc_pages - allocate page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 * @order: size of the allocation
 *
 * Allocate a new page of the given @order on any node (NUMA_NO_NODE),
 * with __GFP_COMP and __GFP_MEMALLOC added to @gfp_mask.
 *
 * %NULL is returned if there is no free memory.
 */
static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
                                             unsigned int order)
{
        /* This piece of code contains several assumptions.
         * 1.  This is for device Rx, therefore a cold page is preferred.
         * 2.  The expectation is the user wants a compound page.
         * 3.  If requesting an order-0 page it will not be compound
         *     due to the check to see if order has a value in prep_new_page
         * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
         *     code in gfp_to_alloc_flags that should be enforcing this.
         */
        return alloc_pages_node(NUMA_NO_NODE,
                                gfp_mask | __GFP_COMP | __GFP_MEMALLOC,
                                order);
}

/* __dev_alloc_pages() with a fixed GFP_ATOMIC | __GFP_NOWARN mask. */
static inline struct page *dev_alloc_pages(unsigned int order)
{
        return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order);
}

/**
 * __dev_alloc_page - allocate a page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 *
 * Allocate a new page: __dev_alloc_pages() with an order of 0.
 *
 * %NULL is returned if there is no free memory.
 */
static inline struct page *__dev_alloc_page(gfp_t gfp_mask)
{
        return __dev_alloc_pages(gfp_mask, 0);
}

/* dev_alloc_pages() with an order of 0. */
static inline struct page *dev_alloc_page(void)
{
        return dev_alloc_pages(0);
}

/**
 *        skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
 *        @page: The page that was allocated from skb_alloc_page
 *        @skb: The skb that may need pfmemalloc set
 *
 *        Sets skb->pfmemalloc when page_is_pfmemalloc() is true for @page;
 *        the flag is never cleared here.
 */
static inline void skb_propagate_pfmemalloc(struct page *page,
                                             struct sk_buff *skb)
{
        if (page_is_pfmemalloc(page))
                skb->pfmemalloc = true;
}

/**
 * skb_frag_off() - Returns the offset of a skb fragment
 * @frag: the paged fragment
 *
 * Return: the bv_offset field of @frag.
 */
static inline unsigned int skb_frag_off(const skb_frag_t *frag)
{
        return frag->bv_offset;
}

/**
 * skb_frag_off_add() - Increments the offset of a skb fragment by @delta
 * @frag: skb fragment
 * @delta: value to add (signed)
 */
static inline void skb_frag_off_add(skb_frag_t *frag, int delta)
{
        frag->bv_offset += delta;
}

/**
 * skb_frag_off_set() - Sets the offset of a skb fragment
 * @frag: skb fragment
 * @offset: new offset of the fragment, stored in bv_offset
 */
static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset)
{
        frag->bv_offset = offset;
}

/**
 * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment
 * @fragto: skb fragment where offset is set
 * @fragfrom: skb fragment offset is copied from
 *
 * Only bv_offset is copied; no other field of @fragto is touched.
 */
static inline void skb_frag_off_copy(skb_frag_t *fragto,
                                     const skb_frag_t *fragfrom)
{
        fragto->bv_offset = fragfrom->bv_offset;
}

/**
 * skb_frag_page - retrieve the page referred to by a paged fragment
 * @frag: the paged fragment
 *
 * Returns the &struct page associated with @frag (its bv_page field).
 */
static inline struct page *skb_frag_page(const skb_frag_t *frag)
{
        return frag->bv_page;
}

/**
 * __skb_frag_ref - take an additional reference on a paged fragment.
 * @frag: the paged fragment
 *
 * Takes an additional reference on the paged fragment @frag by calling
 * get_page() on its page.
 */
static inline void __skb_frag_ref(skb_frag_t *frag)
{
        get_page(skb_frag_page(frag));
}

/**
 * skb_frag_ref - take an addition reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset.
 *
 * Takes an additional reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_ref(struct sk_buff *skb, int f)
{
        __skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}

/**
 * __skb_frag_unref - release a reference on a paged fragment.
 * @frag: the paged fragment
 *
 * Releases a reference on the paged fragment @frag by calling
 * put_page() on its page.
 */
static inline void __skb_frag_unref(skb_frag_t *frag)
{
        put_page(skb_frag_page(frag));
}

/**
 * skb_frag_unref - release a reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset
 *
 * Releases a reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
        __skb_frag_unref(&skb_shinfo(skb)->frags[f]);
}

/**
 * skb_frag_address - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns the address of the data within @frag: the page's address plus
 * the fragment offset. The page must already be mapped; no check is
 * made here (see skb_frag_address_safe() for the checking variant).
 */
static inline void *skb_frag_address(const skb_frag_t *frag)
{
        void *base = page_address(skb_frag_page(frag));

        return base + skb_frag_off(frag);
}

/**
 * skb_frag_address_safe - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns the address of the data within @frag, or %NULL when
 * page_address() yields no address for the fragment's page (i.e. the
 * page is not mapped).
 */
static inline void *skb_frag_address_safe(const skb_frag_t *frag)
{
        void *base = page_address(skb_frag_page(frag));

        if (likely(base))
                return base + skb_frag_off(frag);

        return NULL;
}

/**
 * skb_frag_page_copy() - sets the page in a fragment from another fragment
 * @fragto: skb fragment where page is set
 * @fragfrom: skb fragment page is copied from
 *
 * Only the page pointer is copied; no page reference is taken here.
 */
static inline void skb_frag_page_copy(skb_frag_t *fragto,
                                      const skb_frag_t *fragfrom)
{
        fragto->bv_page = fragfrom->bv_page;
}

/**
 * __skb_frag_set_page - sets the page contained in a paged fragment
 * @frag: the paged fragment
 * @page: the page to set
 *
 * Sets the fragment @frag to contain @page. Only the page pointer is
 * stored; no page reference is taken here.
 */
static inline void __skb_frag_set_page(skb_frag_t *frag, struct page *page)
{
        frag->bv_page = page;
}

/**
 * skb_frag_set_page - sets the page contained in a paged fragment of an skb
 * @skb: the buffer
 * @f: the fragment offset
 * @page: the page to set
 *
 * Sets the @f'th fragment of @skb to contain @page.
 */
static inline void skb_frag_set_page(struct sk_buff *skb, int f,
                                     struct page *page)
{
        __skb_frag_set_page(&skb_shinfo(skb)->frags[f], page);
}

bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);

/**
 * skb_frag_dma_map - maps a paged fragment via the DMA API
 * @dev: the device to map the fragment to
 * @frag: the paged fragment to map
 * @offset: the offset within the fragment (starting at the
 *          fragment's own offset)
 * @size: the number of bytes to map
 * @dir: the direction of the mapping (an &enum dma_data_direction value)
 *
 * Maps the page associated with @frag to @dev, starting @offset bytes
 * past the fragment's own offset within that page. Returns whatever
 * dma_map_page() returns.
 */
static inline dma_addr_t skb_frag_dma_map(struct device *dev,
                                          const skb_frag_t *frag,
                                          size_t offset, size_t size,
                                          enum dma_data_direction dir)
{
        struct page *page = skb_frag_page(frag);
        size_t start = skb_frag_off(frag) + offset;

        return dma_map_page(dev, page, start, size, dir);
}

/* __pskb_copy() asking for the same headroom that @skb currently has. */
static inline struct sk_buff *pskb_copy(struct sk_buff *skb,
                                        gfp_t gfp_mask)
{
        return __pskb_copy(skb, skb_headroom(skb), gfp_mask);
}


/*
 * __pskb_copy_fclone() asking for the same headroom that @skb currently
 * has, with the final argument set to true (presumably requesting an
 * fclone-capable skb - confirm against __pskb_copy_fclone()).
 */
static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb,
                                                  gfp_t gfp_mask)
{
        return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true);
}


/**
 *        skb_clone_writable - is the header of a clone writable
 *        @skb: buffer to check
 *        @len: length up to which to write
 *
 *        Returns true if modifying the header part of the cloned buffer
 *        does not require the data to be copied, i.e. the header is not
 *        cloned and headroom plus @len stays within skb->hdr_len.
 */
static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len)
{
        if (skb_header_cloned(skb))
                return 0;

        return skb_headroom(skb) + len <= skb->hdr_len;
}

/*
 * Make the first @write_len bytes of @skb writable if they are not already.
 *
 * Nothing is done when @skb is not cloned or when skb_clone_writable()
 * accepts @write_len; otherwise pskb_expand_head() is called with
 * GFP_ATOMIC.  Returns 0 when no work was needed or pskb_expand_head()
 * returned 0, and 1 when it returned non-zero (the error code itself is
 * not propagated).
 */
static inline int skb_try_make_writable(struct sk_buff *skb,
                                        unsigned int write_len)
{
        if (!skb_cloned(skb))
                return 0;

        if (skb_clone_writable(skb, write_len))
                return 0;

        return pskb_expand_head(skb, 0, 0, GFP_ATOMIC) != 0;
}

/*
 * Common worker for skb_cow() and skb_cow_head().
 *
 * Calls pskb_expand_head() when @skb has less than @headroom bytes of
 * headroom or when @cloned is non-zero.  The missing headroom is rounded
 * up with ALIGN(..., NET_SKB_PAD).  Returns 0 when nothing had to be done,
 * otherwise the result of pskb_expand_head().
 */
static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
                            int cloned)
{
        unsigned int avail = skb_headroom(skb);
        int delta = headroom > avail ? headroom - avail : 0;

        if (!delta && !cloned)
                return 0;

        return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0,
                                GFP_ATOMIC);
}

/**
 * skb_cow - copy header of skb when it is required
 * @skb: buffer to cow
 * @headroom: needed headroom
 *
 * If the skb passed lacks sufficient headroom or its data part
 * is shared, data is reallocated. If reallocation fails, an error
 * is returned and the original skb is not changed.
 *
 * The result is an skb with writable area skb->head...skb->tail
 * and at least @headroom of space at head.
 */
static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
{
        int cloned = skb_cloned(skb);

        return __skb_cow(skb, headroom, cloned);
}

/**
 * skb_cow_head - skb_cow but only making the head writable
 * @skb: buffer to cow
 * @headroom: needed headroom
 *
 * Same as skb_cow() except that the skb_cloned() check is replaced by
 * skb_header_cloned().  Use it when you only need to push on some header
 * and do not need to modify the data.
 */
static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
{
        int header_cloned = skb_header_cloned(skb);

        return __skb_cow(skb, headroom, header_cloned);
}

/**
 * skb_padto - pad an skbuff up to a minimal size
 * @skb: buffer to pad
 * @len: minimal length
 *
 * Pads up a buffer to ensure the trailing bytes exist and are
 * blanked. If the buffer already contains sufficient data it
 * is untouched. Otherwise it is extended. Returns zero on
 * success. The skb is freed on error.
 */
static inline int skb_padto(struct sk_buff *skb, unsigned int len)
{
        unsigned int cur = skb->len;

        return unlikely(cur < len) ? skb_pad(skb, len - cur) : 0;
}

/**
 * __skb_put_padto - increase size and pad an skbuff up to a minimal size
 * @skb: buffer to pad
 * @len: minimal length
 * @free_on_error: free buffer on error
 *
 * Pads up a buffer to ensure the trailing bytes exist and are
 * blanked. If the buffer already contains sufficient data it
 * is untouched. Otherwise it is extended. Returns zero on
 * success. The skb is freed on error if @free_on_error is true.
 */
static inline int __must_check __skb_put_padto(struct sk_buff *skb,
                                               unsigned int len,
                                               bool free_on_error)
{
        unsigned int cur = skb->len;
        unsigned int missing;

        if (likely(cur >= len))
                return 0;

        missing = len - cur;
        if (__skb_pad(skb, missing, free_on_error))
                return -ENOMEM;

        /* Padding succeeded: account for the new bytes in the skb length. */
        __skb_put(skb, missing);
        return 0;
}

/**
 *        skb_put_padto - increase size and pad an skbuff up to a minimal size
 *        @skb: buffer to pad
 *        @len: minimal length
 *
 *        Pads up a buffer to ensure the trailing bytes exist and are
 *        blanked. If the buffer already contains sufficient data it
 *        is untouched. Otherwise it is extended. Returns zero on
 *        success. The skb is freed on error.
 *
 *        This is __skb_put_padto() with @free_on_error fixed to true.
 */
static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len)
{
        return __skb_put_padto(skb, len, true);
}

static inline int skb_add_data(struct sk_buff *skb,
                               struct iov_iter *from, int copy)
{
        const int off = skb->len;

        if (skb->ip_summed == CHECKSUM_NONE) {
                __wsum csum = 0;
                if (csum_and_copy_from_iter_full(skb_put(skb, copy), copy,
                                                 &csum, from)) {
                        skb->csum = csum_block_add(skb->csum, csum, off);
                        return 0;
                }
        } else if (copy_from_iter_full(skb_put(skb, copy), copy, from))
                return 0;

        __skb_trim(skb, off);
        return -EFAULT;
}

/*
 * Tell whether data at (@page, @off) directly follows fragment @i - 1 of
 * @skb, i.e. it sits in the same page at the offset where that fragment
 * ends.  Always false for zerocopy skbs (skb_zcopy()) and when @i is 0.
 */
static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
                                    const struct page *page, int off)
{
        const skb_frag_t *prev;

        if (skb_zcopy(skb) || !i)
                return false;

        prev = &skb_shinfo(skb)->frags[i - 1];
        if (page != skb_frag_page(prev))
                return false;

        return off == skb_frag_off(prev) + skb_frag_size(prev);
}

/*
 * Pull all skb->data_len bytes with __pskb_pull_tail().
 * Returns 0 on success and -ENOMEM when the pull returns NULL.
 */
static inline int __skb_linearize(struct sk_buff *skb)
{
        if (!__pskb_pull_tail(skb, skb->data_len))
                return -ENOMEM;

        return 0;
}

/**
 * skb_linearize - convert paged skb to linear one
 * @skb: buffer to linearize
 *
 * If there is no free memory -ENOMEM is returned, otherwise zero
 * is returned and the old skb data released.
 */
static inline int skb_linearize(struct sk_buff *skb)
{
        /* Already linear: nothing to do. */
        if (!skb_is_nonlinear(skb))
                return 0;

        return __skb_linearize(skb);
}

/**
 * skb_has_shared_frag - can any frag be overwritten
 * @skb: buffer to test
 *
 * Return true if the skb has at least one frag that might be modified
 * by an external entity (as in vmsplice()/sendfile())
 */
static inline bool skb_has_shared_frag(const struct sk_buff *skb)
{
        return skb_is_nonlinear(skb) &&
               skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
}

/**
 * skb_linearize_cow - make sure skb is linear and writable
 * @skb: buffer to process
 *
 * If there is no free memory -ENOMEM is returned, otherwise zero
 * is returned and the old skb data released.
 */
static inline int skb_linearize_cow(struct sk_buff *skb)
{
        if (skb_is_nonlinear(skb) || skb_cloned(skb))
                return __skb_linearize(skb);

        return 0;
}

static __always_inline void
__skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = csum_block_sub(skb->csum,
                                           csum_partial(start, len, 0), off);
        else if (skb->ip_summed == CHECKSUM_PARTIAL &&
                 skb_checksum_start_offset(skb) < 0)
                skb->ip_summed = CHECKSUM_NONE;
}

/**
 *        skb_postpull_rcsum - update checksum for received skb after pull
 *        @skb: buffer to update
 *        @start: start of data before pull
 *        @len: length of data pulled
 *
 *        After doing a pull on a received packet, you need to call this to
 *        update the CHECKSUM_COMPLETE checksum, or set ip_summed to
 *        CHECKSUM_NONE so that it can be recomputed from scratch.
 *
 *        Equivalent to __skb_postpull_rcsum() with a block offset of 0.
 */
static inline void skb_postpull_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        __skb_postpull_rcsum(skb, start, len, 0);
}

static __always_inline void
__skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = csum_block_add(skb->csum,
                                           csum_partial(start, len, 0), off);
}

/**
 *        skb_postpush_rcsum - update checksum for received skb after push
 *        @skb: buffer to update
 *        @start: start of data after push
 *        @len: length of data pushed
 *
 *        After doing a push on a received packet, you need to call this to
 *        update the CHECKSUM_COMPLETE checksum.
 *
 *        Equivalent to __skb_postpush_rcsum() with a block offset of 0.
 */
static inline void skb_postpush_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        __skb_postpush_rcsum(skb, start, len, 0);
}

/* Prototype only; the definition is not part of this header section. */
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len);

/**
 * skb_push_rcsum - push skb and update receive checksum
 * @skb: buffer to update
 * @len: length of data pushed
 *
 * This function performs an skb_push on the packet and updates
 * the CHECKSUM_COMPLETE checksum.  It should be used on
 * receive path processing instead of skb_push unless you know
 * that the checksum difference is zero (e.g., a valid IP header)
 * or you are setting ip_summed to CHECKSUM_NONE.
 *
 * Returns skb->data as it is after the push.
 */
static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len)
{
        void *data;

        skb_push(skb, len);
        data = skb->data;
        skb_postpush_rcsum(skb, data, len);

        return data;
}

int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);

/**
 * pskb_trim_rcsum - trim received skb and update checksum
 * @skb: buffer to trim
 * @len: new length
 *
 * This is exactly the same as pskb_trim except that it ensures the
 * checksum of a received packet is still valid after the operation.
 * It can change skb pointers.
 */
static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        /* Nothing to cut when the skb is already no longer than @len. */
        return unlikely(len < skb->len) ? pskb_trim_rcsum_slow(skb, len) : 0;
}

/*
 * Trim @skb to @len with __skb_trim().  A CHECKSUM_COMPLETE skb is first
 * downgraded to CHECKSUM_NONE.  Always returns 0.
 */
static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;
        __skb_trim(skb, len);
        return 0;
}

/*
 * Grow @skb to @len with __skb_grow() and return its result.  A
 * CHECKSUM_COMPLETE skb is first downgraded to CHECKSUM_NONE.
 */
static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;
        return __skb_grow(skb, len);
}

/*
 * rb-tree helpers for skbs linked through their rbnode member.
 * rb_to_skb() converts a node pointer to the containing sk_buff with
 * rb_entry_safe(); the other macros wrap rb_first/rb_last/rb_next/rb_prev
 * and convert the resulting node the same way.
 */
#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
#define skb_rb_first(root) rb_to_skb(rb_first(root))
#define skb_rb_last(root)  rb_to_skb(rb_last(root))
#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))

/*
 * Walk @queue front to back with @skb as cursor.  The loop ends when the
 * cursor comes back to the queue head itself.  The cursor advances through
 * skb->next after each pass.
 */
#define skb_queue_walk(queue, skb) \
                for (skb = (queue)->next;                                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = skb->next)

/*
 * Same walk, but the successor is saved in @tmp before each pass, so the
 * advance step does not read @skb again.
 */
#define skb_queue_walk_safe(queue, skb, tmp)                                        \
                for (skb = (queue)->next, tmp = skb->next;                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

/* Continue a forward walk from the current value of @skb. */
#define skb_queue_walk_from(queue, skb)                                                \
                for (; skb != (struct sk_buff *)(queue);                        \
                     skb = skb->next)

/* Visit every skb in the rb-tree @root, from skb_rb_first() onwards. */
#define skb_rbtree_walk(skb, root)                                                \
                for (skb = skb_rb_first(root); skb != NULL;                        \
                     skb = skb_rb_next(skb))

/* Continue an rb-tree walk from the current value of @skb. */
#define skb_rbtree_walk_from(skb)                                                \
                for (; skb != NULL;                                                \
                     skb = skb_rb_next(skb))

/*
 * As skb_rbtree_walk_from(), but the successor is computed into @tmp
 * before each pass, so the advance step does not read @skb again.
 */
#define skb_rbtree_walk_from_safe(skb, tmp)                                        \
                for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);        \
                     skb = tmp)

/*
 * Continue a forward walk from the current @skb, with the successor saved
 * in @tmp before each pass.
 */
#define skb_queue_walk_from_safe(queue, skb, tmp)                                \
                for (tmp = skb->next;                                                \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

/* Walk @queue back to front, following the prev links. */
#define skb_queue_reverse_walk(queue, skb) \
                for (skb = (queue)->prev;                                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = skb->prev)

/* Reverse walk with the predecessor saved in @tmp before each pass. */
#define skb_queue_reverse_walk_safe(queue, skb, tmp)                                \
                for (skb = (queue)->prev, tmp = skb->prev;                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->prev)

/* Continue a reverse walk from the current @skb, saving the predecessor. */
#define skb_queue_reverse_walk_from_safe(queue, skb, tmp)                        \
                for (tmp = skb->prev;                                                \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->prev)

/* True when the shared info of @skb has a non-NULL frag_list. */
static inline bool skb_has_frag_list(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->frag_list != NULL;
}

/* Reset the frag_list pointer in the shared info of @skb to NULL. */
static inline void skb_frag_list_init(struct sk_buff *skb)
{
        skb_shinfo(skb)->frag_list = NULL;
}

/*
 * Iterate @iter over the frag_list chain of @skb, following ->next until
 * a NULL pointer is reached.
 */
#define skb_walk_frags(skb, iter)        \
        for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)


/*
 * Datagram receive, poll and copy-to-iterator entry points.
 * Prototypes only; the definitions are not part of this header section.
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
                                int *err, long *timeo_p,
                                const struct sk_buff *skb);
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
                                          struct sk_buff_head *queue,
                                          unsigned int flags,
                                          int *off, int *err,
                                          struct sk_buff **last);
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
                                        struct sk_buff_head *queue,
                                        unsigned int flags, int *off, int *err,
                                        struct sk_buff **last);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
                                    struct sk_buff_head *sk_queue,
                                    unsigned int flags, int *off, int *err);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
                                  int *err);
__poll_t datagram_poll(struct file *file, struct socket *sock,
                           struct poll_table_struct *wait);
int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
                           struct iov_iter *to, int size);
/* skb_copy_datagram_iter() with the iterator embedded in @msg as target. */
static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
                                        struct msghdr *msg, int size)
{
        struct iov_iter *to = &msg->msg_iter;

        return skb_copy_datagram_iter(from, offset, to, size);
}
/*
 * Further datagram copy and free helpers.
 * Prototypes only; the definitions are not part of this header section.
 */
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
                                   struct msghdr *msg);
int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
                           struct iov_iter *to, int len,
                           struct ahash_request *hash);
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
                                 struct iov_iter *from, int len);
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len);
/* __skb_free_datagram_locked() with a @len argument of 0. */
static inline void skb_free_datagram_locked(struct sock *sk,
                                            struct sk_buff *skb)
{
        __skb_free_datagram_locked(sk, skb, 0);
}
/*
 * Out-of-line skb helpers (data copy, splice, split/segment, VLAN,
 * Ethernet and MPLS header handling).
 * Prototypes only; the definitions are not part of this header section.
 */
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
                              int len);
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int len,
                    unsigned int flags);
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
                         int len);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
                 int len, int hlen);
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
void skb_scrub_packet(struct sk_buff *skb, bool xnet);
bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features,
                                 unsigned int offset);
struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
int skb_ensure_writable(struct sk_buff *skb, int write_len);
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
int skb_vlan_pop(struct sk_buff *skb);
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
int skb_eth_pop(struct sk_buff *skb);
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
                 const unsigned char *src);
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
                  int mac_len, bool ethernet);
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
                 bool ethernet);
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
int skb_mpls_dec_ttl(struct sk_buff *skb);
struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
                             gfp_t gfp);

/*
 * Copy @len bytes from the iterator of @msg into @data.
 * Returns 0 when copy_from_iter_full() succeeds, -EFAULT otherwise.
 */
static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
{
        if (!copy_from_iter_full(data, len, &msg->msg_iter))
                return -EFAULT;

        return 0;
}

/*
 * Copy @len bytes from @data into the iterator of @msg.
 * Returns 0 when copy_to_iter() reports exactly @len bytes copied,
 * -EFAULT otherwise.
 */
static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
{
        if (copy_to_iter(data, len, &msg->msg_iter) != len)
                return -EFAULT;

        return 0;
}

/*
 * Callback pair accepted by __skb_checksum() through its @ops argument.
 * NOTE(review): presumably @update folds a memory range into a running
 * checksum and @combine merges two partial results; confirm against the
 * __skb_checksum() implementation.
 */
struct skb_checksum_ops {
        __wsum (*update)(const void *mem, int len, __wsum wsum);
        __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
};

/* Declared here; the definition is not part of this header section. */
extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;

/*
 * Checksum routines over an (offset, len) range of an skb; __skb_checksum()
 * additionally takes the callback set to use.  Prototypes only.
 */
__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
                      __wsum csum, const struct skb_checksum_ops *ops);
__wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
                    __wsum csum);

/*
 * Return a pointer to @len bytes located @offset bytes into the packet.
 *
 * If the range lies within the first @hlen bytes at @data, a pointer into
 * @data is returned.  Otherwise the bytes are copied out of @skb into
 * @buffer with skb_copy_bits() and @buffer is returned.  Returns NULL when
 * @skb is NULL or the copy fails.
 */
static inline void * __must_check
__skb_header_pointer(const struct sk_buff *skb, int offset,
                     int len, void *data, int hlen, void *buffer)
{
        const int linear_avail = hlen - offset;

        if (linear_avail >= len)
                return data + offset;

        if (!skb)
                return NULL;

        return skb_copy_bits(skb, offset, buffer, len) < 0 ? NULL : buffer;
}

/*
 * __skb_header_pointer() applied to the linear area of @skb: skb->data is
 * the base pointer and skb_headlen() the number of directly usable bytes.
 */
static inline void * __must_check
skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer)
{
        void *data = skb->data;
        int hlen = skb_headlen(skb);

        return __skb_header_pointer(skb, offset, len, data, hlen, buffer);
}

/**
 * skb_needs_linearize - check if we need to linearize a given skb
 *                       depending on the given device features.
 * @skb: socket buffer to check
 * @features: net device features
 *
 * Returns true if the skb is nonlinear and either:
 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
 * 2. skb is fragmented and the device does not support SG.
 */
static inline bool skb_needs_linearize(struct sk_buff *skb,
                                       netdev_features_t features)
{
        if (!skb_is_nonlinear(skb))
                return false;

        if (skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST))
                return true;

        return skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG);
}

/* memcpy() @len bytes starting at skb->data into @to. */
static inline void skb_copy_from_linear_data(const struct sk_buff *skb,
                                             void *to,
                                             const unsigned int len)
{
        memcpy(to, skb->data, len);
}

/* memcpy() @len bytes starting at skb->data + @offset into @to. */
static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb,
                                                    const int offset, void *to,
                                                    const unsigned int len)
{
        const void *src = skb->data + offset;

        memcpy(to, src, len);
}

/* memcpy() @len bytes from @from to skb->data. */
static inline void skb_copy_to_linear_data(struct sk_buff *skb,
                                           const void *from,
                                           const unsigned int len)
{
        memcpy(skb->data, from, len);
}

/* memcpy() @len bytes from @from to skb->data + @offset. */
static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
                                                  const int offset,
                                                  const void *from,
                                                  const unsigned int len)
{
        void *dst = skb->data + offset;

        memcpy(dst, from, len);
}

/* Prototype only; the definition is not part of this header section. */
void skb_init(void);

/* Return the skb->tstamp field unchanged. */
static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
{
        return skb->tstamp;
}

/**
 *        skb_get_timestamp - get timestamp from a skb
 *        @skb: skb to get stamp from
 *        @stamp: pointer to struct __kernel_old_timeval to store stamp in
 *
 *        Converts skb->tstamp with ns_to_kernel_old_timeval() and stores
 *        the result in @stamp.
 */
static inline void skb_get_timestamp(const struct sk_buff *skb,
                                     struct __kernel_old_timeval *stamp)
{
        *stamp = ns_to_kernel_old_timeval(skb->tstamp);
}

/*
 * Store skb->tstamp in @stamp as seconds plus microseconds.  The value is
 * first converted with ktime_to_timespec64().
 */
static inline void skb_get_new_timestamp(const struct sk_buff *skb,
                                         struct __kernel_sock_timeval *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        /* nanoseconds -> microseconds */
        stamp->tv_usec = ts.tv_nsec / 1000;
}

/*
 * Store skb->tstamp in @stamp as seconds plus nanoseconds, using the
 * struct __kernel_old_timespec layout.
 */
static inline void skb_get_timestampns(const struct sk_buff *skb,
                                       struct __kernel_old_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

/*
 * Store skb->tstamp in @stamp as seconds plus nanoseconds, using the
 * struct __kernel_timespec layout.
 */
static inline void skb_get_new_timestampns(const struct sk_buff *skb,
                                           struct __kernel_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

/* Stamp @skb with the current value of ktime_get_real(). */
static inline void __net_timestamp(struct sk_buff *skb)
{
        skb->tstamp = ktime_get_real();
}

/* Return ktime_get_real() minus @t. */
static inline ktime_t net_timedelta(ktime_t t)
{
        return ktime_sub(ktime_get_real(), t);
}

/* The ktime_t value used to denote an invalid timestamp: 0. */
static inline ktime_t net_invalid_timestamp(void)
{
        return 0;
}

/* Return the meta_len field from the shared info of @skb. */
static inline u8 skb_metadata_len(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->meta_len;
}

/*
 * Return the address at which the metadata of @skb ends.  This is the
 * skb's MAC header pointer; __skb_metadata_differs() reads the metadata
 * bytes immediately before it.
 */
static inline void *skb_metadata_end(const struct sk_buff *skb)
{
        return skb_mac_header(skb);
}

/*
 * Compare the @meta_len bytes that end at skb_metadata_end() in @skb_a and
 * @skb_b.  Returns true when they differ.
 *
 * On 64-bit builds with CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the
 * comparison is open-coded: both pointers are walked backwards from the
 * metadata end in 8-byte steps (plus one final 4-byte step for lengths
 * that are not a multiple of 8) and the XOR of each pair of words is
 * accumulated in @diffs.  Only the lengths 4, 8, ..., 32 have a case
 * label; any other @meta_len leaves @diffs at 0 and yields false.
 * NOTE(review): presumably callers never pass other lengths - confirm.
 */
static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
                                          const struct sk_buff *skb_b,
                                          u8 meta_len)
{
        const void *a = skb_metadata_end(skb_a);
        const void *b = skb_metadata_end(skb_b);
        /* Using more efficient variant than plain call to memcmp(). */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        u64 diffs = 0;

        switch (meta_len) {
/* __it() steps pointer x back by one u<op> word and yields the new value. */
#define __it(x, op) (x -= sizeof(u##op))
/* __it_diff() steps both pointers back and XORs the words they now address. */
#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
        case 32: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 24: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 16: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  8: diffs |= __it_diff(a, b, 64);
                break;
        case 28: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 20: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 12: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  4: diffs |= __it_diff(a, b, 32);
                break;
        }
        return diffs;
#else
        return memcmp(a - meta_len, b - meta_len, meta_len);
#endif
}

/*
 * Tell whether the metadata of @skb_a and @skb_b differs.
 *
 * Different metadata lengths always count as a difference.  Two skbs
 * without metadata never differ.  Otherwise the metadata bytes are
 * compared with __skb_metadata_differs().
 */
static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
                                        const struct sk_buff *skb_b)
{
        u8 len_a = skb_metadata_len(skb_a);
        u8 len_b = skb_metadata_len(skb_b);

        if (len_a != len_b)
                return true;

        /* Equal lengths: zero means neither skb carries metadata. */
        if (!len_a)
                return false;

        return __skb_metadata_differs(skb_a, skb_b, len_a);
}

/* Record @meta_len as the metadata length in the shared info of @skb. */
static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
{
        skb_shinfo(skb)->meta_len = meta_len;
}

/* Set the metadata length of @skb to 0. */
static inline void skb_metadata_clear(struct sk_buff *skb)
{
        skb_metadata_set(skb, 0);
}

/* Prototype only; the definition is not part of this header section. */
struct sk_buff *skb_clone_sk(struct sk_buff *skb);

/*
 * PHY timestamping hooks.  With CONFIG_NETWORK_PHY_TIMESTAMPING they are
 * real functions defined elsewhere; without it they compile to empty
 * stubs, and skb_defer_rx_timestamp() always reports false.
 */
#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING

void skb_clone_tx_timestamp(struct sk_buff *skb);
bool skb_defer_rx_timestamp(struct sk_buff *skb);

#else /* CONFIG_NETWORK_PHY_TIMESTAMPING */

static inline void skb_clone_tx_timestamp(struct sk_buff *skb)
{
}

static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
        return false;
}

#endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */

/**
 * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps
 * @skb: clone of the original outgoing packet
 * @hwtstamps: hardware time stamps
 *
 * PHY drivers may accept clones of transmitted packets for
 * timestamping via their phy_driver.txtstamp method. These drivers
 * must call this function to return the skb back to the stack with a
 * timestamp.
 */
void skb_complete_tx_timestamp(struct sk_buff *skb,
                               struct skb_shared_hwtstamps *hwtstamps);

/*
 * Variant of skb_tstamp_tx() that takes the socket and the timestamp type
 * as explicit arguments.  Prototype only; the definition is not part of
 * this header section.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb,
                     struct skb_shared_hwtstamps *hwtstamps,
                     struct sock *sk, int tstype);

/**
 * skb_tstamp_tx - queue clone of skb with send time stamps
 * @orig_skb:        the original outgoing packet
 * @hwtstamps:        hardware time stamps, may be NULL if not available
 *
 * If the skb has a socket associated, then this function clones the
 * skb (thus sharing the actual data and optional structures), stores
 * the optional hardware time stamping information (if non NULL) or
 * generates a software time stamp (otherwise), then queues the clone
 * to the error queue of the socket.  Errors are silently ignored.
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
                   struct skb_shared_hwtstamps *hwtstamps);

/**
 * skb_tx_timestamp() - Driver hook for transmit timestamping
 * @skb: A socket buffer.
 *
 * Ethernet MAC Drivers should call this function in their hard_xmit()
 * function immediately before giving the sk_buff to the MAC hardware.
 *
 * Specifically, one should make absolutely sure that this function is
 * called before TX completion of this packet can trigger.  Otherwise
 * the packet could potentially already be freed.
 */
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
        skb_clone_tx_timestamp(skb);

        /* A software timestamp is only taken when it was requested. */
        if (!(skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP))
                return;

        skb_tstamp_tx(skb, NULL);
}

/**
 * skb_complete_wifi_ack - deliver skb with wifi status
 * @skb: the original outgoing packet
 * @acked: ack status
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked);

/*
 * Out-of-line checksum computation used by the inline helpers below.
 * Prototypes only; the definitions are not part of this header section.
 */
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len);
__sum16 __skb_checksum_complete(struct sk_buff *skb);

/*
 * Tell whether the checksum of @skb does not need to be verified.
 *
 * Returns 1 when ip_summed is CHECKSUM_UNNECESSARY, when csum_valid is
 * set, or when ip_summed is CHECKSUM_PARTIAL with a non-negative checksum
 * start offset; 0 otherwise.
 */
static inline int skb_csum_unnecessary(const struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_UNNECESSARY || skb->csum_valid)
                return 1;

        return skb->ip_summed == CHECKSUM_PARTIAL &&
               skb_checksum_start_offset(skb) >= 0;
}

/**
 * skb_checksum_complete - Calculate checksum of an entire packet
 * @skb: packet to process
 *
 * This function calculates the checksum over the entire packet plus
 * the value of skb->csum.  The latter can be used to supply the
 * checksum of a pseudo header as used by TCP/UDP.  It returns the
 * checksum.
 *
 * For protocols that contain complete checksums such as ICMP/TCP/UDP,
 * this function can be used to verify that checksum on received
 * packets.  In that case the function should return zero if the
 * checksum is correct.  In particular, this function will return zero
 * if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the
 * hardware has already verified the correctness of the checksum.
 */
static inline __sum16 skb_checksum_complete(struct sk_buff *skb)
{
        if (skb_csum_unnecessary(skb))
                return 0;

        return __skb_checksum_complete(skb);
}

/*
 * Consume one level of CHECKSUM_UNNECESSARY on @skb.  At csum_level 0 the
 * skb falls back to CHECKSUM_NONE, otherwise csum_level is decremented.
 * Skbs in any other ip_summed state are left alone.
 */
static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        if (skb->csum_level == 0)
                skb->ip_summed = CHECKSUM_NONE;
        else
                skb->csum_level--;
}

/*
 * Add one level of CHECKSUM_UNNECESSARY to @skb.  An skb that is already
 * CHECKSUM_UNNECESSARY gets csum_level incremented, capped at
 * SKB_MAX_CSUM_LEVEL.  A CHECKSUM_NONE skb becomes CHECKSUM_UNNECESSARY
 * at level 0.  Other ip_summed states are left alone.
 */
static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
{
        switch (skb->ip_summed) {
        case CHECKSUM_UNNECESSARY:
                if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
                        skb->csum_level++;
                break;
        case CHECKSUM_NONE:
                skb->ip_summed = CHECKSUM_UNNECESSARY;
                skb->csum_level = 0;
                break;
        default:
                break;
        }
}

/*
 * Drop the CHECKSUM_UNNECESSARY state of @skb: ip_summed goes back to
 * CHECKSUM_NONE and csum_level to 0.  Other states are left alone.
 */
static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        skb->ip_summed = CHECKSUM_NONE;
        skb->csum_level = 0;
}

/* Check if we need to perform checksum complete validation.
 *
 * Returns true if checksum complete is needed, false otherwise
 * (either checksum is unnecessary or zero checksum is allowed).
 *
 * When validation is skipped, skb->csum_valid is set and one level of
 * CHECKSUM_UNNECESSARY is consumed.
 */
static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
                                                  bool zero_okay,
                                                  __sum16 check)
{
        bool skip = skb_csum_unnecessary(skb) || (zero_okay && !check);

        if (!skip)
                return true;

        skb->csum_valid = 1;
        __skb_decr_checksum_unnecessary(skb);
        return false;
}

/* For small packets (skb->len <= CHECKSUM_BREAK) the complete checksum is
 * computed directly during checksum init; see
 * __skb_checksum_validate_complete().
 */
#define CHECKSUM_BREAK 76

/* Unset checksum-complete
 *
 * Unsetting checksum-complete can be done when the packet is being
 * modified (uncompressed for instance) and the checksum-complete value
 * is invalidated.  Only CHECKSUM_COMPLETE skbs are changed; they become
 * CHECKSUM_NONE.
 */
static inline void skb_checksum_complete_unset(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;
}

/* Validate (init) checksum based on checksum complete.
 *
 * Return values:
 *   0: the checksum was validated, or validation is left to a later
 *        skb_checksum_complete() call. In the latter case ip_summed will
 *        not be CHECKSUM_UNNECESSARY and the pseudo checksum is stored in
 *        skb->csum for use in __skb_checksum_complete
 *   non-zero: value of invalid checksum
 *
 */
static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb,
                                                       bool complete,
                                                       __wsum psum)
{
        __sum16 folded;

        /* Fast path: the device-supplied sum plus pseudo header folds to 0. */
        if (skb->ip_summed == CHECKSUM_COMPLETE &&
            !csum_fold(csum_add(psum, skb->csum))) {
                skb->csum_valid = 1;
                return 0;
        }

        skb->csum = psum;

        /* Large packets are only checksummed here when @complete asks for it. */
        if (!complete && skb->len > CHECKSUM_BREAK)
                return 0;

        folded = __skb_checksum_complete(skb);
        skb->csum_valid = !folded;
        return folded;
}

/* Pseudo-header callback that contributes nothing: @skb and @proto are
 * ignored and 0 is returned.  skb_checksum_simple_validate() passes it as
 * the compute_pseudo argument.
 */
static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
{
        return 0;
}

/* Perform checksum validate (init). Note that this is a macro since we only
 * want to calculate the pseudo header, which is an input function, if
 * necessary.  First we try to validate without any computation (checksum
 * unnecessary); only if that fails is compute_pseudo(skb, proto) called and
 * the result handed to __skb_checksum_validate_complete().
 *
 * csum_valid is cleared before anything else.  The @skb argument is
 * expanded several times, so it must not have side effects; it is
 * parenthesized where it is dereferenced so that any pointer expression
 * can be passed safely.
 *
 * Return values:
 *   0: checksum is validated, or validation is left to
 *      skb_checksum_complete
 *   non-zero: value of invalid checksum
 */
#define __skb_checksum_validate(skb, proto, complete,                        \
                                zero_okay, check, compute_pseudo)        \
({                                                                        \
        __sum16 __ret = 0;                                                \
        (skb)->csum_valid = 0;                                                \
        if (__skb_checksum_validate_needed(skb, zero_okay, check))        \
                __ret = __skb_checksum_validate_complete(skb,                \
                                complete, compute_pseudo(skb, proto));        \
        __ret;                                                                \
})

/* Convenience wrappers around __skb_checksum_validate(); they differ only
 * in the 'complete' and 'zero_okay' flags they pass.
 */

/* complete = false, zero_okay = false, check = 0. */
#define skb_checksum_init(skb, proto, compute_pseudo)                        \
        __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo)

/* complete = false, zero_okay = true; a zero @check is accepted. */
#define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo)        \
        __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo)

/* complete = true, zero_okay = false, check = 0. */
#define skb_checksum_validate(skb, proto, compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo)

/* complete = true, zero_okay = true; a zero @check is accepted. */
#define skb_checksum_validate_zero_check(skb, proto, check,                \
                                         compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo)

/* complete = true with proto 0 and null_compute_pseudo, i.e. no pseudo
 * header contribution.
 */
#define skb_checksum_simple_validate(skb)                                \
        __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo)

static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
{
        return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid);
}

/* Switch @skb to CHECKSUM_COMPLETE, storing the bitwise complement of
 * @pseudo in skb->csum.
 */
static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo)
{
        skb->ip_summed = CHECKSUM_COMPLETE;
        skb->csum = ~pseudo;
}

/* Convert @skb to CHECKSUM_COMPLETE when __skb_checksum_convert_check()
 * allows it.  compute_pseudo(skb, proto) is only evaluated when the
 * conversion actually takes place.
 */
#define skb_checksum_try_convert(skb, proto, compute_pseudo)        \
do {                                                                        \
        if (__skb_checksum_convert_check(skb))                                \
                __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \
} while (0)

/* Set @skb up as CHECKSUM_PARTIAL: csum_start becomes the offset of
 * @ptr + @start from skb->head, and csum_offset becomes @offset - @start.
 */
static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
                                              u16 start, u16 offset)
{
        unsigned char *csum_begin = (unsigned char *)ptr + start;

        skb->csum_offset = offset - start;
        skb->csum_start = csum_begin - skb->head;
        skb->ip_summed = CHECKSUM_PARTIAL;
}

/* Update the skb and the packet to reflect a remote checksum offload
 * operation.
 *
 * Unless @nopartial is set, the work is handed to
 * skb_remcsum_adjust_partial().  Otherwise @ptr is the point from which
 * skb->csum is expected to start when ip_summed is CHECKSUM_COMPLETE.  If
 * checksum complete has to be created here, skb_postpull_rcsum() is applied
 * so that skb->csum starts at @ptr.  The packet is then adjusted through
 * remcsum_adjust() and the returned delta is added to skb->csum.
 */
static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
                                       int start, int offset, bool nopartial)
{
        __wsum delta;

        if (nopartial) {
                if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
                        __skb_checksum_complete(skb);
                        skb_postpull_rcsum(skb, skb->data,
                                           ptr - (void *)skb->data);
                }

                delta = remcsum_adjust(ptr, skb->csum, start, offset);

                /* The packet was changed, so skb->csum must follow. */
                skb->csum = csum_add(skb->csum, delta);
                return;
        }

        skb_remcsum_adjust_partial(skb, ptr, start, offset);
}

/* Return the conntrack pointer kept in @skb, i.e. skb->_nfct masked with
 * NFCT_PTRMASK.  Always NULL when CONFIG_NF_CONNTRACK is not enabled.
 */
static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        return (void *)(skb->_nfct & NFCT_PTRMASK);
#else
        return NULL;
#endif
}

/* Return the raw, unmasked skb->_nfct word; 0 when CONFIG_NF_CONNTRACK is
 * not enabled.
 */
static inline unsigned long skb_get_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        return skb->_nfct;
#else
        return 0UL;
#endif
}

/* Store @nfct in skb->_nfct.  No reference counting is done here, and the
 * function is a no-op when CONFIG_NF_CONNTRACK is not enabled.
 */
static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        skb->_nfct = nfct;
#endif
}

#ifdef CONFIG_SKB_EXTENSIONS
/* Identifiers of the sk_buff extension types.  Each enumerator exists only
 * when its feature is configured, so the numeric values depend on the
 * kernel configuration.  SKB_EXT_NUM counts the available ids and sizes
 * the offset[] array of struct skb_ext.
 */
enum skb_ext_id {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        SKB_EXT_BRIDGE_NF,
#endif
#ifdef CONFIG_XFRM
        SKB_EXT_SEC_PATH,
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        TC_SKB_EXT,
#endif
#if IS_ENABLED(CONFIG_MPTCP)
        SKB_EXT_MPTCP,
#endif
        SKB_EXT_NUM, /* must be last */
};

/**
 *        struct skb_ext - sk_buff extensions
 *        @refcnt: 1 on allocation, deallocated on 0
 *        @offset: per-extension offset to add to the start of this structure
 *                 to obtain the extension address (see skb_ext_find()); a
 *                 zero entry means the extension is absent (see
 *                 __skb_ext_exist())
 *        @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
 *        @data: start of extension data, variable sized
 *
 *        Note: offsets/lengths are stored in chunks of 8 bytes; this allows
 *        the use of 'u8' types while allowing up to 2kb worth of extension
 *        data.
 */
struct skb_ext {
        refcount_t refcnt;
        u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */
        u8 chunks;                /* also in chunks of 8 bytes */
        char data[] __aligned(8);
};

/* Out-of-line skb extension helpers; only their prototypes are given in
 * this header.  The inline wrappers that follow call __skb_ext_put() and
 * __skb_ext_del() after checking skb->active_extensions.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags);
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
                    struct skb_ext *ext);
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_put(struct skb_ext *ext);

/* Hand skb->extensions to __skb_ext_put() if @skb has any active
 * extension; otherwise do nothing.  skb->active_extensions itself is not
 * modified here.
 */
static inline void skb_ext_put(struct sk_buff *skb)
{
        if (!skb->active_extensions)
                return;

        __skb_ext_put(skb->extensions);
}

/* Make @dst share the extensions of @src.  The active_extensions mask is
 * always copied; when it is non-zero the refcount of src's skb_ext is
 * incremented and dst->extensions is pointed at it.  Whatever @dst held
 * before is not released here.
 */
static inline void __skb_ext_copy(struct sk_buff *dst,
                                  const struct sk_buff *src)
{
        struct skb_ext *shared;

        dst->active_extensions = src->active_extensions;
        if (!src->active_extensions)
                return;

        shared = src->extensions;
        refcount_inc(&shared->refcnt);
        dst->extensions = shared;
}

/* Replace the extensions of @dst with those of @src: first release what
 * @dst currently holds via skb_ext_put(), then share src's extensions via
 * __skb_ext_copy().
 */
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
{
        skb_ext_put(dst);
        __skb_ext_copy(dst, src);
}

/* An extension is present in @ext when its offset[] entry is non-zero. */
static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
{
        return ext->offset[i] != 0;
}

/* Test bit @id of skb->active_extensions. */
static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
{
        return (skb->active_extensions & (1 << id)) != 0;
}

/* Delete extension @id from @skb through __skb_ext_del(), but only if
 * skb_ext_exist() reports it as active.
 */
static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
        if (!skb_ext_exist(skb, id))
                return;

        __skb_ext_del(skb, id);
}

/* Look up extension @id of @skb.
 *
 * Returns NULL when the extension is not active.  Otherwise returns the
 * address of the skb_ext structure plus offset[id] scaled by 8 (offsets
 * are kept in 8-byte chunks).
 */
static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext;

        if (!skb_ext_exist(skb, id))
                return NULL;

        ext = skb->extensions;
        return (void *)ext + (ext->offset[id] << 3);
}

/* Release the extensions of @skb, if any, and clear the active_extensions
 * mask.  Having active extensions is hinted as the unlikely case.
 */
static inline void skb_ext_reset(struct sk_buff *skb)
{
        if (unlikely(skb->active_extensions)) {
                __skb_ext_put(skb->extensions);
                skb->active_extensions = 0;
        }
}

/* True when any bit of skb->active_extensions is set (hinted unlikely). */
static inline bool skb_has_extensions(struct sk_buff *skb)
{
        return unlikely(skb->active_extensions);
}
#else
/* Stubs used when CONFIG_SKB_EXTENSIONS is not set: all operations are
 * no-ops and skb_has_extensions() is always false.
 */
static inline void skb_ext_put(struct sk_buff *skb) {}
static inline void skb_ext_reset(struct sk_buff *skb) {}
static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
static inline bool skb_has_extensions(struct sk_buff *skb) { return false; }
#endif /* CONFIG_SKB_EXTENSIONS */

/* Detach conntrack state from @skb: nf_conntrack_put() is called on
 * skb_nfct(skb) and skb->_nfct is then zeroed.  Compiles to nothing when
 * conntrack is neither built in nor built as a module.
 */
static inline void nf_reset_ct(struct sk_buff *skb)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put(skb_nfct(skb));
        skb->_nfct = 0;
#endif
}

/* Clear skb->nf_trace.  The field is only touched when the xt TRACE target
 * or nf_tables is enabled.
 */
static inline void nf_reset_trace(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        skb->nf_trace = 0;
#endif
}

/* Clear skb->ipvs_property; a no-op unless CONFIG_IP_VS is enabled. */
static inline void ipvs_reset(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IP_VS)
        skb->ipvs_property = 0;
#endif
}

/* Copy netfilter state from @src to @dst.
 *
 * With conntrack available, dst->_nfct is overwritten with src's value and
 * nf_conntrack_get() is called on src's conntrack.  nf_trace is copied
 * only when @copy is true (and tracing support is configured).
 *
 * Note: This doesn't put any conntrack info in dst; nf_copy() does that
 * before calling this function.
 */
static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
                             bool copy)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        dst->_nfct = src->_nfct;
        nf_conntrack_get(skb_nfct(src));
#endif
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        if (copy)
                dst->nf_trace = src->nf_trace;
#endif
}

/* Replace the netfilter state of @dst with that of @src: the conntrack
 * currently referenced by @dst is put first, then __nf_copy() is called
 * with copy = true so that nf_trace is copied as well.
 */
static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put(skb_nfct(dst));
#endif
        __nf_copy(dst, src, true);
}

/* Security mark helpers.  With CONFIG_NETWORK_SECMARK they copy or zero
 * the skb->secmark field; without it both are empty stubs.
 */
#ifdef CONFIG_NETWORK_SECMARK
/* Copy the secmark of @from into @to. */
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{
        to->secmark = from->secmark;
}

/* Reset the secmark of @skb to 0. */
static inline void skb_init_secmark(struct sk_buff *skb)
{
        skb->secmark = 0;
}
#else
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{ }

static inline void skb_init_secmark(struct sk_buff *skb)
{ }
#endif

/* Non-zero when @skb carries the SKB_EXT_SEC_PATH extension; always 0
 * when CONFIG_XFRM is not set.
 */
static inline int secpath_exists(const struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        return skb_ext_exist(skb, SKB_EXT_SEC_PATH);
#else
        return 0;
#endif
}

static inline bool skb_irq_freeable(const struct sk_buff *skb)
{
        return !skb->destructor &&
                !secpath_exists(skb) &&
                !skb_nfct(skb) &&
                !skb->_skb_refdst &&
                !skb_has_frag_list(skb);
}

/* Store @queue_mapping verbatim in skb->queue_mapping. */
static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
        skb->queue_mapping = queue_mapping;
}

/* Return skb->queue_mapping verbatim. */
static inline u16 skb_get_queue_mapping(const struct sk_buff *skb)
{
        return skb->queue_mapping;
}

/* Copy the queue_mapping of @from into @to. */
static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from)
{
        to->queue_mapping = from->queue_mapping;
}

/* Record the receive queue of @skb.  The value is stored as @rx_queue + 1
 * so that a queue_mapping of 0 can mean "nothing recorded" (see
 * skb_rx_queue_recorded()).
 */
static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
{
        skb->queue_mapping = rx_queue + 1;
}

/* Return the recorded receive queue, undoing the +1 applied by
 * skb_record_rx_queue().  If queue_mapping is 0 (nothing recorded) the
 * u16 result wraps around, so check skb_rx_queue_recorded() first.
 */
static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
{
        return skb->queue_mapping - 1;
}

/* True when skb->queue_mapping is non-zero, i.e. a receive queue has been
 * stored by skb_record_rx_queue().
 */
static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
{
        return !!skb->queue_mapping;
}

/* Store @val in skb->dst_pending_confirm. */
static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
{
        skb->dst_pending_confirm = val;
}

/* True when skb->dst_pending_confirm is non-zero. */
static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
{
        return !!skb->dst_pending_confirm;
}

/* Return the SKB_EXT_SEC_PATH extension of @skb as found by
 * skb_ext_find(), or NULL when it is absent.  Always NULL when CONFIG_XFRM
 * is not set.
 */
static inline struct sec_path *skb_sec_path(const struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        return skb_ext_find(skb, SKB_EXT_SEC_PATH);
#else
        return NULL;
#endif
}

/* GSO control block.
 *
 * Keeps track of mac header offset relative to skb->head.
 * It is useful for TSO of Tunneling protocol. e.g. GRE.
 * For non-tunnel skb it points to skb_mac_header() and for
 * tunnel skb it points to outer mac header.
 * Keeps track of level of encapsulation of network headers.
 *
 * csum and csum_start are maintained by gso_reset_checksum() and
 * gso_make_checksum(); csum_start is an offset from skb->head.
 */
struct skb_gso_cb {
        /* mac_offset and data_offset share the same storage. */
        union {
                int        mac_offset;
                int        data_offset;
        };
        int        encap_level;
        __wsum        csum;
        __u16        csum_start;
};
/* The GSO control block is located at offset SKB_GSO_CB_OFFSET within
 * skb->cb; SKB_GSO_CB() returns a pointer to it.
 */
#define SKB_GSO_CB_OFFSET        32
#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET))

/* Return the difference between the mac header offset of @inner_skb
 * (measured from skb->head) and the mac_offset recorded in its GSO control
 * block.  Per the struct skb_gso_cb description, mac_offset refers to the
 * outer mac header for tunnel skbs.
 */
static inline int skb_tnl_header_len(const struct sk_buff *inner_skb)
{
        return (skb_mac_header(inner_skb) - inner_skb->head) -
                SKB_GSO_CB(inner_skb)->mac_offset;
}

/* Expand the head of @skb by @extra bytes of headroom using
 * pskb_expand_head() with GFP_ATOMIC, and shift the mac_offset stored in
 * the GSO control block by however much the headroom changed.
 *
 * Returns 0 on success or the error returned by pskb_expand_head(), in
 * which case mac_offset is left untouched.
 */
static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
{
        int headroom_before = skb_headroom(skb);
        int headroom_after;
        int err;

        err = pskb_expand_head(skb, extra, 0, GFP_ATOMIC);
        if (err)
                return err;

        headroom_after = skb_headroom(skb);
        SKB_GSO_CB(skb)->mac_offset += (headroom_after - headroom_before);
        return 0;
}

/* Restart GSO checksum tracking for @skb: store @res as the running
 * checksum and record the current checksum start as an offset from
 * skb->head.  Nothing is updated when remote checksum offload is enabled
 * on the skb.
 */
static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
{
        if (!skb->remcsum_offload) {
                SKB_GSO_CB(skb)->csum = res;
                SKB_GSO_CB(skb)->csum_start =
                        skb_checksum_start(skb) - skb->head;
        }
}

/* Compute the checksum for a gso segment.
 *
 * The region from the transport header up to SKB_GSO_CB(skb)->csum_start
 * is summed on top of the checksum already stored in the control block
 * (which covers csum_start to the end of the packet), and the folded result
 * is returned.
 *
 * The control block is then rewritten for the resulting packet: csum_start
 * now refers to the transport header and csum is set to @res (normally
 * zero or the complement of a pseudo header checksum).
 */
static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res)
{
        unsigned char *th = skb_transport_header(skb);
        int head_len = (skb->head + SKB_GSO_CB(skb)->csum_start) - th;
        __wsum tail_sum = SKB_GSO_CB(skb)->csum;

        /* Both old values were read above; now install the new state. */
        SKB_GSO_CB(skb)->csum_start = th - skb->head;
        SKB_GSO_CB(skb)->csum = res;

        return csum_fold(csum_partial(th, head_len, tail_sum));
}

/* An skb is a GSO skb when the gso_size in its shared info is non-zero. */
static inline bool skb_is_gso(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->gso_size != 0;
}

/* True when SKB_GSO_TCPV6 is set in gso_type.
 * Note: Should be called only if skb_is_gso(skb) is true.
 */
static inline bool skb_is_gso_v6(const struct sk_buff *skb)
{
        return !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6);
}

/* True when SKB_GSO_SCTP is set in gso_type.
 * Note: Should be called only if skb_is_gso(skb) is true.
 */
static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
{
        return !!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP);
}

/* True when either SKB_GSO_TCPV4 or SKB_GSO_TCPV6 is set in gso_type.
 * Note: Should be called only if skb_is_gso(skb) is true.
 */
static inline bool skb_is_gso_tcp(const struct sk_buff *skb)
{
        return !!(skb_shinfo(skb)->gso_type &
                  (SKB_GSO_TCPV4 | SKB_GSO_TCPV6));
}

/* Clear all GSO state in the shared info of @skb: gso_size, gso_segs and
 * gso_type are set to 0, after which skb_is_gso() is false.
 */
static inline void skb_gso_reset(struct sk_buff *skb)
{
        skb_shinfo(skb)->gso_size = 0;
        skb_shinfo(skb)->gso_segs = 0;
        skb_shinfo(skb)->gso_type = 0;
}

/* Add @increment to shinfo->gso_size.  A gso_size equal to GSO_BY_FRAGS
 * triggers a one-time warning and is left unchanged.
 */
static inline void skb_increase_gso_size(struct skb_shared_info *shinfo,
                                         u16 increment)
{
        if (!WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                shinfo->gso_size += increment;
}

/* Subtract @decrement from shinfo->gso_size.  A gso_size equal to
 * GSO_BY_FRAGS triggers a one-time warning and is left unchanged.
 */
static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo,
                                         u16 decrement)
{
        if (!WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                shinfo->gso_size -= decrement;
}

void __skb_warn_lro_forwarding(const struct sk_buff *skb);

/* Detect an LRO-produced skb and warn about it.
 *
 * LRO sets gso_size but not gso_type, whereas if GSO is really wanted then
 * gso_type will be set.  A non-linear skb with a non-zero gso_size and a
 * zero gso_type is therefore reported via __skb_warn_lro_forwarding().
 *
 * Returns true when the warning was issued, false otherwise.
 */
static inline bool skb_warn_if_lro(const struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);
        bool lro_suspected;

        lro_suspected = skb_is_nonlinear(skb) && shinfo->gso_size != 0 &&
                        unlikely(shinfo->gso_type == 0);
        if (lro_suspected)
                __skb_warn_lro_forwarding(skb);

        return lro_suspected;
}

/* CHECKSUM_COMPLETE is not supported on this path: downgrade it to
 * CHECKSUM_NONE.  Any other ip_summed value is kept as is.
 */
static inline void skb_forward_csum(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        skb->ip_summed = CHECKSUM_NONE;
}

/**
 * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE
 * @skb: skb to check
 *
 * Fresh skbs have their ip_summed set to CHECKSUM_NONE.
 * Instead of forcing ip_summed to CHECKSUM_NONE, we can
 * use this helper to document places where we make this assertion.
 *
 * The check (a BUG_ON) is only compiled in when DEBUG is defined;
 * otherwise this function does nothing.
 */
static inline void skb_checksum_none_assert(const struct sk_buff *skb)
{
#ifdef DEBUG
        BUG_ON(skb->ip_summed != CHECKSUM_NONE);
#endif
}

/* Checksum helpers implemented out of line; only their prototypes are
 * given in this header.  @skb_chkf is a callback that takes an skb and
 * returns a __sum16.
 */
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);

int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
                                     unsigned int transport_len,
                                     __sum16(*skb_chkf)(struct sk_buff *skb));

/**
 * skb_head_is_locked - Determine if the skb->head is locked down
 * @skb: skb to check
 *
 * The head on skbs build around a head frag can be removed if they are
 * not cloned.  This function returns true if the skb head is locked down
 * due to either being allocated via kmalloc, or by being a clone with
 * multiple references to the head.
 */
static inline bool skb_head_is_locked(const struct sk_buff *skb)
{
        return !skb->head_frag || skb_cloned(skb);
}

/* Local Checksum Offload.
 * Compute the outer checksum on the assumption that the inner checksum
 * will be offloaded later.
 * See Documentation/networking/checksum-offloads.rst for an explanation of
 * how this works.
 * Before calling, fill in the outer checksum adjustment (e.g. with the sum
 * of the outer pseudo-header) and make sure the inner checksum field lies
 * in the linear data area.
 */
static inline __wsum lco_csum(struct sk_buff *skb)
{
        unsigned char *inner_start = skb_checksum_start(skb);
        unsigned char *outer_l4 = skb_transport_header(skb);
        /* Seed: complement of the inner checksum adjustment. */
        __wsum seed = ~csum_unfold(*(__force __sum16 *)(inner_start +
                                                        skb->csum_offset));

        /* Sum our own headers (including the outer checksum adjustment the
         * caller filled in) on top of the seed.
         */
        return csum_partial(outer_l4, inner_start - outer_l4, seed);
}

/* Return the skb->redirected flag; always false when CONFIG_NET_REDIRECT
 * is not set.
 */
static inline bool skb_is_redirected(const struct sk_buff *skb)
{
#ifdef CONFIG_NET_REDIRECT
        return skb->redirected;
#else
        return false;
#endif
}

/* Mark @skb as redirected and remember whether the redirect originated
 * from ingress.  For an ingress redirect skb->tstamp is also zeroed.
 * No-op when CONFIG_NET_REDIRECT is not set.
 */
static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
{
#ifdef CONFIG_NET_REDIRECT
        skb->from_ingress = from_ingress;
        skb->redirected = 1;
        if (from_ingress)
                skb->tstamp = 0;
#endif
}

/* Clear the skb->redirected flag (from_ingress is left as is).  No-op when
 * CONFIG_NET_REDIRECT is not set.
 */
static inline void skb_reset_redirect(struct sk_buff *skb)
{
#ifdef CONFIG_NET_REDIRECT
        skb->redirected = 0;
#endif
}

/* Return the skb->csum_not_inet flag. */
static inline bool skb_csum_is_sctp(struct sk_buff *skb)
{
        return skb->csum_not_inet;
}

/* Store @kcov_handle in skb->kcov_handle; a no-op without CONFIG_KCOV. */
static inline void skb_set_kcov_handle(struct sk_buff *skb,
                                       const u64 kcov_handle)
{
#ifdef CONFIG_KCOV
        skb->kcov_handle = kcov_handle;
#endif
}

/* Return skb->kcov_handle, or 0 when CONFIG_KCOV is not set. */
static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
{
#ifdef CONFIG_KCOV
        return skb->kcov_handle;
#else
        return 0;
#endif
}

#endif        /* __KERNEL__ */
#endif        /* _LINUX_SKBUFF_H */






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Linux Socket Filter Data Structures
 */
#ifndef __LINUX_FILTER_H__
#define __LINUX_FILTER_H__

#include <stdarg.h>

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/compat.h>
#include <linux/skbuff.h>
#include <linux/linkage.h>
#include <linux/printk.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/set_memory.h>
#include <linux/kallsyms.h>
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <linux/sockptr.h>
#include <crypto/sha.h>

#include <net/sch_generic.h>

#include <asm/byteorder.h>
#include <uapi/linux/filter.h>
#include <uapi/linux/bpf.h>

/* Forward declarations of structure types; their full definitions are not
 * provided at this point in the header.
 */
struct sk_buff;
struct sock;
struct seccomp_data;
struct bpf_prog_aux;
struct xdp_rxq_info;
struct xdp_buff;
struct sock_reuseport;
struct ctl_table;
struct ctl_table_header;

/* ArgX, context and stack frame pointer register positions. Note,
 * Arg1, Arg2, Arg3, etc are used as argument mappings of function
 * calls in BPF_CALL instruction.
 *
 * Arguments map to BPF_REG_1..BPF_REG_5, the context to BPF_REG_6 and
 * the frame pointer to BPF_REG_10.
 */
#define BPF_REG_ARG1        BPF_REG_1
#define BPF_REG_ARG2        BPF_REG_2
#define BPF_REG_ARG3        BPF_REG_3
#define BPF_REG_ARG4        BPF_REG_4
#define BPF_REG_ARG5        BPF_REG_5
#define BPF_REG_CTX        BPF_REG_6
#define BPF_REG_FP        BPF_REG_10

/* Additional register mappings for converted user programs.
 * Note that BPF_REG_TMP is BPF_REG_2, the same register as BPF_REG_ARG2.
 */
#define BPF_REG_A        BPF_REG_0
#define BPF_REG_X        BPF_REG_7
#define BPF_REG_TMP        BPF_REG_2        /* scratch reg */
#define BPF_REG_D        BPF_REG_8        /* data, callee-saved */
#define BPF_REG_H        BPF_REG_9        /* hlen, callee-saved */

/* Kernel hidden auxiliary/helper register.  It takes index MAX_BPF_REG,
 * and MAX_BPF_EXT_REG (also used as MAX_BPF_JIT_REG) counts it by being
 * MAX_BPF_REG + 1.
 */
#define BPF_REG_AX                MAX_BPF_REG
#define MAX_BPF_EXT_REG                (MAX_BPF_REG + 1)
#define MAX_BPF_JIT_REG                MAX_BPF_EXT_REG

/* Otherwise unused opcode values repurposed as internal markers. */

/* Unused opcode to mark special call to bpf_tail_call() helper */
#define BPF_TAIL_CALL        0xf0

/* Unused opcode to mark special load instruction. Same value as BPF_ABS */
#define BPF_PROBE_MEM        0x20

/* Unused opcode to mark call to interpreter with arguments */
#define BPF_CALL_ARGS        0xe0

/* Unused opcode to mark speculation barrier for mitigating
 * Speculative Store Bypass
 */
#define BPF_NOSPEC        0xc0

/* As per nm, we expose JITed images as text (code) section symbols for
 * kallsyms, hence the 't' type character. That way, tools like perf can
 * find them to match addresses.
 */
#define BPF_SYM_ELF_TYPE        't'

/* A BPF program can access up to MAX_BPF_STACK (512) bytes of stack space. */
#define MAX_BPF_STACK        512

/* Helper macros for filter block array initializers. */

/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg
 *
 * Each macro expands to a struct bpf_insn compound literal whose code
 * combines the ALU class, BPF_OP(OP) and BPF_X (register source); off and
 * imm are zero.  BPF_ALU64_REG uses class BPF_ALU64, BPF_ALU32_REG uses
 * class BPF_ALU.
 */

#define BPF_ALU64_REG(OP, DST, SRC)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

#define BPF_ALU32_REG(OP, DST, SRC)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_OP(OP) | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32
 *
 * Same layout as the register variants, but the code uses BPF_K
 * (immediate source), src_reg is zero and imm carries IMM.
 */

#define BPF_ALU64_IMM(OP, DST, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

#define BPF_ALU32_IMM(OP, DST, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_OP(OP) | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Endianness conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu()
 *
 * TYPE is passed through BPF_SRC() into the opcode next to
 * BPF_ALU | BPF_END, and LEN is stored in the imm field.
 */

#define BPF_ENDIAN(TYPE, DST, LEN)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_END | BPF_SRC(TYPE),        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = LEN })

/* Short form of mov, dst_reg = src_reg
 *
 * Opcode is <class> | BPF_MOV | BPF_X with off and imm zero; the 64-bit
 * variant uses class BPF_ALU64, the 32-bit variant class BPF_ALU.
 */

#define BPF_MOV64_REG(DST, SRC)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

#define BPF_MOV32_REG(DST, SRC)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

/* Short form of mov, dst_reg = imm32
 *
 * Opcode is <class> | BPF_MOV | BPF_K; src_reg and off are zero and the
 * value to load is carried in imm.
 */

#define BPF_MOV64_IMM(DST, IMM)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

#define BPF_MOV32_IMM(DST, IMM)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Special form of mov32, used for doing explicit zero extension on dst.
 *
 * Encoded as a 32-bit register move (BPF_ALU | BPF_MOV | BPF_X) whose
 * source and destination are both DST and whose imm is 1; insn_is_zext()
 * recognises the instruction by that opcode/imm pair.
 */
#define BPF_ZEXT_REG(DST)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = DST,                                        \
                .off   = 0,                                        \
                .imm   = 1 })

static inline bool insn_is_zext(const struct bpf_insn *insn)
{
        return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1;
}

/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn.
 * It is BPF_LD_IMM64_RAW() with a src_reg of 0.
 */
#define BPF_LD_IMM64(DST, IMM)                                        \
        BPF_LD_IMM64_RAW(DST, 0, IMM)

/* Expands to TWO comma-separated struct bpf_insn compound literals, so it
 * occupies two slots of an instruction array initializer. The first carries
 * the opcode, DST, SRC and the low 32 bits of IMM; the second has a zero
 * opcode and carries the upper 32 bits of IMM in its imm field.
 */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LD | BPF_DW | BPF_IMM,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = (__u32) (IMM) }),                        \
        ((struct bpf_insn) {                                        \
                .code  = 0, /* zero is reserved opcode */        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = ((__u64) (IMM)) >> 32 })

/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd.
 * Built as BPF_LD_IMM64_RAW() with src_reg set to BPF_PSEUDO_MAP_FD and the
 * descriptor as the immediate, so it also expands to two instructions.
 */
#define BPF_LD_MAP_FD(DST, MAP_FD)                                \
        BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)

/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32
 *
 * TYPE is passed through BPF_SRC() to pick the source selector; both SRC
 * and IMM are stored unconditionally, regardless of TYPE.
 */

#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE),        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

#define BPF_MOV32_RAW(TYPE, DST, SRC, IMM)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_SRC(TYPE),        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Direct packet access, R0 = *(uint *) (skb->data + imm32)
 *
 * SIZE is folded into the opcode via BPF_SIZE(); both register fields are
 * zero and the packet offset is carried in imm.
 */

#define BPF_LD_ABS(SIZE, IMM)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS,        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32)
 *
 * Like BPF_LD_ABS but with mode BPF_IND and SRC stored in src_reg.
 */

#define BPF_LD_IND(SIZE, SRC, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LD | BPF_SIZE(SIZE) | BPF_IND,        \
                .dst_reg = 0,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Memory load, dst_reg = *(uint *) (src_reg + off16)
 *
 * Opcode is BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM; OFF goes into the off field
 * and imm is zero.
 */

#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Memory store, *(uint *) (dst_reg + off16) = src_reg
 *
 * Opcode is BPF_STX | BPF_SIZE(SIZE) | BPF_MEM; OFF goes into the off field
 * and imm is zero.
 */

#define BPF_STX_MEM(SIZE, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg
 *
 * Same field layout as BPF_STX_MEM but with mode BPF_XADD instead of
 * BPF_MEM.
 */

#define BPF_STX_XADD(SIZE, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Memory store, *(uint *) (dst_reg + off16) = imm32
 *
 * Opcode is BPF_ST | BPF_SIZE(SIZE) | BPF_MEM; src_reg is zero and the
 * value to store is carried in imm.
 */

#define BPF_ST_MEM(SIZE, DST, OFF, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM,        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })

/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16
 *
 * Opcode is BPF_JMP | BPF_OP(OP) | BPF_X; the branch displacement is stored
 * in off and imm is zero.
 */

#define BPF_JMP_REG(OP, DST, SRC, OFF)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_OP(OP) | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16
 *
 * Opcode is BPF_JMP | BPF_OP(OP) | BPF_K; src_reg is zero and the comparison
 * operand is carried in imm. Note the argument order: IMM precedes OFF.
 */

#define BPF_JMP_IMM(OP, DST, IMM, OFF)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_OP(OP) | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })

/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison.
 * The only encoding difference is the instruction class, BPF_JMP32.
 */

#define BPF_JMP32_REG(OP, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP32 | BPF_OP(OP) | BPF_X,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison.
 * The only encoding difference is the instruction class, BPF_JMP32.
 */

#define BPF_JMP32_IMM(OP, DST, IMM, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP32 | BPF_OP(OP) | BPF_K,        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })

/* Unconditional jumps, goto pc + off16
 *
 * Opcode is BPF_JMP | BPF_JA; only the off field is populated.
 */

#define BPF_JMP_A(OFF)                                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_JA,                        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Relative call
 *
 * Opcode is BPF_JMP | BPF_CALL with src_reg set to BPF_PSEUDO_CALL; the
 * target TGT is stored in imm.
 */

#define BPF_CALL_REL(TGT)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_CALL,                        \
                .dst_reg = 0,                                        \
                .src_reg = BPF_PSEUDO_CALL,                        \
                .off   = 0,                                        \
                .imm   = TGT })

/* Function call */

/* Cast x to a pointer to a function taking five u64 arguments and
 * returning u64.
 */
#define BPF_CAST_CALL(x)                                        \
                ((u64 (*)(u64, u64, u64, u64, u64))(x))

/* Call instruction whose imm is the difference between FUNC and
 * __bpf_call_base; src_reg is zero (contrast with BPF_CALL_REL, which sets
 * BPF_PSEUDO_CALL).
 */
#define BPF_EMIT_CALL(FUNC)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_CALL,                        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = ((FUNC) - __bpf_call_base) })

/* Raw code statement block
 *
 * Stores every field exactly as given, with no opcode composition.
 */

#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)                        \
        ((struct bpf_insn) {                                        \
                .code  = CODE,                                        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })

/* Program exit
 *
 * Opcode is BPF_JMP | BPF_EXIT; all other fields are zero.
 */

#define BPF_EXIT_INSN()                                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_JMP | BPF_EXIT,                        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

/* Speculation barrier
 *
 * Opcode is BPF_ST | BPF_NOSPEC; all other fields are zero.
 */

#define BPF_ST_NOSPEC()                                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ST | BPF_NOSPEC,                        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

/* Internal classic blocks for direct assignment
 *
 * These wrap the BPF_STMT()/BPF_JUMP() initializers in a struct sock_filter
 * compound literal, so the result can be used as an expression (e.g. on the
 * right-hand side of an assignment) rather than only as an initializer.
 */

#define __BPF_STMT(CODE, K)                                        \
        ((struct sock_filter) BPF_STMT(CODE, K))

#define __BPF_JUMP(CODE, K, JT, JF)                                \
        ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF))

/* Map an access width in bytes to the matching BPF size code.
 *
 * Evaluates to BPF_B, BPF_H, BPF_W or BPF_DW when @bytes equals the size of
 * u8, u16, u32 or u64 respectively, and to -EINVAL for any other width.
 *
 * The internal temporary is named __bpf_size so that a caller variable
 * called "bpf_size" can be passed without being shadowed, and @bytes is
 * parenthesised so compound expressions compare as a whole. @bytes may be
 * evaluated more than once and therefore must not have side effects.
 */
#define bytes_to_bpf_size(bytes)                                \
({                                                                \
        int __bpf_size = -EINVAL;                                \
                                                                \
        if ((bytes) == sizeof(u8))                                \
                __bpf_size = BPF_B;                                \
        else if ((bytes) == sizeof(u16))                        \
                __bpf_size = BPF_H;                                \
        else if ((bytes) == sizeof(u32))                        \
                __bpf_size = BPF_W;                                \
        else if ((bytes) == sizeof(u64))                        \
                __bpf_size = BPF_DW;                                \
                                                                \
        __bpf_size;                                                \
})

/* Map a BPF size code to the access width in bytes.
 *
 * Evaluates to sizeof(u8), sizeof(u16), sizeof(u32) or sizeof(u64) when
 * @bpf_size is BPF_B, BPF_H, BPF_W or BPF_DW respectively, and to -EINVAL
 * for any other value.
 *
 * The internal temporary is named __bytes so that a caller variable called
 * "bytes" can be passed without being shadowed, and @bpf_size is
 * parenthesised so compound expressions compare as a whole. @bpf_size may
 * be evaluated more than once and therefore must not have side effects.
 */
#define bpf_size_to_bytes(bpf_size)                                \
({                                                                \
        int __bytes = -EINVAL;                                        \
                                                                \
        if ((bpf_size) == BPF_B)                                \
                __bytes = sizeof(u8);                                \
        else if ((bpf_size) == BPF_H)                                \
                __bytes = sizeof(u16);                                \
        else if ((bpf_size) == BPF_W)                                \
                __bytes = sizeof(u32);                                \
        else if ((bpf_size) == BPF_DW)                                \
                __bytes = sizeof(u64);                                \
                                                                \
        __bytes;                                                \
})

/* Evaluate to the BPF size code for sizeof(type), as computed by
 * bytes_to_bpf_size(). BUILD_BUG_ON() rejects the build when the width has
 * no matching size code (negative result).
 */
#define BPF_SIZEOF(type)                                        \
        ({                                                        \
                const int __size = bytes_to_bpf_size(sizeof(type)); \
                BUILD_BUG_ON(__size < 0);                        \
                __size;                                                \
        })

/* Like BPF_SIZEOF(), but for the width of member @field of @type
 * (sizeof_field()). BUILD_BUG_ON() rejects widths without a size code.
 */
#define BPF_FIELD_SIZEOF(type, field)                                \
        ({                                                        \
                const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \
                BUILD_BUG_ON(__size < 0);                        \
                __size;                                                \
        })

/* Evaluate to the access width in bytes encoded in the size bits of
 * @insn->code (BPF_SIZE() fed to bpf_size_to_bytes()). This is a runtime
 * check: an unknown size triggers WARN_ON() and the negative value is
 * still returned to the caller.
 */
#define BPF_LDST_BYTES(insn)                                        \
        ({                                                        \
                const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \
                WARN_ON(__size < 0);                                \
                __size;                                                \
        })

/* __BPF_MAP_n(m, v, t1, a1, ..., tn, an) applies macro m to each of the
 * first n (type, name) pairs and joins the results with commas. With n == 0
 * it yields the placeholder v instead.
 */
#define __BPF_MAP_0(m, v, ...) v
#define __BPF_MAP_1(m, v, t, a, ...) m(t, a)
#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__)
#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__)
#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__)
#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__)

/* __BPF_REG_n maps the n supplied pairs and then appends __BPF_PAD(5 - n),
 * so the resulting list always has five entries.
 */
#define __BPF_REG_0(...) __BPF_PAD(5)
#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4)
#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3)
#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2)
#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1)
#define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__)

/* Dispatch on the argument count n by token pasting. */
#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__)
#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__)

#define __BPF_CAST(t, a)                                                       \
        (__force t)                                                               \
        (__force                                                               \
         typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long),      \
                                      (unsigned long)0, (t)0))) a
#define __BPF_V void
#define __BPF_N

#define __BPF_DECL_ARGS(t, a) t   a
#define __BPF_DECL_REGS(t, a) u64 a

#define __BPF_PAD(n)                                                               \
        __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2,       \
                  u64, __ur_3, u64, __ur_4, u64, __ur_5)

/* Define a helper "name" taking x typed arguments.
 *
 * The expansion:
 *  - forward-declares static __always_inline u64 ____name(<typed args>);
 *  - declares typedef btf_name, a pointer to a function with that typed
 *    signature;
 *  - declares and defines "attr u64 name(...)" with five u64 parameters
 *    (the x real ones plus padding from __BPF_REG), which casts each real
 *    argument with __BPF_CAST and forwards to ____name;
 *  - ends with the header of ____name, so the body that follows the macro
 *    invocation becomes the body of ____name.
 */
#define BPF_CALL_x(x, attr, name, ...)                                               \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__));   \
        typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__));    \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__))     \
        {                                                                       \
                return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\
        }                                                                       \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__))

/* Arity-specific front ends for BPF_CALL_x(). __NOATTR expands to nothing,
 * i.e. the generated wrapper gets no extra attribute.
 */
#define __NOATTR
#define BPF_CALL_0(name, ...)        BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_1(name, ...)        BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_2(name, ...)        BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_3(name, ...)        BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_4(name, ...)        BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_5(name, ...)        BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__)

/* One-argument variant whose generated wrapper is marked notrace. */
#define NOTRACE_BPF_CALL_1(name, ...)        BPF_CALL_x(1, notrace, name, __VA_ARGS__)

/* Expand to a "first ... last" range covering the byte offsets of MEMBER
 * within TYPE (presumably for use as a GNU case-range label; the expansion
 * is deliberately not parenthesised).
 */
#define bpf_ctx_range(TYPE, MEMBER)                                                \
        offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
/* Same, spanning from the start of MEMBER1 to the end of MEMBER2. */
#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2)                                \
        offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1
/* Range for a pointer-typed member: on 64-bit builds this is identical to
 * bpf_ctx_range(); otherwise the range is forced to span 8 bytes starting
 * at the member's offset.
 */
#if BITS_PER_LONG == 64
# define bpf_ctx_range_ptr(TYPE, MEMBER)                                        \
        offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
#else
# define bpf_ctx_range_ptr(TYPE, MEMBER)                                        \
        offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1
#endif /* BITS_PER_LONG == 64 */

/* Evaluate to offsetof(TYPE, MEMBER) and store SIZE through PTR_SIZE.
 * BUILD_BUG_ON() fails the build if SIZE does not match the member's
 * actual size.
 */
#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE)                                \
        ({                                                                        \
                BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE));                \
                *(PTR_SIZE) = (SIZE);                                                \
                offsetof(TYPE, MEMBER);                                                \
        })

/* A struct sock_filter is architecture independent.
 *
 * Compat layout of a filter program: the instruction array is referenced
 * through a compat_uptr_t rather than a native pointer.
 */
struct compat_sock_fprog {
        u16                len;        /* presumably the number of filter entries - confirm */
        compat_uptr_t        filter;        /* struct sock_filter * */
};

/* Filter program descriptor with a native pointer to its
 * struct sock_filter array. bpf_classic_proglen() computes the array's
 * byte size as len * sizeof(filter[0]).
 */
struct sock_fprog_kern {
        u16                        len;
        struct sock_filter        *filter;
};

/* Some arches need doubleword alignment for their instructions and/or data */
#define BPF_IMAGE_ALIGNMENT 8

/* Header placed in front of an image: a page count followed by the image
 * bytes, which are aligned to BPF_IMAGE_ALIGNMENT.
 */
struct bpf_binary_header {
        u32 pages;
        u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
};

/* In-kernel representation of a BPF program.
 *
 * The structure ends with two trailing arrays: a zero-length
 * struct sock_filter array (insns) immediately followed by a flexible
 * struct bpf_insn array (insnsi). bpf_prog_size() sizes allocations via
 * insns[], while __BPF_PROG_RUN() hands insnsi to the run function.
 */
struct bpf_prog {
        u16                        pages;                /* Number of allocated pages */
        u16                        jited:1,        /* Is our filter JIT'ed? */
                                jit_requested:1,/* archs need to JIT the prog */
                                gpl_compatible:1, /* Is filter GPL compatible? */
                                cb_access:1,        /* Is control block accessed? */
                                dst_needed:1,        /* Do we need dst entry? */
                                blinded:1,        /* Was blinded */
                                is_func:1,        /* program is a bpf function */
                                kprobe_override:1, /* Do we override a kprobe? */
                                has_callchain_buf:1, /* callchain buffer allocated? */
                                enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
                                call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
        enum bpf_prog_type        type;                /* Type of BPF program */
        enum bpf_attach_type        expected_attach_type; /* For some prog types */
        u32                        len;                /* Number of filter blocks */
        u32                        jited_len;        /* Size of jited insns in bytes */
        u8                        tag[BPF_TAG_SIZE];
        struct bpf_prog_aux        *aux;                /* Auxiliary fields */
        struct sock_fprog_kern        *orig_prog;        /* Original BPF program */
        /* Entry point invoked (through a dispatcher) by __BPF_PROG_RUN() */
        unsigned int                (*bpf_func)(const void *ctx,
                                            const struct bpf_insn *insn);
        /* Instructions for interpreter */
        struct sock_filter        insns[0];
        struct bpf_insn                insnsi[];
};

/* Reference-counted holder of a BPF program pointer. The embedded rcu_head
 * suggests the object is released via RCU - confirm at the free site.
 */
struct sk_filter {
        refcount_t        refcnt;
        struct rcu_head        rcu;
        struct bpf_prog        *prog;
};

/* Static key tested by __BPF_PROG_RUN() to decide whether per-program run
 * statistics (count and elapsed time) are collected.
 */
DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

/* Run @prog on @ctx through dispatcher @dfunc and evaluate to the u32
 * result.
 *
 * @dfunc is invoked as dfunc(ctx, prog->insnsi, prog->bpf_func). When
 * bpf_stats_enabled_key is on, the run is timed with sched_clock() and the
 * calling CPU's prog->aux->stats entry gets its cnt and nsecs updated inside
 * a u64_stats update section. cant_migrate() asserts the calling-context
 * requirement before anything runs.
 *
 * @prog is parenthesised at every use so that pointer expressions (e.g.
 * "base + i") expand correctly; it is evaluated more than once and
 * therefore must not have side effects.
 */
#define __BPF_PROG_RUN(prog, ctx, dfunc)        ({                        \
        u32 __ret;                                                        \
        cant_migrate();                                                        \
        if (static_branch_unlikely(&bpf_stats_enabled_key)) {                \
                struct bpf_prog_stats *__stats;                                \
                u64 __start = sched_clock();                                \
                __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);        \
                __stats = this_cpu_ptr((prog)->aux->stats);                \
                u64_stats_update_begin(&__stats->syncp);                \
                __stats->cnt++;                                                \
                __stats->nsecs += sched_clock() - __start;                \
                u64_stats_update_end(&__stats->syncp);                        \
        } else {                                                        \
                __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);        \
        }                                                                \
        __ret; })

/* Run @prog on @ctx using bpf_dispatcher_nop_func as the dispatcher; see
 * __BPF_PROG_RUN() for the statistics handling and calling-context check.
 */
#define BPF_PROG_RUN(prog, ctx)                                                \
        __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func)

/*
 * Run @prog on @ctx from preemptible (and therefore migratable) context,
 * keeping the whole execution on a single CPU.
 *
 * migrate_disable()/migrate_enable() are used explicitly to document that
 * running a BPF program needs no reentrancy protection against another
 * BPF program invoked from a preempting task.
 *
 * On kernels without RT enabled, migrate_disable()/migrate_enable() map to
 * preempt_disable()/preempt_enable(), so preemption is disabled as well.
 *
 * Returns the program's u32 result.
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
                                          const void *ctx)
{
        u32 verdict;

        migrate_disable();
        verdict = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func);
        migrate_enable();

        return verdict;
}

/* Size of the skb control-block scratch area handled by bpf_skb_cb() and
 * the save/clear run helpers.
 */
#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN

/* View of skb->cb[] used by the data-pointer helpers: the qdisc control
 * block followed by the cached data_meta and data_end pointers.
 */
struct bpf_skb_data_end {
        struct qdisc_skb_cb qdisc_cb;
        void *data_meta;
        void *data_end;
};

/* Next-hop parameters: a family selector plus either an IPv4 or an IPv6
 * next-hop address sharing the same storage (presumably nh_family tells
 * which union member is valid - confirm against the users).
 */
struct bpf_nh_params {
        u32 nh_family;
        union {
                u32 ipv4_nh;
                struct in6_addr ipv6_nh;
        };
};

/* Redirect state; one instance exists per CPU (see the DECLARE_PER_CPU
 * declaration of bpf_redirect_info). kern_flags takes the BPF_RI_F_*
 * flags.
 */
struct bpf_redirect_info {
        u32 flags;
        u32 tgt_index;
        void *tgt_value;
        struct bpf_map *map;
        u32 kern_flags;
        struct bpf_nh_params nh;
};

/* Per-CPU instance of the redirect state. */
DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);

/* flags for bpf_redirect_info kern_flags */
#define BPF_RI_F_RF_NO_DIRECT        BIT(0)        /* no napi_direct on return_frame */

/* Cache the bounds of the linear packet data in skb->cb[] for programs
 * that access packet data directly (cls_bpf, act_bpf, lwt, ...):
 * data_end is set to skb->data plus the head length, data_meta to
 * skb->data minus the metadata length.
 *
 * Subsystems allowing direct data access must (!) make sure the cb[] area
 * is writable when the BPF program is invoked; otherwise cb[] has to be
 * saved and restored around the run.
 */
static inline void bpf_compute_data_pointers(struct sk_buff *skb)
{
        struct bpf_skb_data_end *bounds = (struct bpf_skb_data_end *)skb->cb;

        /* The overlay must fit inside the control block. */
        BUILD_BUG_ON(sizeof(struct bpf_skb_data_end) >
                     sizeof_field(struct sk_buff, cb));

        bounds->data_end  = skb->data + skb_headlen(skb);
        bounds->data_meta = skb->data - skb_metadata_len(skb);
}

/* Similar to bpf_compute_data_pointers(), but only data_end is recomputed
 * and its previous value is handed back through @saved_data_end so that
 * bpf_restore_data_end() can put it back later. data_meta is left alone.
 */
static inline void bpf_compute_and_save_data_end(struct sk_buff *skb,
                                                 void **saved_data_end)
{
        struct bpf_skb_data_end *bounds = (struct bpf_skb_data_end *)skb->cb;
        void *previous_end = bounds->data_end;

        *saved_data_end = previous_end;
        bounds->data_end = skb->data + skb_headlen(skb);
}

/* Put back the data_end value previously saved by
 * bpf_compute_and_save_data_end().
 */
static inline void bpf_restore_data_end(struct sk_buff *skb,
                                        void *saved_data_end)
{
        ((struct bpf_skb_data_end *)skb->cb)->data_end = saved_data_end;
}

static inline u8 *bpf_skb_cb(struct sk_buff *skb)
{
        /* eBPF programs may read/write skb->cb[] area to transfer meta
         * data between tail calls. Since this also needs to work with
         * tc, that scratch memory is mapped to qdisc_skb_cb's data area.
         *
         * In some socket filter cases, the cb unfortunately needs to be
         * saved/restored so that protocol specific skb->cb[] data won't
         * be lost. In any case, due to unpriviledged eBPF programs
         * attached to sockets, we need to clear the bpf_skb_cb() area
         * to not leak previous contents to user space.
         */
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN);
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) !=
                     sizeof_field(struct qdisc_skb_cb, data));

        return qdisc_skb_cb(skb)->data;
}

/* Run @prog on @skb. When the program accesses the control block
 * (prog->cb_access), the BPF_SKB_CB_LEN scratch bytes are backed up and
 * zeroed before the run and restored afterwards.
 *
 * Must be invoked with migration disabled.
 */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                         struct sk_buff *skb)
{
        u8 backup[BPF_SKB_CB_LEN];
        u8 *scratch = bpf_skb_cb(skb);
        u32 verdict;

        if (unlikely(prog->cb_access)) {
                /* Stash the current contents, then hand the program zeroes. */
                memcpy(backup, scratch, BPF_SKB_CB_LEN);
                memset(scratch, 0, BPF_SKB_CB_LEN);
        }

        verdict = BPF_PROG_RUN(prog, skb);

        if (unlikely(prog->cb_access))
                memcpy(scratch, backup, BPF_SKB_CB_LEN);

        return verdict;
}

/* Migration-safe wrapper: disables migration around
 * __bpf_prog_run_save_cb() and returns its result.
 */
static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                       struct sk_buff *skb)
{
        u32 verdict;

        migrate_disable();
        verdict = __bpf_prog_run_save_cb(prog, skb);
        migrate_enable();

        return verdict;
}

/* Run @prog on @skb pinned to the current CPU. When the program accesses
 * the control block, the scratch area is zeroed first; unlike
 * bpf_prog_run_save_cb(), the previous contents are not restored.
 */
static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
                                        struct sk_buff *skb)
{
        u8 *scratch = bpf_skb_cb(skb);

        if (unlikely(prog->cb_access))
                memset(scratch, 0, BPF_SKB_CB_LEN);

        return bpf_prog_run_pin_on_cpu(prog, skb);
}

/* Declare the "xdp" dispatcher referenced as BPF_DISPATCHER_FUNC(xdp) by
 * bpf_prog_run_xdp().
 */
DECLARE_BPF_DISPATCHER(xdp)

/* Run XDP program @prog on @xdp through the xdp dispatcher and return its
 * u32 result.
 */
static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
                                            struct xdp_buff *xdp)
{
        u32 act;

        /* The caller must hold rcu_read_lock() (!); otherwise the program
         * could be released while it is still running, or map elements
         * could be freed early while they still have concurrent users.
         * The XDP fastpath already takes rcu_read_lock() when fetching the
         * program, so taking it here is no longer necessary.
         */
        act = __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));

        return act;
}

/* Defined outside this header; judging by the name and parameters it
 * switches the XDP dispatcher from @prev_prog to @prog - confirm at the
 * definition.
 */
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);

/* Size in bytes of @prog's instruction array: prog->len entries of
 * struct bpf_insn, truncated to u32.
 */
static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
{
        return sizeof(struct bpf_insn) * prog->len;
}

static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog)
{
        return round_up(bpf_prog_insn_size(prog) +
                        sizeof(__be64) + 1, SHA1_BLOCK_SIZE);
}

static inline unsigned int bpf_prog_size(unsigned int proglen)
{
        return max(sizeof(struct bpf_prog),
                   offsetof(struct bpf_prog, insns[proglen]));
}

static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
{
        /* When classic BPF programs have been loaded and the arch
         * does not have a classic BPF JIT (anymore), they have been
         * converted via bpf_migrate_filter() to eBPF and thus always
         * have an unspec program type.
         */
        return prog->type == BPF_PROG_TYPE_UNSPEC;
}

/* Clamp @size to the machine word size (sizeof(unsigned long)) when it is
 * a larger exact multiple of it; any other @size is returned unchanged.
 */
static inline u32 bpf_ctx_off_adjust_machine(u32 size)
{
        const u32 word = sizeof(unsigned long);
        const bool multi_word = size > word && size % word == 0;

        return multi_word ? word : size;
}

/* A narrow access is acceptable when @size does not exceed @size_default
 * and has at most one bit set. @off is not used by the check.
 */
static inline bool
bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
{
        if (size > size_default)
                return false;

        return !(size & (size - 1));
}

/* Byte position of a narrow access inside its @size_default wide field.
 *
 * The offset within the field is off masked with (size_default - 1). On
 * little-endian builds it is returned as is; otherwise it is mirrored to
 * size_default - (offset + size).
 */
static inline u8
bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
{
        const u8 within_field = off & (size_default - 1);

#ifdef __LITTLE_ENDIAN
        return within_field;
#else
        return size_default - (within_field + size);
#endif
}

/*
 * True when an access of @size bytes at offset @off is exactly one
 * __u64, is aligned to sizeof(__u64), and lies entirely inside @field
 * of @type.
 *
 * @off and @size are parenthesized so that compound expressions (for
 * example "base + delta") are not broken apart by operator precedence.
 * Note that @off is still evaluated more than once, so it must not
 * have side effects.
 */
#define bpf_ctx_wide_access_ok(off, size, type, field)                        \
        ((size) == sizeof(__u64) &&                                        \
        (off) >= offsetof(type, field) &&                                \
        (off) + sizeof(__u64) <= offsetofend(type, field) &&                \
        (off) % sizeof(__u64) == 0)

/*
 * fprog->len multiplied by the size of one fprog->filter element.
 * @fprog is parenthesized so that pointer expressions such as "&prog"
 * expand correctly; it is evaluated more than once only inside sizeof.
 */
#define bpf_classic_proglen(fprog) ((fprog)->len * sizeof((fprog)->filter[0]))

/*
 * For a program that is not JITed, call set_vm_flush_reset_perms() and
 * set_memory_ro() on the memory holding @fp. The body is compiled out
 * entirely when CONFIG_BPF_JIT_ALWAYS_ON is defined.
 */
static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
        if (fp->jited)
                return;

        set_vm_flush_reset_perms(fp);
        set_memory_ro((unsigned long)fp, fp->pages);
#endif
}

/*
 * Apply set_vm_flush_reset_perms(), then set_memory_ro() and
 * set_memory_x() to the JIT binary starting at @hdr. The return values
 * of the set_memory_*() calls are not checked here.
 */
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
        set_vm_flush_reset_perms(hdr);
        /* Both calls start at the header address and pass hdr->pages. */
        set_memory_ro((unsigned long)hdr, hdr->pages);
        set_memory_x((unsigned long)hdr, hdr->pages);
}

/*
 * Returns the address of fp->bpf_func masked with PAGE_MASK, typed as
 * a pointer to the binary header.
 */
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
        return (void *)((unsigned long)fp->bpf_func & PAGE_MASK);
}

int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
/* Calls sk_filter_trim_cap() with @cap fixed to 1 and returns its result. */
static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
        return sk_filter_trim_cap(sk, skb, 1);
}

struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
void bpf_prog_free(struct bpf_prog *fp);

bool bpf_opcode_in_insntable(u8 code);

void bpf_prog_free_linfo(struct bpf_prog *prog);
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
                               const u32 *insn_to_jit_off);
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);

struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
                                  gfp_t gfp_extra_flags);
void __bpf_prog_free(struct bpf_prog *fp);

/* Frees @fp by forwarding to __bpf_prog_free(); nothing else is done here. */
static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
{
        __bpf_prog_free(fp);
}

typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter,
                                       unsigned int flen);

int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
                              bpf_aux_classic_check_t trans, bool save_orig);
void bpf_prog_destroy(struct bpf_prog *fp);

int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
void sk_reuseport_prog_free(struct bpf_prog *prog);
int sk_detach_filter(struct sock *sk);
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);

bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);

u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/*
 * __bpf_call_base cast (via void *) to a function pointer type that
 * takes one extra trailing "const struct bpf_insn *" argument.
 */
#define __bpf_call_base_args \
        ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
         (void *)__bpf_call_base)

struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
bool bpf_jit_needs_zext(void);
bool bpf_helper_changes_pkt_data(void *func);

/* Returns kallsyms_show_value(@cred). */
static inline bool bpf_dump_raw_ok(const struct cred *cred)
{
        /* Reconstruction of call-sites is dependent on kallsyms,
         * thus raw dumps are subject to the same restriction.
         */
        return kallsyms_show_value(cred);
}

struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len);
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);

void bpf_clear_redirect_map(struct bpf_map *map);

static inline bool xdp_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);

        return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
}

/* Sets BPF_RI_F_RF_NO_DIRECT in kern_flags of this CPU's bpf_redirect_info. */
static inline void xdp_set_return_frame_no_direct(void)
{
        struct bpf_redirect_info *info = this_cpu_ptr(&bpf_redirect_info);

        info->kern_flags = info->kern_flags | BPF_RI_F_RF_NO_DIRECT;
}

/* Clears BPF_RI_F_RF_NO_DIRECT in kern_flags of this CPU's bpf_redirect_info. */
static inline void xdp_clear_return_frame_no_direct(void)
{
        struct bpf_redirect_info *info = this_cpu_ptr(&bpf_redirect_info);

        info->kern_flags = info->kern_flags & ~BPF_RI_F_RF_NO_DIRECT;
}

/*
 * Check whether a packet of @pktlen bytes may be forwarded to @fwd.
 *
 * Returns -ENETDOWN when IFF_UP is not set in fwd->flags, -EMSGSIZE
 * when @pktlen exceeds mtu + hard_header_len + VLAN_HLEN, 0 otherwise.
 */
static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
                                 unsigned int pktlen)
{
        unsigned int max_len;

        if (unlikely(!(fwd->flags & IFF_UP)))
                return -ENETDOWN;

        max_len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;

        return pktlen > max_len ? -EMSGSIZE : 0;
}

/* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the
 * same cpu context. Further, for best results no more than a single map
 * for the do_redirect/do_flush pair should be used. This limitation is
 * because we only track one map and force a flush when the map changes.
 * This does not appear to be a real limitation for existing software.
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp, struct bpf_prog *prog);
int xdp_do_redirect(struct net_device *dev,
                    struct xdp_buff *xdp,
                    struct bpf_prog *prog);
void xdp_do_flush(void);

/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as
 * it is no longer only flushing maps. Keep this define for compatibility
 * until all drivers are updated - do not use xdp_do_flush_map() in new code!
 */
#define xdp_do_flush_map xdp_do_flush

void bpf_warn_invalid_xdp_action(u32 act);

#ifdef CONFIG_INET
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
                                  u32 hash);
#else
/* Stub used when CONFIG_INET is not defined: always returns NULL. */
static inline struct sock *
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                     struct bpf_prog *prog, struct sk_buff *skb,
                     u32 hash)
{
        return NULL;
}
#endif

#ifdef CONFIG_BPF_JIT
extern int bpf_jit_enable;
extern int bpf_jit_harden;
extern int bpf_jit_kallsyms;
extern long bpf_jit_limit;
extern long bpf_jit_limit_max;

typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);

struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     unsigned int alignment,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns);
void bpf_jit_binary_free(struct bpf_binary_header *hdr);
u64 bpf_jit_alloc_exec_limit(void);
void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);

int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke);

int bpf_jit_get_func_addr(const struct bpf_prog *prog,
                          const struct bpf_insn *insn, bool extra_pass,
                          u64 *func_addr, bool *func_addr_fixed);

struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);

/*
 * Log a one-line summary of a JIT pass at error level and, when @image
 * is non-NULL, hex-dump @proglen bytes of it.
 */
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
                                u32 pass, void *image)
{
        pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen,
               proglen, pass, image, current->comm, task_pid_nr(current));

        if (!image)
                return;

        print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
                       16, 1, image, proglen, false);
}

/* Compile-time answer: true exactly when CONFIG_HAVE_EBPF_JIT is defined. */
static inline bool bpf_jit_is_ebpf(void)
{
# ifndef CONFIG_HAVE_EBPF_JIT
        return false;
# else
        return true;
# endif
}

static inline bool ebpf_jit_enabled(void)
{
        return bpf_jit_enable && bpf_jit_is_ebpf();
}

static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        return fp->jited && bpf_jit_is_ebpf();
}

/*
 * Decide whether constant blinding applies to @prog.
 *
 * Returns false unless bpf_jit_is_ebpf() holds, prog->jit_requested is
 * set and bpf_jit_harden is non-zero. With bpf_jit_harden == 1 a caller
 * that passes capable(CAP_SYS_ADMIN) also gets false.
 */
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        /* Prerequisites for blinding; bail out if anyone ever calls
         * this outside of them.
         */
        if (!bpf_jit_is_ebpf() || !prog->jit_requested || !bpf_jit_harden)
                return false;

        return !(bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN));
}

/*
 * True only when bpf_jit_harden is zero and bpf_jit_kallsyms equals 1.
 */
static inline bool bpf_jit_kallsyms_enabled(void)
{
        /* Some corner cases, for example hardening, must keep
         * kallsyms disabled.
         */
        if (bpf_jit_harden)
                return false;

        return bpf_jit_kallsyms == 1;
}

const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
                                 unsigned long *off, char *sym);
bool is_bpf_text_address(unsigned long addr);
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                    char *sym);

/*
 * Wrapper around __bpf_address_lookup(). On a successful lookup with a
 * non-NULL @modname, *modname is set to NULL. Returns the lookup result.
 */
static inline const char *
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        const char *name = __bpf_address_lookup(addr, size, off, sym);

        if (!name)
                return NULL;

        if (modname)
                *modname = NULL;

        return name;
}

void bpf_prog_kallsyms_add(struct bpf_prog *fp);
void bpf_prog_kallsyms_del(struct bpf_prog *fp);

#else /* CONFIG_BPF_JIT */

/*
 * Fallbacks used when CONFIG_BPF_JIT is not defined. Each one returns a
 * fixed value (or does nothing) so callers need no #ifdef of their own.
 */

/* Without a JIT this is always false. */
static inline bool ebpf_jit_enabled(void)
{
        return false;
}

/* Without a JIT this is always false; @prog is not inspected. */
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        return false;
}

/* Without a JIT this is always false; @fp is not inspected. */
static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        return false;
}

/* Always fails with -ENOTSUPP. */
static inline int
bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                            struct bpf_jit_poke_descriptor *poke)
{
        return -ENOTSUPP;
}

/* Forwards to bpf_prog_unlock_free(). */
static inline void bpf_jit_free(struct bpf_prog *fp)
{
        bpf_prog_unlock_free(fp);
}

/* Without a JIT this is always false. */
static inline bool bpf_jit_kallsyms_enabled(void)
{
        return false;
}

/* Always returns NULL; the output arguments are left untouched. */
static inline const char *
__bpf_address_lookup(unsigned long addr, unsigned long *size,
                     unsigned long *off, char *sym)
{
        return NULL;
}

/* Always false. */
static inline bool is_bpf_text_address(unsigned long addr)
{
        return false;
}

/* Always fails with -ERANGE; the output arguments are left untouched. */
static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
                                  char *type, char *sym)
{
        return -ERANGE;
}

/* Always returns NULL; unlike the JIT variant, *modname is not written. */
static inline const char *
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        return NULL;
}

/* No-op. */
static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
}

/* No-op. */
static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
}

#endif /* CONFIG_BPF_JIT */

void bpf_prog_kallsyms_del_all(struct bpf_prog *fp);

#define BPF_ANC                BIT(15)

/*
 * Inspect the first classic BPF instruction, @first.
 *
 * Returns false for "BPF_RET | BPF_K" and "BPF_LD | BPF_W | BPF_LEN".
 * For the absolute loads (W, H and B) it returns true only when k is
 * SKF_AD_OFF + SKF_AD_ALU_XOR_X. Every other opcode yields true.
 */
static inline bool bpf_needs_clear_a(const struct sock_filter *first)
{
        bool need_clear = true;

        switch (first->code) {
        case BPF_RET | BPF_K:
        case BPF_LD | BPF_W | BPF_LEN:
                need_clear = false;
                break;

        case BPF_LD | BPF_W | BPF_ABS:
        case BPF_LD | BPF_H | BPF_ABS:
        case BPF_LD | BPF_B | BPF_ABS:
                need_clear = (first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X);
                break;

        default:
                break;
        }

        return need_clear;
}

/*
 * Map a classic BPF instruction to an opcode value.
 *
 * For absolute loads (W, H or B) whose k equals SKF_AD_OFF plus one of
 * the SKF_AD_* values listed below, returns BPF_ANC | SKF_AD_<name>.
 * In every other case returns ftest->code unchanged.
 *
 * Triggers BUG_ON() when ftest->code already has the BPF_ANC bit set.
 */
static inline u16 bpf_anc_helper(const struct sock_filter *ftest)
{
        BUG_ON(ftest->code & BPF_ANC);

        switch (ftest->code) {
        case BPF_LD | BPF_W | BPF_ABS:
        case BPF_LD | BPF_H | BPF_ABS:
        case BPF_LD | BPF_B | BPF_ABS:
/* Expands to one case label plus its return statement. Note that the
 * macro is not #undef'd afterwards, so it stays defined for the rest
 * of the translation unit.
 */
#define BPF_ANCILLARY(CODE)        case SKF_AD_OFF + SKF_AD_##CODE:        \
                                return BPF_ANC | SKF_AD_##CODE
                switch (ftest->k) {
                BPF_ANCILLARY(PROTOCOL);
                BPF_ANCILLARY(PKTTYPE);
                BPF_ANCILLARY(IFINDEX);
                BPF_ANCILLARY(NLATTR);
                BPF_ANCILLARY(NLATTR_NEST);
                BPF_ANCILLARY(MARK);
                BPF_ANCILLARY(QUEUE);
                BPF_ANCILLARY(HATYPE);
                BPF_ANCILLARY(RXHASH);
                BPF_ANCILLARY(CPU);
                BPF_ANCILLARY(ALU_XOR_X);
                BPF_ANCILLARY(VLAN_TAG);
                BPF_ANCILLARY(VLAN_TAG_PRESENT);
                BPF_ANCILLARY(PAY_OFFSET);
                BPF_ANCILLARY(RANDOM);
                BPF_ANCILLARY(VLAN_TPID);
                }
                /* k matched no ancillary offset: treat as a plain load. */
                fallthrough;
        default:
                return ftest->code;
        }
}

void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
                                           int k, unsigned int size);

/*
 * Resolve offset @k of @skb to a pointer.
 *
 * A negative @k is handed to bpf_internal_load_pointer_neg_helper()
 * (@buffer is not used on that path); any other @k goes through
 * skb_header_pointer() with @buffer. Returns whatever the chosen helper
 * returns.
 */
static inline void *bpf_load_pointer(const struct sk_buff *skb, int k,
                                     unsigned int size, void *buffer)
{
        if (k < 0)
                return bpf_internal_load_pointer_neg_helper(skb, k, size);

        return skb_header_pointer(skb, k, size, buffer);
}

/* Returns SKF_AD_MAX. */
static inline int bpf_tell_extensions(void)
{
        return SKF_AD_MAX;
}

/* NOTE(review): presumably the kernel-side context handed to BPF
 * sock_addr programs - confirm against the users of this struct.
 */
struct bpf_sock_addr_kern {
        struct sock *sk;
        struct sockaddr *uaddr;
        /* Temporary "register" to make indirect stores to nested structures
         * defined above. We need three registers to make such a store, but
         * only two (src and dst) are available at convert_ctx_access time
         */
        u64 tmp_reg;
        void *t_ctx;        /* Attach type specific context. */
};

/* NOTE(review): presumably the kernel-side context handed to BPF
 * sock_ops programs - confirm against the users of this struct.
 *
 * Field order matters: per the comment on @temp, only the members placed
 * before @temp are zero-initialized before the BPF program is called.
 */
struct bpf_sock_ops_kern {
        struct        sock *sk;
        /* args, reply and replylong share the same storage. */
        union {
                u32 args[4];
                u32 reply;
                u32 replylong[4];
        };
        struct sk_buff        *syn_skb;
        struct sk_buff        *skb;
        void        *skb_data_end;
        u8        op;
        u8        is_fullsock;
        u8        remaining_opt_len;
        u64        temp;                        /* temp and everything after is not
                                         * initialized to 0 before calling
                                         * the BPF program. New fields that
                                         * should be initialized to 0 should
                                         * be inserted before temp.
                                         * temp is scratch storage used by
                                         * sock_ops_convert_ctx_access
                                         * as temporary storage of a register.
                                         */
};

/* NOTE(review): presumably the kernel-side context handed to BPF sysctl
 * programs; the cur_ and new_ pairs look like (buffer, length) tuples for
 * the current and the to-be-written value - confirm against the users.
 */
struct bpf_sysctl_kern {
        struct ctl_table_header *head;
        struct ctl_table *table;
        void *cur_val;
        size_t cur_len;
        void *new_val;
        size_t new_len;
        int new_updated;
        int write;
        loff_t *ppos;
        /* Temporary "register" for indirect stores to ppos. */
        u64 tmp_reg;
};

/* NOTE(review): presumably the kernel-side context handed to BPF sockopt
 * programs; optval/optval_end look like the bounds of the option buffer -
 * confirm against the users of this struct.
 */
struct bpf_sockopt_kern {
        struct sock        *sk;
        u8                *optval;
        u8                *optval_end;
        s32                level;
        s32                optname;
        s32                optlen;
        s32                retval;
};

int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);

/* Context passed to the programs run by BPF_PROG_SK_LOOKUP_RUN_ARRAY().
 *
 * bpf_sk_lookup_run_v4() and bpf_sk_lookup_run_v6() fill in family,
 * protocol, the ports and the v4 or v6 address pair; selected_sk and
 * no_reuseport are written back by the run-array macro and read by those
 * callers as the result.
 */
struct bpf_sk_lookup_kern {
        u16                family;
        u16                protocol;
        __be16                sport;
        u16                dport;
        struct {
                __be32 saddr;
                __be32 daddr;
        } v4;
        /* The IPv6 addresses are referenced by pointer, not copied. */
        struct {
                const struct in6_addr *saddr;
                const struct in6_addr *daddr;
        } v6;
        struct sock        *selected_sk;
        bool                no_reuseport;
};

extern struct static_key_false bpf_sk_lookup_enabled;

/* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup.
 *
 * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and
 * SK_DROP. Their meaning is as follows:
 *
 *  SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result
 *  SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup
 *  SK_DROP                           : terminate lookup with -ECONNREFUSED
 *
 * This macro aggregates return values and selected sockets from
 * multiple BPF programs according to following rules in order:
 *
 *  1. If any program returned SK_PASS and a non-NULL ctx.selected_sk,
 *     macro result is SK_PASS and last ctx.selected_sk is used.
 *  2. If any program returned SK_DROP return value,
 *     macro result is SK_DROP.
 *  3. Otherwise result is SK_PASS and ctx.selected_sk is NULL.
 *
 * Caller must ensure that the prog array is non-NULL, and that the
 * array as well as the programs it contains remain valid.
 *
 * Mechanics: @ctx must be an lvalue (its address is taken). Programs are
 * read from (array)->items[] in order until the first NULL prog entry,
 * and each one is invoked as func(prog, &ctx). Before every invocation
 * ctx.selected_sk and ctx.no_reuseport are reset to the values recorded
 * from the most recent program that passed with a socket selected, and
 * the same values are stored back into @ctx once the loop finishes. The
 * whole loop runs between migrate_disable() and migrate_enable().
 */
#define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func)                        \
        ({                                                                \
                struct bpf_sk_lookup_kern *_ctx = &(ctx);                \
                struct bpf_prog_array_item *_item;                        \
                struct sock *_selected_sk = NULL;                        \
                bool _no_reuseport = false;                                \
                struct bpf_prog *_prog;                                        \
                bool _all_pass = true;                                        \
                u32 _ret;                                                \
                                                                        \
                migrate_disable();                                        \
                _item = &(array)->items[0];                                \
                while ((_prog = READ_ONCE(_item->prog))) {                \
                        /* restore most recent selection */                \
                        _ctx->selected_sk = _selected_sk;                \
                        _ctx->no_reuseport = _no_reuseport;                \
                                                                        \
                        _ret = func(_prog, _ctx);                        \
                        if (_ret == SK_PASS && _ctx->selected_sk) {        \
                                /* remember last non-NULL socket */        \
                                _selected_sk = _ctx->selected_sk;        \
                                _no_reuseport = _ctx->no_reuseport;        \
                        } else if (_ret == SK_DROP && _all_pass) {        \
                                _all_pass = false;                        \
                        }                                                \
                        _item++;                                        \
                }                                                        \
                _ctx->selected_sk = _selected_sk;                        \
                _ctx->no_reuseport = _no_reuseport;                        \
                migrate_enable();                                        \
                _all_pass || _selected_sk ? SK_PASS : SK_DROP;                \
         })

/*
 * Run the SK_LOOKUP programs attached to @net for an IPv4 lookup.
 *
 * The run array is fetched and executed under rcu_read_lock(). *@psk is
 * always written: NULL when no run array is attached, the socket chosen
 * by the programs (possibly NULL) when the aggregate verdict is SK_PASS,
 * and ERR_PTR(-ECONNREFUSED) for any other verdict.
 *
 * Returns the no_reuseport flag reported by the programs on SK_PASS,
 * false in all other cases.
 */
static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
                                        const __be32 saddr, const __be16 sport,
                                        const __be32 daddr, const u16 dport,
                                        struct sock **psk)
{
        struct bpf_prog_array *progs;
        struct sock *result = NULL;
        bool skip_reuseport = false;

        rcu_read_lock();
        progs = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (progs) {
                struct bpf_sk_lookup_kern ctx = {
                        .family                = AF_INET,
                        .protocol        = protocol,
                        .sport                = sport,
                        .dport                = dport,
                        .v4.saddr        = saddr,
                        .v4.daddr        = daddr,
                };
                u32 verdict;

                verdict = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN);
                if (verdict != SK_PASS) {
                        result = ERR_PTR(-ECONNREFUSED);
                } else {
                        result = ctx.selected_sk;
                        skip_reuseport = ctx.no_reuseport;
                }
        }
        rcu_read_unlock();

        *psk = result;
        return skip_reuseport;
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Run the SK_LOOKUP programs attached to @net for an IPv6 lookup.
 *
 * The run array is fetched and executed under rcu_read_lock(). *@psk is
 * always written: NULL when no run array is attached, the socket chosen
 * by the programs (possibly NULL) when the aggregate verdict is SK_PASS,
 * and ERR_PTR(-ECONNREFUSED) for any other verdict. @saddr and @daddr
 * are stored in the context by pointer.
 *
 * Returns the no_reuseport flag reported by the programs on SK_PASS,
 * false in all other cases.
 */
static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
                                        const struct in6_addr *saddr,
                                        const __be16 sport,
                                        const struct in6_addr *daddr,
                                        const u16 dport,
                                        struct sock **psk)
{
        struct bpf_prog_array *progs;
        struct sock *result = NULL;
        bool skip_reuseport = false;

        rcu_read_lock();
        progs = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (progs) {
                struct bpf_sk_lookup_kern ctx = {
                        .family                = AF_INET6,
                        .protocol        = protocol,
                        .sport                = sport,
                        .dport                = dport,
                        .v6.saddr        = saddr,
                        .v6.daddr        = daddr,
                };
                u32 verdict;

                verdict = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN);
                if (verdict != SK_PASS) {
                        result = ERR_PTR(-ECONNREFUSED);
                } else {
                        result = ctx.selected_sk;
                        skip_reuseport = ctx.no_reuseport;
                }
        }
        rcu_read_unlock();

        *psk = result;
        return skip_reuseport;
}
#endif /* IS_ENABLED(CONFIG_IPV6) */

#endif /* __LINUX_FILTER_H__ */



































































































































































































































































































































    1 



































































    1 


































































































































    1 


























    1 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_DCACHE_H
#define __LINUX_DCACHE_H

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rculist_bl.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/cache.h>
#include <linux/rcupdate.h>
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>

struct path;
struct vfsmount;

/*
 * linux/include/linux/dcache.h
 *
 * Dirent cache data structures
 *
 * (C) Copyright 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/* True when @x is its own d_parent. @x is evaluated twice. */
#define IS_ROOT(x) ((x) == (x)->d_parent)

/* The hash is always the low bits of hash_len */
/*
 * HASH_LEN_DECLARE declares the two u32 members "hash" and "len" in the
 * order that matches the comment above for the current endianness.
 *
 * bytemask_from_count(cnt) builds an unsigned long mask from ~0ul
 * shifted by cnt*8 bits: left on little endian, right on big endian.
 * NOTE(review): a cnt equal to sizeof(unsigned long) would shift by the
 * full type width, which is undefined in C - callers presumably pass
 * smaller counts; confirm.
 */
#ifdef __LITTLE_ENDIAN
 #define HASH_LEN_DECLARE u32 hash; u32 len
 #define bytemask_from_count(cnt)        (~(~0ul << (cnt)*8))
#else
 #define HASH_LEN_DECLARE u32 len; u32 hash
 #define bytemask_from_count(cnt)        (~(~0ul >> (cnt)*8))
#endif

/*
 * "quick string" -- eases parameter passing, but more importantly
 * saves "metadata" about the string (ie length and the hash).
 *
 * hash comes first so it snuggles against d_parent in the
 * dentry.
 */
struct qstr {
        /* hash and len individually, or both at once as the u64 hash_len */
        union {
                struct {
                        HASH_LEN_DECLARE;
                };
                u64 hash_len;
        };
        const unsigned char *name;        /* the string bytes */
};

/*
 * Initializer for a struct qstr: sets .len to @l and .name to @n. The
 * hash member is not named and so takes the default zero value.
 */
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }

extern const struct qstr empty_name;
extern const struct qstr slash_name;

/* Record of dentry cache statistics; every member is a long. */
struct dentry_stat_t {
        long nr_dentry;
        long nr_unused;
        long age_limit;                /* age in seconds */
        long want_pages;        /* pages requested by system */
        long nr_negative;        /* # of unused negative dentries */
        long dummy;                /* Reserved for future use */
};
extern struct dentry_stat_t dentry_stat;

/*
 * Try to keep struct dentry aligned on 64 byte cachelines (this will
 * give reasonable cacheline footprint with larger lines without the
 * large memory footprint increase).
 *
 * DNAME_INLINE_LEN is the size of the d_iname[] array in struct dentry.
 * NOTE(review): the byte counts next to each value presumably give the
 * resulting size of struct dentry - confirm.
 */
#ifdef CONFIG_64BIT
# define DNAME_INLINE_LEN 32 /* 192 bytes */
#else
# ifdef CONFIG_SMP
#  define DNAME_INLINE_LEN 36 /* 128 bytes */
# else
#  define DNAME_INLINE_LEN 40 /* 128 bytes */
# endif
#endif

#define d_lock        d_lockref.lock

/*
 * One directory entry cache object. The members are grouped by which
 * lookup path touches them; see the section comments below. The struct
 * is marked __randomize_layout, so code must not depend on member order.
 */
struct dentry {
        /* RCU lookup touched fields */
        unsigned int d_flags;                /* protected by d_lock */
        seqcount_spinlock_t d_seq;        /* per dentry seqlock */
        struct hlist_bl_node d_hash;        /* lookup hash list */
        struct dentry *d_parent;        /* parent directory */
        struct qstr d_name;                /* name with its length and hash */
        struct inode *d_inode;                /* Where the name belongs to - NULL is
                                         * negative */
        unsigned char d_iname[DNAME_INLINE_LEN];        /* small names */

        /* Ref lookup also touches following */
        struct lockref d_lockref;        /* per-dentry lock and refcount */
        const struct dentry_operations *d_op;        /* callback table */
        struct super_block *d_sb;        /* The root of the dentry tree */
        unsigned long d_time;                /* used by d_revalidate */
        void *d_fsdata;                        /* fs-specific data */

        /* d_lru and d_wait share storage */
        union {
                struct list_head d_lru;                /* LRU list */
                wait_queue_head_t *d_wait;        /* in-lookup ones only */
        };
        struct list_head d_child;        /* child of parent list */
        struct list_head d_subdirs;        /* our children */
        /*
         * d_alias and d_rcu can share memory
         */
        union {
                struct hlist_node d_alias;        /* inode alias list */
                struct hlist_bl_node d_in_lookup_hash;        /* only for in-lookup ones */
                 struct rcu_head d_rcu;
        } d_u;
} __randomize_layout;

/*
 * Nesting subclasses for the dentry->d_lock spinlock:
 *
 * DENTRY_D_LOCK_NORMAL (0): normal
 * DENTRY_D_LOCK_NESTED (1): nested
 */
enum dentry_d_lock_class {
        /* implicitly used by plain spin_lock() APIs. */
        DENTRY_D_LOCK_NORMAL,
        DENTRY_D_LOCK_NESTED
};

/*
 * Table of callbacks that a dentry refers to through its d_op pointer.
 * The semantics and locking rules of the individual callbacks are not
 * described here; see Documentation/filesystems/vfs.rst and
 * Documentation/filesystems/locking.rst.
 *
 * The struct is ____cacheline_aligned.
 */
struct dentry_operations {
        int (*d_revalidate)(struct dentry *, unsigned int);
        int (*d_weak_revalidate)(struct dentry *, unsigned int);
        int (*d_hash)(const struct dentry *, struct qstr *);
        int (*d_compare)(const struct dentry *,
                        unsigned int, const char *, const struct qstr *);
        int (*d_delete)(const struct dentry *);
        int (*d_init)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_prune)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *, char *, int);
        struct vfsmount *(*d_automount)(struct path *);
        int (*d_manage)(const struct path *, bool);
        struct dentry *(*d_real)(struct dentry *, const struct inode *);
} ____cacheline_aligned;

/*
 * Locking rules for dentry_operations callbacks are to be found in
 * Documentation/filesystems/locking.rst. Keep it updated!
 *
 * Further descriptions are found in Documentation/filesystems/vfs.rst.
 * Keep it updated too!
 */

/* d_flags entries */
#define DCACHE_OP_HASH                        0x00000001
#define DCACHE_OP_COMPARE                0x00000002
#define DCACHE_OP_REVALIDATE                0x00000004
#define DCACHE_OP_DELETE                0x00000008
#define DCACHE_OP_PRUNE                        0x00000010

#define        DCACHE_DISCONNECTED                0x00000020
     /* This dentry is possibly not currently connected to the dcache tree, in
      * which case its parent will either be itself, or will have this flag as
      * well.  nfsd will not use a dentry with this bit set, but will first
      * endeavour to clear the bit either by discovering that it is connected,
      * or by performing lookup operations.   Any filesystem which supports
      * nfsd_operations MUST have a lookup function which, if it finds a
      * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that
      * dentry into place and return that dentry rather than the passed one,
      * typically using d_splice_alias. */

#define DCACHE_REFERENCED                0x00000040 /* Recently used, don't discard. */

#define DCACHE_DONTCACHE                0x00000080 /* Purge from memory on final dput() */

#define DCACHE_CANT_MOUNT                0x00000100
#define DCACHE_GENOCIDE                        0x00000200
#define DCACHE_SHRINK_LIST                0x00000400

#define DCACHE_OP_WEAK_REVALIDATE        0x00000800

#define DCACHE_NFSFS_RENAMED                0x00001000
     /* this dentry has been "silly renamed" and has to be deleted on the last
      * dput() */
#define DCACHE_COOKIE                        0x00002000 /* For use by dcookie subsystem */
#define DCACHE_FSNOTIFY_PARENT_WATCHED        0x00004000
     /* Parent inode is watched by some fsnotify listener */

#define DCACHE_DENTRY_KILLED                0x00008000

#define DCACHE_MOUNTED                        0x00010000 /* is a mountpoint */
#define DCACHE_NEED_AUTOMOUNT                0x00020000 /* handle automount on this dir */
#define DCACHE_MANAGE_TRANSIT                0x00040000 /* manage transit from this dirent */
#define DCACHE_MANAGED_DENTRY \
        (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)

#define DCACHE_LRU_LIST                        0x00080000

#define DCACHE_ENTRY_TYPE                0x00700000
#define DCACHE_MISS_TYPE                0x00000000 /* Negative dentry (maybe fallthru to nowhere) */
#define DCACHE_WHITEOUT_TYPE                0x00100000 /* Whiteout dentry (stop pathwalk) */
#define DCACHE_DIRECTORY_TYPE                0x00200000 /* Normal directory */
#define DCACHE_AUTODIR_TYPE                0x00300000 /* Lookupless directory (presumed automount) */
#define DCACHE_REGULAR_TYPE                0x00400000 /* Regular file type (or fallthru to such) */
#define DCACHE_SPECIAL_TYPE                0x00500000 /* Other file type (or fallthru to such) */
#define DCACHE_SYMLINK_TYPE                0x00600000 /* Symlink (or fallthru to such) */

#define DCACHE_MAY_FREE                        0x00800000
#define DCACHE_FALLTHRU                        0x01000000 /* Fall through to lower layer */
#define DCACHE_NOKEY_NAME                0x02000000 /* Encrypted name encoded without key */
#define DCACHE_OP_REAL                        0x04000000

#define DCACHE_PAR_LOOKUP                0x10000000 /* being looked up (with parent locked shared) */
#define DCACHE_DENTRY_CURSOR                0x20000000
#define DCACHE_NORCU                        0x40000000 /* No RCU delay for freeing */

extern seqlock_t rename_lock;

/*
 * These are the low-level FS interfaces to the dcache..
 */
extern void d_instantiate(struct dentry *, struct inode *);
extern void d_instantiate_new(struct dentry *, struct inode *);
extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
extern void d_delete(struct dentry *);
extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);

/* allocate/de-allocate */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
                                        wait_queue_head_t *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void shrink_dcache_for_umount(struct super_block *);
extern void d_invalidate(struct dentry *);

/* only used at mount-time */
extern struct dentry * d_make_root(struct inode *);

/* <clickety>-<click> the ramfs-type tree */
extern void d_genocide(struct dentry *);

extern void d_tmpfile(struct dentry *, struct inode *);

extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);

/* test whether we have any submounts in a subdir tree */
extern int path_has_submounts(const struct path *);

/*
 * This adds the entry to the hash queues.
 */
extern void d_rehash(struct dentry *);
 
extern void d_add(struct dentry *, struct inode *);

/* used for rename() and baskets */
extern void d_move(struct dentry *, struct dentry *);
extern void d_exchange(struct dentry *, struct dentry *);
extern struct dentry *d_ancestor(struct dentry *, struct dentry *);

/* appendix may either be NULL or be used for transname suffixes */
extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                const struct qstr *name, unsigned *seq);

/*
 * d_count - read the dentry's current reference count.
 *
 * This is a plain read of ->d_lockref.count; no lock is taken here.
 */
static inline unsigned d_count(const struct dentry *dentry)
{
        const unsigned refs = dentry->d_lockref.count;

        return refs;
}

/*
 * helper function for dentry_operations.d_dname() members
 */
extern __printf(4, 5)
char *dynamic_dname(struct dentry *, char *, int, const char *, ...);

extern char *__d_path(const struct path *, const struct path *, char *, int);
extern char *d_absolute_path(const struct path *, char *, int);
extern char *d_path(const struct path *, char *, int);
extern char *dentry_path_raw(struct dentry *, char *, int);
extern char *dentry_path(struct dentry *, char *, int);

/* Allocation counts.. */

/**
 *        dget, dget_dlock -        get a reference to a dentry
 *        @dentry: dentry to get a reference to
 *
 *        Given a dentry or %NULL pointer increment the reference count
 *        if appropriate and return the dentry. A dentry will not be 
 *        destroyed when it has references.
 */
static inline struct dentry *dget_dlock(struct dentry *dentry)
{
        /* A NULL dentry is handed back untouched. */
        if (!dentry)
                return dentry;

        /*
         * Plain increment of the count field.
         * NOTE(review): presumably the caller already holds ->d_lock, as
         * the _dlock suffix suggests - confirm against callers.
         */
        dentry->d_lockref.count++;
        return dentry;
}

static inline struct dentry *dget(struct dentry *dentry)
{
        /* A NULL dentry is handed back untouched. */
        if (!dentry)
                return dentry;

        /* Take the reference through the lockref helper. */
        lockref_get(&dentry->d_lockref);
        return dentry;
}

extern struct dentry *dget_parent(struct dentry *dentry);

/**
 *        d_unhashed -        is dentry unhashed
 *        @dentry: entry to check
 *
 *        Returns nonzero if the dentry passed is not currently hashed,
 *        as reported by hlist_bl_unhashed() on its ->d_hash node.
 */
static inline int d_unhashed(const struct dentry *dentry)
{
        const int unhashed = hlist_bl_unhashed(&dentry->d_hash);

        return unhashed;
}

/*
 * d_unlinked - nonzero when the dentry is unhashed and is not a root.
 */
static inline int d_unlinked(const struct dentry *dentry)
{
        /* A hashed dentry is never reported as unlinked. */
        if (!d_unhashed(dentry))
                return 0;

        return !IS_ROOT(dentry);
}

/*
 * cant_mount - test DCACHE_CANT_MOUNT in ->d_flags.
 *
 * Returns the masked flag value itself (zero when clear), not a
 * normalised 0/1.
 */
static inline int cant_mount(const struct dentry *dentry)
{
        const int masked = dentry->d_flags & DCACHE_CANT_MOUNT;

        return masked;
}

/*
 * dont_mount - set DCACHE_CANT_MOUNT on the dentry.
 *
 * The flag word is updated while holding the dentry's ->d_lock.
 */
static inline void dont_mount(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        dentry->d_flags = dentry->d_flags | DCACHE_CANT_MOUNT;
        spin_unlock(&dentry->d_lock);
}

extern void __d_lookup_done(struct dentry *);

/*
 * d_in_lookup - test DCACHE_PAR_LOOKUP in ->d_flags.
 *
 * Returns the masked flag value itself (zero when clear), not a
 * normalised 0/1.
 */
static inline int d_in_lookup(const struct dentry *dentry)
{
        const int masked = dentry->d_flags & DCACHE_PAR_LOOKUP;

        return masked;
}

/*
 * d_lookup_done - finish a lookup on the dentry if one is flagged.
 *
 * When d_in_lookup() reports the dentry as being looked up,
 * __d_lookup_done() is called with ->d_lock held; otherwise nothing
 * is done.
 */
static inline void d_lookup_done(struct dentry *dentry)
{
        /* Expected case: not in lookup, nothing to finish. */
        if (!unlikely(d_in_lookup(dentry)))
                return;

        spin_lock(&dentry->d_lock);
        __d_lookup_done(dentry);
        spin_unlock(&dentry->d_lock);
}

extern void dput(struct dentry *);

/*
 * d_managed - true when any of the DCACHE_MANAGED_DENTRY bits is set.
 */
static inline bool d_managed(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_MANAGED_DENTRY) != 0;
}

/*
 * d_mountpoint - true when DCACHE_MOUNTED is set in ->d_flags.
 */
static inline bool d_mountpoint(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_MOUNTED) != 0;
}

/*
 * Directory cache entry type accessor functions.
 */
static inline unsigned __d_entry_type(const struct dentry *dentry)
{
        /* Keep only the entry-type field of the flag word. */
        const unsigned type = dentry->d_flags & DCACHE_ENTRY_TYPE;

        return type;
}

/* d_is_miss - true when the entry type is DCACHE_MISS_TYPE. */
static inline bool d_is_miss(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_MISS_TYPE;
}

/* d_is_whiteout - true when the entry type is DCACHE_WHITEOUT_TYPE. */
static inline bool d_is_whiteout(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_WHITEOUT_TYPE;
}

/* d_can_lookup - true when the entry type is DCACHE_DIRECTORY_TYPE. */
static inline bool d_can_lookup(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_DIRECTORY_TYPE;
}

/* d_is_autodir - true when the entry type is DCACHE_AUTODIR_TYPE. */
static inline bool d_is_autodir(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_AUTODIR_TYPE;
}

/*
 * d_is_dir - true for either directory flavour: a normal directory
 * (d_can_lookup()) or a lookupless one (d_is_autodir()).
 */
static inline bool d_is_dir(const struct dentry *dentry)
{
        if (d_can_lookup(dentry))
                return true;

        return d_is_autodir(dentry);
}

/* d_is_symlink - true when the entry type is DCACHE_SYMLINK_TYPE. */
static inline bool d_is_symlink(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SYMLINK_TYPE;
}

/* d_is_reg - true when the entry type is DCACHE_REGULAR_TYPE. */
static inline bool d_is_reg(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_REGULAR_TYPE;
}

/* d_is_special - true when the entry type is DCACHE_SPECIAL_TYPE. */
static inline bool d_is_special(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SPECIAL_TYPE;
}

/*
 * d_is_file - true for a regular file (d_is_reg()) or any other
 * special file type (d_is_special()).
 */
static inline bool d_is_file(const struct dentry *dentry)
{
        if (d_is_reg(dentry))
                return true;

        return d_is_special(dentry);
}

/*
 * d_is_negative - true when the dentry's entry type is a miss.
 */
static inline bool d_is_negative(const struct dentry *dentry)
{
        // TODO: check d_is_whiteout(dentry) also.
        const bool miss = d_is_miss(dentry);

        return miss;
}

/*
 * d_flags_negative - same test as d_is_miss(), but applied to a flag
 * word the caller already has in hand rather than to a dentry.
 */
static inline bool d_flags_negative(unsigned flags)
{
        const unsigned type = flags & DCACHE_ENTRY_TYPE;

        return type == DCACHE_MISS_TYPE;
}

/* d_is_positive - logical inverse of d_is_negative(). */
static inline bool d_is_positive(const struct dentry *dentry)
{
        if (d_is_negative(dentry))
                return false;

        return true;
}

/**
 * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if the dentry represents either an absent name or a name that
 * doesn't map to an inode (ie. ->d_inode is NULL).  The dentry could represent
 * a true miss, a whiteout that isn't represented by a 0,0 chardev or a
 * fallthrough marker in an opaque directory.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.  (3) The dentry may have something attached to ->d_lower and the
 * type field of the flags may be set to something other than miss or whiteout.
 */
static inline bool d_really_is_negative(const struct dentry *dentry)
{
        /* Only ->d_inode is consulted; the type bits in ->d_flags are not. */
        return !dentry->d_inode;
}

/**
 * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if the dentry represents a name that maps to an inode
 * (ie. ->d_inode is not NULL).  The dentry might still represent a whiteout if
 * that is represented on medium as a 0,0 chardev.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.
 */
static inline bool d_really_is_positive(const struct dentry *dentry)
{
        /* Only ->d_inode is consulted; the type bits in ->d_flags are not. */
        const struct inode *inode = dentry->d_inode;

        return inode != NULL;
}

/*
 * simple_positive - nonzero when the dentry has an inode attached
 * (d_really_is_positive()) and is currently hashed.
 */
static inline int simple_positive(const struct dentry *dentry)
{
        if (!d_really_is_positive(dentry))
                return 0;

        return !d_unhashed(dentry);
}

extern void d_set_fallthru(struct dentry *dentry);

/* d_is_fallthru - true when DCACHE_FALLTHRU is set in ->d_flags. */
static inline bool d_is_fallthru(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_FALLTHRU) != 0;
}


extern int sysctl_vfs_cache_pressure;

/*
 * vfs_pressure_ratio - scale @val by sysctl_vfs_cache_pressure / 100.
 *
 * The scaling is delegated to mult_frac().
 */
static inline unsigned long vfs_pressure_ratio(unsigned long val)
{
        const unsigned long scaled =
                mult_frac(val, sysctl_vfs_cache_pressure, 100);

        return scaled;
}

/**
 * d_inode - Get the actual inode of this dentry
 * @dentry: The dentry to query
 *
 * This is the helper normal filesystems should use to get at their own inodes
 * in their own dentries and ignore the layering superimposed upon them.
 */
static inline struct inode *d_inode(const struct dentry *dentry)
{
        /* Plain load of the pointer; see d_inode_rcu() for READ_ONCE(). */
        struct inode *inode = dentry->d_inode;

        return inode;
}

/**
 * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE()
 * @dentry: The dentry to query
 *
 * This is the helper normal filesystems should use to get at their own inodes
 * in their own dentries and ignore the layering superimposed upon them.
 */
static inline struct inode *d_inode_rcu(const struct dentry *dentry)
{
        /* Single READ_ONCE() load of the inode pointer. */
        struct inode *inode = READ_ONCE(dentry->d_inode);

        return inode;
}

/**
 * d_backing_inode - Get upper or lower inode we should be using
 * @upper: The upper layer
 *
 * This is the helper that should be used to get at the inode that will be used
 * if this dentry were to be opened as a file.  The inode may be on the upper
 * dentry or it may be on a lower dentry pinned by the upper.
 *
 * Normal filesystems should not use this to access their own inodes.
 */
static inline struct inode *d_backing_inode(const struct dentry *upper)
{
        /* As written here, this is simply the upper dentry's own inode. */
        return upper->d_inode;
}

/**
 * d_backing_dentry - Get upper or lower dentry we should be using
 * @upper: The upper layer
 *
 * This is the helper that should be used to get the dentry of the inode that
 * will be used if this dentry were opened as a file.  It may be the upper
 * dentry or it may be a lower dentry pinned by the upper.
 *
 * Normal filesystems should not use this to access their own dentries.
 */
static inline struct dentry *d_backing_dentry(struct dentry *upper)
{
        /* As written here, the upper dentry is returned unchanged. */
        struct dentry *backing = upper;

        return backing;
}

/**
 * d_real - Return the real dentry
 * @dentry: the dentry to query
 * @inode: inode to select the dentry from multiple layers (can be NULL)
 *
 * If dentry is on a union/overlay, then return the underlying, real dentry.
 * Otherwise return the dentry itself.
 *
 * See also: Documentation/filesystems/vfs.rst
 */
static inline struct dentry *d_real(struct dentry *dentry,
                                    const struct inode *inode)
{
        /* Expected case: DCACHE_OP_REAL is clear, the dentry is its own answer. */
        if (!unlikely(dentry->d_flags & DCACHE_OP_REAL))
                return dentry;

        /* Otherwise defer to the ->d_real() dentry operation. */
        return dentry->d_op->d_real(dentry, inode);
}

/**
 * d_real_inode - Return the real inode
 * @dentry: The dentry to query
 *
 * If dentry is on a union/overlay, then return the underlying, real inode.
 * Otherwise return d_inode().
 */
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
        /* This usage of d_real() results in const dentry */
        struct dentry *real = d_real((struct dentry *) dentry, NULL);

        return d_backing_inode(real);
}

/*
 * Snapshot of a dentry name, for use with take_dentry_name_snapshot()
 * and release_dentry_name_snapshot().
 */
struct name_snapshot {
        /* The captured name. */
        struct qstr name;
        /*
         * Fixed buffer of DNAME_INLINE_LEN bytes.
         * NOTE(review): presumably backing storage for names short enough
         * to be stored inline - confirm against the implementation.
         */
        unsigned char inline_name[DNAME_INLINE_LEN];
};
void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
void release_dentry_name_snapshot(struct name_snapshot *);

#endif        /* __LINUX_DCACHE_H */
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2020 ARM Ltd.
 */
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H

#ifndef __ASSEMBLY__

/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
static __always_inline void rep_nop(void)
{
        /*
         * Emit the "rep; nop" instruction.  The asm is volatile so it is
         * not optimised away, and the "memory" clobber stops the compiler
         * from caching memory values across it.
         */
        asm volatile("rep; nop" ::: "memory");
}

/* cpu_relax - relax hint for busy-wait loops; simply issues rep_nop(). */
static __always_inline void cpu_relax(void)
{
        rep_nop();
}

#endif /* __ASSEMBLY__ */

#endif /* __ASM_VDSO_PROCESSOR_H */








































































































































































































































































































































    1 
    1 
    1 

    1 
























    1 








    1 
    1 



    1 





































    1 



































































    1 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>

static struct kmem_cache *user_ns_cachep __read_mostly;
static DEFINE_MUTEX(userns_state_mutex);

static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *map);
static void free_user_ns(struct work_struct *work);

/*
 * inc_user_namespaces - charge one user namespace to @uid in @ns.
 *
 * Thin wrapper around inc_ucount() for UCOUNT_USER_NAMESPACES; the
 * result of inc_ucount() is returned as-is (callers in this file treat
 * NULL as failure).
 */
static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
{
        struct ucounts *ucounts = inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);

        return ucounts;
}

/*
 * dec_user_namespaces - undo one inc_user_namespaces() charge.
 * @ucounts: the counter object returned by inc_user_namespaces()
 *
 * Thin wrapper around dec_ucount() for UCOUNT_USER_NAMESPACES.
 */
static void dec_user_namespaces(struct ucounts *ucounts)
{
        /*
         * No "return <expr>;" here: ISO C does not allow a return
         * statement with an expression in a function returning void.
         */
        dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
}

/*
 * set_cred_user_ns - point @cred at @user_ns and reset its capability
 * state.
 */
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
        /*
         * The credentials get the same full capability sets as init, but
         * they are useless for doing anything elsewhere because the
         * capabilities are bound to the new user namespace.
         */
        cred->securebits = SECUREBITS_DEFAULT;
        cred->cap_permitted = CAP_FULL_SET;
        cred->cap_effective = CAP_FULL_SET;
        cred->cap_bset = CAP_FULL_SET;
        cred->cap_inheritable = CAP_EMPTY_SET;
        cred->cap_ambient = CAP_EMPTY_SET;

#ifdef CONFIG_KEYS
        /* Release the reference in ->request_key_auth and clear the pointer. */
        key_put(cred->request_key_auth);
        cred->request_key_auth = NULL;
#endif
        /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
        cred->user_ns = user_ns;
}

/*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
 * new namespace.
 *
 * This is called by copy_creds(), which will finish setting the target task's
 * credentials.
 */
int create_user_ns(struct cred *new)
{
        struct user_namespace *ns, *parent_ns = new->user_ns;
        kuid_t owner = new->euid;
        kgid_t group = new->egid;
        struct ucounts *ucounts;
        int ret, i;

        /*
         * -ENOSPC covers both the nesting-depth limit and a failure to
         * charge the new namespace to the owner's ucounts.
         */
        ret = -ENOSPC;
        if (parent_ns->level > 32)
                goto fail;

        ucounts = inc_user_namespaces(parent_ns, owner);
        if (!ucounts)
                goto fail;

        /*
         * Verify that we can not violate the policy of which files
         * may be accessed that is specified by the root directory,
         * by verifying that the root directory is at the root of the
         * mount namespace which allows all files to be accessed.
         */
        ret = -EPERM;
        if (current_chrooted())
                goto fail_dec;

        /* The creator needs a mapping in the parent user namespace
         * or else we won't be able to reasonably tell userspace who
         * created a user_namespace.
         */
        ret = -EPERM;
        if (!kuid_has_mapping(parent_ns, owner) ||
            !kgid_has_mapping(parent_ns, group))
                goto fail_dec;

        /* Zeroed allocation: fields not set below start out as 0/NULL. */
        ret = -ENOMEM;
        ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
        if (!ns)
                goto fail_dec;

        /* Record whether the creator had CAP_SETFCAP in its effective set. */
        ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
        ret = ns_alloc_inum(&ns->ns);
        if (ret)
                goto fail_free;
        ns->ns.ops = &userns_operations;

        atomic_set(&ns->count, 1);
        /* Leave the new->user_ns reference with the new user namespace. */
        ns->parent = parent_ns;
        ns->level = parent_ns->level + 1;
        ns->owner = owner;
        ns->group = group;
        /* Freeing is deferred to a work item; see __put_user_ns(). */
        INIT_WORK(&ns->work, free_user_ns);
        /* Every per-namespace ucount limit starts at INT_MAX. */
        for (i = 0; i < UCOUNT_COUNTS; i++) {
                ns->ucount_max[i] = INT_MAX;
        }
        ns->ucounts = ucounts;

        /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
        mutex_lock(&userns_state_mutex);
        ns->flags = parent_ns->flags;
        mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_KEYS
        INIT_LIST_HEAD(&ns->keyring_name_list);
        init_rwsem(&ns->keyring_sem);
#endif
        ret = -ENOMEM;
        if (!setup_userns_sysctls(ns))
                goto fail_keyring;

        /* Success: switch the credentials over to the new namespace. */
        set_cred_user_ns(new, ns);
        return 0;
        /*
         * Error unwind: each label undoes one more step than the label
         * below it, and ret carries the error chosen before the jump.
         */
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
        key_put(ns->persistent_keyring_register);
#endif
        ns_free_inum(&ns->ns);
fail_free:
        kmem_cache_free(user_ns_cachep, ns);
fail_dec:
        dec_user_namespaces(ucounts);
fail:
        return ret;
}

/*
 * unshare_userns - create a new user namespace for unshare, if asked.
 * @unshare_flags: unshare flags; only CLONE_NEWUSER is examined
 * @new_cred: on success with CLONE_NEWUSER set, receives the prepared
 *            credentials that refer to the new namespace
 *
 * Returns 0 when CLONE_NEWUSER is not set (and leaves @new_cred alone)
 * or when the namespace was created, -ENOMEM when prepare_creds()
 * fails, or the error returned by create_user_ns().
 */
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
        struct cred *cred;
        int err;

        if (!(unshare_flags & CLONE_NEWUSER))
                return 0;

        cred = prepare_creds();
        if (!cred)
                return -ENOMEM;

        err = create_user_ns(cred);
        if (err) {
                /* Creation failed: drop the credentials prepared above. */
                put_cred(cred);
                return err;
        }

        *new_cred = cred;
        return 0;
}

/*
 * free_user_ns - work handler that tears down a user namespace.
 * @work: the work item embedded in the namespace being freed
 *
 * After freeing the namespace, one count is dropped on its parent; if
 * that was the parent's last count, the parent is torn down in the same
 * loop, and so on up the chain.
 */
static void free_user_ns(struct work_struct *work)
{
        struct user_namespace *parent, *ns =
                container_of(work, struct user_namespace, work);

        do {
                /* Both are needed after ns itself has been freed below. */
                struct ucounts *ucounts = ns->ucounts;
                parent = ns->parent;
                /*
                 * The forward/reverse arrays are only freed when the map
                 * holds more than UID_GID_MAP_MAX_BASE_EXTENTS extents.
                 */
                if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->gid_map.forward);
                        kfree(ns->gid_map.reverse);
                }
                if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->uid_map.forward);
                        kfree(ns->uid_map.reverse);
                }
                if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->projid_map.forward);
                        kfree(ns->projid_map.reverse);
                }
                retire_userns_sysctls(ns);
                key_free_user_ns(ns);
                ns_free_inum(&ns->ns);
                kmem_cache_free(user_ns_cachep, ns);
                /* Release the charge taken by inc_user_namespaces(). */
                dec_user_namespaces(ucounts);
                ns = parent;
        } while (atomic_dec_and_test(&parent->count));
}

/*
 * __put_user_ns - queue a user namespace for freeing.
 *
 * The teardown itself is not done here; the namespace's embedded work
 * item is handed to schedule_work().
 */
void __put_user_ns(struct user_namespace *ns)
{
        struct work_struct *work = &ns->work;

        schedule_work(work);
}
EXPORT_SYMBOL(__put_user_ns);

/**
 * idmap_key struct holds the information necessary to find an idmapping in a
 * sorted idmap array. It is passed to cmp_map_id() as first argument.
 *
 * cmp_map_id() treats [id, id + count - 1] as the range to look up, so
 * count is the number of ids in that range: map_id_up_max() sets it to 1
 * and map_id_range_down_max() passes its caller's count through.
 */
struct idmap_key {
        bool map_up; /* true  -> id from kid; false -> kid from id */
        u32 id; /* id to find */
        u32 count; /* number of ids in the range starting at id */
};

/**
 * cmp_map_id - Function to be passed to bsearch() to find the requested
 * idmapping. Expects struct idmap_key to be passed via @k.
 *
 * Returns 0 when the whole key range lies inside extent @e, -1 when the
 * key range starts or ends below the extent, and 1 otherwise.
 */
static int cmp_map_id(const void *k, const void *e)
{
        const struct idmap_key *key = k;
        const struct uid_gid_extent *el = e;
        u32 lo, hi, key_last;

        /* Last id of the requested range. */
        key_last = key->id + key->count - 1;

        /* handle map_id_{down,up}() */
        lo = key->map_up ? el->lower_first : el->first;
        hi = lo + el->count - 1;

        if (key->id < lo || key_last < lo)
                return -1;

        /* Both ends are at or above lo here; check the upper bound. */
        if (key->id <= hi && key_last <= hi)
                return 0;

        return 1;
}

/**
 * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Searches @map->forward for an extent containing all of
 * [@id, @id + @count - 1]; returns it, or NULL if bsearch() finds none.
 */
static struct uid_gid_extent *
map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        struct idmap_key key = {
                .map_up = false,
                .id = id,
                .count = count,
        };

        return bsearch(&key, map->forward, extents,
                       sizeof(struct uid_gid_extent), cmp_map_id);
}

/**
 * map_id_range_down_base - Find idmap via linear scan of the static extent
 * array. Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first extent in @map->extent whose [first, first + count - 1]
 * range contains both @id and @id + @count - 1, or NULL if there is none.
 */
static struct uid_gid_extent *
map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        const u32 id2 = id + count - 1;
        unsigned idx;

        /* Find the matching extent */
        for (idx = 0; idx < extents; idx++) {
                struct uid_gid_extent *ext = &map->extent[idx];
                u32 first = ext->first;
                u32 last = first + ext->count - 1;

                if (id < first || id > last)
                        continue;
                if (id2 < first || id2 > last)
                        continue;
                return ext;
        }
        return NULL;
}

/*
 * map_id_range_down - translate @id through @map, requiring that the
 * whole range [@id, @id + @count - 1] falls inside one extent.
 *
 * Returns the id offset into the extent's lower range, or (u32) -1 when
 * no extent covers the range.
 */
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
        struct uid_gid_extent *extent;
        unsigned extents = map->nr_extents;

        /*
         * Read barrier between sampling nr_extents and reading the
         * extents themselves.
         * NOTE(review): presumably pairs with a write barrier on the
         * map-writing side - confirm.
         */
        smp_rmb();

        extent = (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                ? map_id_range_down_base(extents, map, id, count)
                : map_id_range_down_max(extents, map, id, count);

        /* Note failure ... */
        if (!extent)
                return (u32) -1;

        /* ... or map the id */
        return (id - extent->first) + extent->lower_first;
}

/* map_id_down - map_id_range_down() for a single id (count of 1). */
static u32 map_id_down(struct uid_gid_map *map, u32 id)
{
        const u32 mapped = map_id_range_down(map, id, 1);

        return mapped;
}

/**
 * map_id_up_base - Find idmap via linear scan of the static extent array.
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first extent in @map->extent whose
 * [lower_first, lower_first + count - 1] range contains @id, or NULL.
 */
static struct uid_gid_extent *
map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
{
        unsigned idx;

        /* Find the matching extent */
        for (idx = 0; idx < extents; idx++) {
                struct uid_gid_extent *ext = &map->extent[idx];
                u32 first = ext->lower_first;
                u32 last = first + ext->count - 1;

                if (id < first || id > last)
                        continue;
                return ext;
        }
        return NULL;
}

/**
 * map_id_up_max - Find idmap via binary search in ordered idmap array.
 * @extents: number of entries in @map->reverse
 * @map:     id map to search
 * @id:      lower id to look up
 *
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Return: the matching extent from @map->reverse, or NULL if bsearch() finds
 * none.
 */
static struct uid_gid_extent *
map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
{
        /* Search key: a single id on the "lower_first" side. */
        struct idmap_key lookup = {
                .map_up = true,
                .id     = id,
                .count  = 1,
        };

        return bsearch(&lookup, map->reverse, extents,
                       sizeof(*map->reverse), cmp_map_id);
}

/*
 * Translate @id through @map in the "up" direction
 * (extent->lower_first -> extent->first).
 *
 * Returns the translated id, or (u32) -1 when no extent contains @id.
 */
static u32 map_id_up(struct uid_gid_map *map, u32 id)
{
        struct uid_gid_extent *match;
        unsigned nr = map->nr_extents;

        /* Read the extent count before the extents (see map_write()). */
        smp_rmb();

        match = (nr <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                map_id_up_base(nr, map, id) :
                map_id_up_max(nr, map, id);

        if (!match)
                return (u32) -1;

        /* Keep the offset inside the extent, rebase onto the upper range. */
        return (id - match->lower_first) + match->first;
}

/**
 * make_kuid - Map a user-namespace uid pair into a kuid.
 * @ns:  User namespace that the uid is in
 * @uid: User identifier
 *
 * Translates @uid down through @ns->uid_map and wraps the result in a
 * kernel internal kuid.
 *
 * When there is no mapping defined for the user-namespace uid pair,
 * INVALID_UID is returned.  Callers are expected to test for and handle
 * INVALID_UID being returned; uid_valid() may be used for that test.
 */
kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
{
        u32 kernel_id = map_id_down(&ns->uid_map, uid);

        return KUIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kuid);

/**
 * from_kuid - Create a uid from a kuid user-namespace pair.
 * @targ: The user namespace we want a uid in.
 * @kuid: The kernel internal uid to start with.
 *
 * Translates @kuid up through @targ->uid_map and returns the resulting uid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * If @kuid has no mapping in @targ, (uid_t)-1 is returned.
 */
uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
        struct uid_gid_map *uid_map = &targ->uid_map;

        return map_id_up(uid_map, __kuid_val(kuid));
}
EXPORT_SYMBOL(from_kuid);

/**
 * from_kuid_munged - Create a uid from a kuid user-namespace pair.
 * @targ: The user namespace we want a uid in.
 * @kuid: The kernel internal uid to start with.
 *
 * Map @kuid into the user-namespace specified by @targ and return the
 * resulting uid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * Unlike from_kuid(), from_kuid_munged() never fails and always returns a
 * valid uid.  This makes it appropriate for use in syscalls like stat and
 * getuid, where failing the system call and failing to provide a valid uid
 * are not options.
 *
 * If @kuid has no mapping in @targ, overflowuid is returned.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
        uid_t mapped = from_kuid(targ, kuid);

        if (mapped != (uid_t) -1)
                return mapped;

        return overflowuid;
}
EXPORT_SYMBOL(from_kuid_munged);

/**
 * make_kgid - Map a user-namespace gid pair into a kgid.
 * @ns:  User namespace that the gid is in
 * @gid: Group identifier
 *
 * Translates @gid down through @ns->gid_map and wraps the result in a
 * kernel internal kgid.
 *
 * When there is no mapping defined for the user-namespace gid pair,
 * INVALID_GID is returned.  Callers are expected to test for and handle
 * INVALID_GID being returned; gid_valid() may be used for that test.
 */
kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
        u32 kernel_id = map_id_down(&ns->gid_map, gid);

        return KGIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kgid);

/**
 * from_kgid - Create a gid from a kgid user-namespace pair.
 * @targ: The user namespace we want a gid in.
 * @kgid: The kernel internal gid to start with.
 *
 * Translates @kgid up through @targ->gid_map and returns the resulting gid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * If @kgid has no mapping in @targ, (gid_t)-1 is returned.
 */
gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
{
        struct uid_gid_map *gid_map = &targ->gid_map;

        return map_id_up(gid_map, __kgid_val(kgid));
}
EXPORT_SYMBOL(from_kgid);

/**
 * from_kgid_munged - Create a gid from a kgid user-namespace pair.
 * @targ: The user namespace we want a gid in.
 * @kgid: The kernel internal gid to start with.
 *
 * Map @kgid into the user-namespace specified by @targ and return the
 * resulting gid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * Unlike from_kgid(), from_kgid_munged() never fails and always returns a
 * valid gid.  This makes it appropriate for use in syscalls like stat and
 * getgid, where failing the system call and failing to provide a valid gid
 * are not options.
 *
 * If @kgid has no mapping in @targ, overflowgid is returned.
 */
gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
{
        gid_t mapped = from_kgid(targ, kgid);

        if (mapped != (gid_t) -1)
                return mapped;

        return overflowgid;
}
EXPORT_SYMBOL(from_kgid_munged);

/**
 * make_kprojid - Map a user-namespace projid pair into a kprojid.
 * @ns:     User namespace that the projid is in
 * @projid: Project identifier
 *
 * Translates @projid down through @ns->projid_map and wraps the result in
 * a kernel internal kprojid.
 *
 * When there is no mapping defined for the user-namespace projid pair,
 * INVALID_PROJID is returned.  Callers are expected to test for and handle
 * INVALID_PROJID being returned; projid_valid() may be used for that test.
 */
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
{
        u32 kernel_id = map_id_down(&ns->projid_map, projid);

        return KPROJIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kprojid);

/**
 * from_kprojid - Create a projid from a kprojid user-namespace pair.
 * @targ:    The user namespace we want a projid in.
 * @kprojid: The kernel internal project identifier to start with.
 *
 * Translates @kprojid up through @targ->projid_map and returns the
 * resulting projid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * If @kprojid has no mapping in @targ, (projid_t)-1 is returned.
 */
projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
{
        struct uid_gid_map *projid_map = &targ->projid_map;

        return map_id_up(projid_map, __kprojid_val(kprojid));
}
EXPORT_SYMBOL(from_kprojid);

/**
 * from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
 * @targ:    The user namespace we want a projid in.
 * @kprojid: The kernel internal projid to start with.
 *
 * Map @kprojid into the user-namespace specified by @targ and return the
 * resulting projid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * Unlike from_kprojid(), from_kprojid_munged() never fails and always
 * returns a valid projid.  This makes it appropriate for use in syscalls
 * like stat, where failing the system call and failing to provide a valid
 * projid are not options.
 *
 * If @kprojid has no mapping in @targ, OVERFLOW_PROJID is returned.
 */
projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
{
        projid_t mapped = from_kprojid(targ, kprojid);

        if (mapped != (projid_t) -1)
                return mapped;

        return OVERFLOW_PROJID;
}
EXPORT_SYMBOL(from_kprojid_munged);


/*
 * seq_file ->show for the uid map: print one extent as
 * "first lower count".  The stored lower_first is a kernel uid; it is
 * translated with from_kuid() into the namespace chosen below before
 * printing.
 */
static int uid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *map_ns = seq->private;
        struct user_namespace *view_ns = seq_user_ns(seq);
        struct uid_gid_extent *ext = v;
        uid_t lower;

        /*
         * When seq_user_ns() yields the map's own namespace, express the
         * lower ids relative to its parent (if it has one).
         */
        if (view_ns == map_ns && view_ns->parent)
                view_ns = view_ns->parent;

        lower = from_kuid(view_ns, KUIDT_INIT(ext->lower_first));

        seq_printf(seq, "%10u %10u %10u\n", ext->first, lower, ext->count);

        return 0;
}

/*
 * seq_file ->show for the gid map: print one extent as
 * "first lower count".  The stored lower_first is a kernel gid; it is
 * translated with from_kgid() into the namespace chosen below before
 * printing.
 */
static int gid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *map_ns = seq->private;
        struct user_namespace *view_ns = seq_user_ns(seq);
        struct uid_gid_extent *ext = v;
        gid_t lower;

        /*
         * When seq_user_ns() yields the map's own namespace, express the
         * lower ids relative to its parent (if it has one).
         */
        if (view_ns == map_ns && view_ns->parent)
                view_ns = view_ns->parent;

        lower = from_kgid(view_ns, KGIDT_INIT(ext->lower_first));

        seq_printf(seq, "%10u %10u %10u\n", ext->first, lower, ext->count);

        return 0;
}

/*
 * seq_file ->show for the projid map: print one extent as
 * "first lower count".  The stored lower_first is a kernel projid; it is
 * translated with from_kprojid() into the namespace chosen below before
 * printing.
 */
static int projid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *map_ns = seq->private;
        struct user_namespace *view_ns = seq_user_ns(seq);
        struct uid_gid_extent *ext = v;
        projid_t lower;

        /*
         * When seq_user_ns() yields the map's own namespace, express the
         * lower ids relative to its parent (if it has one).
         */
        if (view_ns == map_ns && view_ns->parent)
                view_ns = view_ns->parent;

        lower = from_kprojid(view_ns, KPROJIDT_INIT(ext->lower_first));

        seq_printf(seq, "%10u %10u %10u\n", ext->first, lower, ext->count);

        return 0;
}

/*
 * Common seq_file ->start helper: return the extent at position *@ppos in
 * @map, or NULL once the position is past the last extent.
 */
static void *m_start(struct seq_file *seq, loff_t *ppos,
                     struct uid_gid_map *map)
{
        loff_t idx = *ppos;
        unsigned nr = map->nr_extents;

        /* Read the extent count before the extents (see map_write()). */
        smp_rmb();

        if (idx >= nr)
                return NULL;

        /* Large maps keep their extents in the separate forward array. */
        if (nr > UID_GID_MAP_MAX_BASE_EXTENTS)
                return &map->forward[idx];

        return &map->extent[idx];
}

/* seq_file ->start for the uid map of the namespace in seq->private. */
static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *user_ns = seq->private;
        struct uid_gid_map *uid_map = &user_ns->uid_map;

        return m_start(seq, ppos, uid_map);
}

/* seq_file ->start for the gid map of the namespace in seq->private. */
static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *user_ns = seq->private;
        struct uid_gid_map *gid_map = &user_ns->gid_map;

        return m_start(seq, ppos, gid_map);
}

/* seq_file ->start for the projid map of the namespace in seq->private. */
static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *user_ns = seq->private;
        struct uid_gid_map *projid_map = &user_ns->projid_map;

        return m_start(seq, ppos, projid_map);
}

/*
 * seq_file ->next shared by all three maps: advance the position and let
 * the file's own ->start look up the extent at the new position.
 */
static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
{
        *pos += 1;

        return seq->op->start(seq, pos);
}

/* seq_file ->stop: nothing to undo, m_start() takes no locks or references. */
static void m_stop(struct seq_file *seq, void *v)
{
}

/* seq_file iterator over the extents of a namespace's uid map. */
const struct seq_operations proc_uid_seq_operations = {
        .start  = uid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = uid_m_show,
};

/* seq_file iterator over the extents of a namespace's gid map. */
const struct seq_operations proc_gid_seq_operations = {
        .start  = gid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = gid_m_show,
};

/* seq_file iterator over the extents of a namespace's projid map. */
const struct seq_operations proc_projid_seq_operations = {
        .start  = projid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = projid_m_show,
};

/*
 * Check whether @extent collides with any extent already stored in
 * @new_map, either on the upper side (first) or on the lower side
 * (lower_first).
 *
 * Returns true on any intersection, false if @extent is disjoint from all
 * existing extents (including when @new_map is still empty).
 */
static bool mappings_overlap(struct uid_gid_map *new_map,
                             struct uid_gid_extent *extent)
{
        const u32 up_lo = extent->first;
        const u32 up_hi = up_lo + extent->count - 1;
        const u32 low_lo = extent->lower_first;
        const u32 low_hi = low_lo + extent->count - 1;
        unsigned i;

        for (i = 0; i < new_map->nr_extents; i++) {
                struct uid_gid_extent *old;
                u32 old_up_lo, old_up_hi;
                u32 old_low_lo, old_low_hi;

                /* Small maps live inline, large ones in the forward array. */
                old = (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                        &new_map->extent[i] : &new_map->forward[i];

                old_up_lo = old->first;
                old_up_hi = old_up_lo + old->count - 1;
                old_low_lo = old->lower_first;
                old_low_hi = old_low_lo + old->count - 1;

                /* Closed intervals intersect iff each starts before the other ends. */
                if (old_up_lo <= up_hi && old_up_hi >= up_lo)
                        return true;

                if (old_low_lo <= low_hi && old_low_hi >= low_lo)
                        return true;
        }

        return false;
}

/**
 * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
 * @map:    map under construction
 * @extent: extent to append (copied by value)
 *
 * The first UID_GID_MAP_MAX_BASE_EXTENTS extents are stored inline in
 * @map->extent.  When the next one arrives, an array with room for
 * UID_GID_MAP_MAX_EXTENTS entries is allocated, the inline extents are
 * copied into it, and it becomes @map->forward; @map->reverse is left NULL
 * for a later step to set up.
 *
 * Return: 0 on success, -ENOMEM if the array could not be allocated.
 */
static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
{
        const unsigned slot = map->nr_extents;

        if (slot == UID_GID_MAP_MAX_BASE_EXTENTS) {
                struct uid_gid_extent *heap;

                /* Inline storage is full: switch to the big array. */
                heap = kmalloc_array(UID_GID_MAP_MAX_EXTENTS,
                                     sizeof(struct uid_gid_extent),
                                     GFP_KERNEL);
                if (!heap)
                        return -ENOMEM;

                /* Copy the inline extents first, then repoint the map. */
                memcpy(heap, map->extent, slot * sizeof(map->extent[0]));

                map->forward = heap;
                map->reverse = NULL;
        }

        if (slot >= UID_GID_MAP_MAX_BASE_EXTENTS)
                map->forward[slot] = *extent;
        else
                map->extent[slot] = *extent;

        map->nr_extents = slot + 1;

        return 0;
}

/* sort() comparator for forward mappings: ascending by ->first. */
static int cmp_extents_forward(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        if (lhs->first == rhs->first)
                return 0;

        return (lhs->first < rhs->first) ? -1 : 1;
}

/* sort() comparator for reverse mappings: ascending by ->lower_first. */
static int cmp_extents_reverse(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        if (lhs->lower_first == rhs->lower_first)
                return 0;

        return (lhs->lower_first < rhs->lower_first) ? -1 : 1;
}

/**
 * sort_idmaps - Sorts an array of idmap entries.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static int sort_idmaps(struct uid_gid_map *map)
{
        if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                return 0;

        /* Sort forward array. */
        sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
             cmp_extents_forward, NULL);

        /* Only copy the memory from forward we actually need. */
        map->reverse = kmemdup(map->forward,
                               map->nr_extents * sizeof(struct uid_gid_extent),
                               GFP_KERNEL);
        if (!map->reverse)
                return -ENOMEM;

        /* Sort reverse array. */
        sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
             cmp_extents_reverse, NULL);

        return 0;
}

/**
 * verify_root_map() - check the uid 0 mapping
 * @file: idmapping file
 * @map_ns: user namespace of the target process
 * @new_map: requested idmap
 *
 * If a process requests mapping parent uid 0 into the new ns, verify that the
 * process writing the map had the CAP_SETFCAP capability as the target process
 * will be able to write fscaps that are valid in ancestor user namespaces.
 *
 * Return: true if the mapping is allowed, false if not.
 */
static bool verify_root_map(const struct file *file,
                            struct user_namespace *map_ns,
                            struct uid_gid_map *new_map)
{
        const struct user_namespace *file_ns = file->f_cred->user_ns;
        bool maps_root = false;
        int idx;

        /* Does any requested extent start at lower id 0? */
        for (idx = 0; idx < new_map->nr_extents; idx++) {
                struct uid_gid_extent *cur;

                if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        cur = &new_map->extent[idx];
                else
                        cur = &new_map->forward[idx];

                if (cur->lower_first == 0) {
                        maps_root = true;
                        break;
                }
        }

        /* No such extent: nothing to verify. */
        if (!maps_root)
                return true;

        /*
         * The process unshared its ns and is writing to its own
         * /proc/self/uid_map.  User already has full capabilities in
         * the new namespace.  Verify that the parent had CAP_SETFCAP
         * when it unshared.
         */
        if (map_ns == file_ns)
                return file_ns->parent_could_setfcap;

        /*
         * Process p1 is writing to uid_map of p2, who is in a child
         * user namespace to p1's.  Verify that the opener of the map
         * file has CAP_SETFCAP against the parent of the new map
         * namespace.
         */
        return file_ns_capable(file, map_ns->parent, CAP_SETFCAP);
}

/*
 * map_write - parse and install an id map written to a /proc map file.
 * @file:       the open map file; file->private_data is its seq_file
 * @buf:        user buffer holding lines of "first lower_first count"
 * @count:      number of bytes in @buf; must be smaller than PAGE_SIZE
 * @ppos:       file position; must be 0 on entry, set to @count on success
 * @cap_setid:  CAP_SETUID, CAP_SETGID, or an invalid capability value (-1)
 *              for maps that need no capability
 * @map:        the map to fill in; must still be empty
 * @parent_map: the parent namespace's map, used to translate each lower
 *              range into kernel ids
 *
 * The whole map is built and validated in a local copy and only published
 * to @map at the very end, under userns_state_mutex.
 *
 * Return: @count on success.  On failure a negative errno:
 *   -EINVAL  bad offset/size, malformed line, wrapping or empty extent,
 *            overlapping extents, too many extents, or no extents at all;
 *   -EPERM   map already written, missing capability, mapping not
 *            permitted, or a lower range the parent map cannot translate;
 *   -ENOMEM  allocation failure in insert_extent() or sort_idmaps();
 *   or the error returned by memdup_user_nul().
 */
static ssize_t map_write(struct file *file, const char __user *buf,
                         size_t count, loff_t *ppos,
                         int cap_setid,
                         struct uid_gid_map *map,
                         struct uid_gid_map *parent_map)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *map_ns = seq->private;
        struct uid_gid_map new_map;
        unsigned idx;
        struct uid_gid_extent extent;
        char *kbuf = NULL, *pos, *next_line;
        ssize_t ret;

        /* Only allow < page size writes at the beginning of the file */
        if ((*ppos != 0) || (count >= PAGE_SIZE))
                return -EINVAL;

        /* Slurp in the user data (NUL-terminated kernel copy) */
        kbuf = memdup_user_nul(buf, count);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /*
         * The userns_state_mutex serializes all writes to any given map.
         *
         * Any map is only ever written once.
         *
         * An id map fits within 1 cache line on most architectures.
         *
         * On read nothing needs to be done unless you are on an
         * architecture with a crazy cache coherency model like alpha.
         *
         * There is a one time data dependency between reading the
         * count of the extents and the values of the extents.  The
         * desired behavior is to see the values of the extents that
         * were written before the count of the extents.
         *
         * To achieve this smp_wmb() is used to guarantee the write
         * order and smp_rmb() guarantees that we don't have crazy
         * architectures returning stale data.
         */
        mutex_lock(&userns_state_mutex);

        /* Start from an empty scratch map; @map is untouched until the end. */
        memset(&new_map, 0, sizeof(struct uid_gid_map));

        ret = -EPERM;
        /* Only allow one successful write to the map */
        if (map->nr_extents != 0)
                goto out;

        /*
         * Adjusting namespace settings requires capabilities on the target.
         * Skipped when cap_setid is not a valid capability.
         */
        if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
                goto out;

        /* Parse the user data, one "first lower_first count" line per pass */
        ret = -EINVAL;
        pos = kbuf;
        for (; pos; pos = next_line) {

                /*
                 * Find the end of line and ensure I don't look past it.
                 * A newline that ends the buffer terminates the loop.
                 */
                next_line = strchr(pos, '\n');
                if (next_line) {
                        *next_line = '\0';
                        next_line++;
                        if (*next_line == '\0')
                                next_line = NULL;
                }

                /* First field: start of the range inside the namespace */
                pos = skip_spaces(pos);
                extent.first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                /* Second field: start of the range in the parent namespace */
                pos = skip_spaces(pos);
                extent.lower_first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                /* Third field: length; may be followed by end of line */
                pos = skip_spaces(pos);
                extent.count = simple_strtoul(pos, &pos, 10);
                if (*pos && !isspace(*pos))
                        goto out;

                /* Verify there is not trailing junk on the line */
                pos = skip_spaces(pos);
                if (*pos != '\0')
                        goto out;

                /* Verify we have been given valid starting values */
                if ((extent.first == (u32) -1) ||
                    (extent.lower_first == (u32) -1))
                        goto out;

                /* Verify count is not zero and does not cause the
                 * extent to wrap
                 */
                if ((extent.first + extent.count) <= extent.first)
                        goto out;
                if ((extent.lower_first + extent.count) <=
                     extent.lower_first)
                        goto out;

                /* Do the ranges in extent overlap any previous extents? */
                if (mappings_overlap(&new_map, &extent))
                        goto out;

                /*
                 * This extent would fill the last available slot; refuse
                 * if more input lines follow it.
                 */
                if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
                    (next_line != NULL))
                        goto out;

                ret = insert_extent(&new_map, &extent);
                if (ret < 0)
                        goto out;
                /* Restore the default error for the next line's checks */
                ret = -EINVAL;
        }
        /* Be very certain the new map actually exists */
        if (new_map.nr_extents == 0)
                goto out;

        ret = -EPERM;
        /* Validate the user is allowed to use user id's mapped to. */
        if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
                goto out;

        ret = -EPERM;
        /* Map the lower ids from the parent user namespace to the
         * kernel global id space.
         */
        for (idx = 0; idx < new_map.nr_extents; idx++) {
                struct uid_gid_extent *e;
                u32 lower_first;

                /* Small maps live inline, large ones in the forward array */
                if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        e = &new_map.extent[idx];
                else
                        e = &new_map.forward[idx];

                lower_first = map_id_range_down(parent_map,
                                                e->lower_first,
                                                e->count);

                /* Fail if we can not map the specified extent to
                 * the kernel global id space.
                 */
                if (lower_first == (u32) -1)
                        goto out;

                e->lower_first = lower_first;
        }

        /*
         * If we want to use binary search for lookup, this clones the extent
         * array and sorts both copies.
         */
        ret = sort_idmaps(&new_map);
        if (ret < 0)
                goto out;

        /* Install the map */
        if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
                memcpy(map->extent, new_map.extent,
                       new_map.nr_extents * sizeof(new_map.extent[0]));
        } else {
                map->forward = new_map.forward;
                map->reverse = new_map.reverse;
        }
        /* Publish the extents before the count that makes them visible */
        smp_wmb();
        map->nr_extents = new_map.nr_extents;

        *ppos = count;
        ret = count;
out:
        /*
         * On failure with a large scratch map, free its arrays and reset
         * @map, which was still empty when this point is reached.
         */
        if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(new_map.forward);
                kfree(new_map.reverse);
                map->forward = NULL;
                map->reverse = NULL;
                map->nr_extents = 0;
        }

        mutex_unlock(&userns_state_mutex);
        kfree(kbuf);
        return ret;
}

/*
 * Write handler for a namespace's uid map file.
 *
 * Fails with -EPERM when the target namespace has no parent, or when the
 * namespace returned by seq_user_ns() is neither the target namespace nor
 * its parent.  Otherwise the write is handed to map_write() with
 * CAP_SETUID and the uid maps of the target and its parent.
 */
ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *target = seq->private;
        struct user_namespace *writer_ns = seq_user_ns(seq);

        if (!target->parent ||
            (writer_ns != target && writer_ns != target->parent))
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETUID,
                         &target->uid_map, &target->parent->uid_map);
}

/*
 * Write handler for a namespace's gid map file.
 *
 * Fails with -EPERM when the target namespace has no parent, or when the
 * namespace returned by seq_user_ns() is neither the target namespace nor
 * its parent.  Otherwise the write is handed to map_write() with
 * CAP_SETGID and the gid maps of the target and its parent.
 */
ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *target = seq->private;
        struct user_namespace *writer_ns = seq_user_ns(seq);

        if (!target->parent ||
            (writer_ns != target && writer_ns != target->parent))
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETGID,
                         &target->gid_map, &target->parent->gid_map);
}

/*
 * Write handler for a namespace's projid map file.
 *
 * Fails with -EPERM when the target namespace has no parent, or when the
 * namespace returned by seq_user_ns() is neither the target namespace nor
 * its parent.  Otherwise the write is handed to map_write() with the
 * projid maps of the target and its parent.
 */
ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
                              size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *target = seq->private;
        struct user_namespace *writer_ns = seq_user_ns(seq);

        if (!target->parent ||
            (writer_ns != target && writer_ns != target->parent))
                return -EPERM;

        /* Anyone can set any valid project id, no capability needed (-1) */
        return map_write(file, buf, size, ppos, -1,
                         &target->projid_map, &target->parent->projid_map);
}

/*
 * Decide whether the opener of @file may install @new_map on @ns.
 * @cap_setid is CAP_SETUID for uid maps, CAP_SETGID for gid maps, or an
 * invalid capability value for maps that require no privilege.
 *
 * Returns true if the mapping is permitted, false otherwise.
 */
static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *new_map)
{
        const struct cred *cred = file->f_cred;
        bool single_id_by_owner;

        /* uid maps additionally need the root-mapping check to pass. */
        if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
                return false;

        /* Don't allow mappings that would allow anything that wouldn't
         * be allowed without the establishment of unprivileged mappings.
         *
         * The unprivileged case: exactly one id is mapped, by the owner
         * of the namespace, and it maps to the opener's own euid/egid.
         */
        single_id_by_owner = (new_map->nr_extents == 1) &&
                             (new_map->extent[0].count == 1) &&
                             uid_eq(ns->owner, cred->euid);
        if (single_id_by_owner) {
                u32 lower = new_map->extent[0].lower_first;

                switch (cap_setid) {
                case CAP_SETUID: {
                        kuid_t kuid = make_kuid(ns->parent, lower);

                        if (uid_eq(kuid, cred->euid))
                                return true;
                        break;
                }
                case CAP_SETGID: {
                        kgid_t kgid = make_kgid(ns->parent, lower);

                        /* Only once setgroups has been disabled in @ns. */
                        if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
                            gid_eq(kgid, cred->egid))
                                return true;
                        break;
                }
                default:
                        break;
                }
        }

        /* Allow anyone to set a mapping that doesn't require privilege */
        if (!cap_valid(cap_setid))
                return true;

        /* Allow the specified ids if we have the appropriate capability
         * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
         * And the opener of the id file also had the appropriate capability.
         */
        return ns_capable(ns->parent, cap_setid) &&
               file_ns_capable(file, ns->parent, cap_setid);
}

/*
 * seq_file show handler for the setgroups file: prints "allow" or "deny"
 * according to USERNS_SETGROUPS_ALLOWED in a single snapshot of ns->flags.
 */
int proc_setgroups_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        unsigned long flags_snapshot = READ_ONCE(ns->flags);
        const char *state;

        state = (flags_snapshot & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny";
        seq_printf(seq, "%s\n", state);

        return 0;
}

/*
 * Write handler for the setgroups file.  Accepts "allow" or "deny",
 * optionally followed by whitespace, written at offset 0.
 *
 * Returns @count on success, or:
 *   -EINVAL  non-zero offset, input too long, or unrecognised text;
 *   -EFAULT  the user buffer could not be copied;
 *   -EPERM   "allow" after setgroups was disabled, or "deny" after the
 *            gid map has been written.
 */
ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *ns = seq->private;
        char kbuf[8], *tail;
        bool want_allow;
        bool permitted;

        /* Only allow a very narrow range of strings to be written */
        if ((*ppos != 0) || (count >= sizeof(kbuf)))
                return -EINVAL;

        /* What was written? */
        if (copy_from_user(kbuf, buf, count))
                return -EFAULT;
        kbuf[count] = '\0';

        /* What is being requested? */
        if (strncmp(kbuf, "allow", 5) == 0) {
                tail = kbuf + 5;
                want_allow = true;
        } else if (strncmp(kbuf, "deny", 4) == 0) {
                tail = kbuf + 4;
                want_allow = false;
        } else {
                return -EINVAL;
        }

        /* Verify there is not trailing junk on the line */
        tail = skip_spaces(tail);
        if (*tail != '\0')
                return -EINVAL;

        mutex_lock(&userns_state_mutex);
        if (want_allow) {
                /* Enabling setgroups after setgroups has been disabled
                 * is not allowed.
                 */
                permitted = !!(ns->flags & USERNS_SETGROUPS_ALLOWED);
        } else {
                /* Permanently disabling setgroups after setgroups has
                 * been enabled by writing the gid_map is not allowed.
                 */
                permitted = (ns->gid_map.nr_extents == 0);
                if (permitted)
                        ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
        }
        mutex_unlock(&userns_state_mutex);

        if (!permitted)
                return -EPERM;

        /* Report a successful write */
        *ppos = count;
        return count;
}

bool userns_may_setgroups(const struct user_namespace *ns)
{
        bool allowed;

        mutex_lock(&userns_state_mutex);
        /* It is not safe to use setgroups until a gid mapping in
         * the user namespace has been established.
         */
        allowed = ns->gid_map.nr_extents != 0;
        /* Is setgroups allowed? */
        allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
        mutex_unlock(&userns_state_mutex);

        return allowed;
}

/*
 * Returns true if @child is the same namespace or a descendant of
 * @ancestor.
 *
 * Walks up from @child via ->parent until reaching @ancestor's level, then
 * compares the namespace found there with @ancestor.
 */
bool in_userns(const struct user_namespace *ancestor,
               const struct user_namespace *child)
{
        const struct user_namespace *cur = child;

        while (cur->level > ancestor->level)
                cur = cur->parent;

        return cur == ancestor;
}

/*
 * Returns true if the current task's user namespace is @target_ns or a
 * descendant of it (see in_userns()).
 */
bool current_in_userns(const struct user_namespace *target_ns)
{
        const struct user_namespace *cur_ns = current_user_ns();

        return in_userns(target_ns, cur_ns);
}
EXPORT_SYMBOL(current_in_userns);

/* Map an embedded ns_common back to the user_namespace that contains it. */
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
        struct user_namespace *user_ns;

        user_ns = container_of(ns, struct user_namespace, ns);
        return user_ns;
}

/*
 * proc_ns_operations ->get: take a reference on the user namespace found
 * in @task's credentials and return its ns_common, or NULL if
 * get_user_ns() yielded no namespace.  The credentials are only
 * dereferenced inside the RCU read-side critical section.
 */
static struct ns_common *userns_get(struct task_struct *task)
{
        struct user_namespace *ns;

        rcu_read_lock();
        ns = get_user_ns(__task_cred(task)->user_ns);
        rcu_read_unlock();

        if (!ns)
                return NULL;

        return &ns->ns;
}

/* proc_ns_operations ->put: drop a reference on the owning user namespace. */
static void userns_put(struct ns_common *ns)
{
        struct user_namespace *user_ns = to_user_ns(ns);

        put_user_ns(user_ns);
}

/*
 * proc_ns_operations ->install: switch the credentials carried by @nsset
 * over to the user namespace behind @ns.
 *
 * Returns 0 on success, -EINVAL if the request is structurally not
 * allowed (same namespace, shared thread group, shared fs_struct, or no
 * credentials in @nsset) and -EPERM if the caller lacks CAP_SYS_ADMIN in
 * the target namespace.
 */
static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct user_namespace *target = to_user_ns(ns);
        struct cred *new_cred;

        /*
         * Re-entering the namespace we are already in would be a way to
         * gain capabilities; refuse it.
         */
        if (target == current_user_ns())
                return -EINVAL;

        /*
         * Tasks that share a thread group must share a user namespace,
         * and the fs_struct must not have any other user either.
         */
        if (!thread_group_empty(current) || current->fs->users != 1)
                return -EINVAL;

        if (!ns_capable(target, CAP_SYS_ADMIN))
                return -EPERM;

        new_cred = nsset_cred(nsset);
        if (!new_cred)
                return -EINVAL;

        /* Swap the namespace reference held by the new credentials. */
        put_user_ns(new_cred->user_ns);
        set_cred_user_ns(new_cred, get_user_ns(target));

        return 0;
}

/*
 * Return the ns_common of the user namespace that owns @ns, with a
 * reference taken via get_user_ns().
 *
 * The owner is only handed out if it is the caller's user namespace or
 * nested below it: the ->parent chain is walked upwards from the owner,
 * and running off the top (NULL) yields ERR_PTR(-EPERM).
 */
struct ns_common *ns_get_owner(struct ns_common *ns)
{
        struct user_namespace *cur_ns = current_user_ns();
        struct user_namespace *owner = ns->ops->owner(ns);
        struct user_namespace *iter = owner;

        /* See if the owner is in the current user namespace */
        while (iter && iter != cur_ns)
                iter = iter->parent;

        if (!iter)
                return ERR_PTR(-EPERM);

        return &get_user_ns(owner)->ns;
}

static struct user_namespace *userns_owner(struct ns_common *ns)
{
        return to_user_ns(ns)->parent;
}

/*
 * Namespace operations for the "user" namespace type (CLONE_NEWUSER).
 * Note that ->get_parent is wired to ns_get_owner(): for a user
 * namespace the owner reported by userns_owner() is its parent.
 */
const struct proc_ns_operations userns_operations = {
        .name                = "user",
        .type                = CLONE_NEWUSER,
        .get                = userns_get,
        .put                = userns_put,
        .install        = userns_install,
        .owner                = userns_owner,
        .get_parent        = ns_get_owner,
};

/*
 * Boot-time setup: create the slab cache used for struct user_namespace.
 * SLAB_PANIC is passed, so no failure handling is done here and the
 * function always returns 0.
 */
static __init int user_namespaces_init(void)
{
        user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
        return 0;
}
subsys_initcall(user_namespaces_init);





































































































    1 



    1 






    1 








    1 





























    1 









































    1 



    1 





    1 

    1 
    1 

    1 

    1 











    1 






    1 

    1 


    1 






































































































    1 




















    1 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/ima.h>
#include <linux/swap.h>

#include <linux/atomic.h>

#include "internal.h"

/*
 * sysctl tunables...
 *
 * max_files starts out as NR_FILE; nr_files is refreshed from the
 * per-cpu counter by proc_nr_files() before it is reported.
 */
struct files_stat_struct files_stat = {
        .max_files = NR_FILE
};

/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;

static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/*
 * RCU callback that performs the final release of a struct file: drop
 * the credential reference and return the object to filp_cachep.
 * Also called directly by __alloc_file() on its error path.
 */
static void file_free_rcu(struct rcu_head *head)
{
        struct file *filp;

        filp = container_of(head, struct file, f_u.fu_rcuhead);
        put_cred(filp->f_cred);
        kmem_cache_free(filp_cachep, filp);
}

static inline void file_free(struct file *f)
{
        security_file_free(f);
        if (!(f->f_mode & FMODE_NOACCOUNT))
                percpu_counter_dec(&nr_files);
        call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}

/*
 * Return the total number of open files in the system, as given by
 * percpu_counter_read_positive() on nr_files.
 */
static long get_nr_files(void)
{
        long nr = percpu_counter_read_positive(&nr_files);

        return nr;
}

/*
 * Return the maximum number of open files in the system
 * (the current value of files_stat.max_files).
 */
unsigned long get_max_files(void)
{
        unsigned long limit = files_stat.max_files;

        return limit;
}
EXPORT_SYMBOL_GPL(get_max_files);

/*
 * Handle nr_files sysctl
 *
 * With both CONFIG_SYSCTL and CONFIG_PROC_FS enabled, files_stat.nr_files
 * is refreshed from the per-cpu counter before the request is handed to
 * proc_doulongvec_minmax().  Otherwise the handler returns -ENOSYS.
 */
int proc_nr_files(struct ctl_table *table, int write,
                     void *buffer, size_t *lenp, loff_t *ppos)
{
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
        files_stat.nr_files = get_nr_files();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
#else
        return -ENOSYS;
#endif
}

/*
 * Allocate a zeroed struct file from filp_cachep and give it its basic
 * state: a reference on @cred, security state, a reference count of
 * one, initialised locks and the flags/mode derived from @flags.
 *
 * Returns the new file, ERR_PTR(-ENOMEM) if the allocation fails, or
 * the error from security_file_alloc() wrapped in ERR_PTR().
 */
static struct file *__alloc_file(int flags, const struct cred *cred)
{
        struct file *filp = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
        int err;

        if (unlikely(!filp))
                return ERR_PTR(-ENOMEM);

        filp->f_cred = get_cred(cred);

        err = security_file_alloc(filp);
        if (unlikely(err)) {
                /* Drops the cred reference and frees the object. */
                file_free_rcu(&filp->f_u.fu_rcuhead);
                return ERR_PTR(err);
        }

        /* f->f_version: 0 (the object was zero-allocated) */
        filp->f_flags = flags;
        filp->f_mode = OPEN_FMODE(flags);

        atomic_long_set(&filp->f_count, 1);
        rwlock_init(&filp->f_owner.lock);
        spin_lock_init(&filp->f_lock);
        mutex_init(&filp->f_pos_lock);
        eventpoll_init_file(filp);

        return filp;
}

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happened, e.g. we are over the
 * file structures limit, ran out of memory or the operation is not
 * permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will get an imbalance in the mount's writer count
 * and a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
        static long old_max;
        struct file *filp;

        /*
         * Privileged users can go above max_files.  percpu_counters are
         * inaccurate, so the expensive exact sum is consulted before we
         * actually fail.
         */
        if (get_nr_files() >= files_stat.max_files &&
            !capable(CAP_SYS_ADMIN) &&
            percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) {
                /* Ran out of filps - report that */
                if (get_nr_files() > old_max) {
                        pr_info("VFS: file-max limit %lu reached\n", get_max_files());
                        old_max = get_nr_files();
                }
                return ERR_PTR(-ENFILE);
        }

        filp = __alloc_file(flags, cred);
        if (IS_ERR(filp))
                return filp;

        percpu_counter_inc(&nr_files);
        return filp;
}

/*
 * Variant of alloc_empty_file() that doesn't check and modify nr_files.
 * The file is tagged FMODE_NOACCOUNT so that file_free() leaves the
 * counter alone as well.
 *
 * Should not be used unless there's a very good reason to do so.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
        struct file *filp;

        filp = __alloc_file(flags, cred);
        if (IS_ERR(filp))
                return filp;

        filp->f_mode |= FMODE_NOACCOUNT;
        return filp;
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 */
static struct file *alloc_file(const struct path *path, int flags,
                const struct file_operations *fop)
{
        struct file *file;

        file = alloc_empty_file(flags, current_cred());
        if (IS_ERR(file))
                return file;

        file->f_path = *path;
        file->f_inode = path->dentry->d_inode;
        file->f_mapping = path->dentry->d_inode->i_mapping;
        file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
        file->f_sb_err = file_sample_sb_err(file);
        if ((file->f_mode & FMODE_READ) &&
             likely(fop->read || fop->read_iter))
                file->f_mode |= FMODE_CAN_READ;
        if ((file->f_mode & FMODE_WRITE) &&
             likely(fop->write || fop->write_iter))
                file->f_mode |= FMODE_CAN_WRITE;
        file->f_mode |= FMODE_OPENED;
        file->f_op = fop;
        if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_inc(path->dentry->d_inode);
        return file;
}

struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
                                const char *name, int flags,
                                const struct file_operations *fops)
{
        static const struct dentry_operations anon_ops = {
                .d_dname = simple_dname
        };
        struct qstr this = QSTR_INIT(name, strlen(name));
        struct path path;
        struct file *file;

        path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
        if (!path.dentry)
                return ERR_PTR(-ENOMEM);
        if (!mnt->mnt_sb->s_d_op)
                d_set_d_op(path.dentry, &anon_ops);
        path.mnt = mntget(mnt);
        d_instantiate(path.dentry, inode);
        file = alloc_file(&path, flags, fops);
        if (IS_ERR(file)) {
                ihold(inode);
                path_put(&path);
        }
        return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Allocate a new file that refers to the same path as @base, opened
 * with @flags and using @fops.  On success a reference is taken on the
 * path and @base's f_mapping is carried over.  Returns the new file or
 * the error pointer from alloc_file().
 */
struct file *alloc_file_clone(struct file *base, int flags,
                                const struct file_operations *fops)
{
        struct file *clone;

        clone = alloc_file(&base->f_path, flags, fops);
        if (IS_ERR(clone))
                return clone;

        path_get(&clone->f_path);
        clone->f_mapping = base->f_mapping;
        return clone;
}

/*
 * The real guts of fput() - releasing the last reference to file.
 *
 * Files that never got FMODE_OPENED skip all teardown and go straight
 * to file_free().  For opened files the notification, locking, driver
 * release and reference-dropping steps below run in this order before
 * the struct file itself is freed.
 */
static void __fput(struct file *file)
{
        /* Snapshot everything needed after ->release() has run. */
        struct dentry *dentry = file->f_path.dentry;
        struct vfsmount *mnt = file->f_path.mnt;
        struct inode *inode = file->f_inode;
        fmode_t mode = file->f_mode;

        if (unlikely(!(file->f_mode & FMODE_OPENED)))
                goto out;

        might_sleep();

        fsnotify_close(file);
        /*
         * The function eventpoll_release() should be the first called
         * in the file cleanup chain.
         */
        eventpoll_release(file);
        locks_remove_file(file);

        ima_file_free(file);
        /* Let the driver tear down async notification if it was enabled. */
        if (unlikely(file->f_flags & FASYNC)) {
                if (file->f_op->fasync)
                        file->f_op->fasync(-1, file, 0);
        }
        if (file->f_op->release)
                file->f_op->release(inode, file);
        /* Character devices opened for real (not FMODE_PATH) drop their cdev. */
        if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
                     !(mode & FMODE_PATH))) {
                cdev_put(inode->i_cdev);
        }
        fops_put(file->f_op);
        put_pid(file->f_owner.pid);
        /* Mirrors the i_readcount_inc() done for read-only opens. */
        if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_dec(inode);
        if (mode & FMODE_WRITER) {
                put_write_access(inode);
                __mnt_drop_write(mnt);
        }
        dput(dentry);
        if (unlikely(mode & FMODE_NEED_UNMOUNT))
                dissolve_on_fput(mnt);
        mntput(mnt);
out:
        file_free(file);
}

/* Files whose final __fput() has been deferred by fput_many(). */
static LLIST_HEAD(delayed_fput_list);

/*
 * Detach everything currently queued on delayed_fput_list and run
 * __fput() on each file.  @unused is ignored.
 */
static void delayed_fput(struct work_struct *unused)
{
        struct llist_node *batch;
        struct file *filp, *next;

        batch = llist_del_all(&delayed_fput_list);
        llist_for_each_entry_safe(filp, next, batch, f_u.fu_llist)
                __fput(filp);
}

/* task_work callback: recover the file from its callback head and release it. */
static void ____fput(struct callback_head *work)
{
        struct file *filp = container_of(work, struct file, f_u.fu_rcuhead);

        __fput(filp);
}

/*
 * If a kernel thread really needs the final fput() it has done to have
 * completed, call this.  The only user right now is the boot code - we
 * *do* need to make sure that our writes to binaries on initramfs have
 * not left us with an opened struct file still waiting for __fput();
 * execve() won't work without that.  Please, don't add more callers
 * without very good reasons; in particular, never call this with locks
 * held and never call it from a thread that might need to do some work
 * on any kind of umount.
 *
 * Implemented by running delayed_fput() synchronously in the caller.
 */
void flush_delayed_fput(void)
{
        delayed_fput(NULL);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

/* Work item that drains delayed_fput_list via delayed_fput(). */
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * Drop @refs references on @file.  If that was the last reference, the
 * final __fput() is deferred:
 *  - for ordinary tasks outside interrupt context it is queued as task
 *    work on the current task (TWA_RESUME);
 *  - otherwise, or if task_work_add() fails, the file is put on
 *    delayed_fput_list and the delayed work is scheduled.
 */
void fput_many(struct file *file, unsigned int refs)
{
        struct task_struct *task = current;

        if (!atomic_long_sub_and_test(refs, &file->f_count))
                return;

        if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
                init_task_work(&file->f_u.fu_rcuhead, ____fput);
                if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
                        return;
                /*
                 * After this task has run exit_task_work(),
                 * task_work_add() will fail.  Fall through to delayed
                 * fput to avoid leaking *file.
                 */
        }

        /* Only kick the work when llist_add() reports it is needed. */
        if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
                schedule_delayed_work(&delayed_fput_work, 1);
}

/* Drop a single reference on @file; see fput_many(). */
void fput(struct file *file)
{
        fput_many(file, 1);
}

/*
 * Synchronous analog of fput(); meant for kernel threads that might be
 * needed in some umount() (and thus can't use flush_delayed_fput()
 * without risking deadlocks), that need to wait for completion of
 * __fput(), and that know that for this specific struct file it won't
 * involve anything that would need them.  Use only if you really need
 * it - at the very least, don't blindly convert fput() by a kernel
 * thread to this.
 *
 * Dropping the last reference from a task without PF_KTHREAD is a BUG.
 */
void __fput_sync(struct file *file)
{
        if (!atomic_long_dec_and_test(&file->f_count))
                return;

        BUG_ON(!(current->flags & PF_KTHREAD));
        __fput(file);
}

EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);

/*
 * Boot-time setup of the file table: create the "filp" slab cache for
 * struct file (SLAB_PANIC, so creation failure is not handled here) and
 * initialise the nr_files per-cpu counter to zero.
 * NOTE(review): the return value of percpu_counter_init() is not checked.
 */
void __init files_init(void)
{
        filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
        percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * One file with associated inode and dcache is very roughly 1K. Per default
 * do not use more than 10% of our memory for files.
 *
 * 1.5 times the number of pages already in use is set aside as a
 * reserve (capped so that at least one page remains); a tenth of the
 * rest, expressed in KiB, becomes the limit.  The result never drops
 * below NR_FILE.
 */
void __init files_maxfiles_init(void)
{
        unsigned long total = totalram_pages();
        unsigned long reserve = (total - nr_free_pages()) * 3 / 2;
        unsigned long limit;

        reserve = min(reserve, total - 1);
        limit = ((total - reserve) * (PAGE_SIZE / 1024)) / 10;

        files_stat.max_files = max_t(unsigned long, limit, NR_FILE);
}












































    1 





    1 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Supervisor Mode Access Prevention support
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: H. Peter Anvin <hpa@linux.intel.com>
 */

#ifndef _ASM_X86_SMAP_H
#define _ASM_X86_SMAP_H

#include <asm/nops.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

/* "Raw" instruction opcodes */
#define __ASM_CLAC        ".byte 0x0f,0x01,0xca"
#define __ASM_STAC        ".byte 0x0f,0x01,0xcb"

#ifdef __ASSEMBLY__

/*
 * Assembler-side ASM_CLAC / ASM_STAC.  With CONFIG_X86_SMAP they expand
 * to an ALTERNATIVE keyed on X86_FEATURE_SMAP whose default is empty;
 * without it they expand to nothing.
 */
#ifdef CONFIG_X86_SMAP

#define ASM_CLAC \
        ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP

#define ASM_STAC \
        ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP

#else /* CONFIG_X86_SMAP */

#define ASM_CLAC
#define ASM_STAC

#endif /* CONFIG_X86_SMAP */

#else /* __ASSEMBLY__ */

#ifdef CONFIG_X86_SMAP

/*
 * Emit the CLAC opcode bytes through alternative(), keyed on
 * X86_FEATURE_SMAP; the default alternative is empty.
 */
static __always_inline void clac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", __ASM_CLAC, X86_FEATURE_SMAP);
}

/*
 * Emit the STAC opcode bytes through alternative(), keyed on
 * X86_FEATURE_SMAP; the default alternative is empty.
 */
static __always_inline void stac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", __ASM_STAC, X86_FEATURE_SMAP);
}

/*
 * Save the flags register into the return value (pushf/pop) and then
 * execute CLAC.  The value is meant to be handed back to smap_restore().
 *
 * The sequence starts with an ALTERNATIVE whose default is "jmp 1f",
 * i.e. a jump over the whole body, and whose X86_FEATURE_SMAP variant
 * is empty.
 * NOTE(review): when the jump is taken, `flags` is returned without
 * having been written by the asm - presumably harmless because
 * smap_restore() skips its body under the same condition; confirm.
 */
static __always_inline unsigned long smap_save(void)
{
        unsigned long flags;

        asm volatile ("# smap_save\n\t"
                      ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP)
                      "pushf; pop %0; " __ASM_CLAC "\n\t"
                      "1:"
                      : "=rm" (flags) : : "memory", "cc");

        return flags;
}

/*
 * Load @flags back into the flags register (push/popf).  Counterpart of
 * smap_save().
 *
 * As in smap_save(), the default alternative is "jmp 1f", which skips
 * the push/popf; the X86_FEATURE_SMAP variant is empty so the body runs.
 */
static __always_inline void smap_restore(unsigned long flags)
{
        asm volatile ("# smap_restore\n\t"
                      ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP)
                      "push %0; popf\n\t"
                      "1:"
                      : : "g" (flags) : "memory", "cc");
}

/*
 * These macros can be used in asm() statements.  They expand to an
 * ALTERNATIVE() string with an empty default and the raw CLAC / STAC
 * bytes as the X86_FEATURE_SMAP variant.
 */
#define ASM_CLAC \
        ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP)
#define ASM_STAC \
        ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP)

#else /* CONFIG_X86_SMAP */

/*
 * !CONFIG_X86_SMAP stubs: clac()/stac()/smap_restore() do nothing,
 * smap_save() returns 0, and the asm() helper macros expand to nothing.
 */
static inline void clac(void) { }
static inline void stac(void) { }

static inline unsigned long smap_save(void) { return 0; }
static inline void smap_restore(unsigned long flags) { }

#define ASM_CLAC
#define ASM_STAC

#endif /* CONFIG_X86_SMAP */

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_SMAP_H */

















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMIOTRACE_H
#define _LINUX_MMIOTRACE_H

#include <linux/types.h>
#include <linux/list.h>

struct kmmio_probe;
struct pt_regs;

/*
 * Callback types stored in struct kmmio_probe.  The pre handler gets
 * the probe, the register state and an address; the post handler gets
 * the probe, a condition value and the register state.
 */
typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *,
                                struct pt_regs *, unsigned long addr);
typedef void (*kmmio_post_handler_t)(struct kmmio_probe *,
                                unsigned long condition, struct pt_regs *);

/*
 * One kmmio probe: a region [addr, addr + len) together with the
 * handlers to call around accesses to it.  Passed to
 * register_kmmio_probe() / unregister_kmmio_probe().
 */
struct kmmio_probe {
        /* kmmio internal list: */
        struct list_head        list;
        /* start location of the probe point: */
        unsigned long                addr;
        /* length of the probe region: */
        unsigned long                len;
        /* Called before addr is executed: */
        kmmio_pre_handler_t        pre_handler;
        /* Called after addr is executed: */
        kmmio_post_handler_t        post_handler;
        /* Opaque pointer; not interpreted anywhere in this header. */
        void                        *private;
};

/* Counter read by is_kmmio_active(); nonzero means kmmio is active. */
extern unsigned int kmmio_count;

/* Probe registration and kmmio setup/teardown entry points. */
extern int register_kmmio_probe(struct kmmio_probe *p);
extern void unregister_kmmio_probe(struct kmmio_probe *p);
extern int kmmio_init(void);
extern void kmmio_cleanup(void);

/*
 * mmiotrace hooks.  With CONFIG_MMIOTRACE these are real functions
 * defined elsewhere; without it every hook is an empty inline stub and
 * the int-returning ones return 0.
 */
#ifdef CONFIG_MMIOTRACE
/* Is kmmio active, i.e. is kmmio_count nonzero? */
static inline int is_kmmio_active(void)
{
        return kmmio_count;
}

/* Called from page fault handler. */
extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);

/* Called from ioremap.c */
extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
                                                        void __iomem *addr);
extern void mmiotrace_iounmap(volatile void __iomem *addr);

/* For anyone to insert markers. Remember trailing newline. */
extern __printf(1, 2) int mmiotrace_printk(const char *fmt, ...);
#else /* !CONFIG_MMIOTRACE: */
/* Stub: kmmio is never active. */
static inline int is_kmmio_active(void)
{
        return 0;
}

/* Stub: always returns 0. */
static inline int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
        return 0;
}

/* Stub: no tracing of ioremap. */
static inline void mmiotrace_ioremap(resource_size_t offset,
                                        unsigned long size, void __iomem *addr)
{
}

/* Stub: no tracing of iounmap. */
static inline void mmiotrace_iounmap(volatile void __iomem *addr)
{
}

/* Stub: the marker is dropped and 0 is returned. */
static inline __printf(1, 2) int mmiotrace_printk(const char *fmt, ...)
{
        return 0;
}
#endif /* CONFIG_MMIOTRACE */

/*
 * Opcode stored in the `opcode` member of the trace records below; the
 * trailing comment on each value names the record struct it goes with.
 */
enum mm_io_opcode {
        MMIO_READ        = 0x1,        /* struct mmiotrace_rw */
        MMIO_WRITE        = 0x2,        /* struct mmiotrace_rw */
        MMIO_PROBE        = 0x3,        /* struct mmiotrace_map */
        MMIO_UNPROBE        = 0x4,        /* struct mmiotrace_map */
        MMIO_UNKNOWN_OP = 0x5,        /* struct mmiotrace_rw */
};

/* Trace record for one register access; handed to mmio_trace_rw(). */
struct mmiotrace_rw {
        resource_size_t        phys;        /* PCI address of register */
        unsigned long        value;
        unsigned long        pc;        /* optional program counter */
        int                map_id;
        unsigned char        opcode;        /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
        unsigned char        width;        /* size of register access in bytes */
};

/* Trace record for a mapping event; handed to mmio_trace_mapping(). */
struct mmiotrace_map {
        resource_size_t        phys;        /* base address in PCI space */
        unsigned long        virt;        /* base virtual address */
        unsigned long        len;        /* mapping size */
        int                map_id;
        unsigned char        opcode;        /* MMIO_PROBE or MMIO_UNPROBE */
};

/*
 * in kernel/trace/trace_mmiotrace.c
 *
 * Tracer control plus the sinks for the record types declared above.
 * mmio_trace_printk() takes an already-started va_list.
 */
extern void enable_mmiotrace(void);
extern void disable_mmiotrace(void);
extern void mmio_trace_rw(struct mmiotrace_rw *rw);
extern void mmio_trace_mapping(struct mmiotrace_map *map);
extern __printf(1, 0) int mmio_trace_printk(const char *fmt, va_list args);

#endif /* _LINUX_MMIOTRACE_H */






















    1 



































    1 
    1 

    1 










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/cpumask.h>
#include <linux/export.h>
#include <linux/memblock.h>
#include <linux/numa.h>

/**
 * cpumask_next - get the next cpu in a cpumask
 * @n: the cpu prior to the place to search (ie. return will be > @n);
 *     -1 starts the search at cpu 0
 * @srcp: the cpumask pointer
 *
 * Returns >= nr_cpu_ids if no further cpus set.
 */
unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* -1 is a legal arg here, so only real cpu numbers are checked. */
        if (n != -1)
                cpumask_check(n);

        return find_next_bit(bits, nr_cpumask_bits, n + 1);
}
EXPORT_SYMBOL(cpumask_next);

/**
 * cpumask_next_and - get the next cpu in *src1p & *src2p
 * @n: the cpu prior to the place to search (ie. return will be > @n);
 *     -1 starts the search at cpu 0
 * @src1p: the first cpumask pointer
 * @src2p: the second cpumask pointer
 *
 * Returns >= nr_cpu_ids if no further cpus set in both.
 */
int cpumask_next_and(int n, const struct cpumask *src1p,
                     const struct cpumask *src2p)
{
        const unsigned long *bits1 = cpumask_bits(src1p);
        const unsigned long *bits2 = cpumask_bits(src2p);

        /* -1 is a legal arg here, so only real cpu numbers are checked. */
        if (n != -1)
                cpumask_check(n);

        return find_next_and_bit(bits1, bits2, nr_cpumask_bits, n + 1);
}
EXPORT_SYMBOL(cpumask_next_and);

/**
 * cpumask_any_but - return a "random" in a cpumask, but not this one.
 * @mask: the cpumask to search
 * @cpu: the cpu to ignore.
 *
 * Often used to find any cpu but smp_processor_id() in a mask.
 * Returns >= nr_cpu_ids if no cpus set.
 */
int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
{
        unsigned int i;

        cpumask_check(cpu);
        for_each_cpu(i, mask)
                if (i != cpu)
                        break;
        return i;
}
EXPORT_SYMBOL(cpumask_any_but);

/**
 * cpumask_next_wrap - helper to implement for_each_cpu_wrap
 * @n: the cpu prior to the place to search
 * @mask: the cpumask pointer
 * @start: the start point of the iteration
 * @wrap: assume @n crossing @start terminates the iteration
 *
 * Returns >= nr_cpu_ids on completion
 *
 * Note: the @wrap argument is required for the start condition when
 * we cannot assume @start is set in @mask.
 */
int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
{
        int next;

        for (;;) {
                next = cpumask_next(n, mask);

                /* Crossed @start after having wrapped: iteration is done. */
                if (wrap && n < start && next >= start)
                        return nr_cpumask_bits;

                if (next < nr_cpumask_bits)
                        return next;

                /* Ran off the end of the mask: restart from the beginning. */
                wrap = true;
                n = -1;
        }
}
EXPORT_SYMBOL(cpumask_next_wrap);

/* These are not inline because of header tangles. */
#ifdef CONFIG_CPUMASK_OFFSTACK
/**
 * alloc_cpumask_var_node - allocate a struct cpumask on a given node
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags
 * @node: node to allocate the cpumask on, as passed to kmalloc_node()
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop returning a constant 1 (in <linux/cpumask.h>)
 * Returns TRUE if memory allocation succeeded, FALSE otherwise.
 *
 * In addition, mask will be NULL if this fails.  Note that gcc is
 * usually smart enough to know that mask can never be NULL if
 * CONFIG_CPUMASK_OFFSTACK=n, so does code elimination in that case
 * too.
 */
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        *mask = kmalloc_node(cpumask_size(), flags, node);
        if (*mask)
                return true;

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
        /* Make allocation failures loud in debug builds. */
        printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
        dump_stack();
#endif

        return false;
}
EXPORT_SYMBOL(alloc_cpumask_var_node);

/* Same as alloc_cpumask_var_node(), with __GFP_ZERO added to @flags. */
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        gfp_t zeroing = flags | __GFP_ZERO;

        return alloc_cpumask_var_node(mask, zeroing, node);
}
EXPORT_SYMBOL(zalloc_cpumask_var_node);

/**
 * alloc_cpumask_var - allocate a struct cpumask
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop returning a constant 1 (in <linux/cpumask.h>).
 *
 * Equivalent to alloc_cpumask_var_node() with NUMA_NO_NODE as the node;
 * see alloc_cpumask_var_node for the return value and failure behaviour.
 */
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(alloc_cpumask_var);

/* Same as alloc_cpumask_var(), with __GFP_ZERO added to @flags. */
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        gfp_t zeroing = flags | __GFP_ZERO;

        return alloc_cpumask_var(mask, zeroing);
}
EXPORT_SYMBOL(zalloc_cpumask_var);

/**
 * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena.
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop (in <linux/cpumask.h>).
 * Either returns an allocated (zero-filled) cpumask, or causes the
 * system to panic.
 */
void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
        *mask = memblock_alloc(cpumask_size(), SMP_CACHE_BYTES);
        if (*mask)
                return;

        panic("%s: Failed to allocate %u bytes\n", __func__,
              cpumask_size());
}

/**
 * free_cpumask_var - frees memory allocated for a struct cpumask.
 * @mask: cpumask to free
 *
 * The mask is released with kfree().
 * This is safe on a NULL mask.
 */
void free_cpumask_var(cpumask_var_t mask)
{
        kfree(mask);
}
EXPORT_SYMBOL(free_cpumask_var);

/**
 * free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var
 * @mask: cpumask to free
 *
 * Hands cpumask_size() bytes at the physical address of @mask back via
 * memblock_free_early().
 */
void __init free_bootmem_cpumask_var(cpumask_var_t mask)
{
        phys_addr_t base = __pa(mask);

        memblock_free_early(base, cpumask_size());
}
#endif

/**
 * cpumask_local_spread - select the i'th cpu with local numa cpu's first
 * @i: index number
 * @node: local numa_node
 *
 * This function selects an online CPU according to a numa aware policy;
 * local cpus are returned first, followed by non-local ones, then it
 * wraps around.
 *
 * It's not very efficient, but useful for setup.
 */
unsigned int cpumask_local_spread(unsigned int i, int node)
{
        int cpu;

        /* Wrap: we always want a cpu. */
        i %= num_online_cpus();

        if (node != NUMA_NO_NODE) {
                /* First pass: online cpus that belong to @node. */
                for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask) {
                        if (i == 0)
                                return cpu;
                        i--;
                }

                /* Second pass: the remaining online cpus. */
                for_each_cpu(cpu, cpu_online_mask) {
                        /* Skip NUMA nodes, done above. */
                        if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
                                continue;

                        if (i == 0)
                                return cpu;
                        i--;
                }
        } else {
                /* No node preference: plain walk over the online cpus. */
                for_each_cpu(cpu, cpu_online_mask) {
                        if (i == 0)
                                return cpu;
                        i--;
                }
        }

        /* @i was reduced modulo the online count, so a cpu must exist. */
        BUG();
}
EXPORT_SYMBOL(cpumask_local_spread);

static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);

/**
 * cpumask_any_and_distribute - return an arbitrary cpu within src1p & src2p
 * @src1p: first cpumask
 * @src2p: second cpumask
 *
 * Iterated calls using the same @src1p and @src2p are distributed within
 * their intersection: the last selection is remembered in a per-cpu variable
 * and the next search starts after it.
 *
 * Returns >= nr_cpu_ids if the intersection is empty.
 */
int cpumask_any_and_distribute(const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        int last, pick;

        /* NOTE: our first selection will skip 0. */
        last = __this_cpu_read(distribute_cpu_mask_prev);

        pick = cpumask_next_and(last, src1p, src2p);
        if (pick >= nr_cpu_ids) {
                /* Nothing after the last selection: start over from the front. */
                pick = cpumask_first_and(src1p, src2p);
        }

        /* Only a valid cpu is remembered for the next call. */
        if (pick < nr_cpu_ids)
                __this_cpu_write(distribute_cpu_mask_prev, pick);

        return pick;
}
EXPORT_SYMBOL(cpumask_any_and_distribute);



















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC_H
#define _ASM_X86_ATOMIC_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

/*
 * Atomic operations that C can't guarantee us.  Useful for
 * resource counting etc..
 */

/**
 * arch_atomic_read - read atomic variable
 * @v: pointer of type atomic_t
 *
 * Atomically reads the value of @v: the counter is loaded with
 * __READ_ONCE() and returned.
 */
static __always_inline int arch_atomic_read(const atomic_t *v)
{
        /*
         * Note for KASAN: READ_ONCE_NOCHECK() is deliberately not used here;
         * it is a non-inlined function that increases binary size and stack
         * usage.
         */
        int val = __READ_ONCE(v->counter);

        return val;
}

/**
 * arch_atomic_set - set atomic variable
 * @v: pointer of type atomic_t
 * @i: required value
 *
 * Atomically sets the value of @v to @i by storing @i into the counter
 * with __WRITE_ONCE().
 */
static __always_inline void arch_atomic_set(atomic_t *v, int i)
{
        __WRITE_ONCE(v->counter, i);
}

/**
 * arch_atomic_add - add integer to atomic variable
 * @i: integer value to add
 * @v: pointer of type atomic_t
 *
 * Atomically adds @i to @v.  No value is returned.
 */
static __always_inline void arch_atomic_add(int i, atomic_t *v)
{
        /*
         * "+m" makes the counter a read-write memory operand, "ir" lets @i
         * be an immediate or a register, and the "memory" clobber tells the
         * compiler that memory may have changed.
         */
        asm volatile(LOCK_PREFIX "addl %1,%0"
                     : "+m" (v->counter)
                     : "ir" (i) : "memory");
}

/**
 * arch_atomic_sub - subtract integer from atomic variable
 * @i: integer value to subtract
 * @v: pointer of type atomic_t
 *
 * Atomically subtracts @i from @v.  No value is returned.
 */
static __always_inline void arch_atomic_sub(int i, atomic_t *v)
{
        /*
         * Same operand layout as arch_atomic_add(): the counter is a
         * read-write memory operand, @i an immediate or register input.
         */
        asm volatile(LOCK_PREFIX "subl %1,%0"
                     : "+m" (v->counter)
                     : "ir" (i) : "memory");
}

/**
 * arch_atomic_sub_and_test - subtract value from variable and test result
 * @i: integer value to subtract
 * @v: pointer of type atomic_t
 *
 * Atomically subtracts @i from @v and returns
 * true if the result is zero, or false for all
 * other cases.
 */
static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
{
        /*
         * NOTE(review): GEN_BINARY_RMWcc() comes from asm/rmwcc.h; the "e"
         * argument presumably selects the "equal" (zero) condition - confirm.
         */
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
}
#define arch_atomic_sub_and_test arch_atomic_sub_and_test

/**
 * arch_atomic_inc - increment atomic variable
 * @v: pointer of type atomic_t
 *
 * Atomically increments @v by 1.  No value is returned.
 */
static __always_inline void arch_atomic_inc(atomic_t *v)
{
        /* The counter is the only operand: a read-write memory operand. */
        asm volatile(LOCK_PREFIX "incl %0"
                     : "+m" (v->counter) :: "memory");
}
#define arch_atomic_inc arch_atomic_inc

/**
 * arch_atomic_dec - decrement atomic variable
 * @v: pointer of type atomic_t
 *
 * Atomically decrements @v by 1.  No value is returned.
 */
static __always_inline void arch_atomic_dec(atomic_t *v)
{
        /* The counter is the only operand: a read-write memory operand. */
        asm volatile(LOCK_PREFIX "decl %0"
                     : "+m" (v->counter) :: "memory");
}
#define arch_atomic_dec arch_atomic_dec

/**
 * arch_atomic_dec_and_test - decrement and test
 * @v: pointer of type atomic_t
 *
 * Atomically decrements @v by 1 and
 * returns true if the result is 0, or false for all other
 * cases.
 */
static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
{
        /*
         * NOTE(review): GEN_UNARY_RMWcc() comes from asm/rmwcc.h; the "e"
         * argument presumably selects the "equal" (zero) condition - confirm.
         */
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
}
#define arch_atomic_dec_and_test arch_atomic_dec_and_test

/**
 * arch_atomic_inc_and_test - increment and test
 * @v: pointer of type atomic_t
 *
 * Atomically increments @v by 1
 * and returns true if the result is zero, or false for all
 * other cases.
 */
static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
{
        /*
         * NOTE(review): GEN_UNARY_RMWcc() comes from asm/rmwcc.h; the "e"
         * argument presumably selects the "equal" (zero) condition - confirm.
         */
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
}
#define arch_atomic_inc_and_test arch_atomic_inc_and_test

/**
 * arch_atomic_add_negative - add and test if negative
 * @i: integer value to add
 * @v: pointer of type atomic_t
 *
 * Atomically adds @i to @v and returns true
 * if the result is negative, or false when
 * result is greater than or equal to zero.
 */
static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
{
        /*
         * NOTE(review): GEN_BINARY_RMWcc() comes from asm/rmwcc.h; the "s"
         * argument presumably selects the "sign" condition - confirm.
         */
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
}
#define arch_atomic_add_negative arch_atomic_add_negative

/**
 * arch_atomic_add_return - add integer and return
 * @i: integer value to add
 * @v: pointer of type atomic_t
 *
 * Atomically adds @i to @v and returns @i + @v, i.e. the value that
 * xadd() yields for the counter with @i added on top.
 */
static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
{
        int prev = xadd(&v->counter, i);

        return prev + i;
}
#define arch_atomic_add_return arch_atomic_add_return

/**
 * arch_atomic_sub_return - subtract integer and return
 * @i: integer value to subtract
 * @v: pointer of type atomic_t
 *
 * Atomically subtracts @i from @v and returns @v - @i.  Implemented as
 * arch_atomic_add_return() with the negated value.
 */
static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
{
        int negated = -i;

        return arch_atomic_add_return(negated, v);
}
#define arch_atomic_sub_return arch_atomic_sub_return

/*
 * arch_atomic_fetch_add - add @i to @v and return what xadd() yields for the
 * counter (no further adjustment, unlike arch_atomic_add_return()).
 */
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
        int prev = xadd(&v->counter, i);

        return prev;
}
#define arch_atomic_fetch_add arch_atomic_fetch_add

/*
 * arch_atomic_fetch_sub - subtract @i from @v by xadd()ing the negated value
 * and return what xadd() yields for the counter.
 */
static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
{
        int negated = -i;

        return xadd(&v->counter, negated);
}
#define arch_atomic_fetch_sub arch_atomic_fetch_sub

/*
 * arch_atomic_cmpxchg - forward to arch_cmpxchg() on the counter of @v with
 * @old as the expected value and @new as the replacement.
 * NOTE(review): the return value is whatever arch_cmpxchg() yields,
 * presumably the value found in the counter - confirm.
 */
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int found = arch_cmpxchg(&v->counter, old, new);

        return found;
}
#define arch_atomic_cmpxchg arch_atomic_cmpxchg

/*
 * arch_atomic_try_cmpxchg - forward to try_cmpxchg() on the counter of @v,
 * with *@old as the expected value and @new as the replacement.
 * NOTE(review): presumably returns true when the exchange happened and
 * refreshes *@old otherwise - confirm against try_cmpxchg().
 */
static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        bool swapped = try_cmpxchg(&v->counter, old, new);

        return swapped;
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg

/*
 * arch_atomic_xchg - forward to arch_xchg() to store @new in the counter of
 * @v and return what arch_xchg() yields.
 */
static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
{
        int prev = arch_xchg(&v->counter, new);

        return prev;
}
#define arch_atomic_xchg arch_atomic_xchg

/* arch_atomic_and - bitwise AND @i into the counter of @v; returns nothing. */
static __always_inline void arch_atomic_and(int i, atomic_t *v)
{
        /* Counter is a read-write memory operand; @i is an immediate or register. */
        asm volatile(LOCK_PREFIX "andl %1,%0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * arch_atomic_fetch_and - AND @i into @v with a compare-and-exchange loop and
 * return the value the successful exchange was based on.
 */
static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
{
        int seen = arch_atomic_read(v);

        /*
         * Retry until the exchange succeeds.
         * NOTE(review): relies on arch_atomic_try_cmpxchg() refreshing 'seen'
         * on failure - confirm.
         */
        while (!arch_atomic_try_cmpxchg(v, &seen, seen & i))
                ;

        return seen;
}
#define arch_atomic_fetch_and arch_atomic_fetch_and

/* arch_atomic_or - bitwise OR @i into the counter of @v; returns nothing. */
static __always_inline void arch_atomic_or(int i, atomic_t *v)
{
        /* Counter is a read-write memory operand; @i is an immediate or register. */
        asm volatile(LOCK_PREFIX "orl %1,%0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * arch_atomic_fetch_or - OR @i into @v with a compare-and-exchange loop and
 * return the value the successful exchange was based on.
 */
static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
{
        int seen = arch_atomic_read(v);

        /*
         * Retry until the exchange succeeds.
         * NOTE(review): relies on arch_atomic_try_cmpxchg() refreshing 'seen'
         * on failure - confirm.
         */
        while (!arch_atomic_try_cmpxchg(v, &seen, seen | i))
                ;

        return seen;
}
#define arch_atomic_fetch_or arch_atomic_fetch_or

/* arch_atomic_xor - bitwise XOR @i into the counter of @v; returns nothing. */
static __always_inline void arch_atomic_xor(int i, atomic_t *v)
{
        /* Counter is a read-write memory operand; @i is an immediate or register. */
        asm volatile(LOCK_PREFIX "xorl %1,%0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * arch_atomic_fetch_xor - XOR @i into @v with a compare-and-exchange loop and
 * return the value the successful exchange was based on.
 */
static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
{
        int seen = arch_atomic_read(v);

        /*
         * Retry until the exchange succeeds.
         * NOTE(review): relies on arch_atomic_try_cmpxchg() refreshing 'seen'
         * on failure - confirm.
         */
        while (!arch_atomic_try_cmpxchg(v, &seen, seen ^ i))
                ;

        return seen;
}
#define arch_atomic_fetch_xor arch_atomic_fetch_xor

#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
# include <asm/atomic64_64.h>
#endif

#define ARCH_ATOMIC

#endif /* _ASM_X86_ATOMIC_H */
































































































































































































    1 








































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__

#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/minmax.h>
#include <linux/sched.h>
#include <linux/thread_info.h>

#include <asm/uaccess.h>

#ifdef CONFIG_SET_FS
/*
 * Wire the uaccess routines up for actual userspace access, overriding any
 * set_fs(KERNEL_DS) that may still be lingering around.  The segment that
 * was in effect is returned so that force_uaccess_end() can restore it.
 */
static inline mm_segment_t force_uaccess_begin(void)
{
        mm_segment_t saved_fs;

        saved_fs = get_fs();
        set_fs(USER_DS);

        return saved_fs;
}

/* Undo force_uaccess_begin(): reinstate @oldfs, the segment it returned. */
static inline void force_uaccess_end(mm_segment_t oldfs)
{
        set_fs(oldfs);
}
#else /* CONFIG_SET_FS */
/*
 * Without CONFIG_SET_FS there is no segment to track; this empty type only
 * keeps the force_uaccess_begin()/force_uaccess_end() signatures the same.
 */
typedef struct {
        /* empty dummy */
} mm_segment_t;

#ifndef TASK_SIZE_MAX
#define TASK_SIZE_MAX                        TASK_SIZE
#endif

#define uaccess_kernel()                (false)
#define user_addr_max()                        (TASK_SIZE_MAX)

static inline mm_segment_t force_uaccess_begin(void)
{
        return (mm_segment_t) { };
}

/* !CONFIG_SET_FS variant: nothing was changed, so there is nothing to undo. */
static inline void force_uaccess_end(mm_segment_t oldfs)
{
}
#endif /* CONFIG_SET_FS */

/*
 * Architectures should provide two primitives (raw_copy_{to,from}_user())
 * and get rid of their private instances of copy_{to,from}_user() and
 * __copy_{to,from}_user{,_inatomic}().
 *
 * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
 * return the amount left to copy.  They should assume that access_ok() has
 * already been checked (and succeeded); they should *not* zero-pad anything.
 * No KASAN or object size checks either - those belong here.
 *
 * Both of these functions should attempt to copy size bytes starting at from
 * into the area starting at to.  They must not fetch or store anything
 * outside of those areas.  Return value must be between 0 (everything
 * copied successfully) and size (nothing copied).
 *
 * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting
 * at to must become equal to the bytes fetched from the corresponding area
 * starting at from.  All data past to + size - N must be left unmodified.
 *
 * If copying succeeds, the return value must be 0.  If some data cannot be
 * fetched, it is permitted to copy less than had been fetched; the only
 * hard requirement is that not storing anything at all (i.e. returning size)
 * should happen only when nothing could be copied.  In other words, you don't
 * have to squeeze as much as possible - it is allowed, but not necessary.
 *
 * For raw_copy_from_user(), 'to' always points to kernel memory and no faults
 * on store should happen.  Interpretation of 'from' is affected by set_fs().
 * For raw_copy_to_user() it's the other way round.
 *
 * Both can be inlined - it's up to each architecture whether it wants to
 * bother with that.  They should not be used directly; they are used to
 * implement the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user()
 * and __copy_{to,from}_user_inatomic()) that are used instead.  Out of those,
 * the __... ones are inlined.  Plain
 * copy_{to,from}_user() might or might not be inlined.  If you want them
 * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER.
 *
 * NOTE: only copy_from_user() zero-pads the destination in case of short copy.
 * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything
 * at all; their callers absolutely must check the return value.
 *
 * Biarch ones should also provide raw_copy_in_user() - similar to the above,
 * but both source and destination are __user pointers (affected by set_fs()
 * as usual) and both source and destination can trigger faults.
 */

/*
 * Copy @n bytes from user space without access_ok(), might_fault() or
 * zero-padding.  Returns what raw_copy_from_user() returns, i.e. the amount
 * left to copy.
 */
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
        unsigned long left;

        instrument_copy_from_user(to, from, n);
        check_object_size(to, n, false);
        left = raw_copy_from_user(to, from, n);

        return left;
}

/*
 * Like __copy_from_user_inatomic(), but annotated with might_fault() and
 * subject to usercopy fault injection, in which case @n is returned and
 * nothing is copied.  No access_ok() check and no zero-padding.
 */
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
        might_fault();
        if (!should_fail_usercopy()) {
                instrument_copy_from_user(to, from, n);
                check_object_size(to, n, false);
                return raw_copy_from_user(to, from, n);
        }

        /* Injected failure: report the whole request as uncopied. */
        return n;
}

/**
 * __copy_to_user_inatomic - Copy a block of data into user space, with less checking.
 * @to:   Destination address, in user space.
 * @from: Source address, in kernel space.
 * @n:    Number of bytes to copy.
 *
 * Context: User context only.
 *
 * Copy data from kernel space to user space.  Caller must check
 * the specified block with access_ok() before calling this function.
 * The caller should also pin the user space address so that the copy
 * does not take a page fault and sleep.
 *
 * Returns @n when a usercopy failure is injected, otherwise the result of
 * raw_copy_to_user() (the amount left to copy).
 */
static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
        if (!should_fail_usercopy()) {
                instrument_copy_to_user(to, from, n);
                check_object_size(from, n, true);
                return raw_copy_to_user(to, from, n);
        }

        /* Injected failure: report the whole request as uncopied. */
        return n;
}

/*
 * Like __copy_to_user_inatomic(), but annotated with might_fault().  Returns
 * @n when a usercopy failure is injected, otherwise the result of
 * raw_copy_to_user().  No access_ok() check is done here.
 */
static __always_inline __must_check unsigned long
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();
        if (!should_fail_usercopy()) {
                instrument_copy_to_user(to, from, n);
                check_object_size(from, n, true);
                return raw_copy_to_user(to, from, n);
        }

        /* Injected failure: report the whole request as uncopied. */
        return n;
}

#ifdef INLINE_COPY_FROM_USER
/*
 * Inline flavour of _copy_from_user(), used when the architecture defines
 * INLINE_COPY_FROM_USER.  Checks access_ok(), copies, and zero-pads the part
 * of @to that was not filled.  Returns the number of bytes not copied.
 */
static inline __must_check unsigned long
_copy_from_user(void *to, const void __user *from, unsigned long n)
{
        /* Until a copy has actually been attempted, everything is uncopied. */
        unsigned long left = n;

        might_fault();
        if (!should_fail_usercopy() && likely(access_ok(from, n))) {
                instrument_copy_from_user(to, from, n);
                left = raw_copy_from_user(to, from, n);
        }

        /* Short copy: clear the tail of the destination. */
        if (unlikely(left != 0))
                memset(to + (n - left), 0, left);

        return left;
}
#else
extern __must_check unsigned long
_copy_from_user(void *, const void __user *, unsigned long);
#endif

#ifdef INLINE_COPY_TO_USER
/*
 * Inline flavour of _copy_to_user(), used when the architecture defines
 * INLINE_COPY_TO_USER.  Returns the number of bytes not copied; that is @n
 * when a failure is injected or access_ok() rejects the destination.
 */
static inline __must_check unsigned long
_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();
        if (should_fail_usercopy())
                return n;
        if (!access_ok(to, n))
                return n;

        instrument_copy_to_user(to, from, n);
        return raw_copy_to_user(to, from, n);
}
#else
extern __must_check unsigned long
_copy_to_user(void __user *, const void *, unsigned long);
#endif

/*
 * Copy @n bytes from user space to @to.  The copy is only attempted when
 * check_copy_size() accepts the destination; otherwise @n is returned
 * unchanged.  Returns the number of bytes not copied.
 */
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
        if (unlikely(!check_copy_size(to, n, false)))
                return n;

        return _copy_from_user(to, from, n);
}

/*
 * Copy @n bytes from @from to user space.  The copy is only attempted when
 * check_copy_size() accepts the source; otherwise @n is returned unchanged.
 * Returns the number of bytes not copied.
 */
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
        if (unlikely(!check_copy_size(from, n, true)))
                return n;

        return _copy_to_user(to, from, n);
}
#ifdef CONFIG_COMPAT
/*
 * Copy @n bytes between two user space areas.  Both ranges must pass
 * access_ok(); if either fails, nothing is copied and @n is returned.
 * Otherwise returns the result of raw_copy_in_user().
 */
static __always_inline unsigned long __must_check
copy_in_user(void __user *to, const void __user *from, unsigned long n)
{
        might_fault();
        if (!access_ok(to, n) || !access_ok(from, n))
                return n;

        return raw_copy_in_user(to, from, n);
}
#endif

#ifndef copy_mc_to_kernel
/*
 * Without arch opt-in this generic copy_mc_to_kernel() will not handle
 * #MC (or arch equivalent) during source read.
 *
 * It is a plain memcpy() of @cnt bytes from @src to @dst and always
 * returns 0.
 */
static inline unsigned long __must_check
copy_mc_to_kernel(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
        return 0;
}
#endif

/* Bump the current task's pagefault_disabled counter; no barrier is issued here. */
static __always_inline void pagefault_disabled_inc(void)
{
        current->pagefault_disabled++;
}

/* Drop the current task's pagefault_disabled counter; no barrier is issued here. */
static __always_inline void pagefault_disabled_dec(void)
{
        current->pagefault_disabled--;
}

/*
 * These routines enable/disable the pagefault handler. If disabled, it will
 * not take any locks and go straight to the fixup table.
 *
 * User access methods will not sleep when called from a pagefault_disabled()
 * environment.
 */
/*
 * Disable the pagefault handler for the current task: increment the
 * counter first, then place a compiler barrier after the store.
 */
static inline void pagefault_disable(void)
{
        pagefault_disabled_inc();
        /*
         * make sure to have issued the store before a pagefault
         * can hit.
         */
        barrier();
}

/*
 * Counterpart of pagefault_disable(): compiler barrier first, then
 * decrement the current task's counter.
 */
static inline void pagefault_enable(void)
{
        /*
         * make sure to issue those last loads/stores before enabling
         * the pagefault handler again.
         */
        barrier();
        pagefault_disabled_dec();
}

/*
 * Report whether the pagefault handler is disabled for the current task,
 * i.e. whether its pagefault_disabled counter is non-zero.  If so, user
 * access methods will not sleep.
 */
static inline bool pagefault_disabled(void)
{
        return !!current->pagefault_disabled;
}

/*
 * The pagefault handler is in general disabled by pagefault_disable() or
 * when in irq context (via in_atomic()).
 *
 * This macro should only be used by the fault handlers. Other users should
 * stick to pagefault_disabled().
 * Please NEVER use preempt_disable() to disable the fault handler. With
 * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
 * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
 *
 * Evaluates to true when either pagefault_disabled() or in_atomic() is true.
 */
#define faulthandler_disabled() (pagefault_disabled() || in_atomic())

#ifndef ARCH_HAS_NOCACHE_UACCESS

/*
 * Fallback used when the architecture does not define
 * ARCH_HAS_NOCACHE_UACCESS: simply forwards to __copy_from_user_inatomic().
 */
static inline __must_check unsigned long
__copy_from_user_inatomic_nocache(void *to, const void __user *from,
                                  unsigned long n)
{
        unsigned long left = __copy_from_user_inatomic(to, from, n);

        return left;
}

#endif                /* ARCH_HAS_NOCACHE_UACCESS */

extern __must_check int check_zeroed_user(const void __user *from, size_t size);

/**
 * copy_struct_from_user: copy a struct from userspace
 * @dst:   Destination address, in kernel space. This buffer must be @ksize
 *         bytes long.
 * @ksize: Size of @dst struct.
 * @src:   Source address, in userspace.
 * @usize: (Alleged) size of @src struct.
 *
 * Copies a struct from userspace to kernel space, in a way that guarantees
 * backwards-compatibility for struct syscall arguments (as long as future
 * struct extensions are made such that all new fields are *appended* to the
 * old struct, and zeroed-out new fields have the same meaning as the old
 * struct).
 *
 * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
 * The recommended usage is something like the following:
 *
 *   SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
 *   {
 *      int err;
 *      struct foo karg = {};
 *
 *      if (usize > PAGE_SIZE)
 *        return -E2BIG;
 *      if (usize < FOO_SIZE_VER0)
 *        return -EINVAL;
 *
 *      err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
 *      if (err)
 *        return err;
 *
 *      // ...
 *   }
 *
 * There are three cases to consider:
 *  * If @usize == @ksize, then it's copied verbatim.
 *  * If @usize < @ksize, then the userspace has passed an old struct to a
 *    newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
 *    are to be zero-filled.
 *  * If @usize > @ksize, then the userspace has passed a new struct to an
 *    older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
 *    are checked to ensure they are zeroed, otherwise -E2BIG is returned.
 *
 * Returns (in all cases, some data may have been copied):
 *  * -E2BIG:  (@usize > @ksize) and there are non-zero trailing bytes in @src.
 *  * -EFAULT: access to userspace failed.
 */
static __always_inline __must_check int
copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
                      size_t usize)
{
        /* Bytes both sides know about, and the excess on the larger side. */
        size_t common = min(ksize, usize);
        size_t excess = max(ksize, usize) - common;

        /* Double check if ksize is larger than a known object size. */
        if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1)))
                return -E2BIG;

        if (usize > ksize) {
                /* Trailing user bytes unknown to the kernel must be zero. */
                int zeroed = check_zeroed_user(src + common, excess);

                if (zeroed < 0)
                        return zeroed;
                if (zeroed == 0)
                        return -E2BIG;
        } else if (usize < ksize) {
                /* Zero-fill the kernel fields the user struct did not cover. */
                memset(dst + common, 0, excess);
        }

        /* Copy the interoperable parts of the struct. */
        if (copy_from_user(dst, src, common))
                return -EFAULT;

        return 0;
}

bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size);

long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size);

long copy_from_user_nofault(void *dst, const void __user *src, size_t size);
long notrace copy_to_user_nofault(void __user *dst, const void *src,
                size_t size);

long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr,
                long count);

long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
                long count);
long strnlen_user_nofault(const void __user *unsafe_addr, long count);

/**
 * get_kernel_nofault(): safely attempt to read from a location
 * @val: read into this variable
 * @ptr: address to read from
 *
 * @ptr is first assigned to a pointer to const typeof(@val), then
 * sizeof(@val) bytes are read through copy_from_kernel_nofault().
 *
 * Returns 0 on success, or -EFAULT.
 */
#define get_kernel_nofault(val, ptr) ({                                \
        const typeof(val) *__gk_ptr = (ptr);                        \
        copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\
})

#ifndef user_access_begin
/*
 * Generic fallbacks, used when the architecture does not provide
 * user_access_begin(): opening a section is just access_ok(), closing it is
 * a no-op, and the unsafe_*() accessors jump to the error label when the
 * wrapped __get_user()/__put_user()/__copy_to_user() reports failure.
 */
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
/* Nothing to save or restore in the generic case. */
static inline unsigned long user_access_save(void) { return 0UL; }
static inline void user_access_restore(unsigned long flags) { }
#endif
/* Without a dedicated write-only variant, fall back to the read-write one. */
#ifndef user_write_access_begin
#define user_write_access_begin user_access_begin
#define user_write_access_end user_access_end
#endif
/* Without a dedicated read-only variant, fall back to the read-write one. */
#ifndef user_read_access_begin
#define user_read_access_begin user_access_begin
#define user_read_access_end user_access_end
#endif

#ifdef CONFIG_HARDENED_USERCOPY
void usercopy_warn(const char *name, const char *detail, bool to_user,
                   unsigned long offset, unsigned long len);
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len);
#endif

#endif                /* __LINUX_UACCESS_H__ */






































































































    1 














































































































































































    1 














































































































































































    1 

































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SEQLOCK_H
#define __LINUX_SEQLOCK_H

/*
 * seqcount_t / seqlock_t - a reader-writer consistency mechanism with
 * lockless readers (read-only retry loops), and no writer starvation.
 *
 * See Documentation/locking/seqlock.rst
 *
 * Copyrights:
 * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli
 * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH
 */

#include <linux/compiler.h>
#include <linux/kcsan-checks.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>

#include <asm/processor.h>

/*
 * The seqlock seqcount_t interface does not prescribe a precise sequence of
 * read begin/retry/end. For readers, typically there is a call to
 * read_seqcount_begin() and read_seqcount_retry(), however, there are more
 * esoteric cases which do not follow this pattern.
 *
 * As a consequence, we take the following best-effort approach for raw usage
 * via seqcount_t under KCSAN: upon beginning a seq-reader critical section,
 * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as
 * atomics; if there is a matching read_seqcount_retry() call, no following
 * memory operations are considered atomic. Usage of the seqlock_t interface
 * is not affected.
 */
/* Access count handed to kcsan_atomic_next() when a seqcount_t read section begins. */
#define KCSAN_SEQLOCK_REGION_MAX 1000

/*
 * Sequence counters (seqcount_t)
 *
 * This is the raw counting mechanism, without any writer protection.
 *
 * Write side critical sections must be serialized and non-preemptible.
 *
 * If readers can be invoked from hardirq or softirq contexts,
 * interrupts or bottom halves must also be respectively disabled before
 * entering the write section.
 *
 * This mechanism can't be used if the protected data contains pointers,
 * as the writer can invalidate a pointer that a reader is following.
 *
 * If the write serialization mechanism is one of the common kernel
 * locking primitives, use a sequence counter with associated lock
 * (seqcount_LOCKNAME_t) instead.
 *
 * If it's desired to automatically handle the sequence counter writer
 * serialization and non-preemptibility requirements, use a sequential
 * lock (seqlock_t) instead.
 *
 * See Documentation/locking/seqlock.rst
 */
typedef struct seqcount {
        /* Raw counter; the read-begin helpers in this file spin while it is odd. */
        unsigned int sequence;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Lockdep map, present only when CONFIG_DEBUG_LOCK_ALLOC is set. */
        struct lockdep_map dep_map;
#endif
} seqcount_t;

/*
 * __seqcount_init() - common runtime initialisation of a seqcount_t
 * @s:    counter to initialise
 * @name: name passed on to lockdep_init_map(); the non-debug
 *        seqcount_init() variant passes NULL
 * @key:  lock class key passed on to lockdep_init_map(); the non-debug
 *        seqcount_init() variant passes NULL
 *
 * Hands the dep_map to lockdep_init_map() and then resets the sequence
 * number to zero.
 */
static inline void __seqcount_init(seqcount_t *s, const char *name,
                                          struct lock_class_key *key)
{
        /*
         * Make sure we are not reinitializing a held lock:
         */
        lockdep_init_map(&s->dep_map, name, key, 0);
        s->sequence = 0;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/* Designated-initializer fragment: names the dep_map after the stringified @lockname. */
# define SEQCOUNT_DEP_MAP_INIT(lockname)                                \
                .dep_map = { .name = #lockname }

/**
 * seqcount_init() - runtime initializer for seqcount_t
 * @s: Pointer to the seqcount_t instance
 *
 * Each expansion declares its own static lock_class_key and passes it,
 * together with the stringified @s expression as the name, to
 * __seqcount_init().
 */
# define seqcount_init(s)                                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                __seqcount_init((s), #s, &__key);                        \
        } while (0)

/*
 * Report a seqcount_t read access to lockdep: a read acquire that is
 * released again immediately, both done with local interrupts disabled.
 */
static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
{
        unsigned long irqflags;
        /* The cast drops const before &sc->dep_map goes to the lockdep hooks. */
        seqcount_t *sc = (seqcount_t *)s;

        local_irq_save(irqflags);
        seqcount_acquire_read(&sc->dep_map, 0, 0, _RET_IP_);
        seqcount_release(&sc->dep_map, _RET_IP_);
        local_irq_restore(irqflags);
}

#else
/*
 * !CONFIG_DEBUG_LOCK_ALLOC variants: no dep_map initializer, no per-call-site
 * lock class key, and reader access reporting expands to nothing.
 */
# define SEQCOUNT_DEP_MAP_INIT(lockname)
# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
# define seqcount_lockdep_reader_access(x)
#endif

/**
 * SEQCNT_ZERO() - static initializer for seqcount_t
 * @name: Name of the seqcount_t instance
 *
 * Expands to a brace-enclosed initializer that sets .sequence to 0 and
 * appends SEQCOUNT_DEP_MAP_INIT(@name), which is empty unless
 * CONFIG_DEBUG_LOCK_ALLOC is set.
 */
#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) }

/*
 * Sequence counters with associated locks (seqcount_LOCKNAME_t)
 *
 * A sequence counter which associates the lock used for writer
 * serialization at initialization time. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * For associated locks which do not implicitly disable preemption,
 * preemption protection is enforced in the write side function.
 *
 * Lockdep is never used in any of the raw write variants.
 *
 * See Documentation/locking/seqlock.rst
 */

/*
 * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot
 * disable preemption. It can lead to higher latencies, and the write side
 * sections will not be able to acquire locks which become sleeping locks
 * (e.g. spinlock_t).
 *
 * To remain preemptible while avoiding a possible livelock caused by the
 * reader preempting the writer, use a different technique: let the reader
 * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the
 * case, acquire then release the associated LOCKNAME writer serialization
 * lock. This will allow any possibly-preempted writer to make progress
 * until the end of its writer serialization lock critical section.
 *
 * This lock-unlock technique must be implemented for all of PREEMPT_RT
 * sleeping locks.  See Documentation/locking/locktypes.rst
 */
/*
 * __SEQ_LOCK(expr) keeps @expr only when lockdep or PREEMPT_RT is configured;
 * in every other configuration it expands to nothing.
 */
#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
#define __SEQ_LOCK(expr)        expr
#else
#define __SEQ_LOCK(expr)
#endif

/*
 * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
 * @seqcount:        The real sequence counter
 * @lock:        Pointer to the associated lock
 *
 * A plain sequence counter with external writer synchronization by
 * LOCKNAME @lock. The lock is associated to the sequence counter in the
 * static initializer or init function. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * LOCKNAME:        raw_spinlock, spinlock, rwlock, mutex, or ww_mutex.
 */

/*
 * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t
 * @s:                Pointer to the seqcount_LOCKNAME_t instance
 * @lock:        Pointer to the associated lock
 */

#define seqcount_LOCKNAME_init(s, _lock, lockname)                        \
        do {                                                                \
                seqcount_##lockname##_t *____s = (s);                        \
                seqcount_init(&____s->seqcount);                        \
                __SEQ_LOCK(____s->lock = (_lock));                        \
        } while (0)

/*
 * Per-lock-type runtime initializers for seqcount_LOCKNAME_t.
 * @s:    Pointer to the seqcount_LOCKNAME_t instance
 * @lock: Pointer to the associated lock
 *
 * Each one expands to the single do { } while (0) statement produced by
 * seqcount_LOCKNAME_init(). None of them carries a trailing semicolon: the
 * caller supplies it, which keeps the macros usable as the sole statement of
 * an if/else branch.
 */
#define seqcount_raw_spinlock_init(s, lock)        seqcount_LOCKNAME_init(s, lock, raw_spinlock)
#define seqcount_spinlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, spinlock)
#define seqcount_rwlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, rwlock)
#define seqcount_mutex_init(s, lock)                seqcount_LOCKNAME_init(s, lock, mutex)
#define seqcount_ww_mutex_init(s, lock)                seqcount_LOCKNAME_init(s, lock, ww_mutex)

/*
 * SEQCOUNT_LOCKNAME()        - Instantiate seqcount_LOCKNAME_t and helpers
 * seqprop_LOCKNAME_*()        - Property accessors for seqcount_LOCKNAME_t
 *
 * @lockname:                "LOCKNAME" part of seqcount_LOCKNAME_t
 * @locktype:                LOCKNAME canonical C data type
 * @preemptible:        preemptibility of above locktype
 * @lockmember:                argument for lockdep_assert_held()
 * @lockbase:                associated lock release function (prefix only)
 * @lock_acquire:        associated lock acquisition function (full call)
 */
#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \
typedef struct seqcount_##lockname {                                        \
        seqcount_t                seqcount;                                \
        __SEQ_LOCK(locktype        *lock);                                        \
} seqcount_##lockname##_t;                                                \
                                                                        \
static __always_inline seqcount_t *                                        \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)                        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline unsigned                                                \
__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)        \
{                                                                        \
        unsigned seq = READ_ONCE(s->seqcount.sequence);                        \
                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return seq;                                                \
                                                                        \
        if (preemptible && unlikely(seq & 1)) {                                \
                __SEQ_LOCK(lock_acquire);                                \
                __SEQ_LOCK(lockbase##_unlock(s->lock));                        \
                                                                        \
                /*                                                        \
                 * Re-read the sequence counter since the (possibly        \
                 * preempted) writer made progress.                        \
                 */                                                        \
                seq = READ_ONCE(s->seqcount.sequence);                        \
        }                                                                \
                                                                        \
        return seq;                                                        \
}                                                                        \
                                                                        \
static __always_inline bool                                                \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)        \
{                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return preemptible;                                        \
                                                                        \
        /* PREEMPT_RT relies on the above LOCK+UNLOCK */                \
        return false;                                                        \
}                                                                        \
                                                                        \
static __always_inline void                                                \
__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)                \
{                                                                        \
        __SEQ_LOCK(lockdep_assert_held(lockmember));                        \
}

/*
 * __seqprop() for seqcount_t
 */

/* ptr property for a plain seqcount_t: the counter itself. */
static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
{
        return s;
}

/* sequence property for a plain seqcount_t: one READ_ONCE() of the counter. */
static inline unsigned __seqprop_sequence(const seqcount_t *s)
{
        unsigned seq = READ_ONCE(s->sequence);

        return seq;
}

/*
 * preemptible property for a plain seqcount_t: always false, so the write
 * side macros never toggle preemption on its behalf.
 */
static inline bool __seqprop_preemptible(const seqcount_t *s)
{
        return false;
}

/*
 * assert property for a plain seqcount_t: there is no associated lock to
 * check, so assert that preemption is disabled instead.
 */
static inline void __seqprop_assert(const seqcount_t *s)
{
        lockdep_assert_preemption_disabled();
}

/* Preemptible argument for lock types that are preemptible only on PREEMPT_RT. */
#define __SEQ_RT        IS_ENABLED(CONFIG_PREEMPT_RT)

/*
 * Instantiate seqcount_LOCKNAME_t and its __seqprop_LOCKNAME_*() accessors
 * for every supported lock type. Columns: lockname, locktype, preemptible,
 * lockmember, lockbase, lock_acquire. ww_mutex passes &s->lock->base as the
 * lockmember argument for lockdep_assert_held().
 */
SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    s->lock,        raw_spin, raw_spin_lock(s->lock))
SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, s->lock,        spin,     spin_lock(s->lock))
SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, s->lock,        read,     read_lock(s->lock))
SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     s->lock,        mutex,    mutex_lock(s->lock))
SEQCOUNT_LOCKNAME(ww_mutex,     struct ww_mutex, true,     &s->lock->base, ww_mutex, ww_mutex_lock(s->lock, NULL))

/*
 * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
 * @name:        Name of the seqcount_LOCKNAME_t instance
 * @lock:        Pointer to the associated LOCKNAME
 */

/*
 * Shared body of the SEQCNT_<LOCKNAME>_ZERO() initializers: zero the embedded
 * seqcount_t and, only under __SEQ_LOCK(), record @assoc_lock.
 */
#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) {                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
        __SEQ_LOCK(.lock        = (assoc_lock))                                \
}

/* One static initializer per lock type; all expand to SEQCOUNT_LOCKNAME_ZERO(). */
#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RWLOCK_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_MUTEX_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_WW_MUTEX_ZERO(name, lock)         SEQCOUNT_LOCKNAME_ZERO(name, lock)

/* One _Generic association: seqcount_LOCKNAME_t -> __seqprop_LOCKNAME_<prop>(). */
#define __seqprop_case(s, lockname, prop)                                \
        seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s))

/*
 * Dispatch property @prop on the type @s points to: plain seqcount_t or one of
 * the seqcount_LOCKNAME_t variants. @s is cast to (void *) in every
 * association, so each accessor call is well-formed whichever one is selected.
 */
#define __seqprop(s, prop) _Generic(*(s),                                \
        seqcount_t:                __seqprop_##prop((void *)(s)),                \
        __seqprop_case((s),        raw_spinlock,        prop),                        \
        __seqprop_case((s),        spinlock,        prop),                        \
        __seqprop_case((s),        rwlock,                prop),                        \
        __seqprop_case((s),        mutex,                prop),                        \
        __seqprop_case((s),        ww_mutex,        prop))

/*
 * Type-generic property accessors built on __seqprop():
 *   ptr         - pointer to the underlying seqcount_t
 *   sequence    - current counter value
 *   preemptible - whether the write side macros must disable preemption
 *   assert      - lockdep check made at the start of a write section
 */
#define seqprop_ptr(s)                        __seqprop(s, ptr)
#define seqprop_sequence(s)                __seqprop(s, sequence)
#define seqprop_preemptible(s)                __seqprop(s, preemptible)
#define seqprop_assert(s)                __seqprop(s, assert)

/**
 * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided before actually loading any of the variables that are to be
 * protected in this critical section.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define __read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned __seq;                                                        \
                                                                        \
        while ((__seq = seqprop_sequence(s)) & 1)                        \
                cpu_relax();                                                \
                                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned _seq = __read_seqcount_begin(s);                        \
                                                                        \
        smp_rmb();                                                        \
        _seq;                                                                \
})

/**
 * read_seqcount_begin() - begin a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define read_seqcount_begin(s)                                                \
({                                                                        \
        seqcount_lockdep_reader_access(seqprop_ptr(s));                        \
        raw_read_seqcount_begin(s);                                        \
})

/**
 * raw_read_seqcount() - read the raw seqcount_t counter value
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_read_seqcount opens a read critical section of the given
 * seqcount_t, without any lockdep checking, and without checking or
 * masking the sequence counter LSB. Calling code is responsible for
 * handling that.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount(s)                                                \
({                                                                        \
        unsigned __seq = seqprop_sequence(s);                                \
                                                                        \
        smp_rmb();                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_seqcount_begin() - begin a seqcount_t read critical section w/o
 *                        lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_seqcount_begin opens a read critical section of the given
 * seqcount_t. Unlike read_seqcount_begin(), this function will not wait
 * for the count to stabilize. If a writer is active when it begins, it
 * will fail the read_seqcount_retry() at the end of the read critical
 * section instead of stabilizing at the beginning of it.
 *
 * Use this only in special kernel hot paths where the read section is
 * small and has a high probability of success through other external
 * means. It will save a single branching instruction.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_seqcount_begin(s)                                                \
({                                                                        \
        /*                                                                \
         * If the counter is odd, let read_seqcount_retry() fail        \
         * by decrementing the counter: clearing bit 0 turns an odd        \
         * value into the preceding even one and leaves an even        \
         * value unchanged.                                                \
         */                                                                \
        raw_read_seqcount(s) & ~1;                                        \
})

/**
 * __read_seqcount_retry() - end a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided after loading the variables that are protected in this critical
 * section and before calling this function.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: true if a read section retry is required, else false
 */
#define __read_seqcount_retry(s, start)                                        \
        do___read_seqcount_retry(seqprop_ptr(s), start)

/*
 * Helper behind __read_seqcount_retry(): call kcsan_atomic_next(0) so that
 * following accesses are no longer treated as atomic, then compare the
 * current counter value with @start. Returns non-zero when they differ.
 */
static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        unsigned now;

        kcsan_atomic_next(0);
        now = READ_ONCE(s->sequence);

        return unlikely(now != start);
}

/**
 * read_seqcount_retry() - end a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * read_seqcount_retry closes the read critical section of given
 * seqcount_t.  If the critical section was invalid, it must be ignored
 * (and typically retried).
 *
 * Return: true if a read section retry is required, else false
 */
#define read_seqcount_retry(s, start)                                        \
        do_read_seqcount_retry(seqprop_ptr(s), start)

/*
 * Helper behind read_seqcount_retry(): issue smp_rmb(), then do the
 * barrier-less comparison in do___read_seqcount_retry().
 */
static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        smp_rmb();
        return do___read_seqcount_retry(s, start);
}

/**
 * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 */
#define raw_write_seqcount_begin(s)                                        \
do {                                                                        \
        /* Disable preemption only if the seqprop asks for it. */        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_raw_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/*
 * Helper behind raw_write_seqcount_begin(): open a KCSAN nestable atomic
 * region, increment the counter, then issue smp_wmb().
 */
static inline void do_raw_write_seqcount_begin(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        smp_wmb();
}

/**
 * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 */
#define raw_write_seqcount_end(s)                                        \
do {                                                                        \
        do_raw_write_seqcount_end(seqprop_ptr(s));                        \
                                                                        \
        /* Mirrors the preempt_disable() in raw_write_seqcount_begin(). */ \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/*
 * Helper behind raw_write_seqcount_end(): issue smp_wmb(), increment the
 * counter, then close the KCSAN nestable atomic region.
 */
static inline void do_raw_write_seqcount_end(seqcount_t *s)
{
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_begin_nested() - start a seqcount_t write section with
 *                                 custom lockdep nesting level
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @subclass: lockdep nesting level
 *
 * See Documentation/locking/lockdep-design.rst
 */
#define write_seqcount_begin_nested(s, subclass)                        \
do {                                                                        \
        /* Lockdep check: lock held, or preemption off for seqcount_t. */ \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin_nested(seqprop_ptr(s), subclass);        \
} while (0)

/*
 * Helper behind write_seqcount_begin_nested(): record the acquisition with
 * lockdep at nesting level @subclass, then open the raw write section.
 */
static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass)
{
        seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
        do_raw_write_seqcount_begin(s);
}

/**
 * write_seqcount_begin() - start a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * write_seqcount_begin opens a write side critical section of the given
 * seqcount_t.
 *
 * Context: seqcount_t write side critical sections must be serialized and
 * non-preemptible. If readers can be invoked from hardirq or softirq
 * context, interrupts or bottom halves must be respectively disabled.
 */
#define write_seqcount_begin(s)                                                \
do {                                                                        \
        /* Lockdep check: lock held, or preemption off for seqcount_t. */ \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/* Helper behind write_seqcount_begin(): the nested variant with subclass 0. */
static inline void do_write_seqcount_begin(seqcount_t *s)
{
        do_write_seqcount_begin_nested(s, 0);
}

/**
 * write_seqcount_end() - end a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * The write section must've been opened with write_seqcount_begin().
 */
#define write_seqcount_end(s)                                                \
do {                                                                        \
        do_write_seqcount_end(seqprop_ptr(s));                                \
                                                                        \
        /* Mirrors the preempt_disable() in write_seqcount_begin(). */        \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/*
 * Helper behind write_seqcount_end(): record the release with lockdep, then
 * close the raw write section.
 */
static inline void do_write_seqcount_end(seqcount_t *s)
{
        seqcount_release(&s->dep_map, _RET_IP_);
        do_raw_write_seqcount_end(s);
}

/**
 * raw_write_seqcount_barrier() - do a seqcount_t write barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * This can be used to provide an ordering guarantee instead of the usual
 * consistency guarantee. It is one wmb cheaper, because it can collapse
 * the two back-to-back wmb()s.
 *
 * Note that writes surrounding the barrier should be declared atomic (e.g.
 * via WRITE_ONCE): a) to ensure the writes become visible to other threads
 * atomically, avoiding compiler optimizations; b) to document which writes are
 * meant to propagate to the reader critical section. This is necessary because
 * neither writes before and after the barrier are enclosed in a seq-writer
 * critical section that would ensure readers are aware of ongoing writes::
 *
 *        seqcount_t seq;
 *        bool X = true, Y = false;
 *
 *        void read(void)
 *        {
 *                bool x, y;
 *                unsigned s;
 *
 *                do {
 *                        s = read_seqcount_begin(&seq);
 *
 *                        x = X; y = Y;
 *
 *                } while (read_seqcount_retry(&seq, s));
 *
 *                BUG_ON(!x && !y);
 *      }
 *
 *      void write(void)
 *      {
 *                WRITE_ONCE(Y, true);
 *
 *                raw_write_seqcount_barrier(&seq);
 *
 *                WRITE_ONCE(X, false);
 *      }
 */
#define raw_write_seqcount_barrier(s)                                        \
        do_raw_write_seqcount_barrier(seqprop_ptr(s))

/*
 * Helper behind raw_write_seqcount_barrier(): two counter increments with a
 * single smp_wmb() between them, all inside one KCSAN nestable atomic region.
 */
static inline void do_raw_write_seqcount_barrier(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_invalidate() - invalidate in-progress seqcount_t read
 *                               side operations
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * After write_seqcount_invalidate, no seqcount_t read side operations
 * will complete successfully and see data older than this.
 */
#define write_seqcount_invalidate(s)                                        \
        do_write_seqcount_invalidate(seqprop_ptr(s))

/*
 * Helper behind write_seqcount_invalidate(): after smp_wmb(), advance the
 * counter by two inside a KCSAN nestable atomic region. The parity of the
 * counter is unchanged.
 */
static inline void do_write_seqcount_invalidate(seqcount_t *s)
{
        smp_wmb();

        kcsan_nestable_atomic_begin();
        s->sequence += 2;
        kcsan_nestable_atomic_end();
}

/*
 * Latch sequence counters (seqcount_latch_t)
 *
 * A sequence counter variant where the counter even/odd value is used to
 * switch between two copies of protected data. This allows the read path,
 * typically NMIs, to safely interrupt the write side critical section.
 *
 * As the write sections are fully preemptible, no special handling for
 * PREEMPT_RT is needed.
 */
typedef struct {
        /* Counter whose lowest bit selects the data copy readers use. */
        seqcount_t seqcount;
} seqcount_latch_t;

/**
 * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t
 * @seq_name: Name of the seqcount_latch_t instance
 *
 * Initializes the embedded seqcount_t with SEQCNT_ZERO(), named after
 * @seq_name.seqcount.
 */
#define SEQCNT_LATCH_ZERO(seq_name) {                                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
}

/**
 * seqcount_latch_init() - runtime initializer for seqcount_latch_t
 * @s: Pointer to the seqcount_latch_t instance
 *
 * Runs seqcount_init() on the embedded seqcount_t.
 */
#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount)

/**
 * raw_read_seqcount_latch() - pick even/odd latch data copy
 * @s: Pointer to seqcount_latch_t
 *
 * See raw_write_seqcount_latch() for details and a full reader/writer
 * usage example.
 *
 * Return: sequence counter raw value. Use the lowest bit as an index for
 * picking which data copy to read. The full counter must then be checked
 * with read_seqcount_latch_retry().
 */
static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
        /*
         * A full smp_rmb() is not needed here because the load that follows
         * is a dependent one. This read pairs with the first smp_wmb() in
         * raw_write_seqcount_latch().
         */
        unsigned seq = READ_ONCE(s->seqcount.sequence);

        return seq;
}

/**
 * read_seqcount_latch_retry() - end a seqcount_latch_t read section
 * @s:                Pointer to seqcount_latch_t
 * @start:        count, from raw_read_seqcount_latch()
 *
 * Return: true if a read section retry is required, else false
 */
static inline int
read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
        /* Delegates to read_seqcount_retry(), which issues smp_rmb() first. */
        return read_seqcount_retry(&s->seqcount, start);
}

/**
 * raw_write_seqcount_latch() - redirect latch readers to even/odd copy
 * @s: Pointer to seqcount_latch_t
 *
 * The latch technique is a multiversion concurrency control method that allows
 * queries during non-atomic modifications. If you can guarantee queries never
 * interrupt the modification -- e.g. the concurrency is strictly between CPUs
 * -- you most likely do not need this.
 *
 * Where the traditional RCU/lockless data structures rely on atomic
 * modifications to ensure queries observe either the old or the new state the
 * latch allows the same for non-atomic updates. The trade-off is doubling the
 * cost of storage; we have to maintain two copies of the entire data
 * structure.
 *
 * Very simply put: we first modify one copy and then the other. This ensures
 * there is always one copy in a stable state, ready to give us an answer.
 *
 * The basic form is a data structure like::
 *
 *        struct latch_struct {
 *                seqcount_latch_t        seq;
 *                struct data_struct        data[2];
 *        };
 *
 * Where a modification, which is assumed to be externally serialized, does the
 * following::
 *
 *        void latch_modify(struct latch_struct *latch, ...)
 *        {
 *                smp_wmb();        // Ensure that the last data[1] update is visible
 *                latch->seq.seqcount.sequence++;
 *                smp_wmb();        // Ensure that the seqcount update is visible
 *
 *                modify(latch->data[0], ...);
 *
 *                smp_wmb();        // Ensure that the data[0] update is visible
 *                latch->seq.seqcount.sequence++;
 *                smp_wmb();        // Ensure that the seqcount update is visible
 *
 *                modify(latch->data[1], ...);
 *        }
 *
 * The query will have a form like::
 *
 *        struct entry *latch_query(struct latch_struct *latch, ...)
 *        {
 *                struct entry *entry;
 *                unsigned seq, idx;
 *
 *                do {
 *                        seq = raw_read_seqcount_latch(&latch->seq);
 *
 *                        idx = seq & 0x01;
 *                        entry = data_query(latch->data[idx], ...);
 *
 *                // This includes needed smp_rmb()
 *                } while (read_seqcount_latch_retry(&latch->seq, seq));
 *
 *                return entry;
 *        }
 *
 * So during the modification, queries are first redirected to data[1]. Then we
 * modify data[0]. When that is complete, we redirect queries back to data[0]
 * and we can modify data[1].
 *
 * NOTE:
 *
 *        The non-requirement for atomic modifications does _NOT_ include
 *        the publishing of new entries in the case where data is a dynamic
 *        data structure.
 *
 *        An iteration might start in data[0] and get suspended long enough
 *        to miss an entire modification sequence, once it resumes it might
 *        observe the new entry.
 *
 * NOTE2:
 *
 *        When data is a dynamic data structure; one should use regular RCU
 *        patterns to manage the lifetimes of the objects within.
 */
static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
{
        /*
         * The increment is fenced by a write barrier on each side. In the
         * latch_modify() example above, every such increment flips the
         * low bit that readers use to select which data copy to query.
         */
        smp_wmb();        /* prior stores before incrementing "sequence" */
        s->seqcount.sequence++;
        smp_wmb();      /* increment "sequence" before following stores */
}

/*
 * Sequential locks (seqlock_t)
 *
 * Sequence counters with an embedded spinlock for writer serialization
 * and non-preemptibility.
 *
 * For more info, see:
 *    - Comments on top of seqcount_t
 *    - Documentation/locking/seqlock.rst
 */
typedef struct {
        /*
         * Make sure that readers don't starve writers on PREEMPT_RT: use
         * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK().
         */
        seqcount_spinlock_t seqcount;
        /*
         * Taken by every write_seqlock*() variant and by the locking
         * readers (read_seqlock_excl*()) defined below.
         */
        spinlock_t lock;
} seqlock_t;

/*
 * Static initializer for a seqlock_t variable named @lockname: the
 * sequence counter is set up through SEQCNT_SPINLOCK_ZERO(), which is
 * handed the address of the embedded spinlock, and the spinlock itself
 * through __SPIN_LOCK_UNLOCKED().
 */
#define __SEQLOCK_UNLOCKED(lockname)                                        \
        {                                                                \
                .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
                .lock =        __SPIN_LOCK_UNLOCKED(lockname)                        \
        }

/**
 * seqlock_init() - dynamic initializer for seqlock_t
 * @sl: Pointer to the seqlock_t instance
 *
 * The embedded spinlock is initialized first; the sequence counter is
 * initialized next and is passed the address of that spinlock.
 *
 * @sl is expanded more than once, so the argument must be free of side
 * effects.
 */
#define seqlock_init(sl)                                                \
        do {                                                                \
                spin_lock_init(&(sl)->lock);                                \
                seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock);        \
        } while (0)

/**
 * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t
 * @sl: Name of the seqlock_t instance
 *
 * Expands to a definition of @sl initialized with __SEQLOCK_UNLOCKED().
 * The macro itself adds no storage-class specifier.
 */
#define DEFINE_SEQLOCK(sl) \
                seqlock_t sl = __SEQLOCK_UNLOCKED(sl)

/**
 * read_seqbegin() - start a seqlock_t read side critical section
 * @sl: Pointer to seqlock_t
 *
 * Besides sampling the sequence counter, this opens a KCSAN flat atomic
 * region; the matching kcsan_flat_atomic_end() is issued by
 * read_seqretry().
 *
 * Return: count, to be passed to read_seqretry()
 */
static inline unsigned read_seqbegin(const seqlock_t *sl)
{
        /* Sample the counter before any KCSAN bookkeeping is done. */
        unsigned ret = read_seqcount_begin(&sl->seqcount);

        kcsan_atomic_next(0);  /* non-raw usage, assume closing read_seqretry() */
        kcsan_flat_atomic_begin();
        return ret;
}

/**
 * read_seqretry() - end a seqlock_t read side section
 * @sl: Pointer to seqlock_t
 * @start: count, from read_seqbegin()
 *
 * read_seqretry closes the read side critical section of given seqlock_t.
 * If the critical section was invalid, it must be ignored (and typically
 * retried).
 *
 * This also ends the KCSAN flat atomic region that read_seqbegin() opened.
 *
 * Return: true (non-zero) if a read section retry is required, else
 * false (zero)
 */
static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
{
        /*
         * Assume not nested: read_seqretry() may be called multiple times when
         * completing read critical section.
         */
        kcsan_flat_atomic_end();

        /* The verdict comes straight from the underlying sequence counter. */
        return read_seqcount_retry(&sl->seqcount, start);
}

/*
 * For all seqlock_t write side functions, use the internal
 * do_write_seqcount_begin() instead of the generic write_seqcount_begin().
 * This way, no redundant lockdep_assert_held() checks are added.
 */

/**
 * write_seqlock() - start a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_seqlock opens a write side critical section for the given
 * seqlock_t.  It also implicitly acquires the spinlock_t embedded inside
 * that sequential lock. All seqlock_t write side sections are thus
 * automatically serialized and non-preemptible.
 *
 * Context: if the seqlock_t read section, or other write side critical
 * sections, can be invoked from hardirq or softirq contexts, use the
 * _irqsave or _bh variants of this function instead.
 */
static inline void write_seqlock(seqlock_t *sl)
{
        /* Take the embedded spinlock first ... */
        spin_lock(&sl->lock);
        /* ... and only then open the write section on the inner counter. */
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock() - end a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock closes the (serialized and non-preemptible) write side
 * critical section of given seqlock_t.
 */
static inline void write_sequnlock(seqlock_t *sl)
{
        /* Reverse order of write_seqlock(): close the write section ... */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        /* ... before the embedded spinlock is released. */
        spin_unlock(&sl->lock);
}

/**
 * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of write_seqlock(). Use only if the read side section, or
 * other write side sections, can be invoked from softirq contexts.
 */
static inline void write_seqlock_bh(seqlock_t *sl)
{
        /* Same sequence as write_seqlock(), using the _bh lock primitive. */
        spin_lock_bh(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_bh closes the serialized, non-preemptible, and
 * softirqs-disabled, seqlock_t write side critical section opened with
 * write_seqlock_bh().
 */
static inline void write_sequnlock_bh(seqlock_t *sl)
{
        /* Close the write section, then release with the _bh unlock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_bh(&sl->lock);
}

/**
 * write_seqlock_irq() - start a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of write_seqlock(). Use only if the read side section, or
 * other write sections, can be invoked from hardirq contexts.
 */
static inline void write_seqlock_irq(seqlock_t *sl)
{
        /* Same sequence as write_seqlock(), using the _irq lock primitive. */
        spin_lock_irq(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_irq() - end a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_irq closes the serialized and non-interruptible
 * seqlock_t write side section opened with write_seqlock_irq().
 */
static inline void write_sequnlock_irq(seqlock_t *sl)
{
        /* Close the write section, then release with the _irq unlock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irq(&sl->lock);
}

/*
 * Function backing the write_seqlock_irqsave() macro: takes the embedded
 * spinlock with spin_lock_irqsave(), opens the write section, and returns
 * the flags value filled in by spin_lock_irqsave().
 */
static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
        return flags;
}

/**
 * write_seqlock_irqsave() - start a non-interruptible seqlock_t write
 *                           section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to write_sequnlock_irqrestore().
 *
 * _irqsave variant of write_seqlock(). Use it only if the read side
 * section, or other write sections, can be invoked from hardirq context.
 *
 * @flags is assigned the unsigned long value returned by
 * __write_seqlock_irqsave(), so it must be a modifiable lvalue, not a
 * pointer.
 */
#define write_seqlock_irqsave(lock, flags)                                \
        do { flags = __write_seqlock_irqsave(lock); } while (0)

/**
 * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write
 *                                section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller's saved interrupt state, from write_seqlock_irqsave()
 *
 * write_sequnlock_irqrestore closes the serialized and non-interruptible
 * seqlock_t write section previously opened with write_seqlock_irqsave().
 */
static inline void
write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* Close the write section before unlocking and restoring @flags. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqlock_excl() - begin a seqlock_t locking reader section
 * @sl:        Pointer to seqlock_t
 *
 * read_seqlock_excl opens a seqlock_t locking reader critical section.  A
 * locking reader exclusively locks out *both* other writers *and* other
 * locking readers, but it does not update the embedded sequence number.
 *
 * Locking readers act like a normal spin_lock()/spin_unlock().
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * The opened read section must be closed with read_sequnlock_excl().
 */
static inline void read_seqlock_excl(seqlock_t *sl)
{
        /* Only the spinlock is taken; the sequence counter is not touched. */
        spin_lock(&sl->lock);
}

/**
 * read_sequnlock_excl() - end a seqlock_t locking reader critical section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl(): just drop the spinlock. */
        spin_unlock(&sl->lock);
}

/**
 * read_seqlock_excl_bh() - start a seqlock_t locking reader section with
 *                            softirqs disabled
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of read_seqlock_excl(). Use this variant only if the
 * seqlock_t write side section, *or other read sections*, can be invoked
 * from softirq contexts.
 */
static inline void read_seqlock_excl_bh(seqlock_t *sl)
{
        /* Spinlock only, via the _bh primitive; no sequence counter update. */
        spin_lock_bh(&sl->lock);
}

/**
 * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking
 *                              reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_bh(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl_bh(). */
        spin_unlock_bh(&sl->lock);
}

/**
 * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking
 *                             reader section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 */
static inline void read_seqlock_excl_irq(seqlock_t *sl)
{
        /* Spinlock only, via the _irq primitive; no sequence counter update. */
        spin_lock_irq(&sl->lock);
}

/**
 * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t
 *                             locking reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_irq(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl_irq(). */
        spin_unlock_irq(&sl->lock);
}

/*
 * Function backing the read_seqlock_excl_irqsave() macro: takes the
 * embedded spinlock with spin_lock_irqsave() and returns the flags value
 * it filled in. The sequence counter is not touched.
 */
static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        return flags;
}

/**
 * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t
 *                                 locking reader section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to read_sequnlock_excl_irqrestore().
 *
 * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 *
 * @flags is assigned the unsigned long value returned by
 * __read_seqlock_excl_irqsave(), so it must be a modifiable lvalue, not a
 * pointer.
 */
#define read_seqlock_excl_irqsave(lock, flags)                                \
        do { flags = __read_seqlock_excl_irqsave(lock); } while (0)

/**
 * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t
 *                                      locking reader section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave()
 */
static inline void
read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* Drop the spinlock and hand @flags back to the irqrestore unlock. */
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader
 * @lock: Pointer to seqlock_t
 * @seq:  Marker and return parameter. An even value on entry selects a
 *        *lockless* read section, as in read_seqbegin(); an odd value
 *        selects a *locking* read section, as in read_seqlock_excl(). The
 *        first call *must* pass an even value, so that the lockless path
 *        is optimistically tried first.
 *
 * This API first attempts a normal lockless seqlock_t read section. If an
 * odd counter is found, the lockless trial has failed, and the next read
 * iteration turns into a full seqlock_t locking reader.
 *
 * It is typically used to keep lockless seqlock_t readers from starving
 * (too many retry loops) during a sharp spike in write side activity.
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * Check Documentation/locking/seqlock.rst for template example code.
 *
 * Return: the encountered sequence counter value, through the @seq
 * parameter, which is overloaded as a return parameter. This value must
 * be checked with need_seqretry(). If the read section needs to be
 * retried, it must also be passed as the @seq parameter of the next
 * read_seqbegin_or_lock() iteration.
 */
static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
{
        if (*seq & 1) {
                /* Odd marker: become a locking reader; *seq is left as is. */
                read_seqlock_excl(lock);
                return;
        }

        /* Even marker: lockless attempt; report the sampled count back. */
        *seq = read_seqbegin(lock);
}

/**
 * need_seqretry() - validate seqlock_t "locking or lockless" read section
 * @lock: Pointer to seqlock_t
 * @seq: sequence count, from read_seqbegin_or_lock()
 *
 * An odd @seq marks a locking reader, which never has to retry; in that
 * case read_seqretry() is not called at all.
 *
 * Return: true if a read section retry is required, false otherwise
 */
static inline int need_seqretry(seqlock_t *lock, int seq)
{
        /* Locking reader: the section was exclusive, nothing to validate. */
        if (seq & 1)
                return 0;

        /* Lockless reader: normalize the retry verdict to 0 or 1. */
        return read_seqretry(lock, seq) != 0;
}

/**
 * done_seqretry() - end seqlock_t "locking or lockless" reader section
 * @lock: Pointer to seqlock_t
 * @seq: count, from read_seqbegin_or_lock()
 *
 * Closes the seqlock_t read side critical section that was opened with
 * read_seqbegin_or_lock() and validated by need_seqretry(). Only a locking
 * reader (odd @seq) has a lock to release; for a lockless reader this does
 * nothing.
 */
static inline void done_seqretry(seqlock_t *lock, int seq)
{
        if (!(seq & 1))
                return;

        read_sequnlock_excl(lock);
}

/**
 * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or
 *                                   a non-interruptible locking reader
 * @lock: Pointer to seqlock_t
 * @seq:  Marker and return parameter. Check read_seqbegin_or_lock().
 *
 * _irqsave flavour of read_seqbegin_or_lock(). Use it only if the
 * seqlock_t write section, *or other read sections*, can be invoked from
 * hardirq context.
 *
 * Note: Interrupts will be disabled only for "locking reader" mode.
 *
 * Return:
 *
 *   1. The saved local interrupts state in case of a locking reader, to
 *      be passed to done_seqretry_irqrestore(). For a lockless reader the
 *      returned value is 0.
 *
 *   2. The encountered sequence counter value, returned through @seq
 *      overloaded as a return parameter. Check read_seqbegin_or_lock().
 */
static inline unsigned long
read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
{
        unsigned long flags = 0;

        if (*seq & 1) {
                /* Odd marker: locking reader, interrupt state is saved. */
                read_seqlock_excl_irqsave(lock, flags);
                return flags;
        }

        /* Even marker: lockless reader, flags keeps its initial 0. */
        *seq = read_seqbegin(lock);
        return flags;
}

/**
 * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a
 *                                non-interruptible locking reader section
 * @lock:  Pointer to seqlock_t
 * @seq:   Count, from read_seqbegin_or_lock_irqsave()
 * @flags: Caller's saved local interrupt state in case of a locking
 *           reader, also from read_seqbegin_or_lock_irqsave()
 *
 * _irqrestore flavour of done_seqretry(). The read section must have been
 * opened with read_seqbegin_or_lock_irqsave() and validated by
 * need_seqretry(). @flags is only consumed for a locking reader (odd
 * @seq).
 */
static inline void
done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
{
        if (!(seq & 1))
                return;

        read_sequnlock_excl_irqrestore(lock, flags);
}
#endif /* __LINUX_SEQLOCK_H */












































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NUMA memory policies for Linux.
 * Copyright 2003,2004 Andi Kleen SuSE Labs
 */
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1

#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>

struct mm_struct;

#ifdef CONFIG_NUMA

/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interleave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_lock.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted.  A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage.  The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
        /* Reference count; mpol_get() increments it, see also mpol_put(). */
        atomic_t refcnt;
        /*
         * NOTE(review): the MPOL_* and MPOL_F_* constants are not defined in
         * this header; presumably they come from <uapi/linux/mempolicy.h>.
         */
        unsigned short mode;         /* one of the MPOL_* modes */
        unsigned short flags;        /* MPOL_F_* flags, see set_mempolicy() */
        /* Mode-specific payload; the valid member depends on the mode. */
        union {
                short                  preferred_node; /* preferred */
                nodemask_t         nodes;                /* interleave/bind */
                /* undefined for default */
        } v;
        /* Node mask that the policy was specified against. */
        union {
                nodemask_t cpuset_mems_allowed;        /* relative to these nodes */
                nodemask_t user_nodemask;        /* nodemask passed by user */
        } w;
};

/*
 * Support for managing mempolicy data objects (clone, copy, destroy)
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */

extern void __mpol_put(struct mempolicy *pol);

/* Release @pol through __mpol_put(); a NULL policy is silently ignored. */
static inline void mpol_put(struct mempolicy *pol)
{
        if (!pol)
                return;

        __mpol_put(pol);
}

/*
 * Tell whether @pol has to be explicitly unreferenced after use.
 * At present that is only the case for shared policies, i.e. those with
 * MPOL_F_SHARED set. A NULL policy never needs it.
 */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
        if (!pol)
                return 0;

        return (pol->flags & MPOL_F_SHARED) != 0;
}

/* Release @pol only when mpol_needs_cond_ref() says a reference is held. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
        if (!mpol_needs_cond_ref(pol))
                return;

        __mpol_put(pol);
}

extern struct mempolicy *__mpol_dup(struct mempolicy *pol);

/*
 * Duplicate @pol via __mpol_dup(). A NULL policy is passed through
 * unchanged without calling __mpol_dup().
 */
static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
{
        if (!pol)
                return pol;

        return __mpol_dup(pol);
}

/* Yields the vm_policy member of @vma. */
#define vma_policy(vma) ((vma)->vm_policy)

/* Take an additional reference on @pol; a NULL policy is ignored. */
static inline void mpol_get(struct mempolicy *pol)
{
        if (!pol)
                return;

        atomic_inc(&pol->refcnt);
}

extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);

/*
 * Compare two policies. Identical pointers (including two NULLs) are equal
 * without further inspection; everything else is decided by __mpol_equal().
 */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return a == b || __mpol_equal(a, b);
}

/*
 * Tree of shared policies for a shared memory region.
 * Maintain the policies in a pseudo mm that contains vmas. The vmas
 * carry the policy. As a special twist the pseudo mm is indexed in pages, not
 * bytes, so that we can work with shared memory segments bigger than
 * unsigned long.
 */

/* One entry of a shared policy tree: a range and the policy applied to it. */
struct sp_node {
        struct rb_node nd;              /* rbtree linkage */
        /*
         * NOTE(review): presumably page indices rather than byte offsets,
         * per the "indexed in pages" remark above; whether end is inclusive
         * cannot be told from this header.
         */
        unsigned long start, end;
        struct mempolicy *policy;
};

/* Red-black tree root plus a rwlock stored alongside it. */
struct shared_policy {
        struct rb_root root;    /* presumably holds struct sp_node entries */
        rwlock_t lock;          /* presumably guards root; confirm in the .c file */
};

/*
 * Prototypes only: the functions below are defined outside this header.
 *
 * VMA policy duplication and shared policy (struct shared_policy) handling.
 */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
int mpol_set_shared_policy(struct shared_policy *info,
                                struct vm_area_struct *vma,
                                struct mempolicy *new);
void mpol_free_shared_policy(struct shared_policy *p);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                            unsigned long idx);

/* Policy lookup for a task or for an address within a VMA. */
struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr);
bool vma_policy_mof(struct vm_area_struct *vma);

/* Initialization and rebinding entry points. */
extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);

/* Node and nodemask queries. */
extern int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
                                const nodemask_t *mask);
extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);

/*
 * policy_nodemask() evaluated for the policy that get_task_policy()
 * reports for the current task.
 */
static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
{
        return policy_nodemask(gfp, get_task_policy(current));
}

/* Defined outside this header. */
extern unsigned int mempolicy_slab_node(void);

/* Updated by check_highest_zone() below; defined outside this header. */
extern enum zone_type policy_zone;

/*
 * Raise policy_zone to @k when @k is a higher zone. ZONE_MOVABLE is never
 * recorded, and a zone that is not above the current policy_zone changes
 * nothing.
 */
static inline void check_highest_zone(enum zone_type k)
{
        if (k <= policy_zone || k == ZONE_MOVABLE)
                return;

        policy_zone = k;
}

/* Prototypes only: the functions below are defined outside this header. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags);


/* String parsing of a policy is only declared when tmpfs is configured. */
#ifdef CONFIG_TMPFS
extern int mpol_parse_str(char *str, struct mempolicy **mpol);
#endif

/* Formats @pol into @buffer, which holds at most @maxlen bytes (per the names). */
extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);

/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);

extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
extern void mpol_put_task_policy(struct task_struct *);

#else

/*
 * Stubs for builds without CONFIG_NUMA: the structure is empty and the
 * reference-counting helpers do nothing.
 */
struct mempolicy {};

/* Without NUMA, any two policies compare equal. */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return true;
}

/* No-op without NUMA. */
static inline void mpol_put(struct mempolicy *p)
{
}

/* No-op without NUMA. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
}

/* No-op without NUMA. */
static inline void mpol_get(struct mempolicy *pol)
{
}

/* Shared policy and VMA policy stubs for builds without CONFIG_NUMA. */
struct shared_policy {};

/* No-op without NUMA. */
static inline void mpol_shared_policy_init(struct shared_policy *sp,
                                                struct mempolicy *mpol)
{
}

/* No-op without NUMA. */
static inline void mpol_free_shared_policy(struct shared_policy *p)
{
}

/* Without NUMA, a lookup never finds a policy. */
static inline struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        return NULL;
}

/* Without NUMA, a VMA has no policy; @vma is not evaluated. */
#define vma_policy(vma) NULL

/* Nothing to duplicate without NUMA; always returns 0. */
static inline int
vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        return 0;
}

/* Init, rebind, and query stubs for builds without CONFIG_NUMA. */

/* No-op without NUMA. */
static inline void numa_policy_init(void)
{
}

/* No-op without NUMA. */
static inline void numa_default_policy(void)
{
}

/* No-op without NUMA. */
static inline void mpol_rebind_task(struct task_struct *tsk,
                                const nodemask_t *new)
{
}

/* No-op without NUMA. */
static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}

/*
 * Without NUMA, both output pointers are cleared and the function returns
 * 0. @mpol and @nodemask are dereferenced unconditionally, so they must be
 * valid pointers.
 */
static inline int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask)
{
        *mpol = NULL;
        *nodemask = NULL;
        return 0;
}

/* Without NUMA, @m is left untouched and false is returned. */
static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
        return false;
}

/* Nothing to migrate without NUMA; always returns 0. */
static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                                   const nodemask_t *to, int flags)
{
        return 0;
}

/*
 * No-op without NUMA. Note that @k is a plain int here, whereas the
 * CONFIG_NUMA variant takes an enum zone_type.
 */
static inline void check_highest_zone(int k)
{
}

/* Remaining stubs for builds without CONFIG_NUMA. */

#ifdef CONFIG_TMPFS
/* Parsing a policy string always fails without NUMA; @mpol is not written. */
static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        return 1;        /* error */
}
#endif

/* Without NUMA, no page is ever reported as misplaced. */
static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
                                 unsigned long address)
{
        return -1; /* no node preference */
}

/* No-op without NUMA. */
static inline void mpol_put_task_policy(struct task_struct *task)
{
}

/* Without NUMA, no nodemask is returned. */
static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
{
        return NULL;
}
#endif /* CONFIG_NUMA */
#endif















































    1 


















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Integer base 2 logarithm calculation
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_LOG2_H
#define _LINUX_LOG2_H

#include <linux/types.h>
#include <linux/bitops.h>

/*
 * non-constant log of base 2 calculators
 * - the arch may override these in asm/bitops.h if they can be implemented
 *   more efficiently than using fls() and fls64()
 * - the arch is not required to handle n==0 if implementing the fallback
 */
#ifndef CONFIG_ARCH_HAS_ILOG2_U32
/*
 * Generic fallback for 32-bit values: fls(n) minus one.
 * NOTE(review): presumably yields -1 for n == 0 if fls(0) is 0; confirm
 * against the fls() in use.
 */
static inline __attribute__((const))
int __ilog2_u32(u32 n)
{
        return fls(n) - 1;
}
#endif

#ifndef CONFIG_ARCH_HAS_ILOG2_U64
/*
 * Generic fallback for 64-bit values: fls64(n) minus one.
 * NOTE(review): presumably yields -1 for n == 0 if fls64(0) is 0; confirm
 * against the fls64() in use.
 */
static inline __attribute__((const))
int __ilog2_u64(u64 n)
{
        return fls64(n) - 1;
}
#endif

/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * A value is a power of two when exactly one of its bits is set. Zero has
 * no bit set and is therefore *not* considered a power of two.
 *
 * Return: true if @n is a power of 2, otherwise false.
 */
static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
        /* n & (n - 1) clears the lowest set bit of n. */
        unsigned long without_lowest_bit = n & (n - 1);

        return n != 0 && without_lowest_bit == 0;
}

/**
 * __roundup_pow_of_two() - round up to nearest power of two
 * @n: value to round up
 *
 * Non-constant backend of roundup_pow_of_two(). The result is 1UL shifted
 * left by fls_long(@n - 1).
 *
 * NOTE(review): for @n == 0, @n - 1 wraps to all ones; the shift count is
 * then presumably the full width of unsigned long, which is undefined.
 * This matches the "undefined when n == 0" note on roundup_pow_of_two().
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
        return 1UL << fls_long(n - 1);
}

/**
 * __rounddown_pow_of_two() - round down to nearest power of two
 * @n: value to round down
 *
 * Non-constant backend of rounddown_pow_of_two(). The result is 1UL
 * shifted left by fls_long(@n) - 1.
 *
 * NOTE(review): for @n == 0 the shift count presumably becomes negative,
 * which is undefined. This matches the "undefined when n == 0" note on
 * rounddown_pow_of_two().
 */
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
{
        return 1UL << (fls_long(n) - 1);
}

/**
 * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value
 * @n: parameter
 *
 * Use this where sparse expects a true constant expression, e.g. for array
 * indices.
 *
 * The ternary chain tests bits 63 down to 2 and yields the index of the
 * highest set bit. Values below 2 (including 0) yield 0, and 2 or 3 yield
 * 1. If @n is not a compile-time constant, the whole expression is -1.
 * @n is expanded many times, so it must be free of side effects.
 */
#define const_ilog2(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                (n) < 2 ? 0 :                        \
                (n) & (1ULL << 63) ? 63 :        \
                (n) & (1ULL << 62) ? 62 :        \
                (n) & (1ULL << 61) ? 61 :        \
                (n) & (1ULL << 60) ? 60 :        \
                (n) & (1ULL << 59) ? 59 :        \
                (n) & (1ULL << 58) ? 58 :        \
                (n) & (1ULL << 57) ? 57 :        \
                (n) & (1ULL << 56) ? 56 :        \
                (n) & (1ULL << 55) ? 55 :        \
                (n) & (1ULL << 54) ? 54 :        \
                (n) & (1ULL << 53) ? 53 :        \
                (n) & (1ULL << 52) ? 52 :        \
                (n) & (1ULL << 51) ? 51 :        \
                (n) & (1ULL << 50) ? 50 :        \
                (n) & (1ULL << 49) ? 49 :        \
                (n) & (1ULL << 48) ? 48 :        \
                (n) & (1ULL << 47) ? 47 :        \
                (n) & (1ULL << 46) ? 46 :        \
                (n) & (1ULL << 45) ? 45 :        \
                (n) & (1ULL << 44) ? 44 :        \
                (n) & (1ULL << 43) ? 43 :        \
                (n) & (1ULL << 42) ? 42 :        \
                (n) & (1ULL << 41) ? 41 :        \
                (n) & (1ULL << 40) ? 40 :        \
                (n) & (1ULL << 39) ? 39 :        \
                (n) & (1ULL << 38) ? 38 :        \
                (n) & (1ULL << 37) ? 37 :        \
                (n) & (1ULL << 36) ? 36 :        \
                (n) & (1ULL << 35) ? 35 :        \
                (n) & (1ULL << 34) ? 34 :        \
                (n) & (1ULL << 33) ? 33 :        \
                (n) & (1ULL << 32) ? 32 :        \
                (n) & (1ULL << 31) ? 31 :        \
                (n) & (1ULL << 30) ? 30 :        \
                (n) & (1ULL << 29) ? 29 :        \
                (n) & (1ULL << 28) ? 28 :        \
                (n) & (1ULL << 27) ? 27 :        \
                (n) & (1ULL << 26) ? 26 :        \
                (n) & (1ULL << 25) ? 25 :        \
                (n) & (1ULL << 24) ? 24 :        \
                (n) & (1ULL << 23) ? 23 :        \
                (n) & (1ULL << 22) ? 22 :        \
                (n) & (1ULL << 21) ? 21 :        \
                (n) & (1ULL << 20) ? 20 :        \
                (n) & (1ULL << 19) ? 19 :        \
                (n) & (1ULL << 18) ? 18 :        \
                (n) & (1ULL << 17) ? 17 :        \
                (n) & (1ULL << 16) ? 16 :        \
                (n) & (1ULL << 15) ? 15 :        \
                (n) & (1ULL << 14) ? 14 :        \
                (n) & (1ULL << 13) ? 13 :        \
                (n) & (1ULL << 12) ? 12 :        \
                (n) & (1ULL << 11) ? 11 :        \
                (n) & (1ULL << 10) ? 10 :        \
                (n) & (1ULL <<  9) ?  9 :        \
                (n) & (1ULL <<  8) ?  8 :        \
                (n) & (1ULL <<  7) ?  7 :        \
                (n) & (1ULL <<  6) ?  6 :        \
                (n) & (1ULL <<  5) ?  5 :        \
                (n) & (1ULL <<  4) ?  4 :        \
                (n) & (1ULL <<  3) ?  3 :        \
                (n) & (1ULL <<  2) ?  2 :        \
                1) :                                \
        -1)

/**
 * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value
 * @n: parameter
 *
 * constant-capable log of base 2 calculation
 * - this can be used to initialise global variables from constant data, hence
 * the massive ternary operator construction
 *
 * selects the appropriately-sized optimised version depending on sizeof(n):
 * - compile-time constants are handled by const_ilog2()
 * - otherwise arguments of at most 4 bytes go to __ilog2_u32() and anything
 *   wider goes to __ilog2_u64()
 */
#define ilog2(n) \
( \
        __builtin_constant_p(n) ?        \
        const_ilog2(n) :                \
        (sizeof(n) <= 4) ?                \
        __ilog2_u32(n) :                \
        __ilog2_u64(n)                        \
 )

/**
 * roundup_pow_of_two - round the given value up to nearest power of two
 * @n: parameter
 *
 * round the given value up to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 *
 * For a compile-time constant, 1 maps to 1 and any other value to
 * 1UL << (ilog2(n - 1) + 1). Non-constant arguments are handled by
 * __roundup_pow_of_two().
 */
#define roundup_pow_of_two(n)                        \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 1) ? 1 :                \
                (1UL << (ilog2((n) - 1) + 1))        \
                                   ) :                \
        __roundup_pow_of_two(n)                        \
 )

/**
 * rounddown_pow_of_two - round the given value down to nearest power of two
 * @n: value to round down
 *
 * Round the given value down to the nearest power of two.
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 *
 * For a compile-time constant @n the result is 1UL << ilog2(@n); non-constant
 * arguments are handed to __rounddown_pow_of_two().
 */
#define rounddown_pow_of_two(n)                        \
(                                                \
        __builtin_constant_p(n) ? (                \
                (1UL << ilog2(n))) :                \
        __rounddown_pow_of_two(n)                \
 )

/*
 * Runtime helper behind order_base_2(): returns 0 for n <= 1, otherwise
 * ilog2(n - 1) + 1, i.e. the base-2 order of n rounded up.
 */
static inline __attribute_const__
int __order_base_2(unsigned long n)
{
        if (n <= 1)
                return 0;

        return ilog2(n - 1) + 1;
}

/**
 * order_base_2 - calculate the (rounded up) base 2 order of the argument
 * @n: parameter
 *
 * The first few values calculated by this routine:
 *  ob2(0) = 0
 *  ob2(1) = 0
 *  ob2(2) = 1
 *  ob2(3) = 2
 *  ob2(4) = 2
 *  ob2(5) = 3
 *  ... and so on.
 *
 * A compile-time constant @n is evaluated in place (0 for @n of 0 or 1,
 * otherwise ilog2(@n - 1) + 1); any other argument is passed to
 * __order_base_2(), which applies the same formula at run time.
 */
#define order_base_2(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 0 || (n) == 1) ? 0 :        \
                ilog2((n) - 1) + 1) :                \
        __order_base_2(n)                        \
)

/*
 * Runtime helper behind bits_per(): number of bits needed to represent n.
 * Values below 2 need one bit.  Otherwise start from order_base_2(n) and add
 * one more bit when n is an exact power of two.
 */
static inline __attribute__((const))
int __bits_per(unsigned long n)
{
        int bits;

        if (n < 2)
                return 1;

        bits = order_base_2(n);
        if (is_power_of_2(n))
                bits++;

        return bits;
}

/**
 * bits_per - calculate the number of bits required for the argument
 * @n: parameter
 *
 * This is constant-capable and can be used for compile time
 * initializations, e.g bitfields.
 *
 * The first few values calculated by this routine:
 * bf(0) = 1
 * bf(1) = 1
 * bf(2) = 2
 * bf(3) = 2
 * bf(4) = 3
 * ... and so on.
 *
 * A compile-time constant @n is evaluated in place (1 for @n of 0 or 1,
 * otherwise ilog2(@n) + 1); any other argument is passed to __bits_per().
 */
#define bits_per(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 0 || (n) == 1)                \
                        ? 1 : ilog2(n) + 1        \
        ) :                                        \
        __bits_per(n)                                \
)
#endif /* _LINUX_LOG2_H */
















































    1 



    1 










    1 

































































































    1 




































    1 










    1 
    1 





    1 


    1 













































    1 














































    1 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

/*
 * Make tlb->active point at a batch with room for more pages.
 *
 * If a batch is already chained behind the active one it is reused.
 * Otherwise a new one-page batch is allocated without waiting
 * (GFP_NOWAIT | __GFP_NOWARN), unless MAX_GATHER_BATCH_COUNT batches have
 * been allocated already.
 *
 * Returns true when tlb->active was advanced, false when the batch limit was
 * reached or the allocation failed.
 */
static bool tlb_next_batch(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *cur = tlb->active;
        struct mmu_gather_batch *fresh;

        /* A batch is still chained behind the current one: step onto it. */
        if (cur->next) {
                tlb->active = cur->next;
                return true;
        }

        if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
                return false;

        fresh = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
        if (!fresh)
                return false;

        fresh->next = NULL;
        fresh->nr = 0;
        fresh->max = MAX_GATHER_BATCH;

        /* Append the new batch to the chain and make it the active one. */
        cur->next = fresh;
        tlb->active = fresh;
        tlb->batch_count++;

        return true;
}

/*
 * Release every gathered page, batch by batch, starting at the embedded
 * tlb->local batch and stopping at the first empty batch.  Each visited batch
 * is reset to empty, and tlb->local becomes the active batch again.
 */
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *cur = &tlb->local;

        while (cur && cur->nr) {
                free_pages_and_swap_cache(cur->pages, cur->nr);
                cur->nr = 0;
                cur = cur->next;
        }

        tlb->active = &tlb->local;
}

/*
 * Free every batch chained behind tlb->local (tlb->local itself is not
 * freed) and leave the chain empty.
 */
static void tlb_batch_list_free(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *cur = tlb->local.next;

        while (cur) {
                /* Fetch the successor before the current batch goes away. */
                struct mmu_gather_batch *following = cur->next;

                free_pages((unsigned long)cur, 0);
                cur = following;
        }

        tlb->local.next = NULL;
}

/*
 * Queue @page in the active batch of @tlb.
 *
 * Returns true when the active batch became full and no further batch could
 * be obtained, i.e. the caller has to flush; returns false while there is
 * still room for more pages.
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
        struct mmu_gather_batch *cur;

        VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        VM_WARN_ON(tlb->page_size != page_size);
#endif

        cur = tlb->active;

        /* Record the page first, then check whether that filled the batch. */
        cur->pages[cur->nr] = page;
        cur->nr++;
        if (cur->nr == cur->max) {
                if (!tlb_next_batch(tlb))
                        return true;
                cur = tlb->active;
        }
        VM_BUG_ON_PAGE(cur->nr > cur->max, page);

        return false;
}

#endif /* MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * Hand every table recorded in @batch to __tlb_remove_table(), then free the
 * page that holds the batch itself.
 */
static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
        int idx = 0;

        while (idx < batch->nr) {
                __tlb_remove_table(batch->tables[idx]);
                idx++;
        }

        free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore needs some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by some
 * other means, this is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage, this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 *
 */

/*
 * Callback handed to smp_call_function() by tlb_remove_table_sync_one().
 * @arg is unused.  The body is intentionally empty: the only purpose is to
 * have the interrupt delivered.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

/*
 * Run the empty tlb_remove_table_smp_sync() callback through
 * smp_call_function().
 */
void tlb_remove_table_sync_one(void)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely on
         * IRQ disabling.
         */
        /* NOTE(review): the final argument (1) presumably means "wait for completion" - confirm. */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

/*
 * RCU callback registered by tlb_remove_table_free(): recover the batch that
 * embeds @head and free it together with the tables it records.
 */
static void tlb_remove_table_rcu(struct rcu_head *head)
{
        struct mmu_table_batch *batch;

        batch = container_of(head, struct mmu_table_batch, rcu);
        __tlb_remove_table_free(batch);
}

/*
 * CONFIG_MMU_GATHER_RCU_TABLE_FREE variant: do not free @batch right away but
 * queue it with call_rcu(); tlb_remove_table_rcu() does the actual freeing.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * Variant used without CONFIG_MMU_GATHER_RCU_TABLE_FREE: free @batch and the
 * tables it records immediately.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        __tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * Used when tlb_remove_table() is expected to imply TLB invalidates: flush
 * the TLB (without freeing anything) if tlb_needs_table_invalidate() says so,
 * otherwise do nothing.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
        if (!tlb_needs_table_invalidate())
                return;

        /*
         * Invalidate page-table caches used by hardware walkers. Then
         * we still need to RCU-sched wait while freeing the pages
         * because software walkers can still be in-flight.
         */
        tlb_flush_mmu_tlbonly(tlb);
}

/*
 * Free a single page-table page without batching.  tlb_remove_table() falls
 * back to this when it cannot allocate a batch page.  The sync call must come
 * first, so that @table is only released afterwards.
 */
static void tlb_remove_table_one(void *table)
{
        tlb_remove_table_sync_one();
        __tlb_remove_table(table);
}

/*
 * Flush the pending table batch of @tlb, if there is one: invalidate as
 * required, pass the batch on for freeing and detach it from @tlb.
 */
static void tlb_table_flush(struct mmu_gather *tlb)
{
        if (!tlb->batch)
                return;

        tlb_table_invalidate(tlb);
        tlb_remove_table_free(tlb->batch);
        tlb->batch = NULL;
}

/*
 * Queue page-table page @table for freeing.
 *
 * Tables are collected in a one-page batch that is allocated on demand
 * without waiting (GFP_NOWAIT | __GFP_NOWARN).  If that allocation fails the
 * table is freed on its own via tlb_remove_table_one().  A batch that reaches
 * MAX_TABLE_BATCH entries is flushed immediately.
 */
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct mmu_table_batch *cur = tlb->batch;

        if (!cur) {
                cur = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                tlb->batch = cur;
                if (!cur) {
                        /* No memory for batching: free this one table directly. */
                        tlb_table_invalidate(tlb);
                        tlb_remove_table_one(table);
                        return;
                }
                cur->nr = 0;
        }

        cur->tables[cur->nr++] = table;
        if (cur->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
}

/* Start @tlb without a table batch; tlb_remove_table() allocates one on demand. */
static inline void tlb_table_init(struct mmu_gather *tlb)
{
        tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/* No table batching without CONFIG_MMU_GATHER_TABLE_FREE: both hooks are no-ops. */
static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Release what @tlb has gathered so far: first the pending page-table batch,
 * then (unless CONFIG_MMU_GATHER_NO_GATHER is set) the gathered pages.
 * This function does not flush the TLB itself; tlb_flush_mmu() does that
 * before calling it.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
        tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb_batch_pages_flush(tlb);
#endif
}

/*
 * Flush the TLB for @tlb and then free everything gathered so far.  The TLB
 * flush is issued before the gathered tables and pages are released.
 */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
        tlb_flush_mmu_tlbonly(tlb);
        tlb_flush_mmu_free(tlb);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 * @start: start of the region that will be removed from the page-table
 * @end: end of the region that will be removed from the page-table
 *
 * Prepares an (on-stack) mmu_gather structure for tearing down page-tables
 * of @mm.  Callers pass @start == 0 and @end == -1 when @mm has no users left
 * and the whole address space is being destroyed (exit/execve); that
 * combination is what sets @tlb->fullmm.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                        unsigned long start, unsigned long end)
{
        tlb->mm = mm;

        /* A range of 0 to ~0 means the full address space. */
        tlb->fullmm = (start == 0 && end + 1 == 0);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb->need_flush_all = 0;
        tlb->batch_count = 0;

        /* Begin with just the embedded batch, empty and active. */
        tlb->local.nr = 0;
        tlb->local.max = ARRAY_SIZE(tlb->__pages);
        tlb->local.next = NULL;
        tlb->active = &tlb->local;
#endif

        tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        tlb->page_size = 0;
#endif

        __tlb_reset_range(tlb);
        inc_tlb_flush_pending(mm);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 * @start: start of the region that will be removed from the page-table
 * @end: end of the region that will be removed from the page-table
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required: performs a final flush, frees the dynamically allocated
 * batches and drops the pending-flush count taken by tlb_gather_mmu().
 *
 * NOTE(review): @start and @end are not referenced in the function body.
 */
void tlb_finish_mmu(struct mmu_gather *tlb,
                unsigned long start, unsigned long end)
{
        /*
         * If parallel threads are doing PTE changes on the same range
         * under a non-exclusive lock (e.g., mmap_lock read-side) but defer
         * the TLB flush by batching, one thread may end up seeing
         * inconsistent PTEs, resulting in stale TLB entries.  So flush the
         * TLB forcefully if we detect parallel PTE batching threads.
         *
         * However, some syscalls, e.g. munmap(), may free page tables; this
         * requires a forced flush of everything in the given range.
         * Otherwise this may leave stale TLB entries on some architectures,
         * e.g. aarch64, that can specify which TLB level to flush.
         */
        if (mm_tlb_flush_nested(tlb->mm)) {
                /*
                 * The aarch64 yields better performance with fullmm by
                 * avoiding multiple CPUs spamming TLBI messages at the
                 * same time.
                 *
                 * On x86 non-fullmm doesn't yield significant difference
                 * against fullmm.
                 */
                tlb->fullmm = 1;
                __tlb_reset_range(tlb);
                tlb->freed_tables = 1;
        }

        tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb_batch_list_free(tlb);
#endif
        dec_tlb_flush_pending(tlb->mm);
}
























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_H
#define _LINUX_PID_H

#include <linux/rculist.h>
#include <linux/wait.h>
#include <linux/refcount.h>

/*
 * The ways a task can be attached to a struct pid.  Values of this enum index
 * the per-type arrays, e.g. struct pid::tasks[].
 *
 * NOTE(review): going by the names, TGID/PGID/SID are presumably the thread
 * group, process group and session IDs - confirm.
 */
enum pid_type
{
        PIDTYPE_PID,
        PIDTYPE_TGID,
        PIDTYPE_PGID,
        PIDTYPE_SID,
        PIDTYPE_MAX,        /* number of types, not a type itself; sizes the arrays */
};

/*
 * What is struct pid?
 *
 * A struct pid is the kernel's internal notion of a process identifier.
 * It refers to individual tasks, process groups, and sessions.  While
 * there are processes attached to it the struct pid lives in a hash
 * table, so it and then the processes that it refers to can be found
 * quickly from the numeric pid value.  The attached processes may be
 * quickly accessed by following pointers from struct pid.
 *
 * Storing pid_t values in the kernel and referring to them later has a
 * problem.  The process originally with that pid may have exited and the
 * pid allocator wrapped, and another process could have come along
 * and been assigned that pid.
 *
 * Referring to user space processes by holding a reference to struct
 * task_struct has a problem.  When the user space process exits
 * the now useless task_struct is still kept.  A task_struct plus a
 * stack consumes around 10K of low kernel memory.  More precisely
 * this is THREAD_SIZE + sizeof(struct task_struct).  By comparison
 * a struct pid is about 64 bytes.
 *
 * Holding a reference to struct pid solves both of these problems.
 * It is small so holding a reference does not consume a lot of
 * resources, and since a new struct pid is allocated when the numeric pid
 * value is reused (when pids wrap around) we don't mistakenly refer to new
 * processes.
 */


/*
 * struct upid is used to get the id of the struct pid, as it is
 * seen in a particular namespace. Later the struct pid is found with
 * find_pid_ns() using the int nr and struct pid_namespace *ns.
 */

struct upid {
        int nr;                                /* numeric id as seen in @ns */
        struct pid_namespace *ns;        /* namespace in which @nr is valid */
};

/*
 * The kernel-internal, reference-counted representation of a process
 * identifier.
 */
struct pid
{
        /* reference count; get_pid() increments it */
        refcount_t count;
        /* index into numbers[] used by ns_of_pid() and is_child_reaper() */
        unsigned int level;
        spinlock_t lock;
        /* lists of tasks that use this pid */
        struct hlist_head tasks[PIDTYPE_MAX];
        struct hlist_head inodes;
        /* wait queue for pidfd notifications */
        wait_queue_head_t wait_pidfd;
        struct rcu_head rcu;
        /*
         * Per-namespace ids; numbers[0].nr is what pid_nr() reports.
         * NOTE(review): declared with one element but indexed by @level, so
         * presumably allocated larger for nested namespaces - confirm against
         * the allocation code.
         */
        struct upid numbers[1];
};

extern struct pid init_struct_pid;

extern const struct file_operations pidfd_fops;

struct file;

extern struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);

/*
 * Take a reference on @pid and return it.  A NULL @pid is passed through
 * unchanged without touching any count.
 */
static inline struct pid *get_pid(struct pid *pid)
{
        if (!pid)
                return NULL;

        refcount_inc(&pid->count);
        return pid;
}

extern void put_pid(struct pid *pid);
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
/* True when at least one task is attached to @pid as @type; @pid must not be NULL. */
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
        const struct hlist_head *attached = &pid->tasks[type];

        return !hlist_empty(attached);
}
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);

extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);

/*
 * these helpers must be called with the tasklist_lock write-held.
 */
extern void attach_pid(struct task_struct *task, enum pid_type);
extern void detach_pid(struct task_struct *task, enum pid_type);
extern void change_pid(struct task_struct *task, enum pid_type,
                        struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
                         enum pid_type);

struct pid_namespace;
extern struct pid_namespace init_pid_ns;

extern int pid_max;
extern int pid_max_min, pid_max_max;

/*
 * look up a PID in the hash table. Must be called with the tasklist_lock
 * or rcu_read_lock() held.
 *
 * find_pid_ns() finds the pid in the namespace specified
 * find_vpid() finds the pid by its virtual id, i.e. in the current namespace
 *
 * see also find_task_by_vpid() set in include/linux/sched.h
 */
extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
extern struct pid *find_vpid(int nr);

/*
 * Look up a PID in the hash table, and return it with its count elevated.
 */
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);

extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                             size_t set_tid_size);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);

/*
 * ns_of_pid() returns the pid namespace in which the specified pid was
 * allocated, i.e. the namespace recorded at index pid->level of
 * pid->numbers[].
 *
 * NOTE:
 *         ns_of_pid() is meant for a process (task) that has an attached
 *         'struct pid' (see attach_pid(), detach_pid()), so @pid is expected
 *         to be non-NULL.  For a NULL @pid the result is a NULL pid-ns,
 *         which the caller has to handle.
 */
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
{
        if (!pid)
                return NULL;

        return pid->numbers[pid->level].ns;
}

/*
 * is_child_reaper returns true if the pid is the init process
 * of the current namespace. As this one could be checked before
 * pid_ns->child_reaper is assigned in copy_process, we check
 * with the pid number.
 */
static inline bool is_child_reaper(struct pid *pid)
{
        return pid->numbers[pid->level].nr == 1;
}

/*
 * the helpers to get the pid's id seen from different namespaces
 *
 * pid_nr()    : global id, i.e. the id seen from the init namespace;
 * pid_vnr()   : virtual id, i.e. the id seen from the pid namespace of
 *               current.
 * pid_nr_ns() : id seen from the ns specified.
 *
 * see also task_xid_nr() etc in include/linux/sched.h
 */

/* Global id of @pid (the id stored in numbers[0]), or 0 when @pid is NULL. */
static inline pid_t pid_nr(struct pid *pid)
{
        return pid ? pid->numbers[0].nr : 0;
}

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
pid_t pid_vnr(struct pid *pid);

/*
 * do_each_pid_task() / while_each_pid_task() - visit the tasks attached to
 * @pid as @type.
 *
 * The two macros must be used as a pair: the first opens a do { } while (0)
 * block and an hlist_for_each_entry_rcu() loop over (pid)->tasks[type], the
 * second closes both.  The loop body goes between them and sees the current
 * task in @task.  A NULL @pid yields no iterations.  For PIDTYPE_PID the loop
 * stops after the first entry.
 *
 * @type is parenthesized in the comparison below so that an expression
 * argument (e.g. a conditional) is compared as a whole.
 */
#define do_each_pid_task(pid, type, task)                                \
        do {                                                                \
                if ((pid) != NULL)                                        \
                        hlist_for_each_entry_rcu((task),                \
                                &(pid)->tasks[type], pid_links[type]) {

                        /*
                         * Both old and new leaders may be attached to
                         * the same pid in the middle of de_thread().
                         */
#define while_each_pid_task(pid, type, task)                                \
                                if ((type) == PIDTYPE_PID)                \
                                        break;                                \
                        }                                                \
        } while (0)

/*
 * do_each_pid_thread() / while_each_pid_thread() - like the
 * do_each_pid_task() pair, but for every task found the body runs once per
 * thread visited by for_each_thread().  Must be used as a pair.
 *
 * The task found by the outer loop is kept in the local tg___ and written
 * back to @task after the inner loop, because the inner loop reuses @task as
 * its cursor.
 */
#define do_each_pid_thread(pid, type, task)                                \
        do_each_pid_task(pid, type, task) {                                \
                struct task_struct *tg___ = task;                        \
                for_each_thread(tg___, task) {

#define while_each_pid_thread(pid, type, task)                                \
                }                                                        \
                task = tg___;                                                \
        } while_each_pid_task(pid, type, task)
#endif /* _LINUX_PID_H */








































































































    1 
    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
#include <linux/gfp.h>
#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/swapfile.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <linux/sched/task.h>
#include <linux/sched/mm.h>

#include <asm/set_memory.h>
#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/init.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h>                /* for MAX_DMA_PFN */
#include <asm/microcode.h>
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>
#include <asm/text-patching.h>
#include <asm/memtype.h>
#include <asm/paravirt.h>

/*
 * We need to define the tracepoints somewhere, and tlb.c
 * is only compiled when SMP=y.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>

#include "mm_internal.h"

/*
 * Tables translating between page_cache_type_t and pte encoding.
 *
 * The default values are defined statically as minimal supported mode;
 * WC and WT fall back to UC-.  pat_init() updates these values to support
 * more cache modes, WC and WT, when it is safe to do so.  See pat_init()
 * for the details.  Note, __early_ioremap() used during early boot-time
 * takes pgprot_t (pte encoding) and does not use these tables.
 *
 *   Index into __cachemode2pte_tbl[] is the cachemode.
 *
 *   Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
 *   (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
 *
 * With the static defaults below, WB encodes as no bits, UC as
 * _PAGE_PWT | _PAGE_PCD, and WC, UC-, WT and WP all as _PAGE_PCD alone.
 * The "0 |" placeholders only keep the PWT and PCD columns aligned.
 */
static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
        [_PAGE_CACHE_MODE_WB      ]        = 0         | 0        ,
        [_PAGE_CACHE_MODE_WC      ]        = 0         | _PAGE_PCD,
        [_PAGE_CACHE_MODE_UC_MINUS]        = 0         | _PAGE_PCD,
        [_PAGE_CACHE_MODE_UC      ]        = _PAGE_PWT | _PAGE_PCD,
        [_PAGE_CACHE_MODE_WT      ]        = 0         | _PAGE_PCD,
        [_PAGE_CACHE_MODE_WP      ]        = 0         | _PAGE_PCD,
};

/*
 * Translate cache mode @pcm into pte caching bits.  Mode 0 is expected to be
 * the common case and maps to 0 without a table access; any other mode is
 * looked up in __cachemode2pte_tbl[].
 */
unsigned long cachemode2protval(enum page_cache_mode pcm)
{
        return likely(pcm == 0) ? 0 : __cachemode2pte_tbl[pcm];
}
EXPORT_SYMBOL(cachemode2protval);

/*
 * Default decoding of the pte caching bits into a cache mode, indexed via
 * __pte2cm_idx().  In these defaults the _PAGE_PAT bit makes no difference:
 * no PWT/PCD bit selects WB, exactly one of them selects UC-, and both
 * together select UC.
 */
static uint8_t __pte2cachemode_tbl[8] = {
        [__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
        [__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
        [__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
        [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
        [__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
        [__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
        [__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
        [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};

/*
 * Report whether the write-protect PAT entry really selects write-protect.
 *
 * No assumption is made about how PAT has been set up (Xen uses a different
 * layout than the kernel).  Instead the _PAGE_CACHE_MODE_WP cache mode is
 * translated through __cachemode2pte_tbl[] into protection bits (which select
 * a cache mode of WP or better), and those bits are translated back through
 * __pte2cm_idx() and __pte2cachemode_tbl[].  The round trip yields the cache
 * mode that is really used, which is then compared against WP.
 */
bool x86_has_pat_wp(void)
{
        uint16_t wp_bits = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP];
        uint8_t effective = __pte2cachemode_tbl[__pte2cm_idx(wp_bits)];

        return effective == _PAGE_CACHE_MODE_WP;
}

/*
 * Translate the caching bits of @pgprot (those within _PAGE_CACHE_MASK) back
 * into a cache mode.  No caching bits set is expected to be the common case
 * and yields mode 0 without a table access.
 */
enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
{
        unsigned long cache_bits = pgprot_val(pgprot) & _PAGE_CACHE_MASK;

        if (likely(cache_bits == 0))
                return 0;
        else
                return __pte2cachemode_tbl[__pte2cm_idx(cache_bits)];
}

/*
 * Early page buffer bookkeeping, in page frame numbers.  alloc_low_pages()
 * hands out pages starting at pgt_buf_end and advances it, for as long as the
 * request still fits below pgt_buf_top.
 * NOTE(review): pgt_buf_start is not referenced in the code visible here;
 * presumably it is the first pfn of the buffer - confirm.
 */
static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
static unsigned long __initdata pgt_buf_top;

/* Lower pfn bound of the range alloc_low_pages() searches with memblock. */
static unsigned long min_pfn_mapped;

/*
 * While false, alloc_low_pages() uses neither the pgt_buf window nor
 * extend_brk() and relies on memblock alone.
 */
static bool __initdata can_use_brk_pgt = true;

/*
 * Pages returned are already directly mapped.
 *
 * Changing that is likely to break Xen, see commit:
 *
 *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
 *
 * for detailed information.
 */
/*
 * Allocate @num contiguous, zeroed pages and return their virtual address.
 *
 * Once after_bootmem is set the pages come from __get_free_pages().
 * Before that they are taken, in order of preference, from the early
 * pgt_buf window, from memblock inside the already mapped pfn window, or
 * from a fresh brk extension.  Running out of all of them panics.
 */
__ref void *alloc_low_pages(unsigned int num)
{
        unsigned long first_pfn;
        int idx;

        if (after_bootmem) {
                unsigned int order = get_order((unsigned long)num << PAGE_SHIFT);

                return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
        }

        if (can_use_brk_pgt && (pgt_buf_end + num) <= pgt_buf_top) {
                /* Fast path: carve the pages out of the early buffer. */
                first_pfn = pgt_buf_end;
                pgt_buf_end += num;
        } else {
                unsigned long phys = 0;

                /* Only search memblock if something is mapped already. */
                if (min_pfn_mapped < max_pfn_mapped)
                        phys = memblock_find_in_range(
                                        min_pfn_mapped << PAGE_SHIFT,
                                        max_pfn_mapped << PAGE_SHIFT,
                                        PAGE_SIZE * num, PAGE_SIZE);

                if (phys)
                        memblock_reserve(phys, PAGE_SIZE * num);
                else if (can_use_brk_pgt)
                        phys = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));

                if (!phys)
                        panic("alloc_low_pages: can not alloc memory");

                first_pfn = phys >> PAGE_SHIFT;
        }

        /* The early paths return uninitialised memory; zero every page. */
        for (idx = 0; idx < num; idx++)
                clear_page(__va((first_pfn + idx) << PAGE_SHIFT));

        return __va(first_pfn << PAGE_SHIFT);
}

/*
 * By default we need three 4k pages for the initial PMD_SIZE mapping and
 * three 4k pages for 0-ISA_END_ADDRESS.
 * With KASLR memory randomization enabled we may need twice as many
 * pages, depending on the machine's e820 memory map and the PUD alignment.
 */
/* Number of pages in the early page-table buffer. */
#ifndef CONFIG_RANDOMIZE_MEMORY
#define INIT_PGD_PAGE_COUNT      6
#else
#define INIT_PGD_PAGE_COUNT      12
#endif
/* Size of the early page-table buffer in bytes. */
#define INIT_PGT_BUF_SIZE        (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
/*
 * Presumably sets aside the brk space that early_alloc_pgt_buf() later
 * claims with extend_brk() -- confirm against RESERVE_BRK().
 */
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
/*
 * Claim INIT_PGT_BUF_SIZE bytes of brk space and publish them as the
 * early page-table buffer [pgt_buf_start, pgt_buf_top), in pfn units.
 */
void __init early_alloc_pgt_buf(void)
{
        phys_addr_t buf_phys = __pa(extend_brk(INIT_PGT_BUF_SIZE, PAGE_SIZE));

        pgt_buf_start = buf_phys >> PAGE_SHIFT;
        /* Nothing handed out yet: the buffer is empty. */
        pgt_buf_end = pgt_buf_start;
        pgt_buf_top = pgt_buf_start + (INIT_PGT_BUF_SIZE >> PAGE_SHIFT);
}

/*
 * When set, alloc_low_pages() uses __get_free_pages() instead of the
 * early brk/memblock paths, and split_mem_range() no longer widens page
 * sizes via adjust_range_page_size_mask().
 */
int after_bootmem;

/* "gbpages" / "nogbpages" boot parameters controlling direct_gbpages. */
early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);

/* One physical address range to be mapped with a given set of page sizes. */
struct map_range {
        unsigned long start;            /* start address, in bytes */
        unsigned long end;              /* end address, in bytes */
        unsigned page_size_mask;        /* bitmask of 1 << PG_LEVEL_* */
};

/* Page sizes usable for the direct mapping; set by probe_page_size_mask(). */
static int page_size_mask;

/*
 * Save some of the cr4 feature set we're using (e.g. Pentium 4MB
 * enable and PPro Global page enable), so that any CPUs that boot
 * up after us can get the correct flags. Invoked on the boot CPU.
 *
 * @mask: CR4 bits to set.
 */
static inline void cr4_set_bits_and_update_boot(unsigned long mask)
{
        /* Record the bits in the saved feature set first ... */
        mmu_cr4_features |= mask;
        /* ... mirror them to the trampoline copy if one has been set up ... */
        if (trampoline_cr4_features)
                *trampoline_cr4_features = mmu_cr4_features;
        /* ... and finally set them on this CPU. */
        cr4_set_bits(mask);
}

/*
 * Work out which large page sizes the direct mapping may use, enable the
 * matching CR4 features and initialise the supported/default PTE masks.
 */
static void __init probe_page_size_mask(void)
{
        /*
         * With pagealloc debugging the identity mapping sticks to small
         * pages, which spares cpa() from having to split large pages in
         * interrupt context, etc.  Without 2M pages there are no GB
         * pages either.
         */
        if (!boot_cpu_has(X86_FEATURE_PSE) || debug_pagealloc_enabled())
                direct_gbpages = 0;
        else
                page_size_mask |= 1 << PG_LEVEL_2M;

        /* Turn on PSE whenever the CPU has it. */
        if (boot_cpu_has(X86_FEATURE_PSE))
                cr4_set_bits_and_update_boot(X86_CR4_PSE);

        /* Global pages are only supported if PGE can be turned on. */
        __supported_pte_mask &= ~_PAGE_GLOBAL;
        if (boot_cpu_has(X86_FEATURE_PGE)) {
                cr4_set_bits_and_update_boot(X86_CR4_PGE);
                __supported_pte_mask |= _PAGE_GLOBAL;
        }

        /* By default the kernel may use everything that is supported ... */
        __default_kernel_pte_mask = __supported_pte_mask;
        /* ... except with PTI, where the kernel is mostly non-Global. */
        if (cpu_feature_enabled(X86_FEATURE_PTI))
                __default_kernel_pte_mask &= ~_PAGE_GLOBAL;

        /* 1 GB linear mappings need both the request and CPU support. */
        if (!direct_gbpages || !boot_cpu_has(X86_FEATURE_GBPAGES)) {
                direct_gbpages = 0;
                return;
        }

        printk(KERN_INFO "Using GB pages for direct mapping\n");
        page_size_mask |= 1 << PG_LEVEL_1G;
}

/* Build an x86_cpu_id entry matching an Intel family 6 CPU of @_model. */
#define INTEL_MATCH(_model) { .vendor  = X86_VENDOR_INTEL,        \
                              .family  = 6,                        \
                              .model = _model,                        \
                            }
/*
 * INVLPG may not properly flush Global entries
 * on these CPUs when PCIDs are enabled.
 *
 * setup_pcid() disables PCID on any CPU matching this list, which ends
 * with an empty entry.
 */
static const struct x86_cpu_id invlpg_miss_ids[] = {
        INTEL_MATCH(INTEL_FAM6_ALDERLAKE   ),
        INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
        INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ),
        INTEL_MATCH(INTEL_FAM6_RAPTORLAKE  ),
        INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
        INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
        {}
};

/*
 * Decide whether PCID can be used and, if so, enable it on this CPU.
 *
 * Nothing is done on 32-bit kernels or when the CPU has no PCID.  The PCID
 * capability is cleared on CPUs listed in invlpg_miss_ids[] and on CPUs
 * without PGE.  Otherwise CR4.PCIDE is set and, when INVPCID is available,
 * X86_FEATURE_INVPCID_SINGLE is forced on.
 */
static void setup_pcid(void)
{
        if (!IS_ENABLED(CONFIG_X86_64))
                return;

        if (!boot_cpu_has(X86_FEATURE_PCID))
                return;

        if (x86_match_cpu(invlpg_miss_ids)) {
                /* Terminate the message like the other log lines here. */
                pr_info("Incomplete global flushes, disabling PCID\n");
                setup_clear_cpu_cap(X86_FEATURE_PCID);
                return;
        }

        if (boot_cpu_has(X86_FEATURE_PGE)) {
                /*
                 * This can't be cr4_set_bits_and_update_boot() -- the
                 * trampoline code can't handle CR4.PCIDE and it wouldn't
                 * do any good anyway.  Despite the name,
                 * cr4_set_bits_and_update_boot() doesn't actually cause
                 * the bits in question to remain set all the way through
                 * the secondary boot asm.
                 *
                 * Instead, we brute-force it and set CR4.PCIDE manually in
                 * start_secondary().
                 */
                cr4_set_bits(X86_CR4_PCIDE);

                /*
                 * INVPCID's single-context modes (2/3) only work if we set
                 * X86_CR4_PCIDE, *and* we have INVPCID support.  It's
                 * unusable on systems that have X86_CR4_PCIDE clear, or
                 * that have no INVPCID support at all.
                 */
                if (boot_cpu_has(X86_FEATURE_INVPCID))
                        setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
        } else {
                /*
                 * flush_tlb_all(), as currently implemented, won't work if
                 * PCID is on but PGE is not.  Since that combination
                 * doesn't exist on real hardware, there's no reason to try
                 * to fully support it, but it's polite to avoid corrupting
                 * data if we're on an improperly configured VM.
                 */
                setup_clear_cpu_cap(X86_FEATURE_PCID);
        }
}

/*
 * Capacity of the map_range array built for one init_memory_mapping()
 * call; save_mr() panics when asked to store more entries than this.
 */
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
#define NR_RANGE_MR 5
#endif

/*
 * Append the pfn range [start_pfn, end_pfn) to @mr as a byte range with
 * the given @page_size_mask.
 *
 * An empty or inverted range is ignored.  Returns the new number of used
 * entries; panics if @mr already holds NR_RANGE_MR entries.
 */
static int __meminit save_mr(struct map_range *mr, int nr_range,
                             unsigned long start_pfn, unsigned long end_pfn,
                             unsigned long page_size_mask)
{
        struct map_range *slot;

        if (start_pfn >= end_pfn)
                return nr_range;

        if (nr_range >= NR_RANGE_MR)
                panic("run out of range for init_memory_mapping\n");

        slot = &mr[nr_range];
        slot->start = start_pfn << PAGE_SHIFT;
        slot->end = end_pfn << PAGE_SHIFT;
        slot->page_size_mask = page_size_mask;

        return nr_range + 1;
}

/*
 * Upgrade small ranges to a bigger page size where that is safe: if the
 * PMD- (or PUD-) aligned area surrounding a range is entirely memory
 * according to memblock, the range may be mapped with 2M (or 1G) pages
 * even though the range itself is not aligned.
 */
static void __ref adjust_range_page_size_mask(struct map_range *mr,
                                                         int nr_range)
{
        int i;

        for (i = 0; i < nr_range; i++) {
                struct map_range *cur = &mr[i];

                if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
                    !(cur->page_size_mask & (1<<PG_LEVEL_2M))) {
                        unsigned long lo = round_down(cur->start, PMD_SIZE);
                        unsigned long hi = round_up(cur->end, PMD_SIZE);

#ifdef CONFIG_X86_32
                        /* Leave ranges reaching past max_low_pfn alone. */
                        if ((hi >> PAGE_SHIFT) > max_low_pfn)
                                continue;
#endif

                        if (memblock_is_region_memory(lo, hi - lo))
                                cur->page_size_mask |= 1<<PG_LEVEL_2M;
                }

                if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
                    !(cur->page_size_mask & (1<<PG_LEVEL_1G))) {
                        unsigned long lo = round_down(cur->start, PUD_SIZE);
                        unsigned long hi = round_up(cur->end, PUD_SIZE);

                        if (memblock_is_region_memory(lo, hi - lo))
                                cur->page_size_mask |= 1<<PG_LEVEL_1G;
                }
        }
}

/*
 * Return a printable name for the largest page size enabled in
 * @mr->page_size_mask: "1G", "2M" (or "4M"), or "4k" if no large page
 * size is set.
 */
static const char *page_size_string(struct map_range *mr)
{
        static const char str_1g[] = "1G";
        static const char str_2m[] = "2M";
        static const char str_4m[] = "4M";
        static const char str_4k[] = "4k";
        unsigned int mask = mr->page_size_mask;

        if (mask & (1<<PG_LEVEL_1G))
                return str_1g;

        if (!(mask & (1<<PG_LEVEL_2M)))
                return str_4k;

        /*
         * 32-bit without PAE has a 4M large page size.  PG_LEVEL_2M is
         * misnamed there, but the string can at least show the right size.
         */
        if (IS_ENABLED(CONFIG_X86_32) && !IS_ENABLED(CONFIG_X86_PAE))
                return str_4m;

        return str_2m;
}

/*
 * Split the physical range [start, end) into sub-ranges that can each be
 * mapped with a single page size and store them in @mr via save_mr().
 *
 * The range is cut at PMD and (on 64-bit) PUD boundaries into an unaligned
 * 4k head, 2M and 1G aligned middle parts, and an unaligned 4k tail.
 * Before after_bootmem is set the page sizes are then widened by
 * adjust_range_page_size_mask(), and finally adjacent entries with the
 * same page size mask are merged.
 *
 * @mr:       array receiving the sub-ranges
 * @nr_range: number of entries already used in @mr
 * @start:    start address in bytes
 * @end:      end address in bytes
 *
 * Returns the number of entries in @mr after splitting and merging.
 */
static int __meminit split_mem_range(struct map_range *mr, int nr_range,
                                     unsigned long start,
                                     unsigned long end)
{
        unsigned long start_pfn, end_pfn, limit_pfn;
        unsigned long pfn;
        int i;

        limit_pfn = PFN_DOWN(end);

        /* Unaligned head: 4k pages up to the first PMD boundary */
        pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
        /*
         * Don't use a large page for the first 2/4MB of memory
         * because there are often fixed size MTRRs in there
         * and overlapping MTRRs into large pages can cause
         * slowdowns.
         */
        if (pfn == 0)
                end_pfn = PFN_DOWN(PMD_SIZE);
        else
                end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
        end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
        if (end_pfn > limit_pfn)
                end_pfn = limit_pfn;
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
                pfn = end_pfn;
        }

        /* big page (2M) range */
        start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
        end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
        /* On 64-bit stop at the next PUD boundary so 1G pages can follow */
        end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
        if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
                end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif

        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
                pfn = end_pfn;
        }

#ifdef CONFIG_X86_64
        /* big page (1G) range */
        start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
        end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask &
                                 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
                pfn = end_pfn;
        }

        /* tail is not big page (1G) alignment */
        start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
        end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
                pfn = end_pfn;
        }
#endif

        /* tail is not big page (2M) alignment */
        start_pfn = pfn;
        end_pfn = limit_pfn;
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

        if (!after_bootmem)
                adjust_range_page_size_mask(mr, nr_range);

        /* try to merge same page size and continuous */
        for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
                unsigned long old_start;
                if (mr[i].end != mr[i+1].start ||
                    mr[i].page_size_mask != mr[i+1].page_size_mask)
                        continue;
                /*
                 * Drop entry i by shifting the tail down one slot, then
                 * give the entry now at i the start of the dropped one.
                 */
                old_start = mr[i].start;
                memmove(&mr[i], &mr[i+1],
                        (nr_range - 1 - i) * sizeof(struct map_range));
                /* i-- so the merged entry is compared with its new neighbour */
                mr[i--].start = old_start;
                nr_range--;
        }

        for (i = 0; i < nr_range; i++)
                pr_debug(" [mem %#010lx-%#010lx] page %s\n",
                                mr[i].start, mr[i].end - 1,
                                page_size_string(&mr[i]));

        return nr_range;
}

/*
 * pfn ranges recorded by add_pfn_range_mapped(); pfn_range_is_mapped()
 * searches the first nr_pfn_mapped entries.
 */
struct range pfn_mapped[E820_MAX_ENTRIES];
int nr_pfn_mapped;

/*
 * Record [start_pfn, end_pfn) in pfn_mapped[] and raise max_pfn_mapped
 * and, for ranges starting below 4GB, max_low_pfn_mapped accordingly.
 */
static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
        /* First pfn at or above the 4GB boundary. */
        unsigned long pfn_4g = 1UL<<(32-PAGE_SHIFT);

        nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES,
                                             nr_pfn_mapped, start_pfn, end_pfn);
        nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES);

        if (max_pfn_mapped < end_pfn)
                max_pfn_mapped = end_pfn;

        if (start_pfn < pfn_4g) {
                /* Only the part below 4GB counts as "low". */
                unsigned long low_end = min(end_pfn, pfn_4g);

                if (max_low_pfn_mapped < low_end)
                        max_low_pfn_mapped = low_end;
        }
}

bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

        for (i = 0; i < nr_pfn_mapped; i++)
                if ((start_pfn >= pfn_mapped[i].start) &&
                    (end_pfn <= pfn_mapped[i].end))
                        return true;

        return false;
}

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __ref init_memory_mapping(unsigned long start,
                                        unsigned long end, pgprot_t prot)
{
        struct map_range mr[NR_RANGE_MR];
        unsigned long last_mapped = 0;
        int nr_range, i;

        pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n",
               start, end - 1);

        /* Cut [start, end) into pieces that each use one page size. */
        memset(mr, 0, sizeof(mr));
        nr_range = split_mem_range(mr, 0, start, end);

        /* Map the pieces in order; the last call's result is kept. */
        for (i = 0; i < nr_range; i++) {
                const struct map_range *r = &mr[i];

                last_mapped = kernel_physical_mapping_init(r->start, r->end,
                                                           r->page_size_mask,
                                                           prot);
        }

        add_pfn_range_mapped(start >> PAGE_SHIFT, last_mapped >> PAGE_SHIFT);

        return last_mapped >> PAGE_SHIFT;
}

/*
 * We need to iterate through the E820 memory map and create direct mappings
 * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply
 * create direct mappings for all pfns from [0 to max_low_pfn) and
 * [4GB to max_pfn) because of possible memory holes in high addresses
 * that cannot be marked as UC by fixed/variable range MTRRs.
 * Depending on the alignment of E820 ranges, this may possibly result
 * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
 *
 * init_mem_mapping() calls init_range_memory_mapping() with big range.
 * That range would have hole in the middle or ends, and only ram parts
 * will be mapped in init_range_memory_mapping().
 */
static unsigned long __init init_range_memory_mapping(
                                           unsigned long r_start,
                                           unsigned long r_end)
{
        unsigned long start_pfn, end_pfn;
        unsigned long mapped_bytes = 0;
        int i;

        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
                /* Restrict this memory range to [r_start, r_end). */
                u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
                u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
                u64 pgt_free_start, pgt_free_end;

                if (start >= end)
                        continue;

                /*
                 * If the range overlaps the still unused part of the brk
                 * page-table buffer, page tables must come from memblock
                 * while it is being mapped.
                 */
                pgt_free_start = (u64)pgt_buf_end<<PAGE_SHIFT;
                pgt_free_end = (u64)pgt_buf_top<<PAGE_SHIFT;
                can_use_brk_pgt = max(start, pgt_free_start) >=
                                    min(end, pgt_free_end);

                init_memory_mapping(start, end, PAGE_KERNEL);
                mapped_bytes += end - start;

                can_use_brk_pgt = true;
        }

        return mapped_bytes;
}

/* Return the next, larger mapping step size following @step_size. */
static unsigned long __init get_new_step_size(unsigned long step_size)
{
        /*
         * The initial mapped size is PMD_SIZE (2M), and step_size cannot
         * jump to PUD_SIZE (1G) right away.  In the worst case, when the
         * 1G boundary is crossed and PG_LEVEL_2M is not set, 1+1+512 pages
         * (2M + 8k) are needed to map a 1G range with PTEs.  Hence the
         * growth factor is one bit less than the difference between the
         * page table level shifts.
         *
         * Overflow is not a concern in the top-down case: on 32bit, when
         * step_size becomes 0, round_down() returns 0 for start, and that
         * turns it into 0x100000000ULL.
         * In the bottom-up case round_up(x, 0) returns 0 as well, which
         * the bottom-up caller has to take into account.
         */
        const unsigned int growth_shift = PMD_SHIFT - PAGE_SHIFT - 1;

        return step_size << growth_shift;
}

/**
 * memory_map_top_down - Map [map_start, map_end) top down
 * @map_start: start address of the target memory range
 * @map_end: end address of the target memory range
 *
 * This function will setup direct mapping for memory range
 * [map_start, map_end) in top-down. That said, the page tables
 * will be allocated at the end of the memory, and we map the
 * memory in top-down.
 */
static void __init memory_map_top_down(unsigned long map_start,
                                       unsigned long map_end)
{
        unsigned long real_end, start, last_start;
        unsigned long step_size;
        unsigned long addr;
        unsigned long mapped_ram_size = 0;

        /* xen has big range in reserved near end of ram, skip it at first.*/
        /*
         * NOTE(review): the result of memblock_find_in_range() is used
         * unchecked; if it can return 0 here, real_end becomes PMD_SIZE --
         * confirm that this is acceptable.
         */
        addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
        real_end = addr + PMD_SIZE;

        /* step_size need to be small so pgt_buf from BRK could cover it */
        step_size = PMD_SIZE;
        max_pfn_mapped = 0; /* will get exact value next */
        min_pfn_mapped = real_end >> PAGE_SHIFT;
        last_start = start = real_end;

        /*
         * We start from the top (end of memory) and go to the bottom.
         * The memblock_find_in_range() gets us a block of RAM from the
         * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
         * for page table.
         */
        while (last_start > map_start) {
                /* Pick the next lower chunk, never going below map_start */
                if (last_start > step_size) {
                        start = round_down(last_start - 1, step_size);
                        if (start < map_start)
                                start = map_start;
                } else
                        start = map_start;
                mapped_ram_size += init_range_memory_mapping(start,
                                                        last_start);
                last_start = start;
                /* The freshly mapped chunk is now usable for page tables */
                min_pfn_mapped = last_start >> PAGE_SHIFT;
                /* Grow the step once a full step worth of RAM is mapped */
                if (mapped_ram_size >= step_size)
                        step_size = get_new_step_size(step_size);
        }

        /* Finally map what lies above the chunk we started from */
        if (real_end < map_end)
                init_range_memory_mapping(real_end, map_end);
}

/**
 * memory_map_bottom_up - Map [map_start, map_end) bottom up
 * @map_start: start address of the target memory range
 * @map_end: end address of the target memory range
 *
 * This function will setup direct mapping for memory range
 * [map_start, map_end) in bottom-up. Since we have limited the
 * bottom-up allocation above the kernel, the page tables will
 * be allocated just above the kernel and we map the memory
 * in [map_start, map_end) in bottom-up.
 */
static void __init memory_map_bottom_up(unsigned long map_start,
                                        unsigned long map_end)
{
        /* Start small so that the pgt_buf from BRK can cover a step. */
        unsigned long step_size = PMD_SIZE;
        unsigned long mapped_ram_size = 0;
        unsigned long start = map_start;
        unsigned long next;

        min_pfn_mapped = start >> PAGE_SHIFT;

        /*
         * Walk from the bottom (@map_start) to the top (@map_end).  Page
         * table pages for each step are found by alloc_low_pages() via
         * memblock_find_in_range() in [min_pfn_mapped, max_pfn_mapped).
         */
        while (start < map_end) {
                /* Default to finishing the range in this step ... */
                next = map_end;

                /* ... unless more than one (non-zero) step is left. */
                if (step_size && map_end - start > step_size) {
                        next = round_up(start + 1, step_size);
                        if (next > map_end)
                                next = map_end;
                }

                mapped_ram_size += init_range_memory_mapping(start, next);
                start = next;

                if (mapped_ram_size >= step_size)
                        step_size = get_new_step_size(step_size);
        }
}

/*
 * The real mode trampoline, which is required for bootstrapping CPUs
 * occupies only a small area under the low 1MB.  See reserve_real_mode()
 * for details.
 *
 * If KASLR is disabled the first PGD entry of the direct mapping is copied
 * to map the real mode trampoline.
 *
 * If KASLR is enabled, copy only the PUD which covers the low 1MB
 * area. This limits the randomization granularity to 1GB for both 4-level
 * and 5-level paging.
 */
static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
        /* With memory KASLR the dedicated helper sets up the mapping. */
        if (kaslr_memory_enabled()) {
                init_trampoline_kaslr();
                return;
        }

        /* Otherwise reuse the first PGD entry of the direct mapping. */
        trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
#endif
}

/*
 * Set up the kernel direct mapping during boot.
 *
 * Probes the usable page sizes and PCID, maps the ISA range, initialises
 * the trampoline, then maps the remaining memory up to max_pfn (64-bit)
 * or max_low_pfn (32-bit) bottom-up or top-down depending on the memblock
 * allocation direction.  Afterwards it loads swapper_pg_dir, flushes the
 * TLB, calls the hypervisor hook and runs the early memory test.
 */
void __init init_mem_mapping(void)
{
        unsigned long end;

        pti_check_boottime_disable();
        probe_page_size_mask();
        setup_pcid();

#ifdef CONFIG_X86_64
        end = max_pfn << PAGE_SHIFT;
#else
        end = max_low_pfn << PAGE_SHIFT;
#endif

        /* the ISA range is always mapped regardless of memory holes */
        init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);

        /* Init the trampoline, possibly with KASLR memory offset */
        init_trampoline();

        /*
         * If the allocation is in bottom-up direction, we setup direct mapping
         * in bottom-up, otherwise we setup direct mapping in top-down.
         */
        if (memblock_bottom_up()) {
                unsigned long kernel_end = __pa_symbol(_end);

                /*
                 * we need two separate calls here. This is because we want to
                 * allocate page tables above the kernel. So we first map
                 * [kernel_end, end) to make memory above the kernel be mapped
                 * as soon as possible. And then use page tables allocated above
                 * the kernel to map [ISA_END_ADDRESS, kernel_end).
                 */
                memory_map_bottom_up(kernel_end, end);
                memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
        } else {
                memory_map_top_down(ISA_END_ADDRESS, end);
        }

#ifdef CONFIG_X86_64
        if (max_pfn > max_low_pfn) {
                /* can we preserve max_low_pfn ? */
                max_low_pfn = max_pfn;
        }
#else
        early_ioremap_page_table_range_init();
#endif

        /* Switch to the kernel page tables and drop stale translations */
        load_cr3(swapper_pg_dir);
        __flush_tlb_all();

        x86_init.hyper.init_mem_mapping();

        early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}

/*
 * Initialize an mm_struct to be used during poking and a pointer to be used
 * during patching.
 */
void __init poking_init(void)
{
        spinlock_t *ptl;
        pte_t *ptep;

        /* Failing to allocate the poking mm is fatal */
        poking_mm = mm_alloc();
        BUG_ON(!poking_mm);

        /* Xen PV guests need the PGD to be pinned. */
        paravirt_arch_dup_mmap(NULL, poking_mm);

        /*
         * Randomize the poking address, but make sure that the following page
         * will be mapped at the same PMD. We need 2 pages, so find space for 3,
         * and adjust the address if the PMD ends after the first one.
         */
        poking_addr = TASK_UNMAPPED_BASE;
        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
                poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
                        (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);

        /* The second page would start a new PMD: move up by one page */
        if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
                poking_addr += PAGE_SIZE;

        /*
         * We need to trigger the allocation of the page-tables that will be
         * needed for poking now. Later, poking may be performed in an atomic
         * section, which might cause allocation to fail.
         */
        ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
        BUG_ON(!ptep);
        /* Only the allocation was wanted; release the PTE again */
        pte_unmap_unlock(ptep, ptl);
}

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of RAM because that
 * area traditionally contains BIOS code and data regions used by X, dosemu,
 * and similar apps. Since they map the entire memory range, the whole range
 * must be allowed (for mapping), but any areas that would otherwise be
 * disallowed are flagged as being "zero filled" instead of rejected.
 * Access has to be given to non-kernel-ram areas as well, these contain the
 * PCI mmio resources as well as potential bios/acpi data regions.
 *
 * Returns 0 if access is denied, 1 if it is allowed, and 2 if the page
 * may be mapped but is to be shown as all zeros.
 */
int devmem_is_allowed(unsigned long pagenr)
{
        if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE,
                                IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
                        != REGION_DISJOINT) {
                /*
                 * For disallowed memory regions in the low 1MB range,
                 * request that the page be shown as all zeros.
                 */
                if (pagenr < 256)
                        return 2;

                return 0;
        }

        /*
         * This must follow RAM test, since System RAM is considered a
         * restricted resource under CONFIG_STRICT_IOMEM.
         *
         * The address is built with PFN_PHYS(), exactly as for the RAM
         * test above.  An open-coded "pagenr << PAGE_SHIFT" is evaluated
         * in unsigned long and loses the high address bits wherever
         * physical addresses are wider than unsigned long.
         */
        if (iomem_is_exclusive(PFN_PHYS(pagenr))) {
                /* Low 1MB bypasses iomem restrictions. */
                if (pagenr < 256)
                        return 1;

                return 0;
        }

        return 1;
}

/*
 * Release the init memory in [begin, end), labelled @what.
 *
 * Unaligned boundaries trigger a warning and are shrunk inwards to page
 * boundaries.  With pagealloc debugging the pages are not freed but
 * marked not present; otherwise they are made writable and
 * non-executable and handed to free_reserved_area().
 */
void free_init_pages(const char *what, unsigned long begin, unsigned long end)
{
        unsigned long begin_aligned = PAGE_ALIGN(begin);
        unsigned long end_aligned = end & PAGE_MASK;
        unsigned long nr_pages;

        /* Make sure boundaries are page aligned */
        if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
                begin = begin_aligned;
                end   = end_aligned;
        }

        if (begin >= end)
                return;

        nr_pages = (end - begin) >> PAGE_SHIFT;

        if (!debug_pagealloc_enabled()) {
                /*
                 * The kernel text was just marked read only; since part
                 * of it is about to be freed, that part has to become
                 * writeable and non-executable first.
                 */
                set_memory_nx(begin, nr_pages);
                set_memory_rw(begin, nr_pages);

                free_reserved_area((void *)begin, (void *)end,
                                   POISON_FREE_INITMEM, what);
                return;
        }

        /*
         * When debugging page accesses the memory is not freed but marked
         * not present, so any buggy init-section access creates a kernel
         * page fault.
         */
        pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
                begin, end - 1);
        /*
         * Tell kmemleak about the hole, since the corresponding pages are
         * going to be unmapped.
         */
        kmemleak_free_part((void *)begin, end - begin);
        set_memory_np(begin, nr_pages);
}

/*
 * begin/end can be in the direct map or the "high kernel mapping"
 * used for the kernel image only.  free_init_pages() will do the
 * right thing for either kind of address.
 */
void free_kernel_image_pages(const char *what, void *begin, void *end)
{
        unsigned long first = (unsigned long)begin;
        unsigned long last = (unsigned long)end;

        free_init_pages(what, first, last);

        /*
         * The extra unmapping below is only valid for 64bit kernels with
         * PTI.  32bit has only one mapping, which can't be treated in this
         * way for obvious reasons.
         */
        if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_PTI))
                return;

        /*
         * PTI maps some of the kernel into userspace.  For performance,
         * this includes some kernel areas that do not contain secrets.
         * Those areas might be adjacent to the parts of the kernel image
         * being freed, which may contain secrets.  Remove the "high kernel
         * image mapping" for these freed areas, ensuring they are not even
         * potentially vulnerable to Meltdown regardless of the specific
         * optimizations PTI is currently using.
         *
         * The "noalias" prevents unmapping the direct map alias which is
         * needed to access the freed pages.
         */
        set_memory_np_noalias(first, (last - first) >> PAGE_SHIFT);
}

/*
 * Release the init sections of the kernel image, i.e. everything between
 * __init_begin and __init_end.
 */
void __ref free_initmem(void)
{
        /*
         * NOTE(review): presumably these two calls must run before the
         * init memory goes away -- confirm the ordering requirement.
         */
        e820__reallocate_tables();

        mem_encrypt_free_decrypted_mem();

        free_kernel_image_pages("unused kernel image (initmem)",
                                &__init_begin, &__init_end);
}

#ifdef CONFIG_BLK_DEV_INITRD
/* Free the initrd memory in [start, end), rounding @end up to a page. */
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
        /*
         * @end may be unaligned, and it cannot be aligned earlier because
         * the decompressor could be confused by an aligned initrd_end.
         * The partial page at the end has already been reserved in
         *   - i386_start_kernel()
         *   - x86_64_start_kernel()
         *   - relocate_initrd()
         * so rounding up here safely frees that partial page as well.
         */
        unsigned long aligned_end = PAGE_ALIGN(end);

        free_init_pages("initrd", start, aligned_end);
}
#endif

/*
 * Calculate the precise size of the DMA zone (first 16 MB of RAM),
 * and pass it to the MM layer - to help it set zone watermarks more
 * accurately.
 *
 * Done on 64-bit systems only for the time being, although 32-bit systems
 * might benefit from this as well.
 */
void __init memblock_find_dma_reserve(void)
{
#ifdef CONFIG_X86_64
        u64 total_pages = 0, free_pages = 0;
        unsigned long start_pfn, end_pfn;
        phys_addr_t start_addr, end_addr;
        int i;
        u64 u;

        /*
         * Count every page below MAX_DMA_PFN across all memory ranges,
         * free and reserved ones alike.
         */
        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL)
                total_pages += min(end_pfn, MAX_DMA_PFN) -
                               min(start_pfn, MAX_DMA_PFN);

        /*
         * Count the free pages below MAX_DMA_PFN.  Partial pages at the
         * beginning or the end of a free range are left out by rounding
         * the start up and the end down.
         */
        for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
                unsigned long first = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
                unsigned long last = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);

                if (first >= last)
                        continue;

                free_pages += last - first;
        }

        /* Whatever is not free is reserved. */
        set_dma_reserve(total_pages - free_pages);
#endif
}

/*
 * Build the table of per-zone upper PFN limits and hand it to
 * free_area_init().  Entries that are not assigned below stay zero.
 */
void __init zone_sizes_init(void)
{
        unsigned long zone_limits[MAX_NR_ZONES] = { 0 };

#ifdef CONFIG_ZONE_DMA
        zone_limits[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
        zone_limits[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
#endif
        zone_limits[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
        zone_limits[ZONE_HIGHMEM] = max_pfn;
#endif

        free_area_init(zone_limits);
}

/*
 * Static initial contents of the per-CPU TLB state: init_mm is recorded as
 * the loaded mm and next_asid starts at 1.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
        .loaded_mm = &init_mm,
        .next_asid = 1,
        .cr4 = ~0UL,        /* fail hard if we screw up cr4 shadow initialization */
};

/*
 * Associate cache-mode table index @entry with cache mode @cache by
 * updating both lookup tables (index -> mode and mode -> pte bits).
 */
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
        /* entry 0 MUST be WB (hardwired to speed up translations) */
        BUG_ON(entry == 0 && cache != _PAGE_CACHE_MODE_WB);

        __pte2cachemode_tbl[entry] = cache;
        __cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
}

#ifdef CONFIG_SWAP
/*
 * Return the maximum swap file size.  The generic limit is used as-is
 * unless the CPU has the L1TF bug and the L1TF mitigation is not switched
 * off, in which case the result is additionally capped by the L1TF limit.
 */
unsigned long max_swapfile_size(void)
{
        unsigned long pages = generic_max_swapfile_size();
        unsigned long long l1tf_limit;

        if (!boot_cpu_has_bug(X86_BUG_L1TF) || l1tf_mitigation == L1TF_MITIGATION_OFF)
                return pages;

        /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
        l1tf_limit = l1tf_pfn_limit();
        /*
         * Swap offsets are also encoded with 3 bits below those used for
         * the pfn, which makes the usable limit higher.
         */
#if CONFIG_PGTABLE_LEVELS > 2
        l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
#endif
        return min_t(unsigned long long, l1tf_limit, pages);
}
#endif










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BARRIER_H
#define _ASM_X86_BARRIER_H

#include <asm/alternative.h>
#include <asm/nops.h>

/*
 * Force strict CPU ordering.
 * And yes, this might be required on UP too when we're talking
 * to devices.
 *
 * On 64-bit, mb(), rmb() and wmb() are plain mfence, lfence and sfence
 * instructions.  On 32-bit each of them passes a locked add of zero to the
 * stack plus the corresponding fence instruction to ALTERNATIVE(), keyed on
 * X86_FEATURE_XMM2.  Every variant carries a "memory" clobber, so it is a
 * compiler barrier as well.
 */

#ifdef CONFIG_X86_32
#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \
                                      X86_FEATURE_XMM2) ::: "memory", "cc")
#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \
                                       X86_FEATURE_XMM2) ::: "memory", "cc")
#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \
                                       X86_FEATURE_XMM2) ::: "memory", "cc")
#else
#define mb()         asm volatile("mfence":::"memory")
#define rmb()        asm volatile("lfence":::"memory")
#define wmb()        asm volatile("sfence" ::: "memory")
#endif

/**
 * array_index_mask_nospec() - generate a mask that is ~0UL when the
 *         bounds check succeeds and 0 otherwise
 * @index: array element index
 * @size: number of elements in array
 *
 * The mask is computed without a conditional branch: "cmp" sets the carry
 * flag when @index is below @size (unsigned compare), and "sbb" of the
 * output register with itself then produces 0 minus that carry.
 *
 * Returns:
 *     0 - (index < size)
 */
static inline unsigned long array_index_mask_nospec(unsigned long index,
                unsigned long size)
{
        unsigned long mask;

        /* %0 = mask, %1 = size, %2 = index; the flags are clobbered ("cc"). */
        asm volatile ("cmp %1,%2; sbb %0,%0;"
                        :"=r" (mask)
                        :"g"(size),"r" (index)
                        :"cc");
        return mask;
}

/* Override the default implementation from linux/nospec.h. */
#define array_index_mask_nospec array_index_mask_nospec

/*
 * Prevent speculative execution past this barrier.
 *
 * alternative() is given an empty sequence and "lfence", keyed on
 * X86_FEATURE_LFENCE_RDTSC.
 */
#define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC)

/* The DMA read and write barriers are defined as plain barrier() here. */
#define dma_rmb()        barrier()
#define dma_wmb()        barrier()

/*
 * SMP barriers.
 *
 * __smp_mb() is a locked add of zero to the word just below the stack
 * pointer (%esp on 32-bit, %rsp on 64-bit) with "memory" and "cc" clobbers.
 * __smp_rmb() maps to dma_rmb() and __smp_wmb() to barrier().
 * __smp_store_mb() performs the store with xchg() and discards the old
 * value.
 */
#ifdef CONFIG_X86_32
#define __smp_mb()        asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc")
#else
#define __smp_mb()        asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc")
#endif
#define __smp_rmb()        dma_rmb()
#define __smp_wmb()        barrier()
#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)

/*
 * __smp_store_release(p, v) - store @v to *@p with release ordering
 *
 * After a compile-time check of the type of *@p, a compiler barrier()
 * is issued and the value is then written with WRITE_ONCE().  No fence
 * instruction is emitted here.
 */
#define __smp_store_release(p, v)                                        \
do {                                                                        \
        compiletime_assert_atomic_type(*p);                                \
        barrier();                                                        \
        WRITE_ONCE(*p, v);                                                \
} while (0)

/*
 * __smp_load_acquire(p) - load *@p with acquire ordering
 *
 * The value is read with READ_ONCE(), the type of *@p is checked at
 * compile time, and a compiler barrier() follows the load.  The macro
 * evaluates to the value that was read.  No fence instruction is emitted
 * here.
 */
#define __smp_load_acquire(p)                                                \
({                                                                        \
        typeof(*p) ___p1 = READ_ONCE(*p);                                \
        compiletime_assert_atomic_type(*p);                                \
        barrier();                                                        \
        ___p1;                                                                \
})

/*
 * Atomic operations are already serializing on x86, so both hooks expand
 * to an empty statement.
 */
#define __smp_mb__before_atomic()        do { } while (0)
#define __smp_mb__after_atomic()        do { } while (0)

#include <asm-generic/barrier.h>

/*
 * Make previous memory operations globally visible before
 * a WRMSR.
 *
 * MFENCE makes writes visible, but only affects load/store
 * instructions.  WRMSR is unfortunately not a load/store
 * instruction and is unaffected by MFENCE.  The LFENCE ensures
 * that the WRMSR is not reordered.
 *
 * Most WRMSRs are full serializing instructions themselves and
 * do not require this barrier.  This is only required for the
 * IA32_TSC_DEADLINE and X2APIC MSRs.
 *
 * The fence is an MFENCE immediately followed by an LFENCE; the "memory"
 * clobber also makes it a compiler barrier.
 */
static inline void weak_wrmsr_fence(void)
{
        asm volatile("mfence; lfence" : : : "memory");
}

#endif /* _ASM_X86_BARRIER_H */







































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_H
#define _LINUX_RCULIST_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list.h>
#include <linux/rcupdate.h>

/*
 * Why is there no list_empty_rcu()?  Because list_empty() serves this
 * purpose.  The list_empty() function fetches the RCU-protected pointer
 * and compares it to the address of the list head, but neither dereferences
 * this pointer itself nor provides this pointer to the caller.  Therefore,
 * it is not necessary to use rcu_dereference(), so that list_empty() can
 * be used anywhere you would want to use a list_empty_rcu().
 */

/*
 * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
 * @list: list to be initialized
 *
 * You should instead use INIT_LIST_HEAD() for normal initialization and
 * cleanup tasks, when readers have no access to the list being initialized.
 * However, if the list being initialized is visible to readers, you
 * need to keep the compiler from being too mischievous.
 *
 * Both ->next and ->prev are therefore pointed back at @list with
 * WRITE_ONCE().
 */
static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
{
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

/*
 * Return the ->next pointer of a list_head in an RCU-safe way; it must
 * not be accessed directly.  The macro yields an lvalue of type
 * "struct list_head __rcu *" that aliases (list)->next.
 */
#define list_next_rcu(list)        (*((struct list_head __rcu **)(&(list)->next)))

/**
 * list_tail_rcu - returns the prev pointer of the head of the list
 * @head: the head of the list
 *
 * Note: This should only be used with the list header, and even then
 * only if list_del() and similar primitives are not also used on the
 * list header.
 *
 * Like list_next_rcu(), this yields an __rcu-annotated lvalue, here
 * aliasing (head)->prev.
 */
#define list_tail_rcu(head)        (*((struct list_head __rcu **)(&(head)->prev)))

/*
 * Check during list traversal that we are within an RCU reader
 */

/*
 * Expands to nothing.  Because it accepts exactly one argument, expanding
 * it with the "extra" arguments of __list_check_rcu() is meant to break the
 * build when a caller supplies more than one optional argument.
 */
#define check_arg_count_one(dummy)

/*
 * With CONFIG_PROVE_RCU_LIST:
 *  - __list_check_rcu() warns when @cond is false and
 *    rcu_read_lock_any_held() is false as well;
 *  - __list_check_srcu() warns when @cond is false.
 * Without it, neither macro performs a runtime check.
 */
#ifdef CONFIG_PROVE_RCU_LIST
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({                                                                \
        check_arg_count_one(extra);                                        \
        RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(),                \
                         "RCU-list traversed in non-reader section!");        \
        })

#define __list_check_srcu(cond)                                         \
        ({                                                                 \
        RCU_LOCKDEP_WARN(!(cond),                                         \
                "RCU-list traversed without holding the required lock!");\
        })
#else
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({ check_arg_count_one(extra); })

#define __list_check_srcu(cond) ({ })
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * Nothing is modified when __list_add_valid() rejects the arguments.
 */
static inline void __list_add_rcu(struct list_head *new,
                struct list_head *prev, struct list_head *next)
{
        if (!__list_add_valid(new, prev, next))
                return;

        /* Fully set up @new before it becomes reachable from @prev. */
        new->next = next;
        new->prev = prev;
        /* Publish @new as the successor of @prev. */
        rcu_assign_pointer(list_next_rcu(prev), new);
        next->prev = new;
}

/**
 * list_add_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Places @new directly after @head, i.e. in front of the entry that
 * currently follows @head.  This is good for implementing stacks.
 *
 * Updaters must be serialized by the caller (for example by holding an
 * appropriate lock) against every other list-mutation primitive, such as
 * list_add_rcu() or list_del_rcu(), operating on this same list.
 * Running concurrently with the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu(), is perfectly legal.
 */
static inline void list_add_rcu(struct list_head *new, struct list_head *head)
{
        struct list_head *first = head->next;

        __list_add_rcu(new, head, first);
}

/**
 * list_add_tail_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Places @new directly before @head, i.e. behind the entry that is
 * currently last.  This is useful for implementing queues.
 *
 * Updaters must be serialized by the caller (for example by holding an
 * appropriate lock) against every other list-mutation primitive, such as
 * list_add_tail_rcu() or list_del_rcu(), operating on this same list.
 * Running concurrently with the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu(), is perfectly legal.
 */
static inline void list_add_tail_rcu(struct list_head *new,
                                        struct list_head *head)
{
        struct list_head *last = head->prev;

        __list_add_rcu(new, last, head);
}

/**
 * list_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * Note: list_empty() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_del_rcu()
 * or list_add_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_del_rcu(struct list_head *entry)
{
        __list_del_entry(entry);
        /* Only ->prev is poisoned; ->next stays usable for readers. */
        entry->prev = LIST_POISON2;
}

/**
 * hlist_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on the node returns true after this.  That is
 * useful for RCU based lockfree read-side traversal when the writer side
 * must be able to tell whether the entry is still hashed or has already
 * been unhashed.  A node that is already unhashed is left untouched.
 *
 * The forward pointer may still be in use by readers walking the hash
 * list, so it cannot be poisoned; only the pprev pointer is cleared, which
 * is what makes hlist_unhashed() return true afterwards.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_add_head_rcu() or
 * hlist_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_for_each_entry_rcu().
 */
static inline void hlist_del_init_rcu(struct hlist_node *n)
{
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        WRITE_ONCE(n->pprev, NULL);
}

/**
 * list_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically.
 * Note: @old should not be empty.
 *
 * @old keeps its ->next pointer; only its ->prev pointer is poisoned.
 */
static inline void list_replace_rcu(struct list_head *old,
                                struct list_head *new)
{
        /* Give @new the neighbours of @old before it is published. */
        new->next = old->next;
        new->prev = old->prev;
        /* Publish: the predecessor now leads to @new instead of @old. */
        rcu_assign_pointer(list_next_rcu(new->prev), new);
        new->next->prev = new;
        old->prev = LIST_POISON2;
}

/**
 * __list_splice_init_rcu - join an RCU-protected list into an existing list.
 * @list:        the RCU-protected list to splice
 * @prev:        points to the last element of the existing list
 * @next:        points to the first element of the existing list
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The list pointed to by @prev and @next can be RCU-read traversed
 * concurrently with this function.
 *
 * Note that this function blocks.
 *
 * @list is reinitialized to an empty list before @sync is called; its
 * former entries are linked in between @prev and @next afterwards.
 *
 * Important note: the caller must take whatever action is necessary to prevent
 * any other updates to the existing list.  In principle, it is possible to
 * modify the list as soon as sync() begins execution. If this sort of thing
 * becomes necessary, an alternative version based on call_rcu() could be
 * created.  But only if -really- needed -- there is no shortage of RCU API
 * members.
 */
static inline void __list_splice_init_rcu(struct list_head *list,
                                          struct list_head *prev,
                                          struct list_head *next,
                                          void (*sync)(void))
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        /*
         * "first" and "last" now track the list body, so reinitialize the
         * list head.  RCU readers have access to this list, so we must use
         * INIT_LIST_HEAD_RCU() instead of INIT_LIST_HEAD().
         */

        INIT_LIST_HEAD_RCU(list);

        /*
         * At this point, the list body still points to the source list.
         * Wait for any readers to finish using the list before splicing
         * the list body into the new list.  Any new readers will see
         * an empty list.
         */

        sync();
        ASSERT_EXCLUSIVE_ACCESS(*first);
        ASSERT_EXCLUSIVE_ACCESS(*last);

        /*
         * Readers are finished with the source list, so perform splice.
         * The order is important if the new list is global and accessible
         * to concurrent RCU readers.  Note that RCU readers are not
         * permitted to traverse the prev pointers without excluding
         * this function.
         */

        last->next = next;
        rcu_assign_pointer(list_next_rcu(prev), first);
        first->prev = prev;
        next->prev = last;
}

/**
 * list_splice_init_rcu - splice an RCU-protected list into an existing list,
 *                        designed for stacks.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The entries of @list are inserted right after @head.  Nothing is done,
 * and @sync is not called, when @list is empty.
 */
static inline void list_splice_init_rcu(struct list_head *list,
                                        struct list_head *head,
                                        void (*sync)(void))
{
        if (list_empty(list))
                return;

        __list_splice_init_rcu(list, head, head->next, sync);
}

/**
 * list_splice_tail_init_rcu - splice an RCU-protected list into an existing
 *                             list, designed for queues.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The entries of @list are inserted right before @head.  Nothing is done,
 * and @sync is not called, when @list is empty.
 */
static inline void list_splice_tail_init_rcu(struct list_head *list,
                                             struct list_head *head,
                                             void (*sync)(void))
{
        if (list_empty(list))
                return;

        __list_splice_init_rcu(list, head->prev, head, sync);
}

/**
 * list_entry_rcu - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * @ptr is read once with READ_ONCE() and the result is converted to the
 * containing structure with container_of().
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_entry_rcu(ptr, type, member) \
        container_of(READ_ONCE(ptr), type, member)

/*
 * Where are list_empty_rcu() and list_first_entry_rcu()?
 *
 * Implementing those functions following their counterparts list_empty() and
 * list_first_entry() is not advisable because they lead to subtle race
 * conditions as the following snippet shows:
 *
 * if (!list_empty_rcu(mylist)) {
 *        struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
 *        do_something(bar);
 * }
 *
 * The list may not be empty when list_empty_rcu checks it, but it may be when
 * list_first_entry_rcu rereads the ->next pointer.
 *
 * Rereading the ->next pointer is not a problem for list_empty() and
 * list_first_entry() because they would be protected by a lock that blocks
 * writers.
 *
 * See list_first_or_null_rcu for an alternative.
 */

/**
 * list_first_or_null_rcu - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * The ->next pointer of @ptr is sampled once; the emptiness test and the
 * returned entry are both based on that single value.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_first_or_null_rcu(ptr, type, member) \
({ \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
})

/**
 * list_next_or_null_rcu - get the next element from a list
 * @head:        the head for the list.
 * @ptr:        the list head to take the next element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the ptr is at the end of the list, NULL is returned.
 *
 * The ->next pointer of @ptr is sampled once; it is compared against
 * @head to detect the end of the list.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_next_or_null_rcu(head, ptr, type, member) \
({ \
        struct list_head *__head = (head); \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__next != __head) ? list_entry_rcu(__next, type, \
                                                  member) : NULL; \
})

/**
 * list_for_each_entry_rcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * @cond, when given, is handed to __list_check_rcu() once, before the
 * first entry is fetched.  @head is evaluated on every iteration.
 */
#define list_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_srcu        -        iterate over srcu-protected list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike in list_for_each_entry_rcu(), @cond is mandatory here; it is
 * handed to __list_check_srcu() once, before the first entry is fetched.
 */
#define list_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_entry_lockless - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * @ptr is read once with READ_ONCE(), cast back to its own type and then
 * converted with container_of().
 */
#define list_entry_lockless(ptr, type, member) \
        container_of((typeof(ptr))READ_ONCE(ptr), type, member)

/**
 * list_for_each_entry_lockless - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * No lockdep check is performed by this iterator.
 */
#define list_for_each_entry_lockless(pos, head, member) \
        for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
             &pos->member != (head); \
             pos = list_entry_lockless(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_continue_rcu - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position which must have been in the list when the RCU read
 * lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_from_rcu() except
 * this starts after the given position and that one starts at the given
 * position.
 *
 * @pos must therefore already point to a valid entry on entry to the loop.
 */
#define list_for_each_entry_continue_rcu(pos, head, member)                 \
        for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
             &pos->member != (head);        \
             pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_from_rcu - iterate over a list from current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over the tail of a list starting from a given position,
 * which must have been in the list when the RCU read lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_continue_rcu() except
 * this starts from the given position and that one starts from the position
 * after the given position.
 *
 * The loop has no initialization clause: the first body execution sees
 * @pos exactly as the caller passed it in.
 */
#define list_for_each_entry_from_rcu(pos, head, member)                        \
        for (; &(pos)->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member))

/**
 * hlist_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu().
 */
static inline void hlist_del_rcu(struct hlist_node *n)
{
        __hlist_del(n);
        /* Only ->pprev is poisoned; ->next stays usable for readers. */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically.
 *
 * @old keeps its ->next pointer; only its ->pprev pointer is poisoned.
 */
static inline void hlist_replace_rcu(struct hlist_node *old,
                                        struct hlist_node *new)
{
        struct hlist_node *next = old->next;

        /* Give @new the links of @old before it is published. */
        new->next = next;
        WRITE_ONCE(new->pprev, old->pprev);
        /* Publish: whatever pointed at @old now points at @new. */
        rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
        /* @old may have been the last node, in which case there is no successor to fix up. */
        if (next)
                WRITE_ONCE(new->next->pprev, &new->next);
        WRITE_ONCE(old->pprev, LIST_POISON2);
}

/**
 * hlists_swap_heads_rcu - swap the lists the hlist heads point to
 * @left:  The hlist head on the left
 * @right: The hlist head on the right
 *
 * The lists start out as [@left  ][node1 ... ] and
 *                        [@right ][node2 ... ]
 * The lists end up as    [@left  ][node2 ... ]
 *                        [@right ][node1 ... ]
 *
 * Either list may be empty: an empty list has no first node whose
 * ->pprev needs to be re-pointed, so that fix-up is skipped for it.
 */
static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right)
{
        struct hlist_node *node1 = left->first;
        struct hlist_node *node2 = right->first;

        rcu_assign_pointer(left->first, node2);
        rcu_assign_pointer(right->first, node1);
        /* A NULL first node means that list was empty: nothing to fix up. */
        if (node2)
                WRITE_ONCE(node2->pprev, &left->first);
        if (node1)
                WRITE_ONCE(node1->pprev, &right->first);
}

/*
 * Accessors that present an hlist link as an __rcu-annotated pointer lvalue:
 * hlist_first_rcu() is the first element of @head, hlist_next_rcu() is the
 * element after @node, and hlist_pprev_rcu() is the pointer slot that
 * @node->pprev points at.
 */
#define hlist_first_rcu(head)        (*((struct hlist_node __rcu **)(&(head)->first)))
#define hlist_next_rcu(node)        (*((struct hlist_node __rcu **)(&(node)->next)))
#define hlist_pprev_rcu(node)        (*((struct hlist_node __rcu **)((node)->pprev)))

/**
 * hlist_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Inserts @n at the front of @h while permitting racing traversals.
 *
 * Writers must be serialized by the caller (for example by holding the
 * appropriate lock) against every other list-mutation primitive, such as
 * hlist_add_head_rcu() or hlist_del_rcu(), running on this same list.
 * Running concurrently with the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), is perfectly legal.  Regardless of the type
 * of CPU, the list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_head_rcu(struct hlist_node *n,
                                        struct hlist_head *h)
{
        struct hlist_node *old_first = h->first;

        /* Fill in both links of @n before it becomes reachable. */
        n->next = old_first;
        WRITE_ONCE(n->pprev, &h->first);
        /* Publish @n as the new first element. */
        rcu_assign_pointer(hlist_first_rcu(h), n);
        if (!old_first)
                return;
        /* The previous first element is now reached through n->next. */
        WRITE_ONCE(old_first->pprev, &n->next);
}

/**
 * hlist_add_tail_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Appends @n after the last element of @h while permitting racing
 * traversals.  The last element is found by walking the whole list; an
 * empty list is handled by hlist_add_head_rcu().
 *
 * Writers must be serialized by the caller (for example by holding the
 * appropriate lock) against every other list-mutation primitive, such as
 * hlist_add_head_rcu() or hlist_del_rcu(), running on this same list.
 * Running concurrently with the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), is perfectly legal.  Regardless of the type
 * of CPU, the list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_tail_rcu(struct hlist_node *n,
                                      struct hlist_head *h)
{
        struct hlist_node *tail = NULL;
        struct hlist_node *cur;

        /* Note: write side code, so rcu accessors are not needed. */
        for (cur = h->first; cur; cur = cur->next)
                tail = cur;

        if (!tail) {
                /* Empty list: the tail position is the head position. */
                hlist_add_head_rcu(n, h);
                return;
        }

        n->next = tail->next;
        WRITE_ONCE(n->pprev, &tail->next);
        /* Publish @n behind the old last element. */
        rcu_assign_pointer(hlist_next_rcu(tail), n);
}

/**
 * hlist_add_before_rcu
 * @n: the new element to add to the hash list.
 * @next: the existing element to add the new element before.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * before the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_before_rcu(struct hlist_node *n,
                                        struct hlist_node *next)
{
        /* @n takes over the slot that currently points at @next. */
        WRITE_ONCE(n->pprev, next->pprev);
        n->next = next;
        /* Publish: store @n into that slot. */
        rcu_assign_pointer(hlist_pprev_rcu(n), n);
        /* @next is now pointed at by n->next. */
        WRITE_ONCE(next->pprev, &n->next);
}

/**
 * hlist_add_behind_rcu
 * @n: the new element to add to the hash list.
 * @prev: the existing element to add the new element after.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * after the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_behind_rcu(struct hlist_node *n,
                                        struct hlist_node *prev)
{
        /* Fill in both links of @n before it becomes reachable. */
        n->next = prev->next;
        WRITE_ONCE(n->pprev, &prev->next);
        /* Publish: @n becomes the element after @prev. */
        rcu_assign_pointer(hlist_next_rcu(prev), n);
        /* If @prev had a successor, it is now pointed at by n->next. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/*
 * Walk the raw hlist_node links of @head with @pos as the cursor.  The
 * first pointer and every next pointer are loaded with rcu_dereference();
 * the loop ends when @pos becomes NULL.
 */
#define __hlist_for_each_rcu(pos, head)                                \
        for (pos = rcu_dereference(hlist_first_rcu(head));        \
             pos;                                                \
             pos = rcu_dereference(hlist_next_rcu(pos)))

/**
 * hlist_for_each_entry_rcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * __list_check_rcu() is evaluated once, in the loop initializer; the
 * list pointers themselves are loaded with rcu_dereference_raw().
 * @pos is NULL when the loop runs to completion.
 */
#define hlist_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_srcu - iterate over an SRCU-protected list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike hlist_for_each_entry_rcu(), @cond is not optional here.
 * __list_check_srcu(@cond) is evaluated once, in the loop initializer.
 */
#define hlist_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * This is the same as hlist_for_each_entry_rcu() except that it does
 * not do any RCU debugging or tracing: there is no __list_check_rcu()
 * call and the pointers are loaded with rcu_dereference_raw_check().
 */
#define hlist_for_each_entry_rcu_notrace(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * NOTE(review): the pointers are loaded with rcu_dereference_bh(), so the
 * matching read-side marker is presumably rcu_read_lock_bh() rather than
 * rcu_read_lock() - confirm.
 */
#define hlist_for_each_entry_rcu_bh(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * @pos must point at a valid entry on entry: the first element visited is
 * the one after @pos, and @pos itself is not visited.  Pointers are loaded
 * with rcu_dereference_raw().
 */
#define hlist_for_each_entry_continue_rcu(pos, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * @pos must point at a valid entry on entry: the first element visited is
 * the one after @pos, and @pos itself is not visited.  Pointers are loaded
 * with rcu_dereference_bh().
 */
#define hlist_for_each_entry_continue_rcu_bh(pos, member)                \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The first element visited is @pos itself; a NULL @pos makes the loop
 * body run zero times.  Later elements are loaded with
 * rcu_dereference_raw().
 */
#define hlist_for_each_entry_from_rcu(pos, member)                        \
        for (; pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

#endif        /* __KERNEL__ */
#endif




















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NETLINK_H
#define __LINUX_NETLINK_H


#include <linux/capability.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <net/scm.h>
#include <uapi/linux/netlink.h>

struct net;

/*
 * Return @skb's data pointer reinterpreted as a netlink message header.
 * No length or validity check is made here.
 */
static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
{
        struct nlmsghdr *hdr = (struct nlmsghdr *)skb->data;

        return hdr;
}

/*
 * Per-skb netlink flag bits.
 * NOTE(review): presumably stored in netlink_skb_parms.flags - confirm.
 */
enum netlink_skb_flags {
        NETLINK_SKB_DST                = 0x8,        /* Dst set in sendto or sendmsg */
};

/*
 * Netlink's per-skb control data.  NETLINK_CB() obtains it by casting the
 * skb's cb area to this type.
 */
struct netlink_skb_parms {
        struct scm_creds        creds;                /* Skb credentials        */
        __u32                        portid;
        __u32                        dst_group;
        __u32                        flags;
        struct sock                *sk;
        bool                        nsid_is_set;        /* presumably: nsid below is valid - confirm */
        int                        nsid;
};

/*
 * NETLINK_CB() yields the struct netlink_skb_parms overlaid on (skb)->cb as
 * an lvalue; NETLINK_CREDS() yields a pointer to its creds member.
 */
#define NETLINK_CB(skb)                (*(struct netlink_skb_parms*)&((skb)->cb))
#define NETLINK_CREDS(skb)        (&NETLINK_CB((skb)).creds)


void netlink_table_grab(void);
void netlink_table_ungrab(void);

/*
 * Single-bit flags (bit 0 and bit 1).
 * NOTE(review): presumably meant for netlink_kernel_cfg.flags - confirm.
 */
#define NL_CFG_F_NONROOT_RECV        (1 << 0)
#define NL_CFG_F_NONROOT_SEND        (1 << 1)

/*
 * Optional Netlink kernel configuration parameters, passed as @cfg to
 * netlink_kernel_create() / __netlink_kernel_create().
 */
struct netlink_kernel_cfg {
        unsigned int        groups;
        unsigned int        flags;                /* presumably NL_CFG_F_* bits - confirm */
        void                (*input)(struct sk_buff *skb);        /* callback given an skb */
        struct mutex        *cb_mutex;
        int                (*bind)(struct net *net, int group);        /* per-group callback, returns int */
        void                (*unbind)(struct net *net, int group);        /* per-group callback, no result */
        bool                (*compare)(struct net *net, struct sock *sk);
};

struct sock *__netlink_kernel_create(struct net *net, int unit,
                                            struct module *module,
                                            struct netlink_kernel_cfg *cfg);

/*
 * Convenience wrapper around __netlink_kernel_create() that supplies
 * THIS_MODULE as the module argument.  @net, @unit and @cfg are passed
 * through unchanged and the result is returned as is.
 */
static inline struct sock *
netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg)
{
        struct module *owner = THIS_MODULE;

        return __netlink_kernel_create(net, unit, owner, cfg);
}

/*
 * Size in bytes of the cookie[] buffer in struct netlink_ext_ack.
 * This can be increased when necessary - don't expose to userland.
 */
#define NETLINK_MAX_COOKIE_LEN        20

/**
 * struct netlink_ext_ack - netlink extended ACK report struct
 * @_msg: message string to report - don't access directly, use
 *        %NL_SET_ERR_MSG
 * @bad_attr: attribute with error
 * @policy: policy for a bad attribute
 * @cookie: cookie data to return to userspace (for success); holds at
 *          most %NETLINK_MAX_COOKIE_LEN bytes
 * @cookie_len: number of bytes of @cookie that are actually in use
 */
struct netlink_ext_ack {
        const char *_msg;
        const struct nlattr *bad_attr;
        const struct nla_policy *policy;
        u8 cookie[NETLINK_MAX_COOKIE_LEN];
        u8 cookie_len;
};

/* Always use this macro, this allows later putting the
 * message into a separate section or such for things
 * like translation or listing all possible messages.
 * Currently string formatting is not supported (due
 * to the lack of an output buffer.)
 *
 * @msg initializes a static const char array, so it must be a string
 * literal.  @extack is evaluated once and may be NULL, in which case
 * nothing is stored.
 */
#define NL_SET_ERR_MSG(extack, msg) do {                \
        static const char __msg[] = msg;                \
        struct netlink_ext_ack *__extack = (extack);        \
                                                        \
        if (__extack)                                        \
                __extack->_msg = __msg;                        \
} while (0)

/*
 * Like NL_SET_ERR_MSG(), with the message prefixed by KBUILD_MODNAME and
 * ": " through string-literal concatenation.
 */
#define NL_SET_ERR_MSG_MOD(extack, msg)                        \
        NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg)

/*
 * Record @attr as the offending attribute and @pol as its policy in
 * @extack.  Does nothing when @extack is NULL.  @extack is evaluated
 * exactly once, so an argument with side effects is safe.
 */
#define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do {        \
        struct netlink_ext_ack *__extack = (extack);        \
                                                        \
        if (__extack) {                                        \
                __extack->bad_attr = (attr);                \
                __extack->policy = (pol);                \
        }                                                \
} while (0)

/* NL_SET_BAD_ATTR_POLICY() with a NULL policy. */
#define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL)

/*
 * Set the error message, the offending attribute and its policy in one
 * step.  @msg initializes a static const char array, so it must be a
 * string literal.  @extack is evaluated once; nothing is stored when it
 * is NULL.
 */
#define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do {        \
        static const char __msg[] = msg;                        \
        struct netlink_ext_ack *__extack = (extack);                \
                                                                \
        if (__extack) {                                                \
                __extack->_msg = __msg;                                \
                __extack->bad_attr = (attr);                        \
                __extack->policy = (pol);                        \
        }                                                        \
} while (0)

/* NL_SET_ERR_MSG_ATTR_POL() with a NULL policy. */
#define NL_SET_ERR_MSG_ATTR(extack, attr, msg)                \
        NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg)

/*
 * Store @cookie as the extended-ACK cookie: its sizeof(u64) bytes are
 * copied to the start of extack->cookie in host representation and
 * cookie_len is set to that size.  A NULL @extack is silently ignored.
 */
static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack,
                                            u64 cookie)
{
        if (extack) {
                memcpy(extack->cookie, &cookie, sizeof(cookie));
                extack->cookie_len = sizeof(cookie);
        }
}

/*
 * Store @cookie as the extended-ACK cookie: its sizeof(u32) bytes are
 * copied to the start of extack->cookie in host representation and
 * cookie_len is set to that size.  A NULL @extack is silently ignored.
 */
static inline void nl_set_extack_cookie_u32(struct netlink_ext_ack *extack,
                                            u32 cookie)
{
        if (extack) {
                memcpy(extack->cookie, &cookie, sizeof(cookie));
                extack->cookie_len = sizeof(cookie);
        }
}

void netlink_kernel_release(struct sock *sk);
int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
int netlink_change_ngroups(struct sock *sk, unsigned int groups);
void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
                 const struct netlink_ext_ack *extack);
int netlink_has_listeners(struct sock *sk, unsigned int group);
bool netlink_strict_get_check(struct sk_buff *skb);

int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
                      __u32 group, gfp_t allocation);
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
                               __u32 portid, __u32 group, gfp_t allocation,
                               int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
                               void *filter_data);
int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code);
int netlink_register_notifier(struct notifier_block *nb);
int netlink_unregister_notifier(struct notifier_block *nb);

/* finegrained unicast helpers: */
struct sock *netlink_getsockbyfilp(struct file *filp);
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
                      long *timeo, struct sock *ssk);
void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
int netlink_sendskb(struct sock *sk, struct sk_buff *skb);

/*
 * Clone @skb with skb_clone() using @gfp_mask.  Returns the clone, or NULL
 * when skb_clone() fails.  If the original's head is a vmalloc address,
 * the clone also inherits the original's destructor.
 */
static inline struct sk_buff *
netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
        struct sk_buff *copy = skb_clone(skb, gfp_mask);

        /* This is a large skb, set destructor callback to release head */
        if (copy && is_vmalloc_addr(skb->head))
                copy->destructor = skb->destructor;

        return copy;
}

/*
 *        An skb should fit in one page, but the size is capped at 8K so that
 *        userspace does not have to use enormous buffer sizes on recvmsg()
 *        calls just to avoid MSG_TRUNC when PAGE_SIZE is very large.
 *
 *        NLMSG_GOODSIZE is SKB_WITH_OVERHEAD() of the smaller of PAGE_SIZE
 *        and 8192; NLMSG_DEFAULT_SIZE is that value minus NLMSG_HDRLEN.
 */
#if PAGE_SIZE < 8192UL
#define NLMSG_GOODSIZE        SKB_WITH_OVERHEAD(PAGE_SIZE)
#else
#define NLMSG_GOODSIZE        SKB_WITH_OVERHEAD(8192UL)
#endif

#define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN)


/*
 * State for a netlink dump.
 * NOTE(review): presumably one instance per dump in progress, handed to
 * the @dump and @done callbacks - confirm against the implementation.
 */
struct netlink_callback {
        struct sk_buff                *skb;
        const struct nlmsghdr        *nlh;
        int                        (*dump)(struct sk_buff * skb,
                                        struct netlink_callback *cb);
        int                        (*done)(struct netlink_callback *cb);
        void                        *data;
        /* the module that the dump function belongs to */
        struct module                *module;
        struct netlink_ext_ack        *extack;
        u16                        family;
        u16                        answer_flags;
        u32                        min_dump_alloc;
        unsigned int                prev_seq, seq;
        bool                        strict_check;
        /* ctx and args share the same storage (anonymous union) */
        union {
                u8                ctx[48];

                /* args is deprecated. Cast a struct over ctx instead
                 * for proper type safety.
                 */
                long                args[6];
        };
};

/*
 * Identifies a netlink socket by network namespace, port id and protocol.
 * NOTE(review): presumably the payload given to notifier blocks registered
 * with netlink_register_notifier() - confirm.
 */
struct netlink_notify {
        struct net *net;
        u32 portid;
        int protocol;
};

struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags);

/*
 * Parameters for starting a dump, passed as @control to
 * netlink_dump_start() / __netlink_dump_start().
 */
struct netlink_dump_control {
        int (*start)(struct netlink_callback *);
        int (*dump)(struct sk_buff *skb, struct netlink_callback *);
        int (*done)(struct netlink_callback *);
        void *data;
        /* netlink_dump_start() sets this to THIS_MODULE when it is NULL */
        struct module *module;
        u32 min_dump_alloc;
};

int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                                const struct nlmsghdr *nlh,
                                struct netlink_dump_control *control);
static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                                     const struct nlmsghdr *nlh,
                                     struct netlink_dump_control *control)
{
        if (!control->module)
                control->module = THIS_MODULE;

        return __netlink_dump_start(ssk, skb, nlh, control);
}

/*
 * Descriptor passed to netlink_add_tap() / netlink_remove_tap().
 * NOTE(review): @list is presumably the linkage into a list of
 * registered taps - confirm.
 */
struct netlink_tap {
        struct net_device *dev;
        struct module *module;
        struct list_head list;
};

int netlink_add_tap(struct netlink_tap *nt);
int netlink_remove_tap(struct netlink_tap *nt);

bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
                          struct user_namespace *ns, int cap);
bool netlink_ns_capable(const struct sk_buff *skb,
                        struct user_namespace *ns, int cap);
bool netlink_capable(const struct sk_buff *skb, int cap);
bool netlink_net_capable(const struct sk_buff *skb, int cap);

#endif        /* __LINUX_NETLINK_H */


































    1 

    1 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * RCU-based infrastructure for lightweight reader-writer locking
 *
 * Copyright (c) 2015, Red Hat, Inc.
 *
 * Author: Oleg Nesterov <oleg@redhat.com>
 */

#ifndef _LINUX_RCU_SYNC_H_
#define _LINUX_RCU_SYNC_H_

#include <linux/wait.h>
#include <linux/rcupdate.h>

/* Structure to mediate between updaters and fastpath-using readers.  */
struct rcu_sync {
        /* 0 means idle (GP_IDLE): rcu_sync_is_idle() returns true for it */
        int                        gp_state;
        int                        gp_count;
        wait_queue_head_t        gp_wait;

        struct rcu_head                cb_head;
};

/**
 * rcu_sync_is_idle() - Are readers permitted to use their fastpaths?
 * @rsp: Pointer to rcu_sync structure to use for synchronization
 *
 * Returns true when @rsp->gp_state reads as zero (GP_IDLE), which means
 * readers may use their fastpaths.  Must be invoked within some flavor of
 * RCU read-side critical section; lockdep warns otherwise.
 */
static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
{
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
                         "suspicious rcu_sync_is_idle() usage");

        /* Zero is GP_IDLE. */
        return READ_ONCE(rsp->gp_state) == 0;
}

extern void rcu_sync_init(struct rcu_sync *);
extern void rcu_sync_enter_start(struct rcu_sync *);
extern void rcu_sync_enter(struct rcu_sync *);
extern void rcu_sync_exit(struct rcu_sync *);
extern void rcu_sync_dtor(struct rcu_sync *);

/*
 * Static initializer for the struct rcu_sync object called @name: zero
 * gp_state and gp_count, and a wait queue head initialized for
 * name.gp_wait.  cb_head is not named and so is zero-initialized.
 */
#define __RCU_SYNC_INITIALIZER(name) {                                        \
                .gp_state = 0,                                                \
                .gp_count = 0,                                                \
                .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),        \
        }

/* Define a struct rcu_sync named @name, initialized with __RCU_SYNC_INITIALIZER(). */
#define        DEFINE_RCU_SYNC(name)        \
        struct rcu_sync name = __RCU_SYNC_INITIALIZER(name)

#endif /* _LINUX_RCU_SYNC_H_ */



















































































































































































































































































































































































































    1 














































    1 




    1 






















    1 












































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/srcu.h>
#include <linux/interval_tree.h>

struct mmu_notifier_subscriptions;
struct mmu_notifier;
struct mmu_notifier_range;
struct mmu_interval_notifier;

/**
 * enum mmu_notifier_event - reason for the mmu notifier callback
 * @MMU_NOTIFY_UNMAP: either a munmap() that unmaps the range or a mremap()
 * that moves the range
 *
 * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
 * madvise() or replacing a page by another one, ...).
 *
 * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
 * range, i.e. using the vma access permission (vm_page_prot) to update the
 * whole range is enough; there is no need to inspect changes to the CPU page
 * table (mprotect() syscall)
 *
 * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
 * flag for pages in the range, so to mirror those changes the user must
 * inspect the CPU page table (from the end callback).
 *
 * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
 * access flags). The user should soft dirty the page in the end callback to
 * make sure that anyone relying on soft dirtiness catches pages that might be
 * written through non-CPU mappings.
 *
 * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
 * that the mm refcount is zero and the range is no longer accessible.
 *
 * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
 * a device driver to possibly ignore the invalidation if the
 * migrate_pgmap_owner field matches the driver's device private pgmap owner.
 */
enum mmu_notifier_event {
        MMU_NOTIFY_UNMAP = 0,
        MMU_NOTIFY_CLEAR,
        MMU_NOTIFY_PROTECTION_VMA,
        MMU_NOTIFY_PROTECTION_PAGE,
        MMU_NOTIFY_SOFT_DIRTY,
        MMU_NOTIFY_RELEASE,
        MMU_NOTIFY_MIGRATE,
};

/*
 * Single-bit flag (bit 0).
 * NOTE(review): presumably a struct mmu_notifier_range flag saying the
 * invalidation callback may block - confirm.
 */
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

/*
 * Callback table supplied by a subscriber (see the ops pointer in
 * struct mmu_notifier). Each callback is documented in place below.
 */
struct mmu_notifier_ops {
        /*
         * Called either by mmu_notifier_unregister or when the mm is
         * being destroyed by exit_mmap, always before all pages are
         * freed. This can run concurrently with other mmu notifier
         * methods (the ones invoked outside the mm context) and it
         * should tear down all secondary mmu mappings and freeze the
         * secondary mmu. If this method isn't implemented you have to
         * be sure that nothing could possibly write to the pages
         * through the secondary mmu by the time the last thread with
         * tsk->mm == mm exits.
         *
         * As a side note: the pages freed after ->release returns could
         * be immediately reallocated by the gart at an alias physical
         * address with a different cache model, so if ->release isn't
         * implemented because all _software_ driven memory accesses
         * through the secondary mmu are terminated by the time the
         * last thread of this mm quits, you also have to be sure that
         * speculative _hardware_ operations can't allocate dirty
         * cachelines in the cpu that could not be snooped and made
         * coherent with the other read and write operations happening
         * through the gart alias address, so leading to memory
         * corruption.
         */
        void (*release)(struct mmu_notifier *subscription,
                        struct mm_struct *mm);

        /*
         * clear_flush_young is called after the VM has
         * test-and-cleared the young/accessed bitflag in the
         * pte. This way the VM will provide proper aging to the
         * accesses to the page through the secondary MMUs and not
         * only to the ones through the Linux pte.
         * Start-end is necessary in case the secondary MMU is mapping the page
         * at a smaller granularity than the primary MMU.
         */
        int (*clear_flush_young)(struct mmu_notifier *subscription,
                                 struct mm_struct *mm,
                                 unsigned long start,
                                 unsigned long end);

        /*
         * clear_young is a lightweight version of clear_flush_young. Like the
         * latter, it is supposed to test-and-clear the young/accessed bitflag
         * in the secondary pte, but it may omit flushing the secondary tlb.
         */
        int (*clear_young)(struct mmu_notifier *subscription,
                           struct mm_struct *mm,
                           unsigned long start,
                           unsigned long end);

        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
         * frequently used without actually clearing the flag or tearing
         * down the secondary mapping on the page.
         */
        int (*test_young)(struct mmu_notifier *subscription,
                          struct mm_struct *mm,
                          unsigned long address);

        /*
         * change_pte is called in cases where the pte mapping to a page is
         * changed: for example, when ksm remaps a pte to point to a new
         * shared page.
         */
        void (*change_pte)(struct mmu_notifier *subscription,
                           struct mm_struct *mm,
                           unsigned long address,
                           pte_t pte);

        /*
         * invalidate_range_start() and invalidate_range_end() must be
         * paired and are called only when the mmap_lock and/or the
         * locks protecting the reverse maps are held. If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().
         *
         * Invalidation of multiple concurrent ranges may be
         * optionally permitted by the driver. Either way the
         * establishment of sptes is forbidden in the range passed to
         * invalidate_range_start/end for the whole duration of the
         * invalidate_range_start/end critical section.
         *
         * invalidate_range_start() is called when all pages in the
         * range are still mapped and have at least a refcount of one.
         *
         * invalidate_range_end() is called when all pages in the
         * range have been unmapped and the pages have been freed by
         * the VM.
         *
         * The VM will remove the page table entries and potentially
         * the page between invalidate_range_start() and
         * invalidate_range_end(). If the page must not be freed
         * because of pending I/O or other circumstances then the
         * invalidate_range_start() callback (or the initial mapping
         * by the driver) must make sure that the refcount is kept
         * elevated.
         *
         * If the driver increases the refcount when the pages are
         * initially mapped into an address space then either
         * invalidate_range_start() or invalidate_range_end() may
         * decrease the refcount. If the refcount is decreased on
         * invalidate_range_start() then the VM can free pages as page
         * table entries are removed.  If the refcount is only
         * dropped on invalidate_range_end() then the driver itself
         * will drop the last refcount but it must take care to flush
         * any secondary tlb before doing the final free on the
         * page. Pages will no longer be referenced by the linux
         * address space but may still be referenced by sptes until
         * the last refcount is dropped.
         *
         * If mmu_notifier_range_blockable(range) is false then the callback
         * cannot sleep and has to return with -EAGAIN if sleeping would be
         * required. 0 should be returned otherwise. Please note that
         * notifiers that can fail invalidate_range_start are not allowed to
         * implement invalidate_range_end, as there is no mechanism for
         * informing the notifier that its start failed.
         */
        int (*invalidate_range_start)(struct mmu_notifier *subscription,
                                      const struct mmu_notifier_range *range);
        void (*invalidate_range_end)(struct mmu_notifier *subscription,
                                     const struct mmu_notifier_range *range);

        /*
         * invalidate_range() is either called between
         * invalidate_range_start() and invalidate_range_end() when the
         * VM has to free pages that were unmapped, but before the
         * pages are actually freed, or outside of _start()/_end() when
         * a (remote) TLB flush is necessary.
         *
         * If invalidate_range() is used to manage a non-CPU TLB with
         * shared page-tables, it is not necessary to implement the
         * invalidate_range_start()/end() notifiers, as
         * invalidate_range() already catches the points in time when an
         * external TLB range needs to be flushed. For more in depth
         * discussion on this see Documentation/vm/mmu_notifier.rst
         *
         * Note that this function might be called with just a sub-range
         * of what was passed to invalidate_range_start()/end(), if
         * called between those functions.
         */
        void (*invalidate_range)(struct mmu_notifier *subscription,
                                 struct mm_struct *mm,
                                 unsigned long start,
                                 unsigned long end);

        /*
         * These callbacks are used with the get/put interface to manage the
         * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
         * notifier for use with the mm.
         *
         * free_notifier() is only called after the mmu_notifier has been
         * fully put, calls to any ops callback are prevented and no ops
         * callbacks are currently running. It is called from a SRCU callback
         * and cannot sleep.
         */
        struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
        void (*free_notifier)(struct mmu_notifier *subscription);
};

/*
 * One subscription of a secondary MMU user to an mm.
 *
 * The notifier chains are protected by mmap_lock and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_lock locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_lock is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
struct mmu_notifier {
        /* Linkage into a notifier chain (presumably the per-mm list; confirm). */
        struct hlist_node hlist;
        /* Callback table implemented by the subscriber. */
        const struct mmu_notifier_ops *ops;
        /* The mm this subscription is attached to. */
        struct mm_struct *mm;
        /* NOTE(review): presumably used for the deferred free_notifier() call; confirm. */
        struct rcu_head rcu;
        /* NOTE(review): looks like the get/put reference count; confirm in mm/mmu_notifier.c. */
        unsigned int users;
};

/**
 * struct mmu_interval_notifier_ops - callbacks for an interval subscription
 * @invalidate: Upon return the caller must stop using any SPTEs within this
 *              range. This function can sleep. Return false only if sleeping
 *              was required but mmu_notifier_range_blockable(range) is false.
 *              The callback receives the subscription, the range being
 *              invalidated and the current sequence number (cur_seq), which
 *              is the value to hand to mmu_interval_set_seq().
 */
struct mmu_interval_notifier_ops {
        bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
                           const struct mmu_notifier_range *range,
                           unsigned long cur_seq);
};

/*
 * A subscription covering a single virtual address interval of an mm.
 * Set up with mmu_interval_notifier_insert() and torn down with
 * mmu_interval_notifier_remove().
 */
struct mmu_interval_notifier {
        /* Interval tree node (presumably holds the covered VA range; confirm). */
        struct interval_tree_node interval_tree;
        /* Callback table; see struct mmu_interval_notifier_ops. */
        const struct mmu_interval_notifier_ops *ops;
        /* The mm this subscription belongs to. */
        struct mm_struct *mm;
        /* NOTE(review): presumably queues deferred insert/remove work; confirm. */
        struct hlist_node deferred_item;
        /*
         * Written by mmu_interval_set_seq() and compared against the caller's
         * sequence by mmu_interval_read_retry()/mmu_interval_check_retry().
         */
        unsigned long invalidate_seq;
};

#ifdef CONFIG_MMU_NOTIFIER

#ifdef CONFIG_LOCKDEP
/*
 * lockdep map acquired and released around the invalidate_range_start
 * notifier calls by the inline wrappers in this header.
 */
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif

/*
 * Description of one invalidation, passed to the range notifier callbacks.
 * Filled in by mmu_notifier_range_init(); migrate_pgmap_owner is only set
 * by mmu_notifier_range_init_migrate().
 */
struct mmu_notifier_range {
        /* vma handed to mmu_notifier_range_init() by the caller. */
        struct vm_area_struct *vma;
        /* mm whose notifiers are consulted (see mm_has_notifiers(range->mm)). */
        struct mm_struct *mm;
        /* Bounds of the range as supplied by the caller. */
        unsigned long start;
        unsigned long end;
        /* Holds MMU_NOTIFIER_RANGE_BLOCKABLE. */
        unsigned flags;
        /* Reason for the invalidation. */
        enum mmu_notifier_event event;
        /* Only meaningful for MMU_NOTIFY_MIGRATE; see enum mmu_notifier_event. */
        void *migrate_pgmap_owner;
};

static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return unlikely(mm->notifier_subscriptions);
}

struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
                                             struct mm_struct *mm);

/*
 * Convenience wrapper around mmu_notifier_get_locked(): holds the mmap write
 * lock of @mm across the call and hands back whatever
 * mmu_notifier_get_locked() returned.
 */
static inline struct mmu_notifier *
mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
{
        struct mmu_notifier *subscription;

        mmap_write_lock(mm);
        subscription = mmu_notifier_get_locked(ops, mm);
        mmap_write_unlock(mm);

        return subscription;
}
/*
 * Out-of-line subscription management API. Only the declarations live in
 * this header.
 */
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);

extern int mmu_notifier_register(struct mmu_notifier *subscription,
                                 struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *subscription,
                                   struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *subscription,
                                    struct mm_struct *mm);

/*
 * Interval notifier API. mmu_interval_read_begin() returns the sequence
 * value that is later passed to mmu_interval_read_retry() or
 * mmu_interval_check_retry().
 */
unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long length,
                                 const struct mmu_interval_notifier_ops *ops);
int mmu_interval_notifier_insert_locked(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        unsigned long start, unsigned long length,
        const struct mmu_interval_notifier_ops *ops);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);

/**
 * mmu_interval_set_seq - Save the invalidation sequence
 * @interval_sub: The subscription passed to invalidate
 * @cur_seq: The cur_seq passed to the invalidate() callback
 *
 * Records @cur_seq in the subscription so that a later
 * mmu_interval_read_retry() can compare against it. The provided cur_seq
 * will always be odd.
 *
 * This must be called unconditionally from the invalidate callback of a
 * struct mmu_interval_notifier_ops, under the same lock that is used to call
 * mmu_interval_read_retry().
 *
 * A caller that uses neither mmu_interval_read_begin() nor
 * mmu_interval_read_retry() does not need to make this call.
 */
static inline void mmu_interval_set_seq(
        struct mmu_interval_notifier *interval_sub, unsigned long cur_seq)
{
        WRITE_ONCE(interval_sub->invalidate_seq, cur_seq);
}

/**
 * mmu_interval_read_retry - End a read side critical section against a VA range
 * @interval_sub: The subscription
 * @seq: The return of the paired mmu_interval_read_begin()
 *
 * This MUST be called under a user provided lock that is also held
 * unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
 *
 * Each call should be paired with a single mmu_interval_read_begin() and
 * should be used to conclude the read side.
 *
 * Return: true if an invalidation collided with this critical section (the
 * stored sequence no longer equals @seq), in which case the caller should
 * retry.
 */
static inline bool
mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
                        unsigned long seq)
{
        unsigned long latest = interval_sub->invalidate_seq;

        return latest != seq;
}

/**
 * mmu_interval_check_retry - Test if a collision has occurred
 * @interval_sub: The subscription
 * @seq: The return of the matching mmu_interval_read_begin()
 *
 * This can be used in the critical section between mmu_interval_read_begin()
 * and mmu_interval_read_retry().  A return of true indicates an invalidation
 * has collided with this critical region and a future
 * mmu_interval_read_retry() will return true.
 *
 * False is not reliable and only suggests a collision may not have
 * occurred. It can be called many times and does not have to hold the user
 * provided lock.
 *
 * This call can be used as part of loops and other expensive operations to
 * expedite a retry.
 */
static inline bool
mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
                         unsigned long seq)
{
        /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
        unsigned long latest = READ_ONCE(interval_sub->invalidate_seq);

        return latest != seq;
}

/*
 * Out-of-line slow paths. The inline mmu_notifier_*() wrappers in this header
 * call the matching __mmu_notifier_*() function only after mm_has_notifiers()
 * has returned true.
 */
extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
                                      unsigned long start,
                                      unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
                                      unsigned long address, pte_t pte);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
/* only_end: true from the _only_end() wrapper, false from the _end() wrapper. */
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
                                  bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
                                  unsigned long start, unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);

/*
 * Return true when MMU_NOTIFIER_RANGE_BLOCKABLE is set in range->flags,
 * false otherwise.
 */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE) != 0;
}

/* Forward to __mmu_notifier_release() only if @mm has notifiers. */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_release(mm);
}

/*
 * Return the result of __mmu_notifier_clear_flush_young() for [start, end)
 * when @mm has notifiers, and 0 when it has none.
 */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_flush_young(mm, start, end);
}

/*
 * Return the result of __mmu_notifier_clear_young() for [start, end) when
 * @mm has notifiers, and 0 when it has none.
 */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_young(mm, start, end);
}

/*
 * Return the result of __mmu_notifier_test_young() for @address when @mm has
 * notifiers, and 0 when it has none.
 */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_test_young(mm, address);
}

/* Forward the pte change at @address to the notifiers of @mm, if any. */
static inline void mmu_notifier_change_pte(struct mm_struct *mm,
                                           unsigned long address, pte_t pte)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_change_pte(mm, address, pte);
}

/*
 * Blocking variant of the range-start notification. May sleep.
 *
 * When range->mm has notifiers, MMU_NOTIFIER_RANGE_BLOCKABLE is set in
 * range->flags before __mmu_notifier_invalidate_range_start() is called; its
 * return value is ignored here. The call is bracketed by the
 * __mmu_notifier_invalidate_range_start_map lockdep annotation.
 */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
        struct mm_struct *mm = range->mm;

        might_sleep();

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(mm)) {
                range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
                __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

/*
 * Non-blocking variant of the range-start notification.
 *
 * When range->mm has notifiers, MMU_NOTIFIER_RANGE_BLOCKABLE is cleared in
 * range->flags and the result of __mmu_notifier_invalidate_range_start() is
 * returned; otherwise 0 is returned. The call is bracketed by the
 * __mmu_notifier_invalidate_range_start_map lockdep annotation.
 */
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        struct mm_struct *mm = range->mm;
        int ret = 0;

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(mm)) {
                range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
                ret = __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);

        return ret;
}

/*
 * Range-end notification. Annotated with might_sleep() only when the range
 * is blockable; forwards to __mmu_notifier_invalidate_range_end() with
 * only_end == false when range->mm has notifiers.
 */
static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
        if (mmu_notifier_range_blockable(range))
                might_sleep();

        if (!mm_has_notifiers(range->mm))
                return;

        __mmu_notifier_invalidate_range_end(range, false);
}

/*
 * Like mmu_notifier_invalidate_range_end() but passes only_end == true to
 * __mmu_notifier_invalidate_range_end() and carries no might_sleep()
 * annotation.
 */
static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
        if (!mm_has_notifiers(range->mm))
                return;

        __mmu_notifier_invalidate_range_end(range, true);
}

/* Forward the [start, end) invalidation to the notifiers of @mm, if any. */
static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_invalidate_range(mm, start, end);
}

/*
 * Start @mm out with no notifier subscriptions, so that mm_has_notifiers()
 * reports false for it.
 */
static inline void
mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
        mm->notifier_subscriptions = NULL;
}

/* Call __mmu_notifier_subscriptions_destroy() only if @mm has notifiers. */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_subscriptions_destroy(mm);
}


/*
 * Fill in @range from the arguments. migrate_pgmap_owner is deliberately
 * left untouched; use mmu_notifier_range_init_migrate() to set it.
 */
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
                                           enum mmu_notifier_event event,
                                           unsigned flags,
                                           struct vm_area_struct *vma,
                                           struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        /* What is being invalidated ... */
        range->mm = mm;
        range->vma = vma;
        range->start = start;
        range->end = end;

        /* ... and why / how. */
        range->event = event;
        range->flags = flags;
}

/*
 * Initialise @range for an MMU_NOTIFY_MIGRATE invalidation and record @pgmap
 * in range->migrate_pgmap_owner.
 */
static inline void
mmu_notifier_range_init_migrate(struct mmu_notifier_range *range,
                                unsigned int flags,
                                struct vm_area_struct *vma,
                                struct mm_struct *mm,
                                unsigned long start, unsigned long end,
                                void *pgmap)
{
        mmu_notifier_range_init(range, MMU_NOTIFY_MIGRATE, flags, vma, mm,
                                start, end);
        range->migrate_pgmap_owner = pgmap;
}

/*
 * ptep_clear_flush_young() on the primary pte, OR-ed with
 * mmu_notifier_clear_flush_young() over the PAGE_SIZE range starting at
 * __address. Evaluates to the combined "young" result.
 */
#define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        int __young = ptep_clear_flush_young(___vma, ___address, __ptep);\
                                                                        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                        ___address, ___address + PAGE_SIZE);                \
        __young;                                                        \
})

/*
 * pmdp_clear_flush_young() on the primary pmd, OR-ed with
 * mmu_notifier_clear_flush_young() over the PMD_SIZE range starting at
 * __address. Evaluates to the combined "young" result.
 */
#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        int __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);\
                                                                        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                        ___address, ___address + PMD_SIZE);                \
        __young;                                                        \
})

/*
 * ptep_test_and_clear_young() on the primary pte, OR-ed with
 * mmu_notifier_clear_young() over the PAGE_SIZE range starting at __address.
 * Evaluates to the combined "young" result.
 */
#define ptep_clear_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        int __young = ptep_test_and_clear_young(___vma, ___address,        \
                                                __ptep);                \
                                                                        \
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * pmdp_test_and_clear_young() on the primary pmd, OR-ed with
 * mmu_notifier_clear_young() over the PMD_SIZE range starting at __address.
 * Evaluates to the combined "young" result.
 */
#define pmdp_clear_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        int __young = pmdp_test_and_clear_young(___vma, ___address,        \
                                                __pmdp);                \
                                                                        \
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PMD_SIZE);        \
        __young;                                                        \
})

/*
 * Clear and flush the pte at __address with ptep_clear_flush(), then ask the
 * mmu notifiers of the vma's mm to invalidate the page-aligned, PAGE_SIZE
 * long range containing it. Evaluates to the pte returned by
 * ptep_clear_flush().
 *
 * __vma and __address are captured once in locals, so an argument
 * expression is neither mis-parsed next to '&' nor evaluated twice.
 */
#define        ptep_clear_flush_notify(__vma, __address, __ptep)                \
({                                                                        \
        struct vm_area_struct *___vma = (__vma);                        \
        unsigned long ___address = (__address);                                \
        unsigned long ___addr = ___address & PAGE_MASK;                        \
        struct mm_struct *___mm = ___vma->vm_mm;                        \
        pte_t ___pte;                                                        \
                                                                        \
        ___pte = ptep_clear_flush(___vma, ___address, __ptep);                \
        mmu_notifier_invalidate_range(___mm, ___addr,                        \
                                        ___addr + PAGE_SIZE);                \
                                                                        \
        ___pte;                                                                \
})

/*
 * Clear and flush the huge pmd at __haddr with pmdp_huge_clear_flush(), then
 * ask the mmu notifiers of the vma's mm to invalidate the HPAGE_PMD_SIZE
 * range obtained by masking the address with HPAGE_PMD_MASK. Evaluates to
 * the pmd returned by pmdp_huge_clear_flush().
 *
 * __vma and __haddr are captured once in locals, so an argument expression
 * is neither mis-parsed next to '&' nor evaluated twice.
 */
#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd)                \
({                                                                        \
        struct vm_area_struct *___vma = (__vma);                        \
        unsigned long ___address = (__haddr);                                \
        unsigned long ___haddr = ___address & HPAGE_PMD_MASK;                \
        struct mm_struct *___mm = ___vma->vm_mm;                        \
        pmd_t ___pmd;                                                        \
                                                                        \
        ___pmd = pmdp_huge_clear_flush(___vma, ___address, __pmd);        \
        mmu_notifier_invalidate_range(___mm, ___haddr,                        \
                                      ___haddr + HPAGE_PMD_SIZE);        \
                                                                        \
        ___pmd;                                                                \
})

/*
 * Clear and flush the huge pud at __haddr with pudp_huge_clear_flush(), then
 * ask the mmu notifiers of the vma's mm to invalidate the HPAGE_PUD_SIZE
 * range obtained by masking the address with HPAGE_PUD_MASK. Evaluates to
 * the pud returned by pudp_huge_clear_flush().
 *
 * __vma and __haddr are captured once in locals, so an argument expression
 * is neither mis-parsed next to '&' nor evaluated twice.
 */
#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud)                \
({                                                                        \
        struct vm_area_struct *___vma = (__vma);                        \
        unsigned long ___address = (__haddr);                                \
        unsigned long ___haddr = ___address & HPAGE_PUD_MASK;                \
        struct mm_struct *___mm = ___vma->vm_mm;                        \
        pud_t ___pud;                                                        \
                                                                        \
        ___pud = pudp_huge_clear_flush(___vma, ___address, __pud);        \
        mmu_notifier_invalidate_range(___mm, ___haddr,                        \
                                      ___haddr + HPAGE_PUD_SIZE);        \
                                                                        \
        ___pud;                                                                \
})

/*
 * set_pte_at_notify() runs the change_pte notifier first and only then sets
 * the pte.
 *
 * Starting with the secondary MMUs is safe because the primary MMU pte
 * invalidate must already have happened, via ptep_clear_flush(), before
 * set_pte_at_notify() is invoked. Starting with them is required when both
 * the protection of the mapping (read-only to read-write) and the pfn change,
 * as during copy on write page faults: otherwise the old page would remain
 * mapped readonly in the secondary MMUs after the new page is already
 * writable by some CPU through the primary MMU.
 */
#define set_pte_at_notify(__mm, __address, __ptep, __pte)                \
({                                                                        \
        struct mm_struct *___mm = __mm;                                        \
        unsigned long ___address = __address;                                \
        pte_t ___pte = __pte;                                                \
                                                                        \
        /* secondary MMUs first ... */                                        \
        mmu_notifier_change_pte(___mm, ___address, ___pte);                \
        /* ... then the primary pte */                                        \
        set_pte_at(___mm, ___address, __ptep, ___pte);                        \
})

#else /* CONFIG_MMU_NOTIFIER */

/*
 * Reduced range descriptor for builds without CONFIG_MMU_NOTIFIER: only the
 * bounds are kept (see _mmu_notifier_range_init()).
 */
struct mmu_notifier_range {
        unsigned long start;
        unsigned long end;
};

/* Record the [start, end) bounds in @range; nothing else is stored. */
static inline void
_mmu_notifier_range_init(struct mmu_notifier_range *range,
                         unsigned long start, unsigned long end)
{
        range->end = end;
        range->start = start;
}

/*
 * Without CONFIG_MMU_NOTIFIER only the bounds are recorded. The event,
 * flags, vma, mm and pgmap arguments are dropped and never evaluated.
 */
#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) \
        _mmu_notifier_range_init(range, start, end)
#define mmu_notifier_range_init_migrate(range, flags, vma, mm, start, end, \
                                        pgmap)                                   \
        _mmu_notifier_range_init(range, start, end)

/* Without CONFIG_MMU_NOTIFIER every range is reported as blockable. */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        const bool blockable = true;

        return blockable;
}

/* Without CONFIG_MMU_NOTIFIER no mm ever has notifiers. */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        const int has_notifiers = 0;

        return has_notifiers;
}

/* No-op without CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_release(struct mm_struct *mm) { }

/*
 * Without CONFIG_MMU_NOTIFIER there is no secondary MMU to age, so the
 * range is always reported as not young (0).
 */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        return 0;
}

/*
 * Stub counterpart of the CONFIG_MMU_NOTIFIER mmu_notifier_clear_young(),
 * provided so that both configurations export the same set of wrappers.
 * Always returns 0.
 */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        return 0;
}

/* Without CONFIG_MMU_NOTIFIER the address is never reported young (0). */
static inline int
mmu_notifier_test_young(struct mm_struct *mm, unsigned long address)
{
        const int young = 0;

        return young;
}

/* No-op without CONFIG_MMU_NOTIFIER. */
static inline void
mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte)
{
}

/* No-op without CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_invalidate_range_start(
        struct mmu_notifier_range *range)
{
}

/* Without CONFIG_MMU_NOTIFIER there is nothing that could fail: returns 0. */
static inline int mmu_notifier_invalidate_range_start_nonblock(
        struct mmu_notifier_range *range)
{
        const int ret = 0;

        return ret;
}

/* No-op without CONFIG_MMU_NOTIFIER. */
static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}

/* No-op without CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_invalidate_range_only_end(
        struct mmu_notifier_range *range)
{
}

/* No-op without CONFIG_MMU_NOTIFIER. */
static inline void
mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start,
                              unsigned long end)
{
}

/* No-op without CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { }

/* No-op without CONFIG_MMU_NOTIFIER. */
static inline void
mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
}

/* Without CONFIG_MMU_NOTIFIER this check is a constant false. */
#define mmu_notifier_range_update_to_read_only(r) false

/*
 * Without CONFIG_MMU_NOTIFIER the *_notify helpers are plain aliases of the
 * primary-MMU operations they wrap in the CONFIG_MMU_NOTIFIER case.
 */
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define        ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
#define set_pte_at_notify set_pte_at

/* No-op without CONFIG_MMU_NOTIFIER. */
static inline void mmu_notifier_synchronize(void) { }

#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */




































































































































    1 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: common low-level thread information accessors
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds
 */

#ifndef _LINUX_THREAD_INFO_H
#define _LINUX_THREAD_INFO_H

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#include <linux/errno.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
/*
 * 'current' is cast directly to a struct thread_info pointer.
 * NOTE(review): this presumably relies on thread_info sitting at the very
 * start of the task structure; confirm against the task_struct layout.
 */
#define current_thread_info() ((struct thread_info *)current)
#endif

#include <linux/bitops.h>

/*
 * For per-arch arch_within_stack_frames() implementations, defined in
 * asm/thread_info.h.
 *
 * BAD_STACK is the only negative value; the remaining enumerators count up
 * from NOT_STACK == 0 (GOOD_FRAME == 1, GOOD_STACK == 2).
 */
enum {
        BAD_STACK = -1,
        NOT_STACK = 0,
        GOOD_FRAME,
        GOOD_STACK,
};

#include <asm/thread_info.h>

#ifdef __KERNEL__

/*
 * Default arch hook invoked by set_restart_fn(): expands to an empty
 * statement unless the architecture has already defined its own version.
 */
#ifndef arch_set_restart_data
#define arch_set_restart_data(restart) do { } while (0)
#endif

/*
 * Install @fn as the restart callback of @restart, give the architecture a
 * chance to record extra data via arch_set_restart_data(), and return
 * -ERESTART_RESTARTBLOCK so the caller can pass it straight on.
 */
static inline long
set_restart_fn(struct restart_block *restart,
               long (*fn)(struct restart_block *))
{
        restart->fn = fn;
        arch_set_restart_data(restart);

        return -ERESTART_RESTARTBLOCK;
}

/* THREAD_ALIGN defaults to THREAD_SIZE unless it has already been defined. */
#ifndef THREAD_ALIGN
#define THREAD_ALIGN        THREAD_SIZE
#endif

/*
 * GFP mask combining GFP_KERNEL_ACCOUNT with __GFP_ZERO.
 * NOTE(review): presumably used for thread stack / thread_info allocations;
 * confirm at the call sites.
 */
#define THREADINFO_GFP                (GFP_KERNEL_ACCOUNT | __GFP_ZERO)

/*
 * flag set/clear/test wrappers
 * - pass TIF_xxxx constants to these functions
 */

/* Set bit @flag (a TIF_xxxx constant) in @ti->flags via set_bit(). */
static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        set_bit(flag, flags_word);
}

/* Clear bit @flag (a TIF_xxxx constant) in @ti->flags via clear_bit(). */
static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        clear_bit(flag, flags_word);
}

/*
 * Force bit @flag in @ti->flags to match @value: set it when @value is
 * true, clear it otherwise.
 */
static inline void update_ti_thread_flag(struct thread_info *ti, int flag,
                                         bool value)
{
        if (!value) {
                clear_ti_thread_flag(ti, flag);
                return;
        }

        set_ti_thread_flag(ti, flag);
}

/*
 * Set bit @flag in @ti->flags and hand back whatever test_and_set_bit()
 * reports for it.
 */
static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        return test_and_set_bit(flag, flags_word);
}

/*
 * Clear bit @flag in @ti->flags and hand back whatever
 * test_and_clear_bit() reports for it.
 */
static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        return test_and_clear_bit(flag, flags_word);
}

/* Report the result of test_bit() for bit @flag in @ti->flags. */
static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        return test_bit(flag, flags_word);
}

/*
 * Convenience wrappers that apply the *_ti_thread_flag() helpers to
 * current_thread_info(), i.e. to the calling thread's own flags.
 */
#define set_thread_flag(flag) \
        set_ti_thread_flag(current_thread_info(), flag)
#define clear_thread_flag(flag) \
        clear_ti_thread_flag(current_thread_info(), flag)
#define update_thread_flag(flag, value) \
        update_ti_thread_flag(current_thread_info(), flag, value)
#define test_and_set_thread_flag(flag) \
        test_and_set_ti_thread_flag(current_thread_info(), flag)
#define test_and_clear_thread_flag(flag) \
        test_and_clear_ti_thread_flag(current_thread_info(), flag)
#define test_thread_flag(flag) \
        test_ti_thread_flag(current_thread_info(), flag)

/* Tests TIF_NEED_RESCHED on the calling thread. */
#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)

#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
/*
 * Generic fallback used when the architecture does not supply its own
 * arch_within_stack_frames().  It inspects none of its arguments and
 * always reports 0 (the value of NOT_STACK in this header's enum).
 */
static inline int arch_within_stack_frames(const void * const stack,
                                           const void * const stackend,
                                           const void *obj, unsigned long len)
{
        const int verdict = 0;

        return verdict;
}
#endif

#ifdef CONFIG_HARDENED_USERCOPY
extern void __check_object_size(const void *ptr, unsigned long n,
                                        bool to_user);

/*
 * Hardened-usercopy entry point: forwards to __check_object_size() unless
 * the compiler can prove @n is a compile-time constant, in which case the
 * check is skipped.
 */
static __always_inline void check_object_size(const void *ptr, unsigned long n,
                                              bool to_user)
{
        if (__builtin_constant_p(n))
                return;

        __check_object_size(ptr, n, to_user);
}
#else
/* Without CONFIG_HARDENED_USERCOPY the check is a no-op. */
static inline void check_object_size(const void *ptr, unsigned long n,
                                     bool to_user)
{
}
#endif /* CONFIG_HARDENED_USERCOPY */

extern void __compiletime_error("copy source size is too small")
__bad_copy_from(void);
extern void __compiletime_error("copy destination size is too small")
__bad_copy_to(void);

/*
 * Emit an unconditional WARN() reporting that a copy of @count bytes was
 * attempted on an object of @size bytes.
 */
static inline void copy_overflow(int size, unsigned long count)
{
        WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
}

/*
 * check_copy_size - validate a copy of @bytes bytes involving @addr
 * @addr:      kernel object being copied from or to
 * @bytes:     number of bytes to copy
 * @is_source: true when @addr is the source of the copy
 *
 * Returns true when the copy may proceed, false when it must be refused.
 * A copy is refused when the object size reported by
 * __compiletime_object_size() is non-negative and smaller than @bytes, or
 * when @bytes exceeds INT_MAX.
 */
static __always_inline __must_check bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
{
        /* NOTE(review): a negative value presumably means "size unknown". */
        int sz = __compiletime_object_size(addr);
        if (unlikely(sz >= 0 && sz < bytes)) {
                /*
                 * Non-constant @bytes: warn at run time.  Constant @bytes:
                 * reference __bad_copy_from()/__bad_copy_to(), which are
                 * declared with __compiletime_error in this header.
                 */
                if (!__builtin_constant_p(bytes))
                        copy_overflow(sz, bytes);
                else if (is_source)
                        __bad_copy_from();
                else
                        __bad_copy_to();
                return false;
        }
        /* Reject absurdly large copies, warning only once. */
        if (WARN_ON_ONCE(bytes > INT_MAX))
                return false;
        check_object_size(addr, bytes, is_source);
        return true;
}

#ifndef arch_setup_new_exec
/* Default hook for architectures that define no arch_setup_new_exec(). */
static inline void arch_setup_new_exec(void)
{
}
#endif

#endif        /* __KERNEL__ */

#endif /* _LINUX_THREAD_INFO_H */









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints,  longs and pointers.
   (C) 2002 Nadia Yvette Chambers, IBM */

#include <asm/types.h>
#include <linux/compiler.h>

/*
 * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and
 * fs/inode.c.  It's not actually prime any more (the previous primes
 * were actively bad for hashing), but the name remains.
 */
/*
 * Pick the word-sized variants: hash_long() and GOLDEN_RATIO_PRIME map to
 * the 32-bit or 64-bit hash/constant according to BITS_PER_LONG.  Any other
 * word size is a build error.
 */
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
#else
#error Wordsize not 32 or 64
#endif

/*
 * This hash multiplies the input by a large odd number and takes the
 * high bits.  Since multiplication propagates changes to the most
 * significant end only, it is essential that the high bits of the
 * product be used for the hash value.
 *
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * Although a random odd number will do, it turns out that the golden
 * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
 * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
 *
 * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
 * which is very slightly easier to multiply by and makes no
 * difference to the hash distribution.
 */
/* Multiplier used by __hash_32_generic(). */
#define GOLDEN_RATIO_32 0x61C88647
/* Multiplier used by hash_64_generic() when BITS_PER_LONG == 64. */
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

#ifdef CONFIG_HAVE_ARCH_HASH
/* This header may use the GOLDEN_RATIO_xx constants */
#include <asm/hash.h>
#endif

/*
 * The _generic versions exist only so lib/test_hash.c can compare
 * the arch-optimized versions with the generic.
 *
 * Note that if you change these, any <asm/hash.h> that aren't updated
 * to match need to have their HAVE_ARCH_* define values updated so the
 * self-test will not false-positive.
 */
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
/*
 * Multiply @val by GOLDEN_RATIO_32, keeping the low 32 bits of the product.
 * No bits are discarded here; hash_32_generic() does the final shift.
 */
static inline u32 __hash_32_generic(u32 val)
{
        const u32 product = val * GOLDEN_RATIO_32;

        return product;
}

#ifndef HAVE_ARCH_HASH_32
#define hash_32 hash_32_generic
#endif
/*
 * Hash @val down to @bits bits by keeping the top @bits bits of
 * __hash_32(@val); the high bits are the better-mixed ones.
 *
 * NOTE(review): @bits == 0 would shift a 32-bit value by 32, which C leaves
 * undefined; callers presumably pass 1..32.
 */
static inline u32 hash_32_generic(u32 val, unsigned int bits)
{
        const unsigned int discard = 32 - bits;

        return __hash_32(val) >> discard;
}

#ifndef HAVE_ARCH_HASH_64
#define hash_64 hash_64_generic
#endif
/*
 * Hash the 64-bit @val down to @bits bits.
 *
 * On 64-bit builds this is one multiply by GOLDEN_RATIO_64 followed by
 * keeping the top @bits bits.  On 32-bit builds the upper half is mixed
 * with __hash_32() and folded into the lower half before hash_32().
 */
static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
{
#if BITS_PER_LONG == 64
        /* 64x64-bit multiply is efficient on all 64-bit processors */
        const u64 product = val * GOLDEN_RATIO_64;

        return product >> (64 - bits);
#else
        /* Hash 64 bits using only 32x32-bit multiply. */
        const u32 mixed_high = __hash_32(val >> 32);

        return hash_32((u32)val ^ mixed_high, bits);
#endif
}

/* Hash the address held in @ptr down to @bits bits using hash_long(). */
static inline u32 hash_ptr(const void *ptr, unsigned int bits)
{
        const unsigned long key = (unsigned long)ptr;

        return hash_long(key, bits);
}

/* This really should be called fold32_ptr; it does no hashing to speak of. */
static inline u32 hash32_ptr(const void *ptr)
{
        unsigned long val = (unsigned long)ptr;

#if BITS_PER_LONG == 64
        val ^= (val >> 32);
#endif
        return (u32)val;
}

#endif /* _LINUX_HASH_H */



































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>

/*
 * Types defining task->signal and task->sighand and APIs using them:
 */

/*
 * Signal handler table, reachable as task->sighand.
 */
struct sighand_struct {
        spinlock_t                siglock;        /* taken with spin_lock_irq() by helpers in this header */
        refcount_t                count;                /* reference count */
        wait_queue_head_t        signalfd_wqh;        /* NOTE(review): presumably signalfd waiters; confirm */
        struct k_sigaction        action[_NSIG];        /* _NSIG handler slots */
};

/*
 * Per-process accounting stats (embedded in signal_struct when
 * CONFIG_BSD_PROCESS_ACCT is enabled).
 *
 * NOTE(review): the per-field notes are inferred from the field names;
 * confirm against the accounting code.
 */
struct pacct_struct {
        int                        ac_flag;        /* presumably accounting flags */
        long                        ac_exitcode;        /* presumably the exit code */
        unsigned long                ac_mem;                /* presumably memory usage */
        u64                        ac_utime, ac_stime;        /* presumably user/system time */
        unsigned long                ac_minflt, ac_majflt;        /* presumably minor/major faults */
};

/*
 * State of one CPU-time interval timer; signal_struct holds two of these
 * (it[2]) for ITIMER_PROF and ITIMER_VIRTUAL.
 * NOTE(review): units of the two fields are not visible in this header.
 */
struct cpu_itimer {
        u64 expires;        /* presumably the next expiry value */
        u64 incr;        /* presumably the reload interval */
};

/*
 * This is the atomic variant of task_cputime, which can be used for
 * storing and updating task_cputime statistics without locking.
 * All three counters are atomic64_t.
 */
struct task_cputime_atomic {
        atomic64_t utime;
        atomic64_t stime;
        atomic64_t sum_exec_runtime;
};

/*
 * Compound-literal initializer for struct task_cputime_atomic with all
 * three counters set to zero.
 */
#define INIT_CPUTIME_ATOMIC \
        (struct task_cputime_atomic) {                                \
                .utime = ATOMIC64_INIT(0),                        \
                .stime = ATOMIC64_INIT(0),                        \
                .sum_exec_runtime = ATOMIC64_INIT(0),                \
        }
/**
 * struct thread_group_cputimer - thread group interval timer counts
 * @cputime_atomic:        atomic thread group interval timers.
 *
 * This structure wraps the atomic version of task_cputime
 * (struct task_cputime_atomic) that is used for thread group CPU timer
 * calculations.
 */
struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
};

/*
 * One collector of multiprocess signals; entries are linked through @node
 * (signal_struct keeps an hlist head named "multiprocess" for collecting
 * such signals during fork).
 */
struct multiprocess_signals {
        sigset_t signal;        /* set of collected signals */
        struct hlist_node node;        /* hlist linkage */
};

/*
 * Signal state shared by a thread group, reachable as task->signal.
 *
 * NOTE! "signal_struct" does not have its own
 * locking, because a shared signal_struct always
 * implies a shared sighand_struct, so locking
 * sighand_struct is always a proper superset of
 * the locking of signal_struct.
 */
struct signal_struct {
        refcount_t                sigcnt;
        atomic_t                live;
        int                        nr_threads;        /* returned by get_nr_threads() */
        struct list_head        thread_head;        /* walked by __for_each_thread() */

        wait_queue_head_t        wait_chldexit;        /* for wait4() */

        /* current thread group signal load-balancing target: */
        struct task_struct        *curr_target;

        /* shared signal handling: */
        struct sigpending        shared_pending;

        /* For collecting multiprocess signals during fork */
        struct hlist_head        multiprocess;

        /* thread group exit support */
        int                        group_exit_code;
        /* overloaded:
         * - notify group_exit_task when ->count is equal to notify_count
         * - everyone except group_exit_task is stopped during signal delivery
         *   of fatal signals, group_exit_task processes the signal.
         */
        int                        notify_count;
        struct task_struct        *group_exit_task;

        /* thread group stop support, overloads group_exit_code too */
        int                        group_stop_count;
        unsigned int                flags; /* see SIGNAL_* flags below */

        /*
         * PR_SET_CHILD_SUBREAPER marks a process, like a service
         * manager, to re-parent orphan (double-forking) child processes
         * to this process instead of 'init'. The service manager is
         * able to receive SIGCHLD signals and is able to investigate
         * the process until it calls wait(). All children of this
         * process will inherit a flag if they should look for a
         * child_subreaper process at exit.
         */
        unsigned int                is_child_subreaper:1;
        unsigned int                has_child_subreaper:1;

#ifdef CONFIG_POSIX_TIMERS

        /* POSIX.1b Interval Timers */
        unsigned int                next_posix_timer_id;
        struct list_head        posix_timers;

        /* ITIMER_REAL timer for the process */
        struct hrtimer real_timer;
        ktime_t it_real_incr;

        /*
         * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
         * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
         * values are defined to 0 and 1 respectively
         */
        struct cpu_itimer it[2];

        /*
         * Thread group totals for process CPU timers.
         * See thread_group_cputimer(), et al, for details.
         */
        struct thread_group_cputimer cputimer;

#endif
        /* Empty if CONFIG_POSIX_TIMERS=n */
        struct posix_cputimers posix_cputimers;

        /* PID/PID hash table linkage. */
        struct pid *pids[PIDTYPE_MAX];

#ifdef CONFIG_NO_HZ_FULL
        atomic_t tick_dep_mask;
#endif

        struct pid *tty_old_pgrp;

        /* boolean value for session group leader */
        int leader;

        struct tty_struct *tty; /* NULL if no tty */

#ifdef CONFIG_SCHED_AUTOGROUP
        struct autogroup *autogroup;
#endif
        /*
         * Cumulative resource counters for dead threads in the group,
         * and for reaped dead child processes forked by this group.
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
        seqlock_t stats_lock;
        u64 utime, stime, cutime, cstime;
        u64 gtime;
        u64 cgtime;
        struct prev_cputime prev_cputime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
        unsigned long inblock, oublock, cinblock, coublock;
        unsigned long maxrss, cmaxrss;
        struct task_io_accounting ioac;

        /*
         * Cumulative ns of scheduled CPU time of dead threads in the
         * group, not including a zombie group leader.  (This only differs
         * from jiffies_to_ns(utime + stime) if sched_clock uses something
         * other than jiffies.)
         */
        unsigned long long sum_sched_runtime;

        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
         * to get both rlim_cur and rlim_max atomically, and either one
         * alone is a single word that can safely be read normally.
         * getrlimit/setrlimit use task_lock(current->group_leader) to
         * protect this instead of the siglock, because they really
         * have no need to disable irqs.
         */
        struct rlimit rlim[RLIM_NLIMITS];

#ifdef CONFIG_BSD_PROCESS_ACCT
        struct pacct_struct pacct;        /* per-process accounting information */
#endif
#ifdef CONFIG_TASKSTATS
        struct taskstats *stats;
#endif
#ifdef CONFIG_AUDIT
        unsigned audit_tty;
        struct tty_audit_buf *tty_audit_buf;
#endif

        /*
         * Thread is the potential origin of an oom condition; kill first on
         * oom
         */
        bool oom_flag_origin;
        short oom_score_adj;                /* OOM kill score adjustment */
        short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                         * Only settable by CAP_SYS_RESOURCE. */
        struct mm_struct *oom_mm;        /* recorded mm when the thread group got
                                         * killed by the oom killer */

        struct mutex cred_guard_mutex;        /* guard against foreign influences on
                                         * credential calculations
                                         * (notably ptrace).
                                         * Deprecated, do not use in new code.
                                         * Use exec_update_lock instead.
                                         */
        struct rw_semaphore exec_update_lock;        /* Held while task_struct is
                                                 * being updated during exec,
                                                 * and may have inconsistent
                                                 * permissions.
                                                 */
} __randomize_layout;

/*
 * Bits in flags field of signal_struct.
 */
#define SIGNAL_STOP_STOPPED        0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_CONTINUED        0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT        0x00000004 /* group exit in progress */
#define SIGNAL_GROUP_COREDUMP        0x00000008 /* coredump in progress */
/*
 * Pending notifications to parent.
 */
#define SIGNAL_CLD_STOPPED        0x00000010
#define SIGNAL_CLD_CONTINUED        0x00000020
#define SIGNAL_CLD_MASK                (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)

#define SIGNAL_UNKILLABLE        0x00000040 /* for init: ignore fatal signals */

/* The bits that signal_set_stop_flags() replaces as a group. */
#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
                          SIGNAL_STOP_CONTINUED)

/*
 * Replace the SIGNAL_STOP_MASK bits of @sig->flags with @flags, leaving
 * every other bit untouched.  Warns if a group exit or coredump is already
 * flagged on @sig.
 */
static inline void signal_set_stop_flags(struct signal_struct *sig,
                                         unsigned int flags)
{
        unsigned int preserved;

        WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP));

        preserved = sig->flags & ~SIGNAL_STOP_MASK;
        sig->flags = preserved | flags;
}

/*
 * Returns 1 when @sig has SIGNAL_GROUP_EXIT set or a group_exit_task
 * recorded, 0 otherwise.  If true, all threads except ->group_exit_task
 * have pending SIGKILL.
 */
static inline int signal_group_exit(const struct signal_struct *sig)
{
        if (sig->flags & SIGNAL_GROUP_EXIT)
                return 1;

        return sig->group_exit_task != NULL;
}

extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(struct task_struct *task,
                          sigset_t *mask, kernel_siginfo_t *info);

/*
 * Dequeue one signal for the current task under its siglock, using the
 * task's blocked set as the mask.  The siginfo is discarded; only the
 * return value of dequeue_signal() is handed back.
 */
static inline int kernel_dequeue_signal(void)
{
        struct task_struct *self = current;
        kernel_siginfo_t discarded_info;
        int signr;

        spin_lock_irq(&self->sighand->siglock);
        signr = dequeue_signal(self, &self->blocked, &discarded_info);
        spin_unlock_irq(&self->sighand->siglock);

        return signr;
}

static inline void kernel_signal_stop(void)
{
        spin_lock_irq(&current->sighand->siglock);
        if (current->jobctl & JOBCTL_STOP_DEQUEUED)
                set_special_state(TASK_STOPPED);
        spin_unlock_irq(&current->sighand->siglock);

        schedule();
}
/*
 * Helpers that splice optional, architecture-specific parameters into a
 * parameter list: each expands to ", <args>" when its controlling macro
 * (__ARCH_SI_TRAPNO or __ia64__) is defined and to nothing otherwise.
 */
#ifdef __ARCH_SI_TRAPNO
# define ___ARCH_SI_TRAPNO(_a1) , _a1
#else
# define ___ARCH_SI_TRAPNO(_a1)
#endif
#ifdef __ia64__
# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
#else
# define ___ARCH_SI_IA64(_a1, _a2, _a3)
#endif

int force_sig_fault_to_task(int sig, int code, void __user *addr
        ___ARCH_SI_TRAPNO(int trapno)
        ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
        , struct task_struct *t);
int force_sig_fault(int sig, int code, void __user *addr
        ___ARCH_SI_TRAPNO(int trapno)
        ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
int send_sig_fault(int sig, int code, void __user *addr
        ___ARCH_SI_TRAPNO(int trapno)
        ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
        , struct task_struct *t);

int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);

int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
int force_sig_pkuerr(void __user *addr, u32 pkey);

int force_sig_ptrace_errno_trap(int errno, void __user *addr);

extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern void force_sigsegv(int sig);
extern int force_sig_info(struct kernel_siginfo *);
extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
                                const struct cred *);
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern struct sigqueue *sigqueue_alloc(void);
extern void sigqueue_free(struct sigqueue *);
extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);

/*
 * Flag the current task with TIF_SIGPENDING and return -ERESTARTNOINTR for
 * the caller to propagate.
 */
static inline int restart_syscall(void)
{
        struct task_struct *self = current;

        set_tsk_thread_flag(self, TIF_SIGPENDING);

        return -ERESTARTNOINTR;
}

/* Nonzero when TIF_SIGPENDING is set on @p (hinted as the unlikely case). */
static inline int task_sigpending(struct task_struct *p)
{
        const int pending = test_tsk_thread_flag(p, TIF_SIGPENDING);

        return unlikely(pending);
}

/*
 * Nonzero when @p has either TIF_NOTIFY_SIGNAL or TIF_SIGPENDING set.
 *
 * TIF_NOTIFY_SIGNAL isn't really a signal, but it must break wait loops in
 * the same way so that notify-signal callbacks can be processed.
 */
static inline int signal_pending(struct task_struct *p)
{
        return unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)) ?
                1 : task_sigpending(p);
}

/*
 * Nonzero when SIGKILL is a member of @p's private pending set.  Unlike
 * fatal_signal_pending() this does not consult TIF_SIGPENDING first.
 */
static inline int __fatal_signal_pending(struct task_struct *p)
{
        const int kill_queued = sigismember(&p->pending.signal, SIGKILL);

        return unlikely(kill_queued);
}

/*
 * Returns 1 only when @p has TIF_SIGPENDING set and SIGKILL is in its
 * pending set; 0 otherwise.
 */
static inline int fatal_signal_pending(struct task_struct *p)
{
        if (!task_sigpending(p))
                return 0;

        return __fatal_signal_pending(p) != 0;
}

/*
 * Decide whether a pending signal should interrupt a sleep in @state.
 *
 * Returns 0 when @state has neither TASK_INTERRUPTIBLE nor TASK_WAKEKILL,
 * or when signal_pending(@p) is false.  Otherwise returns 1 for an
 * interruptible state, and for the remaining case only when SIGKILL is
 * pending on @p.
 */
static inline int signal_pending_state(long state, struct task_struct *p)
{
        if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)) ||
            !signal_pending(p))
                return 0;

        if (state & TASK_INTERRUPTIBLE)
                return 1;

        return __fatal_signal_pending(p) != 0;
}

/*
 * For fault handlers only: tells whether the current fault routine should
 * be abandoned so signals can be handled, in particular after being
 * interrupted with VM_FAULT_RETRY.
 *
 * True when @fault_flags contains VM_FAULT_RETRY and either a fatal signal
 * is pending on current, or @regs is in user mode and any signal is
 * pending on current.
 */
static inline bool fault_signal_pending(vm_fault_t fault_flags,
                                        struct pt_regs *regs)
{
        const bool retrying = (fault_flags & VM_FAULT_RETRY) != 0;

        return unlikely(retrying &&
                        (fatal_signal_pending(current) ||
                         (user_mode(regs) && signal_pending(current))));
}

/*
 * Reevaluate whether the task has signals pending delivery.
 * Wake the task if so.
 * This is required every time the blocked sigset_t changes.
 * callers must hold sighand->siglock.
 */
extern void recalc_sigpending_and_wake(struct task_struct *t);
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);

extern void signal_wake_up_state(struct task_struct *t, unsigned int state);

/*
 * Call signal_wake_up_state() on @t, passing TASK_WAKEKILL as the state
 * when @resume is true and 0 otherwise.
 */
static inline void signal_wake_up(struct task_struct *t, bool resume)
{
        unsigned int state = 0;

        if (resume)
                state = TASK_WAKEKILL;

        signal_wake_up_state(t, state);
}
/*
 * Call signal_wake_up_state() on @t, passing __TASK_TRACED as the state
 * when @resume is true and 0 otherwise.
 */
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
        unsigned int state = 0;

        if (resume)
                state = __TASK_TRACED;

        signal_wake_up_state(t, state);
}

void task_join_group_stop(struct task_struct *task);

#ifdef TIF_RESTORE_SIGMASK
/*
 * Legacy restore_sigmask accessors, built on the TIF_RESTORE_SIGMASK
 * thread flag.  These are inefficient on SMP architectures because they
 * require atomic operations.
 */

/**
 * set_restore_sigmask() - make sure saved_sigmask processing gets done
 *
 * Sets TIF_RESTORE_SIGMASK on the current thread.  This helper does not
 * touch TIF_SIGPENDING itself.
 *
 * NOTE(review): TIF_RESTORE_SIGMASK need not be in the set of bits that
 * the arch code notices on return to user mode, so callers presumably rely
 * on TIF_SIGPENDING already being set -- confirm at the call sites.
 */
static inline void set_restore_sigmask(void)
{
        set_thread_flag(TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK on @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK on the current thread. */
static inline void clear_restore_sigmask(void)
{
        clear_thread_flag(TIF_RESTORE_SIGMASK);
}

/* Is TIF_RESTORE_SIGMASK set on @task? */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}

/* Is TIF_RESTORE_SIGMASK set on the current thread? */
static inline bool test_restore_sigmask(void)
{
        return test_thread_flag(TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK on the current thread; report if it was set. */
static inline bool test_and_clear_restore_sigmask(void)
{
        return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
}

#else        /* TIF_RESTORE_SIGMASK */

/*
 * Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist:
 * the state lives in the task's restore_sigmask member instead of a thread
 * flag.
 */

/* Mark the current task as needing saved_sigmask processing. */
static inline void set_restore_sigmask(void)
{
        current->restore_sigmask = true;
}

/* Drop the restore_sigmask mark on @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        task->restore_sigmask = false;
}

/* Drop the restore_sigmask mark on the current task. */
static inline void clear_restore_sigmask(void)
{
        current->restore_sigmask = false;
}

/* Is the current task marked? */
static inline bool test_restore_sigmask(void)
{
        return current->restore_sigmask;
}

/* Is @task marked? */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        return task->restore_sigmask;
}

/* Drop the mark on the current task; report whether it was set. */
static inline bool test_and_clear_restore_sigmask(void)
{
        const bool was_set = current->restore_sigmask;

        if (was_set)
                current->restore_sigmask = false;

        return was_set;
}
#endif

/*
 * If the restore_sigmask mark is set on the current task, clear it and
 * make current->saved_sigmask the blocked set again; otherwise do nothing.
 */
static inline void restore_saved_sigmask(void)
{
        if (!test_and_clear_restore_sigmask())
                return;

        __set_current_blocked(&current->saved_sigmask);
}

extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);

/*
 * Restore the saved signal mask unless @interrupted is true.  In the
 * interrupted case nothing is restored; instead a warning fires if no
 * signal is pending on current.
 */
static inline void restore_saved_sigmask_unless(bool interrupted)
{
        if (!interrupted) {
                restore_saved_sigmask();
                return;
        }

        WARN_ON(!signal_pending(current));
}

/*
 * Pick which mask of the current task to save: saved_sigmask when the
 * restore_sigmask mark is set, the live blocked set otherwise.
 */
static inline sigset_t *sigmask_to_save(void)
{
        if (unlikely(test_restore_sigmask()))
                return &current->saved_sigmask;

        return &current->blocked;
}

/* Send @sig to cad_pid through kill_pid(), forwarding @priv unchanged. */
static inline int kill_cad_pid(int sig, int priv)
{
        const int rc = kill_pid(cad_pid, sig, priv);

        return rc;
}

/*
 * These can be the second arg to send_sig_info/send_group_sig_info.
 * They are sentinel pointer values (0 and 1), not addresses of real
 * kernel_siginfo objects, and must not be dereferenced.
 */
#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
#define SEND_SIG_PRIV        ((struct kernel_siginfo *) 1)

/*
 * Range test of @sp against the current task's alternate signal stack
 * [sas_ss_sp, sas_ss_sp + sas_ss_size].  Which end of the range is
 * inclusive depends on the stack growth direction: the base is included
 * for upward-growing stacks, the top for downward-growing ones.
 */
static inline int __on_sig_stack(unsigned long sp)
{
        const unsigned long base = current->sas_ss_sp;
        const unsigned long extent = current->sas_ss_size;

#ifdef CONFIG_STACK_GROWSUP
        return sp >= base && sp - base < extent;
#else
        return sp > base && sp - base <= extent;
#endif
}

/*
 * True if we are on the alternate signal stack.
 *
 * With SS_AUTODISARM in effect the answer is always 0: by construction we
 * can't be on the signal stack unless user code deliberately set
 * SS_AUTODISARM when we were already on it.  This improves reliability: if
 * user state gets corrupted such that the stack pointer points very close
 * to the end of the signal stack, the signal can still be handled.
 */
static inline int on_sig_stack(unsigned long sp)
{
        return (current->sas_ss_flags & SS_AUTODISARM) ?
                0 : __on_sig_stack(sp);
}

/*
 * Alternate-stack status for @sp: SS_DISABLE when the current task has no
 * alternate stack (size 0), SS_ONSTACK when on_sig_stack(@sp) holds, and 0
 * otherwise.
 */
static inline int sas_ss_flags(unsigned long sp)
{
        if (current->sas_ss_size == 0)
                return SS_DISABLE;

        if (on_sig_stack(sp))
                return SS_ONSTACK;

        return 0;
}

/*
 * Forget @p's alternate signal stack: flags become SS_DISABLE and both the
 * base address and the size are zeroed.
 */
static inline void sas_ss_reset(struct task_struct *p)
{
        p->sas_ss_flags = SS_DISABLE;
        p->sas_ss_size = 0;
        p->sas_ss_sp = 0;
}

/*
 * Choose the stack pointer for delivering @ksig.
 *
 * @sp is returned unchanged unless the handler asked for SA_ONSTACK and
 * sas_ss_flags(@sp) is 0.  In that case the start of the alternate stack
 * is returned: its base when stacks grow up, base + size otherwise.
 */
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
{
        if (!unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) || sas_ss_flags(sp))
                return sp;

#ifdef CONFIG_STACK_GROWSUP
        return current->sas_ss_sp;
#else
        return current->sas_ss_sp + current->sas_ss_size;
#endif
}

extern void __cleanup_sighand(struct sighand_struct *);
extern void flush_itimer_signals(void);

/* True when init_task's tasks list has no other entries. */
#define tasklist_empty() \
        list_empty(&init_task.tasks)

/* The task following @p on the tasks list, fetched with list_entry_rcu(). */
#define next_task(p) \
        list_entry_rcu((p)->tasks.next, struct task_struct, tasks)

/*
 * Iterate @p over every task after init_task on the tasks list, stopping
 * when the walk comes back around to init_task (init_task itself is not
 * visited).
 */
#define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )

extern bool current_is_single_threaded(void);

/*
 * Careful: do_each_thread/while_each_thread is a double loop so
 *          'break' will not work as expected - use goto instead.
 *
 * do_each_thread() opens an outer for-loop over processes plus a "do";
 * while_each_thread() closes that "do" by advancing @t with next_thread()
 * until it wraps around to @g.
 */
#define do_each_thread(g, t) \
        for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do

#define while_each_thread(g, t) \
        while ((t = next_thread(t)) != g)

/* Walk @t over the thread_head list of @signal with list_for_each_entry_rcu(). */
#define __for_each_thread(signal, t)        \
        list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)

/* Walk @t over every thread sharing @p's signal_struct. */
#define for_each_thread(p, t)                \
        __for_each_thread((p)->signal, t)

/* Careful: this is a double loop, 'break' won't work as expected. */
#define for_each_process_thread(p, t)        \
        for_each_process(p) for_each_thread(p, t)

/*
 * Callback type taken by walk_process_tree(): it is given a task and a
 * void *data pointer and returns an int.
 * NOTE(review): how the return value steers the walk, and which tasks below
 * @top are visited, is decided by walk_process_tree()'s implementation,
 * which is not visible here.
 */
typedef int (*proc_visitor)(struct task_struct *p, void *data);
void walk_process_tree(struct task_struct *top, proc_visitor, void *);

/*
 * Return the struct pid of kind @type for @task.  PIDTYPE_PID is answered
 * by task_pid(); every other type is read from task->signal->pids[].
 */
static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
        if (type != PIDTYPE_PID)
                return task->signal->pids[type];

        return task_pid(task);
}

/* Return the PIDTYPE_TGID entry of @task's signal->pids[] array. */
static inline struct pid *task_tgid(struct task_struct *task)
{
        struct pid *tgid = task->signal->pids[PIDTYPE_TGID];

        return tgid;
}

/*
 * The result of task_pgrp()/task_session() must not be dereferenced without
 * holding the tasklist lock or the RCU read lock, even when task == current:
 * another thread may be running sys_setsid/sys_setpgid concurrently.
 *
 * task_pgrp() returns the PIDTYPE_PGID entry of @task's signal->pids[].
 */
static inline struct pid *task_pgrp(struct task_struct *task)
{
        struct pid *pgrp = task->signal->pids[PIDTYPE_PGID];

        return pgrp;
}

/* Return the PIDTYPE_SID entry of @task's signal->pids[] array. */
static inline struct pid *task_session(struct task_struct *task)
{
        struct pid *sid = task->signal->pids[PIDTYPE_SID];

        return sid;
}

/* Return the nr_threads counter stored in @task's signal structure. */
static inline int get_nr_threads(struct task_struct *task)
{
        int nr = task->signal->nr_threads;

        return nr;
}

static inline bool thread_group_leader(struct task_struct *p)
{
        return p->exit_signal >= 0;
}

/* True when @p1 and @p2 point at the same signal structure. */
static inline
bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
{
        return p2->signal == p1->signal;
}

/*
 * Return the task_struct that follows @p on its thread_group list, fetched
 * with list_entry_rcu().
 */
static inline struct task_struct *next_thread(const struct task_struct *p)
{
        struct task_struct *next;

        next = list_entry_rcu(p->thread_group.next,
                              struct task_struct, thread_group);
        return next;
}

/* Non-zero when list_empty() reports @p's thread_group list as empty. */
static inline int thread_group_empty(struct task_struct *p)
{
        int empty = list_empty(&p->thread_group);

        return empty;
}

/*
 * True when p is a thread group leader (thread_group_leader()) whose
 * thread_group list is not empty.  p is evaluated twice.
 */
#define delay_group_leader(p) \
                (thread_group_leader(p) && !thread_group_empty(p))

/*
 * Defined elsewhere.
 * NOTE(review): by its name, reports whether the thread group identified by
 * @pid has exited - confirm against the implementation.
 */
extern bool thread_group_exited(struct pid *pid);

/*
 * Worker behind lock_task_sighand(), which returns this function's result
 * unchanged.  Defined elsewhere; the locking protocol itself is not visible
 * in this header.
 */
extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
                                                        unsigned long *flags);

/*
 * Wrapper around __lock_task_sighand(): forwards @task and @flags and
 * returns whatever pointer the worker returned.
 *
 * NOTE(review): the __cond_lock() line looks like a static-analysis
 * annotation stating that task->sighand->siglock is held when the result is
 * non-NULL; its value is deliberately discarded - confirm.
 */
static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
                                                       unsigned long *flags)
{
        struct sighand_struct *ret;

        ret = __lock_task_sighand(task, flags);
        (void)__cond_lock(&task->sighand->siglock, ret);
        return ret;
}

/*
 * Counterpart of lock_task_sighand(): calls spin_unlock_irqrestore() on
 * task->sighand->siglock, passing the value stored in *@flags.
 */
static inline void unlock_task_sighand(struct task_struct *task,
                                                unsigned long *flags)
{
        spin_unlock_irqrestore(&task->sighand->siglock, *flags);
}

/*
 * Return rlim_cur of resource @limit for @task, read with READ_ONCE() from
 * task->signal->rlim[].  @limit is used as an index without range checking.
 */
static inline unsigned long task_rlimit(const struct task_struct *task,
                unsigned int limit)
{
        unsigned long cur = READ_ONCE(task->signal->rlim[limit].rlim_cur);

        return cur;
}

/*
 * Return rlim_max of resource @limit for @task, read with READ_ONCE() from
 * task->signal->rlim[].  @limit is used as an index without range checking.
 */
static inline unsigned long task_rlimit_max(const struct task_struct *task,
                unsigned int limit)
{
        unsigned long max = READ_ONCE(task->signal->rlim[limit].rlim_max);

        return max;
}

/* task_rlimit() applied to the current task. */
static inline unsigned long rlimit(unsigned int limit)
{
        unsigned long cur = task_rlimit(current, limit);

        return cur;
}

/* task_rlimit_max() applied to the current task. */
static inline unsigned long rlimit_max(unsigned int limit)
{
        unsigned long max = task_rlimit_max(current, limit);

        return max;
}

#endif /* _LINUX_SCHED_SIGNAL_H */















































































































































































































































































    1 































































































































































































    1 


    1 






























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Filesystem access notification for Linux
 *
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#ifndef __LINUX_FSNOTIFY_BACKEND_H
#define __LINUX_FSNOTIFY_BACKEND_H

#ifdef __KERNEL__

#include <linux/idr.h> /* inotify uses this */
#include <linux/fs.h> /* struct inode */
#include <linux/list.h>
#include <linux/path.h> /* struct path */
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>

/*
 * IN_* from inotify.h lines up EXACTLY with FS_*, this is so we can easily
 * convert between them.  dnotify only needs conversion at watch creation
 * so no perf loss there.  fanotify isn't defined yet, so it can use the
 * holes if it needs more events.
 */
#define FS_ACCESS                0x00000001        /* File was accessed */
#define FS_MODIFY                0x00000002        /* File was modified */
#define FS_ATTRIB                0x00000004        /* Metadata changed */
#define FS_CLOSE_WRITE                0x00000008        /* Writable file was closed */
#define FS_CLOSE_NOWRITE        0x00000010        /* Unwritable file closed */
#define FS_OPEN                        0x00000020        /* File was opened */
#define FS_MOVED_FROM                0x00000040        /* File was moved from X */
#define FS_MOVED_TO                0x00000080        /* File was moved to Y */
#define FS_CREATE                0x00000100        /* Subfile was created */
#define FS_DELETE                0x00000200        /* Subfile was deleted */
#define FS_DELETE_SELF                0x00000400        /* Self was deleted */
#define FS_MOVE_SELF                0x00000800        /* Self was moved */
#define FS_OPEN_EXEC                0x00001000        /* File was opened for exec */

#define FS_UNMOUNT                0x00002000        /* inode on umount fs */
#define FS_Q_OVERFLOW                0x00004000        /* Event queue overflowed */
#define FS_IN_IGNORED                0x00008000        /* last inotify event here */

#define FS_OPEN_PERM                0x00010000        /* open event in a permission hook */
#define FS_ACCESS_PERM                0x00020000        /* access event in a permission hook */
#define FS_OPEN_EXEC_PERM        0x00040000        /* open/exec event in a permission hook */

#define FS_EXCL_UNLINK                0x04000000        /* do not send events if object is unlinked */
/*
 * Set on inode mark that cares about things that happen to its children.
 * Always set for dnotify and inotify.
 * Set on inode/sb/mount marks that care about parent/name info.
 */
#define FS_EVENT_ON_CHILD        0x08000000

#define FS_DN_RENAME                0x10000000        /* file renamed */
#define FS_DN_MULTISHOT                0x20000000        /* dnotify multishot */
#define FS_ISDIR                0x40000000        /* event occurred against dir */
#define FS_IN_ONESHOT                0x80000000        /* only send event once */

/* Either half of a move: moved-from or moved-to */
#define FS_MOVE                        (FS_MOVED_FROM | FS_MOVED_TO)

/*
 * Directory entry modification events - reported only to directory
 * where entry is modified and not to a watching parent.
 * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
 * when a directory entry inside a child subdir changes.
 */
#define ALL_FSNOTIFY_DIRENT_EVENTS        (FS_CREATE | FS_DELETE | FS_MOVE)

/* The three *_PERM event bits (events raised from a permission hook). */
#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
                                  FS_OPEN_EXEC_PERM)

/*
 * This is a list of all events that may get sent to a parent that is watching
 * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory.
 */
#define FS_EVENTS_POSS_ON_CHILD   (ALL_FSNOTIFY_PERM_EVENTS | \
                                   FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
                                   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \
                                   FS_OPEN | FS_OPEN_EXEC)

/*
 * This is a list of all events that may get sent with the parent inode as the
 * @to_tell argument of fsnotify().
 * It may include events that can be sent to an inode/sb/mount mark, but cannot
 * be sent to a parent watching children.
 * As defined here it is exactly FS_EVENTS_POSS_ON_CHILD.
 */
#define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD)

/* Events that can be reported to backends */
#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
                             FS_EVENTS_POSS_ON_CHILD | \
                             FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \
                             FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED)

/* Extra flags that may be reported with event or control handling of events */
#define ALL_FSNOTIFY_FLAGS  (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \
                             FS_DN_MULTISHOT | FS_EVENT_ON_CHILD)

/* Union of every event bit and every flag bit defined above */
#define ALL_FSNOTIFY_BITS   (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS)

/*
 * Forward declarations: these types are only used through pointers until
 * (and unless) they are defined further down in this header.
 */
struct fsnotify_group;
struct fsnotify_event;
struct fsnotify_mark;
struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;

/* Only referenced by pointer (fsnotify_group.memcg); defined elsewhere. */
struct mem_cgroup;

/*
 * Each group must define these ops.  The fsnotify infrastructure will call
 * these operations for each relevant group.
 *
 * handle_event - main call for a group to handle an fs event
 * @group:        group to notify
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 * @iter_info:        array of marks from this group that are interested in the event
 *
 * handle_inode_event - simple variant of handle_event() for groups that only
 *                have inode marks and don't have ignore mask
 * @mark:        mark to notify
 * @mask:        event type and flags
 * @inode:        inode that event happened on
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to.
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 *
 * free_group_priv - called when a group refcnt hits 0 to clean up the private union
 * freeing_mark - called when a mark is being destroyed for some reason.  The group
 *                MUST be holding a reference on each mark and that reference must be
 *                dropped in this function.  inotify uses this function to send
 *                userspace messages that marks have been removed.
 * free_event - NOTE(review): presumably releases an event owned by the group;
 *                the exact call site is not visible in this header.
 * free_mark - called on the final put of a mark to free its memory
 */
struct fsnotify_ops {
        int (*handle_event)(struct fsnotify_group *group, u32 mask,
                            const void *data, int data_type, struct inode *dir,
                            const struct qstr *file_name, u32 cookie,
                            struct fsnotify_iter_info *iter_info);
        int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask,
                            struct inode *inode, struct inode *dir,
                            const struct qstr *file_name, u32 cookie);
        void (*free_group_priv)(struct fsnotify_group *group);
        void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
        void (*free_event)(struct fsnotify_event *event);
        /* called on final put+free to free memory */
        void (*free_mark)(struct fsnotify_mark *mark);
};

/*
 * all of the information about the original object we want to now send to
 * a group.  If you want to carry more info from the accessing task to the
 * listener this structure is where you need to be adding fields.
 *
 * Initialise with fsnotify_init_event(), which sets up @list and stores
 * @objectid.
 */
struct fsnotify_event {
        struct list_head list;        /* list linkage; NOTE(review): presumably the group's notification_list - confirm */
        unsigned long objectid;        /* identifier for queue merges */
};

/*
 * A group is a "thing" that wants to receive notification about filesystem
 * events.  The mask holds the subset of event types this group cares about.
 * refcnt on a group is up to the implementor and at any moment if it goes 0
 * everything will be cleaned up.
 */
struct fsnotify_group {
        const struct fsnotify_ops *ops;        /* how this group handles things */

        /*
         * How the refcnt is used is up to each group.  When the refcnt hits 0
         * fsnotify will clean up all of the resources associated with this group.
         * As an example, the dnotify group will always have a refcnt=1 and that
         * will never change.  Inotify, on the other hand, has a group per
         * inotify_init() and the refcnt will hit 0 only when that fd has been
         * closed.
         */
        refcount_t refcnt;                /* things with interest in this group */

        /* needed to send notification to userspace */
        spinlock_t notification_lock;                /* protect the notification_list */
        struct list_head notification_list;        /* list of event_holder this group needs to send to userspace */
        wait_queue_head_t notification_waitq;        /* read() on the notification file blocks on this waitq */
        unsigned int q_len;                        /* events on the queue */
        unsigned int max_events;                /* maximum events allowed on the list */
        /*
         * Valid fsnotify group priorities.  Events are sent in order from highest
         * priority to lowest priority.  We default to the lowest priority.
         */
        #define FS_PRIO_0        0 /* normal notifiers, no permissions */
        #define FS_PRIO_1        1 /* fanotify content based access control */
        #define FS_PRIO_2        2 /* fanotify pre-content access */
        unsigned int priority;
        bool shutdown;                /* group is being shut down, don't queue more events */

        /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
        struct mutex mark_mutex;        /* protect marks_list */
        atomic_t num_marks;                /* 1 for each mark and 1 for not being
                                         * past the point of no return when freeing
                                         * a group */
        atomic_t user_waits;                /* Number of tasks waiting for user
                                         * response */
        struct list_head marks_list;        /* all inode marks for this group */

        struct fasync_struct *fsn_fa;    /* async notification */

        struct fsnotify_event *overflow_event;        /* Event we queue when the
                                                 * notification list is too
                                                 * full */

        struct mem_cgroup *memcg;        /* memcg to charge allocations */

        /*
         * groups can define private fields here or use the void *private.
         * The members share storage (anonymous union), so a group uses
         * exactly one of them.
         */
        union {
                void *private;
#ifdef CONFIG_INOTIFY_USER
                struct inotify_group_private_data {
                        spinlock_t        idr_lock;
                        struct idr      idr;
                        struct ucounts *ucounts;
                } inotify_data;
#endif
#ifdef CONFIG_FANOTIFY
                struct fanotify_group_private_data {
                        /* allows a group to block waiting for a userspace response */
                        struct list_head access_list;
                        wait_queue_head_t access_waitq;
                        int flags;           /* flags from fanotify_init() */
                        int f_flags; /* event_f_flags from fanotify_init() */
                        unsigned int max_marks;
                        struct user_struct *user;
                } fanotify_data;
#endif /* CONFIG_FANOTIFY */
        };
};

/*
 * When calling fsnotify tell it if the data is a path or inode.
 * fsnotify_data_inode() and fsnotify_data_path() use this value to decide
 * how to interpret the untyped data pointer.
 */
enum fsnotify_data_type {
        FSNOTIFY_EVENT_NONE,        /* no usable data pointer */
        FSNOTIFY_EVENT_PATH,        /* data is a const struct path * */
        FSNOTIFY_EVENT_INODE,        /* data is a struct inode * */
};

/*
 * Extract the inode an event refers to.
 *
 * FSNOTIFY_EVENT_INODE: @data already is the inode.
 * FSNOTIFY_EVENT_PATH:  @data is a struct path; the inode is d_inode() of
 *                       its dentry.
 * Any other @data_type yields NULL.
 */
static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
        if (data_type == FSNOTIFY_EVENT_INODE)
                return (struct inode *)data;

        if (data_type == FSNOTIFY_EVENT_PATH) {
                const struct path *path = data;

                return d_inode(path->dentry);
        }

        return NULL;
}

/*
 * Return @data as a struct path when @data_type is FSNOTIFY_EVENT_PATH,
 * and NULL for every other data type.
 */
static inline const struct path *fsnotify_data_path(const void *data,
                                                    int data_type)
{
        if (data_type != FSNOTIFY_EVENT_PATH)
                return NULL;

        return data;
}

/*
 * Kinds of object a mark can be attached to.  The values double as indices
 * into fsnotify_iter_info.marks[] and as bit positions for the
 * FSNOTIFY_OBJ_TYPE_*_FL masks.
 */
enum fsnotify_obj_type {
        FSNOTIFY_OBJ_TYPE_INODE,
        FSNOTIFY_OBJ_TYPE_PARENT,
        FSNOTIFY_OBJ_TYPE_VFSMOUNT,
        FSNOTIFY_OBJ_TYPE_SB,
        FSNOTIFY_OBJ_TYPE_COUNT,        /* number of real types above */
        /* shares the value of _COUNT, so it is rejected by fsnotify_valid_obj_type() */
        FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
};

/* One bit per object type: bit N corresponds to enum fsnotify_obj_type value N. */
#define FSNOTIFY_OBJ_TYPE_INODE_FL        (1U << FSNOTIFY_OBJ_TYPE_INODE)
#define FSNOTIFY_OBJ_TYPE_PARENT_FL        (1U << FSNOTIFY_OBJ_TYPE_PARENT)
#define FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL        (1U << FSNOTIFY_OBJ_TYPE_VFSMOUNT)
#define FSNOTIFY_OBJ_TYPE_SB_FL                (1U << FSNOTIFY_OBJ_TYPE_SB)
/* All bits below FSNOTIFY_OBJ_TYPE_COUNT set */
#define FSNOTIFY_OBJ_ALL_TYPES_MASK        ((1U << FSNOTIFY_OBJ_TYPE_COUNT) - 1)

static inline bool fsnotify_valid_obj_type(unsigned int type)
{
        return (type < FSNOTIFY_OBJ_TYPE_COUNT);
}

/*
 * Per-event iteration state handed to handle_event(): one mark slot per
 * object type plus a bitmask saying which slots should be reported.
 */
struct fsnotify_iter_info {
        struct fsnotify_mark *marks[FSNOTIFY_OBJ_TYPE_COUNT];        /* indexed by enum fsnotify_obj_type */
        unsigned int report_mask;        /* bit (1U << type) set => marks[type] is to be reported */
        int srcu_idx;                        /* NOTE(review): presumably the SRCU read-side index - confirm */
};

/* True when the bit for object type @type is set in @iter_info->report_mask. */
static inline bool fsnotify_iter_should_report_type(
                struct fsnotify_iter_info *iter_info, int type)
{
        const unsigned int type_bit = 1U << type;

        return (iter_info->report_mask & type_bit) != 0;
}

/* Set the bit for object type @type in @iter_info->report_mask. */
static inline void fsnotify_iter_set_report_type(
                struct fsnotify_iter_info *iter_info, int type)
{
        const unsigned int type_bit = 1U << type;

        iter_info->report_mask = iter_info->report_mask | type_bit;
}

/*
 * Store @mark in the slot for object type @type and flag that slot in
 * @iter_info->report_mask.
 */
static inline void fsnotify_iter_set_report_type_mark(
                struct fsnotify_iter_info *iter_info, int type,
                struct fsnotify_mark *mark)
{
        const unsigned int type_bit = 1U << type;

        iter_info->report_mask |= type_bit;
        iter_info->marks[type] = mark;
}

/*
 * Generate fsnotify_iter_<name>_mark(): returns the mark stored for object
 * type FSNOTIFY_OBJ_TYPE_<NAME> when that type's bit is set in report_mask,
 * and NULL otherwise.
 */
#define FSNOTIFY_ITER_FUNCS(name, NAME) \
static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
                struct fsnotify_iter_info *iter_info) \
{ \
        return (iter_info->report_mask & FSNOTIFY_OBJ_TYPE_##NAME##_FL) ? \
                iter_info->marks[FSNOTIFY_OBJ_TYPE_##NAME] : NULL; \
}

/* Instantiates fsnotify_iter_{inode,parent,vfsmount,sb}_mark() */
FSNOTIFY_ITER_FUNCS(inode, INODE)
FSNOTIFY_ITER_FUNCS(parent, PARENT)
FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT)
FSNOTIFY_ITER_FUNCS(sb, SB)

/*
 * Loop header that steps 'type' from 0 up to (but not including)
 * FSNOTIFY_OBJ_TYPE_COUNT.  'type' must be an assignable integer variable.
 */
#define fsnotify_foreach_obj_type(type) \
        for (type = 0; type < FSNOTIFY_OBJ_TYPE_COUNT; type++)

/*
 * fsnotify_connp_t is the pointer type embedded in objects to which a
 * connector can be attached.  A fsnotify_connp_t * is how the connector
 * refers back to its object.  The pointer is __rcu annotated.
 */
struct fsnotify_mark_connector;
typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;

/*
 * Inode/vfsmount/sb point to this structure which tracks all marks attached to
 * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this
 * structure. We destroy this structure when there are no more marks attached
 * to it. The structure is protected by fsnotify_mark_srcu.
 *
 * Text in brackets names the lock protecting a field.
 */
struct fsnotify_mark_connector {
        spinlock_t lock;
        unsigned short type;        /* Type of object [lock] */
#define FSNOTIFY_CONN_FLAG_HAS_FSID        0x01
        unsigned short flags;        /* flags [lock] */
        __kernel_fsid_t fsid;        /* fsid of filesystem containing object */
        union {
                /* Object pointer [lock] */
                fsnotify_connp_t *obj;
                /* Links connectors queued to be freed after the SRCU period expires */
                struct fsnotify_mark_connector *destroy_next;
        };
        struct hlist_head list;        /* NOTE(review): presumably the marks attached to the object, linked via fsnotify_mark.obj_list - confirm */
};

/*
 * A mark is simply an object attached to an in core inode which allows an
 * fsnotify listener to indicate they are either no longer interested in events
 * of a type matching mask or only interested in those events.
 *
 * These are flushed when an inode is evicted from core and may be flushed
 * when the inode is modified (as seen by fsnotify_access).  Some fsnotify
 * users (such as dnotify) will flush these when the open fd is closed and not
 * at inode eviction or modification.
 *
 * Text in brackets is showing the lock(s) protecting modifications of a
 * particular entry. obj_lock means either inode->i_lock or
 * mnt->mnt_root->d_lock depending on the mark type.
 */
struct fsnotify_mark {
        /* Mask this mark is for [mark->lock, group->mark_mutex] */
        __u32 mask;
        /* We hold one for presence in g_list. Also one ref for each 'thing'
         * in kernel that found and may be using this mark. */
        refcount_t refcnt;
        /* Group this mark is for. Set on mark creation, stable until last ref
         * is dropped */
        struct fsnotify_group *group;
        /* List of marks by group->marks_list. Also reused for queueing
         * mark into destroy_list when it's waiting for the end of SRCU period
         * before it can be freed. [group->mark_mutex] */
        struct list_head g_list;
        /* Protects inode / mnt pointers, flags, masks */
        spinlock_t lock;
        /* List of marks for inode / vfsmount [connector->lock, mark ref] */
        struct hlist_node obj_list;
        /* Head of list of marks for an object [mark ref] */
        struct fsnotify_mark_connector *connector;
        /* Events types to ignore [mark->lock, group->mark_mutex] */
        __u32 ignored_mask;
        /* Bit values for the flags field below */
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY        0x01
#define FSNOTIFY_MARK_FLAG_ALIVE                0x02
#define FSNOTIFY_MARK_FLAG_ATTACHED                0x04
        unsigned int flags;                /* flags [mark->lock] */
};

#ifdef CONFIG_FSNOTIFY

/*
 * called from the vfs helpers
 *
 * When CONFIG_FSNOTIFY is not set, the #else branch further down supplies
 * inline stubs for these entry points instead.
 */

/* main fsnotify call to send events */
extern int fsnotify(__u32 mask, const void *data, int data_type,
                    struct inode *dir, const struct qstr *name,
                    struct inode *inode, u32 cookie);
/* NOTE(review): by its name, notifies the parent of @dentry about an event - confirm */
extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                           int data_type);
/* Object teardown hooks for inode, vfsmount and super block respectively */
extern void __fsnotify_inode_delete(struct inode *inode);
extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
extern void fsnotify_sb_delete(struct super_block *sb);
/* Returns a u32 cookie; the handle_event() docs above call it the inotify rename cookie */
extern u32 fsnotify_get_cookie(void);

/*
 * Reduce @mask to the events that can be reported with parent/name info.
 *
 * FS_EVENT_ON_CHILD is set on marks that want parent/name info; without it
 * the answer is 0.  With it, the object might be watched by such a mark, so
 * return the subset of @mask that lies within FS_EVENTS_POSS_TO_PARENT.
 */
static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
{
        if (mask & FS_EVENT_ON_CHILD)
                return mask & FS_EVENTS_POSS_TO_PARENT;

        return 0;
}

/*
 * Non-zero when @inode watches events on its children.
 *
 * FS_EVENT_ON_CHILD in i_fsnotify_mask says the inode may care; if so, the
 * result is the part of i_fsnotify_mask covering events that can actually
 * happen on a child (FS_EVENTS_POSS_ON_CHILD).  Otherwise 0.
 */
static inline int fsnotify_inode_watches_children(struct inode *inode)
{
        if (inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)
                return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD;

        return 0;
}

/*
 * Refresh DCACHE_FSNOTIFY_PARENT_WATCHED on @dentry so that it reflects
 * whether the parent's inode currently wants events about its children.
 *
 * The caller must hold dentry->d_lock (asserted below).  That lock is what
 * serialises setting PARENT_WATCHED: if the parent's watch state changes
 * after d_lock was taken, the subsequent
 * __fsnotify_update_child_dentry_flags call will find this dentry, spin
 * until we are done here, and then apply the new state.
 */
static inline void fsnotify_update_flags(struct dentry *dentry)
{
        assert_spin_locked(&dentry->d_lock);

        if (!fsnotify_inode_watches_children(dentry->d_parent->d_inode)) {
                dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                return;
        }

        dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
}

/* called from fsnotify listeners, such as fanotify or dnotify */

/* create a new group that will use @ops */
extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
/* get reference to a group */
extern void fsnotify_get_group(struct fsnotify_group *group);
/* drop reference on a group from fsnotify_alloc_group */
extern void fsnotify_put_group(struct fsnotify_group *group);
/* group destruction begins, stop queuing new events */
extern void fsnotify_group_stop_queueing(struct fsnotify_group *group);
/* destroy group */
extern void fsnotify_destroy_group(struct fsnotify_group *group);
/* fasync handler function */
extern int fsnotify_fasync(int fd, struct file *file, int on);
/* Free event from memory */
extern void fsnotify_destroy_event(struct fsnotify_group *group,
                                   struct fsnotify_event *event);
/*
 * attach the event to the group notification queue.
 * @merge may be NULL (fsnotify_queue_overflow() passes NULL).
 * NOTE(review): presumably @merge lets the caller coalesce @event with one
 * already on the list - confirm against the implementation.
 */
extern int fsnotify_add_event(struct fsnotify_group *group,
                              struct fsnotify_event *event,
                              int (*merge)(struct list_head *,
                                           struct fsnotify_event *));
/*
 * Queue @group's pre-set overflow_event on its notification queue.  No merge
 * callback is supplied and the result of fsnotify_add_event() is ignored.
 */
static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
{
        struct fsnotify_event *overflow = group->overflow_event;

        fsnotify_add_event(group, overflow, NULL);
}

/* true if the group notification queue is empty */
extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
/* return, but do not dequeue, the first event on the notification queue */
extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
/* return AND dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
/* Remove an event that is queued in the notification list */
extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
                                         struct fsnotify_event *event);

/* functions used to manipulate the marks attached to inodes */

/* Get mask of events for a list of marks */
extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn);
/* Calculate mask of events for a list of marks */
extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn);
/* Initialise @mark for use with @group */
extern void fsnotify_init_mark(struct fsnotify_mark *mark,
                               struct fsnotify_group *group);
/* Find mark belonging to given group in the list of marks */
extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
                                                struct fsnotify_group *group);
/* Get cached fsid of filesystem containing object */
extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn,
                                  __kernel_fsid_t *fsid);
/*
 * attach the mark to the object.
 * @fsid may be NULL (the inode helpers below pass NULL).
 * NOTE(review): the _locked variant presumably expects the caller to
 * already hold the group's mark_mutex - confirm.
 */
extern int fsnotify_add_mark(struct fsnotify_mark *mark,
                             fsnotify_connp_t *connp, unsigned int type,
                             int allow_dups, __kernel_fsid_t *fsid);
extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
                                    fsnotify_connp_t *connp,
                                    unsigned int type, int allow_dups,
                                    __kernel_fsid_t *fsid);

/* attach the mark to the inode */
static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                                          struct inode *inode,
                                          int allow_dups)
{
        return fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
                                 FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL);
}
static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
                                                 struct inode *inode,
                                                 int allow_dups)
{
        return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks,
                                        FSNOTIFY_OBJ_TYPE_INODE, allow_dups,
                                        NULL);
}

/* given a group and a mark, flag mark to be freed when all references are dropped */
extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                                  struct fsnotify_group *group);
/* detach mark from inode / mount list, group list, drop inode reference */
extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
/* free mark */
extern void fsnotify_free_mark(struct fsnotify_mark *mark);
/* Wait until all marks queued for destruction are destroyed */
extern void fsnotify_wait_marks_destroyed(void);
/*
 * run all the marks in a group, and clear all of the marks attached to given object type.
 * NOTE(review): despite its name, the inline callers in this header pass a
 * FSNOTIFY_OBJ_TYPE_*_FL bit flag as @type, not an enum fsnotify_obj_type
 * value.
 */
extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type);
/* Walk all the marks in @group and clear the ones attached to vfsmounts. */
static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int vfsmount_bit = FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL;

        fsnotify_clear_marks_by_group(group, vfsmount_bit);
}
/* Walk all the marks in @group and clear the ones attached to inodes. */
static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int inode_bit = FSNOTIFY_OBJ_TYPE_INODE_FL;

        fsnotify_clear_marks_by_group(group, inode_bit);
}
/* Walk all the marks in @group and clear the ones attached to super blocks (sb). */
static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int sb_bit = FSNOTIFY_OBJ_TYPE_SB_FL;

        fsnotify_clear_marks_by_group(group, sb_bit);
}
extern void fsnotify_get_mark(struct fsnotify_mark *mark);
extern void fsnotify_put_mark(struct fsnotify_mark *mark);
extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);

/*
 * Prepare a freshly allocated @event for use: store @objectid in it and
 * initialise its embedded list head.
 */
static inline void fsnotify_init_event(struct fsnotify_event *event,
                                       unsigned long objectid)
{
        /* The two fields are independent, so the order does not matter. */
        event->objectid = objectid;
        INIT_LIST_HEAD(&event->list);
}

#else

/*
 * Stub in the #else branch of the CONFIG_FSNOTIFY conditional: ignores every
 * argument and always returns 0.
 */
static inline int fsnotify(__u32 mask, const void *data, int data_type,
                           struct inode *dir, const struct qstr *name,
                           struct inode *inode, u32 cookie)
{
        return 0;
}

/*
 * Stub in the #else branch of the CONFIG_FSNOTIFY conditional: ignores every
 * argument and always returns 0.
 */
static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        return 0;
}

/* Stub in the #else branch of the CONFIG_FSNOTIFY conditional: does nothing. */
static inline void __fsnotify_inode_delete(struct inode *inode)
{}

/* Stub in the #else branch of the CONFIG_FSNOTIFY conditional: does nothing. */
static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{}

/* Stub in the #else branch of the CONFIG_FSNOTIFY conditional: does nothing. */
static inline void fsnotify_sb_delete(struct super_block *sb)
{}

/* Stub in the #else branch of the CONFIG_FSNOTIFY conditional: does nothing. */
static inline void fsnotify_update_flags(struct dentry *dentry)
{}

/*
 * Stub in the #else branch of the CONFIG_FSNOTIFY conditional: always
 * returns 0.
 */
static inline u32 fsnotify_get_cookie(void)
{
        return 0;
}

/* Stub in the #else branch of the CONFIG_FSNOTIFY conditional: does nothing. */
static inline void fsnotify_unmount_inodes(struct super_block *sb)
{}

#endif        /* CONFIG_FSNOTIFY */

#endif        /* __KERNEL__ */

#endif        /* __LINUX_FSNOTIFY_BACKEND_H */




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _DELAYED_CALL_H
#define _DELAYED_CALL_H

/*
 * Poor man's closures; I wish we could've done them sanely polymorphic,
 * but...
 */

/*
 * A deferred callback: @fn is the function to run later and @arg is the
 * opaque pointer that will be handed to it.  A NULL @fn means nothing is
 * pending.
 */
struct delayed_call {
        void (*fn)(void *);
        void *arg;
};

/* Define an empty delayed_call called @name (no function, no argument). */
#define DEFINE_DELAYED_CALL(name) \
        struct delayed_call name = { .fn = NULL, .arg = NULL }

/*
 * Arm @call so that a later do_delayed_call() invokes @fn(@arg).
 * There is no type checking between @fn and @arg beyond void *.
 */
static inline void set_delayed_call(struct delayed_call *call,
                void (*fn)(void *), void *arg)
{
        *call = (struct delayed_call){ .fn = fn, .arg = arg };
}

/* Invoke the stored function on the stored argument; no-op if none is set. */
static inline void do_delayed_call(struct delayed_call *call)
{
        if (!call->fn)
                return;

        call->fn(call->arg);
}

/* Disarm @call by resetting the function pointer; @arg is left untouched. */
static inline void clear_delayed_call(struct delayed_call *call)
{
        call->fn = NULL;
}
#endif





































    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/fault-inject.h>
#include <linux/fault-inject-usercopy.h>

/*
 * Fault-injection state for usercopy failures.  The attributes start out as
 * FAULT_ATTR_INITIALIZER, can be set through the "fail_usercopy=" boot
 * parameter, are exposed via a "fail_usercopy" debugfs attribute when
 * CONFIG_FAULT_INJECTION_DEBUG_FS is set, and are consulted by
 * should_fail_usercopy().
 */
static struct {
        struct fault_attr attr;
} fail_usercopy = {
        .attr = FAULT_ATTR_INITIALIZER,
};

/*
 * Handler for the "fail_usercopy=" boot parameter: passes the option string
 * @str to setup_fault_attr() for fail_usercopy.attr and returns its result.
 */
static int __init setup_fail_usercopy(char *str)
{
        struct fault_attr *attr = &fail_usercopy.attr;

        return setup_fault_attr(attr, str);
}
__setup("fail_usercopy=", setup_fail_usercopy);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Late initcall that creates the "fail_usercopy" fault-injection debugfs
 * attribute (NULL parent) for fail_usercopy.attr.
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * fault_create_debugfs_attr().
 */
static int __init fail_usercopy_debugfs(void)
{
        struct dentry *dir = fault_create_debugfs_attr("fail_usercopy", NULL,
                                                       &fail_usercopy.attr);

        return IS_ERR(dir) ? PTR_ERR(dir) : 0;
}

late_initcall(fail_usercopy_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

/*
 * Ask the fault-injection core whether a usercopy should be made to fail:
 * returns the result of should_fail() on fail_usercopy.attr with a size
 * argument of 1.
 */
bool should_fail_usercopy(void)
{
        struct fault_attr *attr = &fail_usercopy.attr;

        return should_fail(attr, 1);
}
EXPORT_SYMBOL_GPL(should_fail_usercopy);














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <uapi/linux/perf_event.h>
#include <uapi/linux/bpf_perf_event.h>

/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
#endif

/*
 * Table of argument-less callbacks for querying guest state.
 * NOTE(review): presumably filled in by a hypervisor; the registration
 * interface is not in this part of the header -- confirm.
 */
struct perf_guest_info_callbacks {
        int                                (*is_in_guest)(void);
        int                                (*is_user_mode)(void);
        unsigned long                        (*get_guest_ip)(void);
        void                                (*handle_intel_pt_intr)(void);
};

#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/irq_work.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
#include <linux/security.h>
#include <asm/local.h>

/*
 * A call chain: a 64-bit @nr followed by a flexible array of 64-bit @ip
 * values.
 * NOTE(review): @nr presumably counts the valid ip[] slots -- confirm.
 */
struct perf_callchain_entry {
        __u64                                nr;
        __u64                                ip[]; /* /proc/sys/kernel/perf_event_max_stack */
};

/*
 * Bookkeeping wrapped around a perf_callchain_entry: a pointer to the
 * @entry plus counters and limits.
 * NOTE(review): the exact meaning of each counter is defined by the
 * callchain helpers, which are outside this chunk -- confirm before relying
 * on it.
 */
struct perf_callchain_entry_ctx {
        struct perf_callchain_entry *entry;
        u32                            max_stack;
        u32                            nr;
        short                            contexts;
        bool                            contexts_maxed;
};

/*
 * Copy callback type used by struct perf_raw_frag: takes a destination, a
 * source, an offset and a length.
 * NOTE(review): the meaning of the unsigned long return value is not visible
 * here -- confirm against the callers.
 */
typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
                                     unsigned long off, unsigned long len);

/*
 * One fragment of raw sample data.  @data/@size describe the fragment,
 * @copy is a perf_copy_f callback for it, and @next points at another
 * fragment.  @next shares storage with the unsigned long @pad, and the
 * struct is __packed.
 */
struct perf_raw_frag {
        union {
                struct perf_raw_frag        *next;
                unsigned long                pad;
        };
        perf_copy_f                        copy;
        void                                *data;
        u32                                size;
} __packed;

/*
 * A raw record: an embedded first fragment @frag plus a @size.
 * NOTE(review): @size is presumably the total across all chained
 * fragments -- confirm.
 */
struct perf_raw_record {
        struct perf_raw_frag                frag;
        u32                                size;
};

/*
 * branch stack layout:
 *  nr: number of taken branches stored in entries[]
 *  hw_idx: The low level index of raw branch records
 *          for the most recent branch.
 *          -1ULL means invalid/unknown.
 *
 * Note that nr can vary from sample to sample.
 * Branches (to, from) are stored from most recent
 * to least recent, i.e., entries[0] contains the most
 * recent branch.
 * The entries[] is an abstraction of raw branch records,
 * which may not be stored in age order in HW, e.g. Intel LBR.
 * The hw_idx is to expose the low level index of raw
 * branch record for the most recent branch aka entries[0].
 * The hw_idx index is between -1 (unknown) and max depth,
 * which can be retrieved in /sys/devices/cpu/caps/branches.
 * For the architectures whose raw branch records are
 * already stored in age order, the hw_idx should be 0.
 */
struct perf_branch_stack {
        __u64                                nr;        /* number of entries[] */
        __u64                                hw_idx;        /* -1ULL: invalid/unknown */
        struct perf_branch_entry        entries[];        /* most recent first */
};

struct task_struct;

/*
 * Describes one extra PMU register associated with an event; embedded
 * twice in struct hw_perf_event (extra_reg and branch_reg).
 */
struct hw_perf_event_extra {
        u64                config;        /* register value */
        unsigned int        reg;        /* register address or index */
        int                alloc;        /* extra register already allocated */
        int                idx;        /* index in shared_regs->regs[] */
};

/**
 * struct hw_perf_event - performance event hardware details:
 *
 * All members are compiled in only when CONFIG_PERF_EVENTS is set; otherwise
 * the struct has no members.
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
        /*
         * Anonymous union: the per-kind structs below (hardware, software,
         * tracepoint, ...) share the same storage.
         */
        union {
                struct { /* hardware */
                        u64                config;
                        u64                last_tag;
                        unsigned long        config_base;
                        unsigned long        event_base;
                        int                event_base_rdpmc;
                        int                idx;
                        int                last_cpu;
                        int                flags;

                        struct hw_perf_event_extra extra_reg;
                        struct hw_perf_event_extra branch_reg;
                };
                struct { /* software */
                        struct hrtimer        hrtimer;
                };
                struct { /* tracepoint */
                        /* for tp_event->class */
                        struct list_head        tp_list;
                };
                struct { /* amd_power */
                        u64        pwr_acc;
                        u64        ptsc;
                };
#ifdef CONFIG_HAVE_HW_BREAKPOINT
                struct { /* breakpoint */
                        /*
                         * Crufty hack to avoid the chicken and egg
                         * problem hw_breakpoint has with context
                         * creation and event initialization.
                         */
                        struct arch_hw_breakpoint        info;
                        struct list_head                bp_list;
                };
#endif
                struct { /* amd_iommu */
                        u8        iommu_bank;
                        u8        iommu_cntr;
                        u16        padding;
                        u64        conf;
                        u64        conf1;
                };
        };
        /*
         * If the event is a per task event, this will point to the task in
         * question. See the comment in perf_event_alloc().
         */
        struct task_struct                *target;

        /*
         * PMU would store hardware filter configuration
         * here.
         */
        void                                *addr_filters;

        /* Last sync'ed generation of filters */
        unsigned long                        addr_filters_gen;

/*
 * hw_perf_event::state flags; used to track the PERF_EF_* state.
 */
#define PERF_HES_STOPPED        0x01 /* the counter is stopped */
#define PERF_HES_UPTODATE        0x02 /* event->count up-to-date */
#define PERF_HES_ARCH                0x04

        /* Bitwise OR of the PERF_HES_* flags defined just above. */
        int                                state;

        /*
         * The last observed hardware counter value, updated with a
         * local64_cmpxchg() such that pmu::read() can be called nested.
         */
        local64_t                        prev_count;

        /*
         * The period to start the next sample with.
         */
        u64                                sample_period;

        /* The two structs below share storage. */
        union {
                struct { /* Sampling */
                        /*
                         * The period we started this sample with.
                         */
                        u64                                last_period;

                        /*
                         * However much is left of the current period;
                         * note that this is a full 64bit value and
                         * allows for generation of periods longer
                         * than hardware might allow.
                         */
                        local64_t                        period_left;
                };
                struct { /* Topdown events counting for context switch */
                        u64                                saved_metric;
                        u64                                saved_slots;
                };
        };

        /*
         * State for throttling the event, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                             interrupts_seq;
        u64                                interrupts;

        /*
         * State for freq target events, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                                freq_time_stamp;
        u64                                freq_count_stamp;
#endif
};

struct perf_event;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn.
 * NOTE(review): these look like the values carried by the txn_flags
 * argument of pmu::start_txn() -- confirm against the callers.
 */
#define PERF_PMU_TXN_ADD  0x1                /* txn to add/schedule event on PMU */
#define PERF_PMU_TXN_READ 0x2                /* txn to read event group from PMU */

/*
 * pmu::capabilities flags
 *
 * Each value is a distinct single bit, so several of them can be OR-ed
 * together in the int-sized pmu::capabilities field.
 */
#define PERF_PMU_CAP_NO_INTERRUPT                0x01
#define PERF_PMU_CAP_NO_NMI                        0x02
#define PERF_PMU_CAP_AUX_NO_SG                        0x04
#define PERF_PMU_CAP_EXTENDED_REGS                0x08
#define PERF_PMU_CAP_EXCLUSIVE                        0x10
#define PERF_PMU_CAP_ITRACE                        0x20
#define PERF_PMU_CAP_HETEROGENEOUS_CPUS                0x40
#define PERF_PMU_CAP_NO_EXCLUDE                        0x80
#define PERF_PMU_CAP_AUX_OUTPUT                        0x100

struct perf_output_handle;

/**
 * struct pmu - generic performance monitoring unit
 *
 * Mostly a table of callbacks operating on a struct perf_event or on the
 * PMU itself.  Callbacks tagged "optional" in the comments below need not be
 * provided.
 */
struct pmu {
        struct list_head                entry;

        struct module                        *module;
        struct device                        *dev;
        const struct attribute_group        **attr_groups;
        const struct attribute_group        **attr_update;
        const char                        *name;
        int                                type;

        /*
         * various common per-pmu feature flags
         */
        int                                capabilities;

        int __percpu                        *pmu_disable_count;
        struct perf_cpu_context __percpu *pmu_cpu_context;
        atomic_t                        exclusive_cnt; /* < 0: cpu; > 0: tsk */
        int                                task_ctx_nr;
        int                                hrtimer_interval_ms;

        /* number of address filters this PMU can do */
        unsigned int                        nr_addr_filters;

        /*
         * Fully disable/enable this PMU, can be used to protect from the PMI
         * as well as for lazy/batch writing of the MSRs.
         */
        void (*pmu_enable)                (struct pmu *pmu); /* optional */
        void (*pmu_disable)                (struct pmu *pmu); /* optional */

        /*
         * Try and initialize the event for this PMU.
         *
         * Returns:
         *  -ENOENT        -- @event is not for this PMU
         *
         *  -ENODEV        -- @event is for this PMU but PMU not present
         *  -EBUSY        -- @event is for this PMU but PMU temporarily unavailable
         *  -EINVAL        -- @event is for this PMU but @event is not valid
         *  -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
         *  -EACCES        -- @event is for this PMU, @event is valid, but no privileges
         *
         *  0                -- @event is for this PMU and valid
         *
         * Other error return values are allowed.
         */
        int (*event_init)                (struct perf_event *event);

        /*
         * Notification that the event was mapped or unmapped.  Called
         * in the context of the mapping task.
         */
        void (*event_mapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */
        void (*event_unmapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */

        /*
         * Flags for ->add()/->del()/ ->start()/->stop(). There are
         * matching hw_perf_event::state flags.
         */
#define PERF_EF_START        0x01                /* start the counter when adding    */
#define PERF_EF_RELOAD        0x02                /* reload the counter when starting */
#define PERF_EF_UPDATE        0x04                /* update the counter when stopping */

        /*
         * Adds/Removes a counter to/from the PMU, can be done inside a
         * transaction, see the ->*_txn() methods.
         *
         * The add/del callbacks will reserve all hardware resources required
         * to service the event, this includes any counter constraint
         * scheduling etc.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on.
         *
         * ->add() called without PERF_EF_START should result in the same state
         *  as ->add() followed by ->stop().
         *
         * ->del() must always PERF_EF_UPDATE stop an event. If it calls
         *  ->stop() that must deal with already being stopped without
         *  PERF_EF_UPDATE.
         */
        int  (*add)                        (struct perf_event *event, int flags);
        void (*del)                        (struct perf_event *event, int flags);

        /*
         * Starts/Stops a counter present on the PMU.
         *
         * The PMI handler should stop the counter when perf_event_overflow()
         * returns !0. ->start() will be used to continue.
         *
         * Also used to change the sample period.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on -- will be called from NMI context when the PMU generates
         * NMIs.
         *
         * ->stop() with PERF_EF_UPDATE will read the counter and update
         *  period/count values like ->read() would.
         *
         * ->start() with PERF_EF_RELOAD will reprogram the counter
         *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
         */
        void (*start)                        (struct perf_event *event, int flags);
        void (*stop)                        (struct perf_event *event, int flags);

        /*
         * Updates the counter value of the event.
         *
         * For sampling capable PMUs this will also update the software period
         * hw_perf_event::period_left field.
         */
        void (*read)                        (struct perf_event *event);

        /*
         * Group events scheduling is treated as a transaction, add
         * group events as a whole and perform one schedulability test.
         * If the test fails, roll back the whole group
         *
         * Start the transaction, after this ->add() doesn't need to
         * do schedulability tests.
         *
         * Optional.
         */
        void (*start_txn)                (struct pmu *pmu, unsigned int txn_flags);
        /*
         * If ->start_txn() disabled the ->add() schedulability test
         * then ->commit_txn() is required to perform one. On success
         * the transaction is closed. On error the transaction is kept
         * open until ->cancel_txn() is called.
         *
         * Optional.
         */
        int  (*commit_txn)                (struct pmu *pmu);
        /*
         * Will cancel the transaction, assumes ->del() is called
         * for each successful ->add() during the transaction.
         *
         * Optional.
         */
        void (*cancel_txn)                (struct pmu *pmu);

        /*
         * Will return the value for perf_event_mmap_page::index for this event,
         * if no implementation is provided it will default to: event->hw.idx + 1.
         */
        int (*event_idx)                (struct perf_event *event); /* optional */

        /*
         * context-switches callback
         */
        void (*sched_task)                (struct perf_event_context *ctx,
                                        bool sched_in);

        /*
         * Kmem cache of PMU specific data
         */
        struct kmem_cache                *task_ctx_cache;

        /*
         * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
         * can be synchronized using this function. See Intel LBR callstack support
         * implementation and Perf core context switch handling callbacks for usage
         * examples.
         */
        void (*swap_task_ctx)                (struct perf_event_context *prev,
                                         struct perf_event_context *next);
                                        /* optional */

        /*
         * Set up pmu-private data structures for an AUX area
         */
        void *(*setup_aux)                (struct perf_event *event, void **pages,
                                         int nr_pages, bool overwrite);
                                        /* optional */

        /*
         * Free pmu-private AUX data structures
         */
        void (*free_aux)                (void *aux); /* optional */

        /*
         * Take a snapshot of the AUX buffer without touching the event
         * state, so that preempting ->start()/->stop() callbacks does
         * not interfere with their logic. Called in PMI context.
         *
         * Returns the size of AUX data copied to the output handle.
         *
         * Optional.
         */
        long (*snapshot_aux)                (struct perf_event *event,
                                         struct perf_output_handle *handle,
                                         unsigned long size);

        /*
         * Validate address range filters: make sure the HW supports the
         * requested configuration and number of filters; return 0 if the
         * supplied filters are valid, -errno otherwise.
         *
         * Runs in the context of the ioctl()ing process and is not serialized
         * with the rest of the PMU callbacks.
         */
        int (*addr_filters_validate)        (struct list_head *filters);
                                        /* optional */

        /*
         * Synchronize address range filter configuration:
         * translate hw-agnostic filters into hardware configuration in
         * event::hw::addr_filters.
         *
         * Runs as a part of filter sync sequence that is done in ->start()
         * callback by calling perf_event_addr_filters_sync().
         *
         * May (and should) traverse event::addr_filters::list, for which its
         * caller provides necessary serialization.
         */
        void (*addr_filters_sync)        (struct perf_event *event);
                                        /* optional */

        /*
         * Check if event can be used for aux_output purposes for
         * events of this PMU.
         *
         * Runs from perf_event_open(). Should return 0 for "no match"
         * or non-zero for "match".
         */
        int (*aux_output_match)                (struct perf_event *event);
                                        /* optional */

        /*
         * Filter events for PMU-specific reasons.
         */
        int (*filter_match)                (struct perf_event *event); /* optional */

        /*
         * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
         */
        int (*check_period)                (struct perf_event *event, u64 value); /* optional */
};

/*
 * Action attached to an address range filter; stored in
 * perf_addr_filter::action.  The values start at 0 (STOP) and increase by
 * one per enumerator.
 */
enum perf_addr_filter_action_t {
        PERF_ADDR_FILTER_ACTION_STOP = 0,
        PERF_ADDR_FILTER_ACTION_START,
        PERF_ADDR_FILTER_ACTION_FILTER,
};

/**
 * struct perf_addr_filter - address range filter definition
 * @entry:        event's filter list linkage
 * @path:        object file's path for file-based filters
 * @offset:        filter range offset
 * @size:        filter range size (size==0 means single address trigger)
 * @action:        filter/start/stop, i.e. one of
 *                PERF_ADDR_FILTER_ACTION_{FILTER,START,STOP}
 *
 * This is a hardware-agnostic filter configuration as specified by the user.
 */
struct perf_addr_filter {
        struct list_head        entry;
        struct path                path;
        unsigned long                offset;
        unsigned long                size;
        enum perf_addr_filter_action_t        action;
};

/**
 * struct perf_addr_filters_head - container for address range filters
 * @list:        list of filters for this event
 * @lock:        spinlock that serializes accesses to the @list and event's
 *                (and its children's) filter generations.
 * @nr_file_filters:        number of file-based filters
 *
 * A child event will use parent's @list (and therefore @lock), so they are
 * bundled together; see perf_event_addr_filters(), which returns the
 * parent's head whenever the event has a parent.
 */
struct perf_addr_filters_head {
        struct list_head        list;
        raw_spinlock_t                lock;
        unsigned int                nr_file_filters;
};

/*
 * A (start, size) pair describing one address range.
 *
 * struct perf_event points at an array of these through
 * ->addr_filter_ranges, described there as the "vma address array for
 * file-based filters".
 * NOTE(review): units and whether size == 0 has a special meaning here are
 * not stated in this header -- confirm against the users.
 */
struct perf_addr_filter_range {
        unsigned long                start;
        unsigned long                size;
};

/**
 * enum perf_event_state - the states of an event:
 *
 * The numeric values are part of the definition: INACTIVE is 0, ACTIVE is
 * the only positive value, and OFF/ERROR/EXIT/DEAD are increasingly
 * negative, so the states form a total order.
 * NOTE(review): callers presumably rely on that order for range
 * comparisons -- confirm before renumbering.
 */
enum perf_event_state {
        PERF_EVENT_STATE_DEAD                = -4,
        PERF_EVENT_STATE_EXIT                = -3,
        PERF_EVENT_STATE_ERROR                = -2,
        PERF_EVENT_STATE_OFF                = -1,
        PERF_EVENT_STATE_INACTIVE        =  0,
        PERF_EVENT_STATE_ACTIVE                =  1,
};

/* Forward declarations: only pointers to these are needed at this point. */
struct file;
struct perf_sample_data;

/*
 * Signature of an event overflow callback. It receives the event, the
 * sample data and a register set. This is the type of
 * perf_event::overflow_handler and of the @callback argument of
 * perf_event_create_kernel_counter().
 */
typedef void (*perf_overflow_handler_t)(struct perf_event *,
                                        struct perf_sample_data *,
                                        struct pt_regs *regs);

/*
 * Event capabilities, used as bit flags in perf_event::event_caps and
 * perf_event::group_caps.
 *
 * PERF_EV_CAP_SOFTWARE: Is a software event.
 * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
 * from any CPU in the package where it is active.
 * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
 * cannot be a group leader. If an event with this flag is detached from the
 * group it is scheduled out and moved into an unrecoverable ERROR state.
 */
#define PERF_EV_CAP_SOFTWARE                BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG        BIT(1)
#define PERF_EV_CAP_SIBLING                BIT(2)

/* Hash list dimensions: 8 bits of index, i.e. 1 << 8 = 256 list heads. */
#define SWEVENT_HLIST_BITS                8
#define SWEVENT_HLIST_SIZE                (1 << SWEVENT_HLIST_BITS)

/*
 * Fixed-size table of SWEVENT_HLIST_SIZE hlist heads.
 * NOTE(review): the embedded rcu_head suggests the table is released via
 * RCU -- confirm in the code that allocates and frees it.
 */
struct swevent_hlist {
        struct hlist_head                heads[SWEVENT_HLIST_SIZE];
        struct rcu_head                        rcu_head;
};

/*
 * Attach-state flags; each value is a distinct single bit so they can be
 * OR-ed together.
 * NOTE(review): presumably stored in perf_event::attach_state -- confirm.
 */
#define PERF_ATTACH_CONTEXT        0x01
#define PERF_ATTACH_GROUP        0x02
#define PERF_ATTACH_TASK        0x04
#define PERF_ATTACH_TASK_DATA        0x08
#define PERF_ATTACH_ITRACE        0x10
#define PERF_ATTACH_SCHED_CB        0x20

/* Forward declarations: only pointers to these are used below. */
struct perf_cgroup;
struct perf_buffer;

/*
 * A list head bundled with a raw spinlock.
 * NOTE(review): presumably @lock serializes access to @list -- confirm
 * against the users of this structure.
 */
struct pmu_event_list {
        raw_spinlock_t                lock;
        struct list_head        list;
};

/*
 * Iterate over the siblings of @event, assigning each one to @sibling.
 *
 * The loop body runs only when @event is its own group leader; for any
 * other event the statement is a no-op.
 *
 * The guard is spelled "if (!leader) {} else <loop>" rather than
 * "if (leader) <loop>" so that the macro already owns its else branch:
 * an "else" written by the caller after the loop then pairs with the
 * caller's own "if" instead of silently attaching to the hidden one.
 *
 * @event is evaluated more than once; do not pass an expression with side
 * effects.
 */
#define for_each_sibling_event(sibling, event)                        \
        if ((event)->group_leader != (event)) {                        \
        } else                                                        \
                list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)

/**
 * struct perf_event - performance event kernel representation:
 *
 * All members are compiled in only when CONFIG_PERF_EVENTS is set;
 * otherwise the structure is empty.
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
        /*
         * entry onto perf_event_context::event_list;
         *   modifications require ctx->lock
         *   RCU safe iterations.
         */
        struct list_head                event_entry;

        /*
         * Locked for modification by both ctx->mutex and ctx->lock; holding
         * either suffices for read.
         */
        struct list_head                sibling_list;
        struct list_head                active_list;
        /*
         * Node on the pinned or flexible tree located at the event context;
         */
        struct rb_node                        group_node;
        u64                                group_index;
        /*
         * We need storage to track the entries in perf_pmu_migrate_context; we
         * cannot use the event_entry because of RCU and we want to keep the
         * group intact which avoids us using the other two entries.
         */
        struct list_head                migrate_entry;

        struct hlist_node                hlist_entry;
        struct list_head                active_entry;
        int                                nr_siblings;

        /* Not serialized. Only written during event initialization. */
        int                                event_caps;
        /* The cumulative AND of all event_caps for events in this group. */
        int                                group_caps;

        unsigned int                        group_generation;
        struct perf_event                *group_leader;
        struct pmu                        *pmu;
        void                                *pmu_private;

        enum perf_event_state                state;
        unsigned int                        attach_state;
        local64_t                        count;
        atomic64_t                        child_count;

        /*
         * These are the total time in nanoseconds that the event
         * has been enabled (i.e. eligible to run, and the task has
         * been scheduled in, if this is a per-task event)
         * and running (scheduled onto the CPU), respectively.
         */
        u64                                total_time_enabled;
        u64                                total_time_running;
        u64                                tstamp;

        struct perf_event_attr                attr;
        u16                                header_size;
        u16                                id_header_size;
        u16                                read_size;
        struct hw_perf_event                hw;

        struct perf_event_context        *ctx;
        atomic_long_t                        refcount;

        /*
         * These accumulate total time (in nanoseconds) that children
         * events have been enabled and running, respectively.
         */
        atomic64_t                        child_total_time_enabled;
        atomic64_t                        child_total_time_running;

        /*
         * Protect attach/detach and child_list:
         */
        struct mutex                        child_mutex;
        struct list_head                child_list;
        struct perf_event                *parent;

        int                                oncpu;
        int                                cpu;

        struct list_head                owner_entry;
        struct task_struct                *owner;

        /* mmap bits */
        struct mutex                        mmap_mutex;
        atomic_t                        mmap_count;

        struct perf_buffer                *rb;
        struct list_head                rb_entry;
        unsigned long                        rcu_batches;
        int                                rcu_pending;

        /* poll related */
        wait_queue_head_t                waitq;
        struct fasync_struct                *fasync;

        /* delayed work for NMIs and such */
        int                                pending_wakeup;
        int                                pending_kill;
        int                                pending_disable;
        struct irq_work                        pending;

        atomic_t                        event_limit;

        /* address range filters */
        struct perf_addr_filters_head        addr_filters;
        /* vma address array for file-based filters */
        struct perf_addr_filter_range        *addr_filter_ranges;
        unsigned long                        addr_filters_gen;

        /* for aux_output events */
        struct perf_event                *aux_event;

        void (*destroy)(struct perf_event *);
        struct rcu_head                        rcu_head;

        struct pid_namespace                *ns;
        u64                                id;

        atomic64_t                        lost_samples;

        u64                                (*clock)(void);
        perf_overflow_handler_t                overflow_handler;
        void                                *overflow_handler_context;
#ifdef CONFIG_BPF_SYSCALL
        perf_overflow_handler_t                orig_overflow_handler;
        struct bpf_prog                        *prog;
#endif

#ifdef CONFIG_EVENT_TRACING
        struct trace_event_call                *tp_event;
        struct event_filter                *filter;
#ifdef CONFIG_FUNCTION_TRACER
        struct ftrace_ops               ftrace_ops;
#endif
#endif

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp; /* cgroup event is attached to */
#endif

#ifdef CONFIG_SECURITY
        void *security;
#endif
        struct list_head                sb_list;
#endif /* CONFIG_PERF_EVENTS */
};


/*
 * An rb-tree of event groups plus a u64 index.
 *
 * struct perf_event_context embeds two of these (pinned_groups and
 * flexible_groups); struct perf_event::group_node is documented as the node
 * that sits on one of those trees.
 * NOTE(review): @index presumably feeds perf_event::group_index on
 * insertion -- confirm in the tree insertion code.
 */
struct perf_event_groups {
        struct rb_root        tree;
        u64                index;
};

/**
 * struct perf_event_context - event context structure
 *
 * Used as a container for task events and CPU events as well:
 */
struct perf_event_context {
        struct pmu                        *pmu;
        /*
         * Protect the states of the events in the list,
         * nr_active, and the list:
         */
        raw_spinlock_t                        lock;
        /*
         * Protect the list of events.  Locking either mutex or lock
         * is sufficient to ensure the list doesn't change; to change
         * the list you need to lock both the mutex and the spinlock.
         */
        struct mutex                        mutex;

        struct list_head                active_ctx_list;
        struct perf_event_groups        pinned_groups;
        struct perf_event_groups        flexible_groups;
        struct list_head                event_list;

        struct list_head                pinned_active;
        struct list_head                flexible_active;

        int                                nr_events;
        int                                nr_active;
        int                                is_active;
        int                                nr_stat;
        int                                nr_freq;
        int                                rotate_disable;
        /*
         * Set when nr_events != nr_active, except tolerant to events not
         * necessary to be active due to scheduling constraints, such as cgroups.
         */
        int                                rotate_necessary;
        refcount_t                        refcount;
        struct task_struct                *task;

        /*
         * Context clock, runs when context enabled.
         */
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;

        /*
         * These fields let us detect when two contexts have both
         * been cloned (inherited) from a common ancestor.
         */
        struct perf_event_context        *parent_ctx;
        u64                                parent_gen;
        u64                                generation;
        int                                pin_count;
#ifdef CONFIG_CGROUP_PERF
        int                                nr_cgroups;         /* number of cgroup events */
#endif
        void                                *task_ctx_data; /* pmu specific data */
        struct rcu_head                        rcu_head;
};

/*
 * Number of contexts where an event can trigger, one per execution level:
 *        task, softirq, hardirq, nmi.
 */
#define PERF_NR_CONTEXTS        4

/**
 * struct perf_cpu_context - per cpu event context structure
 */
struct perf_cpu_context {
        struct perf_event_context        ctx;
        struct perf_event_context        *task_ctx;
        int                                active_oncpu;
        int                                exclusive;

        raw_spinlock_t                        hrtimer_lock;
        struct hrtimer                        hrtimer;
        ktime_t                                hrtimer_interval;
        unsigned int                        hrtimer_active;

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp;
        struct list_head                cgrp_cpuctx_entry;
#endif

        struct list_head                sched_cb_entry;
        int                                sched_cb_usage;

        int                                online;
        /*
         * Per-CPU storage for iterators used in visit_groups_merge. The default
         * storage is of size 2 to hold the CPU and any CPU event iterators.
         */
        int                                heap_size;
        struct perf_event                **heap;
        struct perf_event                *heap_default[2];
};

/*
 * Handle passed to the perf_output_*() and perf_aux_output_*() functions
 * declared in this header. It ties together the event being written and
 * the buffer it is written to.
 *
 * @addr and @head share storage (anonymous union), so only one of the two
 * views is meaningful at a time.
 * NOTE(review): which view applies to regular vs. AUX output is not visible
 * from this header -- confirm in the ring buffer code.
 */
struct perf_output_handle {
        struct perf_event                *event;
        struct perf_buffer                *rb;
        unsigned long                        wakeup;
        unsigned long                        size;
        u64                                aux_flags;
        union {
                void                        *addr;
                unsigned long                head;
        };
        int                                page;
};

/*
 * Bundle of three pointers: a register set, the sample data and the event.
 * NOTE(review): presumably the kernel-side context handed to BPF programs
 * attached to perf events -- confirm against the BPF perf event code.
 */
struct bpf_perf_event_data_kern {
        bpf_user_pt_regs_t *regs;
        struct perf_sample_data *data;
        struct perf_event *event;
};

#ifdef CONFIG_CGROUP_PERF

/*
 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 * This is a per-cpu dynamically allocated data structure; struct
 * perf_cgroup refers to it through its __percpu @info pointer.
 */
struct perf_cgroup_info {
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;
        int                                active;
};

/*
 * perf's cgroup state: the embedded cgroup_subsys_state plus a per-cpu
 * pointer to the perf_cgroup_info time bookkeeping.
 * perf_cgroup_from_task() recovers this structure from the embedded @css
 * with container_of().
 */
struct perf_cgroup {
        struct cgroup_subsys_state        css;
        struct perf_cgroup_info        __percpu *info;
};

/*
 * Return the perf_cgroup that @task currently belongs to.
 *
 * The cgroup must already be pinned (css_get) by the caller. Put
 * differently, this may only be called while the current CPU context has
 * at least one cgroup event.
 *
 * With a non-NULL @ctx the lookup is checked against ctx->lock being held;
 * with a NULL @ctx the check condition is simply true.
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
{
        struct cgroup_subsys_state *task_css;

        task_css = task_css_check(task, perf_event_cgrp_id,
                                  ctx ? lockdep_is_held(&ctx->lock) : true);

        return container_of(task_css, struct perf_cgroup, css);
}
#endif /* CONFIG_CGROUP_PERF */

#ifdef CONFIG_PERF_EVENTS

/*
 * AUX output interface; every function works on a caller-provided
 * struct perf_output_handle. Declarations only.
 */
extern void *perf_aux_output_begin(struct perf_output_handle *handle,
                                   struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
                                unsigned long size);
extern int perf_aux_output_skip(struct perf_output_handle *handle,
                                unsigned long size);
extern void *perf_get_aux(struct perf_output_handle *handle);
extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
extern void perf_event_itrace_started(struct perf_event *event);

/* PMU registration and removal. */
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);

/*
 * Core entry points (declarations only).
 *
 * __perf_event_task_sched_in()/__perf_event_task_sched_out() are the
 * out-of-line halves of the perf_event_task_sched_in()/_out() inline
 * wrappers further down, which call them only when the perf_sched_events
 * static branch is enabled.
 */
extern int perf_num_counters(void);
extern const char *perf_pmu_name(void);
extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
extern void __perf_event_task_sched_out(struct task_struct *prev,
                                        struct task_struct *next);
extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event *perf_get_event(struct file *file);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
extern void perf_sched_cb_dec(struct pmu *pmu);
extern void perf_sched_cb_inc(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);

extern void perf_pmu_resched(struct pmu *pmu);

extern int perf_event_refresh(struct perf_event *event, int refresh);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
/*
 * In-kernel counter creation: @callback has the perf_overflow_handler_t
 * signature and @context is an opaque pointer supplied alongside it.
 */
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                int cpu,
                                struct task_struct *task,
                                perf_overflow_handler_t callback,
                                void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
                                int src_cpu, int dst_cpu);
/*
 * Readers: results are returned through the @value/@enabled/@running
 * pointers (perf_event_read_local) or as the return value plus
 * @enabled/@running (perf_event_read_value).
 */
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running);
extern u64 perf_event_read_value(struct perf_event *event,
                                 u64 *enabled, u64 *running);


/*
 * Data collected for one sample.
 *
 * The layout is split in two parts on purpose: the leading members are the
 * ones perf_sample_data_init() writes (addr, raw, br_stack, period, weight,
 * txn, data_src), kept adjacent to limit the cachelines touched; everything
 * after that is filled in on demand by perf_{prepare,output}_sample().
 * The structure is cacheline aligned.
 */
struct perf_sample_data {
        /*
         * Fields set by perf_sample_data_init(), group so as to
         * minimize the cachelines touched.
         */
        u64                                addr;
        struct perf_raw_record                *raw;
        struct perf_branch_stack        *br_stack;
        u64                                period;
        u64                                weight;
        u64                                txn;
        union  perf_mem_data_src        data_src;

        /*
         * The other fields, optionally {set,used} by
         * perf_{prepare,output}_sample().
         */
        u64                                type;
        u64                                ip;
        struct {
                u32        pid;
                u32        tid;
        }                                tid_entry;
        u64                                time;
        u64                                id;
        u64                                stream_id;
        struct {
                u32        cpu;
                u32        reserved;
        }                                cpu_entry;
        struct perf_callchain_entry        *callchain;
        u64                                aux_size;

        struct perf_regs                regs_user;
        struct perf_regs                regs_intr;
        u64                                stack_user_size;

        u64                                phys_addr;
        u64                                cgroup;
} ____cacheline_aligned;

/*
 * Default value for data source: every component (OP, LVL, SNOOP, LOCK,
 * TLB) is set to its "NA" encoding. perf_sample_data_init() stores this in
 * data_src.val.
 */
#define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
                    PERF_MEM_S(LVL, NA)   |\
                    PERF_MEM_S(SNOOP, NA) |\
                    PERF_MEM_S(LOCK, NA)  |\
                    PERF_MEM_S(TLB, NA))

/*
 * Reset the leading members of @data before a new sample is taken.
 *
 * @addr and @period are stored as given; raw and br_stack are cleared,
 * weight and txn are zeroed and data_src is set to PERF_MEM_NA.
 * All other members are left untouched here; they are initialized in
 * perf_prepare_sample().
 */
static inline void perf_sample_data_init(struct perf_sample_data *data,
                                         u64 addr, u64 period)
{
        /* caller-supplied values */
        data->addr = addr;
        data->period = period;

        /* nothing attached yet */
        data->raw = NULL;
        data->br_stack = NULL;

        /* neutral defaults */
        data->weight = 0;
        data->txn = 0;
        data->data_src.val = PERF_MEM_NA;
}

/* Sample preparation and emission (declarations only). */
extern void perf_output_sample(struct perf_output_handle *handle,
                               struct perf_event_header *header,
                               struct perf_sample_data *data,
                               struct perf_event *event);
extern void perf_prepare_sample(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event,
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs);

/*
 * perf_event_output_forward() and perf_event_output_backward() are the two
 * handlers that __is_default_overflow_handler() recognises as "default".
 */
extern void perf_event_output_forward(struct perf_event *event,
                                     struct perf_sample_data *data,
                                     struct pt_regs *regs);
extern void perf_event_output_backward(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs);
extern int perf_event_output(struct perf_event *event,
                             struct perf_sample_data *data,
                             struct pt_regs *regs);

/*
 * Return true when @overflow_handler is one of the two default output
 * handlers: perf_event_output_forward (hinted as the likely case) or
 * perf_event_output_backward (hinted as unlikely).
 */
static inline bool
__is_default_overflow_handler(perf_overflow_handler_t overflow_handler)
{
        return likely(overflow_handler == perf_event_output_forward) ||
               unlikely(overflow_handler == perf_event_output_backward);
}

/*
 * True when @event's current ->overflow_handler is one of the default
 * output handlers.
 */
#define is_default_overflow_handler(event) \
        __is_default_overflow_handler((event)->overflow_handler)

#ifdef CONFIG_BPF_SYSCALL
/*
 * As is_default_overflow_handler(), but an event also qualifies when its
 * ->orig_overflow_handler is a default handler. The first test is hinted
 * as the likely one; the second is evaluated only if the first fails.
 */
static inline bool uses_default_overflow_handler(struct perf_event *event)
{
        return likely(is_default_overflow_handler(event)) ||
               __is_default_overflow_handler(event->orig_overflow_handler);
}
#else
/* Without CONFIG_BPF_SYSCALL there is no ->orig_overflow_handler to consult. */
#define uses_default_overflow_handler(event) \
        is_default_overflow_handler(event)
#endif

/* Sample-id helpers and lost-sample logging (declarations only). */
extern void
perf_event_header__init_id(struct perf_event_header *header,
                           struct perf_sample_data *data,
                           struct perf_event *event);
extern void
perf_event__output_id_sample(struct perf_event *event,
                             struct perf_output_handle *handle,
                             struct perf_sample_data *sample);

/* @lost is the number of lost samples to log for @event. */
extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);

static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        return attr->exclude_idle || attr->exclude_user ||
               attr->exclude_kernel || attr->exclude_hv ||
               attr->exclude_guest || attr->exclude_host;
}

/* An event samples when its attr.sample_period is non-zero. */
static inline bool is_sampling_event(struct perf_event *event)
{
        return !!event->attr.sample_period;
}

/*
 * Return 1 for a software event, 0 for a hardware event.
 *
 * The result is the PERF_EV_CAP_SOFTWARE bit of event->event_caps; that
 * flag is BIT(0), so the masked value is exactly 0 or 1.
 */
static inline int is_software_event(struct perf_event *event)
{
        const int caps = event->event_caps;

        return caps & PERF_EV_CAP_SOFTWARE;
}

/*
 * Return 1 for event in sw context, 0 for event in hw context
 */
static inline int in_software_context(struct perf_event *event)
{
        return event->ctx->pmu->task_ctx_nr == perf_sw_context;
}

/*
 * Test whether @pmu advertises PERF_PMU_CAP_EXCLUSIVE.
 *
 * Returns the masked capability bit itself (zero or non-zero), not a
 * normalized 0/1, so callers should only test it for truth.
 */
static inline int is_exclusive_pmu(struct pmu *pmu)
{
        return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;
}

/* One static key per software event id; tested by perf_sw_event() et al. */
extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

/*
 * Out-of-line software event entry points. The triple-underscore variant is
 * the one the scheduler-side helpers below call directly.
 */
extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);

/*
 * Fallback for architectures that do not provide
 * perf_arch_fetch_caller_regs: an empty stub that leaves @regs unmodified.
 */
#ifndef perf_arch_fetch_caller_regs
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
#endif

/*
 * When generating a perf sample in-line, instead of from an interrupt /
 * exception, we lack a pt_regs. This is typically used from software events
 * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
 *
 * We typically don't need a full set, but (for x86) do require:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - sp for PERF_SAMPLE_CALLCHAIN
 * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
 *
 * NOTE: assumes @regs is otherwise already 0 filled; this is important for
 * things like PERF_SAMPLE_REGS_INTR.
 *
 * Implementation: forwards to perf_arch_fetch_caller_regs() with
 * CALLER_ADDR0 as the instruction pointer argument.
 */
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
        perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

/*
 * Report a software event with caller-supplied @regs.
 *
 * Does nothing unless the static key for @event_id is enabled; in that case
 * all four arguments are forwarded unchanged to __perf_sw_event().
 * @event_id indexes perf_swevent_enabled[] directly with no range check,
 * so it must be below PERF_COUNT_SW_MAX.
 */
static __always_inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        if (static_key_false(&perf_swevent_enabled[event_id]))
                __perf_sw_event(event_id, nr, regs, addr);
}

/*
 * Per-CPU scratch pt_regs, four per CPU. The helpers in this header only
 * ever use slot 0.
 * NOTE(review): the 4 presumably mirrors PERF_NR_CONTEXTS -- confirm.
 */
DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * 'Special' version for the scheduler, it hard assumes no recursion,
 * which is guaranteed by us not actually scheduling inside other swevents
 * because those disable preemption.
 *
 * When the static key for @event_id is enabled, this fills this CPU's
 * __perf_regs[0] via perf_fetch_caller_regs() and calls ___perf_sw_event()
 * with it; otherwise it is a no-op.
 */
static __always_inline void
perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
        if (static_key_false(&perf_swevent_enabled[event_id])) {
                struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

                perf_fetch_caller_regs(regs);
                ___perf_sw_event(event_id, nr, regs, addr);
        }
}

/* Static branch guarding the __perf_event_task_sched_in()/_out() calls. */
extern struct static_key_false perf_sched_events;

/*
 * Return true when the static key for PERF_COUNT_SW_CPU_MIGRATIONS is
 * enabled, false otherwise.
 */
static __always_inline bool
perf_sw_migrate_enabled(void)
{
        if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
                return true;
        return false;
}

/*
 * Note that @task has migrated: sets task->sched_migrated, but only while
 * the CPU-migrations software event is enabled. The flag is consumed (and
 * cleared) by perf_event_task_sched_in().
 */
static inline void perf_event_task_migrate(struct task_struct *task)
{
        if (perf_sw_migrate_enabled())
                task->sched_migrated = 1;
}

/*
 * Scheduler hook for @task being switched in after @prev.
 *
 * 1) If the perf_sched_events static branch is enabled, call the
 *    out-of-line __perf_event_task_sched_in().
 * 2) If the CPU-migrations software event is enabled and @task carries a
 *    pending sched_migrated mark, emit one PERF_COUNT_SW_CPU_MIGRATIONS
 *    event (count 1, addr 0) using this CPU's __perf_regs[0], then clear
 *    the mark so the migration is reported only once.
 */
static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
{
        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_in(prev, task);

        if (perf_sw_migrate_enabled() && task->sched_migrated) {
                struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

                perf_fetch_caller_regs(regs);
                ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
                task->sched_migrated = 0;
        }
}

/*
 * Scheduler hook for @prev being switched out in favour of @next.
 *
 * Always offers one PERF_COUNT_SW_CONTEXT_SWITCHES event (count 1, addr 0)
 * through perf_sw_event_sched(), which itself is a no-op while that event's
 * static key is disabled; then, if the perf_sched_events static branch is
 * enabled, calls the out-of-line __perf_event_task_sched_out().
 */
static inline void perf_event_task_sched_out(struct task_struct *prev,
                                             struct task_struct *next)
{
        perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_out(prev, next);
}

/* Side-band notifications: mmap, kernel symbol and BPF program events. */
extern void perf_event_mmap(struct vm_area_struct *vma);

extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                               bool unregister, const char *sym);
extern void perf_event_bpf_event(struct bpf_prog *prog,
                                 enum perf_bpf_event_type type,
                                 u16 flags);

/* RCU-protected pointer to the currently registered guest callbacks. */
extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
/*
 * Fetch the current guest callbacks pointer with rcu_dereference().
 * The result may be NULL-checked by the caller (see the comment below).
 * NOTE(review): callers presumably must be inside an RCU read-side
 * critical section, as rcu_dereference() normally requires -- confirm.
 */
static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void)
{
        /*
         * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading
         * the callbacks between a !NULL check and dereferences, to ensure
         * pending stores/changes to the callback pointers are visible before a
         * non-NULL perf_guest_cbs is visible to readers, and to prevent a
         * module from unloading callbacks while readers are active.
         */
        return rcu_dereference(perf_guest_cbs);
}
extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);

/* Task life-cycle and text-modification notifications (declarations only). */
extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
/* Takes both the old and the new bytes at @addr, each with its own length. */
extern void perf_event_text_poke(const void *addr,
                                 const void *old_bytes, size_t old_len,
                                 const void *new_bytes, size_t new_len);

/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);

/* User and kernel stack walkers; both append into @entry. */
extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
                   u32 max_stack, bool crosstask, bool add_mark);
extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
/* Paired get/put interfaces for callchain buffers and entries. */
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
extern void put_callchain_entry(int rctx);

/*
 * Limits: sysctl_perf_event_max_contexts_per_stack bounds the entries
 * accepted by perf_callchain_store_context().
 */
extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;

/*
 * Append the context marker @ip to the callchain being built in @ctx.
 *
 * Returns 0 on success. Once ctx->contexts has reached
 * sysctl_perf_event_max_contexts_per_stack nothing is stored,
 * ctx->contexts_maxed is set and -1 is returned, telling the caller to
 * stop walking the stack.
 */
static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *chain;

        if (!(ctx->contexts < sysctl_perf_event_max_contexts_per_stack)) {
                /* no more room, stop walking the stack */
                ctx->contexts_maxed = true;
                return -1;
        }

        chain = ctx->entry;
        chain->ip[chain->nr] = ip;
        chain->nr++;
        ctx->contexts++;

        return 0;
}

/*
 * Append the address @ip to the callchain being built in @ctx.
 *
 * Returns 0 on success. Returns -1 without storing anything when ctx->nr
 * has reached ctx->max_stack or when ctx->contexts_maxed is already set;
 * the caller should then stop walking the stack.
 */
static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *chain;

        if (!(ctx->nr < ctx->max_stack) || ctx->contexts_maxed)
                return -1; /* no more room, stop walking the stack */

        chain = ctx->entry;
        chain->ip[chain->nr] = ip;
        chain->nr++;
        ctx->nr++;

        return 0;
}

/*
 * Tunables. sysctl_perf_event_paranoid is the level compared against by
 * perf_is_paranoid() and the perf_allow_*() helpers below.
 */
extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_mlock;
extern int sysctl_perf_event_sample_rate;
extern int sysctl_perf_cpu_time_max_percent;

/* @sample_len_ns: duration of one sample, in nanoseconds (per its name). */
extern void perf_sample_event_took(u64 sample_len_ns);

/* sysctl handlers; all share the ctl_table handler signature. */
int perf_proc_update_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int perf_event_max_stack_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);

/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN                0

/*
 * Finer grained perf_event_open(2) access control. These are the type
 * values the perf_allow_*() helpers pass to security_perf_event_open().
 */
#define PERF_SECURITY_CPU                1
#define PERF_SECURITY_KERNEL                2
#define PERF_SECURITY_TRACEPOINT        3

/*
 * Return non-zero when sysctl_perf_event_paranoid is 0 or higher, i.e. for
 * every setting except negative ones.
 */
static inline int perf_is_paranoid(void)
{
        return sysctl_perf_event_paranoid >= 0;
}

/*
 * Permission check for PERF_SECURITY_KERNEL access.
 *
 * With a paranoid level above 1, callers lacking perfmon_capable() are
 * rejected with -EACCES. Everyone else is subject to
 * security_perf_event_open(), whose result is returned as is.
 */
static inline int perf_allow_kernel(struct perf_event_attr *attr)
{
        const bool restricted = sysctl_perf_event_paranoid > 1;

        /* perfmon_capable() is only consulted when the level restricts access */
        if (restricted && !perfmon_capable())
                return -EACCES;

        return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
}

/*
 * Permission check for PERF_SECURITY_CPU access.
 *
 * With a paranoid level above 0, callers lacking perfmon_capable() are
 * rejected with -EACCES. Everyone else is subject to
 * security_perf_event_open(), whose result is returned as is.
 */
static inline int perf_allow_cpu(struct perf_event_attr *attr)
{
        const bool restricted = sysctl_perf_event_paranoid > 0;

        /* perfmon_capable() is only consulted when the level restricts access */
        if (restricted && !perfmon_capable())
                return -EACCES;

        return security_perf_event_open(attr, PERF_SECURITY_CPU);
}

/*
 * Permission check for PERF_SECURITY_TRACEPOINT access.
 *
 * With a paranoid level above -1, callers lacking perfmon_capable() are
 * rejected with -EPERM (note: not -EACCES as in the kernel/cpu variants).
 * Everyone else is subject to security_perf_event_open(), whose result is
 * returned as is.
 */
static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
{
        const bool restricted = sysctl_perf_event_paranoid > -1;

        /* perfmon_capable() is only consulted when the level restricts access */
        if (restricted && !perfmon_capable())
                return -EPERM;

        return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
}

/* Subsystem init plus tracepoint and breakpoint event delivery. */
extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
                          int entry_size, struct pt_regs *regs,
                          struct hlist_head *head, int rctx,
                          struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);

/*
 * Generic fallbacks, used only when the architecture has not defined
 * perf_misc_flags itself: classify a sample as user or kernel via
 * user_mode(), and take the instruction pointer from
 * instruction_pointer(). Both macros are defined together, keyed on
 * perf_misc_flags alone.
 */
#ifndef perf_misc_flags
# define perf_misc_flags(regs) \
                (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
# define perf_instruction_pointer(regs)        instruction_pointer(regs)
#endif
/* Fallback: pass @regs through unchanged. */
#ifndef perf_arch_bpf_user_pt_regs
# define perf_arch_bpf_user_pt_regs(regs) regs
#endif

/* True when PERF_SAMPLE_BRANCH_STACK is set in the event's attr.sample_type. */
static inline bool has_branch_stack(struct perf_event *event)
{
        return (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) != 0;
}

/* True when the event's attr.branch_sample_type has any bit set. */
static inline bool needs_branch_stack(struct perf_event *event)
{
        return !!event->attr.branch_sample_type;
}

/* True when the event's PMU has a setup_aux callback installed. */
static inline bool has_aux(struct perf_event *event)
{
        return !!event->pmu->setup_aux;
}

/* True when the event's attr.write_backward is set. */
static inline bool is_write_backward(struct perf_event *event)
{
        return event->attr.write_backward != 0;
}

/* True when the event's PMU reports a non-zero nr_addr_filters. */
static inline bool has_addr_filter(struct perf_event *event)
{
        return event->pmu->nr_addr_filters != 0;
}

/*
 * Return the address-filter head that applies to @event.
 *
 * An inherited event (one with ->parent set) uses its parent's filters;
 * any other event uses its own.
 */
static inline struct perf_addr_filters_head *
perf_event_addr_filters(struct perf_event *event)
{
        struct perf_event *source = event->parent ? event->parent : event;

        return &source->addr_filters;
}

/*
 * Declarations only; the definitions live outside this header.  Several of
 * these (e.g. perf_event_enable(), perf_event_disable(),
 * perf_swevent_get_recursion_context(), perf_event_period(),
 * perf_event_pause()) have inline stubs in the !CONFIG_PERF_EVENTS branch
 * that follows.
 */
extern void perf_event_addr_filters_sync(struct perf_event *event);

/* Output-handle interface: begin (default/forward/backward), then end. */
extern int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_sample_data *data,
                             struct perf_event *event, unsigned int size);
extern int perf_output_begin_forward(struct perf_output_handle *handle,
                                     struct perf_sample_data *data,
                                     struct perf_event *event,
                                     unsigned int size);
extern int perf_output_begin_backward(struct perf_output_handle *handle,
                                      struct perf_sample_data *data,
                                      struct perf_event *event,
                                      unsigned int size);

extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
                             const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
                                     unsigned int len);
extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
                                 struct perf_output_handle *handle,
                                 unsigned long from, unsigned long to);
/* Software-event recursion context and event control. */
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
extern int perf_event_period(struct perf_event *event, u64 value);
extern u64 perf_event_pause(struct perf_event *event, bool reset);
#else /* !CONFIG_PERF_EVENTS: */
/*
 * Stubs for builds without CONFIG_PERF_EVENTS.
 *
 * Every function below is an empty inline or returns a fixed value
 * (NULL, 0, -1, -EINVAL or ERR_PTR(-EINVAL)), so callers compile
 * unchanged when perf is configured out.
 */

/* AUX output: no buffer is ever handed out. */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
                      struct perf_event *event)                                { return NULL; }
static inline void
perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
                                                                        { }
static inline int
perf_aux_output_skip(struct perf_output_handle *handle,
                     unsigned long size)                                { return -EINVAL; }
static inline void *
perf_get_aux(struct perf_output_handle *handle)                                { return NULL; }
/* Task lifecycle and scheduling hooks: nothing to do. */
static inline void
perf_event_task_migrate(struct task_struct *task)                        { }
static inline void
perf_event_task_sched_in(struct task_struct *prev,
                         struct task_struct *task)                        { }
static inline void
perf_event_task_sched_out(struct task_struct *prev,
                          struct task_struct *next)                        { }
static inline int perf_event_init_task(struct task_struct *child)        { return 0; }
static inline void perf_event_exit_task(struct task_struct *child)        { }
static inline void perf_event_free_task(struct task_struct *task)        { }
static inline void perf_event_delayed_put(struct task_struct *task)        { }
/* Lookups and reads fail with -EINVAL (as an ERR_PTR where a pointer is returned). */
static inline struct file *perf_event_get(unsigned int fd)        { return ERR_PTR(-EINVAL); }
static inline const struct perf_event *perf_get_event(struct file *file)
{
        return ERR_PTR(-EINVAL);
}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        return ERR_PTR(-EINVAL);
}
static inline int perf_event_read_local(struct perf_event *event, u64 *value,
                                        u64 *enabled, u64 *running)
{
        return -EINVAL;
}
static inline void perf_event_print_debug(void)                                { }
static inline int perf_event_task_disable(void)                                { return -EINVAL; }
static inline int perf_event_task_enable(void)                                { return -EINVAL; }
static inline int perf_event_refresh(struct perf_event *event, int refresh)
{
        return -EINVAL;
}

/* Software and breakpoint event reporting: dropped. */
static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)        { }
static inline void
perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)                        { }
static inline void
perf_bp_event(struct perf_event *event, void *data)                        { }

/* Guest-info callback (un)registration reports success without doing anything. */
static inline int perf_register_guest_info_callbacks
(struct perf_guest_info_callbacks *callbacks)                                { return 0; }
static inline int perf_unregister_guest_info_callbacks
(struct perf_guest_info_callbacks *callbacks)                                { return 0; }

static inline void perf_event_mmap(struct vm_area_struct *vma)                { }

/* Side-band event notifications: nothing to do. */
typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                                      bool unregister, const char *sym)        { }
static inline void perf_event_bpf_event(struct bpf_prog *prog,
                                        enum perf_bpf_event_type type,
                                        u16 flags)                        { }
static inline void perf_event_exec(void)                                { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec)        { }
static inline void perf_event_namespaces(struct task_struct *tsk)        { }
static inline void perf_event_fork(struct task_struct *tsk)                { }
static inline void perf_event_text_poke(const void *addr,
                                        const void *old_bytes,
                                        size_t old_len,
                                        const void *new_bytes,
                                        size_t new_len)                        { }
/* Core init, recursion context and event control. */
static inline void perf_event_init(void)                                { }
static inline int  perf_swevent_get_recursion_context(void)                { return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)                { }
static inline u64 perf_swevent_set_period(struct perf_event *event)        { return 0; }
static inline void perf_event_enable(struct perf_event *event)                { }
static inline void perf_event_disable(struct perf_event *event)                { }
static inline int __perf_event_disable(void *info)                        { return -1; }
static inline void perf_event_task_tick(void)                                { }
static inline int perf_event_release_kernel(struct perf_event *event)        { return 0; }
static inline int perf_event_period(struct perf_event *event, u64 value)
{
        return -EINVAL;
}
static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{
        return 0;
}
#endif

/*
 * perf_restore_debug_store() is a real function only when both
 * CONFIG_PERF_EVENTS and CONFIG_CPU_SUP_INTEL are enabled; in every other
 * configuration it is an empty inline.
 */
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern void perf_restore_debug_store(void);
#else
static inline void perf_restore_debug_store(void)                        { }
#endif

/*
 * Return true when @frag->pad is smaller than sizeof(u64), false otherwise.
 * NOTE(review): per the function name this presumably identifies the final
 * fragment of a chain -- confirm against struct perf_raw_frag.
 */
static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
{
        if (frag->pad < sizeof(u64))
                return true;

        return false;
}

/*
 * Copy the object @x into the output handle via perf_output_copy().
 * @x has its address taken and is passed to sizeof, so it must be an
 * lvalue; the macro evaluates to perf_output_copy()'s return value.
 */
#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))

/*
 * Device attribute describing a PMU event.  Instances are created by the
 * PMU_EVENT_ATTR() and PMU_EVENT_ATTR_STRING() macros below.
 */
struct perf_pmu_events_attr {
        struct device_attribute attr;	/* embedded sysfs device attribute */
        u64 id;				/* set from _id by PMU_EVENT_ATTR(); 0 for string attrs */
        const char *event_str;		/* set from _str by PMU_EVENT_ATTR_STRING() */
};

/*
 * Variant of struct perf_pmu_events_attr that carries two event strings.
 * NOTE(review): going by the field names these are presumably the strings
 * used with and without hyper-threading -- confirm against the users.
 */
struct perf_pmu_events_ht_attr {
        struct device_attribute                        attr;
        u64                                        id;
        const char                                *event_str_ht;
        const char                                *event_str_noht;
};

/*
 * sysfs show callback, defined outside this header.  PMU_EVENT_ATTR_STRING()
 * installs it as the show method of the attributes it creates.
 */
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page);

/*
 * Define a static struct perf_pmu_events_attr named @_var.
 *
 * @_name: attribute name handed to __ATTR()
 * @_var:  identifier of the variable being defined
 * @_id:   value stored in ->id
 * @_show: show callback; the attribute mode is 0444 and there is no store
 *         callback
 *
 * The expansion already ends in ';'.
 */
#define PMU_EVENT_ATTR(_name, _var, _id, _show)                                \
static struct perf_pmu_events_attr _var = {                                \
        .attr = __ATTR(_name, 0444, _show, NULL),                        \
        .id   =  _id,                                                        \
};

/*
 * Define a static struct perf_pmu_events_attr named @_var whose event is
 * given as a string.
 *
 * @_name: attribute name handed to __ATTR()
 * @_var:  identifier of the variable being defined
 * @_str:  value stored in ->event_str
 *
 * ->id is 0, the show callback is perf_event_sysfs_show() and the mode is
 * 0444 with no store callback.  The expansion already ends in ';'.
 */
#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                            \
static struct perf_pmu_events_attr _var = {                                    \
        .attr                = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
        .id                = 0,                                                    \
        .event_str        = _str,                                                    \
};

/*
 * Define a read-only device attribute format_attr_<_name> together with its
 * show function <_name>_show(), which prints @_format followed by a newline
 * into @page.
 *
 * @_format is pasted next to "\n" and used as the sprintf() format string,
 * so it must be a string literal and any '%' in it would be interpreted as a
 * conversion.  The build fails if sizeof(_format) is not below PAGE_SIZE.
 *
 * The expansion does not end in ';' -- the user supplies it.
 */
#define PMU_FORMAT_ATTR(_name, _format)                                        \
static ssize_t                                                                \
_name##_show(struct device *dev,                                        \
                               struct device_attribute *attr,                \
                               char *page)                                \
{                                                                        \
        BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                        \
        return sprintf(page, _format "\n");                                \
}                                                                        \
                                                                        \
static struct device_attribute format_attr_##_name = __ATTR_RO(_name)

/*
 * Performance counter hotplug functions.
 *
 * Without CONFIG_PERF_EVENTS both names expand to NULL, so they can still be
 * used where a callback pointer is expected but must not be called directly.
 */
#ifdef CONFIG_PERF_EVENTS
int perf_event_init_cpu(unsigned int cpu);
int perf_event_exit_cpu(unsigned int cpu);
#else
#define perf_event_init_cpu        NULL
#define perf_event_exit_cpu        NULL
#endif

/*
 * Declared __weak, so a strong definition elsewhere overrides the default.
 * NOTE(review): the name suggests an architecture hook that fills in @userpg
 * for @event at time @now -- confirm against the definition.
 */
extern void __weak arch_perf_update_userpage(struct perf_event *event,
                                             struct perf_event_mmap_page *userpg,
                                             u64 now);

#endif /* _LINUX_PERF_EVENT_H */


































































































































































































































































































































































































































































































































































































































































































































































































    1 











    1 












    1 
    1 

    1 





    1 



    1 




    1 

    1 

    1 



    1 


    1 
    1 
    1 
    1 



    1 
































    1 

































































    1 

































































    1 


    1 


















    1 



    1 



    1 




    1 

    1 








    1 





    1 












    1 
    1 









    1 


    1 

    1 

    1 







    1 

    1 
    1 

    1 

    1 

    1 

    1 

    1 

    1 






    1 
    1 




























































































































































































































































    1 

    1 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/open.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/module.h>
#include <linux/tty.h>
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/rcupdate.h>
#include <linux/audit.h>
#include <linux/falloc.h>
#include <linux/fs_struct.h>
#include <linux/ima.h>
#include <linux/dnotify.h>
#include <linux/compat.h>

#include "internal.h"

/*
 * Set the size of @dentry's inode to @length through notify_change().
 *
 * @dentry:     dentry of the file to resize
 * @length:     new size; negative values are rejected with -EINVAL
 * @time_attrs: extra ATTR_* bits OR-ed into the request next to ATTR_SIZE
 * @filp:       optional open file; when non-NULL it is passed along and
 *              ATTR_FILE is set
 *
 * Returns -EINVAL for a negative length, a negative value reported by
 * dentry_needs_remove_privs(), or otherwise the result of notify_change().
 * The inode lock is taken around the notify_change() call.
 */
int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
        struct file *filp)
{
        struct iattr attrs;
        int killpriv;
        int err;

        /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
        if (length < 0)
                return -EINVAL;

        attrs.ia_size = length;
        attrs.ia_valid = ATTR_SIZE | time_attrs;
        if (filp) {
                attrs.ia_file = filp;
                attrs.ia_valid |= ATTR_FILE;
        }

        /* Truncation also strips suid, sgid and file capabilities. */
        killpriv = dentry_needs_remove_privs(dentry);
        if (killpriv < 0)
                return killpriv;
        if (killpriv)
                attrs.ia_valid |= killpriv | ATTR_FORCE;

        inode_lock(dentry->d_inode);
        /* Note any delegations or leases have already been broken: */
        err = notify_change(dentry, &attrs, NULL);
        inode_unlock(dentry->d_inode);

        return err;
}

/*
 * Truncate the regular file at @path to @length bytes.
 *
 * Returns -EISDIR for a directory, -EINVAL for any other non-regular file,
 * -EPERM for an append-only inode, or the first error reported by
 * mnt_want_write(), inode_permission(), get_write_access(), break_lease(),
 * locks_verify_truncate(), security_path_truncate() or do_truncate().
 * Everything acquired along the way is released in reverse order before
 * returning.
 */
long vfs_truncate(const struct path *path, loff_t length)
{
        struct inode *inode = path->dentry->d_inode;
        long error;

        /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
        if (S_ISDIR(inode->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;

        error = mnt_want_write(path->mnt);
        if (error)
                return error;

        error = inode_permission(inode, MAY_WRITE);
        if (error)
                goto drop_mnt_write;

        if (IS_APPEND(inode)) {
                error = -EPERM;
                goto drop_mnt_write;
        }

        error = get_write_access(inode);
        if (error)
                goto drop_mnt_write;

        /*
         * Make sure that there are no leases.  get_write_access() protects
         * against the truncate racing with a lease-granting setlease().
         */
        error = break_lease(inode, O_WRONLY);
        if (error)
                goto drop_write_access;

        error = locks_verify_truncate(inode, NULL, length);
        if (error)
                goto drop_write_access;

        error = security_path_truncate(path);
        if (error)
                goto drop_write_access;

        error = do_truncate(path->dentry, length, 0, NULL);

drop_write_access:
        put_write_access(inode);
drop_mnt_write:
        mnt_drop_write(path->mnt);
        return error;
}
EXPORT_SYMBOL_GPL(vfs_truncate);

/*
 * Common implementation of the path-based truncate syscalls.
 *
 * Resolves @pathname relative to the current working directory (following
 * symlinks) and truncates it with vfs_truncate().  When retry_estale() asks
 * for it, the lookup is repeated with LOOKUP_REVAL added.
 *
 * Returns -EINVAL for a negative @length, otherwise the lookup error or the
 * result of vfs_truncate().
 */
long do_sys_truncate(const char __user *pathname, loff_t length)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct path path;
        int error;

        if (length < 0)        /* sorry, but loff_t says... */
                return -EINVAL;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (!error) {
                        error = vfs_truncate(&path, length);
                        path_put(&path);
                }

                if (!retry_estale(error, lookup_flags))
                        break;

                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/*
 * truncate(2): forwards to do_sys_truncate().  @length is a signed long, so
 * a negative value stays negative and is rejected there with -EINVAL.
 */
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
        return do_sys_truncate(path, length);
}

#ifdef CONFIG_COMPAT
/*
 * Compat truncate(2): @length is the signed compat_off_t, so it is
 * sign-extended when passed to do_sys_truncate() as loff_t.
 */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
        return do_sys_truncate(path, length);
}
#endif

/*
 * Common implementation of the descriptor-based truncate syscalls.
 *
 * @fd:     open file descriptor
 * @length: new size
 * @small:  non-zero when the caller used a non-LFS entry point; cleared here
 *          if the file was opened with O_LARGEFILE
 *
 * Returns -EINVAL for a negative length, for a file that is not regular or
 * not open for writing, and for a "small" request above MAX_NON_LFS;
 * -EBADF for a bad descriptor; -EPERM for an append-only inode; otherwise
 * the first error from locks_verify_truncate(), security_path_truncate() or
 * do_truncate().
 */
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
        struct dentry *dentry;
        struct inode *inode;
        struct fd f;
        int error;

        if (length < 0)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        /* explicitly opened as large or we are on 64-bit box */
        if (f.file->f_flags & O_LARGEFILE)
                small = 0;

        dentry = f.file->f_path.dentry;
        inode = dentry->d_inode;

        if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) {
                error = -EINVAL;
                goto out_putf;
        }

        /* Cannot ftruncate over 2^31 bytes without large file support */
        if (small && length > MAX_NON_LFS) {
                error = -EINVAL;
                goto out_putf;
        }

        /* Check IS_APPEND on real upper inode */
        if (IS_APPEND(file_inode(f.file))) {
                error = -EPERM;
                goto out_putf;
        }

        sb_start_write(inode->i_sb);
        error = locks_verify_truncate(inode, f.file, length);
        if (!error)
                error = security_path_truncate(&f.file->f_path);
        if (!error)
                error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
        sb_end_write(inode->i_sb);

out_putf:
        fdput(f);
        return error;
}

/*
 * ftruncate(2): forwards to do_sys_ftruncate() with small == 1, so lengths
 * above MAX_NON_LFS are refused unless the file was opened with O_LARGEFILE.
 */
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
{
        return do_sys_ftruncate(fd, length, 1);
}

#ifdef CONFIG_COMPAT
/*
 * Compat ftruncate(2): forwards to do_sys_ftruncate() with small == 1.
 *
 * @length arrives as an unsigned 32-bit value.  Passed on unchanged it would
 * be zero-extended into loff_t, so a negative length from a 32-bit task
 * would look like a size between 2 GiB and 4 GiB, slip past the
 * "length < 0" check and, on a descriptor opened with O_LARGEFILE (which
 * clears "small"), actually truncate the file to that size.  Converting to
 * the signed compat_off_t first sign-extends the value so that negative
 * lengths are rejected with -EINVAL, matching the compat truncate() entry
 * point.
 */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
{
        return do_sys_ftruncate(fd, (compat_off_t)length, 1);
}
#endif

/*
 * LFS versions of truncate are only needed on 32 bit machines.
 *
 * Both take a full loff_t length.  ftruncate64() passes small == 0, so the
 * MAX_NON_LFS limit in do_sys_ftruncate() does not apply.
 */
#if BITS_PER_LONG == 32
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
        return do_sys_truncate(path, length);
}

SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
        return do_sys_ftruncate(fd, length, 0);
}
#endif /* BITS_PER_LONG == 32 */


/*
 * Validate a fallocate request on @file and hand it to the filesystem.
 *
 * @file:   open file to operate on; must be open for writing
 * @mode:   FALLOC_FL_* flags
 * @offset: start of the range; must not be negative
 * @len:    length of the range; must be positive
 *
 * The checks below run in a fixed order, and that order decides which error
 * a request failing several of them gets: -EINVAL, -EOPNOTSUPP, -EBADF,
 * -EPERM, -ETXTBSY, the security_file_permission() result, -ESPIPE,
 * -EISDIR, -ENODEV or -EFBIG.  If all pass, ->fallocate() is called between
 * file_start_write() and file_end_write() and its result is returned.
 */
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        long ret;

        if (offset < 0 || len <= 0)
                return -EINVAL;

        /* Return error if mode is not supported */
        if (mode & ~FALLOC_FL_SUPPORTED_MASK)
                return -EOPNOTSUPP;

        /* Punch hole and zero range are mutually exclusive */
        if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
            (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;

        /* Punch hole must have keep size set */
        if ((mode & FALLOC_FL_PUNCH_HOLE) &&
            !(mode & FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;

        /* Collapse range should only be used exclusively. */
        if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
            (mode & ~FALLOC_FL_COLLAPSE_RANGE))
                return -EINVAL;

        /* Insert range should only be used exclusively. */
        if ((mode & FALLOC_FL_INSERT_RANGE) &&
            (mode & ~FALLOC_FL_INSERT_RANGE))
                return -EINVAL;

        /* Unshare range should only be used with allocate mode. */
        if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
            (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
                return -EINVAL;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;

        /*
         * We can only allow pure fallocate on append only files
         */
        if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
                return -EPERM;

        if (IS_IMMUTABLE(inode))
                return -EPERM;

        /*
         * We cannot allow any fallocate operation on an active swapfile
         */
        if (IS_SWAPFILE(inode))
                return -ETXTBSY;

        /*
         * Revalidate the write permissions, in case security policy has
         * changed since the files were opened.
         */
        ret = security_file_permission(file, MAY_WRITE);
        if (ret)
                return ret;

        if (S_ISFIFO(inode->i_mode))
                return -ESPIPE;

        if (S_ISDIR(inode->i_mode))
                return -EISDIR;

        /* Only regular files and block devices get past this point. */
        if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
                return -ENODEV;

        /* Check for wrap through zero too */
        if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
                return -EFBIG;

        if (!file->f_op->fallocate)
                return -EOPNOTSUPP;

        file_start_write(file);
        ret = file->f_op->fallocate(file, mode, offset, len);

        /*
         * Create inotify and fanotify events.
         *
         * To keep the logic simple always create events if fallocate succeeds.
         * This implies that events are even created if the file size remains
         * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
         */
        if (ret == 0)
                fsnotify_modify(file);

        file_end_write(file);
        return ret;
}
EXPORT_SYMBOL_GPL(vfs_fallocate);

/*
 * Descriptor-based front end for vfs_fallocate().
 *
 * Returns -EBADF when @fd does not refer to an open file, otherwise the
 * result of vfs_fallocate().
 */
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = vfs_fallocate(f.file, mode, offset, len);
        fdput(f);

        return error;
}

/* fallocate(2): forwards all four arguments to ksys_fallocate(). */
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
        return ksys_fallocate(fd, mode, offset, len);
}

/*
 * access() needs to use the real uid/gid, not the effective uid/gid.
 * We do this by temporarily clearing all FS-related capabilities and
 * switching the fsuid/fsgid around to the real ones.
 *
 * Returns the credentials that were in effect before the override (as
 * returned by override_creds()), or NULL if prepare_creds() failed.
 */
static const struct cred *access_override_creds(void)
{
        const struct cred *old_cred;
        struct cred *override_cred;

        override_cred = prepare_creds();
        if (!override_cred)
                return NULL;

        /* Filesystem checks are to be made with the real uid/gid. */
        override_cred->fsuid = override_cred->uid;
        override_cred->fsgid = override_cred->gid;

        if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                /* Clear the capabilities if we switch to a non-root user */
                kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
                if (!uid_eq(override_cred->uid, root_uid))
                        cap_clear(override_cred->cap_effective);
                else
                        override_cred->cap_effective =
                                override_cred->cap_permitted;
        }

        /*
         * The new set of credentials can *only* be used in
         * task-synchronous circumstances, and does not need
         * RCU freeing, unless somebody then takes a separate
         * reference to it.
         *
         * NOTE! This is _only_ true because this credential
         * is used purely for override_creds() that installs
         * it as the subjective cred. Other threads will be
         * accessing ->real_cred, not the subjective cred.
         *
         * If somebody _does_ make a copy of this (using the
         * 'get_current_cred()' function), that will clear the
         * non_rcu field, because now that other user may be
         * expecting RCU freeing. But normal thread-synchronous
         * cred accesses will keep things non-RCU.
         */
        override_cred->non_rcu = 1;

        old_cred = override_creds(override_cred);

        /* override_creds() gets its own ref */
        put_cred(override_cred);

        return old_cred;
}

/*
 * Common implementation behind access(2), faccessat(2) and faccessat2(2).
 *
 * Unless AT_EACCESS is set, the check runs under the credentials installed
 * by access_override_creds(); the previous credentials are restored before
 * returning.  Returns 0 if the requested access is allowed, otherwise a
 * negative errno.
 */
static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
        const struct cred *saved_cred = NULL;
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct inode *inode;
        struct path path;
        int res;

        /* where's F_OK, X_OK, W_OK, R_OK? */
        if (mode & ~S_IRWXO)
                return -EINVAL;
        if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        if (flags & AT_SYMLINK_NOFOLLOW)
                lookup_flags &= ~LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        if (!(flags & AT_EACCESS)) {
                saved_cred = access_override_creds();
                if (!saved_cred)
                        return -ENOMEM;
        }

retry:
        res = user_path_at(dfd, filename, lookup_flags, &path);
        if (res)
                goto out;

        inode = d_backing_inode(path.dentry);

        /*
         * MAY_EXEC on regular files is denied if the fs is mounted
         * with the "noexec" flag.
         */
        if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode) &&
            path_noexec(&path)) {
                res = -EACCES;
                goto out_path_release;
        }

        res = inode_permission(inode, mode | MAY_ACCESS);
        if (res)
                goto out_path_release;

        /*
         * SuS v2 requires we report a read only fs too.
         *
         * This is a rare case where using __mnt_is_readonly() is OK
         * without a mnt_want/drop_write() pair: nothing is actually
         * written, so nobody needs to be told about it.  The result is
         * inherently racy; the fs may change state before the caller
         * even sees it.
         */
        if ((mode & S_IWOTH) && !special_file(inode->i_mode) &&
            __mnt_is_readonly(path.mnt))
                res = -EROFS;

out_path_release:
        path_put(&path);
        if (retry_estale(res, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out:
        /* Only non-NULL when the credentials were overridden above */
        if (saved_cred)
                revert_creds(saved_cred);

        return res;
}

SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
{
        /* The three-argument syscall has no flags word */
        const int no_flags = 0;

        return do_faccessat(dfd, filename, mode, no_flags);
}

SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode,
                int, flags)
{
        long ret;

        /* Flag validation happens inside do_faccessat() */
        ret = do_faccessat(dfd, filename, mode, flags);
        return ret;
}

SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
{
        /* Resolved from AT_FDCWD, with no flags */
        const int dfd = AT_FDCWD;

        return do_faccessat(dfd, filename, mode, 0);
}

SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

retry:
        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
        if (error)
                return error;

        /* Switch the working directory only if the permission check passes */
        error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
        if (!error)
                set_fs_pwd(current->fs, &path);

        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
        struct fd f = fdget_raw(fd);
        int error;

        if (!f.file)
                return -EBADF;

        if (d_can_lookup(f.file->f_path.dentry)) {
                error = inode_permission(file_inode(f.file),
                                         MAY_EXEC | MAY_CHDIR);
                if (!error)
                        set_fs_pwd(current->fs, &f.file->f_path);
        } else {
                error = -ENOTDIR;
        }

        fdput(f);
        return error;
}

SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

retry:
        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
        if (error)
                return error;

        error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
        if (error)
                goto release;

        /* The capability is checked against the caller's user namespace */
        if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) {
                error = -EPERM;
                goto release;
        }

        /* The root is changed only when the security hook agrees */
        error = security_path_chroot(&path);
        if (!error)
                set_fs_root(current->fs, &path);

release:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

/*
 * Apply the permission bits in @mode to the inode behind @path.
 *
 * Only the S_IALLUGO bits of @mode are used; all other bits of the inode's
 * current i_mode are kept.  Write access to the mount is held for the
 * duration of the call.  Returns 0 or a negative errno.
 */
int chmod_common(const struct path *path, umode_t mode)
{
        struct inode *inode = path->dentry->d_inode;
        struct inode *delegated_inode = NULL;
        struct iattr newattrs;
        int error;

        error = mnt_want_write(path->mnt);
        if (error)
                return error;

        for (;;) {
                inode_lock(inode);
                error = security_path_chmod(path, mode);
                if (!error) {
                        newattrs.ia_mode = (mode & S_IALLUGO) |
                                           (inode->i_mode & ~S_IALLUGO);
                        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
                        error = notify_change(path->dentry, &newattrs,
                                              &delegated_inode);
                }
                inode_unlock(inode);

                /* Nothing reported through delegated_inode: we are done */
                if (!delegated_inode)
                        break;

                /* Retry the change once the wait succeeded */
                error = break_deleg_wait(&delegated_inode);
                if (error)
                        break;
        }

        mnt_drop_write(path->mnt);
        return error;
}

/* Audit @file, then change the mode of the object it refers to. */
int vfs_fchmod(struct file *file, umode_t mode)
{
        const struct path *where = &file->f_path;

        audit_file(file);
        return chmod_common(where, mode);
}

SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
        struct fd f = fdget(fd);
        int err;

        /* No file behind the descriptor */
        if (!f.file)
                return -EBADF;

        err = vfs_fchmod(f.file, mode);
        fdput(f);
        return err;
}

/*
 * Look up @filename relative to @dfd (following symlinks) and change its
 * mode.  Returns 0 or a negative errno.
 */
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct path path;
        int error;

retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                return error;

        error = chmod_common(&path, mode);
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
                umode_t, mode)
{
        int ret;

        ret = do_fchmodat(dfd, filename, mode);
        return ret;
}

SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
        /* Same as fchmodat() resolved from AT_FDCWD */
        const int dfd = AT_FDCWD;

        return do_fchmodat(dfd, filename, mode);
}

/*
 * Change owner and/or group of the inode behind @path.
 *
 * A @user or @group of -1 leaves that id untouched.  Ids are mapped
 * through the caller's user namespace; an id that does not map yields
 * -EINVAL.  Returns 0 or a negative errno.
 */
int chown_common(const struct path *path, uid_t user, gid_t group)
{
        struct inode *inode = path->dentry->d_inode;
        struct inode *delegated_inode = NULL;
        struct iattr newattrs;
        kuid_t uid = make_kuid(current_user_ns(), user);
        kgid_t gid = make_kgid(current_user_ns(), group);
        int error;

        do {
                /* ia_valid is rebuilt from scratch on every attempt */
                newattrs.ia_valid =  ATTR_CTIME;
                if (user != (uid_t) -1) {
                        if (!uid_valid(uid))
                                return -EINVAL;
                        newattrs.ia_valid |= ATTR_UID;
                        newattrs.ia_uid = uid;
                }
                if (group != (gid_t) -1) {
                        if (!gid_valid(gid))
                                return -EINVAL;
                        newattrs.ia_valid |= ATTR_GID;
                        newattrs.ia_gid = gid;
                }

                inode_lock(inode);
                /* Non-directories also get the KILL_* attribute flags */
                if (!S_ISDIR(inode->i_mode))
                        newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
                                             setattr_should_drop_sgid(inode);
                error = security_path_chown(path, uid, gid);
                if (!error)
                        error = notify_change(path->dentry, &newattrs,
                                              &delegated_inode);
                inode_unlock(inode);

                if (!delegated_inode)
                        break;

                /* Go around again only if the wait succeeded */
                error = break_deleg_wait(&delegated_inode);
        } while (!error);

        return error;
}

/*
 * Common implementation of chown(2), lchown(2) and fchownat(2).
 *
 * @dfd:      directory file descriptor the lookup starts from
 * @filename: user-space path name
 * @user:     new owner, passed through to chown_common()
 * @group:    new group, passed through to chown_common()
 * @flag:     AT_SYMLINK_NOFOLLOW and/or AT_EMPTY_PATH; anything else
 *            yields -EINVAL
 *
 * Returns 0 or a negative errno.
 */
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
                int flag)
{
        struct path path;
        /* unsigned, like the lookup flags in the sibling helpers here */
        unsigned int lookup_flags;
        int error;

        if (flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
        if (flag & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;
retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                return error;

        /* Write access to the mount is held only around the change */
        error = mnt_want_write(path.mnt);
        if (!error) {
                error = chown_common(&path, user, group);
                mnt_drop_write(path.mnt);
        }

        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
                gid_t, group, int, flag)
{
        int ret;

        /* @flag is validated by do_fchownat() */
        ret = do_fchownat(dfd, filename, user, group, flag);
        return ret;
}

SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
        /* No AT_* flags: do_fchownat() then looks up with LOOKUP_FOLLOW */
        const int flag = 0;

        return do_fchownat(AT_FDCWD, filename, user, group, flag);
}

SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
        /* Same as chown() except that AT_SYMLINK_NOFOLLOW is passed */
        const int flag = AT_SYMLINK_NOFOLLOW;

        return do_fchownat(AT_FDCWD, filename, user, group, flag);
}

/*
 * Change owner/group of the object behind an open file, holding write
 * access to the file's mount around the change.  Returns 0 or a negative
 * errno.
 */
int vfs_fchown(struct file *file, uid_t user, gid_t group)
{
        int error = mnt_want_write_file(file);

        if (error)
                return error;

        audit_file(file);
        error = chown_common(&file->f_path, user, group);
        mnt_drop_write_file(file);

        return error;
}

/* Descriptor-based front end of vfs_fchown(); -EBADF if @fd has no file. */
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = vfs_fchown(f.file, user, group);
        fdput(f);
        return error;
}

SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
{
        int ret;

        ret = ksys_fchown(fd, user, group);
        return ret;
}

/*
 * Finish setting up @f as an open file on @inode.
 *
 * f->f_path is read here (a reference is taken with path_get()), so it is
 * expected to have been filled in by the caller, as vfs_open() and
 * finish_open() do.  @open, when non-NULL, is called instead of
 * f->f_op->open.
 *
 * Returns 0 on success or a negative errno.  When failure goes through the
 * cleanup labels, the path reference is dropped again and f_path/f_inode
 * are reset to NULL.
 */
static int do_dentry_open(struct file *f,
                          struct inode *inode,
                          int (*open)(struct inode *, struct file *))
{
        static const struct file_operations empty_fops = {};
        int error;

        path_get(&f->f_path);
        f->f_inode = inode;
        f->f_mapping = inode->i_mapping;
        f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
        f->f_sb_err = file_sample_sb_err(f);

        /* O_PATH: no fops lookup, no ->open() call, no access accounting */
        if (unlikely(f->f_flags & O_PATH)) {
                f->f_mode = FMODE_PATH | FMODE_OPENED;
                f->f_op = &empty_fops;
                return 0;
        }

        /*
         * Writers of non-special files take write access on both the inode
         * and the mount.  FMODE_WRITER records that both were taken so that
         * cleanup_all knows to release them.
         */
        if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
                error = get_write_access(inode);
                if (unlikely(error))
                        goto cleanup_file;
                error = __mnt_want_write(f->f_path.mnt);
                if (unlikely(error)) {
                        put_write_access(inode);
                        goto cleanup_file;
                }
                f->f_mode |= FMODE_WRITER;
        }

        /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
                f->f_mode |= FMODE_ATOMIC_POS;

        f->f_op = fops_get(inode->i_fop);
        if (WARN_ON(!f->f_op)) {
                error = -ENODEV;
                goto cleanup_all;
        }

        error = security_file_open(f);
        if (error)
                goto cleanup_all;

        error = break_lease(locks_inode(f), f->f_flags);
        if (error)
                goto cleanup_all;

        /* normally all 3 are set; ->open() can clear them if needed */
        f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        /* A caller-supplied callback takes precedence over f_op->open */
        if (!open)
                open = f->f_op->open;
        if (open) {
                error = open(inode, f);
                if (error)
                        goto cleanup_all;
        }
        f->f_mode |= FMODE_OPENED;
        /* Read-only opens are counted on the inode */
        if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_inc(inode);
        /* FMODE_CAN_* is only set when the fops provide a matching method */
        if ((f->f_mode & FMODE_READ) &&
             likely(f->f_op->read || f->f_op->read_iter))
                f->f_mode |= FMODE_CAN_READ;
        if ((f->f_mode & FMODE_WRITE) &&
             likely(f->f_op->write || f->f_op->write_iter))
                f->f_mode |= FMODE_CAN_WRITE;

        f->f_write_hint = WRITE_LIFE_NOT_SET;
        /* These flags are not kept in f_flags once the open is done */
        f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

        file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

        /* NB: we're sure to have correct a_ops only after f_op->open */
        if (f->f_flags & O_DIRECT) {
                /*
                 * NOTE(review): this returns without going through the
                 * cleanup labels, with FMODE_OPENED already set.
                 * Presumably the caller's fput() undoes the open in that
                 * case - confirm.
                 */
                if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
                        return -EINVAL;
        }

        /*
         * XXX: Huge page cache doesn't support writing yet. Drop all page
         * cache for this file before processing writes.
         */
        if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
                truncate_pagecache(inode, 0);

        return 0;

cleanup_all:
        /* A positive value is not a valid error; warn once and use -EINVAL */
        if (WARN_ON_ONCE(error > 0))
                error = -EINVAL;
        fops_put(f->f_op);
        if (f->f_mode & FMODE_WRITER) {
                put_write_access(inode);
                __mnt_drop_write(f->f_path.mnt);
        }
cleanup_file:
        /* Undo the path_get() from the top and detach the file from it */
        path_put(&f->f_path);
        f->f_path.mnt = NULL;
        f->f_path.dentry = NULL;
        f->f_inode = NULL;
        return error;
}

/**
 * finish_open - finish opening a file
 * @file: file pointer
 * @dentry: pointer to dentry
 * @open: open callback
 *
 * This can be used to finish opening a file passed to i_op->atomic_open().
 *
 * If the open callback is set to NULL, then the standard f_op->open()
 * filesystem callback is substituted.
 *
 * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
 * the return value of d_splice_alias(), then the caller needs to perform dput()
 * on it after finish_open().
 *
 * Returns zero on success or -errno if the open failed.
 */
int finish_open(struct file *file, struct dentry *dentry,
                int (*open)(struct inode *, struct file *))
{
        struct inode *inode;

        /* once it's opened, it's opened */
        BUG_ON(file->f_mode & FMODE_OPENED);

        file->f_path.dentry = dentry;
        inode = d_backing_inode(dentry);
        return do_dentry_open(file, inode, open);
}
EXPORT_SYMBOL(finish_open);

/**
 * finish_no_open - finish ->atomic_open() without opening the file
 *
 * @file: file pointer
 * @dentry: dentry or NULL (as returned from ->lookup())
 *
 * This can be used to set the result of a successful lookup in ->atomic_open().
 *
 * NB: unlike finish_open() this function does consume the dentry reference and
 * the caller need not dput() it.
 *
 * Returns "0" which must be the return value of ->atomic_open() after having
 * called this function.
 */
int finish_no_open(struct file *file, struct dentry *dentry)
{
        file->f_path.dentry = dentry;
        return 0;
}
EXPORT_SYMBOL(finish_no_open);

/* Run d_path() on the path of @filp, using @buf of @buflen bytes. */
char *file_path(struct file *filp, char *buf, int buflen)
{
        const struct path *where = &filp->f_path;

        return d_path(where, buf, buflen);
}
EXPORT_SYMBOL(file_path);

/**
 * vfs_open - open the file at the given path
 * @path: path to open
 * @file: newly allocated file with f_flag initialized
 *
 * Copies @path into @file and completes the open through do_dentry_open()
 * using the filesystem's own ->open() method.
 *
 * Returns zero on success or -errno if the open failed.
 */
int vfs_open(const struct path *path, struct file *file)
{
        struct inode *inode;

        file->f_path = *path;
        inode = d_backing_inode(path->dentry);
        return do_dentry_open(file, inode, NULL);
}

struct file *dentry_open(const struct path *path, int flags,
                         const struct cred *cred)
{
        int error;
        struct file *f;

        validate_creds(cred);

        /* We must always pass in a valid mount pointer. */
        BUG_ON(!path->mnt);

        f = alloc_empty_file(flags, cred);
        if (!IS_ERR(f)) {
                error = vfs_open(path, f);
                if (error) {
                        fput(f);
                        f = ERR_PTR(error);
                }
        }
        return f;
}
EXPORT_SYMBOL(dentry_open);

struct file *open_with_fake_path(const struct path *path, int flags,
                                struct inode *inode, const struct cred *cred)
{
        struct file *f = alloc_empty_file_noaccount(flags, cred);
        if (!IS_ERR(f)) {
                int error;

                f->f_path = *path;
                error = do_dentry_open(f, inode, NULL);
                if (error) {
                        fput(f);
                        f = ERR_PTR(error);
                }
        }
        return f;
}
EXPORT_SYMBOL(open_with_fake_path);

/*
 * Non-zero when @flags contain O_CREAT or __O_TMPFILE.  The argument is
 * parenthesized so that compound expressions such as "a | b" are masked
 * as a whole.
 */
#define WILL_CREATE(flags)        ((flags) & (O_CREAT | __O_TMPFILE))
/* The only open flags that may be combined with O_PATH. */
#define O_PATH_FLAGS                (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)

/*
 * Build a struct open_how from legacy open(2)-style arguments.
 *
 * Flags outside VALID_OPEN_FLAGS are dropped, and with O_PATH set only
 * O_PATH_FLAGS are kept.  The mode is masked to S_IALLUGO and is zero
 * unless the flags are create-like.
 */
inline struct open_how build_open_how(int flags, umode_t mode)
{
        struct open_how how = {
                .flags = flags & VALID_OPEN_FLAGS,
        };

        /* O_PATH beats everything else. */
        if (how.flags & O_PATH)
                how.flags &= O_PATH_FLAGS;

        /* Modes should only be set for create-like flags. */
        how.mode = WILL_CREATE(how.flags) ? (mode & S_IALLUGO) : 0;

        return how;
}

/*
 * Validate @how and translate it into the internal open state in @op.
 *
 * Fills in op->mode, op->open_flag, op->acc_mode, op->intent and
 * op->lookup_flags.  Returns 0 on success, -EINVAL for an invalid
 * flag/mode/resolve combination, or -EAGAIN when RESOLVE_CACHED is
 * combined with O_TRUNC, O_CREAT or __O_TMPFILE.  On error @op may have
 * been partially written.
 */
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
        u64 flags = how->flags;
        u64 strip = FMODE_NONOTIFY | O_CLOEXEC;
        int lookup_flags = 0;
        /* Computed from the flags as passed in, before anything is stripped */
        int acc_mode = ACC_MODE(flags);

        BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
                         "struct open_flags doesn't yet handle flags > 32 bits");

        /*
         * Strip flags that either shouldn't be set by userspace like
         * FMODE_NONOTIFY or that aren't relevant in determining struct
         * open_flags like O_CLOEXEC.
         */
        flags &= ~strip;

        /*
         * Older syscalls implicitly clear all of the invalid flags or argument
         * values before calling build_open_flags(), but openat2(2) checks all
         * of its arguments.
         */
        if (flags & ~VALID_OPEN_FLAGS)
                return -EINVAL;
        if (how->resolve & ~VALID_RESOLVE_FLAGS)
                return -EINVAL;

        /* Scoping flags are mutually exclusive. */
        if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
                return -EINVAL;

        /* Deal with the mode. */
        if (WILL_CREATE(flags)) {
                if (how->mode & ~S_IALLUGO)
                        return -EINVAL;
                op->mode = how->mode | S_IFREG;
        } else {
                /* A mode without a create-like flag is rejected */
                if (how->mode != 0)
                        return -EINVAL;
                op->mode = 0;
        }

        /*
         * In order to ensure programs get explicit errors when trying to use
         * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it
         * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we
         * have to require userspace to explicitly set it.
         */
        if (flags & __O_TMPFILE) {
                if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
                        return -EINVAL;
                if (!(acc_mode & MAY_WRITE))
                        return -EINVAL;
        }
        if (flags & O_PATH) {
                /* O_PATH only permits certain other flags to be set. */
                if (flags & ~O_PATH_FLAGS)
                        return -EINVAL;
                acc_mode = 0;
        }

        /*
         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
         * check for O_DSYNC if they need any syncing at all we enforce it's
         * always set instead of having to deal with possibly weird behaviour
         * for malicious applications setting only __O_SYNC.
         */
        if (flags & __O_SYNC)
                flags |= O_DSYNC;

        /* Later changes to the local flags no longer reach op->open_flag */
        op->open_flag = flags;

        /* O_TRUNC implies we need access checks for write permissions */
        if (flags & O_TRUNC)
                acc_mode |= MAY_WRITE;

        /* Allow the LSM permission hook to distinguish append
           access from general write access. */
        if (flags & O_APPEND)
                acc_mode |= MAY_APPEND;

        op->acc_mode = acc_mode;

        op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;

        if (flags & O_CREAT) {
                op->intent |= LOOKUP_CREATE;
                if (flags & O_EXCL) {
                        op->intent |= LOOKUP_EXCL;
                        /*
                         * Only the lookup flags computed below see this;
                         * op->open_flag was stored earlier.
                         */
                        flags |= O_NOFOLLOW;
                }
        }

        if (flags & O_DIRECTORY)
                lookup_flags |= LOOKUP_DIRECTORY;
        if (!(flags & O_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;

        /* Map each RESOLVE_* request onto its LOOKUP_* counterpart */
        if (how->resolve & RESOLVE_NO_XDEV)
                lookup_flags |= LOOKUP_NO_XDEV;
        if (how->resolve & RESOLVE_NO_MAGICLINKS)
                lookup_flags |= LOOKUP_NO_MAGICLINKS;
        if (how->resolve & RESOLVE_NO_SYMLINKS)
                lookup_flags |= LOOKUP_NO_SYMLINKS;
        if (how->resolve & RESOLVE_BENEATH)
                lookup_flags |= LOOKUP_BENEATH;
        if (how->resolve & RESOLVE_IN_ROOT)
                lookup_flags |= LOOKUP_IN_ROOT;
        if (how->resolve & RESOLVE_CACHED) {
                /* Don't bother even trying for create/truncate/tmpfile open */
                if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
                        return -EAGAIN;
                lookup_flags |= LOOKUP_CACHED;
        }

        op->lookup_flags = lookup_flags;
        return 0;
}

/**
 * file_open_name - open file and return file pointer
 *
 * @name:        struct filename containing path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * This is the helper to open a file from kernelspace if you really
 * have to.  But in general you should not do this, so please move
 * along, nothing to see here..
 */
struct file *file_open_name(struct filename *name, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        /* A flag combination rejected by build_open_flags() ends here */
        err = build_open_flags(&how, &op);

        return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
}

/**
 * filp_open - open file and return file pointer
 *
 * @filename:        path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * This is the helper to open a file from kernelspace if you really
 * have to.  But in general you should not do this, so please move
 * along, nothing to see here..
 */
struct file *filp_open(const char *filename, int flags, umode_t mode)
{
        struct filename *name = getname_kernel(filename);
        struct file *file;

        /* Hand a failed name lookup back as a file error pointer */
        if (IS_ERR(name))
                return ERR_CAST(name);

        file = file_open_name(name, flags, mode);
        putname(name);
        return file;
}
EXPORT_SYMBOL(filp_open);

/*
 * Like file_open_name(), but takes a plain string and hands the
 * @dentry/@mnt pair to do_file_open_root().  Returns the opened file or
 * an error pointer.
 */
struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
                            const char *filename, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        err = build_open_flags(&how, &op);
        if (err)
                return ERR_PTR(err);

        return do_file_open_root(dentry, mnt, filename, &op);
}
EXPORT_SYMBOL(file_open_root);

/*
 * Common back end of the open family of syscalls.
 *
 * Validates @how, copies the path name in from user space, reserves a
 * descriptor and opens the file.  Returns the new descriptor or a negative
 * errno.
 */
static long do_sys_openat2(int dfd, const char __user *filename,
                           struct open_how *how)
{
        struct open_flags op;
        struct filename *tmp;
        struct file *f;
        int fd;

        fd = build_open_flags(how, &op);
        if (fd)
                return fd;

        tmp = getname(filename);
        if (IS_ERR(tmp))
                return PTR_ERR(tmp);

        fd = get_unused_fd_flags(how->flags);
        if (fd < 0)
                goto out;

        f = do_filp_open(dfd, tmp, &op);
        if (IS_ERR(f)) {
                /* Give the reserved descriptor back */
                put_unused_fd(fd);
                fd = PTR_ERR(f);
                goto out;
        }

        fsnotify_open(f);
        fd_install(fd, f);
out:
        putname(tmp);
        return fd;
}

/* Legacy entry point: convert @flags/@mode to an open_how and open. */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
        struct open_how how;

        how = build_open_how(flags, mode);
        return do_sys_openat2(dfd, filename, &how);
}


SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        /* O_LARGEFILE is added whenever force_o_largefile() says so */
        const int lfs = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(AT_FDCWD, filename, flags | lfs, mode);
}

SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
                umode_t, mode)
{
        /* O_LARGEFILE is added whenever force_o_largefile() says so */
        const int lfs = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(dfd, filename, flags | lfs, mode);
}

SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
                struct open_how __user *, how, size_t, usize)
{
        struct open_how tmp;
        int err;

        BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
        BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);

        /* Anything smaller than the first version of the struct is rejected */
        if (unlikely(usize < OPEN_HOW_SIZE_VER0))
                return -EINVAL;

        err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize);
        if (err)
                return err;

        /* O_LARGEFILE is only allowed for non-O_PATH. */
        if (force_o_largefile() && !(tmp.flags & O_PATH))
                tmp.flags |= O_LARGEFILE;

        return do_sys_openat2(dfd, filename, &tmp);
}

#ifdef CONFIG_COMPAT
/*
 * Exactly like sys_open(), except that it doesn't set the
 * O_LARGEFILE flag.
 */
COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        /* @flags go through untouched: no O_LARGEFILE is added here */
        const int dfd = AT_FDCWD;

        return do_sys_open(dfd, filename, flags, mode);
}

/*
 * Exactly like sys_openat(), except that it doesn't set the
 * O_LARGEFILE flag.
 */
COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
        long ret;

        /* @flags go through untouched: no O_LARGEFILE is added here */
        ret = do_sys_open(dfd, filename, flags, mode);
        return ret;
}
#endif

#ifndef __alpha__

/*
 * For backward compatibility?  Maybe this should be moved
 * into arch/i386 instead?
 */
SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
{
        /* creat() is open() with a fixed create/write-only/truncate flag set */
        const int base = O_CREAT | O_WRONLY | O_TRUNC;
        int flags = force_o_largefile() ? (base | O_LARGEFILE) : base;

        return do_sys_open(AT_FDCWD, pathname, flags, mode);
}
#endif

/*
 * "id" is the POSIX thread ID. We use the
 * files pointer for this..
 */
int filp_close(struct file *filp, fl_owner_t id)
{
        int retval = 0;

        /* A file without references is reported and otherwise left alone */
        if (file_count(filp) == 0) {
                printk(KERN_ERR "VFS: Close: file count is 0\n");
                return 0;
        }

        /* The return value of ->flush(), if there is one, is what we report */
        if (filp->f_op->flush)
                retval = filp->f_op->flush(filp, id);

        /* FMODE_PATH files skip the dnotify and POSIX lock teardown */
        if (unlikely(filp->f_mode & FMODE_PATH))
                goto drop;

        dnotify_flush(filp, id);
        locks_remove_posix(filp, id);
drop:
        fput(filp);
        return retval;
}

EXPORT_SYMBOL(filp_close);

/*
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
        int retval = __close_fd(current->files, fd);

        /* can't restart close syscall because file table entry was cleared */
        switch (retval) {
        case -ERESTARTSYS:
        case -ERESTARTNOINTR:
        case -ERESTARTNOHAND:
        case -ERESTART_RESTARTBLOCK:
                return -EINTR;
        default:
                return retval;
        }
}

/**
 * close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  reserved for future extensions
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 * Currently, errors to close a given file descriptor are ignored.
 */
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
                unsigned int, flags)
{
        int ret;

        /* All three arguments are passed through to __close_range() as is */
        ret = __close_range(fd, max_fd, flags);
        return ret;
}

/*
 * This routine simulates a hangup on the tty, to arrange that users
 * are given clean terminals at login time.
 */
SYSCALL_DEFINE0(vhangup)
{
        /* Requires CAP_SYS_TTY_CONFIG */
        if (!capable(CAP_SYS_TTY_CONFIG))
                return -EPERM;

        tty_vhangup_self();
        return 0;
}

/*
 * Called when an inode is about to be opened.
 * We use this to disallow opening large files on 32bit systems if
 * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
 * on this flag in sys_open.
 */
int generic_file_open(struct inode * inode, struct file * filp)
{
        /* With O_LARGEFILE the size is not checked at all */
        if (filp->f_flags & O_LARGEFILE)
                return 0;

        /* Without it, files larger than MAX_NON_LFS are refused */
        if (i_size_read(inode) > MAX_NON_LFS)
                return -EOVERFLOW;

        return 0;
}

EXPORT_SYMBOL(generic_file_open);

/*
 * This is used by subsystems that don't want seekable
 * file descriptors. The function is not supposed to ever fail, the only
 * reason it returns an 'int' and not 'void' is so that it can be plugged
 * directly into file_operations structure.
 */
int nonseekable_open(struct inode *inode, struct file *filp)
{
        /* Clear the seek and positional read/write mode bits */
        filp->f_mode = filp->f_mode &
                       ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
        return 0;
}

EXPORT_SYMBOL(nonseekable_open);

/*
 * stream_open is used by subsystems that want stream-like file descriptors.
 * Such file descriptors are not seekable and don't have notion of position
 * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
 * Contrary to file descriptors of other regular files, .read() and .write()
 * can run simultaneously.
 *
 * stream_open never fails and is marked to return int so that it could be
 * directly used as file_operations.open .
 */
int stream_open(struct inode *inode, struct file *filp)
{
        /*
         * Clear the seek, positional I/O and atomic-position mode bits,
         * then mark the file as a stream.
         */
        filp->f_mode = (filp->f_mode &
                        ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE |
                          FMODE_ATOMIC_POS)) |
                       FMODE_STREAM;
        return 0;
}

EXPORT_SYMBOL(stream_open);























































































































































































































































































    1 

    1 





    1 

    1 






















































































































































































































































    1 


    1 

































































































































































































































































    1 



    1 


















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include "internal.h"

/**
 * kfree_const - free memory unless it lives in .rodata
 * @x: pointer to the memory
 *
 * Pointers into the kernel .rodata section are left alone; anything else is
 * handed to kfree().
 */
void kfree_const(const void *x)
{
        if (is_kernel_rodata((unsigned long)x))
                return;

        kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * kstrdup - duplicate a NUL-terminated string into freshly allocated memory
 * @s: the string to duplicate, may be %NULL
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s, or %NULL if @s is %NULL or the
 * allocation fails
 */
char *kstrdup(const char *s, gfp_t gfp)
{
        size_t size;
        char *copy;

        if (s == NULL)
                return NULL;

        /* The size includes the terminator so one memcpy() copies it too. */
        size = strlen(s) + 1;
        copy = kmalloc_track_caller(size, gfp);
        if (copy == NULL)
                return NULL;

        memcpy(copy, s, size);
        return copy;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - duplicate a const string only when necessary
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Strings obtained from kstrdup_const() should be released with
 * kfree_const() and must not be passed to krealloc().
 *
 * Return: @s itself when it resides in the .rodata section, otherwise the
 * result of kstrdup().
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
        return is_kernel_rodata((unsigned long)s) ? s : kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

/**
 * kstrndup - duplicate at most @max characters of a string
 * @s: the string to duplicate, may be %NULL
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated, NUL-terminated copy of @s, or %NULL if @s is
 * %NULL or the allocation fails
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
        size_t n;
        char *copy;

        if (s == NULL)
                return NULL;

        n = strnlen(s, max);
        copy = kmalloc_track_caller(n + 1, gfp);
        if (copy == NULL)
                return NULL;

        memcpy(copy, s, n);
        copy[n] = '\0';
        return copy;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - copy a memory region into a new allocation
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL if the allocation fails
 */
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
        void *copy = kmalloc_track_caller(len, gfp);

        if (copy == NULL)
                return NULL;

        memcpy(copy, src, len);
        return copy;
}
EXPORT_SYMBOL(kmemdup);

/**
 * kmemdup_nul - turn unterminated data into a NUL-terminated string
 * @s: The data to stringify, may be %NULL
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of the first @len bytes of @s followed by a
 * NUL byte, or %NULL if @s is %NULL or the allocation fails
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
        char *copy;

        if (s == NULL)
                return NULL;

        /* One extra byte for the terminator appended below. */
        copy = kmalloc_track_caller(len + 1, gfp);
        if (copy == NULL)
                return NULL;

        memcpy(copy, s, len);
        copy[len] = '\0';
        return copy;
}
EXPORT_SYMBOL(kmemdup_nul);

/**
 * memdup_user - copy a user-space memory region into a kmalloc'ed buffer
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: the new buffer, or ERR_PTR(-ENOMEM) if the allocation fails, or
 * ERR_PTR(-EFAULT) if the copy from user space fails.  The result is
 * physically contiguous and is to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
        void *copy = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);

        if (copy == NULL)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(copy, src, len) == 0)
                return copy;

        /* Partial copy: release the buffer and report the fault. */
        kfree(copy);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(memdup_user);

/**
 * vmemdup_user - copy a user-space memory region into a kvmalloc'ed buffer
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: the new buffer, or ERR_PTR(-ENOMEM) if the allocation fails, or
 * ERR_PTR(-EFAULT) if the copy from user space fails.  The result may be
 * not physically contiguous.  Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
        void *copy = kvmalloc(len, GFP_USER);

        if (copy == NULL)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(copy, src, len) == 0)
                return copy;

        /* Partial copy: release the buffer and report the fault. */
        kvfree(copy);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - copy a string from user space into a new allocation
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s, ERR_PTR(-EFAULT) if strnlen_user()
 * reports a length of zero, ERR_PTR(-EINVAL) if the reported length exceeds
 * @n, or whatever error memdup_user() returns.
 */
char *strndup_user(const char __user *s, long n)
{
        long len = strnlen_user(s, n);
        char *copy;

        if (len == 0)
                return ERR_PTR(-EFAULT);
        if (len > n)
                return ERR_PTR(-EINVAL);

        copy = memdup_user(s, len);
        /* Force termination of the last copied byte on success. */
        if (!IS_ERR(copy))
                copy[len - 1] = '\0';

        return copy;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - copy a user-space memory region and NUL-terminate it
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: the new buffer of @len + 1 bytes, or ERR_PTR(-ENOMEM) if the
 * allocation fails, or ERR_PTR(-EFAULT) if the copy from user space fails.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
        char *copy;

        /*
         * GFP_KERNEL is used unconditionally: copy_from_user() can sleep and
         * take a page fault, so GFP_NOFS or GFP_ATOMIC would be pointless.
         */
        copy = kmalloc_track_caller(len + 1, GFP_KERNEL);
        if (copy == NULL)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(copy, src, len) != 0) {
                kfree(copy);
                return ERR_PTR(-EFAULT);
        }

        copy[len] = '\0';
        return copy;
}
EXPORT_SYMBOL(memdup_user_nul);

/*
 * Insert @vma into the doubly linked vm_next/vm_prev list of @mm right after
 * @prev.  A NULL @prev makes @vma the new list head (mm->mmap).
 */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev)
{
        /* The forward link to rewrite: prev's successor, or the list head. */
        struct vm_area_struct **slot = prev ? &prev->vm_next : &mm->mmap;
        struct vm_area_struct *succ = *slot;

        *slot = vma;
        vma->vm_prev = prev;
        vma->vm_next = succ;
        if (succ)
                succ->vm_prev = vma;
}

/*
 * Remove @vma from the doubly linked vm_next/vm_prev list of @mm, updating
 * mm->mmap when @vma was the list head.  @vma's own links are left as is.
 */
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
        struct vm_area_struct *before = vma->vm_prev;
        struct vm_area_struct *after = vma->vm_next;

        if (before)
                before->vm_next = after;
        else
                mm->mmap = after;

        if (after)
                after->vm_prev = before;
}

/*
 * Report whether the current task's stack pointer lies within @vma, i.e.
 * whether @vma is being used as a stack by this task.  Returns 1 or 0.
 */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
        struct task_struct * __maybe_unused t = current;

        if (vma->vm_start > KSTK_ESP(t))
                return 0;

        return vma->vm_end >= KSTK_ESP(t);
}

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
#endif

/*
 * Page-align @stack_top and, when the current task has PF_RANDOMIZE set,
 * shift it by a random number of pages bounded by STACK_RND_MASK.  The shift
 * is upwards with CONFIG_STACK_GROWSUP and downwards otherwise.
 */
unsigned long randomize_stack_top(unsigned long stack_top)
{
        unsigned long offset = 0;

        if (current->flags & PF_RANDOMIZE)
                offset = (get_random_long() & STACK_RND_MASK) << PAGE_SHIFT;

#ifdef CONFIG_STACK_GROWSUP
        return PAGE_ALIGN(stack_top) + offset;
#else
        return PAGE_ALIGN(stack_top) - offset;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start:        The smallest acceptable address the caller will take.
 * @range:        The size of the area, starting at @start, within which the
 *                random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned.  We now align it regardless.  Aligning
 * @start upwards shrinks @range by the same amount; if @range is too small
 * to survive that, it is treated as empty.
 *
 * Return: A page aligned address within [start, start + range).  When no
 * whole page fits in the (possibly shrunk) range, the page aligned @start is
 * returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
        if (!PAGE_ALIGNED(start)) {
                unsigned long slack = PAGE_ALIGN(start) - start;

                /*
                 * Rounding @start up consumes @slack bytes of @range.  Clamp
                 * to zero instead of letting the unsigned subtraction wrap
                 * around to a huge range when @range is not larger than that.
                 */
                range = range > slack ? range - slack : 0;
                start = PAGE_ALIGN(start);
        }

        /* Cap the range so that start + range cannot overflow. */
        if (start > ULONG_MAX - range)
                range = ULONG_MAX - start;

        /* From here on @range counts whole pages. */
        range >>= PAGE_SHIFT;

        if (range == 0)
                return start;

        return start + (get_random_long() % range << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
/*
 * Pick a randomized, page aligned brk starting at mm->brk.  The window is
 * 32MB for 32-bit (or compat) tasks and 1GB otherwise.
 */
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long window = SZ_1G;

        /* Is the current task 32bit ? */
        if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
                window = SZ_32M;

        return randomize_page(mm->brk, window);
}

/*
 * Return a random mmap offset: a random number of pages limited to
 * mmap_rnd_bits bits (mmap_rnd_compat_bits for compat tasks, when the
 * architecture provides them), converted to bytes.
 */
unsigned long arch_mmap_rnd(void)
{
        unsigned long mask;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
        if (is_compat_task())
                mask = (1UL << mmap_rnd_compat_bits) - 1;
        else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
                mask = (1UL << mmap_rnd_bits) - 1;

        return (get_random_long() & mask) << PAGE_SHIFT;
}

/*
 * Decide whether the legacy mmap layout applies: it does when the task's
 * personality has ADDR_COMPAT_LAYOUT, when the stack rlimit is unlimited, or
 * when the sysctl_legacy_va_layout setting is non-zero.
 */
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
        if ((current->personality & ADDR_COMPAT_LAYOUT) ||
            rlim_stack->rlim_cur == RLIM_INFINITY)
                return 1;

        return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP                (SZ_128M)
#define MAX_GAP                (STACK_TOP / 6 * 5)

/*
 * Compute the top-down mmap base: STACK_TOP minus the stack gap and the
 * random offset @rnd, page aligned.  The gap starts from the stack rlimit,
 * is padded by the guard gap (plus the maximum stack randomisation when
 * PF_RANDOMIZE is set) and is then limited to [MIN_GAP, MAX_GAP].
 */
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
        unsigned long room = rlim_stack->rlim_cur;
        unsigned long slack = stack_guard_gap;

        /* Account for stack randomization if necessary */
        if (current->flags & PF_RANDOMIZE)
                slack += (STACK_RND_MASK << PAGE_SHIFT);

        /* Values close to RLIM_INFINITY can overflow. */
        if (room + slack > room)
                room += slack;

        room = (room < MIN_GAP) ? MIN_GAP :
               (room > MAX_GAP) ? MAX_GAP : room;

        return PAGE_ALIGN(STACK_TOP - room - rnd);
}

/*
 * Choose the mmap layout for @mm: bottom-up from TASK_UNMAPPED_BASE when
 * mmap_is_legacy() says so, top-down from mmap_base() otherwise.  A random
 * offset from arch_mmap_rnd() is applied when the task has PF_RANDOMIZE.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        unsigned long rnd = 0UL;

        if (current->flags & PF_RANDOMIZE)
                rnd = arch_mmap_rnd();

        if (!mmap_is_legacy(rlim_stack)) {
                mm->mmap_base = mmap_base(rnd, rlim_stack);
                mm->get_unmapped_area = arch_get_unmapped_area_topdown;
                return;
        }

        mm->mmap_base = TASK_UNMAPPED_BASE + rnd;
        mm->get_unmapped_area = arch_get_unmapped_area;
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Fallback layout: always bottom-up from TASK_UNMAPPED_BASE, with no
 * randomisation.  @rlim_stack is not consulted.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        mm->get_unmapped_area = arch_get_unmapped_area;
        mm->mmap_base = TASK_UNMAPPED_BASE;
}
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be added, %false if subtracted
 * @task:        task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.  On -ENOMEM, mm->locked_vm is left
 * unchanged.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim)
{
        unsigned long locked_vm;
        int ret = 0;

        mmap_assert_write_locked(mm);

        locked_vm = mm->locked_vm;
        if (!inc) {
                /* Uncharging more than is accounted indicates a caller bug. */
                WARN_ON_ONCE(pages > locked_vm);
                mm->locked_vm = locked_vm - pages;
        } else if (!bypass_rlim &&
                   locked_vm + pages >
                   (task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT)) {
                ret = -ENOMEM;
        } else {
                mm->locked_vm = locked_vm + pages;
        }

        pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
                 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
                 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
                 ret ? " - exceeded" : "");

        return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be added, %false if subtracted
 *
 * Takes mmap_lock for writing around __account_locked_vm(), charging the
 * current task; the RLIMIT_MEMLOCK check is bypassed when the caller has
 * CAP_IPC_LOCK.  Assumes a non-NULL @mm is valid (i.e. at least one
 * reference on it).
 *
 * Return:
 * * 0       on success, or if mm is NULL or @pages is 0
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
        int ret = 0;

        if (mm && pages != 0) {
                mmap_write_lock(mm);
                ret = __account_locked_vm(mm, pages, inc, current,
                                          capable(CAP_IPC_LOCK));
                mmap_write_unlock(mm);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

/*
 * Map @file into the current task's address space.  After the
 * security_mmap_file() check passes, do_mmap() is run under the killable
 * mmap write lock; then pending userfaultfd unmap events are completed and
 * the range is populated if do_mmap() asked for it.
 *
 * Returns what do_mmap() returns, the non-zero security_mmap_file() result,
 * or -EINTR if taking the lock was interrupted.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long pgoff)
{
        struct mm_struct *mm = current->mm;
        unsigned long populate;
        unsigned long ret;
        LIST_HEAD(uf);

        ret = security_mmap_file(file, prot, flag);
        if (ret)
                return ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
                      &uf);
        mmap_write_unlock(mm);

        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(ret, populate);

        return ret;
}

/*
 * Byte-offset front end to vm_mmap_pgoff().  Rejects with -EINVAL an
 * @offset that is not page aligned or for which @offset plus the page
 * aligned @len wraps around.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long offset)
{
        if (unlikely(offset + PAGE_ALIGN(len) < offset) ||
            unlikely(offset_in_page(offset)))
                return -EINVAL;

        return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 * failure, fall back to non-contiguous (vmalloc) allocation.
 * @size: size of the request.
 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 * @node: numa node to allocate from
 *
 * The memory is first requested from kmalloc; if that fails for a request
 * larger than a page, the vmalloc allocator is tried instead.  Use kvfree
 * for freeing the memory.
 *
 * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 *
 * A @flags value that is not a superset of GFP_KERNEL never falls back to
 * vmalloc: such requests are served by kmalloc only.
 *
 * Return: pointer to the allocated memory or %NULL in case of failure
 */
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
        gfp_t kmalloc_flags = flags;
        void *ret;

        /*
         * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
         * so the given set of flags has to be compatible.
         */
        if ((flags & GFP_KERNEL) != GFP_KERNEL)
                return kmalloc_node(size, flags, node);

        /*
         * It doesn't really make sense to fallback to vmalloc for sub page
         * requests, so those are a plain kmalloc with the caller's flags.
         */
        if (size <= PAGE_SIZE)
                return kmalloc_node(size, kmalloc_flags, node);

        /*
         * We want to attempt a large physically contiguous block first because
         * it is less likely to fragment multiple larger blocks and therefore
         * contribute to a long term fragmentation less than vmalloc fallback.
         * However make sure that larger requests are not too disruptive - no
         * OOM killer and no allocation failure warnings as we have a fallback.
         */
        kmalloc_flags |= __GFP_NOWARN;
        if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
                kmalloc_flags |= __GFP_NORETRY;

        ret = kmalloc_node(size, kmalloc_flags, node);
        if (ret)
                return ret;

        /* Don't even allow crazy sizes */
        if (unlikely(size > INT_MAX)) {
                WARN_ON_ONCE(!(flags & __GFP_NOWARN));
                return NULL;
        }

        return __vmalloc_node(size, 1, flags, node,
                        __builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);

/**
 * kvfree() - Free memory.
 * @addr: Pointer to allocated memory.
 *
 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc():
 * vmalloc addresses go to vfree(), everything else to kfree().
 * It is slightly more efficient to use kfree() or vfree() if you are certain
 * that you know which one to use.
 *
 * Context: Either preemptible task context or not-NMI interrupt.
 */
void kvfree(const void *addr)
{
        if (!is_vmalloc_addr(addr)) {
                kfree(addr);
                return;
        }

        vfree(addr);
}
EXPORT_SYMBOL(kvfree);

/**
 * kvfree_sensitive - Free a data object containing sensitive information.
 * @addr: address of the data object to be freed.
 * @len: length of the data object.
 *
 * The object is wiped with memzero_explicit() before it is passed to
 * kvfree(), so that the compiler won't optimize out the data clearing.
 * Nothing is done when @addr is %NULL or the zero-size pointer.
 */
void kvfree_sensitive(const void *addr, size_t len)
{
        if (unlikely(ZERO_OR_NULL_PTR(addr)))
                return;

        memzero_explicit((void *)addr, len);
        kvfree(addr);
}
EXPORT_SYMBOL(kvfree_sensitive);

/*
 * Grow a kvmalloc'ed buffer.  When @newsize does not exceed @oldsize the
 * original pointer is returned unchanged.  Otherwise a new buffer of
 * @newsize bytes is allocated with @flags, the first @oldsize bytes are
 * copied over and @p is freed.  If the allocation fails, NULL is returned
 * and @p is left untouched.
 */
void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
{
        void *grown;

        if (newsize <= oldsize)
                return (void *)p;

        grown = kvmalloc(newsize, flags);
        if (grown) {
                memcpy(grown, p, oldsize);
                kvfree(p);
        }

        return grown;
}
EXPORT_SYMBOL(kvrealloc);

/* Return page->mapping with the PAGE_MAPPING_FLAGS bits masked off. */
static inline void *__page_rmapping(struct page *page)
{
        return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
}

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: the result of __vmalloc() for @n * @size bytes, or %NULL if that
 * product overflows.
 */
void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
{
        size_t total;

        return unlikely(check_mul_overflow(n, size, &total)) ?
                NULL : __vmalloc(total, flags);
}
EXPORT_SYMBOL(__vmalloc_array);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Shorthand for __vmalloc_array() with GFP_KERNEL.
 */
void *vmalloc_array(size_t n, size_t size)
{
        void *array = __vmalloc_array(n, size, GFP_KERNEL);

        return array;
}
EXPORT_SYMBOL(vmalloc_array);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Same as __vmalloc_array() with __GFP_ZERO added to @flags.
 */
void *__vcalloc(size_t n, size_t size, gfp_t flags)
{
        gfp_t zeroing = flags | __GFP_ZERO;

        return __vmalloc_array(n, size, zeroing);
}
EXPORT_SYMBOL(__vcalloc);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Shorthand for __vcalloc() with GFP_KERNEL.
 */
void *vcalloc(size_t n, size_t size)
{
        return __vcalloc(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vcalloc);

/*
 * Neutral page->mapping pointer to address_space or anon_vma or other,
 * taken from the compound head of @page with the flag bits stripped.
 */
void *page_rmapping(struct page *page)
{
        return __page_rmapping(compound_head(page));
}

/*
 * Return true if this page is mapped into pagetables.
 * For a compound page, true is returned if the compound mapcount is
 * non-negative or, unless it is a hugetlb page, if any subpage is mapped.
 */
bool page_mapped(struct page *page)
{
        struct page *head;
        int idx = 0;

        if (likely(!PageCompound(page)))
                return atomic_read(&page->_mapcount) >= 0;

        head = compound_head(page);
        if (atomic_read(compound_mapcount_ptr(head)) >= 0)
                return true;
        if (PageHuge(head))
                return false;

        /* Look for any individually mapped subpage. */
        while (idx < compound_nr(head)) {
                if (atomic_read(&head[idx]._mapcount) >= 0)
                        return true;
                idx++;
        }

        return false;
}
EXPORT_SYMBOL(page_mapped);

/*
 * Return the anon_vma of @page's compound head, or NULL unless the flag
 * bits of its mapping pointer are exactly PAGE_MAPPING_ANON.
 */
struct anon_vma *page_anon_vma(struct page *page)
{
        struct page *head = compound_head(page);
        unsigned long raw = (unsigned long)head->mapping;

        if ((raw & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON)
                return __page_rmapping(head);

        return NULL;
}

struct address_space *page_mapping(struct page *page)
{
        struct address_space *mapping;

        page = compound_head(page);

        /* This happens if someone calls flush_dcache_page on slab page */
        if (unlikely(PageSlab(page)))
                return NULL;

        if (unlikely(PageSwapCache(page))) {
                swp_entry_t entry;

                entry.val = page_private(page);
                return swap_address_space(entry);
        }

        mapping = page->mapping;
        if ((unsigned long)mapping & PAGE_MAPPING_ANON)
                return NULL;

        return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
}
EXPORT_SYMBOL(page_mapping);

/*
 * For file cache pages, return the address_space, otherwise return NULL.
 * Swap cache pages are filtered out here before page_mapping() is asked.
 */
struct address_space *page_mapping_file(struct page *page)
{
        return unlikely(PageSwapCache(page)) ? NULL : page_mapping(page);
}

/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
        int total = atomic_read(&page->_mapcount) + 1;
        struct page *head;

        /*
         * For file THP page->_mapcount contains total number of mapping
         * of the page: no need to look into compound_mapcount.  Only anon
         * and hugetlb pages add the head's compound mapcount.
         */
        if (PageAnon(page) || PageHuge(page)) {
                head = compound_head(page);
                total += atomic_read(compound_mapcount_ptr(head)) + 1;
                if (PageDoubleMap(head))
                        total--;
        }

        return total;
}
EXPORT_SYMBOL_GPL(__page_mapcount);

/*
 * Overcommit and reserve tunables.  vm_commit_limit() uses
 * sysctl_overcommit_kbytes when it is non-zero and sysctl_overcommit_ratio
 * otherwise; the ratio and kbytes sysctl handlers in this file each zero
 * the other value on a successful write.  The two reserve values are in
 * kilobytes and are consumed by __vm_enough_memory().
 */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

/*
 * sysctl handler for the overcommit ratio.  A successful write also clears
 * sysctl_overcommit_kbytes, so that the ratio is the setting in effect.
 */
int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int ret = proc_dointvec(table, write, buffer, lenp, ppos);

        if (ret || !write)
                return ret;

        sysctl_overcommit_kbytes = 0;
        return ret;
}

/* Per-CPU work item: fold the percpu deltas of vm_committed_as. */
static void sync_overcommit_as(struct work_struct *dummy)
{
        percpu_counter_sync(&vm_committed_as);
}

/*
 * sysctl handler for the overcommit policy.  Reads go straight to
 * proc_dointvec_minmax(); writes are parsed into a local first so that the
 * policy is only switched after the counter batch has been adjusted.
 */
int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        struct ctl_table t;
        int new_policy = -1;
        int ret;

        if (!write)
                return proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /*
         * The deviation of sync_overcommit_as could be big with loose policy
         * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
         * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
         * with the strict "NEVER", and to avoid possible race condition (even
         * though user usually won't too frequently do the switching to policy
         * OVERCOMMIT_NEVER), the switch is done in the following order:
         *        1. changing the batch
         *        2. sync percpu count on each CPU
         *        3. switch the policy
         */
        t = *table;
        t.data = &new_policy;
        ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
        if (ret || new_policy == -1)
                return ret;

        mm_compute_batch(new_policy);
        if (new_policy == OVERCOMMIT_NEVER)
                schedule_on_each_cpu(sync_overcommit_as);
        sysctl_overcommit_memory = new_policy;

        return ret;
}

/*
 * sysctl handler for the overcommit limit in kbytes.  A successful write
 * also clears sysctl_overcommit_ratio.
 */
int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (ret || !write)
                return ret;

        sysctl_overcommit_ratio = 0;
        return ret;
}

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used.
 *
 * The limit, in pages, is total_swap_pages plus either
 * sysctl_overcommit_kbytes (when non-zero) or sysctl_overcommit_ratio
 * percent of the RAM pages that are not hugetlb pages.
 */
unsigned long vm_commit_limit(void)
{
        unsigned long ram_share;

        ram_share = sysctl_overcommit_kbytes ?
                sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10) :
                ((totalram_pages() - hugetlb_total_pages())
                 * sysctl_overcommit_ratio / 100);

        return ram_share + total_swap_pages;
}

/*
 * Make sure vm_committed_as sits in a cacheline of its own and does not
 * share one with other variables. It can be updated by several CPUs
 * frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * Return the summed, non-negative value of vm_committed_as.
 *
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for big
 * platform like a 2S/36C/72T Skylake server, in worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
        unsigned long committed = percpu_counter_sum_positive(&vm_committed_as);

        return committed;
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
        long limit;

        /* Charge the request up front; the failure path below undoes it. */
        vm_acct_memory(pages);

        /* OVERCOMMIT_ALWAYS: the request is never refused. */
        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        /* OVERCOMMIT_GUESS: refuse only what exceeds RAM plus swap. */
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                if (pages <= totalram_pages() + total_swap_pages)
                        return 0;
                goto fail;
        }

        /* Remaining policy: compare against the commit limit less reserves. */
        limit = vm_commit_limit();

        /* Callers without admin privilege must leave the admin reserve. */
        if (!cap_sys_admin)
                limit -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

        /*
         * With an mm, also hold back the smaller of 1/32 of its total_vm
         * and the user reserve, so one process cannot grow so large that
         * the user is unable to recover.
         */
        if (mm) {
                long user_reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

                limit -= min_t(long, mm->total_vm / 32, user_reserve);
        }

        if (percpu_counter_read_positive(&vm_committed_as) < limit)
                return 0;
fail:
        vm_unacct_memory(pages);

        return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task:     the task whose cmdline value to copy.
 * @buffer:   the buffer to copy to.
 * @buflen:   the length of the buffer. Larger cmdline values are truncated
 *            to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NUL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
        unsigned long arg_start, arg_end, env_start, env_end;
        struct mm_struct *mm = get_task_mm(task);
        unsigned int nbytes;
        int copied = 0;

        if (!mm)
                return 0;
        if (!mm->arg_end)
                goto out_mm;        /* Shh! No looking before we're done */

        /* Snapshot the arg/env boundaries under arg_lock. */
        spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        /* Never copy more than the caller's buffer can hold. */
        nbytes = arg_end - arg_start;
        if (nbytes > buflen)
                nbytes = buflen;

        copied = access_process_vm(task, arg_start, buffer, nbytes, FOLL_FORCE);

        /*
         * Done unless something was copied, the last copied byte is not a
         * nul, and the buffer still has room beyond the argument area.
         */
        if (copied <= 0 || buffer[copied-1] == '\0' || nbytes >= buflen)
                goto out_mm;

        /*
         * The nul at the end of args has been overwritten, so assume the
         * application is using setproctitle(3).
         */
        nbytes = strnlen(buffer, copied);
        if (nbytes < copied) {
                /* A nul sits earlier in the data: stop there. */
                copied = nbytes;
                goto out_mm;
        }

        /* No nul at all: continue into the environment area. */
        nbytes = env_end - env_start;
        if (nbytes > buflen - copied)
                nbytes = buflen - copied;
        copied += access_process_vm(task, env_start,
                                    buffer+copied, nbytes,
                                    FOLL_FORCE);
        copied = strnlen(buffer, copied);

out_mm:
        mmput(mm);
        return copied;
}

int __weak memcmp_pages(struct page *page1, struct page *page2)
{
        char *va1 = kmap_atomic(page1);
        char *va2 = kmap_atomic(page2);
        /* memcmp() result over one full page of each mapping. */
        int diff = memcmp(va1, va2, PAGE_SIZE);

        /* Unmap in the reverse order of mapping. */
        kunmap_atomic(va2);
        kunmap_atomic(va1);

        return diff;
}





































































































































































































































    1 





    1 








    1 
    1 


    1 



























    1 









    1 





    1 
































    1 
    1 



    1 







































































    1 





    1 

    1 








    1 






    1 
    1 


    1 







    1 






































































































    1 














































    1 





    1 


    1 



















    1 
























    1 
    1 


    1 





































    1 

    1 













































    1 





    1 




    1 



    1 


















































    1 





    1 


    1 

    1 
    1 




    1 

    1 


    1 

    1 
















































    1 



    1 











    1 






















































































































































































    1 




    1 


    1 







    1 



































    1 








    1 

    1 








    1 




























    1 



    1 






    1 

    1 





    1 


    1 










    1 




    1 



    1 









































































































































































































































































































































































































    1 























































































    1 
    1 
    1 












    1 

    1 



    1 













    1 

    1 











    1 















    1 
    1 
    1 







    1 









































    1 


















    1 


    1 



    1 


    1 



















































    1 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009                SUSE Linux Products GmbH
 * Copyright (C) 2009                Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017                Facebook Inc.
 * Copyright (C) 2017                Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
 * flag should be passed.  All memcg-aware allocations are sharing one set
 * of chunks and all unaccounted allocations and allocations performed
 * by processes belonging to the root memory cgroup are using the second set.
 *
 * The allocator tries to allocate from the fullest chunk first. Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT                5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD        3

#define PCPU_EMPTY_POP_PAGES_LOW        2
#define PCPU_EMPTY_POP_PAGES_HIGH        4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
/*
 * address -> percpu pointer: subtract pcpu_base_addr and add
 * __per_cpu_start.  Only defined here if not already provided.
 */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)                                        \
        (void __percpu *)((unsigned long)(addr) -                        \
                          (unsigned long)pcpu_base_addr        +                \
                          (unsigned long)__per_cpu_start)
#endif
/*
 * percpu pointer -> address: the inverse of the above, adding
 * pcpu_base_addr and subtracting __per_cpu_start.
 */
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)                                                \
        (void __force *)((unsigned long)(ptr) +                                \
                         (unsigned long)pcpu_base_addr -                \
                         (unsigned long)__per_cpu_start)
#endif
#else        /* CONFIG_SMP */
/* on UP, it's always identity mapped */
/* both directions are plain casts with no arithmetic */
#define __addr_to_pcpu_ptr(addr)        (void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)                (void __force *)(ptr)
#endif        /* CONFIG_SMP */

static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __ro_after_init;                /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;        /* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);        /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);        /* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */

/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages by chunk type, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
        /* Nothing is queued until pcpu_async_enabled has been set. */
        if (!pcpu_async_enabled)
                return;

        schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - test whether an address falls inside a chunk
 * @chunk: chunk to test against, may be NULL
 * @addr: percpu address
 *
 * The range checked starts start_offset bytes past the chunk's base_addr
 * and stops end_offset bytes short of the end of its nr_pages pages.
 *
 * RETURNS:
 * True if @addr lies inside that range; false otherwise, including when
 * @chunk is NULL.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
        void *first, *limit;

        if (!chunk)
                return false;

        first = chunk->base_addr + chunk->start_offset;
        limit = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
                chunk->end_offset;

        /* Half-open interval: first is included, limit is not. */
        if (addr < first)
                return false;

        return addr < limit;
}

static int __pcpu_size_to_slot(int size)
{
        /* size is in bytes; the slot is derived from its highest set bit */
        int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;

        /* never hand out a slot below 1 */
        return slot < 1 ? 1 : slot;
}

static int pcpu_size_to_slot(int size)
{
        /* Anything other than a whole unit goes through the generic mapping. */
        if (size != pcpu_unit_size)
                return __pcpu_size_to_slot(size);

        /* A size equal to the unit size always maps to the last slot. */
        return pcpu_nr_slots - 1;
}

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
        const struct pcpu_block_md *md = &chunk->chunk_md;

        /*
         * A chunk with at least one minimum allocation free and a non-zero
         * contig_hint is slotted by that hint, converted to bytes.
         */
        if (chunk->free_bytes >= PCPU_MIN_ALLOC_SIZE && md->contig_hint != 0)
                return pcpu_size_to_slot(md->contig_hint * PCPU_MIN_ALLOC_SIZE);

        /* Everything else goes to slot 0. */
        return 0;
}

/* record the owning chunk in a page struct by storing its pointer in ->index */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
        unsigned long cookie = (unsigned long)pcpu;

        page->index = cookie;
}

/* read back the chunk pointer stored in a page struct's ->index */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
        unsigned long cookie = page->index;

        return (struct pcpu_chunk *)cookie;
}

static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
        /* pcpu_unit_map translates a cpu number into its unit number */
        int unit = pcpu_unit_map[cpu];

        return unit * pcpu_unit_pages + page_idx;
}

/* offset of page @page_idx of @cpu's unit: the cpu's unit offset plus the page offset */
static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
        return (page_idx << PAGE_SHIFT) + pcpu_unit_offsets[cpu];
}

/* address of page @page_idx of @cpu's unit inside @chunk */
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
                                     unsigned int cpu, int page_idx)
{
        unsigned long base = (unsigned long)chunk->base_addr;

        return base + pcpu_unit_page_offset(cpu, page_idx);
}

/*
 * The following helper functions access bitmaps and convert between
 * bitmap offsets and address offsets.
 */
/* first word of @chunk's allocation bitmap that belongs to block @index */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
        return &chunk->alloc_map[index * PCPU_BITMAP_BLOCK_BITS /
                                 BITS_PER_LONG];
}

/* index of the metadata block that contains chunk bit offset @off */
static unsigned long pcpu_off_to_block_index(int off)
{
        unsigned long index = off / PCPU_BITMAP_BLOCK_BITS;

        return index;
}

/*
 * Position of chunk bit offset @off inside its block.  The mask form
 * assumes PCPU_BITMAP_BLOCK_BITS is a power of two - TODO confirm.
 */
static unsigned long pcpu_off_to_block_off(int off)
{
        unsigned long block_off = off & (PCPU_BITMAP_BLOCK_BITS - 1);

        return block_off;
}

/* chunk bit offset of position @off inside block @index */
static unsigned long pcpu_block_off_to_off(int index, int off)
{
        unsigned long chunk_off = index * PCPU_BITMAP_BLOCK_BITS + off;

        return chunk_off;
}

/*
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * Picks the offset a scan should start from.  The default is first_free,
 * which gives first-fit behaviour.  The scan may instead start right after
 * the scan_hint area when all three of the following hold: a scan_hint
 * exists, the contig_hint starts after the scan_hint, and the request is
 * larger than the scan_hint.  In that case the scan_hint area is known to
 * be too small and the contig_hint remains available as the fallback.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
        /* no scan_hint recorded - nothing can be skipped */
        if (!block->scan_hint)
                return block->first_free;

        /* the contig_hint does not lie after the scan_hint */
        if (block->contig_hint_start <= block->scan_hint_start)
                return block->first_free;

        /* the request is not larger than the scan_hint area */
        if (alloc_bits <= block->scan_hint)
                return block->first_free;

        return block->scan_hint_start + block->scan_hint;
}

/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset; in: where to resume, out: start of the area found
 * @bits: size of free area (output; reset to 0 on entry)
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 *
 * There is no explicit end marker: if the walk runs past the last block,
 * the outputs keep whatever the last iteration stored (and *bits stays 0
 * if no block was visited at all).
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
                                     int *bits)
{
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /* handles contig area across blocks */
                if (*bits) {
                        /*
                         * *bits holds the trailing free run of the previous
                         * block(s); extend it with this block's leading run.
                         */
                        *bits += block->left_free;
                        /* a completely free block lets the area keep growing */
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                        return;
                }

                /*
                 * This checks three things.  First is there a contig_hint to
                 * check.  Second, have we checked this hint before by
                 * comparing the block_off.  Third, is this the same as the
                 * right contig hint.  In the last case, it spills over into
                 * the next block and should be handled by the contig area
                 * across blocks code.
                 */
                *bits = block->contig_hint;
                if (*bits && block->contig_hint_start >= block_off &&
                    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
                        *bit_off = pcpu_block_off_to_off(i,
                                        block->contig_hint_start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * Seed a candidate from this block's trailing free run.  If
                 * right_free is 0, *bits is 0 and the next iteration simply
                 * inspects the next block's contig_hint.
                 */
                *bits = block->right_free;
                *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
        }
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset; in: where to resume, out: start of the candidate
 * @bits: size of free area (output; reset to 0 on entry)
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 *
 * When no block can satisfy the request, @bit_off is set to
 * pcpu_chunk_map_bits(@chunk), which is the value the loop condition of
 * pcpu_for_each_fit_region tests against.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
                                 int align, int *bit_off, int *bits)
{
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /* handles contig area across blocks */
                if (*bits) {
                        /*
                         * *bits holds the aligned free tail carried over from
                         * the previous block(s); *bit_off already points at
                         * its start.
                         */
                        *bits += block->left_free;
                        if (*bits >= alloc_bits)
                                return;
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                }

                /* check block->contig_hint */
                /* *bits temporarily holds the padding needed to align the hint's start */
                *bits = ALIGN(block->contig_hint_start, align) -
                        block->contig_hint_start;
                /*
                 * This uses the block offset to determine if this has been
                 * checked in the prior iteration.
                 */
                if (block->contig_hint &&
                    block->contig_hint_start >= block_off &&
                    block->contig_hint >= *bits + alloc_bits) {
                        int start = pcpu_next_hint(block, alloc_bits);

                        /*
                         * Report the span from the chosen scan start through
                         * the end of an aligned allocation placed at the
                         * contig_hint.
                         */
                        *bits += alloc_bits + block->contig_hint_start -
                                 start;
                        *bit_off = pcpu_block_off_to_off(i, start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * Fall back to the block's trailing free run with its start
                 * rounded up to @align.  *bit_off is block-relative until the
                 * conversion two lines below.
                 */
                *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
                                 align);
                *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
                *bit_off = pcpu_block_off_to_off(i, *bit_off);
                if (*bits >= alloc_bits)
                        return;
        }

        /* no valid offsets were found - fail condition */
        *bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 *
 * Both macros expand their arguments more than once and take the address of
 * @bit_off and @bits, so pass plain lvalues without side effects.  Both stop
 * once @bit_off reaches pcpu_chunk_map_bits(@chunk).
 */
/*
 * Visit every free area recorded in the metadata, starting at @bit_off.
 * Each step resumes one bit past the area just reported; presumably that
 * bit is the allocated one that ended the area - confirm before relying on
 * it.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)                \
        for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));        \
             (bit_off) < pcpu_chunk_map_bits((chunk));                        \
             (bit_off) += (bits) + 1,                                        \
             pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

/*
 * Visit candidate areas able to hold @alloc_bits with alignment @align,
 * starting at @bit_off.  Each step resumes right after the previous
 * candidate.
 */
#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
        for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits));                                      \
             (bit_off) < pcpu_chunk_map_bits((chunk));                              \
             (bit_off) += (bits),                                              \
             pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits)))

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size zeroed bytes.  Requests of up to and including PAGE_SIZE
 * are served by kzalloc(); anything larger goes to __vmalloc() with
 * __GFP_ZERO added to @gfp.  Calling this before the slab allocator is
 * available triggers a one-time warning and fails.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
        if (WARN_ON_ONCE(!slab_is_available()))
                return NULL;

        if (size > PAGE_SIZE)
                return __vmalloc(size, gfp | __GFP_ZERO);

        return kzalloc(size, gfp);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Release @ptr through kvfree().  @ptr should have been obtained from
 * pcpu_mem_zalloc(), which may return either kzalloc() or __vmalloc()
 * memory.
 */
static void pcpu_mem_free(void *ptr)
{
        kvfree(ptr);
}

/*
 * Move @chunk onto slot list @slot of its chunk type: via list_move() when
 * @move_front is set, via list_move_tail() otherwise.  The reserved chunk
 * is left untouched.
 */
static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
                              bool move_front)
{
        struct list_head *slots;

        if (chunk == pcpu_reserved_chunk)
                return;

        slots = pcpu_chunk_list(pcpu_chunk_type(chunk));
        if (!move_front)
                list_move_tail(&chunk->list, &slots[slot]);
        else
                list_move(&chunk->list, &slots[slot]);
}

/* move @chunk onto slot list @slot using the move_front variant */
static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
        const bool move_front = true;

        __pcpu_chunk_move(chunk, slot, move_front);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * Called after an allocation or free changed @chunk.  The slot matching the
 * chunk's current state is recomputed and, if it differs from @oslot, the
 * chunk is moved there: with list_move() when the new slot number is
 * higher than @oslot, with list_move_tail() when it is lower.  The reserved
 * chunk is never put on chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
        int nslot = pcpu_chunk_slot(chunk);

        if (nslot == oslot)
                return;

        __pcpu_chunk_move(chunk, nslot, nslot > oslot);
}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: signed change in the number of empty pages
 *
 * Keeps the empty page accounting current, on the premise that one md_block
 * covers one page.  The hint update functions detect when a block becomes
 * completely free or stops being so and pass the resulting delta here.  The
 * chunk's own counter is always adjusted; the per-chunk-type total is only
 * adjusted for chunks other than the reserved chunk.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
        chunk->nr_empty_pop_pages += nr;

        if (chunk == pcpu_reserved_chunk)
                return;

        pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
}

/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * Tells whether the hint region [a, b) and the allocated region [x, y)
 * share any position: the first must start before the second ends, and the
 * second must start before the first ends.
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
        /* first region starts at or beyond the end of the second */
        if (a >= y)
                return false;

        return x < b;
}

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 *
 * Fields touched: first_free, left_free, right_free, contig_hint(_start)
 * and scan_hint(_start).  Nothing is returned.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
        int contig = end - start;

        block->first_free = min(block->first_free, start);
        /* an area touching the left edge defines the leading free run */
        if (start == 0)
                block->left_free = contig;

        /* an area touching the right edge defines the trailing free run */
        if (end == block->nr_bits)
                block->right_free = contig;

        if (contig > block->contig_hint) {
                /* promote the old contig_hint to be the new scan_hint */
                if (start > block->contig_hint_start) {
                        if (block->contig_hint > block->scan_hint) {
                                block->scan_hint_start =
                                        block->contig_hint_start;
                                block->scan_hint = block->contig_hint;
                        } else if (start < block->scan_hint_start) {
                                /*
                                 * The old contig_hint == scan_hint.  But, the
                                 * new contig is larger so hold the invariant
                                 * scan_hint_start < contig_hint_start.
                                 */
                                block->scan_hint = 0;
                        }
                } else {
                        /* the new hint starts at or before the old one: drop the scan_hint */
                        block->scan_hint = 0;
                }
                block->contig_hint_start = start;
                block->contig_hint = contig;
        } else if (contig == block->contig_hint) {
                /*
                 * Same size as the current contig_hint: prefer the start
                 * with more trailing zero bits (offset 0 always wins).
                 */
                if (block->contig_hint_start &&
                    (!start ||
                     __ffs(start) > __ffs(block->contig_hint_start))) {
                        /* start has a better alignment so use it */
                        block->contig_hint_start = start;
                        if (start < block->scan_hint_start &&
                            block->contig_hint > block->scan_hint)
                                block->scan_hint = 0;
                } else if (start > block->scan_hint_start ||
                           block->contig_hint > block->scan_hint) {
                        /*
                         * Knowing contig == contig_hint, update the scan_hint
                         * if it is farther than or larger than the current
                         * scan_hint.
                         */
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        } else {
                /*
                 * The region is smaller than the contig_hint.  So only update
                 * the scan_hint if it is larger than or equal and farther than
                 * the current scan_hint.
                 */
                if ((start < block->contig_hint_start &&
                     (contig > block->scan_hint ||
                      (contig == block->scan_hint &&
                       start > block->scan_hint_start)))) {
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        }
}

/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * An allocation is placed in two steps: pcpu_find_block_fit() picks a block
 * that can hold it, then pcpu_alloc_area() scans for the exact spot.
 * Alignment requirements can leave holes behind that neither the alloc nor
 * the free path would otherwise notice.
 *
 * Given such a hole, feed it to pcpu_block_update() because it may change
 * the block's scan_hint.  The start of the hole is first extended backwards
 * to just after the closest preceding set bit, so free bits skipped for
 * alignment are included.  Holes that extend past the end of their block
 * are ignored.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
                                   int bits)
{
        int area_start = pcpu_off_to_block_off(bit_off);
        int area_end = area_start + bits;
        int blk_idx, prev_set;

        if (area_end > PCPU_BITMAP_BLOCK_BITS)
                return;

        blk_idx = pcpu_off_to_block_index(bit_off);

        /* scan backwards in case of alignment skipping free bits */
        prev_set = find_last_bit(pcpu_index_alloc_map(chunk, blk_idx),
                                 area_start);
        if (prev_set == area_start)
                area_start = 0;
        else
                area_start = prev_set + 1;

        pcpu_block_update(&chunk->md_blocks[blk_idx], area_start, area_end);
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Walks the metadata blocks to rebuild the chunk's largest contig area.
 * On the allocation path this runs because the contig_hint was broken, so
 * a full scan can be avoided: an existing scan_hint is promoted to
 * contig_hint and the walk resumes right after it.  Without a scan_hint,
 * or when @full_scan is set (as on the free path, where the largest area
 * may span blocks), the contig_hint is cleared and the walk starts at
 * first_free.
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
        struct pcpu_block_md *md = &chunk->chunk_md;
        int off, len = 0;

        if (full_scan || !md->scan_hint) {
                off = md->first_free;
                md->contig_hint = 0;
        } else {
                /* promote scan_hint to contig_hint */
                off = md->scan_hint_start + md->scan_hint;
                md->contig_hint_start = md->scan_hint_start;
                md->contig_hint = md->scan_hint;
                md->scan_hint = 0;
        }

        pcpu_for_each_md_free_region(chunk, off, len)
                pcpu_block_update(md, off, off + len);
}

/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Rebuilds the hints of one metadata block from its part of the allocation
 * bitmap.  If the block has a scan_hint it is promoted to contig_hint and
 * the scan resumes right after it; otherwise the contig_hint is cleared and
 * the scan starts at first_free.  right_free is reset before the scan.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
        struct pcpu_block_md *md = &chunk->md_blocks[index];
        unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
        unsigned int rs, re, scan_from;        /* region start, region end */

        if (!md->scan_hint) {
                scan_from = md->first_free;
                md->contig_hint = 0;
        } else {
                /* promote scan_hint to contig_hint */
                scan_from = md->scan_hint_start + md->scan_hint;
                md->contig_hint_start = md->scan_hint_start;
                md->contig_hint = md->scan_hint;
                md->scan_hint = 0;
        }

        md->right_free = 0;

        /* feed every clear region from scan_from onwards to the hint update */
        bitmap_for_each_clear_region(alloc_map, rs, re, scan_from,
                                     PCPU_BITMAP_BLOCK_BITS)
                pcpu_block_update(md, rs, re);
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 *
 * The first block (s_block), the last block (e_block) and any blocks in
 * between are handled in turn, followed by the empty page accounting and
 * the chunk-level hints.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
                                         int bits)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the new allocation */
        int s_off, e_off;        /* block offsets of the new allocation */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Update s_block.
         * block->first_free must be updated if the allocation takes its place.
         * If the allocation breaks the contig_hint, a scan is required to
         * restore this hint.
         */
        /* a block whose contig_hint covers it entirely was empty until now */
        if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;

        if (s_off == s_block->first_free)
                s_block->first_free = find_next_zero_bit(
                                        pcpu_index_alloc_map(chunk, s_index),
                                        PCPU_BITMAP_BLOCK_BITS,
                                        s_off + bits);

        /* the allocation landed on the scan_hint area: the hint is gone */
        if (pcpu_region_overlap(s_block->scan_hint_start,
                                s_block->scan_hint_start + s_block->scan_hint,
                                s_off,
                                s_off + bits))
                s_block->scan_hint = 0;

        if (pcpu_region_overlap(s_block->contig_hint_start,
                                s_block->contig_hint_start +
                                s_block->contig_hint,
                                s_off,
                                s_off + bits)) {
                /* block contig hint is broken - scan to fix it */
                if (!s_off)
                        s_block->left_free = 0;
                pcpu_block_refresh_hint(chunk, s_index);
        } else {
                /* update left and right contig manually */
                s_block->left_free = min(s_block->left_free, s_off);
                if (s_index == e_index)
                        s_block->right_free = min_t(int, s_block->right_free,
                                        PCPU_BITMAP_BLOCK_BITS - e_off);
                else
                        s_block->right_free = 0;
        }

        /*
         * Update e_block.
         */
        if (s_index != e_index) {
                if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;

                /*
                 * When the allocation is across blocks, the end is along
                 * the left part of the e_block.
                 */
                e_block->first_free = find_next_zero_bit(
                                pcpu_index_alloc_map(chunk, e_index),
                                PCPU_BITMAP_BLOCK_BITS, e_off);

                if (e_off == PCPU_BITMAP_BLOCK_BITS) {
                        /*
                         * reset the block: the allocation covers all of it,
                         * so step past it and let the loop below clear it
                         * together with the in-between blocks
                         */
                        e_block++;
                } else {
                        if (e_off > e_block->scan_hint_start)
                                e_block->scan_hint = 0;

                        e_block->left_free = 0;
                        if (e_off > e_block->contig_hint_start) {
                                /* contig hint is broken - scan to fix it */
                                pcpu_block_refresh_hint(chunk, e_index);
                        } else {
                                e_block->right_free =
                                        min_t(int, e_block->right_free,
                                              PCPU_BITMAP_BLOCK_BITS - e_off);
                        }
                }

                /* update in-between md_blocks */
                /* every block strictly between s_block and the last block is counted as having been empty */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->scan_hint = 0;
                        block->contig_hint = 0;
                        block->left_free = 0;
                        block->right_free = 0;
                }
        }

        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, -nr_empty_pages);

        /* chunk-level scan_hint is dropped if the allocation landed on it */
        if (pcpu_region_overlap(chunk_md->scan_hint_start,
                                chunk_md->scan_hint_start +
                                chunk_md->scan_hint,
                                bit_off,
                                bit_off + bits))
                chunk_md->scan_hint = 0;

        /*
         * The only time a full chunk scan is required is if the chunk
         * contig hint is broken.  Otherwise, it means a smaller space
         * was used and therefore the chunk contig hint is still correct.
         */
        if (pcpu_region_overlap(chunk_md->contig_hint_start,
                                chunk_md->contig_hint_start +
                                chunk_md->contig_hint,
                                bit_off,
                                bit_off + bits))
                pcpu_chunk_refresh_hint(chunk, false);
}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
                                        int bits)
{
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the freed allocation */
        int s_off, e_off;        /* block offsets of the freed allocation */
        int start, end;                /* start and end of the whole free area */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Check if the freed area aligns with the block->contig_hint.
         * If it does, then the scan to find the beginning/end of the
         * larger free area can be avoided.
         *
         * start and end refer to beginning and end of the free area
         * within each their respective blocks.  This is not necessarily
         * the entire free area as it may span blocks past the beginning
         * or end of the block.  Both are block-relative offsets.
         */
        start = s_off;
        if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
                /* the freed area directly follows the contig_hint: merge */
                start = s_block->contig_hint_start;
        } else {
                /*
                 * Scan backwards to find the extent of the free area.
                 * find_last_bit() returns its size argument when no bit is
                 * set, so getting @start back means nothing is allocated
                 * before it in this block and the area begins at offset 0.
                 */
                int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
                                          start);
                start = (start == l_bit) ? 0 : l_bit + 1;
        }

        end = e_off;
        if (e_off == e_block->contig_hint_start)
                /* the contig_hint directly follows the freed area: merge */
                end = e_block->contig_hint_start + e_block->contig_hint;
        else
                end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
                                    PCPU_BITMAP_BLOCK_BITS, end);

        /* update s_block */
        e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
        if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;
        pcpu_block_update(s_block, start, e_off);

        /* the free spans more than one block */
        if (s_index != e_index) {
                /* update e_block */
                if (end == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;
                pcpu_block_update(e_block, 0, end);

                /* reset md_blocks in the middle */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->first_free = 0;
                        block->scan_hint = 0;
                        block->contig_hint_start = 0;
                        block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
                        block->left_free = PCPU_BITMAP_BLOCK_BITS;
                        block->right_free = PCPU_BITMAP_BLOCK_BITS;
                }
        }

        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, nr_empty_pages);

        /*
         * Refresh chunk metadata when the free makes a block free or spans
         * across blocks.  The contig_hint may be off by up to a page, but if
         * the contig_hint is contained in a block, it will be accurate with
         * the else condition below.
         *
         * chunk_md works in chunk-wide offsets, so both ends of the
         * block-relative area must be converted.  Passing a block-relative
         * end with a chunk-relative start gives a zero or negative length
         * for every block but the first, and the chunk hint is then never
         * refreshed.  In this branch s_index == e_index, so s_index is the
         * right base for both.
         */
        if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
                pcpu_chunk_refresh_hint(chunk, true);
        else
                pcpu_block_update(&chunk->chunk_md,
                                  pcpu_block_off_to_off(s_index, start),
                                  pcpu_block_off_to_off(s_index, end));
}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.  The
 * area is converted to the page range that covers it and that range is
 * searched for a clear region in @chunk's populated bitmap.
 *
 * RETURNS:
 * True if no clear region starts inside the page range.  Otherwise false,
 * with @next_off set to the bit offset matching the end of the first clear
 * region so that pcpu_find_block_fit can skip over it.  @next_off is only
 * written when false is returned.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
                              int *next_off)
{
        unsigned int rs, re, page_end;

        rs = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
        page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

        bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
        if (rs < page_end) {
                /* convert the end of the hole from pages back to bits */
                *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
                return false;
        }

        return true;
}

/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset at which the search
 * for a free region should begin.  The bitmap metadata blocks are walked to
 * find an offset that is guaranteed to fit the requirements.  This is not
 * quite first fit: a request that does not fit in the contig hint of a
 * block or chunk skips that block or chunk.  That errs on the side of
 * caution to prevent excess iteration, at the cost that poor alignment can
 * make the allocator skip blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
                               size_t align, bool pop_only)
{
        struct pcpu_block_md *md = &chunk->chunk_md;
        int off, len, skip_to;

        /*
         * Bail out early if the request, including the padding needed to
         * align the start of the chunk's contig hint, does not fit in that
         * hint.  The assumption is that if it cannot fit in the global
         * hint, there is memory pressure and a new chunk would be created
         * soon anyway.
         */
        off = ALIGN(md->contig_hint_start, align) - md->contig_hint_start;
        if (off + alloc_bits > md->contig_hint)
                return -1;

        len = 0;
        off = pcpu_next_hint(md, alloc_bits);
        pcpu_for_each_fit_region(chunk, alloc_bits, align, off, len) {
                if (pop_only &&
                    !pcpu_is_populated(chunk, off, len, &skip_to)) {
                        /* resume the walk past the unpopulated pages */
                        off = skip_to;
                        len = 0;
                        continue;
                }
                break;
        }

        return off == pcpu_chunk_map_bits(chunk) ? -1 : off;
}

/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * Repeatedly picks the next zero bit at or after @start, aligns it, and
 * checks whether @nr bits from there are all clear.  Each candidate that
 * turns out too small is compared against the caller's record of the
 * largest skipped area; a larger run replaces it, and on a tie the
 * candidate wins if the record sits at a non-zero offset and the candidate
 * is either at offset 0 or has more trailing zero bits in its offset.
 * Space lost to alignment is not tracked here.
 *
 * Returns the aligned start index of a fitting area, or a value greater
 * than @size (candidate start + @nr) when no area fits.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
                                         unsigned long size,
                                         unsigned long start,
                                         unsigned long nr,
                                         unsigned long align_mask,
                                         unsigned long *largest_off,
                                         unsigned long *largest_bits)
{
        unsigned long cand, cand_end, busy, run_bits;
        int better;

        for (;;) {
                cand = find_next_zero_bit(map, size, start);

                /* Align allocation */
                cand = __ALIGN_MASK(cand, align_mask);

                cand_end = cand + nr;
                if (cand_end > size)
                        return cand_end;

                busy = find_next_bit(map, cand_end, cand);
                if (busy >= cand_end)
                        return cand;

                /* too small: remember largest unused area, best alignment */
                run_bits = busy - cand;
                better = run_bits > *largest_bits;
                if (!better && run_bits == *largest_bits && *largest_off)
                        better = !cand || __ffs(cand) > __ffs(*largest_off);
                if (better) {
                        *largest_off = cand;
                        *largest_bits = run_bits;
                }

                start = busy + 1;
        }
}

/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * Scans the allocation map from @start for @alloc_bits free bits aligned
 * to @align.  The scan window ends at @start + @alloc_bits +
 * PCPU_BITMAP_BLOCK_BITS, clamped to the size of the chunk's map.  A scan
 * is needed because @start may be a block's first_free rather than the
 * exact fit; this tries to fill the allocation before breaking the contig
 * hint.  On success the allocation map, boundary map, free byte count,
 * first_free and block hints are updated and the chunk is relocated.
 *
 * Must be called with pcpu_lock held.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
                           size_t align, int start)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        unsigned long skipped_off = 0, skipped_bits = 0;
        size_t align_mask = 0;
        int bit_off, scan_end, prev_slot;

        if (align)
                align_mask = align - 1;

        lockdep_assert_held(&pcpu_lock);

        /* sampled up front; handed to pcpu_chunk_relocate() at the end */
        prev_slot = pcpu_chunk_slot(chunk);

        /* search a bounded window for a fit */
        scan_end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
                         pcpu_chunk_map_bits(chunk));
        bit_off = pcpu_find_zero_area(chunk->alloc_map, scan_end, start,
                                      alloc_bits, align_mask, &skipped_off,
                                      &skipped_bits);
        if (bit_off >= scan_end)
                return -1;

        /* feed the largest area skipped by the scan back into the hints */
        if (skipped_bits)
                pcpu_block_update_scan(chunk, skipped_off, skipped_bits);

        /* mark the area as allocated */
        bitmap_set(chunk->alloc_map, bit_off, alloc_bits);

        /* boundary bits at both ends, none inside the area */
        set_bit(bit_off, chunk->bound_map);
        bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
        set_bit(bit_off + alloc_bits, chunk->bound_map);

        chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;

        /* the first free bit was just consumed: find the next one */
        if (chunk_md->first_free == bit_off)
                chunk_md->first_free =
                        find_next_zero_bit(chunk->alloc_map,
                                           pcpu_chunk_map_bits(chunk),
                                           bit_off + alloc_bits);

        pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);

        pcpu_chunk_relocate(chunk, prev_slot);

        return bit_off * PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * The size of the allocation is recovered from the boundary bitmap: the
 * area ends at the next set boundary bit after its first bit.  The
 * matching range of the allocation map is cleared, the free byte count,
 * first_free and block hints are updated, and the chunk is relocated.
 *
 * Must be called with pcpu_lock held.
 *
 * RETURNS:
 * Number of freed bytes.
 */
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int first_bit, next_bound, nr_bits, nr_bytes, prev_slot;

        lockdep_assert_held(&pcpu_lock);
        pcpu_stats_area_dealloc(chunk);

        /* sampled up front; handed to pcpu_chunk_relocate() at the end */
        prev_slot = pcpu_chunk_slot(chunk);

        first_bit = off / PCPU_MIN_ALLOC_SIZE;

        /* the next boundary bit marks the end of this allocation */
        next_bound = find_next_bit(chunk->bound_map,
                                   pcpu_chunk_map_bits(chunk),
                                   first_bit + 1);
        nr_bits = next_bound - first_bit;
        bitmap_clear(chunk->alloc_map, first_bit, nr_bits);

        nr_bytes = nr_bits * PCPU_MIN_ALLOC_SIZE;

        /* update metadata */
        chunk->free_bytes += nr_bytes;

        /* the freed area may now be the lowest free bit */
        if (first_bit < chunk_md->first_free)
                chunk_md->first_free = first_bit;

        pcpu_block_update_hint_free(chunk, first_bit, nr_bits);

        pcpu_chunk_relocate(chunk, prev_slot);

        return nr_bytes;
}

/*
 * pcpu_init_md_block - reset a metadata block to the "entirely free" state
 * @block: metadata block to initialize
 * @nr_bits: number of bits covered by @block
 *
 * Only the fields written below are touched; any other members of @block
 * keep whatever value they had.
 */
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
{
        /* size of the block */
        block->nr_bits = nr_bits;

        /* everything is free, so every free-run length equals the size */
        block->right_free = nr_bits;
        block->left_free = nr_bits;
        block->contig_hint = nr_bits;

        /* free space starts at bit 0 and there is no scan hint yet */
        block->first_free = 0;
        block->scan_hint = 0;
}

/*
 * pcpu_init_md_blocks - initialize all metadata blocks of a chunk
 * @chunk: chunk whose metadata is initialized
 *
 * Initializes the chunk-wide block to span the whole allocation map, then
 * each entry of @chunk->md_blocks to span PCPU_BITMAP_BLOCK_BITS.
 */
static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
        int i;

        /* the chunk-level block covers the entire map */
        pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));

        /* every per-block entry covers one bitmap block */
        for (i = 0; i != pcpu_chunk_nr_blocks(chunk); i++)
                pcpu_init_md_block(chunk->md_blocks + i,
                                   PCPU_BITMAP_BLOCK_BITS);
}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * The chunk and its bitmaps come from memblock_alloc(); any allocation
 * failure panics.  The chunk is marked immutable with every page populated,
 * and the padding before and after the served region is marked allocated so
 * it is never handed out.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
                                                         int map_size)
{
        struct pcpu_chunk *chunk;
        unsigned long aligned_addr, lcm_align;
        int start_offset, offset_bits, region_size, region_bits;
        size_t alloc_size;

        /* region calculations */
        aligned_addr = tmp_addr & PAGE_MASK;

        /* bytes between the page-aligned base and the served region */
        start_offset = tmp_addr - aligned_addr;

        /*
         * Align the end of the region with the LCM of PAGE_SIZE and
         * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
         * the other.
         */
        lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
        region_size = ALIGN(start_offset + map_size, lcm_align);

        /* allocate chunk */
        alloc_size = struct_size(chunk, populated,
                                 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
        chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        INIT_LIST_HEAD(&chunk->list);

        chunk->base_addr = (void *)aligned_addr;
        chunk->start_offset = start_offset;
        /* bytes between the end of the served region and the aligned end */
        chunk->end_offset = region_size - chunk->start_offset - map_size;

        chunk->nr_pages = region_size >> PAGE_SHIFT;
        region_bits = pcpu_chunk_map_bits(chunk);

        alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
        chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->alloc_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        /* one extra bit: a boundary bit may be set at index region_bits */
        alloc_size =
                BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
        chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->bound_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
        chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->md_blocks)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

#ifdef CONFIG_MEMCG_KMEM
        /* first chunk isn't memcg-aware */
        chunk->obj_cgroups = NULL;
#endif
        pcpu_init_md_blocks(chunk);

        /* manage populated page bitmap */
        chunk->immutable = true;
        bitmap_fill(chunk->populated, chunk->nr_pages);
        chunk->nr_populated = chunk->nr_pages;
        chunk->nr_empty_pop_pages = chunk->nr_pages;

        /* only the served region counts as free; padding is hidden below */
        chunk->free_bytes = map_size;

        if (chunk->start_offset) {
                /* hide the beginning of the bitmap */
                offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map, 0, offset_bits);
                set_bit(0, chunk->bound_map);
                set_bit(offset_bits, chunk->bound_map);

                chunk->chunk_md.first_free = offset_bits;

                pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
        }

        if (chunk->end_offset) {
                /* hide the end of the bitmap */
                offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map,
                           pcpu_chunk_map_bits(chunk) - offset_bits,
                           offset_bits);
                set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
                        chunk->bound_map);
                set_bit(region_bits, chunk->bound_map);

                pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
                                             - offset_bits, offset_bits);
        }

        return chunk;
}

/*
 * pcpu_alloc_chunk - allocate a chunk descriptor together with its metadata
 * @type: chunk type; a memcg chunk additionally gets an obj_cgroups array
 * @gfp: allocation flags passed to pcpu_mem_zalloc()
 *
 * Allocates the chunk, its allocation map, boundary map (one bit longer
 * than the allocation map) and metadata blocks, initializes the metadata
 * and sets free_bytes to the full size of the chunk's pages.
 *
 * Returns the new chunk, or NULL if any allocation failed; in that case
 * everything allocated so far has been freed.
 */
static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
{
        struct pcpu_chunk *chunk;
        size_t bytes;
        int map_bits;

        chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
        if (!chunk)
                return NULL;

        INIT_LIST_HEAD(&chunk->list);
        chunk->nr_pages = pcpu_unit_pages;
        map_bits = pcpu_chunk_map_bits(chunk);

        bytes = BITS_TO_LONGS(map_bits) * sizeof(chunk->alloc_map[0]);
        chunk->alloc_map = pcpu_mem_zalloc(bytes, gfp);
        if (!chunk->alloc_map)
                goto free_chunk;

        bytes = BITS_TO_LONGS(map_bits + 1) * sizeof(chunk->bound_map[0]);
        chunk->bound_map = pcpu_mem_zalloc(bytes, gfp);
        if (!chunk->bound_map)
                goto free_alloc_map;

        bytes = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
        chunk->md_blocks = pcpu_mem_zalloc(bytes, gfp);
        if (!chunk->md_blocks)
                goto free_bound_map;

#ifdef CONFIG_MEMCG_KMEM
        if (pcpu_is_memcg_chunk(type)) {
                bytes = pcpu_chunk_map_bits(chunk) *
                        sizeof(struct obj_cgroup *);
                chunk->obj_cgroups = pcpu_mem_zalloc(bytes, gfp);
                if (!chunk->obj_cgroups)
                        goto free_md_blocks;
        }
#endif

        pcpu_init_md_blocks(chunk);

        /* a fresh chunk is entirely free */
        chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

        return chunk;

        /* unwind in reverse order of allocation */
#ifdef CONFIG_MEMCG_KMEM
free_md_blocks:
        pcpu_mem_free(chunk->md_blocks);
#endif
free_bound_map:
        pcpu_mem_free(chunk->bound_map);
free_alloc_map:
        pcpu_mem_free(chunk->alloc_map);
free_chunk:
        pcpu_mem_free(chunk);

        return NULL;
}

/*
 * pcpu_free_chunk - free a chunk descriptor and its metadata
 * @chunk: chunk to free, may be NULL
 *
 * Releases the arrays hanging off @chunk and then @chunk itself with
 * pcpu_mem_free().  A NULL @chunk is a no-op.
 */
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
        if (chunk) {
#ifdef CONFIG_MEMCG_KMEM
                pcpu_mem_free(chunk->obj_cgroups);
#endif
                pcpu_mem_free(chunk->md_blocks);
                pcpu_mem_free(chunk->bound_map);
                pcpu_mem_free(chunk->alloc_map);
                pcpu_mem_free(chunk);
        }
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 *
 * The populated bitmap and the per-chunk and global populated page counts
 * are updated, and the page count is passed to pcpu_update_empty_pages().
 *
 * Must be called with pcpu_lock held.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
                                 int page_end)
{
        int nr_pages = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        /* populated page counters: global, then per chunk */
        pcpu_nr_populated += nr_pages;
        chunk->nr_populated += nr_pages;

        /* mark the pages in the chunk's populated bitmap */
        bitmap_set(chunk->populated, page_start, nr_pages);

        pcpu_update_empty_pages(chunk, nr_pages);
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 *
 * The populated bitmap and the per-chunk and global populated page counts
 * are updated, and the negated page count is passed to
 * pcpu_update_empty_pages().
 *
 * Must be called with pcpu_lock held.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
                                   int page_start, int page_end)
{
        int nr_pages = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        /* populated page counters: global, then per chunk */
        pcpu_nr_populated -= nr_pages;
        chunk->nr_populated -= nr_pages;

        /* clear the pages in the chunk's populated bitmap */
        bitmap_clear(chunk->populated, page_start, nr_pages);

        pcpu_update_empty_pages(chunk, -nr_pages);
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk                - populate the specified range of a chunk
 * pcpu_depopulate_chunk        - depopulate the specified range of a chunk
 * pcpu_create_chunk                - create a new chunk
 * pcpu_destroy_chunk                - destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page                - translate address to its struct page
 * pcpu_verify_alloc_info        - check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                               int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                  int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
                                            gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 *
 * The first chunk and the reserved chunk are checked directly; any other
 * address is resolved through its backing page.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
        void *unit_addr;

        /* dynamic region of the first chunk */
        if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
                return pcpu_first_chunk;

        /* reserved region */
        if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
                return pcpu_reserved_chunk;

        /*
         * @addr is relative to unit0, which might be unused and thus
         * unmapped.  Shift it into the unit space of the current
         * processor before looking it up in the vmalloc space.  Any
         * possible cpu id works here, so preemption and cpu hotplug are
         * not a concern.
         */
        unit_addr = addr + pcpu_unit_offsets[raw_smp_processor_id()];

        return pcpu_get_page_chunk(pcpu_addr_to_page(unit_addr));
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * pcpu_memcg_pre_alloc_hook - charge a percpu allocation before it is made
 * @size: size of the allocation in bytes, per CPU
 * @gfp: allocation flags; accounting only happens with __GFP_ACCOUNT
 * @objcgp: set to the charged obj_cgroup when PCPU_CHUNK_MEMCG is returned
 *
 * Returns PCPU_CHUNK_ROOT when the allocation is not accounted (kmem
 * accounting disabled, no __GFP_ACCOUNT, or no current obj_cgroup),
 * PCPU_CHUNK_MEMCG when @size times the number of possible CPUs was
 * charged, and PCPU_FAIL_ALLOC when the charge failed.  @objcgp is left
 * untouched unless PCPU_CHUNK_MEMCG is returned.
 */
static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
                                                     struct obj_cgroup **objcgp)
{
        struct obj_cgroup *cg;

        if (!memcg_kmem_enabled())
                return PCPU_CHUNK_ROOT;
        if (!(gfp & __GFP_ACCOUNT))
                return PCPU_CHUNK_ROOT;

        cg = get_obj_cgroup_from_current();
        if (!cg)
                return PCPU_CHUNK_ROOT;

        if (!obj_cgroup_charge(cg, gfp, size * num_possible_cpus())) {
                /* charged: hand the reference over to the caller */
                *objcgp = cg;
                return PCPU_CHUNK_MEMCG;
        }

        /* charge refused: drop the reference taken above */
        obj_cgroup_put(cg);
        return PCPU_FAIL_ALLOC;
}

/*
 * pcpu_memcg_post_alloc_hook - finish or undo memcg accounting
 * @objcg: obj_cgroup charged by the pre-alloc hook, or NULL if none
 * @chunk: chunk the area was allocated from, or NULL if allocation failed
 * @off: byte offset of the area inside @chunk
 * @size: size of the area in bytes, per CPU
 *
 * With a NULL @objcg nothing is done.  On success (@chunk set) @objcg is
 * recorded in the chunk's obj_cgroups array and MEMCG_PERCPU_B is raised
 * by @size times the number of possible CPUs.  On failure (@chunk NULL)
 * the same amount is uncharged and the @objcg reference is dropped.
 */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
        if (!objcg)
                return;

        if (!chunk) {
                /* allocation failed: give the charge back */
                obj_cgroup_uncharge(objcg, size * num_possible_cpus());
                obj_cgroup_put(objcg);
                return;
        }

        /* remember the owner so the free hook can find it */
        chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
                        size * num_possible_cpus());
        rcu_read_unlock();
}

/*
 * pcpu_memcg_free_hook - undo memcg accounting for a freed area
 * @chunk: chunk the area belonged to
 * @off: byte offset of the area inside @chunk
 * @size: size of the area in bytes, per CPU
 *
 * Does nothing for chunks that are not memcg chunks.  Otherwise the
 * obj_cgroup recorded for the area is removed from the chunk, @size times
 * the number of possible CPUs is uncharged and subtracted from
 * MEMCG_PERCPU_B, and the obj_cgroup reference is dropped.
 */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        struct obj_cgroup **slot;
        struct obj_cgroup *objcg;

        if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
                return;

        /* detach the owner from the chunk */
        slot = &chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
        objcg = *slot;
        *slot = NULL;

        obj_cgroup_uncharge(objcg, size * num_possible_cpus());

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
                        -(size * num_possible_cpus()));
        rcu_read_unlock();

        obj_cgroup_put(objcg);
}

#else /* CONFIG_MEMCG_KMEM */
/*
 * !CONFIG_MEMCG_KMEM stub: nothing is charged and @objcgp is left untouched;
 * every allocation uses the root chunk type.
 */
static enum pcpu_chunk_type
pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
{
        return PCPU_CHUNK_ROOT;
}

/* !CONFIG_MEMCG_KMEM stub: no accounting to finish or undo. */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
}

/* !CONFIG_MEMCG_KMEM stub: nothing was charged, so nothing to uncharge. */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif /* CONFIG_MEMCG_KMEM */

/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * @size is rounded up to a multiple of PCPU_MIN_ALLOC_SIZE and @align is
 * raised to at least PCPU_MIN_ALLOC_SIZE.  Non-atomic allocations take
 * pcpu_alloc_mutex, may create a new chunk and populate missing pages;
 * atomic allocations only use already populated regions and fail when none
 * fits.  On success the area is zeroed for every possible CPU.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                                 gfp_t gfp)
{
        gfp_t pcpu_gfp;
        bool is_atomic;
        bool do_warn;
        enum pcpu_chunk_type type;
        struct list_head *pcpu_slot;
        struct obj_cgroup *objcg = NULL;
        static int warn_limit = 10;
        struct pcpu_chunk *chunk, *next;
        const char *err;
        int slot, off, cpu, ret;
        unsigned long flags;
        void __percpu *ptr;
        size_t bits, bit_align;

        gfp = current_gfp_context(gfp);
        /* whitelisted flags that can be passed to the backing allocators */
        pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
        is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
        do_warn = !(gfp & __GFP_NOWARN);

        /*
         * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
         * therefore alignment must be a minimum of that many bytes.
         * An allocation may have internal fragmentation from rounding up
         * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
         */
        if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
                align = PCPU_MIN_ALLOC_SIZE;

        /* convert the request from bytes to allocation units */
        size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
        bits = size >> PCPU_MIN_ALLOC_SHIFT;
        bit_align = align >> PCPU_MIN_ALLOC_SHIFT;

        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
                     !is_power_of_2(align))) {
                WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
                     size, align);
                return NULL;
        }

        /* may charge the allocation to a memcg and pick the chunk type */
        type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
        if (unlikely(type == PCPU_FAIL_ALLOC))
                return NULL;
        pcpu_slot = pcpu_chunk_list(type);

        if (!is_atomic) {
                /*
                 * pcpu_balance_workfn() allocates memory under this mutex,
                 * and it may wait for memory reclaim. Allow current task
                 * to become OOM victim, in case of memory pressure.
                 */
                if (gfp & __GFP_NOFAIL) {
                        mutex_lock(&pcpu_alloc_mutex);
                } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
                        /* interrupted: undo the pre-alloc hook and bail */
                        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
                        return NULL;
                }
        }

        spin_lock_irqsave(&pcpu_lock, flags);

        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;

                off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
                if (off < 0) {
                        err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }

                off = pcpu_alloc_area(chunk, bits, bit_align, off);
                if (off >= 0)
                        goto area_found;

                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }

restart:
        /* search through normal chunks */
        for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
                list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
                        off = pcpu_find_block_fit(chunk, bits, bit_align,
                                                  is_atomic);
                        if (off < 0) {
                                /* no fit: low-slot chunks are moved to slot 0 */
                                if (slot < PCPU_SLOT_FAIL_THRESHOLD)
                                        pcpu_chunk_move(chunk, 0);
                                continue;
                        }

                        off = pcpu_alloc_area(chunk, bits, bit_align, off);
                        if (off >= 0)
                                goto area_found;

                }
        }

        spin_unlock_irqrestore(&pcpu_lock, flags);

        /*
         * No space left.  Create a new chunk.  We don't want multiple
         * tasks to create chunks simultaneously.  Serialize and create iff
         * there's still no empty chunk after grabbing the mutex.
         */
        if (is_atomic) {
                err = "atomic alloc failed, no space left";
                goto fail;
        }

        if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
                chunk = pcpu_create_chunk(type, pcpu_gfp);
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
                }

                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_chunk_relocate(chunk, -1);
        } else {
                spin_lock_irqsave(&pcpu_lock, flags);
        }

        /* retry the search with pcpu_lock held again */
        goto restart;

area_found:
        pcpu_stats_area_alloc(chunk, size);
        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* populate if not all pages are already there */
        if (!is_atomic) {
                unsigned int page_start, page_end, rs, re;

                page_start = PFN_DOWN(off);
                page_end = PFN_UP(off + size);

                bitmap_for_each_clear_region(chunk->populated, rs, re,
                                             page_start, page_end) {
                        WARN_ON(chunk->immutable);

                        /* populate without pcpu_lock, then retake it */
                        ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);

                        spin_lock_irqsave(&pcpu_lock, flags);
                        if (ret) {
                                /* give the area back; fail_unlock drops the lock */
                                pcpu_free_area(chunk, off);
                                err = "failed to populate";
                                goto fail_unlock;
                        }
                        pcpu_chunk_populated(chunk, rs, re);
                        spin_unlock_irqrestore(&pcpu_lock, flags);
                }

                mutex_unlock(&pcpu_alloc_mutex);
        }

        if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work();

        /* clear the areas and return address relative to base address */
        for_each_possible_cpu(cpu)
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

        ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
        kmemleak_alloc_percpu(ptr, size, gfp);

        trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
                        chunk->base_addr, off, ptr);

        /* complete the memcg accounting started by the pre-alloc hook */
        pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);

        return ptr;

fail_unlock:
        spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
        trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);

        /* warn at most warn_limit times, and only for non-atomic failures */
        if (!is_atomic && do_warn && warn_limit) {
                pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
                        size, align, is_atomic, err);
                dump_stack();
                if (!--warn_limit)
                        pr_info("limit reached, disable warning\n");
        }
        if (is_atomic) {
                /* see the flag handling in pcpu_balance_workfn() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
        } else {
                /* only the non-atomic path took pcpu_alloc_mutex */
                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* a NULL chunk makes the hook undo the pre-alloc accounting */
        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);

        return NULL;
}

/**
 * __alloc_percpu_gfp - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @gfp: allocation flags
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
 * be called from any context but is a lot more likely to fail. If @gfp
 * has __GFP_NOWARN then no warning will be triggered on invalid or failed
 * allocation requests.
 *
 * This is pcpu_alloc() with the reserved chunk not requested.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
{
        return pcpu_alloc(size, align, false, gfp);
}
EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_percpu(size_t size, size_t align)
{
        return pcpu_alloc(size, align, false, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);

/**
 * __alloc_reserved_percpu - allocate reserved percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate zero-filled percpu area of @size bytes aligned at @align
 * from reserved percpu area if arch has set it up; otherwise,
 * allocation is served from the same dynamic area.  Might sleep.
 * Might trigger writeouts.
 *
 * This is pcpu_alloc() with the reserved chunk requested and %GFP_KERNEL.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
{
        return pcpu_alloc(size, align, true, GFP_KERNEL);
}

/**
 * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @type: chunk type whose slot lists (pcpu_chunk_list(@type)) are balanced
 *
 * Reclaim all fully free chunks except for the first one.  This is also
 * responsible for maintaining the pool of empty populated pages.  However,
 * it is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
 *
 * CONTEXT:
 * Holds pcpu_alloc_mutex for the whole run and takes pcpu_lock with
 * spin_lock_irq() around chunk list and population accounting updates.
 */
static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
{
        /* gfp flags passed to underlying allocators */
        const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
        /* chunks unlinked from the free slot, destroyed outside pcpu_lock */
        LIST_HEAD(to_free);
        struct list_head *pcpu_slot = pcpu_chunk_list(type);
        /* the last slot is the list this function reclaims from */
        struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
        struct pcpu_chunk *chunk, *next;
        int slot, nr_to_pop, ret;

        /*
         * There's no reason to keep around multiple unused chunks and VM
         * areas can be scarce.  Destroy all free chunks except for one.
         */
        mutex_lock(&pcpu_alloc_mutex);
        spin_lock_irq(&pcpu_lock);

        list_for_each_entry_safe(chunk, next, free_head, list) {
                WARN_ON(chunk->immutable);

                /* spare the first one */
                if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
                        continue;

                list_move(&chunk->list, &to_free);
        }

        spin_unlock_irq(&pcpu_lock);

        /*
         * The victims are off the slot list now.  Depopulate each populated
         * page region without pcpu_lock held and only take the lock for the
         * accounting update, then destroy the chunk.
         */
        list_for_each_entry_safe(chunk, next, &to_free, list) {
                unsigned int rs, re;

                bitmap_for_each_set_region(chunk->populated, rs, re, 0,
                                           chunk->nr_pages) {
                        pcpu_depopulate_chunk(chunk, rs, re);
                        spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_depopulated(chunk, rs, re);
                        spin_unlock_irq(&pcpu_lock);
                }
                pcpu_destroy_chunk(chunk);
                /* give other tasks a chance to run between chunks */
                cond_resched();
        }

        /*
         * Ensure there are certain number of free populated pages for
         * atomic allocs.  Fill up from the most packed so that atomic
         * allocs don't increase fragmentation.  If atomic allocation
         * failed previously, always populate the maximum amount.  This
         * should prevent atomic allocs larger than PAGE_SIZE from keeping
         * failing indefinitely; however, large atomic allocs are not
         * something we support properly and can be highly unreliable and
         * inefficient.
         */
retry_pop:
        if (pcpu_atomic_alloc_failed) {
                nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
                /* best effort anyway, don't worry about synchronization */
                pcpu_atomic_alloc_failed = false;
        } else {
                /* top the pool up to the high mark, never below zero */
                nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
                                  pcpu_nr_empty_pop_pages[type],
                                  0, PCPU_EMPTY_POP_PAGES_HIGH);
        }

        /* walk the slots starting from the one matching a PAGE_SIZE area */
        for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
                unsigned int nr_unpop = 0, rs, re;

                if (!nr_to_pop)
                        break;

                /* pick the first chunk in this slot with unpopulated pages */
                spin_lock_irq(&pcpu_lock);
                list_for_each_entry(chunk, &pcpu_slot[slot], list) {
                        nr_unpop = chunk->nr_pages - chunk->nr_populated;
                        if (nr_unpop)
                                break;
                }
                spin_unlock_irq(&pcpu_lock);

                /* empty slot, or every chunk in it is fully populated */
                if (!nr_unpop)
                        continue;

                /* @chunk can't go away while pcpu_alloc_mutex is held */
                bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
                                             chunk->nr_pages) {
                        /* populate at most the remaining quota from this hole */
                        int nr = min_t(int, re - rs, nr_to_pop);

                        ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
                        if (!ret) {
                                nr_to_pop -= nr;
                                spin_lock_irq(&pcpu_lock);
                                pcpu_chunk_populated(chunk, rs, rs + nr);
                                spin_unlock_irq(&pcpu_lock);
                        } else {
                                /* population failed: stop trying for this run */
                                nr_to_pop = 0;
                        }

                        if (!nr_to_pop)
                                break;
                }
        }

        if (nr_to_pop) {
                /* ran out of chunks to populate, create a new one and retry */
                chunk = pcpu_create_chunk(type, gfp);
                if (chunk) {
                        /*
                         * NOTE(review): pcpu_chunk_relocate() presumably links
                         * the new chunk into its slot list so the retry can
                         * find it - confirm against its definition.
                         */
                        spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_relocate(chunk, -1);
                        spin_unlock_irq(&pcpu_lock);
                        goto retry_pop;
                }
        }

        mutex_unlock(&pcpu_alloc_mutex);
}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Call __pcpu_balance_workfn() for each chunk type.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
        enum pcpu_chunk_type type;

        for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
                __pcpu_balance_workfn(type);
}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Release the percpu area @ptr.  A NULL @ptr is ignored.  When the
 * release leaves the owning chunk completely free and another chunk is
 * already sitting on the last (free) slot list, balance work is
 * scheduled after pcpu_lock has been dropped.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void __percpu *ptr)
{
        struct list_head *slots, *free_slot;
        struct pcpu_chunk *chunk, *other;
        unsigned long irq_flags;
        bool kick_balance = false;
        int offset, freed;
        void *kaddr;

        if (!ptr)
                return;

        kmemleak_free_percpu(ptr);

        kaddr = __pcpu_ptr_to_addr(ptr);

        spin_lock_irqsave(&pcpu_lock, irq_flags);

        /* locate the owning chunk and the area's offset inside it */
        chunk = pcpu_chunk_addr_search(kaddr);
        offset = kaddr - chunk->base_addr;

        freed = pcpu_free_area(chunk, offset);

        slots = pcpu_chunk_list(pcpu_chunk_type(chunk));

        pcpu_memcg_free_hook(chunk, offset, freed);

        /* if there are more than one fully free chunks, wake up grim reaper */
        if (chunk->free_bytes == pcpu_unit_size) {
                free_slot = &slots[pcpu_nr_slots - 1];

                list_for_each_entry(other, free_slot, list) {
                        if (other == chunk)
                                continue;

                        kick_balance = true;
                        break;
                }
        }

        trace_percpu_free_percpu(chunk->base_addr, offset, ptr);

        spin_unlock_irqrestore(&pcpu_lock, irq_flags);

        if (kick_balance)
                pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);

/*
 * Test whether @addr lies inside any possible CPU's copy of the in-kernel
 * static percpu area (__per_cpu_start..__per_cpu_end).  On a match, and if
 * @can_addr is non-NULL, *@can_addr receives the same offset rebased onto
 * the boot CPU's copy.  Without CONFIG_SMP this always returns false.
 */
bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        void *va = (void *)addr;
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                void *start = per_cpu_ptr(base, cpu);

                /* not within this CPU's static area, try the next one */
                if (va < start || va >= start + static_size)
                        continue;

                if (can_addr)
                        *can_addr = (unsigned long) (va - start) +
                                    (unsigned long)
                                    per_cpu_ptr(base, get_boot_cpu_id());

                return true;
        }
#endif
        /* on UP, can't distinguish from other static vars, always false */
        return false;
}

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Check whether @addr falls inside the in-kernel static percpu area.
 * Static percpu areas of modules are not covered here; use
 * is_module_percpu_address() for those.
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
        /* only the membership test is wanted, no canonical address */
        unsigned long *no_can_addr = NULL;

        return __is_kernel_percpu_address(addr, no_can_addr);
}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Translate @addr, a dereferenceable address obtained through one of the
 * percpu access macros, into its physical address.  The caller must keep
 * @addr valid until this function returns.
 *
 * The first chunk has a special setup which currently is either embedded
 * in the linear address space or vmalloc mapped; for every later chunk
 * the backing allocator (currently either vm or km) provides the
 * translation.
 *
 * The translation could be done without checking whether @addr falls
 * into the first chunk.  The check is kept because it mirrors how the
 * percpu allocator actually works and because the verification can
 * uncover bugs both in the allocator and in per_cpu_ptr_to_phys()
 * callers.
 *
 * RETURNS:
 * The physical address for @addr.
 */
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        unsigned long uaddr = (unsigned long)addr;
        unsigned long first_low, first_high;
        bool in_first_chunk = false;
        unsigned int cpu;

        /*
         * The range test on first_low/first_high isn't strictly
         * necessary but speeds up lookups of addresses which aren't in
         * the first chunk.
         *
         * The address check is against full chunk sizes.  pcpu_base_addr
         * points to the beginning of the first chunk including the
         * static region.  Assumes good intent as the first chunk may
         * not be full (ie. < pcpu_unit_pages in size).
         */
        first_low = (unsigned long)pcpu_base_addr +
                    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
        first_high = (unsigned long)pcpu_base_addr +
                     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);

        if (uaddr >= first_low && uaddr < first_high) {
                for_each_possible_cpu(cpu) {
                        void *start = per_cpu_ptr(base, cpu);

                        if (addr < start || addr >= start + pcpu_unit_size)
                                continue;

                        in_first_chunk = true;
                        break;
                }
        }

        /* chunks other than the first: ask the backing allocator */
        if (!in_first_chunk)
                return page_to_phys(pcpu_addr_to_page(addr)) +
                       offset_in_page(addr);

        /* first chunk mapped through vmalloc */
        if (is_vmalloc_addr(addr))
                return page_to_phys(vmalloc_to_page(addr)) +
                       offset_in_page(addr);

        /* first chunk embedded in the linear mapping */
        return __pa(addr);
}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate an ai with room for @nr_groups groups followed by a cpu_map
 * array of @nr_units entries.  groups[0].cpu_map of the returned ai
 * points at that array and every entry is preset to NR_CPUS.  Setting
 * up the cpu_map pointers of the remaining groups is left to the caller.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on
 * failure.
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
                                                      int nr_units)
{
        struct pcpu_alloc_info *ai;
        size_t hdr_size, total_size, alloc_size;
        int unit = 0;

        /* header plus groups[], padded so that cpu_map[] is aligned */
        hdr_size = ALIGN(struct_size(ai, groups, nr_groups),
                         __alignof__(ai->groups[0].cpu_map[0]));
        total_size = hdr_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
        alloc_size = PFN_ALIGN(total_size);

        ai = memblock_alloc(alloc_size, PAGE_SIZE);
        if (!ai)
                return NULL;

        /* the shared cpu_map array sits right behind the padded header */
        ai->groups[0].cpu_map = (void *)ai + hdr_size;

        while (unit < nr_units)
                ai->groups[0].cpu_map[unit++] = NR_CPUS;

        ai->nr_groups = nr_groups;
        /* remembered so that pcpu_free_alloc_info() knows how much to free */
        ai->__ai_size = alloc_size;

        return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Hand @ai, previously obtained from pcpu_alloc_alloc_info(), back to
 * memblock using the size recorded in @ai->__ai_size.
 */
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
        phys_addr_t base = __pa(ai);
        size_t len = ai->__ai_size;

        memblock_free_early(base, len);
}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Emit the size parameters of @ai followed by its group/unit to cpu
 * layout, all at loglevel @lvl.
 */
static void pcpu_dump_alloc_info(const char *lvl,
                                 const struct pcpu_alloc_info *ai)
{
        char empty_str[] = "--------";
        int group_width = 1, cpu_width = 1;
        int alloc = 0, alloc_end = 0;
        int width, group, v;
        int upa, apl;        /* units per alloc, allocs per line */

        /* decimal digits needed to print a group index */
        v = ai->nr_groups;
        for (v /= 10; v; v /= 10)
                group_width++;

        /* decimal digits needed to print a cpu number */
        v = num_possible_cpus();
        for (v /= 10; v; v /= 10)
                cpu_width++;

        /* trim the placeholder for unused units to the cpu column width */
        empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

        upa = ai->alloc_size / ai->unit_size;
        width = upa * (cpu_width + 1) + group_width + 3;
        apl = rounddown_pow_of_two(max(60 / width, 1));

        printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
               lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
               ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

        for (group = 0; group < ai->nr_groups; group++) {
                const struct pcpu_group_info *gi = &ai->groups[group];
                int unit = 0, unit_end = 0;

                BUG_ON(gi->nr_units % upa);

                alloc_end += gi->nr_units / upa;
                while (alloc < alloc_end) {
                        /* start a fresh output line every @apl allocs */
                        if (alloc % apl == 0) {
                                pr_cont("\n");
                                printk("%spcpu-alloc: ", lvl);
                        }
                        pr_cont("[%0*d] ", group_width, group);

                        unit_end += upa;
                        while (unit < unit_end) {
                                unsigned int cpu = gi->cpu_map[unit];

                                if (cpu == NR_CPUS)
                                        pr_cont("%s ", empty_str);
                                else
                                        pr_cont("%0*d ", cpu_width, cpu);
                                unit++;
                        }
                        alloc++;
                }
        }
        pr_cont("\n");
}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.
 *
 * @ai->static_size is the size of static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @ai->dyn_size determines the number of bytes available for dynamic
 * allocation in the first chunk.  The area between @ai->static_size +
 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
 *
 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
 * and equal to or larger than @ai->static_size + @ai->reserved_size +
 * @ai->dyn_size.
 *
 * @ai->atom_size is the allocation atom size and used as alignment
 * for vm areas.
 *
 * @ai->alloc_size is the allocation size and always multiple of
 * @ai->atom_size.  This is larger than @ai->atom_size if
 * @ai->unit_size is larger than @ai->atom_size.
 *
 * @ai->nr_groups and @ai->groups describe virtual memory layout of
 * percpu areas.  Units which should be colocated are put into the
 * same group.  Dynamic VM areas will be allocated according to these
 * groupings.  If @ai->nr_groups is zero, a single group containing
 * all units is assumed.
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * The first chunk will always contain a static and a dynamic region.
 * However, the static region is not managed by any chunk.  If the first
 * chunk also contains a reserved region, it is served by two chunks -
 * one for the reserved region and one for the dynamic region.  They
 * share the same vm, but use offset regions in the area allocation map.
 * The chunk serving the dynamic region is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 *
 * Any violated sanity check ends in BUG() after dumping @ai, and a failed
 * memblock allocation for the lookup tables ends in panic(); there is no
 * error return.
 */
void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                                   void *base_addr)
{
        size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        size_t static_size, dyn_size;
        struct pcpu_chunk *chunk;
        unsigned long *group_offsets;
        size_t *group_sizes;
        unsigned long *unit_off;
        unsigned int cpu;
        int *unit_map;
        int group, unit, i;
        int map_size;
        unsigned long tmp_addr;
        size_t alloc_size;
        enum pcpu_chunk_type type;

/* local helper: report the failed condition, dump the config, then BUG() */
#define PCPU_SETUP_BUG_ON(cond)        do {                                        \
        if (unlikely(cond)) {                                                \
                pr_emerg("failed to initialize, %s\n", #cond);                \
                pr_emerg("cpu_possible_mask=%*pb\n",                        \
                         cpumask_pr_args(cpu_possible_mask));                \
                pcpu_dump_alloc_info(KERN_EMERG, ai);                        \
                BUG();                                                        \
        }                                                                \
} while (0)

        /* sanity checks */
        PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
        PCPU_SETUP_BUG_ON(!ai->static_size);
        PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
        PCPU_SETUP_BUG_ON(!base_addr);
        PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
        PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
        PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
        PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
        PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
        PCPU_SETUP_BUG_ON(!ai->dyn_size);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
        PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
                            IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
        PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

        /* process group information and build config tables accordingly */
        alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
        group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!group_offsets)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
        group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!group_sizes)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
        unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!unit_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
        unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!unit_off)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        /* UINT_MAX marks "no unit assigned yet"; checked again below */
        for (cpu = 0; cpu < nr_cpu_ids; cpu++)
                unit_map[cpu] = UINT_MAX;

        /* NR_CPUS marks "not determined yet" for the low/high unit cpus */
        pcpu_low_unit_cpu = NR_CPUS;
        pcpu_high_unit_cpu = NR_CPUS;

        /*
         * Walk every unit of every group.  After the inner loop @i equals
         * gi->nr_units, so "unit += i" advances the global unit index by
         * the size of the group just processed.
         */
        for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
                const struct pcpu_group_info *gi = &ai->groups[group];

                group_offsets[group] = gi->base_offset;
                group_sizes[group] = gi->nr_units * ai->unit_size;

                for (i = 0; i < gi->nr_units; i++) {
                        cpu = gi->cpu_map[i];
                        /* NR_CPUS in cpu_map means the unit has no cpu */
                        if (cpu == NR_CPUS)
                                continue;

                        PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
                        PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
                        /* each cpu may be mapped to exactly one unit */
                        PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

                        unit_map[cpu] = unit + i;
                        unit_off[cpu] = gi->base_offset + i * ai->unit_size;

                        /* determine low/high unit_cpu */
                        if (pcpu_low_unit_cpu == NR_CPUS ||
                            unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
                                pcpu_low_unit_cpu = cpu;
                        if (pcpu_high_unit_cpu == NR_CPUS ||
                            unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
                                pcpu_high_unit_cpu = cpu;
                }
        }
        pcpu_nr_units = unit;

        /* every possible cpu must have ended up with a unit */
        for_each_possible_cpu(cpu)
                PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

        /* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
        pcpu_dump_alloc_info(KERN_DEBUG, ai);

        /* publish the tables built above */
        pcpu_nr_groups = ai->nr_groups;
        pcpu_group_offsets = group_offsets;
        pcpu_group_sizes = group_sizes;
        pcpu_unit_map = unit_map;
        pcpu_unit_offsets = unit_off;

        /* determine basic parameters */
        pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
        pcpu_atom_size = ai->atom_size;
        /* chunk header plus one populated bit per unit page */
        pcpu_chunk_struct_size = struct_size(chunk, populated,
                                             BITS_TO_LONGS(pcpu_unit_pages));

        pcpu_stats_save_ai(ai);

        /*
         * Allocate chunk slots.  The additional last slot is for
         * empty chunks.
         */
        pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
        pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
                                          sizeof(pcpu_chunk_lists[0]) *
                                          PCPU_NR_CHUNK_TYPES,
                                          SMP_CACHE_BYTES);
        if (!pcpu_chunk_lists)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
                      PCPU_NR_CHUNK_TYPES);

        /* one set of pcpu_nr_slots list heads per chunk type */
        for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
                for (i = 0; i < pcpu_nr_slots; i++)
                        INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);

        /*
         * The end of the static region needs to be aligned with the
         * minimum allocation size as this offsets the reserved and
         * dynamic region.  The first chunk ends page aligned by
         * expanding the dynamic region, therefore the dynamic region
         * can be shrunk to compensate while still staying above the
         * configured sizes.
         */
        static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
        dyn_size = ai->dyn_size - (static_size - ai->static_size);

        /*
         * Initialize first chunk.
         * If the reserved_size is non-zero, this initializes the reserved
         * chunk.  If the reserved_size is zero, the reserved chunk is NULL
         * and the dynamic region is initialized here.  The first chunk,
         * pcpu_first_chunk, will always point to the chunk that serves
         * the dynamic region.
         */
        tmp_addr = (unsigned long)base_addr + static_size;
        map_size = ai->reserved_size ?: dyn_size;
        chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);

        /* init dynamic chunk if necessary */
        if (ai->reserved_size) {
                pcpu_reserved_chunk = chunk;

                /* the dynamic region starts right after the reserved one */
                tmp_addr = (unsigned long)base_addr + static_size +
                           ai->reserved_size;
                map_size = dyn_size;
                chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
        }

        /* link the first chunk in */
        pcpu_first_chunk = chunk;
        pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages;
        pcpu_chunk_relocate(pcpu_first_chunk, -1);

        /* include all regions of the first chunk */
        pcpu_nr_populated += PFN_DOWN(size_sum);

        pcpu_stats_chunk_alloc();
        trace_percpu_create_chunk(base_addr);

        /* we're done */
        pcpu_base_addr = base_addr;
}

#ifdef CONFIG_SMP

/* printable names of the first chunk allocators, indexed by enum pcpu_fc */
const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
        [PCPU_FC_AUTO]        = "auto",
        [PCPU_FC_EMBED]        = "embed",
        [PCPU_FC_PAGE]        = "page",
};

/*
 * First chunk allocator selection.  Starts out as "auto" and can be
 * overridden by the "percpu_alloc" early parameter handled by
 * percpu_alloc_setup().
 */
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

/*
 * Handler for the "percpu_alloc" early parameter.  Records the requested
 * first chunk allocator in pcpu_chosen_fc.  Only the allocators built
 * into this configuration are recognised; any other value is reported
 * with a warning and otherwise ignored.
 *
 * Returns -EINVAL when no value was supplied, 0 in every other case.
 */
static int __init percpu_alloc_setup(char *str)
{
        if (!str)
                return -EINVAL;

#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
        if (!strcmp(str, "embed")) {
                pcpu_chosen_fc = PCPU_FC_EMBED;
                return 0;
        }
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
        if (!strcmp(str, "page")) {
                pcpu_chosen_fc = PCPU_FC_PAGE;
                return 0;
        }
#endif

        pr_warn("unknown allocator %s specified\n", str);

        return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if needed by the arch config or the generic setup is going
 * to be used.
 */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
        !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */

/* build pcpu_page_first_chunk() iff needed by the arch config */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */

/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * If the units-per-alloc search ends without accepting any candidate,
 * which is not expected to happen, the function hits BUG_ON() rather
 * than continuing with an indeterminate value.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
                                size_t reserved_size, size_t dyn_size,
                                size_t atom_size,
                                pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
        static int group_map[NR_CPUS] __initdata;
        static int group_cnt[NR_CPUS] __initdata;
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        int nr_groups = 1, nr_units = 0;
        size_t size_sum, min_unit_size, alloc_size;
        int upa, max_upa, best_upa;        /* units_per_alloc */
        int last_allocs, group, unit;
        unsigned int cpu, tcpu;
        struct pcpu_alloc_info *ai;
        unsigned int *cpu_map;

        /* this function may be called multiple times */
        memset(group_map, 0, sizeof(group_map));
        memset(group_cnt, 0, sizeof(group_cnt));

        /* calculate size_sum and ensure dyn_size is enough for early alloc */
        size_sum = PFN_ALIGN(static_size + reserved_size +
                            max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
        /* the page-alignment slack is handed to the dynamic region */
        dyn_size = size_sum - static_size - reserved_size;

        /*
         * Determine min_unit_size, alloc_size and max_upa such that
         * alloc_size is multiple of atom_size and is the smallest
         * which can accommodate 4k aligned segments which are equal to
         * or larger than min_unit_size.
         */
        min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

        /* determine the maximum # of units that can fit in an allocation */
        alloc_size = roundup(min_unit_size, atom_size);
        upa = alloc_size / min_unit_size;
        while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                upa--;
        max_upa = upa;

        /* group cpus according to their proximity */
        for_each_possible_cpu(cpu) {
                group = 0;
        next_group:
                /*
                 * Only cpus visited before @cpu have a group yet.  If any
                 * member of the candidate group is not local to @cpu in
                 * either direction, move on to the next group and rescan.
                 */
                for_each_possible_cpu(tcpu) {
                        if (cpu == tcpu)
                                break;
                        if (group_map[tcpu] == group && cpu_distance_fn &&
                            (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
                             cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
                                group++;
                                nr_groups = max(nr_groups, group + 1);
                                goto next_group;
                        }
                }
                group_map[cpu] = group;
                group_cnt[group]++;
        }

        /*
         * Wasted space is caused by a ratio imbalance of upa to group_cnt.
         * Expand the unit_size until we use >= 75% of the units allocated.
         * Related to atom_size, which could be much larger than the unit_size.
         */
        last_allocs = INT_MAX;
        /* 0 is never a valid result, so it doubles as "nothing accepted" */
        best_upa = 0;
        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;

                if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                        continue;

                for (group = 0; group < nr_groups; group++) {
                        int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
                        allocs += this_allocs;
                        wasted += this_allocs * upa - group_cnt[group];
                }

                /*
                 * Don't accept if wastage is over 1/3.  The
                 * greater-than comparison ensures upa==1 always
                 * passes the following check.
                 */
                if (wasted > num_possible_cpus() / 3)
                        continue;

                /* and then don't consume more memory */
                if (allocs > last_allocs)
                        break;
                last_allocs = allocs;
                best_upa = upa;
        }
        /* catch a failed search instead of dividing by garbage below */
        BUG_ON(!best_upa);
        upa = best_upa;

        /* allocate and fill alloc_info */
        for (group = 0; group < nr_groups; group++)
                nr_units += roundup(group_cnt[group], upa);

        ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
        if (!ai)
                return ERR_PTR(-ENOMEM);
        cpu_map = ai->groups[0].cpu_map;

        /* carve the shared cpu_map array into per-group slices */
        for (group = 0; group < nr_groups; group++) {
                ai->groups[group].cpu_map = cpu_map;
                cpu_map += roundup(group_cnt[group], upa);
        }

        ai->static_size = static_size;
        ai->reserved_size = reserved_size;
        ai->dyn_size = dyn_size;
        ai->unit_size = alloc_size / upa;
        ai->atom_size = atom_size;
        ai->alloc_size = alloc_size;

        for (group = 0, unit = 0; group < nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];

                /*
                 * Initialize base_offset as if all groups are located
                 * back-to-back.  The caller should update this to
                 * reflect actual allocation.
                 */
                gi->base_offset = unit * ai->unit_size;

                for_each_possible_cpu(cpu)
                        if (group_map[cpu] == group)
                                gi->cpu_map[gi->nr_units++] = cpu;
                /* pad the group to whole allocations; padding stays NR_CPUS */
                gi->nr_units = roundup(gi->nr_units, upa);
                unit += gi->nr_units;
        }
        BUG_ON(unit != nr_units);

        return ai;
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @alloc_fn: function to allocate percpu page
 * @free_fn: function to free percpu page
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling @alloc_fn and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using @free_fn.
 *
 * One area is allocated per group.  The lowest area address becomes the
 * chunk base address and every group's base_offset is expressed relative
 * to it.
 *
 * RETURNS:
 * 0 on success, -errno on failure: the error reported by
 * pcpu_build_alloc_info(), -ENOMEM if the per-group pointer array or a
 * group area cannot be allocated, or -EINVAL if
 * CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK is set and the groups span more
 * than 75% of the vmalloc space.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                                  size_t atom_size,
                                  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
                                  pcpu_fc_alloc_fn_t alloc_fn,
                                  pcpu_fc_free_fn_t free_fn)
{
        /* start at the highest address so min() below picks the first area */
        void *base = (void *)ULONG_MAX;
        void **areas = NULL;
        struct pcpu_alloc_info *ai;
        size_t size_sum, areas_size;
        unsigned long max_distance;
        int group, i, highest_group, rc = 0;

        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
                                   cpu_distance_fn);
        if (IS_ERR(ai))
                return PTR_ERR(ai);

        /* bytes of each unit that are actually used; the rest is freed */
        size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        /* one pointer per group, size rounded up with PFN_ALIGN() */
        areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

        areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
        if (!areas) {
                rc = -ENOMEM;
                goto out_free;
        }

        /* allocate, copy and determine base address & max_distance */
        highest_group = 0;
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                unsigned int cpu = NR_CPUS;
                void *ptr;

                /*
                 * Pick the first unit of the group that maps to a real
                 * cpu (NR_CPUS marks an unused unit); that cpu is handed
                 * to @alloc_fn for the whole group.
                 */
                for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
                        cpu = gi->cpu_map[i];
                BUG_ON(cpu == NR_CPUS);

                /* allocate space for the whole group */
                ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
                if (!ptr) {
                        rc = -ENOMEM;
                        goto out_free_areas;
                }
                /* kmemleak tracks the percpu allocations separately */
                kmemleak_free(ptr);
                areas[group] = ptr;

                /* track the lowest area and the group with the highest one */
                base = min(ptr, base);
                if (ptr > areas[highest_group])
                        highest_group = group;
        }
        /* distance from the lowest address to the end of the highest group */
        max_distance = areas[highest_group] - base;
        max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

        /* warn if maximum distance is further than 75% of vmalloc space */
        if (max_distance > VMALLOC_TOTAL * 3 / 4) {
                pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
                                max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
                /* and fail if we have fallback */
                rc = -EINVAL;
                goto out_free_areas;
#endif
        }

        /*
         * Copy data and free unused parts.  This should happen after all
         * allocations are complete; otherwise, we may end up with
         * overlapping groups.
         */
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                void *ptr = areas[group];

                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
                        if (gi->cpu_map[i] == NR_CPUS) {
                                /* unused unit, free whole */
                                free_fn(ptr, ai->unit_size);
                                continue;
                        }
                        /* copy and return the unused part */
                        memcpy(ptr, __per_cpu_load, ai->static_size);
                        free_fn(ptr + size_sum, ai->unit_size - size_sum);
                }
        }

        /* base address is now known, determine group base offsets */
        for (group = 0; group < ai->nr_groups; group++) {
                ai->groups[group].base_offset = areas[group] - base;
        }

        pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
                PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
                ai->dyn_size, ai->unit_size);

        pcpu_setup_first_chunk(ai, base);
        /* success: rc is still 0; only the temporary metadata is released */
        goto out_free;

out_free_areas:
        /*
         * Release every group area obtained so far.
         * NOTE(review): this relies on areas[] entries being NULL for
         * groups that were never allocated - presumably memblock_alloc()
         * returns zeroed memory; confirm.
         */
        for (group = 0; group < ai->nr_groups; group++)
                if (areas[group])
                        free_fn(areas[group],
                                ai->groups[group].nr_units * ai->unit_size);
out_free:
        /* shared by the success and failure paths */
        pcpu_free_alloc_info(ai);
        if (areas)
                memblock_free_early(__pa(areas), areas_size);
        return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * Failure to allocate the page pointer array or to map a unit's pages
 * does not return; both cases call panic().
 *
 * RETURNS:
 * 0 on success, -errno on failure: the error reported by
 * pcpu_build_alloc_info(), -EINVAL if group 0 does not have the expected
 * number of units, or -ENOMEM if @alloc_fn fails to provide a page.
 */
int __init pcpu_page_first_chunk(size_t reserved_size,
                                 pcpu_fc_alloc_fn_t alloc_fn,
                                 pcpu_fc_free_fn_t free_fn,
                                 pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
        /*
         * NOTE(review): static, presumably because the area handed to
         * vm_area_register_early() must outlive this function - confirm.
         */
        static struct vm_struct vm;
        struct pcpu_alloc_info *ai;
        char psize_str[16];
        int unit_pages;
        size_t pages_size;
        struct page **pages;
        int unit, i, j, rc = 0;
        int upa;
        int nr_g0_units;

        /* page size in KiB, only used in the log messages below */
        snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

        /* no dynamic size request, PAGE_SIZE atoms, no cpu distance callback */
        ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
        if (IS_ERR(ai))
                return PTR_ERR(ai);
        BUG_ON(ai->nr_groups != 1);
        /* units per allocation; group 0 must hold all cpus rounded up to it */
        upa = ai->alloc_size/ai->unit_size;
        nr_g0_units = roundup(num_possible_cpus(), upa);
        if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
                pcpu_free_alloc_info(ai);
                return -EINVAL;
        }

        unit_pages = ai->unit_size >> PAGE_SHIFT;

        /* unaligned allocations can't be freed, round up to page size */
        pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
                               sizeof(pages[0]));
        pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
        if (!pages)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      pages_size);

        /* allocate pages */
        /* j counts the pages stored so far; the enomem path unwinds with it */
        j = 0;
        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned int cpu = ai->groups[0].cpu_map[unit];
                for (i = 0; i < unit_pages; i++) {
                        void *ptr;

                        ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
                        if (!ptr) {
                                pr_warn("failed to allocate %s page for cpu%u\n",
                                                psize_str, cpu);
                                goto enomem;
                        }
                        /* kmemleak tracks the percpu allocations separately */
                        kmemleak_free(ptr);
                        pages[j++] = virt_to_page(ptr);
                }
        }

        /* allocate vm area, map the pages and copy static data */
        vm.flags = VM_ALLOC;
        vm.size = num_possible_cpus() * ai->unit_size;
        vm_area_register_early(&vm, PAGE_SIZE);

        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned long unit_addr =
                        (unsigned long)vm.addr + unit * ai->unit_size;

                for (i = 0; i < unit_pages; i++)
                        populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

                /* pte already populated, the following shouldn't fail */
                rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
                                      unit_pages);
                if (rc < 0)
                        panic("failed to map percpu area, err=%d\n", rc);

                /*
                 * FIXME: Archs with virtual cache should flush local
                 * cache for the linear mapping here - something
                 * equivalent to flush_cache_vmap() on the local cpu.
                 * flush_cache_vmap() can't be used as most supporting
                 * data structures are not set up yet.
                 */

                /* copy static data */
                memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
        }

        /* we're ready, commit */
        pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
                unit_pages, psize_str, ai->static_size,
                ai->reserved_size, ai->dyn_size);

        pcpu_setup_first_chunk(ai, vm.addr);
        /*
         * Success: rc holds the last __pcpu_map_pages() result, which is
         * non-negative here (a negative value would have panicked above).
         */
        goto out_free_ar;

enomem:
        /* hand back, in reverse order, every page obtained before the failure */
        while (--j >= 0)
                free_fn(page_address(pages[j]), PAGE_SIZE);
        rc = -ENOMEM;
out_free_ar:
        /* the page pointer array and alloc info are temporary on every path */
        memblock_free_early(__pa(pages), pages_size);
        pcpu_free_alloc_info(ai);
        return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef        CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
/*
 * Per-cpu offset table, one entry per possible cpu index up to NR_CPUS.
 * Filled in by setup_per_cpu_areas() below and exported for use outside
 * this file.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * Default first chunk allocation callback: obtain @size bytes aligned to
 * @align from memblock, using the physical address of MAX_DMA_ADDRESS as
 * the minimum address.  @cpu is not used by this implementation.
 */
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
                                       size_t align)
{
        void *area;

        area = memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
        return area;
}

/*
 * Default first chunk free callback: return @size bytes starting at @ptr
 * through memblock_free_early().
 */
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
        /* memblock_free_early() is given the __pa() translation of @ptr */
        memblock_free_early(__pa(ptr), size);
}

/*
 * Set up the percpu areas with the embedding helper and the default
 * memblock callbacks, then record every possible cpu's offset in
 * __per_cpu_offset[].  Panics if the first chunk cannot be set up.
 */
void __init setup_per_cpu_areas(void)
{
        unsigned long base_delta;
        unsigned int cpu;
        int err;

        /*
         * Always reserve area for module percpu variables.  That's
         * what the legacy allocator did.
         */
        err = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
                                     PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
                                     pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
        if (err < 0)
                panic("Failed to initialize percpu areas.");

        /* displacement of the chunk base from the start of the percpu section */
        base_delta = (unsigned long)pcpu_base_addr -
                     (unsigned long)__per_cpu_start;

        for_each_possible_cpu(cpu) {
                __per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
        }
}
#endif        /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else        /* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
/*
 * Build a first chunk made of one group with one unit for cpu 0, sized to
 * the larger of PCPU_MIN_UNIT_SIZE and PERCPU_DYNAMIC_RESERVE rounded up
 * to a power of two.  Panics if either allocation fails.
 */
void __init setup_per_cpu_areas(void)
{
        struct pcpu_alloc_info *info;
        struct pcpu_group_info *grp;
        void *chunk;
        size_t usize;

        usize = roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
                                         PERCPU_DYNAMIC_RESERVE));

        /* both allocations are attempted before either result is checked */
        info = pcpu_alloc_alloc_info(1, 1);
        chunk = memblock_alloc_from(usize, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!(info && chunk))
                panic("Failed to allocate memory for percpu areas.");

        /* kmemleak tracks the percpu allocations separately */
        kmemleak_free(chunk);

        /* every size field describes the same single unit */
        info->alloc_size = usize;
        info->atom_size = usize;
        info->unit_size = usize;
        info->dyn_size = usize;

        /* a single group holding a single unit that belongs to cpu 0 */
        grp = &info->groups[0];
        grp->cpu_map[0] = 0;
        grp->nr_units = 1;

        pcpu_setup_first_chunk(info, chunk);
        pcpu_free_alloc_info(info);
}

#endif        /* CONFIG_SMP */

/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * Reports how many pages are populated to back chunks.  Metadata is left
 * out of the number exposed in meminfo: the number of backing pages scales
 * with the number of cpus and can quickly outweigh the memory used for
 * metadata, and leaving it out keeps this calculation simple.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */
unsigned long pcpu_nr_pages(void)
{
        unsigned long total;

        /* populated count scaled by the number of units */
        total = pcpu_nr_populated * pcpu_nr_units;
        return total;
}

/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 *
 * Registered as a subsys_initcall; sets pcpu_async_enabled and always
 * returns 0.
 */
static int __init percpu_enable_async(void)
{
        pcpu_async_enabled = true;
        return 0;
}
subsys_initcall(percpu_enable_async);




























































































































































































































































































































































    1 


    1 




    1 








    1 



    1 




    1 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;        /* Handle normal Linux uptimes. */
int nr_threads;                        /* The idle threads do not count.. */

static int max_threads;                /* tunable limit on nr_threads */

#define NAMED_ARRAY_INDEX(x)        [x] = __stringify(x)

static const char * const resident_page_types[] = {
        NAMED_ARRAY_INDEX(MM_FILEPAGES),
        NAMED_ARRAY_INDEX(MM_ANONPAGES),
        NAMED_ARRAY_INDEX(MM_SWAPENTS),
        NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
/* Return the result of lockdep_is_held() for tasklist_lock. */
int lockdep_tasklist_lock_is_held(void)
{
        int held = lockdep_is_held(&tasklist_lock);

        return held;
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

int nr_processes(void)
{
        int cpu;
        int total = 0;

        for_each_possible_cpu(cpu)
                total += per_cpu(process_counts, cpu);

        return total;
}

/* Weak default implementation: does nothing with @tsk. */
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;

/*
 * Allocate one object from task_struct_cachep on NUMA node @node using
 * GFP_KERNEL.  Returns whatever kmem_cache_alloc_node() returns.
 */
static inline struct task_struct *alloc_task_struct_node(int node)
{
        struct task_struct *tsk;

        tsk = kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
        return tsk;
}

/* Return @tsk to task_struct_cachep, the cache it is allocated from. */
static inline void free_task_struct(struct task_struct *tsk)
{
        kmem_cache_free(task_struct_cachep, tsk);
}
#endif

#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

static int free_vm_stack_cache(unsigned int cpu)
{
        struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                struct vm_struct *vm_stack = cached_vm_stacks[i];

                if (!vm_stack)
                        continue;

                vfree(vm_stack->addr);
                cached_vm_stacks[i] = NULL;
        }

        return 0;
}
#endif

/*
 * Obtain a kernel stack for @tsk on NUMA node @node and record it in
 * tsk->stack (and in tsk->stack_vm_area with CONFIG_VMAP_STACK).
 *
 * With CONFIG_VMAP_STACK, a stack cached on this CPU is reused when one is
 * available; otherwise a new one is vmalloc'ed.  Without CONFIG_VMAP_STACK
 * the stack comes from alloc_pages_node().
 *
 * Returns the stack address, or NULL if the allocation failed.
 */
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
        void *fresh;
        int slot;

        /* Try to take over one of this CPU's cached stacks first. */
        for (slot = 0; slot < NR_CACHED_STACKS; slot++) {
                struct vm_struct *cached;

                cached = this_cpu_xchg(cached_stacks[slot], NULL);
                if (cached) {
                        /* Clear the KASAN shadow of the stack. */
                        kasan_unpoison_shadow(cached->addr, THREAD_SIZE);

                        /* Clear stale pointers from reused stack. */
                        memset(cached->addr, 0, THREAD_SIZE);

                        tsk->stack_vm_area = cached;
                        tsk->stack = cached->addr;
                        return cached->addr;
                }
        }

        /*
         * Allocated stacks are cached and later reused by new threads,
         * so memcg accounting is performed manually on assigning/releasing
         * stacks to tasks. Drop __GFP_ACCOUNT.
         */
        fresh = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
                                     VMALLOC_START, VMALLOC_END,
                                     THREADINFO_GFP & ~__GFP_ACCOUNT,
                                     PAGE_KERNEL,
                                     0, node, __builtin_return_address(0));
        if (!fresh)
                return NULL;

        /*
         * We can't call find_vm_area() in interrupt context, and
         * free_thread_stack() can be called in interrupt context,
         * so cache the vm_struct.
         */
        tsk->stack_vm_area = find_vm_area(fresh);
        tsk->stack = fresh;
        return fresh;
#else
        struct page *page;

        page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
        if (unlikely(!page))
                return NULL;

        tsk->stack = kasan_reset_tag(page_address(page));
        return tsk->stack;
#endif
}

/*
 * Give back the kernel stack of @tsk.
 *
 * With CONFIG_VMAP_STACK and a stack that has a vm area: uncharge each of
 * its pages, then park the area in the first empty per-CPU cache slot, or
 * vfree_atomic() the stack if every slot is occupied.  In all other cases
 * the stack pages are released with __free_pages().
 */
static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
        struct vm_struct *area = task_stack_vm_area(tsk);

        if (area) {
                int n;

                for (n = 0; n < THREAD_SIZE / PAGE_SIZE; n++)
                        memcg_kmem_uncharge_page(area->pages[n], 0);

                for (n = 0; n < NR_CACHED_STACKS; n++) {
                        /* A NULL old value means our area now owns the slot. */
                        if (this_cpu_cmpxchg(cached_stacks[n],
                                        NULL, tsk->stack_vm_area) == NULL)
                                return;
                }

                vfree_atomic(tsk->stack);
                return;
        }
#endif

        __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_stack_cache;

/*
 * Allocate a stack for @tsk from thread_stack_cache on node @node, strip
 * the KASAN tag from the pointer, store it in tsk->stack and return it.
 */
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
                                                  int node)
{
        unsigned long *new_stack;

        new_stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP,
                                          node);
        new_stack = kasan_reset_tag(new_stack);

        tsk->stack = new_stack;
        return tsk->stack;
}

/* Return tsk->stack to thread_stack_cache. */
static void free_thread_stack(struct task_struct *tsk)
{
        kmem_cache_free(thread_stack_cache, tsk->stack);
}

/*
 * Create the "thread_stack" slab cache with THREAD_SIZE-sized,
 * THREAD_SIZE-aligned objects whose whole extent is a usercopy region.
 * BUGs if the cache cannot be created.
 */
void thread_stack_cache_init(void)
{
        thread_stack_cache =
                kmem_cache_create_usercopy("thread_stack",
                                           THREAD_SIZE,        /* object size */
                                           THREAD_SIZE,        /* alignment */
                                           0,                /* flags */
                                           0,                /* useroffset */
                                           THREAD_SIZE,        /* usersize */
                                           NULL);
        BUG_ON(!thread_stack_cache);
}
# endif
#endif

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;

struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (vma)
                vma_init(vma, mm);
        return vma;
}

/*
 * Allocate a new vm_area_struct and fill it with a structure copy of @orig.
 * The copy gets an empty anon_vma_chain list and NULL vm_next/vm_prev.
 * Returns NULL if the allocation fails.
 */
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
        struct vm_area_struct *copy;

        copy = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!copy)
                return NULL;

        ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
        ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
        /*
         * orig->shared.rb may be modified concurrently, but the clone
         * will be reinitialized.
         */
        *copy = data_race(*orig);
        INIT_LIST_HEAD(&copy->anon_vma_chain);
        copy->vm_prev = NULL;
        copy->vm_next = NULL;

        return copy;
}

/* Return @vma to vm_area_cachep. */
void vm_area_free(struct vm_area_struct *vma)
{
        kmem_cache_free(vm_area_cachep, vma);
}

/*
 * Adjust the NR_KERNEL_STACK_KB statistic by @account times the stack size
 * in KiB.  The callers in this file pass +1 when a stack is assigned and -1
 * when it is released.
 */
static void account_kernel_stack(struct task_struct *tsk, int account)
{
        struct vm_struct *area = task_stack_vm_area(tsk);
        void *stack_page = task_stack_page(tsk);

        if (area) {
                /* All stack pages are in the same node. */
                mod_lruvec_page_state(area->pages[0], NR_KERNEL_STACK_KB,
                                      account * (THREAD_SIZE / 1024));
                return;
        }

        mod_lruvec_slab_state(stack_page, NR_KERNEL_STACK_KB,
                              account * (THREAD_SIZE / 1024));
}

/*
 * Charge every page of @tsk's vmapped stack to the memory cgroup.
 *
 * Returns 0 on success, when the task has no stack vm area, or when
 * CONFIG_VMAP_STACK is disabled; otherwise the first error reported by
 * memcg_kmem_charge_page().
 */
static int memcg_charge_kernel_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
        struct vm_struct *area = task_stack_vm_area(tsk);
        int pg;

        BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);

        if (!area)
                return 0;

        BUG_ON(area->nr_pages != THREAD_SIZE / PAGE_SIZE);

        for (pg = 0; pg < THREAD_SIZE / PAGE_SIZE; pg++) {
                int err;

                /*
                 * If memcg_kmem_charge_page() fails, page->mem_cgroup
                 * pointer is NULL, and memcg_kmem_uncharge_page() in
                 * free_thread_stack() will ignore this page.
                 */
                err = memcg_kmem_charge_page(area->pages[pg], GFP_KERNEL, 0);
                if (err)
                        return err;
        }
#endif
        return 0;
}

/*
 * Release the kernel stack of @tsk: undo its accounting, free it and clear
 * the task's stack pointers.  If the task is not in TASK_DEAD state this
 * only warns and leaves the stack alone.
 */
static void release_task_stack(struct task_struct *tsk)
{
        if (WARN_ON(tsk->state != TASK_DEAD))
                return;  /* Better to leak the stack than to free prematurely */

        /* Reverse the account_kernel_stack(tsk, 1) done in dup_task_struct(). */
        account_kernel_stack(tsk, -1);
        free_thread_stack(tsk);
        tsk->stack = NULL;
#ifdef CONFIG_VMAP_STACK
        tsk->stack_vm_area = NULL;
#endif
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Drop one reference on @tsk's stack; the stack is released when the
 * reference count reaches zero.
 */
void put_task_stack(struct task_struct *tsk)
{
        if (!refcount_dec_and_test(&tsk->stack_refcount))
                return;

        release_task_stack(tsk);
}
#endif

/*
 * Tear down @tsk and free the task_struct itself.
 *
 * Runs the scs, rt-mutex debug, ftrace-graph and architecture release
 * hooks, frees the kthread data of kernel threads (PF_KTHREAD), and finally
 * frees the structure.  The stack is released here only when
 * CONFIG_THREAD_INFO_IN_TASK is not set; otherwise its refcount is expected
 * to have dropped to zero already.
 */
void free_task(struct task_struct *tsk)
{
#ifdef CONFIG_SECCOMP
        /* A filter still attached at this point is unexpected; warn once. */
        WARN_ON_ONCE(tsk->seccomp.filter);
#endif
        scs_release(tsk);

#ifndef CONFIG_THREAD_INFO_IN_TASK
        /*
         * The task is finally done with both the stack and thread_info,
         * so free both.
         */
        release_task_stack(tsk);
#else
        /*
         * If the task had a separate stack allocation, it should be gone
         * by now.
         */
        WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        arch_release_task_struct(tsk);
        if (tsk->flags & PF_KTHREAD)
                free_kthread_struct(tsk);
        free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

#ifdef CONFIG_MMU
/*
 * Duplicate the address space described by @oldmm into the new @mm.
 *
 * Takes the mmap write lock of @oldmm (killable) and of @mm, copies the vm
 * accounting totals, then walks oldmm->mmap.  Every vma not flagged
 * VM_DONTCOPY is duplicated, linked into @mm's vma list and rbtree and,
 * unless it is VM_WIPEONFORK, has its page table entries copied with
 * copy_page_range().
 *
 * Returns 0 on success; -EINTR if taking @oldmm's lock fails or a fatal
 * signal is pending; -ENOMEM for any failure routed through the fail_nomem*
 * labels; otherwise the error returned by ksm_fork(), khugepaged_fork(),
 * copy_page_range() or arch_dup_mmap().
 */
static __latent_entropy int dup_mmap(struct mm_struct *mm,
                                        struct mm_struct *oldmm)
{
        struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
        struct rb_node **rb_link, *rb_parent;
        int retval;
        unsigned long charge;
        LIST_HEAD(uf);

        uprobe_start_dup_mmap();
        if (mmap_write_lock_killable(oldmm)) {
                retval = -EINTR;
                goto fail_uprobe_end;
        }
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
        mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

        /* No ordering required: file already has been exposed. */
        RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));

        /* Start from the parent's totals; VM_DONTCOPY vmas are subtracted below. */
        mm->total_vm = oldmm->total_vm;
        mm->data_vm = oldmm->data_vm;
        mm->exec_vm = oldmm->exec_vm;
        mm->stack_vm = oldmm->stack_vm;

        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
        pprev = &mm->mmap;
        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
        retval = khugepaged_fork(mm, oldmm);
        if (retval)
                goto out;

        prev = NULL;
        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
                struct file *file;

                if (mpnt->vm_flags & VM_DONTCOPY) {
                        /* Not duplicated: take its pages back out of the totals. */
                        vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                /*
                 * Don't duplicate many vmas if we've been oom-killed (for
                 * example)
                 */
                if (fatal_signal_pending(current)) {
                        retval = -EINTR;
                        goto out;
                }
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned long len = vma_pages(mpnt);

                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        /* Remembered so fail_nomem can unaccount it. */
                        charge = len;
                }
                tmp = vm_area_dup(mpnt);
                if (!tmp)
                        goto fail_nomem;
                retval = vma_dup_policy(mpnt, tmp);
                if (retval)
                        goto fail_nomem_policy;
                tmp->vm_mm = mm;
                retval = dup_userfaultfd(tmp, &uf);
                if (retval)
                        goto fail_nomem_anon_vma_fork;
                if (tmp->vm_flags & VM_WIPEONFORK) {
                        /*
                         * VM_WIPEONFORK gets a clean slate in the child.
                         * Don't prepare anon_vma until fault since we don't
                         * copy page for current vma.
                         */
                        tmp->anon_vma = NULL;
                } else if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                /* The copy never carries VM_LOCKED or VM_LOCKONFAULT. */
                tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file_inode(file);
                        struct address_space *mapping = file->f_mapping;

                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                put_write_access(inode);
                        i_mmap_lock_write(mapping);
                        if (tmp->vm_flags & VM_SHARED)
                                mapping_allow_writable(mapping);
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_interval_tree_insert_after(tmp, mpnt,
                                        &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        i_mmap_unlock_write(mapping);
                }

                /*
                 * Clear hugetlb-related page reserves for children. This only
                 * affects MAP_PRIVATE mappings. Faults generated by the child
                 * are not guaranteed to succeed, even if read-only
                 */
                if (is_vm_hugetlb_page(tmp))
                        reset_vma_resv_huge_pages(tmp);

                /*
                 * Link in the new vma and copy the page table entries.
                 */
                *pprev = tmp;
                pprev = &tmp->vm_next;
                tmp->vm_prev = prev;
                prev = tmp;

                __vma_link_rb(mm, tmp, rb_link, rb_parent);
                rb_link = &tmp->vm_rb.rb_right;
                rb_parent = &tmp->vm_rb;

                mm->map_count++;
                if (!(tmp->vm_flags & VM_WIPEONFORK))
                        retval = copy_page_range(tmp, mpnt);

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                /* retval is only examined after ->open() has been called. */
                if (retval)
                        goto out;
        }
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
out:
        mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
        mmap_write_unlock(oldmm);
        dup_userfaultfd_complete(&uf);
fail_uprobe_end:
        uprobe_end_dup_mmap();
        return retval;
/*
 * Unwinding for the vma under construction.  All three labels fall through
 * to fail_nomem, which overwrites retval with -ENOMEM and rejoins "out".
 */
fail_nomem_anon_vma_fork:
        mpol_put(vma_policy(tmp));
fail_nomem_policy:
        vm_area_free(tmp);
fail_nomem:
        retval = -ENOMEM;
        vm_unacct_memory(charge);
        goto out;
}

/*
 * Allocate a page global directory for @mm and store it in mm->pgd.
 * Returns 0 on success or -ENOMEM if pgd_alloc() returned NULL.
 */
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
        mm->pgd = pgd_alloc(mm);
        if (likely(mm->pgd))
                return 0;

        return -ENOMEM;
}

/* Free the page global directory stored in mm->pgd. */
static inline void mm_free_pgd(struct mm_struct *mm)
{
        pgd_free(mm, mm->pgd);
}
#else
/*
 * !CONFIG_MMU variant: only the exe_file reference is carried over from
 * @oldmm to @mm, under @oldmm's mmap write lock.  Always returns 0.
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
        mmap_write_lock(oldmm);
        RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
        mmap_write_unlock(oldmm);
        return 0;
}
#define mm_alloc_pgd(mm)        (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

/*
 * Sanity-check an mm that is about to be freed: log an alert for every
 * non-zero RSS counter and for a non-zero page-table byte count.
 */
static void check_mm(struct mm_struct *mm)
{
        int type;

        BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
                         "Please make sure 'struct resident_page_types[]' is updated as well");

        for (type = 0; type < NR_MM_COUNTERS; type++) {
                long count = atomic_long_read(&mm->rss_stat.count[type]);

                if (likely(!count))
                        continue;

                pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
                         mm, resident_page_types[type], count);
        }

        if (mm_pgtables_bytes(mm)) {
                pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
                                mm_pgtables_bytes(mm));
        }

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

#define allocate_mm()        (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)        (kmem_cache_free(mm_cachep, (mm)))

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 *
 * BUGs if @mm is init_mm, and warns once if @mm is still the current
 * task's mm or active_mm.
 */
void __mmdrop(struct mm_struct *mm)
{
        BUG_ON(mm == &init_mm);
        WARN_ON_ONCE(mm == current->mm);
        WARN_ON_ONCE(mm == current->active_mm);
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_subscriptions_destroy(mm);
        /* Report leftover RSS / page-table accounting before the mm goes away. */
        check_mm(mm);
        /* Drop the user namespace reference taken in mm_init(). */
        put_user_ns(mm->user_ns);
        free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

/* Work item callback: run __mmdrop() on the mm embedding @work. */
static void mmdrop_async_fn(struct work_struct *work)
{
        struct mm_struct *mm = container_of(work, struct mm_struct,
                                            async_put_work);

        __mmdrop(mm);
}

/*
 * Drop one mm_count reference on @mm.  If it was the last one, the actual
 * __mmdrop() is deferred to a work item instead of running here.
 */
static void mmdrop_async(struct mm_struct *mm)
{
        if (likely(!atomic_dec_and_test(&mm->mm_count)))
                return;

        INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
        schedule_work(&mm->async_put_work);
}

/*
 * Free @sig: release its taskstats and autogroup state, drop the oom_mm
 * reference if one is set, and return the structure to signal_cachep.
 */
static inline void free_signal_struct(struct signal_struct *sig)
{
        struct mm_struct *oom_mm;

        taskstats_tgid_free(sig);
        sched_autogroup_exit(sig);

        /*
         * __mmdrop is not safe to call from softirq context on x86 due to
         * pgd_dtor so postpone it to the async context
         */
        oom_mm = sig->oom_mm;
        if (oom_mm)
                mmdrop_async(oom_mm);

        kmem_cache_free(signal_cachep, sig);
}

/* Drop one sigcnt reference on @sig and free it when the count hits zero. */
static inline void put_signal_struct(struct signal_struct *sig)
{
        if (!refcount_dec_and_test(&sig->sigcnt))
                return;

        free_signal_struct(sig);
}

/*
 * Release everything still attached to @tsk and free it.
 *
 * Warns if the task has no exit_state, if tsk->usage is still non-zero, or
 * if @tsk is the current task.  The final free_task() is skipped when
 * profile_handoff_task() returns non-zero.
 */
void __put_task_struct(struct task_struct *tsk)
{
        WARN_ON(!tsk->exit_state);
        WARN_ON(refcount_read(&tsk->usage));
        WARN_ON(tsk == current);

        io_uring_free(tsk);
        cgroup_free(tsk);
        task_numa_free(tsk, true);
        security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);

        if (!profile_handoff_task(tsk))
                free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

/* RCU callback: call __put_task_struct() on the task embedding @rhp. */
void __put_task_struct_rcu_cb(struct rcu_head *rhp)
{
        __put_task_struct(container_of(rhp, struct task_struct, rcu));
}
EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);

/* Weak default implementation: no architecture-specific task cache setup. */
void __init __weak arch_task_cache_init(void) { }

/*
 * set_max_threads
 */
static void set_max_threads(unsigned int max_threads_suggested)
{
        u64 threads;
        unsigned long nr_pages = totalram_pages();

        /*
         * The number of threads shall be limited such that the thread
         * structures may only consume a small part of the available memory.
         */
        if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
                threads = MAX_THREADS;
        else
                threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
                                    (u64) THREAD_SIZE * 8UL);

        if (threads > max_threads_suggested)
                threads = max_threads_suggested;

        max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
/*
 * Compute the usercopy whitelist region of task_struct.
 *
 * @offset and @size are first filled in by the architecture relative to
 * thread_struct; @offset is then rebased to task_struct, or forced to 0
 * when the whitelist is empty.
 */
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
        /* Fetch thread_struct whitelist for the architecture. */
        arch_thread_struct_whitelist(offset, size);

        if (likely(*size != 0)) {
                /* Adjust offset to position of thread_struct in task_struct. */
                *offset += offsetof(struct task_struct, thread);
                return;
        }

        /* Zero-sized whitelist or empty thread_struct. */
        *offset = 0;
}
#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */

/*
 * Boot-time initialisation for fork.
 *
 * Creates the task_struct slab cache (unless the architecture provides its
 * own allocator), computes max_threads, and derives init_task's
 * RLIMIT_NPROC / RLIMIT_SIGPENDING and init_user_ns's ucount limits from
 * it.  Also registers the vmap stack cache CPU hotplug callback when
 * CONFIG_VMAP_STACK is enabled.
 */
void __init fork_init(void)
{
        int i;
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN        0
#endif
        int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
        unsigned long useroffset, usersize;

        /* create a slab on which task_structs can be allocated */
        task_struct_whitelist(&useroffset, &usersize);
        task_struct_cachep = kmem_cache_create_usercopy("task_struct",
                        arch_task_struct_size, align,
                        SLAB_PANIC|SLAB_ACCOUNT,
                        useroffset, usersize, NULL);
#endif

        /* do the arch specific task caches init */
        arch_task_cache_init();

        set_max_threads(MAX_THREADS);

        /* Both NPROC limits are half of max_threads; SIGPENDING copies NPROC. */
        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];

        for (i = 0; i < UCOUNT_COUNTS; i++) {
                init_user_ns.ucount_max[i] = max_threads/2;
        }

#ifdef CONFIG_VMAP_STACK
        /* free_vm_stack_cache() is installed as the teardown callback. */
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
#endif

        scs_init();

        lockdep_init_task(&init_task);
        uprobes_init();
}

/*
 * Weak default implementation: copy the whole task_struct from @src to
 * @dst by structure assignment.  Always returns 0.
 */
int __weak arch_dup_task_struct(struct task_struct *dst,
                                               struct task_struct *src)
{
        *dst = *src;
        return 0;
}

/*
 * Write STACK_END_MAGIC into the word returned by end_of_stack() for @tsk,
 * so that a stack overflow can be detected later.
 */
void set_task_stack_end_magic(struct task_struct *tsk)
{
        unsigned long *magic_slot = end_of_stack(tsk);

        /* for overflow detection */
        *magic_slot = STACK_END_MAGIC;
}

/*
 * Allocate a new task_struct with its own kernel stack and initialise it as
 * a copy of @orig.
 *
 * @node selects the NUMA node for the allocations; NUMA_NO_NODE means "use
 * the node chosen by tsk_fork_get_node(orig)".
 *
 * Returns the new task, or NULL if an allocation, the memcg stack charge,
 * arch_dup_task_struct() or scs_prepare() fails.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
        struct task_struct *tsk;
        unsigned long *stack;
        struct vm_struct *stack_vm_area __maybe_unused;
        int err;

        if (node == NUMA_NO_NODE)
                node = tsk_fork_get_node(orig);
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;

        stack = alloc_thread_stack_node(tsk, node);
        if (!stack)
                goto free_tsk;

        if (memcg_charge_kernel_stack(tsk))
                goto free_stack;

        /* Saved so it can be restored after the structure copy below. */
        stack_vm_area = task_stack_vm_area(tsk);

        err = arch_dup_task_struct(tsk, orig);

        /*
         * arch_dup_task_struct() clobbers the stack-related fields.  Make
         * sure they're properly initialized before using any stack-related
         * functions again.
         */
        tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
        tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
        refcount_set(&tsk->stack_refcount, 1);
#endif

        /* err is checked only now, after the stack fields have been restored. */
        if (err)
                goto free_stack;

        err = scs_prepare(tsk, node);
        if (err)
                goto free_stack;

#ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
         * the sighand lock in case orig has changed between now and
         * then. Until then, filter must be NULL to avoid messing up
         * the usage counts on the error path calling free_task.
         */
        tsk->seccomp.filter = NULL;
#endif

        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        set_task_stack_end_magic(tsk);

#ifdef CONFIG_STACKPROTECTOR
        tsk->stack_canary = get_random_canary();
#endif
        /* If the parent's cpus_ptr points at its own mask, point ours at ours. */
        if (orig->cpus_ptr == &orig->cpus_mask)
                tsk->cpus_ptr = &tsk->cpus_mask;

        /*
         * One for the user space visible state that goes away when reaped.
         * One for the scheduler.
         */
        refcount_set(&tsk->rcu_users, 2);
        /* One for the rcu users */
        refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
#endif
        /* Reset per-task fields that were copied from @orig. */
        tsk->splice_pipe = NULL;
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
        tsk->pf_io_worker = NULL;

        account_kernel_stack(tsk, 1);

        kcov_task_init(tsk);

#ifdef CONFIG_FAULT_INJECTION
        tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
        tsk->throttle_queue = NULL;
        tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_MEMCG
        tsk->active_memcg = NULL;
#endif
        return tsk;

free_stack:
        free_thread_stack(tsk);
free_tsk:
        free_task_struct(tsk);
        return NULL;
}

__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

/*
 * Handler for the "coredump_filter=" boot parameter: parse @s as a number,
 * shift it into the dump-filter bit position, mask it and store it as the
 * default dump filter.  Always returns 1.
 */
static int __init coredump_filter_setup(char *s)
{
        unsigned long filter = simple_strtoul(s, NULL, 0);

        filter <<= MMF_DUMP_FILTER_SHIFT;
        default_dump_filter = filter & MMF_DUMP_FILTER_MASK;

        return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

#include <linux/init_task.h>

/*
 * Initialise the AIO fields of @mm (ioctx table pointer and its lock).
 * Does nothing unless CONFIG_AIO is enabled.
 */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
        mm->ioctx_table = NULL;
        spin_lock_init(&mm->ioctx_lock);
#endif
}

/*
 * Clear mm->owner if it currently points at @p.  Does nothing unless
 * CONFIG_MEMCG is enabled.
 */
static __always_inline void mm_clear_owner(struct mm_struct *mm,
                                           struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        if (mm->owner != p)
                return;

        WRITE_ONCE(mm->owner, NULL);
#endif
}

/* Record @p as the owner of @mm; does nothing unless CONFIG_MEMCG is enabled. */
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        mm->owner = p;
#endif
}

/* Set mm->pasid to INIT_PASID; does nothing unless CONFIG_IOMMU_SUPPORT is enabled. */
static void mm_init_pasid(struct mm_struct *mm)
{
#ifdef CONFIG_IOMMU_SUPPORT
        mm->pasid = INIT_PASID;
#endif
}

/* Start @mm with no uprobes xol_area; does nothing unless CONFIG_UPROBES is enabled. */
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
        mm->uprobes_state.xol_area = NULL;
#endif
}

/*
 * Initialise the fields of an already allocated @mm for task @p and take a
 * reference on @user_ns.
 *
 * mm->flags and mm->def_flags are inherited (masked) from current->mm when
 * the current task has one; otherwise flags starts from default_dump_filter
 * and def_flags from 0.
 *
 * Returns @mm on success.  If the pgd allocation or init_new_context()
 * fails, @mm is freed and NULL is returned.
 */
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        struct user_namespace *user_ns)
{
        mm->mmap = NULL;
        mm->mm_rb = RB_ROOT;
        mm->vmacache_seqnum = 0;
        /* The new mm starts with one user and one mm_count reference. */
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        seqcount_init(&mm->write_protect_seq);
        mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
        mm->core_state = NULL;
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
        atomic_set(&mm->has_pinned, 0);
        atomic64_set(&mm->pinned_vm, 0);
        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
        spin_lock_init(&mm->page_table_lock);
        spin_lock_init(&mm->arg_lock);
        mm_init_cpumask(mm);
        mm_init_aio(mm);
        mm_init_owner(mm, p);
        mm_init_pasid(mm);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_subscriptions_init(mm);
        init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        mm->pmd_huge_pte = NULL;
#endif
        mm_init_uprobes_state(mm);
        hugetlb_count_init(mm);

        if (current->mm) {
                mm->flags = current->mm->flags & MMF_INIT_MASK;
                mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
        } else {
                mm->flags = default_dump_filter;
                mm->def_flags = 0;
        }

        if (mm_alloc_pgd(mm))
                goto fail_nopgd;

        if (init_new_context(p, mm))
                goto fail_nocontext;

        mm->user_ns = get_user_ns(user_ns);
        return mm;

fail_nocontext:
        mm_free_pgd(mm);
fail_nopgd:
        free_mm(mm);
        return NULL;
}

/*
 * Allocate a zeroed mm_struct and initialise it for the current task and
 * the current user namespace.  Returns NULL if the allocation or the
 * initialisation fails.
 */
struct mm_struct *mm_alloc(void)
{
        struct mm_struct *new_mm = allocate_mm();

        if (new_mm) {
                memset(new_mm, 0, sizeof(*new_mm));
                new_mm = mm_init(new_mm, current, current_user_ns());
        }

        return new_mm;
}

/*
 * Release the resources attached to @mm and finally drop a reference with
 * mmdrop().  Must only be entered once mm_users has reached zero, which is
 * asserted on entry.
 */
static inline void __mmput(struct mm_struct *mm)
{
        VM_BUG_ON(atomic_read(&mm->mm_users));

        uprobe_clear_state(mm);
        exit_aio(mm);
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
        mm_put_huge_zero_page(mm);
        set_mm_exe_file(mm, NULL);
        /* Unlink from the mm list, under mmlist_lock, only if it is linked. */
        if (!list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
                list_del(&mm->mmlist);
                spin_unlock(&mmlist_lock);
        }
        /* Drop the module reference of the binary format, if one is set. */
        if (mm->binfmt)
                module_put(mm->binfmt->module);
        mmdrop(mm);
}

/*
 * Drop one mm_users reference on @mm.  When that was the last one, all
 * resources of the mm are released through __mmput().  May sleep.
 */
void mmput(struct mm_struct *mm)
{
        might_sleep();

        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        __mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

#ifdef CONFIG_MMU
/*
 * Work handler queued by mmput_async(): recover the mm_struct that embeds
 * @work as its async_put_work member and release it with __mmput().
 */
static void mmput_async_fn(struct work_struct *work)
{
        __mmput(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Like mmput(), but when the last mm_users reference is dropped the
 * teardown is deferred to a work item (mmput_async_fn) instead of being
 * run in the caller's context.
 */
void mmput_async(struct mm_struct *mm)
{
        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        INIT_WORK(&mm->async_put_work, mmput_async_fn);
        schedule_work(&mm->async_put_work);
}
EXPORT_SYMBOL_GPL(mmput_async);
#endif

/**
 * set_mm_exe_file - change a reference to the mm's executable file
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 * A reference is taken on @new_exe_file (if non-NULL) and the reference
 * on the previously installed file (if any) is dropped.
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive is left, in execve the task is
 * single threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set
 * mm->exe_file, but does so without using set_mm_exe_file() in order
 * to avoid the need for any locks.
 */
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        /*
         * Dereferencing exe_file without RCU protection is fine here:
         * this function only runs when nobody else can access this mm,
         * see the justification in the comment above.
         */
        struct file *prev_exe_file = rcu_dereference_raw(mm->exe_file);

        if (new_exe_file)
                get_file(new_exe_file);

        rcu_assign_pointer(mm->exe_file, new_exe_file);

        if (prev_exe_file)
                fput(prev_exe_file);
}

/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 *
 * Returns %NULL if mm has no associated executable file, or if a
 * reference on it could not be obtained with get_file_rcu().
 * User must release file via fput().
 */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
        struct file *file;

        rcu_read_lock();
        file = rcu_dereference(mm->exe_file);
        if (file) {
                /* Taking the reference failed: report no file at all. */
                if (!get_file_rcu(file))
                        file = NULL;
        }
        rcu_read_unlock();

        return file;
}
EXPORT_SYMBOL(get_mm_exe_file);

/**
 * get_task_exe_file - acquire a reference to the task's executable file
 *
 * Returns %NULL if the task has no mm, if its mm has no associated
 * executable file, or if the task has PF_KTHREAD set, i.e. it is a kernel
 * thread with a borrowed mm (see the comment above get_task_mm).
 * User must release file via fput().
 */
struct file *get_task_exe_file(struct task_struct *task)
{
        struct file *exe_file = NULL;
        struct mm_struct *mm;

        task_lock(task);
        mm = task->mm;
        if (mm && !(task->flags & PF_KTHREAD))
                exe_file = get_mm_exe_file(mm);
        task_unlock(task);

        return exe_file;
}
EXPORT_SYMBOL(get_task_exe_file);

/**
 * get_task_mm - acquire a reference to the task's mm
 *
 * Returns %NULL if the task has no mm, or if PF_KTHREAD is set (meaning
 * this kernel workthread has transiently adopted a user mm with use_mm,
 * to do its AIO).  Otherwise the use count of the mm is bumped with
 * mmget() and the mm is returned.  User must release the mm via mmput()
 * after use.  Typically used by /proc and ptrace.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
        struct mm_struct *mm;

        task_lock(task);
        mm = task->mm;
        /* A kernel thread's mm is never handed out. */
        if (mm && (task->flags & PF_KTHREAD))
                mm = NULL;
        if (mm)
                mmget(mm);
        task_unlock(task);

        return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);

/*
 * Get a reference to @task's mm after a ptrace-style permission check.
 *
 * The lookup and the check are done with @task's exec_update_lock held for
 * reading.  Returns:
 *   - ERR_PTR(err) if taking the lock was interrupted,
 *   - NULL if get_task_mm() yields no mm,
 *   - ERR_PTR(-EACCES) if the mm is not the caller's own and
 *     ptrace_may_access(@task, @mode) denies access,
 *   - otherwise the mm, which the caller must release with mmput().
 */
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
        struct mm_struct *mm;
        int ret;

        ret = down_read_killable(&task->signal->exec_update_lock);
        if (ret)
                return ERR_PTR(ret);

        mm = get_task_mm(task);
        if (mm && mm != current->mm) {
                /* Foreign mm: only hand it out if ptrace access is allowed. */
                if (!ptrace_may_access(task, mode)) {
                        mmput(mm);
                        mm = ERR_PTR(-EACCES);
                }
        }
        up_read(&task->signal->exec_update_lock);

        return mm;
}

/*
 * If @tsk still has a vfork completion attached, detach it and signal it.
 * The pointer is read and cleared under task_lock(@tsk).
 */
static void complete_vfork_done(struct task_struct *tsk)
{
        struct completion *done;

        task_lock(tsk);
        done = tsk->vfork_done;
        if (likely(done)) {
                tsk->vfork_done = NULL;
                complete(done);
        }
        task_unlock(tsk);
}

/*
 * Wait (killably) for @vfork to be completed on behalf of @child.
 *
 * Returns the result of wait_for_completion_killable().  When that result
 * is non-zero, child->vfork_done is cleared under task_lock(@child).  In
 * either case one task reference on @child is dropped with
 * put_task_struct().
 */
static int wait_for_vfork_done(struct task_struct *child,
                                struct completion *vfork)
{
        int ret;

        freezer_do_not_count();
        cgroup_enter_frozen();
        ret = wait_for_completion_killable(vfork);
        cgroup_leave_frozen(false);
        freezer_count();

        if (ret != 0) {
                task_lock(child);
                child->vfork_done = NULL;
                task_unlock(child);
        }

        put_task_struct(child);

        return ret;
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
/*
 * Common part of exit_mm_release() and exec_mm_release(): frees the uprobe
 * task state, deactivates @mm for @tsk, performs the clear_child_tid
 * notification if one is registered, and completes a pending vfork.
 */
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        uprobe_free_utask(tsk);

        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);

        /*
         * Signal userspace if we're not exiting with a core dump
         * because we want to leave the value intact for debugging
         * purposes.
         */
        if (tsk->clear_child_tid) {
                /* Only notify while the mm still has other users. */
                if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
                    atomic_read(&mm->mm_users) > 1) {
                        /*
                         * We don't check the error code - if userspace has
                         * not set up a proper pointer then tough luck.
                         */
                        put_user(0, tsk->clear_child_tid);
                        do_futex(tsk->clear_child_tid, FUTEX_WAKE,
                                        1, NULL, NULL, 0, 0);
                }
                /* The pointer is cleared whether or not it was written. */
                tsk->clear_child_tid = NULL;
        }

        /*
         * All done, finally we can wake up parent and return this mm to him.
         * Also kthread_stop() uses this completion for synchronization.
         */
        if (tsk->vfork_done)
                complete_vfork_done(tsk);
}

/*
 * Exit-side wrapper around mm_release(): runs futex_exit_release() for
 * @tsk first, then the common mm_release() work.
 */
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exit_release(tsk);
        mm_release(tsk, mm);
}

/*
 * Exec-side wrapper around mm_release(): runs futex_exec_release() for
 * @tsk first, then the common mm_release() work.
 */
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exec_release(tsk);
        mm_release(tsk, mm);
}

/**
 * dup_mm() - create a copy of an existing mm structure
 * @tsk: the task_struct the new mm is initialized for.
 * @oldmm: the mm to copy.
 *
 * Allocates a new mm, copies the contents of @oldmm into it, re-initializes
 * it with mm_init() and then duplicates the mappings with dup_mmap().
 *
 * Return: the duplicated mm or NULL on failure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk,
                                struct mm_struct *oldmm)
{
        struct mm_struct *new_mm = allocate_mm();

        if (!new_mm)
                return NULL;

        memcpy(new_mm, oldmm, sizeof(*new_mm));

        /* mm_init() frees the mm itself when it fails. */
        if (!mm_init(new_mm, tsk, new_mm->user_ns))
                return NULL;

        if (dup_mmap(new_mm, oldmm))
                goto free_pt;

        new_mm->hiwater_rss = get_mm_rss(new_mm);
        new_mm->hiwater_vm = new_mm->total_vm;

        if (new_mm->binfmt && !try_module_get(new_mm->binfmt->module))
                goto free_pt;

        return new_mm;

free_pt:
        /*
         * No module reference was taken on this path, so make sure
         * mmput() does not drop one for binfmt.
         */
        new_mm->binfmt = NULL;
        mm_init_owner(new_mm, NULL);
        mmput(new_mm);

        return NULL;
}

/*
 * Set up the mm of the new task @tsk.
 *
 * Fault and context-switch counters of @tsk are reset.  If the calling
 * task has no mm, @tsk is left without one.  With CLONE_VM the caller's mm
 * is shared (one more mm_users reference is taken); otherwise it is
 * duplicated with dup_mm().
 *
 * Returns 0 on success or -ENOMEM if dup_mm() fails.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
        struct mm_struct *parent_mm, *child_mm;

        tsk->min_flt = tsk->maj_flt = 0;
        tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
        tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
        tsk->last_switch_time = 0;
#endif

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * A caller without an mm is cloning a kernel thread: there is
         * nothing to copy or share here.
         */
        parent_mm = current->mm;
        if (!parent_mm)
                return 0;

        /* initialize the new vmacache entries */
        vmacache_flush(tsk);

        if (clone_flags & CLONE_VM) {
                mmget(parent_mm);
                child_mm = parent_mm;
        } else {
                child_mm = dup_mm(tsk, current->mm);
                if (!child_mm)
                        return -ENOMEM;
        }

        tsk->mm = child_mm;
        tsk->active_mm = child_mm;

        return 0;
}

/*
 * Set up the fs_struct of the new task @tsk.
 *
 * Without CLONE_FS the caller's fs_struct is copied; -ENOMEM is returned
 * if that fails.  With CLONE_FS the existing fs_struct is shared by
 * bumping its user count under fs->lock, unless an exec is in progress
 * on it (fs->in_exec), in which case -EAGAIN is returned.
 */
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
        struct fs_struct *fs = current->fs;
        int ret = 0;

        if (!(clone_flags & CLONE_FS)) {
                tsk->fs = copy_fs_struct(fs);
                return tsk->fs ? 0 : -ENOMEM;
        }

        /* tsk->fs is already what we want */
        spin_lock(&fs->lock);
        if (fs->in_exec)
                ret = -EAGAIN;
        else
                fs->users++;
        spin_unlock(&fs->lock);

        return ret;
}

static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
        struct files_struct *oldf, *newf;
        int error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        newf = dup_fd(oldf, NR_OPEN_MAX, &error);
        if (!newf)
                goto out;

        tsk->files = newf;
        error = 0;
out:
        return error;
}

/*
 * Set up the io_context of the new task @tsk (CONFIG_BLOCK only).
 *
 * With CLONE_IO the caller's io_context is shared.  Otherwise, if the
 * caller has a valid I/O priority, a context is obtained for @tsk and
 * the priority is copied into it.  Returns 0 on success or -ENOMEM if
 * that context could not be obtained.
 */
static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
        struct io_context *parent_ioc = current->io_context;

        if (!parent_ioc)
                return 0;

        /*
         * Share io context with parent, if CLONE_IO is set
         */
        if (clone_flags & CLONE_IO) {
                ioc_task_link(parent_ioc);
                tsk->io_context = parent_ioc;
                return 0;
        }

        if (ioprio_valid(parent_ioc->ioprio)) {
                struct io_context *child_ioc;

                child_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
                if (unlikely(!child_ioc))
                        return -ENOMEM;

                child_ioc->ioprio = parent_ioc->ioprio;
                put_io_context(child_ioc);
        }
#endif
        return 0;
}

/*
 * Set up the signal handler table of the new task @tsk.
 *
 * With CLONE_SIGHAND the caller's table is shared by taking another
 * reference.  Otherwise a new table is allocated and the caller's actions
 * are copied into it under the caller's siglock.  Returns 0 on success or
 * -ENOMEM if the allocation fails.
 */
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
        struct sighand_struct *new_sighand;

        if (clone_flags & CLONE_SIGHAND) {
                refcount_inc(&current->sighand->count);
                return 0;
        }

        new_sighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        /*
         * tsk->sighand is assigned before the NULL check, so it is NULL
         * on the failure path below.
         */
        RCU_INIT_POINTER(tsk->sighand, new_sighand);
        if (!new_sighand)
                return -ENOMEM;

        refcount_set(&new_sighand->count, 1);

        spin_lock_irq(&current->sighand->siglock);
        memcpy(new_sighand->action, current->sighand->action,
               sizeof(new_sighand->action));
        spin_unlock_irq(&current->sighand->siglock);

        /* Reset every signal handler that is not SIG_IGN to SIG_DFL. */
        if (clone_flags & CLONE_CLEAR_SIGHAND)
                flush_signal_handlers(tsk, 0);

        return 0;
}

/*
 * Drop one reference on @sighand; on the last reference, run the signalfd
 * cleanup and return the structure to its slab cache.
 */
void __cleanup_sighand(struct sighand_struct *sighand)
{
        if (!refcount_dec_and_test(&sighand->count))
                return;

        signalfd_cleanup(sighand);
        /*
         * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
         * without an RCU grace period, see __lock_task_sighand().
         */
        kmem_cache_free(sighand_cachep, sighand);
}

/*
 * Initialize POSIX CPU timer handling for a thread group, seeding it with
 * the group's current RLIMIT_CPU soft limit.
 */
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
        unsigned long soft_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);

        posix_cputimers_group_init(&sig->posix_cputimers, soft_limit);
}

/*
 * Allocate and initialize a new signal_struct for the new task @tsk.
 *
 * Nothing is done when CLONE_THREAD is set.  Otherwise a zeroed
 * signal_struct is allocated, assigned to tsk->signal and initialized;
 * resource limits and OOM score adjustments are taken from the calling
 * task.  Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_THREAD)
                return 0;

        sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
        /* Assigned before the NULL check, so tsk->signal is NULL on failure. */
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;

        /* The new group starts with exactly one thread and one reference. */
        sig->nr_threads = 1;
        atomic_set(&sig->live, 1);
        refcount_set(&sig->sigcnt, 1);

        /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
        sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
        tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);

        init_waitqueue_head(&sig->wait_chldexit);
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_HLIST_HEAD(&sig->multiprocess);
        seqlock_init(&sig->stats_lock);
        prev_cputime_init(&sig->prev_cputime);

#ifdef CONFIG_POSIX_TIMERS
        INIT_LIST_HEAD(&sig->posix_timers);
        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        sig->real_timer.function = it_real_fn;
#endif

        /* Copy the caller's rlimits while holding the group leader's task lock. */
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);

        /* Uses sig->rlim, so it has to come after the copy above. */
        posix_cpu_timers_init_group(sig);

        tty_audit_fork(sig);
        sched_autogroup_fork(sig);

        /* Inherit the caller's OOM score adjustments. */
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_init(&sig->cred_guard_mutex);
        init_rwsem(&sig->exec_update_lock);

        return 0;
}

/*
 * Give the new task @p the calling task's seccomp state.  The body is
 * compiled out entirely without CONFIG_SECCOMP.
 */
static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
        /*
         * Must be called with sighand->siglock held, which is common to
         * all threads in the group. Holding cred_guard_mutex is not
         * needed because this new task is not yet running and cannot
         * be racing exec.
         */
        assert_spin_locked(&current->sighand->siglock);

        /* Ref-count the new filter user, and assign it. */
        get_seccomp_filter(current);
        p->seccomp = current->seccomp;

        /*
         * Explicitly enable no_new_privs here in case it got set
         * between the task_struct being duplicated and holding the
         * sighand lock. The seccomp state and nnp must be in sync.
         */
        if (task_no_new_privs(current))
                task_set_no_new_privs(p);

        /*
         * If the parent gained a seccomp mode after the thread flags
         * were copied but before we took the sighand lock, we have
         * to manually enable the seccomp thread flag here.
         */
        if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
                set_tsk_thread_flag(p, TIF_SECCOMP);
#endif
}

/*
 * set_tid_address() system call: record @tidptr as the calling task's
 * clear_child_tid pointer and return the caller's pid as reported by
 * task_pid_vnr().
 */
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
        current->clear_child_tid = tidptr;

        return task_pid_vnr(current);
}

/*
 * Initialize the priority-inheritance state of the new task @p: the
 * pi_lock always, and the waiter tree and PI pointers only when
 * CONFIG_RT_MUTEXES is enabled.
 */
static void rt_mutex_init_task(struct task_struct *p)
{
        raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
        p->pi_waiters = RB_ROOT_CACHED;
        p->pi_top_task = NULL;
        p->pi_blocked_on = NULL;
#endif
}

static inline void init_task_pid_links(struct task_struct *task)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
                INIT_HLIST_NODE(&task->pid_links[type]);
        }
}

/*
 * Record @pid as the pid of the given @type for @task.  PIDTYPE_PID is
 * kept in the task itself (thread_pid); every other type is kept in the
 * task's signal_struct.
 */
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
        if (type != PIDTYPE_PID) {
                task->signal->pids[type] = pid;
                return;
        }

        task->thread_pid = pid;
}

/*
 * Reset the RCU-related per-task fields of the new task @p to their
 * initial values.  Each group of fields is only touched when the
 * corresponding RCU flavour is configured in.
 */
static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special.s = 0;
        p->rcu_blocked_node = NULL;
        INIT_LIST_HEAD(&p->rcu_node_entry);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
        p->rcu_tasks_holdout = false;
        INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
        p->rcu_tasks_idle_cpu = -1;
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
        p->trc_reader_nesting = 0;
        p->trc_reader_special.s = 0;
        INIT_LIST_HEAD(&p->trc_holdout_list);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/*
 * Return the struct pid behind @file if @file is a pidfd, i.e. if its
 * file operations are pidfd_fops; otherwise return ERR_PTR(-EBADF).
 */
struct pid *pidfd_pid(const struct file *file)
{
        if (file->f_op != &pidfd_fops)
                return ERR_PTR(-EBADF);

        return file->private_data;
}

/*
 * ->release() handler for pidfds: detach the struct pid from @file and
 * drop the reference on it.  Always returns 0.
 */
static int pidfd_release(struct inode *inode, struct file *file)
{
        struct pid *held_pid = file->private_data;

        file->private_data = NULL;
        put_pid(held_pid);

        return 0;
}

#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid that a given pidfd refers to in all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e. where no ancestral relation is
 * present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct pid *pid = f->private_data;
        struct pid_namespace *ns;
        /* Printed as-is when the pid no longer has a PIDTYPE_PID task. */
        pid_t nr = -1;

        /* ns is only assigned on this path; it is only read when nr > 0. */
        if (likely(pid_has_task(pid, PIDTYPE_PID))) {
                ns = proc_pid_ns(file_inode(m->file)->i_sb);
                nr = pid_nr_ns(pid, ns);
        }

        seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
        seq_put_decimal_ll(m, "\nNSpid:\t", nr);
        if (nr > 0) {
                int i;

                /* If nr is positive it means that 'pid' is valid and that
                 * ns, i.e. the pid namespace associated with the procfs
                 * instance, is in the pid namespace hierarchy of pid.
                 * Start at one below the already printed level.
                 */
                for (i = ns->level + 1; i <= pid->level; i++)
                        seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
        }
#endif
        seq_putc(m, '\n');
}
#endif

/*
 * Poll support for process exit notification: registers the poller on the
 * pid's wait_pidfd queue and reports readability once the thread group
 * has exited.
 */
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
        struct pid *pid = file->private_data;

        poll_wait(file, &pid->wait_pidfd, pts);

        /*
         * Pollers are only informed when the whole thread group exits.
         * If the thread group leader exits before all other threads in the
         * group, then poll(2) should block, similar to the wait(2) family.
         */
        if (thread_group_exited(pid))
                return EPOLLIN | EPOLLRDNORM;

        return 0;
}

/*
 * File operations of pidfd files.  pidfd_pid() recognizes a pidfd by
 * comparing a file's f_op against this table.
 */
const struct file_operations pidfd_fops = {
        .release = pidfd_release,
        .poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
        .show_fdinfo = pidfd_show_fdinfo,
#endif
};

/*
 * RCU callback used by delayed_free_task(): recover the task_struct that
 * embeds @rhp as its rcu member and free it.
 */
static void __delayed_free_task(struct rcu_head *rhp)
{
        free_task(container_of(rhp, struct task_struct, rcu));
}

/*
 * Free @tsk.  With CONFIG_MEMCG enabled the free is deferred through
 * call_rcu(); otherwise the task is freed immediately.
 */
static __always_inline void delayed_free_task(struct task_struct *tsk)
{
        if (!IS_ENABLED(CONFIG_MEMCG)) {
                free_task(tsk);
                return;
        }

        call_rcu(&tsk->rcu, __delayed_free_task);
}

/*
 * For a new task that shares the caller's mm without being a thread or a
 * vfork child (CLONE_VM set, CLONE_THREAD and CLONE_VFORK clear): mark the
 * mm as MMF_MULTIPROCESS and refresh the OOM score adjustments from the
 * calling task, all under oom_adj_mutex.  Does nothing for tasks without
 * an mm or for any other flag combination.
 */
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
        const u64 vm_sharing_mask = CLONE_VM | CLONE_THREAD | CLONE_VFORK;

        /* Skip kernel threads, and skip threads and vfork children. */
        if (!tsk->mm || (clone_flags & vm_sharing_mask) != CLONE_VM)
                return;

        /* We need to synchronize with __set_oom_adj */
        mutex_lock(&oom_adj_mutex);
        set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
        /* Update the values in case they were changed after copy_signal */
        tsk->signal->oom_score_adj = current->signal->oom_score_adj;
        tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
        mutex_unlock(&oom_adj_mutex);
}

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static __latent_entropy struct task_struct *copy_process(
                                        struct pid *pid,
                                        int trace,
                                        int node,
                                        struct kernel_clone_args *args)
{
        int pidfd = -1, retval;
        struct task_struct *p;
        struct multiprocess_signals delayed;
        struct file *pidfile = NULL;
        u64 clone_flags = args->flags;
        struct nsproxy *nsp = current->nsproxy;

        /*
         * Don't allow sharing the root directory with processes in a different
         * namespace
         */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);

        if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
                return ERR_PTR(-EINVAL);

        /*
         * Thread groups must share signals as well, and detached threads
         * can only be started up within the thread group.
         */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
                return ERR_PTR(-EINVAL);

        /*
         * Shared signal handlers imply shared VM. By way of the above,
         * thread groups also imply shared VM. Blocking this case allows
         * for various simplifications in other code.
         */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                return ERR_PTR(-EINVAL);

        /*
         * Siblings of global init remain as zombies on exit since they are
         * not reaped by their parent (swapper). To solve this and to avoid
         * multi-rooted process trees, prevent global and container-inits
         * from creating siblings.
         */
        if ((clone_flags & CLONE_PARENT) &&
                                current->signal->flags & SIGNAL_UNKILLABLE)
                return ERR_PTR(-EINVAL);

        /*
         * If the new process will be in a different pid or user namespace
         * do not allow it to share a thread group with the forking task.
         */
        if (clone_flags & CLONE_THREAD) {
                if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
                        return ERR_PTR(-EINVAL);
        }

        /*
         * If the new process will be in a different time namespace
         * do not allow it to share VM or a thread group with the forking task.
         */
        if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
                if (nsp->time_ns != nsp->time_ns_for_children)
                        return ERR_PTR(-EINVAL);
        }

        if (clone_flags & CLONE_PIDFD) {
                /*
                 * - CLONE_DETACHED is blocked so that we can potentially
                 *   reuse it later for CLONE_PIDFD.
                 * - CLONE_THREAD is blocked until someone really needs it.
                 */
                if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
                        return ERR_PTR(-EINVAL);
        }

        /*
         * Force any signals received before this point to be delivered
         * before the fork happens.  Collect up signals sent to multiple
         * processes that happen during the fork and delay them so that
         * they appear to happen after the fork.
         */
        sigemptyset(&delayed.signal);
        INIT_HLIST_NODE(&delayed.node);

        spin_lock_irq(&current->sighand->siglock);
        if (!(clone_flags & CLONE_THREAD))
                hlist_add_head(&delayed.node, &current->signal->multiprocess);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
        retval = -ERESTARTNOINTR;
        if (task_sigpending(current))
                goto fork_out;

        retval = -ENOMEM;
        p = dup_task_struct(current, node);
        if (!p)
                goto fork_out;
        if (args->io_thread) {
                /*
                 * Mark us an IO worker, and block any signal that isn't
                 * fatal or STOP
                 */
                p->flags |= PF_IO_WORKER;
                siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
        }

        /*
         * This _must_ happen before we call free_task(), i.e. before we jump
         * to any of the bad_fork_* labels. This is to avoid freeing
         * p->set_child_tid which is (ab)used as a kthread's data pointer for
         * kernel threads (PF_KTHREAD).
         */
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

        ftrace_graph_init_task(p);

        rt_mutex_init_task(p);

        lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
        retval = -EAGAIN;
        if (atomic_read(&p->real_cred->user->processes) >=
                        task_rlimit(p, RLIMIT_NPROC)) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_free;
        }
        current->flags &= ~PF_NPROC_EXCEEDED;

        retval = copy_creds(p, clone_flags);
        if (retval < 0)
                goto bad_fork_free;

        /*
         * If multiple threads are within copy_process(), then this check
         * triggers too late. This doesn't hurt, the check is only there
         * to stop root fork bombs.
         */
        retval = -EAGAIN;
        if (data_race(nr_threads >= max_threads))
                goto bad_fork_cleanup_count;

        delayacct_tsk_init(p);        /* Must remain after dup_task_struct() */
        p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
        p->flags |= PF_FORKNOEXEC;
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        rcu_copy_process(p);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);

        init_sigpending(&p->pending);

        p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        p->utimescaled = p->stimescaled = 0;
#endif
        prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        seqcount_init(&p->vtime.seqcount);
        p->vtime.starttime = 0;
        p->vtime.state = VTIME_INACTIVE;
#endif

#ifdef CONFIG_IO_URING
        p->io_uring = NULL;
#endif

#if defined(SPLIT_RSS_COUNTING)
        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

        p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
        p->psi_flags = 0;
#endif

        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);

        posix_cputimers_init(&p->posix_cputimers);

        p->io_context = NULL;
        audit_set_context(p, NULL);
        cgroup_fork(p);
#ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
        if (IS_ERR(p->mempolicy)) {
                retval = PTR_ERR(p->mempolicy);
                p->mempolicy = NULL;
                goto bad_fork_cleanup_threadgroup_lock;
        }
#endif
#ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
        seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
        memset(&p->irqtrace, 0, sizeof(p->irqtrace));
        p->irqtrace.hardirq_disable_ip        = _THIS_IP_;
        p->irqtrace.softirq_enable_ip        = _THIS_IP_;
        p->softirqs_enabled                = 1;
        p->softirq_context                = 0;
#endif

        p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
        lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
        p->sequential_io        = 0;
        p->sequential_io_avg        = 0;
#endif

        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_policy;

        retval = perf_event_init_task(p);
        if (retval)
                goto bad_fork_cleanup_policy;
        retval = audit_alloc(p);
        if (retval)
                goto bad_fork_cleanup_perf;
        /* copy all the process information */
        shm_init_task(p);
        retval = security_task_alloc(p, clone_flags);
        if (retval)
                goto bad_fork_cleanup_audit;
        retval = copy_semundo(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_security;
        retval = copy_files(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_semundo;
        retval = copy_fs(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_files;
        retval = copy_sighand(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_fs;
        retval = copy_signal(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_sighand;
        retval = copy_mm(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_signal;
        retval = copy_namespaces(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_mm;
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
        retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
        if (retval)
                goto bad_fork_cleanup_io;

        stackleak_task_init(p);

        if (pid != &init_struct_pid) {
                pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
                                args->set_tid_size);
                if (IS_ERR(pid)) {
                        retval = PTR_ERR(pid);
                        goto bad_fork_cleanup_thread;
                }
        }

        /*
         * This has to happen after we've potentially unshared the file
         * descriptor table (so that the pidfd doesn't leak into the child
         * if the fd table isn't shared).
         */
        if (clone_flags & CLONE_PIDFD) {
                retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
                if (retval < 0)
                        goto bad_fork_free_pid;

                pidfd = retval;

                pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
                                              O_RDWR | O_CLOEXEC);
                if (IS_ERR(pidfile)) {
                        put_unused_fd(pidfd);
                        retval = PTR_ERR(pidfile);
                        goto bad_fork_free_pid;
                }
                get_pid(pid);        /* held by pidfile now */

                retval = put_user(pidfd, args->pidfd);
                if (retval)
                        goto bad_fork_put_pidfd;
        }

#ifdef CONFIG_BLOCK
        p->plug = NULL;
#endif
        futex_init_task(p);

        /*
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
                sas_ss_reset(p);

        /*
         * Syscall tracing and stepping should be turned off in the
         * child regardless of CLONE_PTRACE.
         */
        user_disable_single_step(p);
        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
        clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
        clear_tsk_latency_tracing(p);

        /* ok, now we should be set up.. */
        p->pid = pid_nr(pid);
        if (clone_flags & CLONE_THREAD) {
                p->group_leader = current->group_leader;
                p->tgid = current->tgid;
        } else {
                p->group_leader = p;
                p->tgid = p->pid;
        }

        p->nr_dirtied = 0;
        p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
        p->dirty_paused_when = 0;

        p->pdeath_signal = 0;
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
        clear_posix_cputimers_work(p);

        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
         * between here and cgroup_post_fork() if an organisation operation is in
         * progress.
         */
        retval = cgroup_can_fork(p, args);
        if (retval)
                goto bad_fork_put_pidfd;

        /*
         * From this point on we must avoid any synchronous user-space
         * communication until we take the tasklist-lock. In particular, we do
         * not want user-space to be able to predict the process start-time by
         * stalling fork(2) after we recorded the start_time but before it is
         * visible to the system.
         */

        p->start_time = ktime_get_ns();
        p->start_boottime = ktime_get_boottime_ns();

        /*
         * Make it visible to the rest of the system, but dont wake it up yet.
         * Need tasklist lock for parent etc handling!
         */
        write_lock_irq(&tasklist_lock);

        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                p->real_parent = current->real_parent;
                p->parent_exec_id = current->parent_exec_id;
                if (clone_flags & CLONE_THREAD)
                        p->exit_signal = -1;
                else
                        p->exit_signal = current->group_leader->exit_signal;
        } else {
                p->real_parent = current;
                p->parent_exec_id = current->self_exec_id;
                p->exit_signal = args->exit_signal;
        }

        klp_copy_process(p);

        spin_lock(&current->sighand->siglock);

        rseq_fork(p, clone_flags);

        /* Don't start children in a dying pid namespace */
        if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
                retval = -ENOMEM;
                goto bad_fork_cancel_cgroup;
        }

        /* Let kill terminate clone/fork in the middle */
        if (fatal_signal_pending(current)) {
                retval = -EINTR;
                goto bad_fork_cancel_cgroup;
        }

        /* No more failure paths after this point. */

        /*
         * Copy seccomp details explicitly here, in case they were changed
         * before holding sighand lock.
         */
        copy_seccomp(p);

        init_task_pid_links(p);
        if (likely(p->pid)) {
                ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

                init_task_pid(p, PIDTYPE_PID, pid);
                if (thread_group_leader(p)) {
                        init_task_pid(p, PIDTYPE_TGID, pid);
                        init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
                        init_task_pid(p, PIDTYPE_SID, task_session(current));

                        if (is_child_reaper(pid)) {
                                ns_of_pid(pid)->child_reaper = p;
                                p->signal->flags |= SIGNAL_UNKILLABLE;
                        }
                        p->signal->shared_pending.signal = delayed.signal;
                        p->signal->tty = tty_kref_get(current->signal->tty);
                        /*
                         * Inherit has_child_subreaper flag under the same
                         * tasklist_lock with adding child to the process tree
                         * for propagate_has_child_subreaper optimization.
                         */
                        p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                                                         p->real_parent->signal->is_child_subreaper;
                        list_add_tail(&p->sibling, &p->real_parent->children);
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
                        attach_pid(p, PIDTYPE_TGID);
                        attach_pid(p, PIDTYPE_PGID);
                        attach_pid(p, PIDTYPE_SID);
                        __this_cpu_inc(process_counts);
                } else {
                        current->signal->nr_threads++;
                        atomic_inc(&current->signal->live);
                        refcount_inc(&current->signal->sigcnt);
                        task_join_group_stop(p);
                        list_add_tail_rcu(&p->thread_group,
                                          &p->group_leader->thread_group);
                        list_add_tail_rcu(&p->thread_node,
                                          &p->signal->thread_head);
                }
                attach_pid(p, PIDTYPE_PID);
                nr_threads++;
        }
        total_forks++;
        hlist_del_init(&delayed.node);
        spin_unlock(&current->sighand->siglock);
        syscall_tracepoint_update(p);
        write_unlock_irq(&tasklist_lock);

        if (pidfile)
                fd_install(pidfd, pidfile);

        proc_fork_connector(p);
        sched_post_fork(p, args);
        cgroup_post_fork(p, args);
        perf_event_fork(p);

        trace_task_newtask(p, clone_flags);
        uprobe_copy_process(p, clone_flags);

        copy_oom_score_adj(clone_flags, p);

        return p;

bad_fork_cancel_cgroup:
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
        if (clone_flags & CLONE_PIDFD) {
                fput(pidfile);
                put_unused_fd(pidfd);
        }
bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
bad_fork_cleanup_thread:
        exit_thread(p);
bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
bad_fork_cleanup_mm:
        if (p->mm) {
                mm_clear_owner(p->mm, p);
                mmput(p->mm);
        }
bad_fork_cleanup_signal:
        if (!(clone_flags & CLONE_THREAD))
                free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
        exit_sem(p);
bad_fork_cleanup_security:
        security_task_free(p);
bad_fork_cleanup_audit:
        audit_free(p);
bad_fork_cleanup_perf:
        perf_event_free_task(p);
bad_fork_cleanup_policy:
        lockdep_free_task(p);
#ifdef CONFIG_NUMA
        mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
        delayacct_tsk_free(p);
bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
        exit_creds(p);
bad_fork_free:
        p->state = TASK_DEAD;
        put_task_stack(p);
        delayed_free_task(p);
fork_out:
        spin_lock_irq(&current->sighand->siglock);
        hlist_del_init(&delayed.node);
        spin_unlock_irq(&current->sighand->siglock);
        return ERR_PTR(retval);
}

static inline void init_idle_pids(struct task_struct *idle)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
                INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
                init_task_pid(idle, type, &init_struct_pid);
        }
}

/*
 * Build the idle task for @cpu.
 *
 * The task is created through copy_process() with init_struct_pid and
 * CLONE_VM, allocated on the NUMA node of @cpu. On success its pid links
 * are pointed at init_struct_pid and init_idle() is run on it. On failure
 * the error pointer from copy_process() is returned unchanged.
 */
struct task_struct * __init fork_idle(int cpu)
{
        struct kernel_clone_args idle_args = {
                .flags = CLONE_VM,
        };
        struct task_struct *idle;

        idle = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &idle_args);
        if (IS_ERR(idle))
                return idle;

        init_idle_pids(idle);
        init_idle(idle, cpu);

        return idle;
}

/*
 * This is like kernel_clone(), but shaved down and tailored to just
 * creating io_uring workers. It returns a created task, or an error pointer.
 * The returned task is inactive, and the caller must fire it up through
 * wake_up_new_task(p). All signals are blocked in the created task.
 */
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
        unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
                                CLONE_IO;
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .stack                = (unsigned long)fn,
                .stack_size        = (unsigned long)arg,
                .io_thread        = 1,
        };

        return copy_process(NULL, 0, node, &args);
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 *
 * Returns the pid_vnr() value of the new task's struct pid on success.
 * Returns -EINVAL when CLONE_PIDFD and CLONE_PARENT_SETTID are both set
 * and point at the same user address, or the error carried by the pointer
 * returned from copy_process() when that fails.
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
        u64 clone_flags = args->flags;
        struct completion vfork;
        struct pid *pid;
        struct task_struct *p;
        int trace = 0;
        pid_t nr;

        /*
         * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
         * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
         * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
         * field in struct clone_args and it still doesn't make sense to have
         * them both point at the same memory location. Performing this check
         * here has the advantage that we don't need to have a separate helper
         * to check for legacy clone().
         */
        if ((args->flags & CLONE_PIDFD) &&
            (args->flags & CLONE_PARENT_SETTID) &&
            (args->pidfd == args->parent_tid))
                return -EINVAL;

        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
        if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if (args->exit_signal != SIGCHLD)
                        trace = PTRACE_EVENT_CLONE;
                else
                        trace = PTRACE_EVENT_FORK;

                /* Drop the event again unless it is enabled for current. */
                if (likely(!ptrace_event_enabled(current, trace)))
                        trace = 0;
        }

        p = copy_process(NULL, trace, NUMA_NO_NODE, args);
        /* Runs on both the success and the failure path of copy_process(). */
        add_latent_entropy();

        if (IS_ERR(p))
                return PTR_ERR(p);

        /*
         * Do this prior to waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
        trace_sched_process_fork(current, p);

        /* Look up the pid and its number before the task is woken below. */
        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        /* The put_user() result is not checked. */
        if (clone_flags & CLONE_PARENT_SETTID)
                put_user(nr, args->parent_tid);

        /*
         * For vfork, arm the on-stack completion before the child can run.
         * A task reference is taken here because p is used again in
         * wait_for_vfork_done() after the wakeup.
         */
        if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
                get_task_struct(p);
        }

        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
                ptrace_event_pid(trace, pid);

        /*
         * Wait for the vfork child; PTRACE_EVENT_VFORK_DONE is reported only
         * when wait_for_vfork_done() returns zero.
         */
        if (clone_flags & CLONE_VFORK) {
                if (!wait_for_vfork_done(p, &vfork))
                        ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        /* Pairs with get_task_pid() above. */
        put_pid(pid);
        return nr;
}

/*
 * Create a kernel thread.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .stack                = (unsigned long)fn,
                .stack_size        = (unsigned long)arg,
        };

        return kernel_clone(&args);
}

#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork() system call: clone with no flags and SIGCHLD as the exit signal.
 * Without CONFIG_MMU the call always fails with -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifndef CONFIG_MMU
        /* can not support in nommu mode */
        return -EINVAL;
#else
        struct kernel_clone_args fork_args = {
                .exit_signal = SIGCHLD,
        };

        return kernel_clone(&fork_args);
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
/*
 * vfork() system call: clone with CLONE_VFORK | CLONE_VM and SIGCHLD as
 * the exit signal.
 */
SYSCALL_DEFINE0(vfork)
{
        struct kernel_clone_args vfork_args = {
                .exit_signal    = SIGCHLD,
                .flags          = CLONE_VFORK | CLONE_VM,
        };

        return kernel_clone(&vfork_args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
/*
 * Legacy clone() system call. The argument order depends on which
 * CONFIG_CLONE_BACKWARDS* option (if any) is set; the body is shared.
 */
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 unsigned long, tls,
                 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
                int, stack_size,
                int __user *, parent_tidptr,
                int __user *, child_tidptr,
                unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#endif
{
        /* Only the low 32 bits of clone_flags are looked at. */
        unsigned int low_flags = lower_32_bits(clone_flags);
        struct kernel_clone_args clone_args = {
                /* The CSIGNAL bits carry the exit signal, the rest are flags. */
                .exit_signal    = (low_flags & CSIGNAL),
                .flags          = (low_flags & ~CSIGNAL),
                /* pidfd and parent_tid share the same user pointer here. */
                .pidfd          = parent_tidptr,
                .parent_tid     = parent_tidptr,
                .child_tid      = child_tidptr,
                .stack          = newsp,
                .tls            = tls,
        };

        return kernel_clone(&clone_args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE3

/*
 * Fetch a struct clone_args of @usize bytes from @uargs, validate it and
 * translate it into @kargs.
 *
 * On entry kargs->set_tid must point at a buffer with room for
 * MAX_PID_NS_LEVEL pid_t entries; the user's set_tid array is copied into
 * that buffer and the pointer is stored back into @kargs on success.
 *
 * Returns 0 on success, -E2BIG when @usize exceeds PAGE_SIZE, -EINVAL for
 * malformed arguments, -EFAULT when the set_tid array cannot be read, or
 * the error returned by copy_struct_from_user().
 */
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                                              struct clone_args __user *uargs,
                                              size_t usize)
{
        struct clone_args uclone;
        pid_t *tid_buf = kargs->set_tid;
        int ret;

        /* The versioned sizes must match the layout of struct clone_args. */
        BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
                     CLONE_ARGS_SIZE_VER0);
        BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
                     CLONE_ARGS_SIZE_VER1);
        BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
                     CLONE_ARGS_SIZE_VER2);
        BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
                return -EINVAL;

        ret = copy_struct_from_user(&uclone, sizeof(uclone), uargs, usize);
        if (ret)
                return ret;

        if (unlikely(uclone.set_tid_size > MAX_PID_NS_LEVEL))
                return -EINVAL;

        /* set_tid and set_tid_size must be given together or not at all. */
        if (unlikely(!uclone.set_tid != !uclone.set_tid_size))
                return -EINVAL;

        /*
         * Verify that higher 32bits of exit_signal are unset and that
         * it is a valid signal
         */
        if (unlikely((uclone.exit_signal & ~((u64)CSIGNAL)) ||
                     !valid_signal(uclone.exit_signal)))
                return -EINVAL;

        /* CLONE_INTO_CGROUP needs the VER2 layout and an int-sized fd. */
        if (uclone.flags & CLONE_INTO_CGROUP) {
                if (uclone.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)
                        return -EINVAL;
        }

        *kargs = (struct kernel_clone_args){
                .flags          = uclone.flags,
                .pidfd          = u64_to_user_ptr(uclone.pidfd),
                .child_tid      = u64_to_user_ptr(uclone.child_tid),
                .parent_tid     = u64_to_user_ptr(uclone.parent_tid),
                .exit_signal    = uclone.exit_signal,
                .stack          = uclone.stack,
                .stack_size     = uclone.stack_size,
                .tls            = uclone.tls,
                .set_tid_size   = uclone.set_tid_size,
                .cgroup         = uclone.cgroup,
        };

        if (uclone.set_tid) {
                if (copy_from_user(tid_buf, u64_to_user_ptr(uclone.set_tid),
                                   (kargs->set_tid_size * sizeof(pid_t))))
                        return -EFAULT;
        }

        /* The assignment above cleared set_tid; restore the caller's buffer. */
        kargs->set_tid = tid_buf;

        return 0;
}

/**
 * clone3_stack_valid - check and prepare stack
 * @kargs: kernel clone args
 *
 * Check that the stack description handed in by userspace is consistent:
 * either both stack and stack_size are zero, or both are non-zero and the
 * range passes access_ok(). Unless CONFIG_STACK_GROWSUP or CONFIG_IA64 is
 * defined, a valid non-zero stack is advanced by stack_size so that it
 * points at the end of the range.
 *
 * Return: true if the stack arguments are acceptable, false otherwise.
 */
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
        /* No stack supplied: a size on its own makes no sense. */
        if (!kargs->stack)
                return kargs->stack_size == 0;

        /* A stack without a size is rejected as well. */
        if (!kargs->stack_size)
                return false;

        if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
                return false;

#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
        kargs->stack += kargs->stack_size;
#endif

        return true;
}

/*
 * Validate the flag and exit_signal combination of a clone3() request and
 * then the stack arguments. The stack check runs last because it may
 * adjust kargs->stack. Returns true when everything is acceptable.
 */
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
        bool unknown_flags, reserved_bits, sighand_clash, stray_exit_signal;

        /* Verify that no unknown flags are passed along. */
        unknown_flags = (kargs->flags &
                         ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
                           CLONE_INTO_CGROUP)) != 0;

        /*
         * - make the CLONE_DETACHED bit reuseable for clone3
         * - make the CSIGNAL bits reuseable for clone3
         */
        reserved_bits = (kargs->flags &
                         (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME)))) != 0;

        /* CLONE_SIGHAND and CLONE_CLEAR_SIGHAND may not be combined. */
        sighand_clash = (kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
                        (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND);

        /* No exit_signal is accepted with CLONE_THREAD or CLONE_PARENT. */
        stray_exit_signal = (kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
                            kargs->exit_signal;

        if (unknown_flags || reserved_bits || sighand_clash ||
            stray_exit_signal)
                return false;

        return clone3_stack_valid(kargs);
}

/**
 * clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size:  size of @uargs
 *
 * clone3() is the extensible successor to clone()/clone2().
 * Its argument is a structure whose version is identified by its size.
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
        pid_t set_tid[MAX_PID_NS_LEVEL];
        struct kernel_clone_args kargs;
        int err;

        /* Storage for the set_tid array that the copy helper fills in. */
        kargs.set_tid = set_tid;

        err = copy_clone_args_from_user(&kargs, uargs, size);
        if (err)
                return err;

        return clone3_args_valid(&kargs) ? kernel_clone(&kargs) : -EINVAL;
}
#endif

/*
 * Walk the descendants of @top's thread group, calling @visitor on each
 * child task with @data, all under tasklist_lock held for reading.
 *
 * The visitor's return value steers the walk:
 *   < 0  stop the whole walk,
 *   > 0  descend: walk the children of the task just visited,
 *   0    do not descend, continue with the next sibling.
 */
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
        struct task_struct *leader, *parent, *child;
        int res;

        read_lock(&tasklist_lock);
        /* Always start from the group leader of @top. */
        leader = top = top->group_leader;
down:
        /* Visit the children of every thread in the current group. */
        for_each_thread(leader, parent) {
                list_for_each_entry(child, &parent->children, sibling) {
                        res = visitor(child, data);
                        if (res) {
                                if (res < 0)
                                        goto out;
                                /* Go one level down, starting at this child. */
                                leader = child;
                                goto down;
                        }
up:
                        /*
                         * Re-entry point when coming back up: child, parent
                         * and leader were restored below, so both loops
                         * carry on with the sibling after child.
                         */
                        ;
                }
        }

        /*
         * This level is exhausted. Unless we are back at the top, climb to
         * the level above and resume after the task we descended from.
         */
        if (leader != top) {
                child = leader;
                parent = child->real_parent;
                leader = parent->group_leader;
                goto up;
        }
out:
        read_unlock(&tasklist_lock);
}

/*
 * Fallback for when ARCH_MIN_MMSTRUCT_ALIGN has not been defined earlier:
 * use 0 as the alignment argument passed when the mm_struct cache is created.
 */
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

/*
 * Constructor passed to kmem_cache_create() for the sighand cache:
 * initialises the siglock and the signalfd wait queue of the object.
 */
static void sighand_ctor(void *data)
{
        struct sighand_struct *sh = (struct sighand_struct *)data;

        spin_lock_init(&sh->siglock);
        init_waitqueue_head(&sh->signalfd_wqh);
}

/*
 * Create the slab cache for mm_struct, with saved_auxv as the usercopy
 * region.
 */
void __init mm_cache_init(void)
{
        /*
         * The mm_cpumask is located at the end of mm_struct, and is
         * dynamically sized based on the maximum CPU number this system
         * can have, taking hotplug into account (nr_cpu_ids). Each object
         * therefore needs cpumask_size() bytes on top of the struct itself.
         */
        unsigned int mm_size = sizeof(struct mm_struct) + cpumask_size();

        mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size,
                        ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        offsetof(struct mm_struct, saved_auxv),
                        sizeof_field(struct mm_struct, saved_auxv),
                        NULL);
}

/*
 * Create the slab caches for sighand_struct, signal_struct, files_struct,
 * fs_struct and vm_area_struct, then run mmap_init() and
 * nsproxy_cache_init().
 */
void __init proc_caches_init(void)
{
        /*
         * The sighand cache is the only one created here with
         * SLAB_TYPESAFE_BY_RCU and with a constructor (sighand_ctor).
         */
        sighand_cachep = kmem_cache_create("sighand_cache",
                        sizeof(struct sighand_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
                        SLAB_ACCOUNT, sighand_ctor);
        signal_cachep = kmem_cache_create("signal_cache",
                        sizeof(struct signal_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        NULL);
        files_cachep = kmem_cache_create("files_cache",
                        sizeof(struct files_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        NULL);
        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                        NULL);

        /* Unlike the caches above, this one does not pass SLAB_HWCACHE_ALIGN. */
        vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
        mmap_init();
        nsproxy_cache_init();
}

/*
 * Validate the flags handed to the unshare system call.
 *
 * Returns 0 if the request is acceptable and -EINVAL otherwise.
 */
static int check_unshare_flags(unsigned long unshare_flags)
{
        const unsigned long supported =
                CLONE_THREAD | CLONE_FS | CLONE_NEWNS | CLONE_SIGHAND |
                CLONE_VM | CLONE_FILES | CLONE_SYSVSEM |
                CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET |
                CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWCGROUP |
                CLONE_NEWTIME;

        /* Anything outside the supported set is rejected outright. */
        if (unshare_flags & ~supported)
                return -EINVAL;

        /*
         * Not implemented, but pretend it works if there is nothing
         * to unshare.  Note that unsharing the address space or the
         * signal handlers also need to unshare the signal queues (aka
         * CLONE_THREAD).
         */
        if ((unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) &&
            !thread_group_empty(current))
                return -EINVAL;

        /* The sighand must not have more than one reference. */
        if ((unshare_flags & (CLONE_SIGHAND | CLONE_VM)) &&
            refcount_read(&current->sighand->count) > 1)
                return -EINVAL;

        /* For CLONE_VM the caller must be single threaded. */
        if ((unshare_flags & CLONE_VM) && !current_is_single_threaded())
                return -EINVAL;

        return 0;
}

/*
 * Give the caller a private copy of current's fs_struct when CLONE_FS is
 * requested and the structure has more than one user.
 *
 * *new_fsp is written only when a copy is attempted. Returns 0 when there
 * is nothing to do or the copy succeeded, and -ENOMEM when the copy failed.
 */
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
        struct fs_struct *cur_fs = current->fs;
        struct fs_struct *dup;

        if (!cur_fs || !(unshare_flags & CLONE_FS))
                return 0;

        /* don't need lock here; in the worst case we'll do useless copy */
        if (cur_fs->users == 1)
                return 0;

        dup = copy_fs_struct(cur_fs);
        *new_fsp = dup;

        return dup ? 0 : -ENOMEM;
}

/*
 * Unshare file descriptor table if it is being shared
 */
int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
               struct files_struct **new_fdp)
{
        struct files_struct *fd = current->files;
        int error = 0;

        if ((unshare_flags & CLONE_FILES) &&
            (fd && atomic_read(&fd->count) > 1)) {
                *new_fdp = dup_fd(fd, max_fds, &error);
                if (!*new_fdp)
                        return error;
        }

        return 0;
}

/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by kernel_clone() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 *
 * Returns 0 on success or the negative error of the first step that
 * failed. The cleanup labels at the end are shared by the success and
 * the failure paths.
 */
int ksys_unshare(unsigned long unshare_flags)
{
        struct fs_struct *fs, *new_fs = NULL;
        struct files_struct *fd, *new_fd = NULL;
        struct cred *new_cred = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
        int err;

        /*
         * If unsharing a user namespace must also unshare the thread group
         * and unshare the filesystem root and working directories.
         */
        if (unshare_flags & CLONE_NEWUSER)
                unshare_flags |= CLONE_THREAD | CLONE_FS;
        /*
         * If unsharing vm, must also unshare signal handlers.
         */
        if (unshare_flags & CLONE_VM)
                unshare_flags |= CLONE_SIGHAND;
        /*
         * If unsharing signal handlers, must also unshare the signal queues.
         */
        if (unshare_flags & CLONE_SIGHAND)
                unshare_flags |= CLONE_THREAD;
        /*
         * If unsharing namespace, must also unshare filesystem information.
         */
        if (unshare_flags & CLONE_NEWNS)
                unshare_flags |= CLONE_FS;

        /* Validate the flag set after the implied flags were added above. */
        err = check_unshare_flags(unshare_flags);
        if (err)
                goto bad_unshare_out;
        /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.
         */
        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                do_sysvsem = 1;
        /*
         * Prepare all replacement objects first; nothing is installed in
         * current until every one of these steps has succeeded.
         */
        err = unshare_fs(unshare_flags, &new_fs);
        if (err)
                goto bad_unshare_out;
        err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
        err = unshare_userns(unshare_flags, &new_cred);
        if (err)
                goto bad_unshare_cleanup_fd;
        err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
                                         new_cred, new_fs);
        if (err)
                goto bad_unshare_cleanup_cred;

        if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
                         */
                        exit_sem(current);
                }
                if (unshare_flags & CLONE_NEWIPC) {
                        /* Orphan segments in old ns (see sem above). */
                        exit_shm(current);
                        shm_init_task(current);
                }

                if (new_nsproxy)
                        switch_task_namespaces(current, new_nsproxy);

                task_lock(current);

                if (new_fs) {
                        /*
                         * Install the new fs_struct. Afterwards new_fs is
                         * NULL if the old one still has users, otherwise it
                         * holds the old one so the cleanup below frees it.
                         */
                        fs = current->fs;
                        spin_lock(&fs->lock);
                        current->fs = new_fs;
                        if (--fs->users)
                                new_fs = NULL;
                        else
                                new_fs = fs;
                        spin_unlock(&fs->lock);
                }

                if (new_fd) {
                        /*
                         * Swap in the new table; new_fd now refers to the
                         * old table, which is put in the cleanup below.
                         */
                        fd = current->files;
                        current->files = new_fd;
                        new_fd = fd;
                }

                task_unlock(current);

                if (new_cred) {
                        /* Install the new user namespace */
                        commit_creds(new_cred);
                        /* Cleared so the cleanup below does not put it. */
                        new_cred = NULL;
                }
        }

        perf_event_namespaces(current);

        /*
         * The success path falls through the labels below with err == 0,
         * releasing whatever new_cred, new_fd and new_fs still hold.
         */
bad_unshare_cleanup_cred:
        if (new_cred)
                put_cred(new_cred);
bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);

bad_unshare_cleanup_fs:
        if (new_fs)
                free_fs_struct(new_fs);

bad_unshare_out:
        return err;
}

/*
 * unshare system call entry point: passes the flags unchanged to
 * ksys_unshare() and returns its result.
 */
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
        return ksys_unshare(unshare_flags);
}

/*
 * Give the current task its own files_struct without exposing the
 * copy_files internals to the exec layer.
 *
 * unshare_fd() is asked for a copy; when it fails, or succeeds without
 * producing one, *@displaced is set to NULL and its return value is passed
 * back unchanged.  Otherwise *@displaced receives the task's previous
 * files_struct, the copy is installed under task_lock(), and 0 is returned.
 * NOTE(review): the caller presumably owns the reference handed back in
 * *@displaced - confirm against the call sites.
 */

int unshare_files(struct files_struct **displaced)
{
        struct files_struct *fresh = NULL;
        int rc = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &fresh);

        if (!rc && fresh) {
                struct task_struct *self = current;

                /* Hand the old table back before swapping in the copy. */
                *displaced = self->files;
                task_lock(self);
                self->files = fresh;
                task_unlock(self);
                return 0;
        }

        /* Error, or nothing needed copying: nothing was displaced. */
        *displaced = NULL;
        return rc;
}

/*
 * sysctl handler for max_threads.
 *
 * Works on a private copy of @table whose data pointer is redirected to a
 * local int and whose extra1/extra2 bounds are 1 and MAX_THREADS, then lets
 * proc_dointvec_minmax() do the parsing/formatting.  max_threads is only
 * updated when that call succeeded for a write; the helper's return value is
 * passed back to the caller.
 */
int sysctl_max_threads(struct ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
{
        int lower = 1;
        int upper = MAX_THREADS;
        int value = max_threads;
        struct ctl_table shadow = *table;
        int rc;

        shadow.data = &value;
        shadow.extra1 = &lower;
        shadow.extra2 = &upper;

        rc = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);
        if (!rc && write)
                max_threads = value;

        return rc;
}







































































































































    1 


















































































































































    1 













































































































































































































































































































































































































































































































































    1 








































































    1 


    1 
    1 















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>
#include <linux/kernel.h>

/*
 * Simple doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

/* Static initializer for a list_head named @name: both slots hold &@name. */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* Define a struct list_head variable @name initialized with LIST_HEAD_INIT(). */
#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)

/**
 * INIT_LIST_HEAD - Initialize a list_head structure
 * @list: list_head structure to be initialized.
 *
 * Initializes the list_head to point to itself.  If it is a list header,
 * the result is an empty list.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
        /* ->next is the field list_empty() samples with READ_ONCE(). */
        WRITE_ONCE(list->next, list);
        list->prev = list;
}

/*
 * Sanity checks consulted by __list_add() and __list_del_entry() before any
 * pointer is modified.  With CONFIG_DEBUG_LIST they are defined out of line;
 * otherwise the inline stubs below always report the operation as valid.
 */
#ifdef CONFIG_DEBUG_LIST
extern bool __list_add_valid(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next);
extern bool __list_del_entry_valid(struct list_head *entry);
#else
static inline bool __list_add_valid(struct list_head *new,
                                struct list_head *prev,
                                struct list_head *next)
{
        return true;
}
static inline bool __list_del_entry_valid(struct list_head *entry)
{
        return true;
}
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * If __list_add_valid() rejects the arguments the function returns
 * without touching any of the three nodes.
 */
static inline void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
{
        if (!__list_add_valid(new, prev, next))
                return;

        next->prev = new;
        new->next = next;
        new->prev = prev;
        /* Written last, after @new's own pointers have been filled in. */
        WRITE_ONCE(prev->next, new);
}

/**
 * list_add - insert an entry directly after a given node
 * @new: entry to insert
 * @head: node that @new will immediately follow
 *
 * When @head is the list head, @new becomes the first element, which
 * gives stack (LIFO) behaviour.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
        struct list_head *after = head->next;

        __list_add(new, head, after);
}


/**
 * list_add_tail - insert an entry directly before a given node
 * @new: entry to insert
 * @head: node that @new will immediately precede
 *
 * When @head is the list head, @new becomes the last element, which
 * gives queue (FIFO) behaviour.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
        struct list_head *before = head->prev;

        __list_add(new, before, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * The removed entry itself is not passed in and is therefore not modified.
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
        next->prev = prev;
        /* Forward link is updated with WRITE_ONCE(), matching READ_ONCE() readers of ->next. */
        WRITE_ONCE(prev->next, next);
}

/*
 * Unlink @entry from its list and set its 'prev' pointer to NULL.
 *
 * Special-purpose removal used in the networking code for lists allocated
 * as per-cpu, where the extra WRITE_ONCE() overhead of a regular
 * list_del_init() is not wanted.  Users must test the node's 'prev'
 * pointer instead of calling list_empty(); @entry->next is left as it was.
 */
static inline void __list_del_clearprev(struct list_head *entry)
{
        struct list_head *before = entry->prev;
        struct list_head *after = entry->next;

        __list_del(before, after);
        entry->prev = NULL;
}

/*
 * Unlink @entry from its neighbours, provided __list_del_entry_valid()
 * accepts it.  @entry's own next/prev pointers are not modified.
 */
static inline void __list_del_entry(struct list_head *entry)
{
        if (__list_del_entry_valid(entry))
                __list_del(entry->prev, entry->next);
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty() on entry does not return true after this, the entry is
 * in an undefined state.
 */
static inline void list_del(struct list_head *entry)
{
        __list_del_entry(entry);
        /*
         * Overwrite both links with the LIST_POISON markers (presumably so
         * that a later use of the stale entry is easy to recognise).
         */
        entry->next = LIST_POISON1;
        entry->prev = LIST_POISON2;
}

/**
 * list_replace - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 *
 * @old's own next/prev pointers are not reset by this function.
 */
static inline void list_replace(struct list_head *old,
                                struct list_head *new)
{
        /*
         * Each neighbour is reached through @new right after @new's pointer
         * is set.  The order matters when @old is an empty (self-pointing)
         * head: the second store then rewrites old->prev before it is read,
         * and @new ends up pointing at itself, i.e. as an empty list.
         */
        new->next = old->next;
        new->next->prev = new;
        new->prev = old->prev;
        new->prev->next = new;
}

/**
 * list_replace_init - replace old entry by new one and initialize the old one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 */
static inline void list_replace_init(struct list_head *old,
                                     struct list_head *new)
{
        list_replace(old, new);
        /* list_replace() leaves @old's pointers stale; make it an empty list. */
        INIT_LIST_HEAD(old);
}

/**
 * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
 * @entry1: the location to place entry2
 * @entry2: the location to place entry1
 */
static inline void list_swap(struct list_head *entry1,
                             struct list_head *entry2)
{
        /* Remember where @entry2 sat before it is unlinked. */
        struct list_head *pos = entry2->prev;

        list_del(entry2);
        list_replace(entry1, entry2);
        /*
         * If @entry1 was the node right before @entry2, it is no longer on
         * the list; @entry2 now occupies that slot and is the node to
         * insert after.
         */
        if (pos == entry1)
                pos = entry2;
        list_add(entry1, pos);
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * Afterwards @entry points to itself, so list_empty(@entry) is true.
 */
static inline void list_del_init(struct list_head *entry)
{
        __list_del_entry(entry);
        INIT_LIST_HEAD(entry);
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 *
 * @list is unlinked first and then inserted right after @head.
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
        __list_del_entry(list);
        list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 *
 * @list is unlinked first and then inserted right before @head.
 */
static inline void list_move_tail(struct list_head *list,
                                  struct list_head *head)
{
        __list_del_entry(list);
        list_add_tail(list, head);
}

/**
 * list_bulk_move_tail - move a subsection of a list to its tail
 * @head: the head that will follow our entry
 * @first: first entry to move
 * @last: last entry to move, can be the same as first
 *
 * Move all entries between @first and including @last before @head.
 * All three entries must belong to the same linked list.
 */
static inline void list_bulk_move_tail(struct list_head *head,
                                       struct list_head *first,
                                       struct list_head *last)
{
        /* Close the gap left by the [first, last] run. */
        first->prev->next = last->next;
        last->next->prev = first->prev;

        /* Attach @first behind the current tail ... */
        head->prev->next = first;
        first->prev = head->prev;

        /* ... and make @last the new tail in front of @head. */
        last->next = head;
        head->prev = last;
}

/**
 * list_is_first -- check whether @list is the entry right after @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Returns non-zero when @list's predecessor is @head.
 */
static inline int list_is_first(const struct list_head *list,
                                        const struct list_head *head)
{
        const struct list_head *before = list->prev;

        return before == head;
}

/**
 * list_is_last - check whether @list is the entry right before @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Returns non-zero when @list's successor is @head.
 */
static inline int list_is_last(const struct list_head *list,
                                const struct list_head *head)
{
        const struct list_head *after = list->next;

        return after == head;
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 *
 * Only @head->next is examined, and it is read with READ_ONCE().
 */
static inline int list_empty(const struct list_head *head)
{
        return READ_ONCE(head->next) == head;
}

/**
 * list_del_init_careful - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * This is the same as list_del_init(), except designed to be used
 * together with list_empty_careful() in a way to guarantee ordering
 * of other memory operations.
 *
 * Any memory operations done before a list_del_init_careful() are
 * guaranteed to be visible after a list_empty_careful() test.
 */
static inline void list_del_init_careful(struct list_head *entry)
{
        __list_del_entry(entry);
        entry->prev = entry;
        /* Release store of ->next comes last; list_empty_careful() acquires it. */
        smp_store_release(&entry->next, entry);
}

/**
 * list_empty_careful - tests whether a list is empty and not being modified
 * @head: the list to test
 *
 * Description:
 * tests whether a list is empty _and_ checks that no other CPU might be
 * in the process of modifying either member (next or prev)
 *
 * NOTE: using list_empty_careful() without synchronization
 * can only be safe if the only activity that can happen
 * to the list entry is list_del_init(). Eg. it cannot be used
 * if another CPU could re-list_add() it.
 */
static inline int list_empty_careful(const struct list_head *head)
{
        /* Acquire load of ->next; list_del_init_careful() stores it with release. */
        struct list_head *next = smp_load_acquire(&head->next);
        /* Empty only if both ->next and ->prev refer back to @head. */
        return (next == head) && (next == head->prev);
}

/**
 * list_rotate_left - rotate the list to the left
 * @head: the head of the list
 *
 * Moves the current first entry to the tail.  An empty list is left alone.
 */
static inline void list_rotate_left(struct list_head *head)
{
        if (list_empty(head))
                return;

        list_move_tail(head->next, head);
}

/**
 * list_rotate_to_front() - Rotate list to specific item.
 * @list: The desired new front of the list.
 * @head: The head of the list.
 *
 * Rotates list so that @list becomes the new front of the list.
 */
static inline void list_rotate_to_front(struct list_head *list,
                                        struct list_head *head)
{
        /*
         * Here the head node itself is the thing being moved: @head is
         * unlinked from its current position and re-inserted immediately
         * before @list, so @list becomes the first entry after @head.
         */
        list_move_tail(head, list);
}

/**
 * list_is_singular - check whether a list holds exactly one entry.
 * @head: the list to test.
 *
 * Returns non-zero when the list is non-empty and @head's first and last
 * links refer to the same node.
 */
static inline int list_is_singular(const struct list_head *head)
{
        if (list_empty(head))
                return 0;

        return head->next == head->prev;
}

/*
 * Move the run from @head's first entry up to and including @entry onto
 * @list, overwriting whatever @list pointed to.  No checks are done here;
 * list_cut_position() handles the empty and @entry == @head cases before
 * calling this.
 */
static inline void __list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        /* Saved up front because entry->next is overwritten below. */
        struct list_head *new_first = entry->next;
        list->next = head->next;
        list->next->prev = list;
        list->prev = entry;
        entry->next = list;
        head->next = new_first;
        new_first->prev = head;
}

/**
 * list_cut_position - cut a list into two
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *        and if so we won't cut the list
 *
 * Moves the leading part of @head, up to and including @entry, onto
 * @list.  @entry should be an element known to be on @head, and @list
 * should be empty or hold nothing you care about, since its previous
 * contents are lost.
 *
 * Nothing happens when @head is empty, or when @head has a single entry
 * and @entry is neither that entry nor @head.  When @entry is @head,
 * @list is simply initialised as an empty list.
 */
static inline void list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        if (list_empty(head))
                return;

        if (list_is_singular(head) &&
                !(head->next == entry || head == entry))
                return;

        if (entry != head) {
                __list_cut_position(list, head, entry);
                return;
        }

        INIT_LIST_HEAD(list);
}

/**
 * list_cut_before - cut a list into two, before given entry
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *
 * This helper moves the initial part of @head, up to but
 * excluding @entry, from @head to @list.  You should pass
 * in @entry an element you know is on @head.  @list should
 * be an empty list or a list you do not care about losing
 * its data.
 * If @entry == @head, all entries on @head are moved to
 * @list.
 */
static inline void list_cut_before(struct list_head *list,
                                   struct list_head *head,
                                   struct list_head *entry)
{
        /* @entry is already first (or @head is empty): nothing to move. */
        if (head->next == entry) {
                INIT_LIST_HEAD(list);
                return;
        }
        /* @list takes over the run head->next .. entry->prev. */
        list->next = head->next;
        list->next->prev = list;
        list->prev = entry->prev;
        list->prev->next = list;
        /* @entry becomes the first element left on @head. */
        head->next = entry;
        entry->prev = head;
}

/*
 * Link every entry of @list between @prev and @next.  @list itself is not
 * written to (it still points at the moved entries afterwards).  The
 * callers in this file check list_empty(@list) before calling.
 */
static inline void __list_splice(const struct list_head *list,
                                 struct list_head *prev,
                                 struct list_head *next)
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        first->prev = prev;
        prev->next = first;

        last->next = next;
        next->prev = last;
}

/**
 * list_splice - join two lists, this is designed for stacks
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right after @head.  An empty @list
 * is a no-op.  @list itself is not reinitialised.
 */
static inline void list_splice(const struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
}

/**
 * list_splice_tail - join two lists, each list being a queue
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right before @head.  An empty @list
 * is a no-op.  @list itself is not reinitialised.
 */
static inline void list_splice_tail(struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right after @head and @list is then
 * reinitialised.  An empty @list is left untouched.
 */
static inline void list_splice_init(struct list_head *list,
                                    struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
        INIT_LIST_HEAD(list);
}

/**
 * list_splice_tail_init - join two lists and reinitialise the emptied list
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Each of the lists is a queue: the entries of @list are inserted right
 * before @head and @list is then reinitialised.  An empty @list is left
 * untouched.
 */
static inline void list_splice_tail_init(struct list_head *list,
                                         struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
        INIT_LIST_HEAD(list);
}

/**
 * list_entry - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * This is a direct alias for container_of().
 */
#define list_entry(ptr, type, member) \
        container_of(ptr, type, member)

/**
 * list_first_entry - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note, that list is expected to be not empty.
 * No emptiness check is made: on an empty list (ptr)->next is the head
 * itself, so the result would be the container of the head, not an entry.
 */
#define list_first_entry(ptr, type, member) \
        list_entry((ptr)->next, type, member)

/**
 * list_last_entry - get the last element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note, that list is expected to be not empty.
 * No emptiness check is made: on an empty list (ptr)->prev is the head
 * itself, so the result would be the container of the head, not an entry.
 */
#define list_last_entry(ptr, type, member) \
        list_entry((ptr)->prev, type, member)

/**
 * list_first_entry_or_null - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 * @ptr is evaluated once and its ->next is sampled once with READ_ONCE();
 * the emptiness test and the returned entry both use that one sample.
 */
#define list_first_entry_or_null(ptr, type, member) ({ \
        struct list_head *head__ = (ptr); \
        struct list_head *pos__ = READ_ONCE(head__->next); \
        pos__ != head__ ? list_entry(pos__, type, member) : NULL; \
})

/**
 * list_next_entry - get the next element in list
 * @pos:        the type * to cursor
 * @member:        the name of the list_head within the struct.
 *
 * There is no check against the list head: called on the last entry this
 * yields the container of the head.  @pos is expanded twice.
 */
#define list_next_entry(pos, member) \
        list_entry((pos)->member.next, typeof(*(pos)), member)

/**
 * list_prev_entry - get the prev element in list
 * @pos:        the type * to cursor
 * @member:        the name of the list_head within the struct.
 *
 * There is no check against the list head: called on the first entry this
 * yields the container of the head.  @pos is expanded twice.
 */
#define list_prev_entry(pos, member) \
        list_entry((pos)->member.prev, typeof(*(pos)), member)

/**
 * list_for_each        -        iterate over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * The step reads pos->next after the body has run, so the body must not
 * unlink @pos; use list_for_each_safe() for that.  When the loop runs to
 * completion @pos equals @head.
 */
#define list_for_each(pos, head) \
        for (pos = (head)->next; pos != (head); pos = pos->next)

/**
 * list_for_each_continue - continue iteration over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Continue to iterate over a list, continuing after the current position.
 * @pos must already point at a node of the list; the first node visited is
 * pos->next.
 */
#define list_for_each_continue(pos, head) \
        for (pos = pos->next; pos != (head); pos = pos->next)

/**
 * list_for_each_prev        -        iterate over a list backwards
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * The step reads pos->prev after the body has run, so the body must not
 * unlink @pos; use list_for_each_prev_safe() for that.
 */
#define list_for_each_prev(pos, head) \
        for (pos = (head)->prev; pos != (head); pos = pos->prev)

/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * @n always holds the successor of @pos, fetched before the body runs, so
 * the body may unlink @pos.  Nothing here protects against @n itself being
 * removed by the body.
 */
#define list_for_each_safe(pos, n, head) \
        for (pos = (head)->next, n = pos->next; pos != (head); \
                pos = n, n = pos->next)

/**
 * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * @n always holds the predecessor of @pos, fetched before the body runs,
 * so the body may unlink @pos.
 */
#define list_for_each_prev_safe(pos, n, head) \
        for (pos = (head)->prev, n = pos->prev; \
             pos != (head); \
             pos = n, n = pos->prev)

/**
 * list_entry_is_head - test if the entry points to the head of the list
 * @pos:        the type * to cursor
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * True when the list_head embedded in @pos is @head itself, i.e. @pos is
 * the container-of-head value the entry iterators stop on rather than a
 * real entry.  @pos is parenthesized so that any pointer expression
 * (for example "base + 1") can be passed, as with list_next_entry().
 */
#define list_entry_is_head(pos, head, member)                                \
        (&(pos)->member == (head))

/**
 * list_for_each_entry        -        iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * The loop stops when @pos's embedded list_head is @head, so after a full
 * traversal @pos is the container of the head and not a valid entry.  The
 * body must not unlink @pos; use list_for_each_entry_safe() for that.
 */
#define list_for_each_entry(pos, head, member)                                \
        for (pos = list_first_entry(head, typeof(*pos), member);        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Starts at the last entry and follows ->prev.  After a full traversal
 * @pos is the container of the head and not a valid entry.
 */
#define list_for_each_entry_reverse(pos, head, member)                        \
        for (pos = list_last_entry(head, typeof(*pos), member);                \
             !list_entry_is_head(pos, head, member);                         \
             pos = list_prev_entry(pos, member))

/**
 * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
 * @pos:        the type * to use as a start point
 * @head:        the head of the list
 * @member:        the name of the list_head within the struct.
 *
 * Prepares a pos entry for use as a start point in list_for_each_entry_continue().
 * Yields @pos when it is non-NULL; otherwise yields the container of @head,
 * so that the continue iterator (which begins at the next entry) starts
 * with the first element of the list.  Uses the GNU "?:" shorthand.
 */
#define list_prepare_entry(pos, head, member) \
        ((pos) ? : list_entry(head, typeof(*pos), member))

/**
 * list_for_each_entry_continue - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position.
 * The entry @pos points at on entry is not visited; the first entry
 * visited is its successor.
 */
#define list_for_each_entry_continue(pos, head, member)                 \
        for (pos = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_continue_reverse - iterate backwards from the given point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Start to iterate over list of given type backwards, continuing after
 * the current position.
 * The entry @pos points at on entry is not visited; the first entry
 * visited is its predecessor.
 */
#define list_for_each_entry_continue_reverse(pos, head, member)                \
        for (pos = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_from - iterate over list of given type from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing from current position.
 * Unlike list_for_each_entry_continue(), the entry @pos points at on entry
 * is itself the first one visited.
 */
#define list_for_each_entry_from(pos, head, member)                         \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_from_reverse - iterate backwards over list of given type
 *                                    from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, continuing from current position.
 * The entry @pos points at on entry is itself the first one visited.
 */
#define list_for_each_entry_from_reverse(pos, head, member)                \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * @n is loaded with the successor of @pos before the body runs, so the
 * body may unlink @pos.  Nothing here protects against @n itself being
 * removed by the body.
 */
#define list_for_each_entry_safe(pos, n, head, member)                        \
        for (pos = list_first_entry(head, typeof(*pos), member),        \
                n = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_continue - continue list iteration safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing after current point,
 * safe against removal of list entry.
 * The entry @pos points at on entry is not visited.
 */
#define list_for_each_entry_safe_continue(pos, n, head, member)                 \
        for (pos = list_next_entry(pos, member),                                 \
                n = list_next_entry(pos, member);                                \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_from - iterate over list from current point safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type from current point, safe against
 * removal of list entry.
 * The entry @pos points at on entry is itself the first one visited.
 */
#define list_for_each_entry_safe_from(pos, n, head, member)                         \
        for (n = list_next_entry(pos, member);                                        \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, safe against removal
 * of list entry.
 * @n is loaded with the predecessor of @pos before the body runs.
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member)                \
        for (pos = list_last_entry(head, typeof(*pos), member),                \
                n = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_prev_entry(n, member))

/**
 * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
 * @pos:        the loop cursor used in the list_for_each_entry_safe loop
 * @n:                temporary storage used in list_for_each_entry_safe
 * @member:        the name of the list_head within the struct.
 *
 * list_safe_reset_next is not safe to use in general if the list may be
 * modified concurrently (e.g. the lock is dropped in the loop body). An
 * exception to this is if the cursor element (pos) is pinned in the list,
 * and list_safe_reset_next is called after re-taking the lock and before
 * completing the current iteration of the loop body.
 *
 * Expands to a plain assignment that reloads @n with the entry following
 * @pos.
 */
#define list_safe_reset_next(pos, n, member)                                \
        n = list_next_entry(pos, member)

/*
 * Doubly linked lists with a single pointer list head.
 * Mostly useful for hash tables where the two pointer list head is
 * too wasteful.
 * You lose the ability to access the tail in O(1).
 */

/* Static initializer for an empty hlist head (first == NULL). */
#define HLIST_HEAD_INIT { .first = NULL }
/* Define a struct hlist_head variable called @name, initialized empty. */
#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
/* Run-time initialization: make the head that @ptr points to empty. */
#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
/*
 * INIT_HLIST_NODE - put an hlist node into the unhashed state
 * @h: node to initialize
 *
 * Clears both link fields.  A NULL ->pprev is what hlist_unhashed() tests.
 */
static inline void INIT_HLIST_NODE(struct hlist_node *h)
{
        h->next = NULL;
        h->pprev = NULL;
}

/**
 * hlist_unhashed - Has node been removed from list and reinitialized?
 * @h: Node to be checked
 *
 * Not that not all removal functions will leave a node in unhashed
 * state.  For example, hlist_nulls_del_init_rcu() does leave the
 * node in unhashed state, but hlist_nulls_del() does not.
 */
static inline int hlist_unhashed(const struct hlist_node *h)
{
        return !h->pprev;
}

/**
 * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use
 * @h: Node to be checked
 *
 * This variant of hlist_unhashed() must be used in lockless contexts
 * to avoid potential load-tearing.  The READ_ONCE() is paired with the
 * various WRITE_ONCE() in hlist helpers that are defined below.
 */
static inline int hlist_unhashed_lockless(const struct hlist_node *h)
{
        return !READ_ONCE(h->pprev);
}

/**
 * hlist_empty - Is the specified hlist_head structure an empty hlist?
 * @h: Structure to check.
 */
static inline int hlist_empty(const struct hlist_head *h)
{
        return !READ_ONCE(h->first);
}

/*
 * __hlist_del - unlink a node from its hlist without touching the node
 * @n: node to unlink; its ->pprev must point at a valid link slot
 *
 * Makes whatever pointed at @n point at @n's successor instead, and fixes
 * up the successor's back-pointer.  @n->next and @n->pprev are left as they
 * were; callers decide whether to poison or reinitialize them.
 */
static inline void __hlist_del(struct hlist_node *n)
{
        struct hlist_node *next = n->next;
        struct hlist_node **pprev = n->pprev;

        /* Bypass @n: the previous link slot now refers to the successor. */
        WRITE_ONCE(*pprev, next);
        /* @n may have been the last node, in which case there is no successor. */
        if (next)
                WRITE_ONCE(next->pprev, pprev);
}

/**
 * hlist_del - Delete the specified hlist_node from its list
 * @n: Node to delete.
 *
 * Note that this function leaves the node in hashed state.  Use
 * hlist_del_init() or similar instead to unhash @n.
 *
 * After unlinking, @n->next is set to LIST_POISON1 and @n->pprev to
 * LIST_POISON2, so ->pprev is non-NULL and hlist_unhashed() reports false.
 */
static inline void hlist_del(struct hlist_node *n)
{
        __hlist_del(n);
        n->next = LIST_POISON1;
        n->pprev = LIST_POISON2;
}

/**
 * hlist_del_init - Delete the specified hlist_node from its list and initialize
 * @n: Node to delete.
 *
 * Note that this function leaves the node in unhashed state.  Calling it
 * on a node that is already unhashed does nothing.
 */
static inline void hlist_del_init(struct hlist_node *n)
{
        /* Nothing to unlink when the node is not on a list. */
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        INIT_HLIST_NODE(n);
}

/**
 * hlist_add_head - add a new entry at the beginning of the hlist
 * @n: new entry to be added
 * @h: hlist head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
        struct hlist_node *first = h->first;
        /* Link @n in front of the current first node (which may be NULL). */
        WRITE_ONCE(n->next, first);
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
        /* Publish @n as the new first node and point it back at the head. */
        WRITE_ONCE(h->first, n);
        WRITE_ONCE(n->pprev, &h->first);
}

/**
 * hlist_add_before - add a new entry before the one specified
 * @n: new entry to be added
 * @next: hlist node to add it before, which must be non-NULL
 *
 * @n takes over the link slot that used to point at @next, and @next's
 * back-pointer is redirected to @n->next.
 */
static inline void hlist_add_before(struct hlist_node *n,
                                    struct hlist_node *next)
{
        WRITE_ONCE(n->pprev, next->pprev);
        WRITE_ONCE(n->next, next);
        WRITE_ONCE(next->pprev, &n->next);
        /* Finally make the predecessor's link slot refer to @n. */
        WRITE_ONCE(*(n->pprev), n);
}

/**
 * hlist_add_behind - add a new entry after the one specified
 * @n: new entry to be added
 * @prev: hlist node to add it after, which must be non-NULL
 *
 * @n is spliced in between @prev and @prev's old successor, if there was
 * one.
 */
static inline void hlist_add_behind(struct hlist_node *n,
                                    struct hlist_node *prev)
{
        WRITE_ONCE(n->next, prev->next);
        WRITE_ONCE(prev->next, n);
        WRITE_ONCE(n->pprev, &prev->next);

        /* Fix up the old successor's back-pointer when @prev was not last. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/**
 * hlist_add_fake - create a fake hlist consisting of a single headless node
 * @n: Node to make a fake list out of
 *
 * This makes @n appear to be its own predecessor on a headless hlist.
 * The point of this is to allow things like hlist_del() to work correctly
 * in cases where there is no list.
 *
 * Only @n->pprev is written; @n->next is left unchanged.
 */
static inline void hlist_add_fake(struct hlist_node *n)
{
        n->pprev = &n->next;
}

/**
 * hlist_fake - Is this node a fake hlist?
 * @h: Node to check for being a self-referential fake hlist.
 *
 * Return: true when @h->pprev points at @h's own ->next field, which is the
 * state hlist_add_fake() sets up.
 */
static inline bool hlist_fake(struct hlist_node *h)
{
        struct hlist_node **own_slot = &h->next;

        return h->pprev == own_slot;
}

/**
 * hlist_is_singular_node - is node the only element of the specified hlist?
 * @n: Node to check for singularity.
 * @h: Header for potentially singular list.
 *
 * Check whether the node is the only node of the head without
 * accessing head, thus avoiding unnecessary cache misses.
 */
static inline bool
hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h)
{
        return !n->next && n->pprev == &h->first;
}

/**
 * hlist_move_list - Move an hlist
 * @old: hlist_head for old list.
 * @new: hlist_head for new list.
 *
 * Move a list from one list head to another. Fixup the pprev
 * reference of the first entry if it exists.
 */
static inline void hlist_move_list(struct hlist_head *old,
                                   struct hlist_head *new)
{
        new->first = old->first;
        if (new->first)
                new->first->pprev = &new->first;
        old->first = NULL;
}

/*
 * hlist_entry - get the struct that embeds an hlist_node
 * @ptr:    pointer to the embedded node
 * @type:   type of the containing struct
 * @member: name of the node field within @type
 *
 * Thin alias for container_of().
 */
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)

/**
 * hlist_for_each - iterate over the nodes of an hlist
 * @pos:  the node pointer to use as a loop cursor.
 * @head: the head for your list.
 *
 * Not safe against removal of @pos in the loop body: the next node is read
 * from @pos after the body has run.  @pos is parenthesized where it is
 * dereferenced so that any lvalue expression (for example *cursor_ptr) can
 * serve as the cursor, consistent with hlist_for_each_entry().
 */
#define hlist_for_each(pos, head) \
        for (pos = (head)->first; pos ; pos = (pos)->next)

/**
 * hlist_for_each_safe - iterate over the nodes of an hlist safe against removal
 * @pos:  the node pointer to use as a loop cursor.
 * @n:    another node pointer to use as temporary storage.
 * @head: the head for your list.
 *
 * The successor of @pos is saved in @n before the body runs, so the body
 * may unlink or overwrite @pos.  @pos is parenthesized where it is
 * dereferenced so that any lvalue expression (for example *cursor_ptr) can
 * serve as the cursor, consistent with hlist_for_each_entry().
 */
#define hlist_for_each_safe(pos, n, head) \
        for (pos = (head)->first; pos && ({ n = (pos)->next; 1; }); \
             pos = n)

/*
 * hlist_entry_safe - NULL-tolerant variant of hlist_entry()
 * @ptr:    pointer to the embedded node, may be NULL
 * @type:   type of the containing struct
 * @member: name of the node field within @type
 *
 * Evaluates @ptr once into a temporary and yields NULL when it is NULL,
 * otherwise the containing struct as computed by hlist_entry().
 */
#define hlist_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
        })

/**
 * hlist_for_each_entry        - iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * The loop ends when @pos becomes NULL, so @pos is NULL after a complete
 * traversal.  The successor is read from @pos after the body has run.
 */
#define hlist_for_each_entry(pos, head, member)                                \
        for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * @pos is dereferenced before the first iteration to find its successor,
 * so it must be a valid entry on entry; that entry itself is not visited.
 */
#define hlist_for_each_entry_continue(pos, member)                        \
        for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The entry @pos refers to on entry is the first one visited; a NULL @pos
 * makes the loop body run zero times.
 */
#define hlist_for_each_entry_from(pos, member)                                \
        for (; pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                a &struct hlist_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * The successor node of @pos is saved in @n before the body runs, so the
 * body may unlink or free @pos.  @pos is parenthesized wherever it is
 * dereferenced, matching hlist_for_each_entry(), so that any lvalue
 * expression (for example *cursor_ptr) can serve as the cursor.
 */
#define hlist_for_each_entry_safe(pos, n, head, member)                 \
        for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
             pos && ({ n = (pos)->member.next; 1; });                        \
             pos = hlist_entry_safe(n, typeof(*(pos)), member))

#endif


























    1 




















































    1 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_PREEMPT_H
#define __ASM_PREEMPT_H

#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>

DECLARE_PER_CPU(int, __preempt_count);

/* We use the MSB mostly because it's available */
#define PREEMPT_NEED_RESCHED        0x80000000

/*
 * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
 * that a decrement hitting 0 means we can and should reschedule.
 *
 * PREEMPT_ENABLED is therefore a count of zero with the (inverted) bit
 * set, i.e. "preemptible, no reschedule pending".
 */
#define PREEMPT_ENABLED        (0 + PREEMPT_NEED_RESCHED)

/*
 * Return this CPU's preempt count with the PREEMPT_NEED_RESCHED bit
 * stripped, so that existing users which treat any non-zero value as
 * "cannot preempt" are not confused by the folded-in flag.
 */
static __always_inline int preempt_count(void)
{
        int raw = raw_cpu_read_4(__preempt_count);

        return raw & ~PREEMPT_NEED_RESCHED;
}

/*
 * Replace the count bits of this CPU's __preempt_count with those of @pc
 * while keeping whatever PREEMPT_NEED_RESCHED state is currently stored.
 * The update is retried until the compare-and-exchange observes the value
 * that was read.
 */
static __always_inline void preempt_count_set(int pc)
{
        int cur, want;

        for (;;) {
                cur = raw_cpu_read_4(__preempt_count);
                /* Stored need-resched bit wins; everything else comes from @pc. */
                want = (cur & PREEMPT_NEED_RESCHED) |
                        (pc & ~PREEMPT_NEED_RESCHED);
                if (raw_cpu_cmpxchg_4(__preempt_count, cur, want) == cur)
                        break;
        }
}

/*
 * must be macros to avoid header recursion hell
 *
 * init_task_preempt_count() expands to an empty statement; @p is unused.
 * init_idle_preempt_count() stores PREEMPT_DISABLED into the
 * __preempt_count of CPU @cpu; @p is unused there as well.
 */
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
        per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)

/*
 * We fold the NEED_RESCHED bit into the preempt count such that
 * preempt_enable() can decrement and test for needing to reschedule with a
 * single instruction.
 *
 * We invert the actual bit, so that when the decrement hits 0 we know we both
 * need to resched (the bit is cleared) and can resched (no preempt count).
 */

/* Flag that a reschedule is needed: clears the inverted bit on this CPU. */
static __always_inline void set_preempt_need_resched(void)
{
        raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
}

/*
 * Withdraw the need-resched indication: because the bit is stored inverted,
 * "clear" means setting PREEMPT_NEED_RESCHED in this CPU's __preempt_count.
 */
static __always_inline void clear_preempt_need_resched(void)
{
        raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
}

/*
 * Report whether a reschedule has been flagged on this CPU.  The bit is
 * stored inverted, so the answer is true when PREEMPT_NEED_RESCHED is clear.
 */
static __always_inline bool test_preempt_need_resched(void)
{
        return (raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED) == 0;
}

/*
 * The various preempt_count add/sub methods
 */

/* Add @val to this CPU's __preempt_count. */
static __always_inline void __preempt_count_add(int val)
{
        raw_cpu_add_4(__preempt_count, val);
}

/* Subtract @val from this CPU's __preempt_count (implemented as adding -val). */
static __always_inline void __preempt_count_sub(int val)
{
        raw_cpu_add_4(__preempt_count, -val);
}

/*
 * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
 * a decrement which hits zero means we have no preempt_count and should
 * reschedule.
 *
 * Decrements this CPU's __preempt_count with a "decl" instruction and
 * returns the "e" (equal/zero) condition produced by it.
 */
static __always_inline bool __preempt_count_dec_and_test(void)
{
        return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
}

/*
 * Returns true when we need to resched and can (barring IRQ state).
 *
 * The raw per-CPU value, including the inverted PREEMPT_NEED_RESCHED bit,
 * is compared against @preempt_offset.  The branch is hinted as unlikely.
 * NOTE(review): this presumes callers pass an offset with the MSB clear, so
 * that equality implies the need-resched bit is clear - confirm at call sites.
 */
static __always_inline bool should_resched(int preempt_offset)
{
        return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
}

/*
 * Only with CONFIG_PREEMPTION: __preempt_schedule() and
 * __preempt_schedule_notrace() are emitted as inline-asm "call"
 * instructions to the respective *_thunk symbols rather than as ordinary C
 * calls.  ASM_CALL_CONSTRAINT is defined elsewhere.
 */
#ifdef CONFIG_PREEMPTION
  extern asmlinkage void preempt_schedule_thunk(void);
# define __preempt_schedule() \
        asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)

  extern asmlinkage void preempt_schedule(void);
  extern asmlinkage void preempt_schedule_notrace_thunk(void);
# define __preempt_schedule_notrace() \
        asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)

  extern asmlinkage void preempt_schedule_notrace(void);
#endif

#endif /* __ASM_PREEMPT_H */

































































    1 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Wrapper functions for accessing the file_struct fd array.
 */

#ifndef __LINUX_FILE_H
#define __LINUX_FILE_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/posix_types.h>
#include <linux/errno.h>

struct file;

extern void fput(struct file *);
extern void fput_many(struct file *, unsigned int);

struct file_operations;
struct task_struct;
struct vfsmount;
struct dentry;
struct inode;
struct path;
extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_clone(struct file *, int flags,
        const struct file_operations *);

/*
 * fput_light - conditionally drop a file reference
 * @file:        file whose reference may be dropped
 * @fput_needed: non-zero if fput() has to be called on @file
 */
static inline void fput_light(struct file *file, int fput_needed)
{
        if (!fput_needed)
                return;

        fput(file);
}

/*
 * A file pointer paired with FDPUT_* flag bits that say what has to be
 * undone when the caller is finished with it.
 */
struct fd {
        struct file *file;      /* the file */
        unsigned int flags;     /* FDPUT_* bits, see below */
};
/* fdput() calls fput() on ->file only when this bit is set. */
#define FDPUT_FPUT       1
/* fdput_pos() calls __f_unlock_pos() on ->file only when this bit is set. */
#define FDPUT_POS_UNLOCK 2

/*
 * fdput - release a struct fd
 * @fd: value to release
 *
 * Calls fput() on @fd.file only if FDPUT_FPUT is set in @fd.flags;
 * otherwise does nothing.
 */
static inline void fdput(struct fd fd)
{
        if (!(fd.flags & FDPUT_FPUT))
                return;

        fput(fd.file);
}

extern struct file *fget(unsigned int fd);
extern struct file *fget_many(unsigned int fd, unsigned int refs);
extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_task(struct task_struct *task, unsigned int fd);
extern unsigned long __fdget(unsigned int fd);
extern unsigned long __fdget_raw(unsigned int fd);
extern unsigned long __fdget_pos(unsigned int fd);
extern void __f_unlock_pos(struct file *);

static inline struct fd __to_fd(unsigned long v)
{
        return (struct fd){(struct file *)(v & ~3),v & 3};
}

/*
 * fdget - look up descriptor @fd via __fdget() and return the result
 * unpacked into a struct fd.
 */
static inline struct fd fdget(unsigned int fd)
{
        unsigned long packed = __fdget(fd);

        return __to_fd(packed);
}

/*
 * fdget_raw - look up descriptor @fd via __fdget_raw() and return the
 * result unpacked into a struct fd.
 */
static inline struct fd fdget_raw(unsigned int fd)
{
        unsigned long packed = __fdget_raw(fd);

        return __to_fd(packed);
}

/*
 * fdget_pos - look up descriptor @fd via __fdget_pos() and return the
 * result unpacked into a struct fd.  Release it with fdput_pos().
 *
 * Note that @fd is a signed int here while __fdget_pos() takes an
 * unsigned int; the value is converted implicitly at the call.
 */
static inline struct fd fdget_pos(int fd)
{
        unsigned long packed = __fdget_pos(fd);

        return __to_fd(packed);
}

/*
 * fdput_pos - release a struct fd, unlocking the position first if needed
 * @f: value to release
 *
 * If FDPUT_POS_UNLOCK is set in @f.flags, __f_unlock_pos() is called on
 * @f.file.  fdput() is then always called on @f.
 */
static inline void fdput_pos(struct fd f)
{
        if ((f.flags & FDPUT_POS_UNLOCK) != 0)
                __f_unlock_pos(f.file);

        fdput(f);
}

extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
extern void set_close_on_exec(unsigned int fd, int flag);
extern bool get_close_on_exec(unsigned int fd);
extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile);
extern int get_unused_fd_flags(unsigned flags);
extern void put_unused_fd(unsigned int fd);

extern void fd_install(unsigned int fd, struct file *file);

extern int __receive_fd(int fd, struct file *file, int __user *ufd,
                        unsigned int o_flags);
/*
 * receive_fd_user - __receive_fd() wrapper that requires a user pointer
 * @file:    file passed on to __receive_fd()
 * @ufd:     user-space int pointer; must not be NULL
 * @o_flags: flags passed on to __receive_fd()
 *
 * Return: -EFAULT when @ufd is NULL, otherwise whatever
 * __receive_fd(-1, @file, @ufd, @o_flags) returns.
 */
static inline int receive_fd_user(struct file *file, int __user *ufd,
                                  unsigned int o_flags)
{
        return ufd ? __receive_fd(-1, file, ufd, o_flags) : -EFAULT;
}
/*
 * receive_fd - __receive_fd() wrapper without a user pointer
 * @file:    file passed on to __receive_fd()
 * @o_flags: flags passed on to __receive_fd()
 *
 * Calls __receive_fd() with fd == -1 and ufd == NULL and returns its result.
 * NOTE(review): -1 presumably asks for a newly allocated descriptor - confirm
 * against the __receive_fd() definition.
 */
static inline int receive_fd(struct file *file, unsigned int o_flags)
{
        return __receive_fd(-1, file, NULL, o_flags);
}
/*
 * receive_fd_replace - __receive_fd() wrapper for a caller-chosen descriptor
 * @fd:      descriptor number passed on to __receive_fd()
 * @file:    file passed on to __receive_fd()
 * @o_flags: flags passed on to __receive_fd()
 *
 * Calls __receive_fd() with ufd == NULL and returns its result.
 */
static inline int receive_fd_replace(int fd, struct file *file, unsigned int o_flags)
{
        return __receive_fd(fd, file, NULL, o_flags);
}

extern void flush_delayed_fput(void);
extern void __fput_sync(struct file *);

extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;

#endif /* __LINUX_FILE_H */



































































































































































































































































    1 


    1 










    1 












    1 










    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 










    1 

    1 





















    1 





    1 
    1 

























































































    1 
    1 


    1 










































































































































































































    1 

    1 
    1 

    1 

    1 



    1 

    1 

    1 













    1 





    1 



















    1 
    1 





































































































































    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


















    1 


    1 






    1 
    1 





    1 





















































































































































































































































































































































































































































































































































































































































    1 




    1 
    1 












    1 




    1 


    1 


    1 










    1 






    1 








    1 
    1 










    1 
    1 

























































































































































    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  NSA Security-Enhanced Linux (SELinux) security module
 *
 *  This file contains the SELinux hook function implementations.
 *
 *  Authors:  Stephen Smalley, <sds@tycho.nsa.gov>
 *              Chris Vance, <cvance@nai.com>
 *              Wayne Salamon, <wsalamon@nai.com>
 *              James Morris <jmorris@redhat.com>
 *
 *  Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 *  Copyright (C) 2003-2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *                                           Eric Paris <eparis@redhat.com>
 *  Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 *                            <dgoeddel@trustedcs.com>
 *  Copyright (C) 2006, 2007, 2009 Hewlett-Packard Development Company, L.P.
 *        Paul Moore <paul@paul-moore.com>
 *  Copyright (C) 2007 Hitachi Software Engineering Co., Ltd.
 *                       Yuichi Nakamura <ynakam@hitachisoft.jp>
 *  Copyright (C) 2016 Mellanox Technologies
 */

#include <linux/init.h>
#include <linux/kd.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/tracehook.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/lsm_hooks.h>
#include <linux/xattr.h>
#include <linux/capability.h>
#include <linux/unistd.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/dcache.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/tty.h>
#include <net/icmp.h>
#include <net/ip.h>                /* for local_port_range[] */
#include <net/tcp.h>                /* struct or_callable used in sock_rcv_skb */
#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/netlabel.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>        /* for network interface checks */
#include <net/netlink.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/dccp.h>
#include <linux/sctp.h>
#include <net/sctp/structs.h>
#include <linux/quota.h>
#include <linux/un.h>                /* for Unix socket types */
#include <net/af_unix.h>        /* for Unix socket types */
#include <linux/parser.h>
#include <linux/nfs_mount.h>
#include <net/ipv6.h>
#include <linux/hugetlb.h>
#include <linux/personality.h>
#include <linux/audit.h>
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/posix-timers.h>
#include <linux/syslog.h>
#include <linux/user_namespace.h>
#include <linux/export.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/bpf.h>
#include <linux/kernfs.h>
#include <linux/stringhash.h>        /* for hashlen_string() */
#include <uapi/linux/mount.h>
#include <linux/fsnotify.h>
#include <linux/fanotify.h>

#include "avc.h"
#include "objsec.h"
#include "netif.h"
#include "netnode.h"
#include "netport.h"
#include "ibpkey.h"
#include "xfrm.h"
#include "netlabel.h"
#include "audit.h"
#include "avc_ss.h"

/*
 * The SELinux state object used by this file.  Its address is what the code
 * below passes to selinux_initialized(), avc_has_perm() and the security_*()
 * policy services.
 */
struct selinux_state selinux_state;

/* SECMARK reference count; read by selinux_secmark_enabled(). */
static atomic_t selinux_secmark_refcount = ATOMIC_INIT(0);

#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
static int selinux_enforcing_boot __initdata;

/*
 * Handler for the "enforcing=" boot parameter.  A value that parses as a
 * number is collapsed to 0 or 1 and stored in selinux_enforcing_boot; a value
 * that does not parse leaves the variable untouched.  Always returns 1.
 */
static int __init enforcing_setup(char *str)
{
        unsigned long value;

        if (kstrtoul(str, 0, &value) == 0)
                selinux_enforcing_boot = !!value;

        return 1;
}
__setup("enforcing=", enforcing_setup);
#else
/* Without the development option the boot-time enforcing value is fixed at 1. */
#define selinux_enforcing_boot 1
#endif

int selinux_enabled_boot __initdata = 1;
#ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM
/*
 * Handler for the "selinux=" boot parameter.  A numeric value is collapsed to
 * 0 or 1 and stored in selinux_enabled_boot; anything that fails to parse
 * leaves the default in place.  Always returns 1.
 */
static int __init selinux_enabled_setup(char *str)
{
        unsigned long value;

        if (kstrtoul(str, 0, &value) == 0)
                selinux_enabled_boot = !!value;

        return 1;
}
__setup("selinux=", selinux_enabled_setup);
#endif

static unsigned int selinux_checkreqprot_boot =
        CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE;

/*
 * Handler for the "checkreqprot=" boot parameter.  A numeric value is
 * collapsed to 0 or 1 and stored in selinux_checkreqprot_boot; a non-zero
 * value additionally logs a deprecation warning.  Input that does not parse
 * is ignored.  Always returns 1.
 */
static int __init checkreqprot_setup(char *str)
{
        unsigned long value;

        if (kstrtoul(str, 0, &value))
                return 1;

        selinux_checkreqprot_boot = !!value;
        if (value)
                pr_warn("SELinux: checkreqprot set to 1 via kernel parameter.  This is deprecated and will be rejected in a future kernel release.\n");

        return 1;
}
__setup("checkreqprot=", checkreqprot_setup);

/**
 * selinux_secmark_enabled - Report whether SECMARK is currently in effect
 *
 * Description:
 * SECMARK counts as enabled when the always_check_network policy capability
 * is set, or otherwise when the SECMARK reference counter is non-zero.
 * Returns 1 when enabled and 0 when disabled.
 *
 */
static int selinux_secmark_enabled(void)
{
        if (selinux_policycap_alwaysnetwork())
                return 1;

        return atomic_read(&selinux_secmark_refcount) != 0;
}

/**
 * selinux_peerlbl_enabled - Report whether peer labeling is currently in effect
 *
 * Description:
 * Peer labeling counts as enabled when the always_check_network policy
 * capability is set, or when either NetLabel or labeled IPSEC (xfrm) reports
 * itself enabled.  The checks are made in that order and stop at the first
 * hit.  Returns 1 when enabled and 0 when disabled.
 *
 */
static int selinux_peerlbl_enabled(void)
{
        if (selinux_policycap_alwaysnetwork())
                return 1;
        if (netlbl_enabled())
                return 1;

        return selinux_xfrm_enabled() ? 1 : 0;
}

/*
 * AVC callback: on AVC_CALLBACK_RESET flush the netif, netnode and netport
 * caches and then call synchronize_net().  Other events are ignored.
 * Always returns 0.
 */
static int selinux_netcache_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        sel_netif_flush();
        sel_netnode_flush();
        sel_netport_flush();
        synchronize_net();

        return 0;
}

/*
 * AVC callback: on AVC_CALLBACK_RESET flush the InfiniBand pkey cache and
 * raise LSM_POLICY_CHANGE on the blocking LSM notifier chain.  Other events
 * are ignored.  Always returns 0.
 */
static int selinux_lsm_notifier_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        sel_ib_pkey_flush();
        call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL);

        return 0;
}

/*
 * Set up the security blob of the current task's real credentials: both the
 * SID and the saved "old" SID become SECINITSID_KERNEL.
 */
static void cred_init_security(void)
{
        struct cred *real = (struct cred *) current->real_cred;
        struct task_security_struct *blob = selinux_cred(real);

        blob->sid = SECINITSID_KERNEL;
        blob->osid = SECINITSID_KERNEL;
}

/*
 * get the security ID of a set of credentials
 */
static inline u32 cred_sid(const struct cred *cred)
{
        const struct task_security_struct *tsec;

        tsec = selinux_cred(cred);
        return tsec->sid;
}

/*
 * Return the objective SID of @task.  The task's credentials are looked up
 * with __task_cred() and read inside an RCU read-side critical section.
 */
static inline u32 task_sid(const struct task_struct *task)
{
        u32 objective_sid;

        rcu_read_lock();
        objective_sid = cred_sid(__task_cred(task));
        rcu_read_unlock();

        return objective_sid;
}

static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry);

/*
 * Reload the security label of @inode if it is not marked LABEL_INITIALIZED.
 *
 * Nothing is done until selinux_initialized() reports true.  When a reload is
 * needed and @may_sleep is false the function returns -ECHILD instead of
 * reloading.  @dentry, which may be NULL, is handed to
 * inode_doinit_with_dentry() as the dentry to use for @inode.
 *
 * Returns 0 in every other case; the result of the reload is not propagated,
 * so a failed reload leaves the previous label in use.
 */
static int __inode_security_revalidate(struct inode *inode,
                                       struct dentry *dentry,
                                       bool may_sleep)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        might_sleep_if(may_sleep);

        if (!selinux_initialized(&selinux_state))
                return 0;
        if (isec->initialized == LABEL_INITIALIZED)
                return 0;
        if (!may_sleep)
                return -ECHILD;

        /* Best effort: on failure we simply keep using the old label. */
        inode_doinit_with_dentry(inode, dentry);
        return 0;
}

/*
 * Return the SELinux blob of @inode as-is, without attempting to revalidate
 * its label.
 */
static struct inode_security_struct *inode_security_novalidate(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        return isec;
}

/*
 * Return the SELinux blob of @inode after revalidating its label.  When @rcu
 * is true the revalidation is told it may not sleep, and an ERR_PTR() carrying
 * its error code is returned if it could not complete.
 */
static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu)
{
        int rc = __inode_security_revalidate(inode, NULL, !rcu);

        if (rc)
                return ERR_PTR(rc);

        return selinux_inode(inode);
}

/*
 * Return the SELinux blob of @inode, first giving the label a chance to be
 * reloaded.  Sleeping is permitted and no dentry hint is supplied.
 */
static struct inode_security_struct *inode_security(struct inode *inode)
{
        struct inode_security_struct *isec;

        /* The result is deliberately not checked here. */
        __inode_security_revalidate(inode, NULL, true);

        isec = selinux_inode(inode);
        return isec;
}

/*
 * Return the SELinux blob of the inode backing @dentry, without attempting to
 * revalidate its label.
 */
static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry)
{
        return selinux_inode(d_backing_inode(dentry));
}

/*
 * Return the SELinux blob of the inode backing @dentry, first giving the label
 * a chance to be reloaded.  Sleeping is permitted and @dentry itself is passed
 * along as the dentry to use.
 */
static struct inode_security_struct *backing_inode_security(struct dentry *dentry)
{
        struct inode *backing = d_backing_inode(dentry);

        /* The result is deliberately not checked here. */
        __inode_security_revalidate(backing, dentry, true);

        return selinux_inode(backing);
}

/*
 * Detach the SELinux blob of @inode from its superblock's list of inode
 * security structures, if it is on that list.  Does nothing when the inode
 * has no blob.
 */
static void inode_free_security(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        struct superblock_security_struct *sbsec;

        if (!isec)
                return;

        sbsec = inode->i_sb->s_security;

        /*
         * Many inode security structures are never put on a list, so test
         * for emptiness before taking the lock rather than paying for a
         * lock/unlock pair that would do nothing.
         *
         * list_del_init() may safely be called more than once.  This
         * function is not expected to run concurrently with list_add(), but
         * list_empty_careful() is used as a guard against future changes.
         */
        if (list_empty_careful(&isec->list))
                return;

        spin_lock(&sbsec->isec_lock);
        list_del_init(&isec->list);
        spin_unlock(&sbsec->isec_lock);
}

/*
 * Release the SELinux security structure attached to @sb.  The pointer in the
 * superblock is cleared before the memory is freed.
 */
static void superblock_free_security(struct super_block *sb)
{
        struct superblock_security_struct *doomed = sb->s_security;

        sb->s_security = NULL;
        kfree(doomed);
}

/*
 * Context strings collected from SELinux mount options.  A member is NULL
 * when the corresponding option was not given; the strings are released by
 * selinux_free_mnt_opts().
 */
struct selinux_mnt_opts {
        const char *fscontext;          /* value of "fscontext=" */
        const char *context;            /* value of "context=" */
        const char *rootcontext;        /* value of "rootcontext=" */
        const char *defcontext;         /* value of "defcontext=" */
};

/*
 * Free a struct selinux_mnt_opts together with the four context strings it
 * holds.  @mnt_opts is dereferenced unconditionally, so it must not be NULL.
 */
static void selinux_free_mnt_opts(void *mnt_opts)
{
        struct selinux_mnt_opts *mo = mnt_opts;

        kfree(mo->fscontext);
        kfree(mo->context);
        kfree(mo->rootcontext);
        kfree(mo->defcontext);

        kfree(mo);
}

/*
 * Identifiers for the SELinux mount options.  Opt_error (-1) marks "no such
 * option"; the real options are numbered consecutively from 0.
 */
enum {
        Opt_error = -1,
        Opt_context,            /* 0 */
        Opt_defcontext,         /* 1 */
        Opt_fscontext,          /* 2 */
        Opt_rootcontext,        /* 3 */
        Opt_seclabel,           /* 4 */
};

/*
 * Table of the SELinux mount option names.  For each entry: the option name,
 * its length without the trailing NUL, the matching Opt_* identifier, and
 * whether the option must be followed by "=<value>".
 *
 * The table is only ever read (by match_opt_prefix() and
 * selinux_add_mnt_opt()), so it is declared const.
 */
#define A(s, has_arg) {#s, sizeof(#s) - 1, Opt_##s, has_arg}
static const struct {
        const char *name;
        int len;
        int opt;
        bool has_arg;
} tokens[] = {
        A(context, true),
        A(fscontext, true),
        A(defcontext, true),
        A(rootcontext, true),
        A(seclabel, false),
};
#undef A

/*
 * Identify the SELinux mount option at the start of @s.
 *
 * @s:   option text, not necessarily NUL-terminated
 * @l:   number of bytes of @s that belong to this option
 * @arg: for options that take a value, set to the first byte after "name="
 *
 * An option that takes a value matches only when the name is followed by '='
 * within the @l bytes; an option without a value matches only when the name
 * spans exactly @l bytes.
 *
 * Returns the Opt_* identifier of the matching option, or Opt_error when
 * nothing matches.  *@arg is left untouched unless an option with a value
 * matched.
 */
static int match_opt_prefix(char *s, int l, char **arg)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(tokens); i++) {
                /*
                 * Keep the length signed, like @l.  Comparing an unsigned
                 * length against a negative @l would convert @l to a huge
                 * value, let the bounds check pass and make memcmp() read
                 * past the caller's buffer.
                 */
                int len = tokens[i].len;

                if (len > l || memcmp(s, tokens[i].name, len))
                        continue;
                if (tokens[i].has_arg) {
                        if (len == l || s[len] != '=')
                                continue;
                        *arg = s + len + 1;
                } else if (len != l)
                        continue;
                return tokens[i].opt;
        }
        return Opt_error;
}

/* Warning logged by selinux_add_opt() when it rejects a mount option. */
#define SEL_MOUNT_FAIL_MSG "SELinux:  duplicate or incompatible mount options\n"

/*
 * Check that the subject described by @cred may relabel a superblock to @sid.
 *
 * Two filesystem-class permissions are checked in order: RELABELFROM against
 * the superblock's current SID (@sbsec->sid), then RELABELTO against @sid.
 * Returns 0 when both are granted, otherwise the result of the first check
 * that failed.
 */
static int may_context_mount_sb_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
{
        const struct task_security_struct *subject = selinux_cred(cred);
        int rc;

        rc = avc_has_perm(&selinux_state,
                          subject->sid, sbsec->sid, SECCLASS_FILESYSTEM,
                          FILESYSTEM__RELABELFROM, NULL);
        if (rc != 0)
                return rc;

        return avc_has_perm(&selinux_state,
                            subject->sid, sid, SECCLASS_FILESYSTEM,
                            FILESYSTEM__RELABELTO, NULL);
}

/*
 * Check that inodes labeled @sid may be placed on the superblock described by
 * @sbsec on behalf of @cred.
 *
 * Two filesystem-class permissions are checked in order: RELABELFROM with the
 * subject's SID against the superblock SID, then ASSOCIATE with @sid as the
 * source against the superblock SID.  Returns 0 when both are granted,
 * otherwise the result of the first check that failed.
 */
static int may_context_mount_inode_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
{
        const struct task_security_struct *subject = selinux_cred(cred);
        int rc;

        rc = avc_has_perm(&selinux_state,
                          subject->sid, sbsec->sid, SECCLASS_FILESYSTEM,
                          FILESYSTEM__RELABELFROM, NULL);
        if (rc != 0)
                return rc;

        return avc_has_perm(&selinux_state,
                            sid, sbsec->sid, SECCLASS_FILESYSTEM,
                            FILESYSTEM__ASSOCIATE, NULL);
}

/*
 * Report whether @sb belongs to one of the genfs-labeled filesystem types that
 * get special handling (genfs, but also an in-core setxattr handler).
 *
 * sysfs, pstore, debugfs, tracefs and rootfs always qualify; cgroup and
 * cgroup2 qualify only when the cgroupseclabel policy capability is enabled.
 * Returns 1 for a match and 0 otherwise.
 */
static int selinux_is_genfs_special_handling(struct super_block *sb)
{
        const char *fstype = sb->s_type->name;

        if (!strcmp(fstype, "sysfs") ||
            !strcmp(fstype, "pstore") ||
            !strcmp(fstype, "debugfs") ||
            !strcmp(fstype, "tracefs") ||
            !strcmp(fstype, "rootfs"))
                return 1;

        if (!selinux_policycap_cgroupseclabel())
                return 0;

        return !strcmp(fstype, "cgroup") || !strcmp(fstype, "cgroup2");
}

/*
 * Decide, from the superblock's labeling behavior, whether SBLABEL_MNT should
 * be set for @sb.  Returns 1 for the xattr, transition, task and native
 * behaviors, defers to selinux_is_genfs_special_handling() for genfs, and
 * returns 0 for everything else.
 */
static int selinux_is_sblabel_mnt(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = sb->s_security;

        /*
         * IMPORTANT: Double-check logic in this function when adding a new
         * SECURITY_FS_USE_* definition!
         */
        BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7);

        switch (sbsec->behavior) {
        case SECURITY_FS_USE_GENFS:
                return selinux_is_genfs_special_handling(sb);

        case SECURITY_FS_USE_XATTR:
        case SECURITY_FS_USE_TRANS:
        case SECURITY_FS_USE_TASK:
        case SECURITY_FS_USE_NATIVE:
                return 1;

        default:
                /*
                 * Covers SECURITY_FS_USE_MNTPOINT (never allow relabeling on
                 * context mounts), SECURITY_FS_USE_NONE and any other value.
                 */
                return 0;
        }
}

/*
 * Finish setting up the SELinux state of @sb once its labeling behavior has
 * been decided.
 *
 * For xattr-labeled filesystems this first verifies that the root inode
 * supports xattrs and that reading the SELinux xattr from it does not fail
 * with anything other than -ENODATA.  It then marks the superblock
 * initialized, sets or clears SBLABEL_MNT, labels the root inode and finally
 * labels every inode that was queued on sbsec->isec_head.
 *
 * Returns a negative errno if the xattr checks fail, in which case
 * SE_SBINITIALIZED is not set.  Otherwise returns the result of labeling the
 * root inode; the results of labeling the queued inodes are not reported.
 */
static int sb_finish_set_opts(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = sb->s_security;
        struct dentry *root = sb->s_root;
        struct inode *root_inode = d_backing_inode(root);
        int rc = 0;

        if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
                /* Make sure that the xattr handler exists and that no
                   error other than -ENODATA is returned by getxattr on
                   the root directory.  -ENODATA is ok, as this may be
                   the first boot of the SELinux kernel before we have
                   assigned xattr values to the filesystem. */
                if (!(root_inode->i_opflags & IOP_XATTR)) {
                        pr_warn("SELinux: (dev %s, type %s) has no "
                               "xattr support\n", sb->s_id, sb->s_type->name);
                        rc = -EOPNOTSUPP;
                        goto out;
                }

                /* Size-only probe: no buffer is supplied. */
                rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0);
                if (rc < 0 && rc != -ENODATA) {
                        if (rc == -EOPNOTSUPP)
                                pr_warn("SELinux: (dev %s, type "
                                       "%s) has no security xattr handler\n",
                                       sb->s_id, sb->s_type->name);
                        else
                                pr_warn("SELinux: (dev %s, type "
                                       "%s) getxattr errno %d\n", sb->s_id,
                                       sb->s_type->name, -rc);
                        goto out;
                }
        }

        sbsec->flags |= SE_SBINITIALIZED;

        /*
         * Explicitly set or clear SBLABEL_MNT.  It's not sufficient to simply
         * leave the flag untouched because sb_clone_mnt_opts might be handing
         * us a superblock that needs the flag to be cleared.
         */
        if (selinux_is_sblabel_mnt(sb))
                sbsec->flags |= SBLABEL_MNT;
        else
                sbsec->flags &= ~SBLABEL_MNT;

        /* Initialize the root inode. */
        rc = inode_doinit_with_dentry(root_inode, root);

        /* Initialize any other inodes associated with the superblock, e.g.
           inodes created prior to initial policy load or inodes created
           during get_sb by a pseudo filesystem that directly
           populates itself. */
        spin_lock(&sbsec->isec_lock);
        while (!list_empty(&sbsec->isec_head)) {
                struct inode_security_struct *isec =
                                list_first_entry(&sbsec->isec_head,
                                           struct inode_security_struct, list);
                struct inode *inode = isec->inode;
                /*
                 * Take the entry off the list while still holding the lock,
                 * then drop the lock for the labeling work below (presumably
                 * because that work may sleep -- TODO confirm).  The lock is
                 * retaken before the list is looked at again.
                 */
                list_del_init(&isec->list);
                spin_unlock(&sbsec->isec_lock);
                /* Only label the inode if a reference could be obtained. */
                inode = igrab(inode);
                if (inode) {
                        if (!IS_PRIVATE(inode))
                                inode_doinit_with_dentry(inode, NULL);
                        iput(inode);
                }
                spin_lock(&sbsec->isec_lock);
        }
        spin_unlock(&sbsec->isec_lock);
out:
        return rc;
}

/*
 * Check one mount option against what is already recorded for a superblock.
 *
 * @sbsec:   superblock security state
 * @flag:    the *_MNT flag bit of the option being checked
 * @old_sid: SID currently recorded for that option
 * @new_sid: SID requested by this mount
 *
 * If the superblock is already initialized, the option is bad unless the same
 * option was used before with the same SID.  If it is not yet initialized,
 * the option is bad when its flag is already set, i.e. the same option was
 * passed twice (context=a,context=b).
 *
 * Returns 1 when the option must be rejected and 0 when it is acceptable.
 */
static int bad_option(struct superblock_security_struct *sbsec, char flag,
                      u32 old_sid, u32 new_sid)
{
        char mnt_flags = sbsec->flags & SE_MNTMASK;

        if (sbsec->flags & SE_SBINITIALIZED) {
                /* must repeat what the earlier mount asked for */
                if (!(sbsec->flags & flag))
                        return 1;
                return old_sid != new_sid;
        }

        /* first mount: reject an option that was already seen */
        return (mnt_flags & flag) ? 1 : 0;
}

/*
 * Convert the context string @s to a SID stored in *@sid.  On failure a
 * warning naming the string and the device and type of @sb is logged.
 * Returns the result of security_context_str_to_sid().
 */
static int parse_sid(struct super_block *sb, const char *s, u32 *sid)
{
        int rc;

        rc = security_context_str_to_sid(&selinux_state, s, sid, GFP_KERNEL);
        if (!rc)
                return 0;

        pr_warn("SELinux: security_context_str_to_sid(%s) failed for (dev %s, type %s) errno=%d\n",
                s, sb->s_id, sb->s_type->name, rc);
        return rc;
}

/*
 * Allow filesystems with binary mount data to explicitly set mount point
 * labeling information.
 *
 * @sb:             superblock being set up
 * @mnt_opts:       struct selinux_mnt_opts with the requested context strings,
 *                  or NULL when no SELinux options were given
 * @kern_flags:     flags from the caller; only SECURITY_LSM_NATIVE_LABELS is
 *                  examined here
 * @set_kern_flags: receives SECURITY_LSM_NATIVE_LABELS when native labeling
 *                  is selected; must be non-NULL if @kern_flags is non-zero
 *
 * The whole function runs with sbsec->lock held.
 *
 * Returns 0 on success.  Returns -EINVAL when options are supplied before the
 * policy is loaded, when @kern_flags is given without @set_kern_flags, when
 * the options conflict with an earlier mount of the same superblock, or when
 * defcontext is used with an unsuitable labeling behavior; -EACCES when
 * contexts are supplied for a user-namespace mount of a filesystem type that
 * is not tmpfs, ramfs or devpts; otherwise the error of whichever helper
 * failed.
 *
 * NOTE(review): the *_MNT bits are ORed into sbsec->flags while the options
 * are being parsed, before the permission checks further down; a failure on
 * one of those later paths appears to leave the bits set -- confirm whether
 * callers rely on or tolerate that.
 */
static int selinux_set_mnt_opts(struct super_block *sb,
                                void *mnt_opts,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        const struct cred *cred = current_cred();
        struct superblock_security_struct *sbsec = sb->s_security;
        struct dentry *root = sbsec->sb->s_root;
        struct selinux_mnt_opts *opts = mnt_opts;
        struct inode_security_struct *root_isec;
        u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0;
        u32 defcontext_sid = 0;
        int rc = 0;

        mutex_lock(&sbsec->lock);

        if (!selinux_initialized(&selinux_state)) {
                if (!opts) {
                        /* Defer initialization until selinux_complete_init,
                           after the initial policy is loaded and the security
                           server is ready to handle calls. */
                        goto out;
                }
                rc = -EINVAL;
                pr_warn("SELinux: Unable to set superblock options "
                        "before the security server is initialized\n");
                goto out;
        }
        if (kern_flags && !set_kern_flags) {
                /* Specifying internal flags without providing a place to
                 * place the results is not allowed */
                rc = -EINVAL;
                goto out;
        }

        /*
         * Binary mount data FS will come through this function twice.  Once
         * from an explicit call and once from the generic calls from the vfs.
         * Since the generic VFS calls will not contain any security mount data
         * we need to skip the double mount verification.
         *
         * This does open a hole in which we will not notice if the first
         * mount using this sb set explicit options and a second mount using
         * this sb does not set any security options.  (The first options
         * will be used for both mounts)
         */
        if ((sbsec->flags & SE_SBINITIALIZED) && (sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)
            && !opts)
                goto out;

        root_isec = backing_inode_security_novalidate(root);

        /*
         * parse the mount options, check if they are valid sids.
         * also check if someone is trying to mount the same sb more
         * than once with different security options.
         */
        if (opts) {
                if (opts->fscontext) {
                        rc = parse_sid(sb, opts->fscontext, &fscontext_sid);
                        if (rc)
                                goto out;
                        if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid,
                                        fscontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= FSCONTEXT_MNT;
                }
                if (opts->context) {
                        rc = parse_sid(sb, opts->context, &context_sid);
                        if (rc)
                                goto out;
                        if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                                        context_sid))
                                goto out_double_mount;
                        sbsec->flags |= CONTEXT_MNT;
                }
                if (opts->rootcontext) {
                        rc = parse_sid(sb, opts->rootcontext, &rootcontext_sid);
                        if (rc)
                                goto out;
                        if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                                        rootcontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= ROOTCONTEXT_MNT;
                }
                if (opts->defcontext) {
                        rc = parse_sid(sb, opts->defcontext, &defcontext_sid);
                        if (rc)
                                goto out;
                        if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid,
                                        defcontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= DEFCONTEXT_MNT;
                }
        }

        /*
         * Superblock already set up by an earlier mount: the options were
         * only being compared above, so there is nothing further to apply.
         */
        if (sbsec->flags & SE_SBINITIALIZED) {
                /* previously mounted with options, but not on this attempt? */
                if ((sbsec->flags & SE_MNTMASK) && !opts)
                        goto out_double_mount;
                rc = 0;
                goto out;
        }

        /* Record per-filesystem-type flags, keyed on the type name. */
        if (strcmp(sb->s_type->name, "proc") == 0)
                sbsec->flags |= SE_SBPROC | SE_SBGENFS;

        if (!strcmp(sb->s_type->name, "debugfs") ||
            !strcmp(sb->s_type->name, "tracefs") ||
            !strcmp(sb->s_type->name, "binder") ||
            !strcmp(sb->s_type->name, "bpf") ||
            !strcmp(sb->s_type->name, "pstore"))
                sbsec->flags |= SE_SBGENFS;

        if (!strcmp(sb->s_type->name, "sysfs") ||
            !strcmp(sb->s_type->name, "cgroup") ||
            !strcmp(sb->s_type->name, "cgroup2"))
                sbsec->flags |= SE_SBGENFS | SE_SBGENFS_XATTR;

        if (!sbsec->behavior) {
                /*
                 * Determine the labeling behavior to use for this
                 * filesystem type.
                 */
                rc = security_fs_use(&selinux_state, sb);
                if (rc) {
                        pr_warn("%s: security_fs_use(%s) returned %d\n",
                                        __func__, sb->s_type->name, rc);
                        goto out;
                }
        }

        /*
         * If this is a user namespace mount and the filesystem type is not
         * explicitly whitelisted, then no contexts are allowed on the command
         * line and security labels must be ignored.
         */
        if (sb->s_user_ns != &init_user_ns &&
            strcmp(sb->s_type->name, "tmpfs") &&
            strcmp(sb->s_type->name, "ramfs") &&
            strcmp(sb->s_type->name, "devpts")) {
                if (context_sid || fscontext_sid || rootcontext_sid ||
                    defcontext_sid) {
                        rc = -EACCES;
                        goto out;
                }
                /*
                 * An xattr-labeled filesystem is switched to mount-point
                 * labeling, with the mount-point SID computed as a transition
                 * from the current task's SID to itself for class file.
                 */
                if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
                        sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
                        rc = security_transition_sid(&selinux_state,
                                                     current_sid(),
                                                     current_sid(),
                                                     SECCLASS_FILE, NULL,
                                                     &sbsec->mntpoint_sid);
                        if (rc)
                                goto out;
                }
                goto out_set_opts;
        }

        /* sets the context of the superblock for the fs being mounted. */
        if (fscontext_sid) {
                rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred);
                if (rc)
                        goto out;

                sbsec->sid = fscontext_sid;
        }

        /*
         * Switch to using mount point labeling behavior.
         * sets the label used on all file below the mountpoint, and will set
         * the superblock context if not already set.
         */
        if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !context_sid) {
                sbsec->behavior = SECURITY_FS_USE_NATIVE;
                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
        }

        if (context_sid) {
                if (!fscontext_sid) {
                        rc = may_context_mount_sb_relabel(context_sid, sbsec,
                                                          cred);
                        if (rc)
                                goto out;
                        sbsec->sid = context_sid;
                } else {
                        rc = may_context_mount_inode_relabel(context_sid, sbsec,
                                                             cred);
                        if (rc)
                                goto out;
                }
                /* context= also labels the root inode unless rootcontext= was given */
                if (!rootcontext_sid)
                        rootcontext_sid = context_sid;

                sbsec->mntpoint_sid = context_sid;
                sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
        }

        if (rootcontext_sid) {
                rc = may_context_mount_inode_relabel(rootcontext_sid, sbsec,
                                                     cred);
                if (rc)
                        goto out;

                root_isec->sid = rootcontext_sid;
                root_isec->initialized = LABEL_INITIALIZED;
        }

        if (defcontext_sid) {
                if (sbsec->behavior != SECURITY_FS_USE_XATTR &&
                        sbsec->behavior != SECURITY_FS_USE_NATIVE) {
                        rc = -EINVAL;
                        pr_warn("SELinux: defcontext option is "
                               "invalid for this filesystem type\n");
                        goto out;
                }

                /* No permission check when the default SID is unchanged. */
                if (defcontext_sid != sbsec->def_sid) {
                        rc = may_context_mount_inode_relabel(defcontext_sid,
                                                             sbsec, cred);
                        if (rc)
                                goto out;
                }

                sbsec->def_sid = defcontext_sid;
        }

out_set_opts:
        rc = sb_finish_set_opts(sb);
out:
        mutex_unlock(&sbsec->lock);
        return rc;
out_double_mount:
        /* Reached with the lock still held; jumps back to "out" to drop it. */
        rc = -EINVAL;
        pr_warn("SELinux: mount invalid.  Same superblock, different "
               "security settings for (dev %s, type %s)\n", sb->s_id,
               sb->s_type->name);
        goto out;
}

/*
 * Compare the SELinux mount settings of two superblocks.
 *
 * The SE_MNTMASK flag bits must be identical, and for every context option
 * set on @oldsb the corresponding SID (superblock, mount point, default, or
 * root inode) must be the same on @newsb.
 *
 * Returns 0 when the settings agree; otherwise logs a warning naming @newsb
 * and returns -EBUSY.
 */
static int selinux_cmp_sb_context(const struct super_block *oldsb,
                                    const struct super_block *newsb)
{
        struct superblock_security_struct *osec = oldsb->s_security;
        struct superblock_security_struct *nsec = newsb->s_security;
        char oflags = osec->flags & SE_MNTMASK;
        char nflags = nsec->flags & SE_MNTMASK;

        if (oflags != nflags ||
            ((oflags & FSCONTEXT_MNT) && osec->sid != nsec->sid) ||
            ((oflags & CONTEXT_MNT) &&
             osec->mntpoint_sid != nsec->mntpoint_sid) ||
            ((oflags & DEFCONTEXT_MNT) && osec->def_sid != nsec->def_sid))
                goto mismatch;

        if (oflags & ROOTCONTEXT_MNT) {
                struct inode_security_struct *oroot = backing_inode_security(oldsb->s_root);
                struct inode_security_struct *nroot = backing_inode_security(newsb->s_root);

                if (oroot->sid != nroot->sid)
                        goto mismatch;
        }

        return 0;

mismatch:
        pr_warn("SELinux: mount invalid.  Same superblock, different security settings for (dev %s, type %s)\n",
                newsb->s_id, newsb->s_type->name);
        return -EBUSY;
}

/*
 * Copy the SELinux mount settings of @oldsb onto @newsb.
 *
 * @oldsb:          superblock to copy from; must already be initialized
 * @newsb:          superblock to set up
 * @kern_flags:     flags from the caller; only SECURITY_LSM_NATIVE_LABELS is
 *                  examined here
 * @set_kern_flags: receives SECURITY_LSM_NATIVE_LABELS when native labeling
 *                  is selected; must be non-NULL if @kern_flags is non-zero
 *
 * If @newsb is already initialized nothing is copied; the two superblocks are
 * only compared with selinux_cmp_sb_context().  Otherwise the flags, SIDs and
 * labeling behavior are copied under newsbsec->lock and the setup is finished
 * with sb_finish_set_opts().
 *
 * Returns 0 before the policy is loaded and on success, -EINVAL when
 * @kern_flags is given without @set_kern_flags, the result of
 * selinux_cmp_sb_context() for an already-initialized @newsb, or the error
 * from security_fs_use().
 */
static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb,
                                        struct super_block *newsb,
                                        unsigned long kern_flags,
                                        unsigned long *set_kern_flags)
{
        int rc = 0;
        const struct superblock_security_struct *oldsbsec = oldsb->s_security;
        struct superblock_security_struct *newsbsec = newsb->s_security;

        /* Which context options the old superblock was mounted with. */
        int set_fscontext =        (oldsbsec->flags & FSCONTEXT_MNT);
        int set_context =        (oldsbsec->flags & CONTEXT_MNT);
        int set_rootcontext =        (oldsbsec->flags & ROOTCONTEXT_MNT);

        /*
         * if the parent was able to be mounted it clearly had no special lsm
         * mount options.  thus we can safely deal with this superblock later
         */
        if (!selinux_initialized(&selinux_state))
                return 0;

        /*
         * Specifying internal flags without providing a place to
         * place the results is not allowed.
         */
        if (kern_flags && !set_kern_flags)
                return -EINVAL;

        /* how can we clone if the old one wasn't set up?? */
        BUG_ON(!(oldsbsec->flags & SE_SBINITIALIZED));

        /* if fs is reusing a sb, make sure that the contexts match */
        if (newsbsec->flags & SE_SBINITIALIZED) {
                if ((kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context)
                        *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
                return selinux_cmp_sb_context(oldsb, newsb);
        }

        mutex_lock(&newsbsec->lock);

        newsbsec->flags = oldsbsec->flags;

        newsbsec->sid = oldsbsec->sid;
        newsbsec->def_sid = oldsbsec->def_sid;
        newsbsec->behavior = oldsbsec->behavior;

        /*
         * The old superblock used native labels but this clone was not asked
         * to: have security_fs_use() work out the behavior for the new
         * superblock instead.
         */
        if (newsbsec->behavior == SECURITY_FS_USE_NATIVE &&
                !(kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) {
                rc = security_fs_use(&selinux_state, newsb);
                if (rc)
                        goto out;
        }

        if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !set_context) {
                newsbsec->behavior = SECURITY_FS_USE_NATIVE;
                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
        }

        /*
         * context= on the old mount: the mount-point SID also becomes the
         * superblock SID and the root inode SID unless fscontext= or
         * rootcontext= were given as well.
         */
        if (set_context) {
                u32 sid = oldsbsec->mntpoint_sid;

                if (!set_fscontext)
                        newsbsec->sid = sid;
                if (!set_rootcontext) {
                        struct inode_security_struct *newisec = backing_inode_security(newsb->s_root);
                        newisec->sid = sid;
                }
                newsbsec->mntpoint_sid = sid;
        }
        if (set_rootcontext) {
                const struct inode_security_struct *oldisec = backing_inode_security(oldsb->s_root);
                struct inode_security_struct *newisec = backing_inode_security(newsb->s_root);

                newisec->sid = oldisec->sid;
        }

        /*
         * NOTE(review): the return value of sb_finish_set_opts() is dropped
         * here, so a failure in it still yields rc == 0 -- confirm that this
         * is intended.
         */
        sb_finish_set_opts(newsb);
out:
        mutex_unlock(&newsbsec->lock);
        return rc;
}

/*
 * Record one parsed SELinux mount option.
 *
 * @token:    Opt_* identifier of the option
 * @s:        the option's value; on success the pointer itself is stored in
 *            the options structure, it is not copied
 * @mnt_opts: in/out pointer to the struct selinux_mnt_opts being built; the
 *            structure is allocated here when *@mnt_opts is NULL
 *
 * Opt_seclabel is accepted and ignored.  An option may be given only once,
 * and context and defcontext exclude each other.
 *
 * Returns 0 on success, -ENOMEM when @s is NULL or the allocation fails, and
 * -EINVAL (after logging SEL_MOUNT_FAIL_MSG) for a duplicate or conflicting
 * option.  On -EINVAL a structure allocated by this very call is freed and
 * *@mnt_opts reset to NULL; a pre-existing structure is left untouched.
 */
static int selinux_add_opt(int token, const char *s, void **mnt_opts)
{
        struct selinux_mnt_opts *opts = *mnt_opts;
        bool allocated_here = false;
        const char **slot;
        bool clash;

        if (token == Opt_seclabel)        /* eaten and completely ignored */
                return 0;

        if (!s)
                return -ENOMEM;

        if (!opts) {
                opts = kzalloc(sizeof(*opts), GFP_KERNEL);
                if (!opts)
                        return -ENOMEM;
                *mnt_opts = opts;
                allocated_here = true;
        }

        /* Pick the member to fill and work out whether it may be filled. */
        switch (token) {
        case Opt_context:
                clash = opts->context || opts->defcontext;
                slot = &opts->context;
                break;
        case Opt_fscontext:
                clash = !!opts->fscontext;
                slot = &opts->fscontext;
                break;
        case Opt_rootcontext:
                clash = !!opts->rootcontext;
                slot = &opts->rootcontext;
                break;
        case Opt_defcontext:
                clash = opts->context || opts->defcontext;
                slot = &opts->defcontext;
                break;
        default:
                /* any other token is accepted without storing anything */
                return 0;
        }

        if (!clash) {
                *slot = s;
                return 0;
        }

        if (allocated_here) {
                kfree(opts);
                *mnt_opts = NULL;
        }
        pr_warn(SEL_MOUNT_FAIL_MSG);
        return -EINVAL;
}

/*
 * Look up a mount option by name and add it, with a private copy of its
 * value, to the option set at *mnt_opts.
 *
 * @option:   option name, compared against the entries of tokens[].
 * @val:      option value; @len bytes are duplicated with kmemdup_nul()
 *            unless the option is seclabel.
 * @len:      number of bytes of @val to duplicate.
 * @mnt_opts: in/out pointer to the option set being built.
 *
 * Returns 0 on success, -EINVAL if @option matches no token, -ENOMEM if the
 * value cannot be duplicated, or the error from selinux_add_opt().  On the
 * latter two failures any option set at *mnt_opts is freed and *mnt_opts is
 * set to NULL.
 */
static int selinux_add_mnt_opt(const char *option, const char *val, int len,
                               void **mnt_opts)
{
        int token = Opt_error;
        int rc, i;

        for (i = 0; i < ARRAY_SIZE(tokens); i++) {
                if (strcmp(option, tokens[i].name) != 0)
                        continue;
                token = tokens[i].opt;
                break;
        }

        if (token == Opt_error)
                return -EINVAL;

        if (token != Opt_seclabel) {
                /* From here on val refers to our own copy. */
                val = kmemdup_nul(val, len, GFP_KERNEL);
                if (!val) {
                        rc = -ENOMEM;
                        goto drop_opts;
                }
        }

        rc = selinux_add_opt(token, val, mnt_opts);
        if (likely(!rc))
                return 0;

        /* The copy was not taken over by the option set; release it. */
        kfree(val);

drop_opts:
        if (*mnt_opts) {
                selinux_free_mnt_opts(*mnt_opts);
                *mnt_opts = NULL;
        }
        return rc;
}

/*
 * Print "=<context>" for @sid into @m.
 *
 * The context string is obtained from security_sid_to_context().  It is
 * wrapped in double quotes when it contains a comma, and the characters
 * '"', newline and backslash are passed to seq_escape() for escaping.
 *
 * Returns the result of security_sid_to_context(); nothing is printed when
 * that result is non-zero.
 */
static int show_sid(struct seq_file *m, u32 sid)
{
        char *context = NULL;
        bool quoted;
        u32 len;
        int rc;

        rc = security_sid_to_context(&selinux_state, sid, &context, &len);
        if (rc)
                goto out;

        quoted = context && strchr(context, ',');

        seq_putc(m, '=');
        if (quoted)
                seq_putc(m, '\"');
        seq_escape(m, context, "\"\n\\");
        if (quoted)
                seq_putc(m, '\"');

out:
        kfree(context);
        return rc;
}

/*
 * Write the SELinux mount options recorded for @sb into @m.
 *
 * Each option is emitted as ",<name>" followed by the output of show_sid()
 * for the corresponding SID; the seclabel flag is emitted as a bare
 * ",<name>".  Nothing is written unless the superblock is SE_SBINITIALIZED
 * and selinux_initialized() reports true.
 *
 * Returns 0 on success or the first non-zero result of show_sid().
 */
static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        struct superblock_security_struct *sbsec = sb->s_security;
        int rc;

        if (!(sbsec->flags & SE_SBINITIALIZED) ||
            !selinux_initialized(&selinux_state))
                return 0;

        /* fscontext= */
        if (sbsec->flags & FSCONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, FSCONTEXT_STR);
                rc = show_sid(m, sbsec->sid);
                if (rc)
                        return rc;
        }

        /* context= */
        if (sbsec->flags & CONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, CONTEXT_STR);
                rc = show_sid(m, sbsec->mntpoint_sid);
                if (rc)
                        return rc;
        }

        /* defcontext= */
        if (sbsec->flags & DEFCONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, DEFCONTEXT_STR);
                rc = show_sid(m, sbsec->def_sid);
                if (rc)
                        return rc;
        }

        /* rootcontext=: taken from the label of the root dentry's inode. */
        if (sbsec->flags & ROOTCONTEXT_MNT) {
                struct dentry *root_dentry = sbsec->sb->s_root;
                struct inode_security_struct *root_isec =
                        backing_inode_security(root_dentry);

                seq_putc(m, ',');
                seq_puts(m, ROOTCONTEXT_STR);
                rc = show_sid(m, root_isec->sid);
                if (rc)
                        return rc;
        }

        /* seclabel has no value. */
        if (sbsec->flags & SBLABEL_MNT) {
                seq_putc(m, ',');
                seq_puts(m, SECLABEL_STR);
        }

        return 0;
}

/*
 * Map the file-type bits of @mode (mode & S_IFMT) to the matching SELinux
 * security class.  Any type not listed below yields SECCLASS_FILE.
 */
static inline u16 inode_mode_to_security_class(umode_t mode)
{
        u16 sclass = SECCLASS_FILE;

        switch (mode & S_IFMT) {
        case S_IFSOCK:
                sclass = SECCLASS_SOCK_FILE;
                break;
        case S_IFLNK:
                sclass = SECCLASS_LNK_FILE;
                break;
        case S_IFREG:
                sclass = SECCLASS_FILE;
                break;
        case S_IFBLK:
                sclass = SECCLASS_BLK_FILE;
                break;
        case S_IFDIR:
                sclass = SECCLASS_DIR;
                break;
        case S_IFCHR:
                sclass = SECCLASS_CHR_FILE;
                break;
        case S_IFIFO:
                sclass = SECCLASS_FIFO_FILE;
                break;
        }

        return sclass;
}

/*
 * Return 1 if @protocol is IPPROTO_IP or IPPROTO_TCP, otherwise 0.
 */
static inline int default_protocol_stream(int protocol)
{
        switch (protocol) {
        case IPPROTO_IP:
        case IPPROTO_TCP:
                return 1;
        default:
                return 0;
        }
}

/*
 * Return 1 if @protocol is IPPROTO_IP or IPPROTO_UDP, otherwise 0.
 */
static inline int default_protocol_dgram(int protocol)
{
        switch (protocol) {
        case IPPROTO_IP:
        case IPPROTO_UDP:
                return 1;
        default:
                return 0;
        }
}

static inline u16 socket_type_to_security_class(int family, int type, int protocol)
{
        int extsockclass = selinux_policycap_extsockclass();

        switch (family) {
        case PF_UNIX:
                switch (type) {
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
                        return SECCLASS_UNIX_STREAM_SOCKET;
                case SOCK_DGRAM:
                case SOCK_RAW:
                        return SECCLASS_UNIX_DGRAM_SOCKET;
                }
                break;
        case PF_INET:
        case PF_INET6:
                switch (type) {
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
                        if (default_protocol_stream(protocol))
                                return SECCLASS_TCP_SOCKET;
                        else if (extsockclass && protocol == IPPROTO_SCTP)
                                return SECCLASS_SCTP_SOCKET;
                        else
                                return SECCLASS_RAWIP_SOCKET;
                case SOCK_DGRAM:
                        if (default_protocol_dgram(protocol))
                                return SECCLASS_UDP_SOCKET;
                        else if (extsockclass && (protocol == IPPROTO_ICMP ||
                                                  protocol == IPPROTO_ICMPV6))
                                return SECCLASS_ICMP_SOCKET;
                        else
                                return SECCLASS_RAWIP_SOCKET;
                case SOCK_DCCP:
                        return SECCLASS_DCCP_SOCKET;
                default:
                        return SECCLASS_RAWIP_SOCKET;
                }
                break;
        case PF_NETLINK:
                switch (protocol) {
                case NETLINK_ROUTE:
                        return SECCLASS_NETLINK_ROUTE_SOCKET;
                case NETLINK_SOCK_DIAG:
                        return SECCLASS_NETLINK_TCPDIAG_SOCKET;
                case NETLINK_NFLOG:
                        return SECCLASS_NETLINK_NFLOG_SOCKET;
                case NETLINK_XFRM:
                        return SECCLASS_NETLINK_XFRM_SOCKET;
                case NETLINK_SELINUX:
                        return SECCLASS_NETLINK_SELINUX_SOCKET;
                case NETLINK_ISCSI:
                        return SECCLASS_NETLINK_ISCSI_SOCKET;
                case NETLINK_AUDIT:
                        return SECCLASS_NETLINK_AUDIT_SOCKET;
                case NETLINK_FIB_LOOKUP:
                        return SECCLASS_NETLINK_FIB_LOOKUP_SOCKET;
                case NETLINK_CONNECTOR:
                        return SECCLASS_NETLINK_CONNECTOR_SOCKET;
                case NETLINK_NETFILTER:
                        return SECCLASS_NETLINK_NETFILTER_SOCKET;
                case NETLINK_DNRTMSG:
                        return SECCLASS_NETLINK_DNRT_SOCKET;
                case NETLINK_KOBJECT_UEVENT:
                        return SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET;
                case NETLINK_GENERIC:
                        return SECCLASS_NETLINK_GENERIC_SOCKET;
                case NETLINK_SCSITRANSPORT:
                        return SECCLASS_NETLINK_SCSITRANSPORT_SOCKET;
                case NETLINK_RDMA:
                        return SECCLASS_NETLINK_RDMA_SOCKET;
                case NETLINK_CRYPTO:
                        return SECCLASS_NETLINK_CRYPTO_SOCKET;
                default:
                        return SECCLASS_NETLINK_SOCKET;
                }
        case PF_PACKET:
                return SECCLASS_PACKET_SOCKET;
        case PF_KEY:
                return SECCLASS_KEY_SOCKET;
        case PF_APPLETALK:
                return SECCLASS_APPLETALK_SOCKET;
        }

        if (extsockclass) {
                switch (family) {
                case PF_AX25:
                        return SECCLASS_AX25_SOCKET;
                case PF_IPX:
                        return SECCLASS_IPX_SOCKET;
                case PF_NETROM:
                        return SECCLASS_NETROM_SOCKET;
                case PF_ATMPVC:
                        return SECCLASS_ATMPVC_SOCKET;
                case PF_X25:
                        return SECCLASS_X25_SOCKET;
                case PF_ROSE:
                        return SECCLASS_ROSE_SOCKET;
                case PF_DECnet:
                        return SECCLASS_DECNET_SOCKET;
                case PF_ATMSVC:
                        return SECCLASS_ATMSVC_SOCKET;
                case PF_RDS:
                        return SECCLASS_RDS_SOCKET;
                case PF_IRDA:
                        return SECCLASS_IRDA_SOCKET;
                case PF_PPPOX:
                        return SECCLASS_PPPOX_SOCKET;
                case PF_LLC:
                        return SECCLASS_LLC_SOCKET;
                case PF_CAN:
                        return SECCLASS_CAN_SOCKET;
                case PF_TIPC:
                        return SECCLASS_TIPC_SOCKET;
                case PF_BLUETOOTH:
                        return SECCLASS_BLUETOOTH_SOCKET;
                case PF_IUCV:
                        return SECCLASS_IUCV_SOCKET;
                case PF_RXRPC:
                        return SECCLASS_RXRPC_SOCKET;
                case PF_ISDN:
                        return SECCLASS_ISDN_SOCKET;
                case PF_PHONET:
                        return SECCLASS_PHONET_SOCKET;
                case PF_IEEE802154:
                        return SECCLASS_IEEE802154_SOCKET;
                case PF_CAIF:
                        return SECCLASS_CAIF_SOCKET;
                case PF_ALG:
                        return SECCLASS_ALG_SOCKET;
                case PF_NFC:
                        return SECCLASS_NFC_SOCKET;
                case PF_VSOCK:
                        return SECCLASS_VSOCK_SOCKET;
                case PF_KCM:
                        return SECCLASS_KCM_SOCKET;
                case PF_QIPCRTR:
                        return SECCLASS_QIPCRTR_SOCKET;
                case PF_SMC:
                        return SECCLASS_SMC_SOCKET;
                case PF_XDP:
                        return SECCLASS_XDP_SOCKET;
#if PF_MAX > 45
#error New address family defined, please update this function.
#endif
                }
        }

        return SECCLASS_SOCKET;
}

/*
 * Look up the SID that security_genfs_sid() assigns to the path of @dentry
 * on its filesystem type.
 *
 * @dentry: dentry whose path (via dentry_path_raw()) is looked up.
 * @tclass: security class passed to security_genfs_sid().
 * @flags:  superblock flags; SE_SBPROC enables stripping of a leading
 *          numeric path component.
 * @sid:    out: resulting SID, SECINITSID_UNLABELED when the lookup
 *          returns -ENOENT.
 *
 * Returns 0 on success, -ENOMEM if no scratch page is available, the error
 * encoded in the dentry_path_raw() result, or any other error from
 * security_genfs_sid().
 */
static int selinux_genfs_get_sid(struct dentry *dentry,
                                 u16 tclass,
                                 u16 flags,
                                 u32 *sid)
{
        struct super_block *sb = dentry->d_sb;
        char *page, *path;
        int rc;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        path = dentry_path_raw(dentry, page, PAGE_SIZE);
        if (IS_ERR(path)) {
                rc = PTR_ERR(path);
                goto out_free;
        }

        if (flags & SE_SBPROC) {
                /*
                 * Each process gets a /proc/PID/ entry.  Strip off the PID
                 * part to get a valid selinux labeling,
                 * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs
                 */
                for (; path[1] >= '0' && path[1] <= '9'; path++)
                        path[1] = '/';
        }

        rc = security_genfs_sid(&selinux_state, sb->s_type->name,
                                path, tclass, sid);
        if (rc == -ENOENT) {
                /* No match in policy, mark as unlabeled. */
                *sid = SECINITSID_UNLABELED;
                rc = 0;
        }

out_free:
        free_page((unsigned long)page);
        return rc;
}

/*
 * Derive the SID of an inode from its SELinux security xattr.
 *
 * @inode:   inode being labeled.
 * @dentry:  dentry handed to __vfs_getxattr() together with @inode.
 * @def_sid: SID stored in *@sid when the xattr is absent (-ENODATA); also
 *           passed as the default to security_context_to_sid_default().
 * @sid:     out: resulting SID.
 *
 * The xattr is first read into a buffer of INITCONTEXTLEN bytes.  If that
 * is too small (-ERANGE) the required size is queried and the read is
 * retried with a buffer of that size.
 *
 * Returns 0 when the xattr is absent or was read; a failure to convert the
 * context to a SID is only logged and still returns 0.  Returns -ENOMEM on
 * allocation failure, or the negative error from __vfs_getxattr() for any
 * read failure other than -ENODATA.
 */
static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry,
                                  u32 def_sid, u32 *sid)
{
#define INITCONTEXTLEN 255
        char *context;
        unsigned int len;
        int rc;

        len = INITCONTEXTLEN;
        context = kmalloc(len + 1, GFP_NOFS);
        if (!context)
                return -ENOMEM;

        /* Terminate past the area getxattr may fill. */
        context[len] = '\0';
        rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len);
        if (rc == -ERANGE) {
                kfree(context);

                /* Need a larger buffer.  Query for the right size. */
                rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, NULL, 0);
                if (rc < 0)
                        return rc;

                len = rc;
                context = kmalloc(len + 1, GFP_NOFS);
                if (!context)
                        return -ENOMEM;

                context[len] = '\0';
                rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX,
                                    context, len);
        }
        if (rc < 0) {
                kfree(context);
                if (rc != -ENODATA) {
                        /* i_ino is unsigned long, hence %lu. */
                        pr_warn("SELinux: %s:  getxattr returned %d for dev=%s ino=%lu\n",
                                __func__, -rc, inode->i_sb->s_id, inode->i_ino);
                        return rc;
                }
                /* No xattr on this inode: fall back to the default SID. */
                *sid = def_sid;
                return 0;
        }

        /* rc now holds the number of bytes read into context. */
        rc = security_context_to_sid_default(&selinux_state, context, rc, sid,
                                             def_sid, GFP_NOFS);
        if (rc) {
                char *dev = inode->i_sb->s_id;
                unsigned long ino = inode->i_ino;

                if (rc == -EINVAL) {
                        pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s.  This indicates you may need to relabel the inode or the filesystem in question.\n",
                                              ino, dev, context);
                } else {
                        pr_warn("SELinux: %s:  context_to_sid(%s) returned %d for dev=%s ino=%lu\n",
                                __func__, context, -rc, dev, ino);
                }
        }
        kfree(context);
        /* Conversion failures are reported above but not propagated. */
        return 0;
}

/*
 * Initialize the security label of an inode.  The inode's security
 * attributes must be initialized before first use.
 *
 * @inode:      inode whose inode_security_struct is to be labeled.
 * @opt_dentry: dentry for @inode if the caller has one, otherwise NULL; when
 *              NULL an alias is looked up for the labeling modes that need
 *              a dentry.
 *
 * The SID is chosen according to sbsec->behavior of the inode's superblock.
 * If the superblock is not yet SE_SBINITIALIZED the inode is queued on
 * sbsec->isec_head and labeling is deferred.
 *
 * Returns 0 when the inode is already labeled, labeling was deferred, no
 * dentry could be found (label left LABEL_INVALID), or a SID was assigned.
 * Otherwise returns the error from the failing lookup; in that case the
 * label is marked LABEL_INVALID if it is still LABEL_PENDING.
 */
static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry)
{
        struct superblock_security_struct *sbsec = NULL;
        struct inode_security_struct *isec = selinux_inode(inode);
        u32 task_sid, sid = 0;
        u16 sclass;
        struct dentry *dentry;
        int rc = 0;

        /* Unlocked fast path; re-checked below with isec->lock held. */
        if (isec->initialized == LABEL_INITIALIZED)
                return 0;

        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_INITIALIZED)
                goto out_unlock;

        /* Refine the class from the file type if it is still SECCLASS_FILE. */
        if (isec->sclass == SECCLASS_FILE)
                isec->sclass = inode_mode_to_security_class(inode->i_mode);

        sbsec = inode->i_sb->s_security;
        if (!(sbsec->flags & SE_SBINITIALIZED)) {
                /* Defer initialization until selinux_complete_init,
                   after the initial policy is loaded and the security
                   server is ready to handle calls. */
                spin_lock(&sbsec->isec_lock);
                if (list_empty(&isec->list))
                        list_add(&isec->list, &sbsec->isec_head);
                spin_unlock(&sbsec->isec_lock);
                goto out_unlock;
        }

        /*
         * Snapshot the fields needed below and mark the label pending, then
         * drop the lock for the lookups that follow.
         */
        sclass = isec->sclass;
        task_sid = isec->task_sid;
        sid = isec->sid;
        isec->initialized = LABEL_PENDING;
        spin_unlock(&isec->lock);

        switch (sbsec->behavior) {
        case SECURITY_FS_USE_NATIVE:
                /* Keep the SID already stored in the inode security struct. */
                break;
        case SECURITY_FS_USE_XATTR:
                if (!(inode->i_opflags & IOP_XATTR)) {
                        sid = sbsec->def_sid;
                        break;
                }
                /* Need a dentry, since the xattr API requires one.
                   Life would be simpler if we could just pass the inode. */
                if (opt_dentry) {
                        /* Called from d_instantiate or d_splice_alias. */
                        dentry = dget(opt_dentry);
                } else {
                        /*
                         * Called from selinux_complete_init, try to find a dentry.
                         * Some filesystems really want a connected one, so try
                         * that first.  We could split SECURITY_FS_USE_XATTR in
                         * two, depending upon that...
                         */
                        dentry = d_find_alias(inode);
                        if (!dentry)
                                dentry = d_find_any_alias(inode);
                }
                if (!dentry) {
                        /*
                         * This can be hit on boot when a file is accessed
                         * before the policy is loaded.  When we load policy we
                         * may find inodes that have no dentry on the
                         * sbsec->isec_head list.  No reason to complain as these
                         * will get fixed up the next time we go through
                         * inode_doinit with a dentry, before these inodes could
                         * be used again by userspace.
                         */
                        goto out_invalid;
                }

                rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid,
                                            &sid);
                dput(dentry);
                if (rc)
                        goto out;
                break;
        case SECURITY_FS_USE_TASK:
                sid = task_sid;
                break;
        case SECURITY_FS_USE_TRANS:
                /* Default to the fs SID. */
                sid = sbsec->sid;

                /* Try to obtain a transition SID. */
                rc = security_transition_sid(&selinux_state, task_sid, sid,
                                             sclass, NULL, &sid);
                if (rc)
                        goto out;
                break;
        case SECURITY_FS_USE_MNTPOINT:
                sid = sbsec->mntpoint_sid;
                break;
        default:
                /* Default to the fs superblock SID. */
                sid = sbsec->sid;

                /*
                 * genfs labeling applies to everything except symlinks, and
                 * to symlinks too when the genfs_seclabel_symlinks policy
                 * capability is reported.
                 */
                if ((sbsec->flags & SE_SBGENFS) &&
                     (!S_ISLNK(inode->i_mode) ||
                      selinux_policycap_genfs_seclabel_symlinks())) {
                        /* We must have a dentry to determine the label on
                         * procfs inodes */
                        if (opt_dentry) {
                                /* Called from d_instantiate or
                                 * d_splice_alias. */
                                dentry = dget(opt_dentry);
                        } else {
                                /* Called from selinux_complete_init, try to
                                 * find a dentry.  Some filesystems really want
                                 * a connected one, so try that first.
                                 */
                                dentry = d_find_alias(inode);
                                if (!dentry)
                                        dentry = d_find_any_alias(inode);
                        }
                        /*
                         * This can be hit on boot when a file is accessed
                         * before the policy is loaded.  When we load policy we
                         * may find inodes that have no dentry on the
                         * sbsec->isec_head list.  No reason to complain as
                         * these will get fixed up the next time we go through
                         * inode_doinit() with a dentry, before these inodes
                         * could be used again by userspace.
                         */
                        if (!dentry)
                                goto out_invalid;
                        rc = selinux_genfs_get_sid(dentry, sclass,
                                                   sbsec->flags, &sid);
                        if (rc) {
                                dput(dentry);
                                goto out;
                        }

                        /*
                         * With SE_SBGENFS_XATTR set, the xattr lookup runs
                         * with the genfs SID as its default.
                         */
                        if ((sbsec->flags & SE_SBGENFS_XATTR) &&
                            (inode->i_opflags & IOP_XATTR)) {
                                rc = inode_doinit_use_xattr(inode, dentry,
                                                            sid, &sid);
                                if (rc) {
                                        dput(dentry);
                                        goto out;
                                }
                        }
                        dput(dentry);
                }
                break;
        }

out:
        /*
         * Publish the result only if the state is still LABEL_PENDING, i.e.
         * nothing else changed it while the lock was dropped.
         */
        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_PENDING) {
                if (rc) {
                        isec->initialized = LABEL_INVALID;
                        goto out_unlock;
                }
                isec->initialized = LABEL_INITIALIZED;
                isec->sid = sid;
        }

out_unlock:
        spin_unlock(&isec->lock);
        return rc;

out_invalid:
        /* No dentry available: leave the label invalid but report success. */
        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_PENDING) {
                isec->initialized = LABEL_INVALID;
                isec->sid = sid;
        }
        spin_unlock(&isec->lock);
        return 0;
}

/*
 * Convert a Linux signal number to the process-class access vector bit
 * that governs sending it.
 */
static inline u32 signal_to_av(int sig)
{
        switch (sig) {
        case SIGCHLD:
                /* Commonly granted from child to parent. */
                return PROCESS__SIGCHLD;
        case SIGKILL:
                /* Cannot be caught or ignored */
                return PROCESS__SIGKILL;
        case SIGSTOP:
                /* Cannot be caught or ignored */
                return PROCESS__SIGSTOP;
        default:
                /* All other signals. */
                return PROCESS__SIGNAL;
        }
}

#if CAP_LAST_CAP > 63
#error Fix SELinux to handle capabilities > 63.
#endif

/*
 * Check whether a task is allowed to use a capability.
 *
 * @cred:   credentials whose SID is used as both source and target.
 * @cap:    capability number; CAP_TO_INDEX(@cap) must be 0 or 1.
 * @opts:   CAP_OPT_* flags; CAP_OPT_NOAUDIT suppresses the avc_audit() call.
 * @initns: selects the SECCLASS_CAPABILITY* classes when true and the
 *          SECCLASS_CAP*_USERNS classes when false.
 *
 * Returns the result of avc_has_perm_noaudit(), unless auditing is enabled
 * and avc_audit() returns non-zero, in which case that value is returned.
 */
static int cred_has_capability(const struct cred *cred,
                               int cap, unsigned int opts, bool initns)
{
        struct common_audit_data ad;
        struct av_decision avd;
        u16 sclass;
        u32 sid = cred_sid(cred);
        u32 av = CAP_TO_MASK(cap);
        int rc, audit_rc;

        ad.type = LSM_AUDIT_DATA_CAP;
        ad.u.cap = cap;

        switch (CAP_TO_INDEX(cap)) {
        case 0:
                if (initns)
                        sclass = SECCLASS_CAPABILITY;
                else
                        sclass = SECCLASS_CAP_USERNS;
                break;
        case 1:
                if (initns)
                        sclass = SECCLASS_CAPABILITY2;
                else
                        sclass = SECCLASS_CAP2_USERNS;
                break;
        default:
                pr_err("SELinux:  out of range capability %d\n", cap);
                BUG();
                return -EINVAL;
        }

        rc = avc_has_perm_noaudit(&selinux_state,
                                  sid, sid, sclass, av, 0, &avd);
        if (opts & CAP_OPT_NOAUDIT)
                return rc;

        audit_rc = avc_audit(&selinux_state,
                             sid, sid, sclass, av, &avd, rc, &ad, 0);
        if (audit_rc)
                return audit_rc;

        return rc;
}

/*
 * Check whether a task has a particular permission to an inode.
 *
 * @cred:  credentials supplying the subject SID.
 * @inode: inode being accessed; inodes for which IS_PRIVATE() is true are
 *         always allowed.
 * @perms: requested permissions in the inode's security class.
 * @adp:   optional audit data (e.g. the dentry) forwarded to avc_has_perm().
 *
 * Returns 0 for private inodes, otherwise the result of avc_has_perm().
 */
static int inode_has_perm(const struct cred *cred,
                          struct inode *inode,
                          u32 perms,
                          struct common_audit_data *adp)
{
        struct inode_security_struct *isec;
        u32 subj_sid;

        validate_creds(cred);

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        subj_sid = cred_sid(cred);
        isec = selinux_inode(inode);

        return avc_has_perm(&selinux_state, subj_sid,
                            isec->sid, isec->sclass, perms, adp);
}

/*
 * Variant of inode_has_perm() for callers holding a dentry: the dentry is
 * recorded in the audit data so the auditing code can more easily generate
 * the pathname if needed.  The backing inode's label is revalidated before
 * the check.
 */
static inline int dentry_has_perm(const struct cred *cred,
                                  struct dentry *dentry,
                                  u32 av)
{
        struct common_audit_data ad;
        struct inode *backing = d_backing_inode(dentry);

        __inode_security_revalidate(backing, dentry, true);

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        return inode_has_perm(cred, backing, av, &ad);
}

/*
 * Variant of inode_has_perm() for callers holding a struct path: a copy of
 * the path is recorded in the audit data so the auditing code can more
 * easily generate the pathname if needed.  The backing inode's label is
 * revalidated before the check.
 */
static inline int path_has_perm(const struct cred *cred,
                                const struct path *path,
                                u32 av)
{
        struct common_audit_data ad;
        struct inode *backing = d_backing_inode(path->dentry);

        __inode_security_revalidate(backing, path->dentry, true);

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = *path;

        return inode_has_perm(cred, backing, av, &ad);
}

/*
 * Same as path_has_perm(), but takes the inode from the file struct and
 * records the file itself in the audit data.
 */
static inline int file_path_has_perm(const struct cred *cred,
                                     struct file *file,
                                     u32 av)
{
        struct common_audit_data ad;
        struct inode *inode;

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;

        inode = file_inode(file);
        return inode_has_perm(cred, inode, av, &ad);
}

#ifdef CONFIG_BPF_SYSCALL
static int bpf_fd_pass(struct file *file, u32 sid);
#endif

/*
 * Check whether a task can use an open file descriptor to access an inode
 * in a given way.
 *
 * Access to the descriptor itself is checked first (FD__USE); it is
 * implicitly granted if the descriptor has the same SID as the process.
 * With CONFIG_BPF_SYSCALL, bpf_fd_pass() is consulted next.  Finally the
 * requested permissions are checked against the file's inode.  If @av is
 * zero the inode is not checked, e.g. for cases where only the descriptor
 * is affected, like seek.
 *
 * Returns 0 if access is granted, otherwise the first non-zero result of
 * the checks above.
 */
static int file_has_perm(const struct cred *cred,
                         struct file *file,
                         u32 av)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode *inode = file_inode(file);
        struct common_audit_data ad;
        u32 sid = cred_sid(cred);
        int rc;

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;

        if (sid != fsec->sid) {
                rc = avc_has_perm(&selinux_state,
                                  sid, fsec->sid,
                                  SECCLASS_FD,
                                  FD__USE,
                                  &ad);
                if (rc)
                        return rc;
        }

#ifdef CONFIG_BPF_SYSCALL
        rc = bpf_fd_pass(file, cred_sid(cred));
        if (rc)
                return rc;
#endif

        /* av is zero if only checking access to the descriptor. */
        if (!av)
                return 0;

        return inode_has_perm(cred, inode, av, &ad);
}

/*
 * Determine the label for an inode that might be unioned.
 *
 * The SID for a new inode in @dir is chosen, in order of precedence:
 *  1. the superblock's mntpoint_sid, if the superblock is initialized and
 *     uses SECURITY_FS_USE_MNTPOINT;
 *  2. the task's create_sid, if it is set and the superblock has
 *     SBLABEL_MNT;
 *  3. the result of security_transition_sid() on the task SID and the
 *     directory SID for @tclass and @name.
 *
 * The result is stored in *@_new_isid.  Returns 0 in the first two cases
 * and the result of security_transition_sid() in the third.
 */
static int
selinux_determine_inode_label(const struct task_security_struct *tsec,
                                 struct inode *dir,
                                 const struct qstr *name, u16 tclass,
                                 u32 *_new_isid)
{
        const struct superblock_security_struct *sbsec = dir->i_sb->s_security;
        const struct inode_security_struct *dsec;

        if ((sbsec->flags & SE_SBINITIALIZED) &&
            (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) {
                *_new_isid = sbsec->mntpoint_sid;
                return 0;
        }

        if ((sbsec->flags & SBLABEL_MNT) && tsec->create_sid) {
                *_new_isid = tsec->create_sid;
                return 0;
        }

        /* The directory label is only looked up when actually needed. */
        dsec = inode_security(dir);
        return security_transition_sid(&selinux_state, tsec->sid,
                                       dsec->sid, tclass,
                                       name, _new_isid);
}

/*
 * Check whether the current task can create a file.
 *
 * @dir:    directory in which the object is created.
 * @dentry: dentry of the new object; used for audit data and its name.
 * @tclass: security class of the new object.
 *
 * Three permission checks are made in sequence: add_name/search on @dir,
 * create on the label determined for the new object, and associate between
 * that label and the filesystem.  Returns 0 if all pass, otherwise the
 * first non-zero result (including one from
 * selinux_determine_inode_label()).
 */
static int may_create(struct inode *dir,
                      struct dentry *dentry,
                      u16 tclass)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct inode_security_struct *dsec = inode_security(dir);
        struct superblock_security_struct *sbsec = dir->i_sb->s_security;
        u32 sid = tsec->sid;
        u32 newsid;
        struct common_audit_data ad;
        int rc;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        /* May the task add a name to the parent directory? */
        rc = avc_has_perm(&selinux_state,
                          sid, dsec->sid, SECCLASS_DIR,
                          DIR__ADD_NAME | DIR__SEARCH,
                          &ad);

        /* Which label would the new object get? */
        if (!rc)
                rc = selinux_determine_inode_label(tsec, dir, &dentry->d_name,
                                                   tclass, &newsid);

        /* May the task create an object with that label? */
        if (!rc)
                rc = avc_has_perm(&selinux_state,
                                  sid, newsid, tclass, FILE__CREATE, &ad);

        /* May an object with that label live on this filesystem? */
        if (!rc)
                rc = avc_has_perm(&selinux_state,
                                  newsid, sbsec->sid,
                                  SECCLASS_FILESYSTEM,
                                  FILESYSTEM__ASSOCIATE, &ad);

        return rc;
}

#define MAY_LINK        0
#define MAY_UNLINK        1
#define MAY_RMDIR        2

/*
 * Check whether the current task can link, unlink, or rmdir a
 * file/directory.
 *
 * @dir:    parent directory of @dentry.
 * @dentry: object being linked or removed.
 * @kind:   MAY_LINK, MAY_UNLINK or MAY_RMDIR.
 *
 * The directory check (search plus add_name for MAY_LINK, remove_name
 * otherwise) is made first.  An unrecognized @kind is then logged and
 * treated as allowed.  Returns 0 if access is granted, otherwise the
 * result of the failing avc_has_perm() call.
 */
static int may_link(struct inode *dir,
                    struct dentry *dentry,
                    int kind)

{
        struct inode_security_struct *dsec, *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        u32 dir_av, obj_av;
        int rc;

        dsec = inode_security(dir);
        isec = backing_inode_security(dentry);

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        dir_av = DIR__SEARCH;
        if (kind == MAY_LINK)
                dir_av |= DIR__ADD_NAME;
        else
                dir_av |= DIR__REMOVE_NAME;

        rc = avc_has_perm(&selinux_state,
                          sid, dsec->sid, SECCLASS_DIR, dir_av, &ad);
        if (rc)
                return rc;

        switch (kind) {
        case MAY_LINK:
                obj_av = FILE__LINK;
                break;
        case MAY_UNLINK:
                obj_av = FILE__UNLINK;
                break;
        case MAY_RMDIR:
                obj_av = DIR__RMDIR;
                break;
        default:
                pr_warn("SELinux: %s:  unrecognized kind %d\n",
                        __func__, kind);
                return 0;
        }

        return avc_has_perm(&selinux_state,
                            sid, isec->sid, isec->sclass, obj_av, &ad);
}

/*
 * Check whether the current task may rename @old_dentry (in @old_dir) to
 * @new_dentry (in @new_dir).
 *
 * Checks, in order:
 *   - remove_name and search on the old directory,
 *   - rename on the object being moved,
 *   - reparent on that object when it is a directory changing parents,
 *   - add_name and search on the new directory (plus remove_name when
 *     the target already exists),
 *   - rmdir or unlink on an existing target.
 *
 * Returns 0 when every check passes, otherwise the first non-zero
 * avc_has_perm() result.
 */
static inline int may_rename(struct inode *old_dir,
                             struct dentry *old_dentry,
                             struct inode *new_dir,
                             struct dentry *new_dentry)
{
        struct inode_security_struct *src_dsec, *dst_dsec, *src_isec, *dst_isec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        u32 dst_dir_av;
        int src_is_dir;
        int rc;

        src_dsec = inode_security(old_dir);
        src_isec = backing_inode_security(old_dentry);
        src_is_dir = d_is_dir(old_dentry);
        dst_dsec = inode_security(new_dir);

        ad.type = LSM_AUDIT_DATA_DENTRY;

        /* Source side: audit data refers to the old dentry. */
        ad.u.dentry = old_dentry;
        rc = avc_has_perm(&selinux_state,
                          sid, src_dsec->sid, SECCLASS_DIR,
                          DIR__REMOVE_NAME | DIR__SEARCH, &ad);
        if (rc)
                return rc;

        rc = avc_has_perm(&selinux_state,
                          sid, src_isec->sid,
                          src_isec->sclass, FILE__RENAME, &ad);
        if (rc)
                return rc;

        if (src_is_dir && new_dir != old_dir) {
                rc = avc_has_perm(&selinux_state,
                                  sid, src_isec->sid,
                                  src_isec->sclass, DIR__REPARENT, &ad);
                if (rc)
                        return rc;
        }

        /* Destination side: audit data refers to the new dentry. */
        ad.u.dentry = new_dentry;
        dst_dir_av = DIR__ADD_NAME | DIR__SEARCH;
        if (d_is_positive(new_dentry))
                dst_dir_av |= DIR__REMOVE_NAME;
        rc = avc_has_perm(&selinux_state,
                          sid, dst_dsec->sid, SECCLASS_DIR, dst_dir_av, &ad);
        if (rc)
                return rc;

        /* Nothing is being replaced, so there is no target to check. */
        if (!d_is_positive(new_dentry))
                return 0;

        dst_isec = backing_inode_security(new_dentry);
        return avc_has_perm(&selinux_state,
                            sid, dst_isec->sid,
                            dst_isec->sclass,
                            d_is_dir(new_dentry) ? DIR__RMDIR : FILE__UNLINK,
                            &ad);
}

/*
 * Check whether the subject described by @cred holds the filesystem-class
 * permissions @perms on the superblock @sb.
 *
 * @ad is passed through to avc_has_perm() as the audit data.  Returns the
 * avc_has_perm() result.
 */
static int superblock_has_perm(const struct cred *cred,
                               struct super_block *sb,
                               u32 perms,
                               struct common_audit_data *ad)
{
        u32 sid = cred_sid(cred);
        struct superblock_security_struct *sbsec = sb->s_security;

        return avc_has_perm(&selinux_state, sid, sbsec->sid,
                            SECCLASS_FILESYSTEM, perms, ad);
}

/*
 * Translate a Linux inode mode and MAY_* permission mask into an SELinux
 * access vector.
 *
 * Directories map exec/write/read to search/write/read.  For every other
 * file type exec and read map to execute and read, and MAY_APPEND takes
 * precedence over MAY_WRITE so that only one of append/write is requested.
 */
static inline u32 file_mask_to_av(int mode, int mask)
{
        u32 av = 0;

        if (S_ISDIR(mode)) {
                if (mask & MAY_EXEC)
                        av |= DIR__SEARCH;
                if (mask & MAY_WRITE)
                        av |= DIR__WRITE;
                if (mask & MAY_READ)
                        av |= DIR__READ;
                return av;
        }

        if (mask & MAY_EXEC)
                av |= FILE__EXECUTE;
        if (mask & MAY_READ)
                av |= FILE__READ;

        if (mask & MAY_APPEND)
                av |= FILE__APPEND;
        else if (mask & MAY_WRITE)
                av |= FILE__WRITE;

        return av;
}

/*
 * Derive an SELinux access vector from the way @file was opened.
 *
 * A readable file requests read; a writable file requests append when
 * O_APPEND is set and write otherwise.  A file that is neither readable
 * nor writable requests ioctl only.
 */
static inline u32 file_to_av(struct file *file)
{
        u32 av = (file->f_mode & FMODE_READ) ? FILE__READ : 0;

        if (file->f_mode & FMODE_WRITE)
                av |= (file->f_flags & O_APPEND) ? FILE__APPEND : FILE__WRITE;

        /* Special file opened with flags 3 for ioctl-only use. */
        return av ? av : FILE__IOCTL;
}

/*
 * Convert a file to an access vector and include the correct
 * open permission.
 */
static inline u32 open_file_to_av(struct file *file)
{
        u32 av = file_to_av(file);
        struct inode *inode = file_inode(file);

        if (selinux_policycap_openperm() &&
            inode->i_sb->s_magic != SOCKFS_MAGIC)
                av |= FILE__OPEN;

        return av;
}

/* Hook functions begin here. */

/*
 * Check the binder set_context_mgr permission from the current task's SID
 * to the SID of @mgr.  Returns the avc_has_perm() result; no audit data is
 * supplied.
 */
static int selinux_binder_set_context_mgr(const struct cred *mgr)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), cred_sid(mgr), SECCLASS_BINDER,
                            BINDER__SET_CONTEXT_MGR, NULL);
}

/*
 * Check whether a binder transaction from @from to @to is allowed.
 *
 * When the current task's SID differs from the SID of @from, the current
 * task must first hold binder impersonate toward @from.  The transaction
 * itself requires binder call from @from's SID to @to's SID.
 *
 * Returns 0 when permitted, otherwise the failing avc_has_perm() result.
 */
static int selinux_binder_transaction(const struct cred *from,
                                      const struct cred *to)
{
        u32 cur_sid = current_sid();
        u32 src_sid = cred_sid(from);
        u32 dst_sid = cred_sid(to);

        if (cur_sid != src_sid) {
                int rc = avc_has_perm(&selinux_state,
                                      cur_sid, src_sid, SECCLASS_BINDER,
                                      BINDER__IMPERSONATE, NULL);

                if (rc)
                        return rc;
        }

        return avc_has_perm(&selinux_state, src_sid, dst_sid,
                            SECCLASS_BINDER, BINDER__CALL, NULL);
}

/*
 * Check the binder transfer permission from the SID of @from to the SID of
 * @to.  Returns the avc_has_perm() result; no audit data is supplied.
 */
static int selinux_binder_transfer_binder(const struct cred *from,
                                          const struct cred *to)
{
        return avc_has_perm(&selinux_state,
                            cred_sid(from), cred_sid(to),
                            SECCLASS_BINDER, BINDER__TRANSFER,
                            NULL);
}

/*
 * Check whether the subject described by @to may receive @file.
 *
 * Steps, stopping at the first failure:
 *   - fd use on the file's own label, when it differs from the receiver's,
 *   - bpf_fd_pass() when CONFIG_BPF_SYSCALL is enabled,
 *   - the file_to_av() permissions on the backing inode, unless that inode
 *     is private, in which case the inode check is skipped.
 *
 * @from is not consulted.  Returns 0 when permitted, otherwise the failing
 * check's result.
 */
static int selinux_binder_transfer_file(const struct cred *from,
                                        const struct cred *to,
                                        struct file *file)
{
        u32 recv_sid = cred_sid(to);
        struct file_security_struct *fsec = selinux_file(file);
        struct dentry *dentry = file->f_path.dentry;
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        int rc;

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = file->f_path;

        if (recv_sid != fsec->sid) {
                rc = avc_has_perm(&selinux_state, recv_sid, fsec->sid,
                                  SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

#ifdef CONFIG_BPF_SYSCALL
        rc = bpf_fd_pass(file, recv_sid);
        if (rc)
                return rc;
#endif

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        isec = backing_inode_security(dentry);
        rc = avc_has_perm(&selinux_state, recv_sid, isec->sid, isec->sclass,
                          file_to_av(file), &ad);
        return rc;
}

/*
 * Check whether the current task may access @child via ptrace.
 *
 * A request carrying PTRACE_MODE_READ is checked as file read against the
 * child's SID; anything else is checked as process ptrace.  Returns the
 * avc_has_perm() result.
 */
static int selinux_ptrace_access_check(struct task_struct *child,
                                     unsigned int mode)
{
        u32 sid = current_sid();
        u32 csid = task_sid(child);

        if (!(mode & PTRACE_MODE_READ))
                return avc_has_perm(&selinux_state, sid, csid,
                                    SECCLASS_PROCESS, PROCESS__PTRACE, NULL);

        return avc_has_perm(&selinux_state, sid, csid,
                            SECCLASS_FILE, FILE__READ, NULL);
}

/*
 * Check the process ptrace permission from the SID of @parent to the
 * current task's SID.  Returns the avc_has_perm() result.
 */
static int selinux_ptrace_traceme(struct task_struct *parent)
{
        return avc_has_perm(&selinux_state,
                            task_sid(parent), current_sid(), SECCLASS_PROCESS,
                            PROCESS__PTRACE, NULL);
}

/*
 * Check the process getcap permission from the current task's SID to the
 * SID of @target.  The capability-set pointers are not examined here.
 * Returns the avc_has_perm() result.
 */
static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
                          kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(target), SECCLASS_PROCESS,
                            PROCESS__GETCAP, NULL);
}

/*
 * Check the process setcap permission from the SID of @old to the SID of
 * @new.  The capability-set pointers are not examined here.  Returns the
 * avc_has_perm() result.
 */
static int selinux_capset(struct cred *new, const struct cred *old,
                          const kernel_cap_t *effective,
                          const kernel_cap_t *inheritable,
                          const kernel_cap_t *permitted)
{
        return avc_has_perm(&selinux_state,
                            cred_sid(old), cred_sid(new), SECCLASS_PROCESS,
                            PROCESS__SETCAP, NULL);
}

/*
 * (This comment used to live with the selinux_task_setuid hook,
 * which was removed).
 *
 * Since setuid only affects the current process, and since the SELinux
 * controls are not based on the Linux identity attributes, SELinux does not
 * need to control this operation.  However, SELinux does control the use of
 * the CAP_SETUID and CAP_SETGID capabilities using the capable hook.
 */

/*
 * Forward the check for capability @cap on @cred to cred_has_capability(),
 * passing @opts through and telling it whether @ns is the initial user
 * namespace.  Returns that helper's result.
 */
static int selinux_capable(const struct cred *cred, struct user_namespace *ns,
                           int cap, unsigned int opts)
{
        return cred_has_capability(cred, cap, opts, ns == &init_user_ns);
}

/*
 * Check whether the current task may issue quota command @cmds on @sb.
 *
 * Commands that change quota state require filesystem quotamod; commands
 * that only query it require filesystem quotaget.  A NULL @sb and any
 * unlisted command return 0.  @type and @id are not examined.
 */
static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb)
{
        const struct cred *cred = current_cred();

        if (!sb)
                return 0;

        switch (cmds) {
        case Q_SYNC:
        case Q_QUOTAON:
        case Q_QUOTAOFF:
        case Q_SETINFO:
        case Q_SETQUOTA:
        case Q_XQUOTAOFF:
        case Q_XQUOTAON:
        case Q_XSETQLIM:
                return superblock_has_perm(cred, sb,
                                           FILESYSTEM__QUOTAMOD, NULL);
        case Q_GETFMT:
        case Q_GETINFO:
        case Q_GETQUOTA:
        case Q_XGETQUOTA:
        case Q_XGETQSTAT:
        case Q_XGETQSTATV:
        case Q_XGETNEXTQUOTA:
                return superblock_has_perm(cred, sb,
                                           FILESYSTEM__QUOTAGET, NULL);
        default:
                /* let the kernel handle invalid cmds */
                return 0;
        }
}

/*
 * Check whether the current task holds file quotaon on @dentry.
 * Returns the dentry_has_perm() result.
 */
static int selinux_quota_on(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__QUOTAON);
}

/*
 * Check whether the current task may perform syslog action @type.
 *
 * Each action is checked as a system-class permission against the kernel
 * initial SID: syslog_read for reading the log or its size, syslog_console
 * for console logging controls, and syslog_mod for every other action.
 */
static int selinux_syslog(int type)
{
        switch (type) {
        case SYSLOG_ACTION_READ_ALL:        /* Read last kernel messages */
        case SYSLOG_ACTION_SIZE_BUFFER:        /* Return size of the log buffer */
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ, NULL);
        case SYSLOG_ACTION_CONSOLE_OFF:        /* Disable logging to console */
        case SYSLOG_ACTION_CONSOLE_ON:        /* Enable logging to console */
        case SYSLOG_ACTION_CONSOLE_LEVEL:        /* Set console message level */
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE,
                                    NULL);
        default:
                /* All other syslog types */
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD, NULL);
        }
}

/*
 * Report whether the current task holds CAP_SYS_ADMIN, for use when a new
 * virtual mapping is being allocated.
 *
 * Returns 1 when the capability check succeeds and 0 otherwise; @mm and
 * @pages are not examined here.
 * NOTE(review): the caller presumably feeds this flag into the actual
 * memory accounting decision - confirm against the LSM core.
 *
 * Do not audit the selinux permission check, as this is applied to all
 * processes that allocate mappings.
 */
static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
{
        int denied;

        denied = cred_has_capability(current_cred(), CAP_SYS_ADMIN,
                                     CAP_OPT_NOAUDIT, true);

        return denied == 0;
}

/* binprm security operations */

/*
 * Return the SID of the task tracing the current task, or 0 when
 * ptrace_parent() reports no tracer.  The lookup is done inside an RCU
 * read-side critical section.
 */
static u32 ptrace_parent_sid(void)
{
        struct task_struct *tracer;
        u32 sid;

        rcu_read_lock();
        tracer = ptrace_parent(current);
        sid = tracer ? task_sid(tracer) : 0;
        rcu_read_unlock();

        return sid;
}

/*
 * Decide whether a SID transition from @old_tsec to @new_tsec is allowed
 * when the exec is constrained by no_new_privs and/or a nosuid mount.
 *
 * Returns 0 when neither constraint applies, when the SID does not change,
 * or when the transition is authorized either by policy (if the
 * nnp_nosuid_transition policy capability is enabled) or because the new
 * SID is bounded by the old one.  Otherwise returns -EPERM under NNP and
 * -EACCES for nosuid only.
 */
static int check_nnp_nosuid(const struct linux_binprm *bprm,
                            const struct task_security_struct *old_tsec,
                            const struct task_security_struct *new_tsec)
{
        int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS);
        int nosuid = !mnt_may_suid(bprm->file->f_path.mnt);
        u32 av;

        /* Neither NNP nor nosuid, or no change in credentials. */
        if (!(nnp || nosuid) || new_tsec->sid == old_tsec->sid)
                return 0;

        /*
         * If the policy enables the nnp_nosuid_transition policy capability,
         * then we permit transitions under NNP or nosuid if the
         * policy allows the corresponding permission between
         * the old and new contexts.
         */
        if (selinux_policycap_nnp_nosuid_transition()) {
                av = (nnp ? PROCESS2__NNP_TRANSITION : 0) |
                     (nosuid ? PROCESS2__NOSUID_TRANSITION : 0);
                if (!avc_has_perm(&selinux_state,
                                  old_tsec->sid, new_tsec->sid,
                                  SECCLASS_PROCESS2, av, NULL))
                        return 0;
        }

        /*
         * We also permit NNP or nosuid transitions to bounded SIDs,
         * i.e. SIDs that are guaranteed to only be allowed a subset
         * of the permissions of the current SID.
         */
        if (!security_bounded_transition(&selinux_state, old_tsec->sid,
                                         new_tsec->sid))
                return 0;

        /*
         * On failure, preserve the errno values for NNP vs nosuid.
         * NNP:  Operation not permitted for caller.
         * nosuid:  Permission denied to file.
         */
        return nnp ? -EPERM : -EACCES;
}

/*
 * Compute the SELinux label that bprm->cred will carry after exec of
 * bprm->file and check the permissions the exec requires.
 *
 * The new SID starts out as the current task SID.  It is replaced by the
 * task's exec SID when one is set, otherwise by the result of
 * security_transition_sid() for the program's inode label.  Transitions
 * are filtered through check_nnp_nosuid(): a failing explicit exec SID
 * aborts the exec, a failing default transition falls back to the old SID.
 *
 * Returns 0 on success.  On failure returns the result of the failing
 * check, except that the share and ptrace checks report -EPERM.
 */
static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        const struct task_security_struct *old_tsec;
        struct task_security_struct *new_tsec;
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        struct inode *inode = file_inode(bprm->file);
        int rc;

        /* SELinux context only depends on initial program or script and not
         * the script interpreter */

        old_tsec = selinux_cred(current_cred());
        new_tsec = selinux_cred(bprm->cred);
        isec = inode_security(inode);

        /* Default to the current task SID. */
        new_tsec->sid = old_tsec->sid;
        new_tsec->osid = old_tsec->sid;

        /* Reset fs, key, and sock SIDs on execve. */
        new_tsec->create_sid = 0;
        new_tsec->keycreate_sid = 0;
        new_tsec->sockcreate_sid = 0;

        if (old_tsec->exec_sid) {
                new_tsec->sid = old_tsec->exec_sid;
                /* Reset exec SID on execve. */
                new_tsec->exec_sid = 0;

                /* Fail on NNP or nosuid if not an allowed transition. */
                rc = check_nnp_nosuid(bprm, old_tsec, new_tsec);
                if (rc)
                        return rc;
        } else {
                /* Check for a default transition on this program. */
                rc = security_transition_sid(&selinux_state, old_tsec->sid,
                                             isec->sid, SECCLASS_PROCESS, NULL,
                                             &new_tsec->sid);
                if (rc)
                        return rc;

                /*
                 * Fallback to old SID on NNP or nosuid if not an allowed
                 * transition.
                 */
                rc = check_nnp_nosuid(bprm, old_tsec, new_tsec);
                if (rc)
                        new_tsec->sid = old_tsec->sid;
        }

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = bprm->file;

        if (new_tsec->sid == old_tsec->sid) {
                /* No label change: only execute_no_trans on the file is checked. */
                rc = avc_has_perm(&selinux_state,
                                  old_tsec->sid, isec->sid,
                                  SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad);
                if (rc)
                        return rc;
        } else {
                /* Check permissions for the transition. */
                rc = avc_has_perm(&selinux_state,
                                  old_tsec->sid, new_tsec->sid,
                                  SECCLASS_PROCESS, PROCESS__TRANSITION, &ad);
                if (rc)
                        return rc;

                /* The file must be an entrypoint for the new SID. */
                rc = avc_has_perm(&selinux_state,
                                  new_tsec->sid, isec->sid,
                                  SECCLASS_FILE, FILE__ENTRYPOINT, &ad);
                if (rc)
                        return rc;

                /* Check for shared state */
                if (bprm->unsafe & LSM_UNSAFE_SHARE) {
                        rc = avc_has_perm(&selinux_state,
                                          old_tsec->sid, new_tsec->sid,
                                          SECCLASS_PROCESS, PROCESS__SHARE,
                                          NULL);
                        if (rc)
                                return -EPERM;
                }

                /* Make sure that anyone attempting to ptrace over a task that
                 * changes its SID has the appropriate permit */
                if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
                        u32 ptsid = ptrace_parent_sid();
                        /* ptsid == 0 means no tracer was found; skip the check. */
                        if (ptsid != 0) {
                                rc = avc_has_perm(&selinux_state,
                                                  ptsid, new_tsec->sid,
                                                  SECCLASS_PROCESS,
                                                  PROCESS__PTRACE, NULL);
                                if (rc)
                                        return -EPERM;
                        }
                }

                /* Clear any possibly unsafe personality bits on exec: */
                bprm->per_clear |= PER_CLEAR_ON_SETID;

                /* Enable secure mode for SIDs transitions unless
                   the noatsecure permission is granted between
                   the two SIDs, i.e. ahp returns 0. */
                rc = avc_has_perm(&selinux_state,
                                  old_tsec->sid, new_tsec->sid,
                                  SECCLASS_PROCESS, PROCESS__NOATSECURE,
                                  NULL);
                /* A denial here is not an error; it only turns secureexec on. */
                bprm->secureexec |= !!rc;
        }

        return 0;
}

/*
 * iterate_fd() callback: @p is the credential to check against.
 *
 * Returns @fd + 1 when file_has_perm() returns non-zero for the
 * file_to_av() permissions of @file, and 0 otherwise.
 */
static int match_file(const void *p, struct file *file, unsigned fd)
{
        if (file_has_perm(p, file, file_to_av(file)))
                return fd + 1;

        return 0;
}

/*
 * Derived from fs/exec.c:flush_old_files.
 *
 * Revalidate, against @cred, access to the controlling tty and to every
 * open descriptor in @files.  The tty is dropped when read/write access to
 * it is denied, and each descriptor whose file fails the file_has_perm()
 * check is replaced with an open of selinux_null.
 */
static inline void flush_unauthorized_files(const struct cred *cred,
                                            struct files_struct *files)
{
        struct file *file, *devnull = NULL;
        struct tty_struct *tty;
        int drop_tty = 0;
        unsigned n;

        tty = get_current_tty();
        if (tty) {
                spin_lock(&tty->files_lock);
                if (!list_empty(&tty->tty_files)) {
                        struct tty_file_private *file_priv;

                        /* Revalidate access to controlling tty.
                           Use file_path_has_perm on the tty path directly
                           rather than using file_has_perm, as this particular
                           open file may belong to another process and we are
                           only interested in the inode-based check here. */
                        file_priv = list_first_entry(&tty->tty_files,
                                                struct tty_file_private, list);
                        file = file_priv->file;
                        if (file_path_has_perm(cred, file, FILE__READ | FILE__WRITE))
                                drop_tty = 1;
                }
                spin_unlock(&tty->files_lock);
                /* Release the reference taken by get_current_tty(). */
                tty_kref_put(tty);
        }
        /* Reset controlling tty. */
        if (drop_tty)
                no_tty();

        /* Revalidate access to inherited open files. */
        /* match_file() returns fd + 1 for the first denied file, so n is 1-based. */
        n = iterate_fd(files, 0, match_file, cred);
        if (!n) /* none found? */
                return;

        devnull = dentry_open(&selinux_null, O_RDWR, cred);
        /*
         * If the open fails, continue with a NULL replacement file.
         * NOTE(review): replace_fd() with a NULL file presumably just closes
         * the descriptor - confirm against its implementation.
         */
        if (IS_ERR(devnull))
                devnull = NULL;
        /* replace all the matching ones with this */
        do {
                replace_fd(n - 1, devnull, 0);
        } while ((n = iterate_fd(files, n, match_file, cred)) != 0);
        /* Drop the reference obtained from dentry_open(). */
        if (devnull)
                fput(devnull);
}

/*
 * Prepare the current task for the credential change that exec is about
 * to commit.
 *
 * Nothing is done unless the SID in bprm->cred differs from its recorded
 * old SID.  On a SID change, files the new SID may not use are flushed,
 * the parent death signal is cleared, and soft resource limits are reset
 * when the new SID may not inherit them.
 */
static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
{
        struct task_security_struct *tsec = selinux_cred(bprm->cred);
        struct rlimit *cur, *dflt;
        int idx;

        if (tsec->sid == tsec->osid)
                return;

        /* Close files for which the new task SID is not authorized. */
        flush_unauthorized_files(bprm->cred, current->files);

        /* Always clear parent death signal on SID transitions. */
        current->pdeath_signal = 0;

        /* Check whether the new SID can inherit resource limits from the old
         * SID.  If not, reset all soft limits to the lower of the current
         * task's hard limit and the init task's soft limit.
         *
         * Note that the setting of hard limits (even to lower them) can be
         * controlled by the setrlimit check.  The inclusion of the init task's
         * soft limit into the computation is to avoid resetting soft limits
         * higher than the default soft limit for cases where the default is
         * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK.
         */
        if (!avc_has_perm(&selinux_state,
                          tsec->osid, tsec->sid, SECCLASS_PROCESS,
                          PROCESS__RLIMITINH, NULL))
                return;

        /* protect against do_prlimit() */
        task_lock(current);
        for (idx = 0; idx < RLIM_NLIMITS; idx++) {
                cur = &current->signal->rlim[idx];
                dflt = &init_task.signal->rlim[idx];
                cur->rlim_cur = min(cur->rlim_max, dflt->rlim_cur);
        }
        task_unlock(current);

        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                update_rlimit_cpu(current, rlimit(RLIMIT_CPU));
}

/*
 * Clean up the process immediately after the installation of new credentials
 * due to exec
 *
 * Does nothing when the task's SID equals its recorded old SID.  Otherwise
 * signal state is reset if the new SID may not inherit it, and the parent
 * is woken unconditionally.  @bprm is not examined; the SIDs are read from
 * the current task's credentials.
 */
static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        u32 osid, sid;
        int rc;

        osid = tsec->osid;
        sid = tsec->sid;

        /* No SID transition happened: nothing to clean up. */
        if (sid == osid)
                return;

        /* Check whether the new SID can inherit signal state from the old SID.
         * If not, clear itimers to avoid subsequent signal generation and
         * flush and unblock signals.
         *
         * This must occur _after_ the task SID has been updated so that any
         * kill done after the flush will be checked against the new SID.
         */
        rc = avc_has_perm(&selinux_state,
                          osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL);
        if (rc) {
                clear_itimer();

                spin_lock_irq(&current->sighand->siglock);
                /* Signal state is left untouched when a fatal signal is pending. */
                if (!fatal_signal_pending(current)) {
                        flush_sigqueue(&current->pending);
                        flush_sigqueue(&current->signal->shared_pending);
                        flush_signal_handlers(current, 1);
                        sigemptyset(&current->blocked);
                        recalc_sigpending();
                }
                spin_unlock_irq(&current->sighand->siglock);
        }

        /* Wake up the parent if it is waiting so that it can recheck
         * wait permission to the new task SID. */
        read_lock(&tasklist_lock);
        __wake_up_parent(current, current->real_parent);
        read_unlock(&tasklist_lock);
}

/* superblock security operations */

/*
 * Allocate and initialize the SELinux security blob for @sb.
 *
 * The new blob starts with the unlabeled initial SID for the superblock
 * and its mountpoint and the file initial SID as the default, and is
 * stored in sb->s_security.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int selinux_sb_alloc_security(struct super_block *sb)
{
        struct superblock_security_struct *sbsec;

        sbsec = kzalloc(sizeof(*sbsec), GFP_KERNEL);
        if (!sbsec)
                return -ENOMEM;

        /* Synchronization primitives and the per-superblock inode list. */
        mutex_init(&sbsec->lock);
        spin_lock_init(&sbsec->isec_lock);
        INIT_LIST_HEAD(&sbsec->isec_head);

        /* Back-pointer and initial labels. */
        sbsec->sb = sb;
        sbsec->sid = SECINITSID_UNLABELED;
        sbsec->mntpoint_sid = SECINITSID_UNLABELED;
        sbsec->def_sid = SECINITSID_FILE;

        sb->s_security = sbsec;
        return 0;
}

/* Release the SELinux state of @sb by delegating to superblock_free_security(). */
static void selinux_sb_free_security(struct super_block *sb)
{
        superblock_free_security(sb);
}

/*
 * Return the length of the first mount option in @s.
 *
 * The option ends at the first comma that is not inside double quotes, or
 * at the end of the string.  Each '"' toggles the quoted state, so an
 * unterminated quote extends the option to the end of @s.
 */
static inline int opt_len(const char *s)
{
        const char *p = s;
        bool in_quotes = false;

        while (*p != '\0') {
                if (*p == '"')
                        in_quotes = !in_quotes;
                else if (*p == ',' && !in_quotes)
                        break;
                p++;
        }

        return (int)(p - s);
}

/*
 * Extract the SELinux mount options from the comma-separated string
 * @options.
 *
 * Options recognized by match_opt_prefix() are handed to selinux_add_opt()
 * together with *@mnt_opts and are dropped from @options.  All other
 * options are compacted toward the front of @options in place, keeping
 * their separating commas, and the result is NUL-terminated.
 *
 * Returns 0 on success.  On failure returns -ENOMEM or the
 * selinux_add_opt() error, after freeing and clearing *@mnt_opts.
 */
static int selinux_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        char *from = options;
        char *to = options;
        bool first = true;
        int rc;

        while (1) {
                /* Length of the option starting at 'from', excluding the comma. */
                int len = opt_len(from);
                int token;
                char *arg = NULL;

                token = match_opt_prefix(from, len, &arg);

                if (token != Opt_error) {
                        char *p, *q;

                        /* strip quotes */
                        if (arg) {
                                /* Compact in place: q trails p, skipping '"'. */
                                for (p = q = arg; p < from + len; p++) {
                                        char c = *p;
                                        if (c != '"')
                                                *q++ = c;
                                }
                                /* Hand a private, NUL-terminated copy to selinux_add_opt(). */
                                arg = kmemdup_nul(arg, q - arg, GFP_KERNEL);
                                if (!arg) {
                                        rc = -ENOMEM;
                                        goto free_opt;
                                }
                        }
                        rc = selinux_add_opt(token, arg, mnt_opts);
                        if (unlikely(rc)) {
                                /* The copy was not consumed; free it here. */
                                kfree(arg);
                                goto free_opt;
                        }
                } else {
                        if (!first) {        // copy with preceding comma
                                from--;
                                len++;
                        }
                        /* Regions may overlap once an earlier option was dropped. */
                        if (to != from)
                                memmove(to, from, len);
                        to += len;
                        first = false;
                }
                /* Stop at the terminating NUL, otherwise step over the comma. */
                if (!from[len])
                        break;
                from += len + 1;
        }
        *to = '\0';
        return 0;

free_opt:
        if (*mnt_opts) {
                selinux_free_mnt_opts(*mnt_opts);
                *mnt_opts = NULL;
        }
        return rc;
}

/*
 * Validate the SELinux mount options supplied for a remount of @sb.
 *
 * Each context option present in @mnt_opts is parsed to a SID and passed
 * to bad_option() together with the corresponding existing SID.  The
 * remount is rejected with -EINVAL when bad_option() flags any of them.
 * Superblocks not yet marked SE_SBINITIALIZED and a NULL @mnt_opts are
 * accepted without checks.
 *
 * Returns 0 when acceptable, a parse_sid() error, or -EINVAL.
 */
static int selinux_sb_remount(struct super_block *sb, void *mnt_opts)
{
        struct superblock_security_struct *sbsec = sb->s_security;
        struct selinux_mnt_opts *opts = mnt_opts;
        struct inode_security_struct *root_isec;
        u32 newsid;
        int rc;

        if (!(sbsec->flags & SE_SBINITIALIZED) || !opts)
                return 0;

        if (opts->fscontext) {
                rc = parse_sid(sb, opts->fscontext, &newsid);
                if (rc)
                        return rc;
                if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, newsid))
                        goto out_bad_option;
        }

        if (opts->context) {
                rc = parse_sid(sb, opts->context, &newsid);
                if (rc)
                        return rc;
                if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                               newsid))
                        goto out_bad_option;
        }

        if (opts->rootcontext) {
                /* The root inode's label is what rootcontext is compared to. */
                root_isec = backing_inode_security(sb->s_root);
                rc = parse_sid(sb, opts->rootcontext, &newsid);
                if (rc)
                        return rc;
                if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                               newsid))
                        goto out_bad_option;
        }

        if (opts->defcontext) {
                rc = parse_sid(sb, opts->defcontext, &newsid);
                if (rc)
                        return rc;
                if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, newsid))
                        goto out_bad_option;
        }

        return 0;

out_bad_option:
        pr_warn("SELinux: unable to change security options "
               "during remount (dev %s, type=%s)\n", sb->s_id,
               sb->s_type->name);
        return -EINVAL;
}

/*
 * Check whether the current task holds filesystem mount on @sb, auditing
 * against the superblock's root dentry.  Returns the superblock_has_perm()
 * result.
 */
static int selinux_sb_kern_mount(struct super_block *sb)
{
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = sb->s_root;

        return superblock_has_perm(current_cred(), sb, FILESYSTEM__MOUNT,
                                   &ad);
}

static int selinux_sb_statfs(struct dentry *dentry)
{
        const struct cred *cred = current_cred();
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry->d_sb->s_root;
        return superblock_has_perm(cred, dentry->d_sb, FILESYSTEM__GETATTR, &ad);
}

/*
 * Check whether the current task may mount onto @path.
 *
 * With MS_REMOUNT set in @flags this requires filesystem remount on the
 * superblock of @path; otherwise it requires file mounton on @path.
 * @dev_name, @type and @data are not examined.
 */
static int selinux_mount(const char *dev_name,
                         const struct path *path,
                         const char *type,
                         unsigned long flags,
                         void *data)
{
        const struct cred *cred = current_cred();

        if (!(flags & MS_REMOUNT))
                return path_has_perm(cred, path, FILE__MOUNTON);

        return superblock_has_perm(cred, path->dentry->d_sb,
                                   FILESYSTEM__REMOUNT, NULL);
}

/*
 * Check whether the current task holds file mounton on @to_path.
 * @from_path is not examined.  Returns the path_has_perm() result.
 */
static int selinux_move_mount(const struct path *from_path,
                              const struct path *to_path)
{
        const struct cred *cred = current_cred();

        return path_has_perm(cred, to_path, FILE__MOUNTON);
}

/*
 * Check whether the current task holds filesystem unmount on the
 * superblock of @mnt.  @flags is not examined.  Returns the
 * superblock_has_perm() result.
 */
static int selinux_umount(struct vfsmount *mnt, int flags)
{
        const struct cred *cred = current_cred();

        return superblock_has_perm(cred, mnt->mnt_sb,
                                   FILESYSTEM__UNMOUNT, NULL);
}

/*
 * Duplicate the SELinux mount options of @src_fc into @fc.
 *
 * Nothing is done when the source carries no options.  Otherwise a fresh
 * option structure is stored in fc->security and every context string
 * present in the source is copied with kstrdup().
 *
 * Returns 0 on success or -ENOMEM.  On a failed string copy the partially
 * filled structure stays attached to fc->security.
 * NOTE(review): the caller is presumably responsible for releasing it -
 * confirm against the fs_context teardown path.
 */
static int selinux_fs_context_dup(struct fs_context *fc,
                                  struct fs_context *src_fc)
{
        const struct selinux_mnt_opts *from = src_fc->security;
        struct selinux_mnt_opts *to;

        if (!from)
                return 0;

        to = kzalloc(sizeof(struct selinux_mnt_opts), GFP_KERNEL);
        fc->security = to;
        if (!to)
                return -ENOMEM;

        if (from->fscontext) {
                to->fscontext = kstrdup(from->fscontext, GFP_KERNEL);
                if (!to->fscontext)
                        return -ENOMEM;
        }

        if (from->context) {
                to->context = kstrdup(from->context, GFP_KERNEL);
                if (!to->context)
                        return -ENOMEM;
        }

        if (from->rootcontext) {
                to->rootcontext = kstrdup(from->rootcontext, GFP_KERNEL);
                if (!to->rootcontext)
                        return -ENOMEM;
        }

        if (from->defcontext) {
                to->defcontext = kstrdup(from->defcontext, GFP_KERNEL);
                if (!to->defcontext)
                        return -ENOMEM;
        }

        return 0;
}

/*
 * Mount parameters recognised by SELinux, passed to fs_parse() by
 * selinux_fs_context_parse_param().  The four context options carry a
 * string value; the seclabel option is a bare flag.  The empty entry
 * terminates the table.
 */
static const struct fs_parameter_spec selinux_fs_parameters[] = {
        fsparam_string(CONTEXT_STR,        Opt_context),
        fsparam_string(DEFCONTEXT_STR,        Opt_defcontext),
        fsparam_string(FSCONTEXT_STR,        Opt_fscontext),
        fsparam_string(ROOTCONTEXT_STR,        Opt_rootcontext),
        fsparam_flag  (SECLABEL_STR,        Opt_seclabel),
        {}
};

/*
 * Parse one mount parameter against selinux_fs_parameters and record
 * it in fc->security.
 *
 * A negative result from fs_parse() is returned unchanged.  Otherwise
 * the result of selinux_add_opt() is returned; on success param->string
 * is cleared (NOTE(review): presumably because selinux_add_opt() keeps
 * the pointer - confirm).
 */
static int selinux_fs_context_parse_param(struct fs_context *fc,
                                          struct fs_parameter *param)
{
        struct fs_parse_result result;
        int token;
        int err;

        token = fs_parse(fc, selinux_fs_parameters, param, &result);
        if (token < 0)
                return token;

        err = selinux_add_opt(token, param->string, &fc->security);
        if (err)
                return err;

        param->string = NULL;
        return 0;
}

/* inode security operations */

/*
 * Initialise the SELinux security blob of a new inode.
 *
 * The blob starts out unlabeled (SECINITSID_UNLABELED, class FILE,
 * LABEL_INVALID) and records the SID of the current task as task_sid.
 * Always returns 0.
 */
static int selinux_inode_alloc_security(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        spin_lock_init(&isec->lock);
        INIT_LIST_HEAD(&isec->list);

        isec->inode = inode;
        isec->task_sid = current_sid();
        isec->sid = SECINITSID_UNLABELED;
        isec->sclass = SECCLASS_FILE;
        isec->initialized = LABEL_INVALID;

        return 0;
}

/* LSM hook: release the inode's SELinux state via inode_free_security(). */
static void selinux_inode_free_security(struct inode *inode)
{
        inode_free_security(inode);
}

/*
 * Compute the security context a new object named @name with type
 * @mode would receive in the parent directory of @dentry, using the
 * current task's credentials.
 *
 * On success the context string and its length are stored through
 * @ctx and @ctxlen by security_sid_to_context().  Returns 0 or the
 * error from the label computation / SID conversion.
 */
static int selinux_dentry_init_security(struct dentry *dentry, int mode,
                                        const struct qstr *name, void **ctx,
                                        u32 *ctxlen)
{
        u32 sid;
        int err;

        err = selinux_determine_inode_label(selinux_cred(current_cred()),
                                            d_inode(dentry->d_parent), name,
                                            inode_mode_to_security_class(mode),
                                            &sid);
        if (err)
                return err;

        return security_sid_to_context(&selinux_state, sid, (char **)ctx,
                                       ctxlen);
}

/*
 * Determine the label a new object named @name with type @mode would
 * get in the parent directory of @dentry when created with the @old
 * credentials, and store it as the create_sid of @new.
 *
 * Returns 0 on success or the error from the label computation, in
 * which case @new is left untouched.
 */
static int selinux_dentry_create_files_as(struct dentry *dentry, int mode,
                                          struct qstr *name,
                                          const struct cred *old,
                                          struct cred *new)
{
        struct task_security_struct *new_tsec;
        u32 sid;
        int err;

        err = selinux_determine_inode_label(selinux_cred(old),
                                            d_inode(dentry->d_parent), name,
                                            inode_mode_to_security_class(mode),
                                            &sid);
        if (err)
                return err;

        new_tsec = selinux_cred(new);
        new_tsec->create_sid = sid;

        return 0;
}

/*
 * Label a newly created @inode in directory @dir and, where the
 * filesystem supports it, hand back the xattr to store on disk.
 *
 * The in-core label is only set when the superblock is already
 * initialised (SE_SBINITIALIZED).  That happens before the
 * -EOPNOTSUPP check below, so the inode can be labeled in core even
 * when no xattr is returned.
 *
 * Returns 0 with @name / @value / @len filled in (each only if the
 * caller supplied the pointer; @value and @len only as a pair),
 * -EOPNOTSUPP when SELinux is not initialised or the superblock lacks
 * SBLABEL_MNT, or an error from the label computation / conversion.
 */
static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
                                       const struct qstr *qstr,
                                       const char **name,
                                       void **value, size_t *len)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct superblock_security_struct *sbsec = dir->i_sb->s_security;
        u32 newsid = tsec->create_sid;
        u32 ctx_len;
        char *ctx;
        int err;

        err = selinux_determine_inode_label(tsec, dir, qstr,
                inode_mode_to_security_class(inode->i_mode),
                &newsid);
        if (err)
                return err;

        /* Possibly defer initialization to selinux_complete_init. */
        if (sbsec->flags & SE_SBINITIALIZED) {
                struct inode_security_struct *isec = selinux_inode(inode);

                isec->sclass = inode_mode_to_security_class(inode->i_mode);
                isec->sid = newsid;
                isec->initialized = LABEL_INITIALIZED;
        }

        if (!selinux_initialized(&selinux_state) ||
            !(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;

        if (name)
                *name = XATTR_SELINUX_SUFFIX;

        if (!value || !len)
                return 0;

        err = security_sid_to_context_force(&selinux_state, newsid,
                                            &ctx, &ctx_len);
        if (err)
                return err;

        *value = ctx;
        *len = ctx_len;

        return 0;
}

/*
 * LSM hook for creating a regular file: delegates to may_create() with
 * SECCLASS_FILE.  @mode is not consulted.
 */
static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        return may_create(dir, dentry, SECCLASS_FILE);
}

/*
 * LSM hook for creating a hard link: delegates to may_link() with
 * MAY_LINK on the existing dentry.  @new_dentry is not consulted.
 */
static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
{
        return may_link(dir, old_dentry, MAY_LINK);
}

/* LSM hook for unlink: delegates to may_link() with MAY_UNLINK. */
static int selinux_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        return may_link(dir, dentry, MAY_UNLINK);
}

/*
 * LSM hook for creating a symbolic link: delegates to may_create() with
 * SECCLASS_LNK_FILE.  The link target @name is not consulted.
 */
static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const char *name)
{
        return may_create(dir, dentry, SECCLASS_LNK_FILE);
}

/*
 * LSM hook for mkdir: delegates to may_create() with SECCLASS_DIR.
 * @mask is not consulted.
 */
static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask)
{
        return may_create(dir, dentry, SECCLASS_DIR);
}

/* LSM hook for rmdir: delegates to may_link() with MAY_RMDIR. */
static int selinux_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        return may_link(dir, dentry, MAY_RMDIR);
}

/*
 * LSM hook for mknod: delegates to may_create() with the security class
 * derived from @mode.  @dev is not consulted.
 */
static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        return may_create(dir, dentry, inode_mode_to_security_class(mode));
}

/* LSM hook for rename: all checks are performed by may_rename(). */
static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dentry,
                                struct inode *new_inode, struct dentry *new_dentry)
{
        return may_rename(old_inode, old_dentry, new_inode, new_dentry);
}

/* Reading a symlink requires FILE__READ on @dentry for the current task. */
static int selinux_inode_readlink(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__READ);
}

/*
 * Check FILE__READ on @inode before a symlink is followed.
 *
 * @rcu is forwarded to inode_security_rcu() and, when set, the AVC
 * check is made with MAY_NOT_BLOCK.  Returns the error encoded in the
 * inode security lookup if it fails, otherwise the AVC result.
 */
static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
                                     bool rcu)
{
        const struct cred *cred = current_cred();
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        u32 sid;

        validate_creds(cred);

        isec = inode_security_rcu(inode, rcu);
        if (IS_ERR(isec))
                return PTR_ERR(isec);

        sid = cred_sid(cred);

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        return avc_has_perm_flags(&selinux_state,
                                  sid, isec->sid, isec->sclass, FILE__READ, &ad,
                                  rcu ? MAY_NOT_BLOCK : 0);
}

/*
 * Emit the audit record for an inode permission decision.
 *
 * Builds inode-typed audit data for @inode and passes it, together with
 * the already computed @perms / @audited / @denied / @result, to
 * slow_avc_audit().  Returns whatever slow_avc_audit() returns.
 */
static noinline int audit_inode_permission(struct inode *inode,
                                           u32 perms, u32 audited, u32 denied,
                                           int result)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_INODE;
        ad.u.inode = inode;

        return slow_avc_audit(&selinux_state,
                              current_sid(), isec->sid, isec->sclass, perms,
                              audited, denied, result, &ad);
}

/*
 * Check the read/write/exec/append bits of @mask against @inode for the
 * current task.
 *
 * Returns 0 immediately for an empty access mask or a private inode.
 * Otherwise the AVC is queried without auditing; if no audit record is
 * needed the AVC result is returned as is.  When auditing is needed and
 * MAY_NOT_BLOCK was set in @mask, -ECHILD is returned; otherwise the
 * record is emitted and an audit failure takes precedence over the AVC
 * result.
 */
static int selinux_inode_permission(struct inode *inode, int mask)
{
        const struct cred *cred = current_cred();
        /* Sample the modifier bits before the mask is narrowed below. */
        const bool from_access = mask & MAY_ACCESS;
        const bool no_block = mask & MAY_NOT_BLOCK;
        struct inode_security_struct *isec;
        struct av_decision avd;
        u32 perms, audited, denied;
        int rc, audit_rc;

        mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);

        /* No permission to check.  Existence test. */
        if (!mask)
                return 0;

        validate_creds(cred);

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        perms = file_mask_to_av(inode->i_mode, mask);

        isec = inode_security_rcu(inode, no_block);
        if (IS_ERR(isec))
                return PTR_ERR(isec);

        rc = avc_has_perm_noaudit(&selinux_state,
                                  cred_sid(cred), isec->sid, isec->sclass,
                                  perms,
                                  no_block ? AVC_NONBLOCKING : 0,
                                  &avd);
        audited = avc_audit_required(perms, &avd, rc,
                                     from_access ? FILE__AUDIT_ACCESS : 0,
                                     &denied);
        if (likely(!audited))
                return rc;

        /* fall back to ref-walk if we have to generate audit */
        if (no_block)
                return -ECHILD;

        audit_rc = audit_inode_permission(inode, perms, audited, denied, rc);

        return audit_rc ? audit_rc : rc;
}

/*
 * Permission check for changing inode attributes.
 *
 * - With ATTR_FORCE set, the kill-suid/sgid, mode and force bits are
 *   ignored; if nothing else remains the change is allowed.
 * - Mode, owner, group or explicit timestamp changes need
 *   FILE__SETATTR.
 * - Anything else needs FILE__WRITE, plus FILE__OPEN for a size change
 *   not made through an open file (no ATTR_FILE) on a non-sockfs inode
 *   when the openperm policy capability is enabled.
 */
static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr)
{
        const struct cred *cred = current_cred();
        struct inode *inode = d_backing_inode(dentry);
        unsigned int valid = iattr->ia_valid;
        __u32 perms;

        /* ATTR_FORCE is just used for ATTR_KILL_S[UG]ID. */
        if (valid & ATTR_FORCE) {
                valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE |
                           ATTR_FORCE);
                if (!valid)
                        return 0;
        }

        if (valid & (ATTR_MODE | ATTR_UID | ATTR_GID |
                     ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET))
                return dentry_has_perm(cred, dentry, FILE__SETATTR);

        perms = FILE__WRITE;
        if (selinux_policycap_openperm() &&
            inode->i_sb->s_magic != SOCKFS_MAGIC &&
            (valid & ATTR_SIZE) &&
            !(valid & ATTR_FILE))
                perms |= FILE__OPEN;

        return dentry_has_perm(cred, dentry, perms);
}

/* Reading attributes requires FILE__GETATTR on @path for the current task. */
static int selinux_inode_getattr(const struct path *path)
{
        return path_has_perm(current_cred(), path, FILE__GETATTR);
}

/*
 * Report whether the current task holds CAP_MAC_ADMIN.
 *
 * Both cap_capable() (against init_user_ns) and the SELinux capability
 * check must succeed.  The SELinux check is skipped when cap_capable()
 * already refuses.  @audit selects CAP_OPT_NONE (true) or
 * CAP_OPT_NOAUDIT (false) for both checks.
 */
static bool has_cap_mac_admin(bool audit)
{
        const struct cred *cred = current_cred();
        unsigned int opts = audit ? CAP_OPT_NONE : CAP_OPT_NOAUDIT;

        return !cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts) &&
               !cred_has_capability(cred, CAP_MAC_ADMIN, opts, true);
}

/*
 * Permission check for setting an extended attribute on @dentry.
 *
 * For any attribute other than the SELinux label, the capability check
 * in cap_inode_setxattr() is applied, followed by FILE__SETATTR.
 *
 * For the SELinux label itself:
 *  - before policy is loaded, only ownership/capability is checked;
 *  - -EOPNOTSUPP if the superblock lacks SBLABEL_MNT;
 *  - -EPERM if the caller is neither owner nor capable;
 *  - the caller needs FILE__RELABELFROM on the current label and
 *    FILE__RELABELTO on the new one, the transition must validate, and
 *    the new label needs FILESYSTEM__ASSOCIATE on the superblock.
 *
 * A context that the current policy rejects (-EINVAL) is only accepted
 * (via the _force conversion) when the caller has CAP_MAC_ADMIN;
 * otherwise the offending value is audited and -EINVAL returned.
 *
 * Returns 0 if the operation is permitted, a negative errno otherwise.
 */
static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        struct inode_security_struct *isec;
        struct superblock_security_struct *sbsec;
        struct common_audit_data ad;
        u32 newsid, sid = current_sid();
        int rc = 0;

        if (strcmp(name, XATTR_NAME_SELINUX)) {
                rc = cap_inode_setxattr(dentry, name, value, size, flags);
                if (rc)
                        return rc;

                /* Not an attribute we recognize, so just check the
                   ordinary setattr permission. */
                return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
        }

        if (!selinux_initialized(&selinux_state))
                return (inode_owner_or_capable(inode) ? 0 : -EPERM);

        sbsec = inode->i_sb->s_security;
        if (!(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;

        if (!inode_owner_or_capable(inode))
                return -EPERM;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        isec = backing_inode_security(dentry);
        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, isec->sclass,
                          FILE__RELABELFROM, &ad);
        if (rc)
                return rc;

        rc = security_context_to_sid(&selinux_state, value, size, &newsid,
                                     GFP_KERNEL);
        if (rc == -EINVAL) {
                if (!has_cap_mac_admin(true)) {
                        struct audit_buffer *ab;
                        size_t audit_size;

                        /* We strip a nul only if it is at the end, otherwise the
                         * context contains a nul and we should audit that.
                         * A zero-length value has no last byte to inspect, so
                         * it must not be indexed at size - 1. */
                        if (value && size) {
                                const char *str = value;

                                if (str[size - 1] == '\0')
                                        audit_size = size - 1;
                                else
                                        audit_size = size;
                        } else {
                                audit_size = 0;
                        }
                        ab = audit_log_start(audit_context(),
                                             GFP_ATOMIC, AUDIT_SELINUX_ERR);
                        audit_log_format(ab, "op=setxattr invalid_context=");
                        audit_log_n_untrustedstring(ab, value, audit_size);
                        audit_log_end(ab);

                        return rc;
                }
                rc = security_context_to_sid_force(&selinux_state, value,
                                                   size, &newsid);
        }
        if (rc)
                return rc;

        rc = avc_has_perm(&selinux_state,
                          sid, newsid, isec->sclass,
                          FILE__RELABELTO, &ad);
        if (rc)
                return rc;

        rc = security_validate_transition(&selinux_state, isec->sid, newsid,
                                          sid, isec->sclass);
        if (rc)
                return rc;

        return avc_has_perm(&selinux_state,
                            newsid,
                            sbsec->sid,
                            SECCLASS_FILESYSTEM,
                            FILESYSTEM__ASSOCIATE,
                            &ad);
}

/*
 * Update the in-core label of an inode after its SELinux xattr was set.
 *
 * Does nothing for other attributes or before policy is loaded.  The
 * new value is converted with the _force variant; if that fails an
 * error is logged and the in-core label is left unchanged.  Otherwise
 * class, SID and the initialized state are updated under isec->lock.
 */
static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name,
                                        const void *value, size_t size,
                                        int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        struct inode_security_struct *isec;
        u32 newsid;
        int rc;

        if (strcmp(name, XATTR_NAME_SELINUX)) {
                /* Not an attribute we recognize, so nothing to do. */
                return;
        }

        if (!selinux_initialized(&selinux_state)) {
                /* If we haven't even been initialized, then we can't validate
                 * against a policy, so leave the label as invalid. It may
                 * resolve to a valid label on the next revalidation try if
                 * we've since initialized.
                 */
                return;
        }

        rc = security_context_to_sid_force(&selinux_state, value, size,
                                           &newsid);
        if (rc) {
                /* Kept on one line so the message has its separating space. */
                pr_err("SELinux:  unable to map context to SID for (%s, %lu), rc=%d\n",
                       inode->i_sb->s_id, inode->i_ino, -rc);
                return;
        }

        isec = backing_inode_security(dentry);
        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = newsid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);
}

/*
 * Reading any extended attribute requires FILE__GETATTR on @dentry.
 * The attribute @name is not consulted.
 */
static int selinux_inode_getxattr(struct dentry *dentry, const char *name)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/* Listing extended attributes requires FILE__GETATTR on @dentry. */
static int selinux_inode_listxattr(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/*
 * Permission check for removing an extended attribute.
 *
 * The SELinux label can only be removed while SELinux is not yet
 * initialised; afterwards the request fails with -EACCES.  Any other
 * attribute goes through cap_inode_removexattr() and then needs
 * FILE__SETATTR.
 */
static int selinux_inode_removexattr(struct dentry *dentry, const char *name)
{
        int rc;

        if (!strcmp(name, XATTR_NAME_SELINUX)) {
                if (!selinux_initialized(&selinux_state))
                        return 0;

                /* No one is allowed to remove a SELinux security label.
                   You can change the label, but all data must be labeled. */
                return -EACCES;
        }

        rc = cap_inode_removexattr(dentry, name);
        if (rc)
                return rc;

        /* Not an attribute we recognize, so just check the
           ordinary setattr permission. */
        return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
}

/*
 * Permission check for placing an fsnotify mark on @path.
 *
 * The base permission depends on @obj_type (mount, superblock or inode
 * watch); a superblock watch additionally needs FILESYSTEM__WATCH on
 * the superblock.  Extra file permissions are required for permission
 * events and for read-like events in @mask.  Unknown object types
 * yield -EINVAL.
 */
static int selinux_path_notify(const struct path *path, u64 mask,
                                                unsigned int obj_type)
{
        const struct cred *cred = current_cred();
        struct common_audit_data ad;
        u32 av;
        int rc;

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = *path;

        /*
         * Set permission needed based on the type of mark being set.
         * Performs an additional check for sb watches.
         */
        switch (obj_type) {
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                av = FILE__WATCH_MOUNT;
                break;
        case FSNOTIFY_OBJ_TYPE_SB:
                av = FILE__WATCH_SB;
                rc = superblock_has_perm(cred, path->dentry->d_sb,
                                         FILESYSTEM__WATCH, &ad);
                if (rc)
                        return rc;
                break;
        case FSNOTIFY_OBJ_TYPE_INODE:
                av = FILE__WATCH;
                break;
        default:
                return -EINVAL;
        }

        /* blocking watches require the file:watch_with_perm permission */
        if (mask & (ALL_FSNOTIFY_PERM_EVENTS))
                av |= FILE__WATCH_WITH_PERM;

        /* watches on read-like events need the file:watch_reads permission */
        if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE))
                av |= FILE__WATCH_READS;

        return path_has_perm(cred, path, av);
}

/*
 * Produce the security context string of @inode.
 *
 * Permission check is handled by selinux_inode_getxattr hook.
 *
 * Returns the context length on success.  When @alloc is true the
 * allocated string is handed to the caller through @buffer; otherwise
 * it is freed here and only the length is reported.  Returns
 * -EOPNOTSUPP before policy load or for a name other than the SELinux
 * suffix, or the error from the SID conversion.
 */
static int selinux_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
{
        struct inode_security_struct *isec;
        char *context = NULL;
        u32 size;
        int error;

        /*
         * If we're not initialized yet, then we can't validate contexts, so
         * just let vfs_getxattr fall back to using the on-disk xattr.
         */
        if (!selinux_initialized(&selinux_state) ||
            strcmp(name, XATTR_SELINUX_SUFFIX))
                return -EOPNOTSUPP;

        /*
         * If the caller has CAP_MAC_ADMIN, then get the raw context
         * value even if it is not defined by current policy; otherwise,
         * use the in-core value under current policy.
         * Use the non-auditing forms of the permission checks since
         * getxattr may be called by unprivileged processes commonly
         * and lack of permission just means that we fall back to the
         * in-core context value, not a denial.
         */
        isec = inode_security(inode);
        if (has_cap_mac_admin(false)) {
                error = security_sid_to_context_force(&selinux_state,
                                                      isec->sid, &context,
                                                      &size);
        } else {
                error = security_sid_to_context(&selinux_state, isec->sid,
                                                &context, &size);
        }
        if (error)
                return error;

        if (alloc)
                *buffer = context;
        else
                kfree(context);

        error = size;
        return error;
}

/*
 * Set the in-core SELinux label of @inode from a context string.
 *
 * Returns -EOPNOTSUPP for a name other than the SELinux suffix or when
 * the superblock lacks SBLABEL_MNT, -EACCES for an empty value, or the
 * error from the context-to-SID conversion.  On success class, SID and
 * the initialized state are updated under isec->lock and 0 is returned.
 */
static int selinux_inode_setsecurity(struct inode *inode, const char *name,
                                     const void *value, size_t size, int flags)
{
        struct inode_security_struct *isec = inode_security_novalidate(inode);
        struct superblock_security_struct *sbsec = inode->i_sb->s_security;
        u32 sid;
        int err;

        if (strcmp(name, XATTR_SELINUX_SUFFIX) ||
            !(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;

        if (!value || !size)
                return -EACCES;

        err = security_context_to_sid(&selinux_state, value, size, &sid,
                                      GFP_KERNEL);
        if (err)
                return err;

        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);

        return 0;
}

/*
 * Report the name of the SELinux xattr.
 *
 * Returns 0 before policy load.  Otherwise returns the size of the
 * attribute name including its terminating NUL, and copies the name
 * into @buffer when one is supplied and @buffer_size is large enough.
 * The size is returned even when nothing was copied.
 */
static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
{
        const int name_size = sizeof(XATTR_NAME_SELINUX);

        if (!selinux_initialized(&selinux_state))
                return 0;

        if (buffer && name_size <= buffer_size)
                memcpy(buffer, XATTR_NAME_SELINUX, name_size);

        return name_size;
}

/*
 * Store the current in-core SID of @inode in @secid.  The label is
 * fetched with inode_security_novalidate().
 */
static void selinux_inode_getsecid(struct inode *inode, u32 *secid)
{
        *secid = inode_security_novalidate(inode)->sid;
}

/*
 * Prepare credentials for a copy-up of @src.
 *
 * If the caller passed no credentials in *@new, a fresh set is obtained
 * from prepare_creds() (-ENOMEM on failure).  The create_sid of those
 * credentials is set to the SID of @src's inode and the credentials are
 * returned through @new.
 */
static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
{
        struct cred *creds = *new;
        struct task_security_struct *tsec;
        u32 src_sid;

        if (!creds) {
                creds = prepare_creds();
                if (!creds)
                        return -ENOMEM;
        }

        /* Get label from overlay inode and set it in create_sid */
        selinux_inode_getsecid(d_inode(src), &src_sid);

        tsec = selinux_cred(creds);
        tsec->create_sid = src_sid;

        *new = creds;
        return 0;
}

/*
 * Decide whether the xattr @name may be copied up.
 *
 * The copy_up hook sets the initial context on the new inode, so the
 * SELinux xattr of the lower inode must not overwrite it: return 1
 * (discard) for it.  Every other attribute is not claimed by SELinux
 * and yields -EOPNOTSUPP.
 */
static int selinux_inode_copy_up_xattr(const char *name)
{
        if (strcmp(name, XATTR_NAME_SELINUX))
                return -EOPNOTSUPP;

        return 1; /* Discard */
}

/* kernfs node operations */

/*
 * Label a new kernfs node @kn based on the SELinux xattr of its parent
 * directory node @kn_dir.
 *
 * If the parent carries no SELinux xattr (-ENODATA) nothing is done and
 * 0 is returned.  Otherwise the new node's SID is the current task's
 * create_sid when one is set, or else the result of a transition
 * computation from the task SID, the parent SID, the node's class and
 * its name.  The resulting context is stored on @kn with XATTR_CREATE.
 *
 * Returns 0 on success or a negative errno.
 */
static int selinux_kernfs_init_security(struct kernfs_node *kn_dir,
                                        struct kernfs_node *kn)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        u32 parent_sid, newsid, clen;
        int rc;
        char *context;

        /* Size query: no buffer is passed, the result is used as length. */
        rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, NULL, 0);
        if (rc == -ENODATA)
                return 0;
        else if (rc < 0)
                return rc;

        clen = (u32)rc;
        context = kmalloc(clen, GFP_KERNEL);
        if (!context)
                return -ENOMEM;

        /* Second call fetches the parent's context into the buffer. */
        rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, context, clen);
        if (rc < 0) {
                kfree(context);
                return rc;
        }

        rc = security_context_to_sid(&selinux_state, context, clen, &parent_sid,
                                     GFP_KERNEL);
        /* The parent's context string is no longer needed past this point. */
        kfree(context);
        if (rc)
                return rc;

        if (tsec->create_sid) {
                newsid = tsec->create_sid;
        } else {
                u16 secclass = inode_mode_to_security_class(kn->mode);
                struct qstr q;

                q.name = kn->name;
                q.hash_len = hashlen_string(kn_dir, kn->name);

                rc = security_transition_sid(&selinux_state, tsec->sid,
                                             parent_sid, secclass, &q,
                                             &newsid);
                if (rc)
                        return rc;
        }

        /* context/clen are reused for the new node's context string. */
        rc = security_sid_to_context_force(&selinux_state, newsid,
                                           &context, &clen);
        if (rc)
                return rc;

        rc = kernfs_xattr_set(kn, XATTR_NAME_SELINUX, context, clen,
                              XATTR_CREATE);
        kfree(context);
        return rc;
}


/* file security operations */

static int selinux_revalidate_file_permission(struct file *file, int mask)
{
        const struct cred *cred = current_cred();
        struct inode *inode = file_inode(file);

        /* file_mask_to_av won't add FILE__WRITE if MAY_APPEND is set */
        if ((file->f_flags & O_APPEND) && (mask & MAY_WRITE))
                mask |= MAY_APPEND;

        return file_has_perm(cred, file,
                             file_mask_to_av(inode->i_mode, mask));
}

/*
 * Permission check for access to an open file.
 *
 * An empty @mask is allowed outright.  The full check is skipped when
 * the task SID, the inode SID and the policy sequence number all match
 * the values recorded in the file security blob; otherwise the
 * permission is revalidated.
 */
static int selinux_file_permission(struct file *file, int mask)
{
        struct inode *inode = file_inode(file);
        struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec;
        u32 sid = current_sid();

        /* No permission to check.  Existence test. */
        if (!mask)
                return 0;

        isec = inode_security(inode);

        /* Anything changed since the file_open check?  Then recheck. */
        if (sid != fsec->sid || fsec->isid != isec->sid ||
            fsec->pseqno != avc_policy_seqno(&selinux_state))
                return selinux_revalidate_file_permission(file, mask);

        return 0;
}

/*
 * Initialise the security blob of a new file: both the file SID and
 * the file-owner SID start out as the SID of the current task.
 * Always returns 0.
 */
static int selinux_file_alloc_security(struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);

        fsec->sid = fsec->fown_sid = current_sid();

        return 0;
}

/*
 * Check whether @cred may perform ioctl command @cmd on @file.
 *
 * If the credential's SID differs from the SID recorded for the file,
 * FD__USE on the descriptor is checked first.  Private inodes are then
 * allowed without further checks.  Otherwise the extended permission
 * check is made with @requested, using the high byte of @cmd as the
 * driver and the low byte as the command number.
 */
static int ioctl_has_perm(const struct cred *cred, struct file *file,
                u32 requested, u16 cmd)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode *inode = file_inode(file);
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        struct lsm_ioctlop_audit ioctl;
        u32 ssid = cred_sid(cred);
        u8 driver = cmd >> 8;
        u8 xperm = cmd & 0xff;

        ioctl.cmd = cmd;
        ioctl.path = file->f_path;
        ad.type = LSM_AUDIT_DATA_IOCTL_OP;
        ad.u.op = &ioctl;

        if (ssid != fsec->sid) {
                int rc = avc_has_perm(&selinux_state,
                                      ssid, fsec->sid,
                                      SECCLASS_FD,
                                      FD__USE,
                                      &ad);
                if (rc)
                        return rc;
        }

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        isec = inode_security(inode);

        return avc_has_extended_perms(&selinux_state,
                                      ssid, isec->sid, isec->sclass,
                                      requested, driver, xperm, &ad);
}

/*
 * Permission check for an ioctl on @file.
 *
 * A handful of generic commands map to fixed permissions (getattr,
 * setattr, plain descriptor use, or CAP_SYS_TTY_CONFIG).  FIOCLEX and
 * FIONCLEX are exempt when the ioctl_skip_cloexec policy capability is
 * set.  Every other command goes through the per-command ioctl check.
 * @arg is not consulted.
 */
static int selinux_file_ioctl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        const struct cred *cred = current_cred();

        switch (cmd) {
        case FIONREAD:
        case FIBMAP:
        case FIGETBSZ:
        case FS_IOC_GETFLAGS:
        case FS_IOC_GETVERSION:
                return file_has_perm(cred, file, FILE__GETATTR);

        case FS_IOC_SETFLAGS:
        case FS_IOC_SETVERSION:
                return file_has_perm(cred, file, FILE__SETATTR);

        /* sys_ioctl() checks */
        case FIONBIO:
        case FIOASYNC:
                return file_has_perm(cred, file, 0);

        case KDSKBENT:
        case KDSKBSENT:
                return cred_has_capability(cred, CAP_SYS_TTY_CONFIG,
                                           CAP_OPT_NONE, true);

        case FIOCLEX:
        case FIONCLEX:
                if (selinux_policycap_ioctl_skip_cloexec())
                        return 0;
                return ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd);

        /* default case assumes that the command will go
         * to the file's ioctl() function.
         */
        default:
                return ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd);
        }
}

/*
 * Compat ioctl entry point.
 *
 * If we are in a 64-bit kernel running 32-bit userspace, the 32-bit
 * flag/version command numbers are translated to their native
 * equivalents first, so that selinux_file_ioctl() does not compare
 * 32-bit commands against 64-bit ones.  All other commands pass through
 * unchanged.
 */
static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        if (cmd == FS_IOC32_GETFLAGS)
                cmd = FS_IOC_GETFLAGS;
        else if (cmd == FS_IOC32_SETFLAGS)
                cmd = FS_IOC_SETFLAGS;
        else if (cmd == FS_IOC32_GETVERSION)
                cmd = FS_IOC_GETVERSION;
        else if (cmd == FS_IOC32_SETVERSION)
                cmd = FS_IOC_SETVERSION;

        return selinux_file_ioctl(file, cmd, arg);
}

/*
 * When non-zero, file_map_prot_check() and selinux_file_mprotect()
 * apply the additional execmem / execheap / execstack / execmod checks
 * to executable mappings.  Its value is assigned outside this part of
 * the file.
 */
static int default_noexec __ro_after_init;

/*
 * Check the permissions implied by mapping @file (or anonymous memory
 * when @file is NULL) with protection @prot.
 *
 * With default_noexec set, an executable mapping that is anonymous,
 * backed by a private inode, or private and writable needs
 * PROCESS__EXECMEM.  File-backed mappings then need FILE__READ, plus
 * FILE__WRITE for shared writable mappings and FILE__EXECUTE for
 * executable ones.
 */
static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
{
        const struct cred *cred = current_cred();
        u32 sid = cred_sid(cred);
        u32 av;

        if (default_noexec &&
            (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) ||
                                   (!shared && (prot & PROT_WRITE)))) {
                /*
                 * We are making executable an anonymous mapping or a
                 * private file mapping that will also be writable.
                 * This has an additional check.
                 */
                int rc = avc_has_perm(&selinux_state,
                                      sid, sid, SECCLASS_PROCESS,
                                      PROCESS__EXECMEM, NULL);
                if (rc)
                        return rc;
        }

        /* Anonymous mappings have no file permissions to check. */
        if (!file)
                return 0;

        /* read access is always possible with a mapping */
        av = FILE__READ;

        /* write access only matters if the mapping is shared */
        if (shared && (prot & PROT_WRITE))
                av |= FILE__WRITE;

        if (prot & PROT_EXEC)
                av |= FILE__EXECUTE;

        return file_has_perm(cred, file, av);
}

/*
 * Mapping an address below CONFIG_LSM_MMAP_MIN_ADDR requires
 * MEMPROTECT__MMAP_ZERO for the current task; higher addresses are
 * always allowed.
 */
static int selinux_mmap_addr(unsigned long addr)
{
        u32 sid;

        if (!(addr < CONFIG_LSM_MMAP_MIN_ADDR))
                return 0;

        sid = current_sid();

        return avc_has_perm(&selinux_state,
                            sid, sid, SECCLASS_MEMPROTECT,
                            MEMPROTECT__MMAP_ZERO, NULL);
}

/*
 * Permission check for mmap().
 *
 * File-backed mappings first need FILE__MAP on the inode.  The
 * protection checks then use @reqprot instead of @prot when
 * checkreqprot is enabled, and treat the mapping as shared only when
 * the mapping type in @flags is MAP_SHARED.
 */
static int selinux_mmap_file(struct file *file, unsigned long reqprot,
                             unsigned long prot, unsigned long flags)
{
        int shared;

        if (file) {
                struct common_audit_data ad;
                int rc;

                ad.type = LSM_AUDIT_DATA_FILE;
                ad.u.file = file;

                rc = inode_has_perm(current_cred(), file_inode(file),
                                    FILE__MAP, &ad);
                if (rc)
                        return rc;
        }

        if (checkreqprot_get(&selinux_state))
                prot = reqprot;

        shared = (flags & MAP_TYPE) == MAP_SHARED;

        return file_map_prot_check(file, prot, shared);
}

static int selinux_file_mprotect(struct vm_area_struct *vma,
                                 unsigned long reqprot,
                                 unsigned long prot)
{
        const struct cred *cred = current_cred();
        u32 sid = cred_sid(cred);

        if (checkreqprot_get(&selinux_state))
                prot = reqprot;

        if (default_noexec &&
            (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) {
                int rc = 0;
                if (vma->vm_start >= vma->vm_mm->start_brk &&
                    vma->vm_end <= vma->vm_mm->brk) {
                        rc = avc_has_perm(&selinux_state,
                                          sid, sid, SECCLASS_PROCESS,
                                          PROCESS__EXECHEAP, NULL);
                } else if (!vma->vm_file &&
                           ((vma->vm_start <= vma->vm_mm->start_stack &&
                             vma->vm_end >= vma->vm_mm->start_stack) ||
                            vma_is_stack_for_current(vma))) {
                        rc = avc_has_perm(&selinux_state,
                                          sid, sid, SECCLASS_PROCESS,
                                          PROCESS__EXECSTACK, NULL);
                } else if (vma->vm_file && vma->anon_vma) {
                        /*
                         * We are making executable a file mapping that has
                         * had some COW done. Since pages might have been
                         * written, check ability to execute the possibly
                         * modified content.  This typically should only
                         * occur for text relocations.
                         */
                        rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD);
                }
                if (rc)
                        return rc;
        }

        return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED);
}

/* Require FILE__LOCK on @file for the current credentials; @cmd is not used. */
static int selinux_file_lock(struct file *file, unsigned int cmd)
{
        return file_has_perm(current_cred(), file, FILE__LOCK);
}

/*
 * Permission checks for fcntl commands on @file.
 *
 *   - F_SETFL that clears O_APPEND on a file that has it: FILE__WRITE
 *   - other F_SETFL and the get/set owner, signal and flag commands:
 *     an empty permission set (FD__USE only)
 *   - the record-lock commands: FILE__LOCK
 * Any command not listed is allowed and yields 0.
 */
static int selinux_file_fcntl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        const struct cred *cred = current_cred();

        switch (cmd) {
        case F_SETFL:
                if ((file->f_flags & O_APPEND) && !(arg & O_APPEND))
                        return file_has_perm(cred, file, FILE__WRITE);
                fallthrough;
        case F_SETOWN:
        case F_SETSIG:
        case F_GETFL:
        case F_GETOWN:
        case F_GETSIG:
        case F_GETOWNER_UIDS:
                /* Just check FD__USE permission */
                return file_has_perm(cred, file, 0);
        case F_GETLK:
        case F_SETLK:
        case F_SETLKW:
        case F_OFD_GETLK:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
#if BITS_PER_LONG == 32
        case F_GETLK64:
        case F_SETLK64:
        case F_SETLKW64:
#endif
                return file_has_perm(cred, file, FILE__LOCK);
        default:
                break;
        }

        return 0;
}

/* Record the current task's SID as the owner SID in @file's security blob. */
static void selinux_file_set_fowner(struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);

        fsec->fown_sid = current_sid();
}

/*
 * Check whether the recorded file owner SID may deliver a signal to @tsk.
 *
 * The file is recovered from @fown with container_of().  A @signum of zero
 * is treated as SIGIO.  The permission from signal_to_av() is checked with
 * the file's fown_sid as source and @tsk's SID as target.
 */
static int selinux_file_send_sigiotask(struct task_struct *tsk,
                                       struct fown_struct *fown, int signum)
{
        /* struct fown_struct is never outside the context of a struct file */
        struct file *file = container_of(fown, struct file, f_owner);
        struct file_security_struct *fsec = selinux_file(file);
        u32 target_sid = task_sid(tsk);
        /* no explicit signal means SIGIO, as per send_sigio_to_task */
        u32 perm = signal_to_av(signum ? signum : SIGIO);

        return avc_has_perm(&selinux_state,
                            fsec->fown_sid, target_sid,
                            SECCLASS_PROCESS, perm, NULL);
}

static int selinux_file_receive(struct file *file)
{
        const struct cred *cred = current_cred();

        return file_has_perm(cred, file, file_to_av(file));
}

/*
 * Snapshot labeling state into the file security blob at open time, then
 * re-verify access with the opener's credentials.
 */
static int selinux_file_open(struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec =
                inode_security(file_inode(file));

        /*
         * Remember the inode label and the policy sequence number as they
         * are right now, so that selinux_file_permission can tell later
         * whether revalidation is necessary.  The task label is already
         * stored in the file security struct as its SID.
         */
        fsec->isid = isec->sid;
        fsec->pseqno = avc_policy_seqno(&selinux_state);

        /*
         * The inode label or policy seqno may have changed between the
         * selinux_inode_permission check and the snapshot above, so check
         * again that access is still permitted.  Without this, access might
         * never be revalidated against the new inode label or new policy.
         * This check is not redundant - do not remove.
         */
        return file_path_has_perm(file->f_cred, file, open_file_to_av(file));
}

/* task security operations */

/*
 * Require PROCESS__FORK for the current SID against itself.  Neither @task
 * nor @clone_flags is consulted.
 */
static int selinux_task_alloc(struct task_struct *task,
                              unsigned long clone_flags)
{
        const u32 self = current_sid();

        return avc_has_perm(&selinux_state,
                            self, self,
                            SECCLASS_PROCESS, PROCESS__FORK, NULL);
}

/*
 * prepare a new set of credentials for modification
 * - the SELinux security blob of @old is copied as a whole into @new
 * - @gfp is not used; always returns 0
 */
static int selinux_cred_prepare(struct cred *new, const struct cred *old,
                                gfp_t gfp)
{
        const struct task_security_struct *src = selinux_cred(old);
        struct task_security_struct *dst = selinux_cred(new);

        *dst = *src;

        return 0;
}

/*
 * transfer the SELinux data to a blank set of creds
 * - the security blob of @old is copied as a whole into @new
 */
static void selinux_cred_transfer(struct cred *new, const struct cred *old)
{
        const struct task_security_struct *src = selinux_cred(old);
        struct task_security_struct *dst = selinux_cred(new);

        *dst = *src;
}

/* Store the SID that cred_sid() reports for @c into *@secid. */
static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
{
        u32 sid = cred_sid(c);

        *secid = sid;
}

/*
 * set the security data for a kernel service
 * - requires KERNEL_SERVICE__USE_AS_OVERRIDE from the current SID to @secid
 * - on success @new takes @secid as its SID and all the creation contexts
 *   are set to unlabelled (0)
 * - on failure @new is left untouched and the error is returned
 */
static int selinux_kernel_act_as(struct cred *new, u32 secid)
{
        struct task_security_struct *tsec = selinux_cred(new);
        u32 caller_sid = current_sid();
        int err;

        err = avc_has_perm(&selinux_state,
                           caller_sid, secid,
                           SECCLASS_KERNEL_SERVICE,
                           KERNEL_SERVICE__USE_AS_OVERRIDE,
                           NULL);
        if (err)
                return err;

        tsec->sid = secid;
        tsec->create_sid = 0;
        tsec->keycreate_sid = 0;
        tsec->sockcreate_sid = 0;

        return 0;
}

/*
 * set the file creation context in a security record to the same as the
 * objective context of the specified inode
 * - requires KERNEL_SERVICE__CREATE_FILES_AS from the current SID to the
 *   inode's SID; on failure @new is left untouched
 */
static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        struct inode_security_struct *isec = inode_security(inode);
        struct task_security_struct *tsec = selinux_cred(new);
        u32 caller_sid = current_sid();
        int err;

        err = avc_has_perm(&selinux_state,
                           caller_sid, isec->sid,
                           SECCLASS_KERNEL_SERVICE,
                           KERNEL_SERVICE__CREATE_FILES_AS,
                           NULL);
        if (err)
                return err;

        tsec->create_sid = isec->sid;

        return 0;
}

static int selinux_kernel_module_request(char *kmod_name)
{
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_KMOD;
        ad.u.kmod_name = kmod_name;

        return avc_has_perm(&selinux_state,
                            current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM,
                            SYSTEM__MODULE_REQUEST, &ad);
}

/*
 * Check SYSTEM__MODULE_LOAD for the current task.
 *
 * With a NULL @file the check is made against the task's own SID.
 * Otherwise FD__USE is checked first when the file's SID differs from the
 * task's, and SYSTEM__MODULE_LOAD is then checked against the SID of the
 * file's inode.
 */
static int selinux_kernel_module_from_file(struct file *file)
{
        struct common_audit_data ad;
        struct file_security_struct *fsec;
        struct inode_security_struct *isec;
        u32 sid = current_sid();
        int rc;

        /* init_module */
        if (!file)
                return avc_has_perm(&selinux_state,
                                    sid, sid, SECCLASS_SYSTEM,
                                    SYSTEM__MODULE_LOAD, NULL);

        /* finit_module */
        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;

        fsec = selinux_file(file);
        if (fsec->sid != sid) {
                rc = avc_has_perm(&selinux_state,
                                  sid, fsec->sid, SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

        isec = inode_security(file_inode(file));

        return avc_has_perm(&selinux_state,
                            sid, isec->sid, SECCLASS_SYSTEM,
                            SYSTEM__MODULE_LOAD, &ad);
}

/*
 * Only READING_MODULE is checked; every other @id is allowed (returns 0).
 * The file is passed to the module-load check only when @contents is true,
 * otherwise NULL is passed.
 */
static int selinux_kernel_read_file(struct file *file,
                                    enum kernel_read_file_id id,
                                    bool contents)
{
        if (id != READING_MODULE)
                return 0;

        return selinux_kernel_module_from_file(contents ? file : NULL);
}

/*
 * Only LOADING_MODULE is checked, using the file-less module-load check;
 * every other @id is allowed (returns 0).  @contents is not used.
 */
static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        int rc = 0;

        switch (id) {
        case LOADING_MODULE:
                rc = selinux_kernel_module_from_file(NULL);
                /* explicit break: do not fall through into default */
                break;
        default:
                break;
        }

        return rc;
}

/*
 * Require PROCESS__SETPGID from the current SID to @p's SID.
 * @pgid is not consulted.
 */
static int selinux_task_setpgid(struct task_struct *p, pid_t pgid)
{
        u32 subj = current_sid();
        u32 target = task_sid(p);

        return avc_has_perm(&selinux_state, subj, target,
                            SECCLASS_PROCESS, PROCESS__SETPGID, NULL);
}

static int selinux_task_getpgid(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__GETPGID, NULL);
}

static int selinux_task_getsid(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__GETSESSION, NULL);
}

/* Store the SID that task_sid() reports for @p into *@secid. */
static void selinux_task_getsecid(struct task_struct *p, u32 *secid)
{
        u32 sid = task_sid(p);

        *secid = sid;
}

static int selinux_task_setnice(struct task_struct *p, int nice)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

static int selinux_task_setioprio(struct task_struct *p, int ioprio)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

static int selinux_task_getioprio(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__GETSCHED, NULL);
}

/*
 * Check rlimit access between two sets of credentials.
 *
 * A zero @flags is allowed outright.  Otherwise LSM_PRLIMIT_WRITE maps to
 * PROCESS__SETRLIMIT and LSM_PRLIMIT_READ to PROCESS__GETRLIMIT, and the
 * combined set is checked from @cred's SID to @tcred's SID.
 */
static int selinux_task_prlimit(const struct cred *cred, const struct cred *tcred,
                                unsigned int flags)
{
        u32 requested;

        if (flags == 0)
                return 0;

        requested = (flags & LSM_PRLIMIT_WRITE) ? PROCESS__SETRLIMIT : 0;
        if (flags & LSM_PRLIMIT_READ)
                requested |= PROCESS__GETRLIMIT;

        return avc_has_perm(&selinux_state,
                            cred_sid(cred), cred_sid(tcred),
                            SECCLASS_PROCESS, requested, NULL);
}

static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
                struct rlimit *new_rlim)
{
        struct rlimit *old_rlim = p->signal->rlim + resource;

        /* Control the ability to change the hard limit (whether
           lowering or raising it), so that the hard limit can
           later be used as a safe reset point for the soft limit
           upon context transitions.  See selinux_bprm_committing_creds. */
        if (old_rlim->rlim_max != new_rlim->rlim_max)
                return avc_has_perm(&selinux_state,
                                    current_sid(), task_sid(p),
                                    SECCLASS_PROCESS, PROCESS__SETRLIMIT, NULL);

        return 0;
}

static int selinux_task_setscheduler(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

static int selinux_task_getscheduler(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__GETSCHED, NULL);
}

static int selinux_task_movememory(struct task_struct *p)
{
        return avc_has_perm(&selinux_state,
                            current_sid(), task_sid(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

/*
 * Check permission to signal @p.
 *
 * A zero @sig maps to PROCESS__SIGNULL; any other signal goes through
 * signal_to_av().  The source SID comes from @cred when one is given,
 * otherwise from the current task.  @info is not consulted.
 */
static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                                int sig, const struct cred *cred)
{
        /* null signal; existence test */
        u32 perm = sig ? signal_to_av(sig) : PROCESS__SIGNULL;
        u32 secid = cred ? cred_sid(cred) : current_sid();

        return avc_has_perm(&selinux_state,
                            secid, task_sid(p), SECCLASS_PROCESS, perm, NULL);
}

/*
 * Label @inode after task @p: under the inode security lock, set the class
 * derived from the inode mode, set the SID to the task's SID, and mark the
 * label as initialized.
 */
static void selinux_task_to_inode(struct task_struct *p,
                                  struct inode *inode)
{
        u32 task_label = task_sid(p);
        struct inode_security_struct *isec = selinux_inode(inode);

        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = task_label;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);
}

/*
 * Fill the network audit data in @ad from the IPv4 header of @skb.
 *
 * Once the IP header is readable and its ihl is not shorter than a basic
 * header, the source and destination addresses are recorded and @proto
 * (when non-NULL) receives the protocol number.  Ports are then recorded
 * on a best-effort basis for TCP, UDP, DCCP and (with CONFIG_IP_SCTP) SCTP;
 * they are skipped for packets with a non-zero fragment offset or an
 * unreadable transport header.
 *
 * Returns error only if unable to parse addresses.
 */
static int selinux_parse_skb_ipv4(struct sk_buff *skb,
                        struct common_audit_data *ad, u8 *proto)
{
        struct iphdr iph_buf, *iph;
        int nh_off, hdr_len, l4_off;
        int later_frag;

        nh_off = skb_network_offset(skb);
        iph = skb_header_pointer(skb, nh_off, sizeof(iph_buf), &iph_buf);
        if (iph == NULL)
                return -EINVAL;

        hdr_len = iph->ihl * 4;
        if (hdr_len < sizeof(iph_buf))
                return -EINVAL;

        ad->u.net->v4info.saddr = iph->saddr;
        ad->u.net->v4info.daddr = iph->daddr;

        if (proto)
                *proto = iph->protocol;

        /* where the transport header starts, and whether it can be there */
        l4_off = nh_off + hdr_len;
        later_frag = ntohs(iph->frag_off) & IP_OFFSET;

        switch (iph->protocol) {
        case IPPROTO_TCP: {
                struct tcphdr tcp_buf, *tcp;

                if (later_frag)
                        break;

                tcp = skb_header_pointer(skb, l4_off, sizeof(tcp_buf),
                                         &tcp_buf);
                if (tcp == NULL)
                        break;

                ad->u.net->sport = tcp->source;
                ad->u.net->dport = tcp->dest;
                break;
        }

        case IPPROTO_UDP: {
                struct udphdr udp_buf, *udp;

                if (later_frag)
                        break;

                udp = skb_header_pointer(skb, l4_off, sizeof(udp_buf),
                                         &udp_buf);
                if (udp == NULL)
                        break;

                ad->u.net->sport = udp->source;
                ad->u.net->dport = udp->dest;
                break;
        }

        case IPPROTO_DCCP: {
                struct dccp_hdr dccp_buf, *dccp;

                if (later_frag)
                        break;

                dccp = skb_header_pointer(skb, l4_off, sizeof(dccp_buf),
                                          &dccp_buf);
                if (dccp == NULL)
                        break;

                ad->u.net->sport = dccp->dccph_sport;
                ad->u.net->dport = dccp->dccph_dport;
                break;
        }

#if IS_ENABLED(CONFIG_IP_SCTP)
        case IPPROTO_SCTP: {
                struct sctphdr sctp_buf, *sctp;

                if (later_frag)
                        break;

                sctp = skb_header_pointer(skb, l4_off, sizeof(sctp_buf),
                                          &sctp_buf);
                if (sctp == NULL)
                        break;

                ad->u.net->sport = sctp->source;
                ad->u.net->dport = sctp->dest;
                break;
        }
#endif
        default:
                break;
        }

        return 0;
}

#if IS_ENABLED(CONFIG_IPV6)

/*
 * Fill the network audit data in @ad from the IPv6 header of @skb.
 *
 * Once the fixed IPv6 header is readable the source and destination
 * addresses are recorded.  Extension headers are then skipped with
 * ipv6_skip_exthdr(); if that fails, parsing stops but success is still
 * reported.  Otherwise @proto (when non-NULL) receives the resulting next
 * header value and ports are recorded on a best-effort basis for TCP, UDP,
 * DCCP and (with CONFIG_IP_SCTP) SCTP.
 *
 * Returns error only if unable to parse addresses.
 */
static int selinux_parse_skb_ipv6(struct sk_buff *skb,
                        struct common_audit_data *ad, u8 *proto)
{
        struct ipv6hdr ip6_buf, *ip6h;
        __be16 frag_off;
        u8 next;
        int off;

        off = skb_network_offset(skb);
        ip6h = skb_header_pointer(skb, off, sizeof(ip6_buf), &ip6_buf);
        if (ip6h == NULL)
                return -EINVAL;

        ad->u.net->v6info.saddr = ip6h->saddr;
        ad->u.net->v6info.daddr = ip6h->daddr;

        next = ip6h->nexthdr;
        off += sizeof(ip6_buf);
        off = ipv6_skip_exthdr(skb, off, &next, &frag_off);
        if (off < 0)
                return 0;

        if (proto)
                *proto = next;

        switch (next) {
        case IPPROTO_TCP: {
                struct tcphdr tcp_buf, *tcp;

                tcp = skb_header_pointer(skb, off, sizeof(tcp_buf), &tcp_buf);
                if (tcp == NULL)
                        break;

                ad->u.net->sport = tcp->source;
                ad->u.net->dport = tcp->dest;
                break;
        }

        case IPPROTO_UDP: {
                struct udphdr udp_buf, *udp;

                udp = skb_header_pointer(skb, off, sizeof(udp_buf), &udp_buf);
                if (udp == NULL)
                        break;

                ad->u.net->sport = udp->source;
                ad->u.net->dport = udp->dest;
                break;
        }

        case IPPROTO_DCCP: {
                struct dccp_hdr dccp_buf, *dccp;

                dccp = skb_header_pointer(skb, off, sizeof(dccp_buf),
                                          &dccp_buf);
                if (dccp == NULL)
                        break;

                ad->u.net->sport = dccp->dccph_sport;
                ad->u.net->dport = dccp->dccph_dport;
                break;
        }

#if IS_ENABLED(CONFIG_IP_SCTP)
        case IPPROTO_SCTP: {
                struct sctphdr sctp_buf, *sctp;

                sctp = skb_header_pointer(skb, off, sizeof(sctp_buf),
                                          &sctp_buf);
                if (sctp == NULL)
                        break;

                ad->u.net->sport = sctp->source;
                ad->u.net->dport = sctp->dest;
                break;
        }
#endif
        /* includes fragments */
        default:
                break;
        }

        return 0;
}

#endif /* IPV6 */

/*
 * Parse @skb into the network audit data of @ad according to
 * ad->u.net->family.
 *
 * For PF_INET (and PF_INET6 when CONFIG_IPV6 is enabled) the matching
 * parser is run; on success *@_addrp, if requested, points at the source
 * address when @src is non-zero and at the destination address otherwise.
 * Any other family is accepted without parsing and yields a NULL address.
 * A parser failure is logged and its error code returned; in that case
 * *@_addrp is not written.
 */
static int selinux_parse_skb(struct sk_buff *skb, struct common_audit_data *ad,
                             char **_addrp, int src, u8 *proto)
{
        char *addrp = NULL;
        int ret;

        switch (ad->u.net->family) {
        case PF_INET:
                ret = selinux_parse_skb_ipv4(skb, ad, proto);
                if (ret)
                        goto parse_error;
                if (src)
                        addrp = (char *)&ad->u.net->v4info.saddr;
                else
                        addrp = (char *)&ad->u.net->v4info.daddr;
                break;

#if IS_ENABLED(CONFIG_IPV6)
        case PF_INET6:
                ret = selinux_parse_skb_ipv6(skb, ad, proto);
                if (ret)
                        goto parse_error;
                if (src)
                        addrp = (char *)&ad->u.net->v6info.saddr;
                else
                        addrp = (char *)&ad->u.net->v6info.daddr;
                break;
#endif        /* IPV6 */
        default:
                break;
        }

        if (_addrp)
                *_addrp = addrp;
        return 0;

parse_error:
        pr_warn(
               "SELinux: failure in selinux_parse_skb(),"
               " unable to parse packet\n");
        return ret;
}

/**
 * selinux_skb_peerlbl_sid - Determine the peer label of a packet
 * @skb: the packet
 * @family: protocol family
 * @sid: the packet's peer label SID
 *
 * Description:
 * Collect the peer label candidates of @skb from the xfrm and NetLabel
 * helpers and reconcile them into a single SID; the reconciliation itself
 * is done by the security server function security_net_peersid_resolve().
 * The function returns zero if the value in @sid is valid (although it may
 * be SECSID_NULL) or -EACCES if any of the three steps fails, e.g. due to
 * inconsistencies between the different peer labels.
 *
 */
static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
{
        u32 xfrm_sid;
        u32 nlbl_sid;
        u32 nlbl_type;

        if (unlikely(selinux_xfrm_skb_sid(skb, &xfrm_sid)))
                return -EACCES;

        if (unlikely(selinux_netlbl_skbuff_getsid(skb, family,
                                                  &nlbl_type, &nlbl_sid)))
                return -EACCES;

        if (unlikely(security_net_peersid_resolve(&selinux_state, nlbl_sid,
                                                  nlbl_type, xfrm_sid,
                                                  sid))) {
                pr_warn(
                       "SELinux: failure in selinux_skb_peerlbl_sid(),"
                       " unable to determine packet's peer label\n");
                return -EACCES;
        }

        return 0;
}

/**
 * selinux_conn_sid - Determine the child socket label for a connection
 * @sk_sid: the parent socket's SID
 * @skb_sid: the packet's SID
 * @conn_sid: the resulting connection SID
 *
 * When @skb_sid is SECSID_NULL, @conn_sid is simply a copy of @sk_sid.
 * Otherwise the user:role:type information from @sk_sid is combined with
 * the MLS information from @skb_sid by security_sid_mls_copy() to create
 * @conn_sid.  Returns zero on success, negative values on failure.
 *
 */
static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid)
{
        if (skb_sid == SECSID_NULL) {
                *conn_sid = sk_sid;
                return 0;
        }

        return security_sid_mls_copy(&selinux_state, sk_sid, skb_sid,
                                     conn_sid);
}

/* socket security operations */

/*
 * Choose the SID for a new socket of class @secclass.
 *
 * A sockcreate_sid above SECSID_NULL in @tsec is used as is.  Otherwise the
 * SID is computed by security_transition_sid() with the task SID as both
 * source and target.  Returns 0 or the transition error.
 */
static int socket_sockcreate_sid(const struct task_security_struct *tsec,
                                 u16 secclass, u32 *socksid)
{
        u32 preset = tsec->sockcreate_sid;

        if (preset <= SECSID_NULL)
                return security_transition_sid(&selinux_state,
                                               tsec->sid, tsec->sid,
                                               secclass, NULL, socksid);

        *socksid = preset;
        return 0;
}

/*
 * Check that the current SID holds @perms on socket @sk.
 *
 * Sockets labeled SECINITSID_KERNEL are always allowed.  For all others the
 * check uses the socket's SID and class, with @sk attached as audit data.
 */
static int sock_has_perm(struct sock *sk, u32 perms)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;

        if (sksec->sid == SECINITSID_KERNEL)
                return 0;

        net.sk = sk;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;

        return avc_has_perm(&selinux_state,
                            current_sid(), sksec->sid, sksec->sclass, perms,
                            &ad);
}

/*
 * Check SOCKET__CREATE for a socket about to be created.
 *
 * Requests with @kern set are allowed without a check.  Otherwise the
 * socket class is derived from @family/@type/@protocol, the SID the socket
 * would get is computed, and SOCKET__CREATE is checked from the task SID
 * to that SID.
 */
static int selinux_socket_create(int family, int type,
                                 int protocol, int kern)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        u16 sclass;
        u32 sock_sid;
        int err;

        if (kern)
                return 0;

        sclass = socket_type_to_security_class(family, type, protocol);

        err = socket_sockcreate_sid(tsec, sclass, &sock_sid);
        if (err)
                return err;

        return avc_has_perm(&selinux_state,
                            tsec->sid, sock_sid, sclass, SOCKET__CREATE, NULL);
}

static int selinux_socket_post_create(struct socket *sock, int family,
                                      int type, int protocol, int kern)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock));
        struct sk_security_struct *sksec;
        u16 sclass = socket_type_to_security_class(family, type, protocol);
        u32 sid = SECINITSID_KERNEL;
        int err = 0;

        if (!kern) {
                err = socket_sockcreate_sid(tsec, sclass, &sid);
                if (err)
                        return err;
        }

        isec->sclass = sclass;
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;

        if (sock->sk) {
                sksec = sock->sk->sk_security;
                sksec->sclass = sclass;
                sksec->sid = sid;
                /* Allows detection of the first association on this socket */
                if (sksec->sclass == SECCLASS_SCTP_SOCKET)
                        sksec->sctp_assoc_state = SCTP_ASSOC_UNSET;

                err = selinux_netlbl_socket_post_create(sock->sk, family);
        }

        return err;
}

/*
 * Cross-link the two sockets of a pair: each one records the other's SID
 * as its peer SID.  Always returns 0.
 */
static int selinux_socket_socketpair(struct socket *socka,
                                     struct socket *sockb)
{
        struct sk_security_struct *sec_a = socka->sk->sk_security;
        struct sk_security_struct *sec_b = sockb->sk->sk_security;

        sec_a->peer_sid = sec_b->sid;
        sec_b->peer_sid = sec_a->sid;

        return 0;
}

/*
 * Permission checks for binding @sock to @address.
 *
 * SOCKET__BIND is always checked first.  For PF_INET/PF_INET6 sockets the
 * address is then validated and two further checks may follow:
 *   - SOCKET__NAME_BIND against the port's SID, but only for a non-zero
 *     port that either requires the bind service or lies outside the range
 *     of port numbers used to automatically bind (the local port range);
 *   - the class-specific NODE_BIND permission against the SID of the
 *     address node, always.
 *
 * Returns 0 if allowed, the failing check's error, -EINVAL for a too-short
 * address, or the err_af result (-EINVAL for SCTP sockets, -EAFNOSUPPORT
 * otherwise) for an unsupported address family.
 */

static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        struct sk_security_struct *sksec = sk->sk_security;
        u16 family;
        int err;

        err = sock_has_perm(sk, SOCKET__BIND);
        if (err)
                goto out;

        /* If PF_INET or PF_INET6, check name_bind permission for the port. */
        family = sk->sk_family;
        if (family == PF_INET || family == PF_INET6) {
                char *addrp;
                struct common_audit_data ad;
                struct lsm_network_audit net = {0,};
                struct sockaddr_in *addr4 = NULL;
                struct sockaddr_in6 *addr6 = NULL;
                u16 family_sa;
                unsigned short snum;
                u32 sid, node_perm;

                /*
                 * sctp_bindx(3) calls via selinux_sctp_bind_connect()
                 * that validates multiple binding addresses. Because of this
                 * need to check address->sa_family as it is possible to have
                 * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
                 */
                /* sa_family must be readable before it is inspected */
                if (addrlen < offsetofend(struct sockaddr, sa_family))
                        return -EINVAL;
                family_sa = address->sa_family;
                switch (family_sa) {
                case AF_UNSPEC:
                case AF_INET:
                        if (addrlen < sizeof(struct sockaddr_in))
                                return -EINVAL;
                        addr4 = (struct sockaddr_in *)address;
                        if (family_sa == AF_UNSPEC) {
                                if (family == PF_INET6) {
                                        /* Length check from inet6_bind_sk() */
                                        if (addrlen < SIN6_LEN_RFC2133)
                                                return -EINVAL;
                                        /* Family check from __inet6_bind() */
                                        goto err_af;
                                }
                                /* see __inet_bind(), we only want to allow
                                 * AF_UNSPEC if the address is INADDR_ANY
                                 */
                                if (addr4->sin_addr.s_addr != htonl(INADDR_ANY))
                                        goto err_af;
                                /* from here on AF_UNSPEC is handled as AF_INET */
                                family_sa = AF_INET;
                        }
                        snum = ntohs(addr4->sin_port);
                        addrp = (char *)&addr4->sin_addr.s_addr;
                        break;
                case AF_INET6:
                        if (addrlen < SIN6_LEN_RFC2133)
                                return -EINVAL;
                        addr6 = (struct sockaddr_in6 *)address;
                        snum = ntohs(addr6->sin6_port);
                        addrp = (char *)&addr6->sin6_addr.s6_addr;
                        break;
                default:
                        goto err_af;
                }

                /* audit data shared by the name_bind and node_bind checks */
                ad.type = LSM_AUDIT_DATA_NET;
                ad.u.net = &net;
                ad.u.net->sport = htons(snum);
                ad.u.net->family = family_sa;

                /* port 0 skips the name_bind check entirely */
                if (snum) {
                        int low, high;

                        inet_get_local_port_range(sock_net(sk), &low, &high);

                        /*
                         * name_bind is only checked for ports that require
                         * the bind service or fall outside [low, high].
                         */
                        if (inet_port_requires_bind_service(sock_net(sk), snum) ||
                            snum < low || snum > high) {
                                err = sel_netport_sid(sk->sk_protocol,
                                                      snum, &sid);
                                if (err)
                                        goto out;
                                err = avc_has_perm(&selinux_state,
                                                   sksec->sid, sid,
                                                   sksec->sclass,
                                                   SOCKET__NAME_BIND, &ad);
                                if (err)
                                        goto out;
                        }
                }

                /* pick the node_bind permission matching the socket class */
                switch (sksec->sclass) {
                case SECCLASS_TCP_SOCKET:
                        node_perm = TCP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_UDP_SOCKET:
                        node_perm = UDP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_DCCP_SOCKET:
                        node_perm = DCCP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_SCTP_SOCKET:
                        node_perm = SCTP_SOCKET__NODE_BIND;
                        break;

                default:
                        node_perm = RAWIP_SOCKET__NODE_BIND;
                        break;
                }

                /* sid is reused here: it now holds the address node's SID */
                err = sel_netnode_sid(addrp, family_sa, &sid);
                if (err)
                        goto out;

                /* record the bind address in the audit data */
                if (family_sa == AF_INET)
                        ad.u.net->v4info.saddr = addr4->sin_addr.s_addr;
                else
                        ad.u.net->v6info.saddr = addr6->sin6_addr;

                err = avc_has_perm(&selinux_state,
                                   sksec->sid, sid,
                                   sksec->sclass, node_perm, &ad);
                if (err)
                        goto out;
        }
out:
        return err;
err_af:
        /* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. */
        if (sksec->sclass == SECCLASS_SCTP_SOCKET)
                return -EINVAL;
        return -EAFNOSUPPORT;
}

/* This supports connect(2) and SCTP connect services such as sctp_connectx(3)
 * and sctp_sendmsg(3) as described in Documentation/security/SCTP.rst
 */
static int selinux_socket_connect_helper(struct socket *sock,
                                         struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        struct sk_security_struct *sksec = sk->sk_security;
        int err;

        err = sock_has_perm(sk, SOCKET__CONNECT);
        if (err)
                return err;
        if (addrlen < offsetofend(struct sockaddr, sa_family))
                return -EINVAL;

        /* connect(AF_UNSPEC) has special handling, as it is a documented
         * way to disconnect the socket
         */
        if (address->sa_family == AF_UNSPEC)
                return 0;

        /*
         * If a TCP, DCCP or SCTP socket, check name_connect permission
         * for the port.
         */
        if (sksec->sclass == SECCLASS_TCP_SOCKET ||
            sksec->sclass == SECCLASS_DCCP_SOCKET ||
            sksec->sclass == SECCLASS_SCTP_SOCKET) {
                struct common_audit_data ad;
                struct lsm_network_audit net = {0,};
                struct sockaddr_in *addr4 = NULL;
                struct sockaddr_in6 *addr6 = NULL;
                unsigned short snum;
                u32 sid, perm;

                /* sctp_connectx(3) calls via selinux_sctp_bind_connect()
                 * that validates multiple connect addresses. Because of this
                 * need to check address->sa_family as it is possible to have
                 * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
                 */
                switch (address->sa_family) {
                case AF_INET:
                        addr4 = (struct sockaddr_in *)address;
                        if (addrlen < sizeof(struct sockaddr_in))
                                return -EINVAL;
                        snum = ntohs(addr4->sin_port);
                        break;
                case AF_INET6:
                        addr6 = (struct sockaddr_in6 *)address;
                        if (addrlen < SIN6_LEN_RFC2133)
                                return -EINVAL;
                        snum = ntohs(addr6->sin6_port);
                        break;
                default:
                        /* Note that SCTP services expect -EINVAL, whereas
                         * others expect -EAFNOSUPPORT.
                         */
                        if (sksec->sclass == SECCLASS_SCTP_SOCKET)
                                return -EINVAL;
                        else
                                return -EAFNOSUPPORT;
                }

                err = sel_netport_sid(sk->sk_protocol, snum, &sid);
                if (err)
                        return err;

                switch (sksec->sclass) {
                case SECCLASS_TCP_SOCKET:
                        perm = TCP_SOCKET__NAME_CONNECT;
                        break;
                case SECCLASS_DCCP_SOCKET:
                        perm = DCCP_SOCKET__NAME_CONNECT;
                        break;
                case SECCLASS_SCTP_SOCKET:
                        perm = SCTP_SOCKET__NAME_CONNECT;
                        break;
                }

                ad.type = LSM_AUDIT_DATA_NET;
                ad.u.net = &net;
                ad.u.net->dport = htons(snum);
                ad.u.net->family = address->sa_family;
                err = avc_has_perm(&selinux_state,
                                   sksec->sid, sid, sksec->sclass, perm, &ad);
                if (err)
                        return err;
        }

        return 0;
}

/* connect(2) hook: run the checks in selinux_socket_connect_helper() and, if
 * they pass, hand the socket and address to selinux_netlbl_socket_connect().
 */
static int selinux_socket_connect(struct socket *sock,
                                  struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        int rc = selinux_socket_connect_helper(sock, address, addrlen);

        return rc ? rc : selinux_netlbl_socket_connect(sk, address);
}

/* listen hook: requires SOCKET__LISTEN on the socket; @backlog is not used. */
static int selinux_socket_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__LISTEN);
}

/* accept hook: check SOCKET__ACCEPT on @sock, then copy the security class
 * and SID of @sock's inode onto @newsock's inode and mark it initialized.
 *
 * Returns 0 on success or the error from sock_has_perm().
 */
static int selinux_socket_accept(struct socket *sock, struct socket *newsock)
{
        struct inode_security_struct *src_isec;
        struct inode_security_struct *dst_isec;
        u32 sid;
        u16 sclass;
        int rc;

        rc = sock_has_perm(sock->sk, SOCKET__ACCEPT);
        if (rc)
                return rc;

        /* Take a consistent snapshot of the source label under its lock. */
        src_isec = inode_security_novalidate(SOCK_INODE(sock));
        spin_lock(&src_isec->lock);
        sid = src_isec->sid;
        sclass = src_isec->sclass;
        spin_unlock(&src_isec->lock);

        dst_isec = inode_security_novalidate(SOCK_INODE(newsock));
        dst_isec->sid = sid;
        dst_isec->sclass = sclass;
        dst_isec->initialized = LABEL_INITIALIZED;

        return 0;
}

/* sendmsg hook: requires SOCKET__WRITE; @msg and @size are not inspected. */
static int selinux_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                                  int size)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__WRITE);
}

/* recvmsg hook: requires SOCKET__READ; @msg, @size and @flags are not
 * inspected.
 */
static int selinux_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                                  int size, int flags)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__READ);
}

/* getsockname hook: requires SOCKET__GETATTR on the socket. */
static int selinux_socket_getsockname(struct socket *sock)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETATTR);
}

/* getpeername hook: requires SOCKET__GETATTR, the same permission that
 * selinux_socket_getsockname() checks.
 */
static int selinux_socket_getpeername(struct socket *sock)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETATTR);
}

/* setsockopt hook: check SOCKET__SETOPT first and, when that is granted,
 * return the result of selinux_netlbl_socket_setsockopt().
 */
static int selinux_socket_setsockopt(struct socket *sock, int level, int optname)
{
        int rc = sock_has_perm(sock->sk, SOCKET__SETOPT);

        if (rc)
                return rc;

        return selinux_netlbl_socket_setsockopt(sock, level, optname);
}

/* getsockopt hook: requires SOCKET__GETOPT; @level and @optname are not
 * inspected.
 */
static int selinux_socket_getsockopt(struct socket *sock, int level,
                                     int optname)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETOPT);
}

/* shutdown hook: requires SOCKET__SHUTDOWN; @how is not inspected. */
static int selinux_socket_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__SHUTDOWN);
}

/* UNIX stream connect hook.
 *
 * Checks UNIX_STREAM_SOCKET__CONNECTTO from @sock's SID to @other's SID, then
 * labels the new socket @newsk: its peer SID is the SID of @sock and its own
 * SID is computed by security_sid_mls_copy() from the SIDs of @other and
 * @sock. Finally the peer SID of @sock is set to the SID of @newsk.
 *
 * Returns 0 on success or a negative errno.
 */
static int selinux_socket_unix_stream_connect(struct sock *sock,
                                              struct sock *other,
                                              struct sock *newsk)
{
        struct sk_security_struct *client = sock->sk_security;
        struct sk_security_struct *listener = other->sk_security;
        struct sk_security_struct *child = newsk->sk_security;
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;
        int rc;

        net.sk = other;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;

        rc = avc_has_perm(&selinux_state,
                          client->sid, listener->sid,
                          listener->sclass,
                          UNIX_STREAM_SOCKET__CONNECTTO, &ad);
        if (rc)
                return rc;

        /* label the server child socket */
        child->peer_sid = client->sid;
        rc = security_sid_mls_copy(&selinux_state, listener->sid,
                                   client->sid, &child->sid);
        if (rc)
                return rc;

        /* record the peer on the connecting socket */
        client->peer_sid = child->sid;

        return 0;
}

/* UNIX datagram send hook: requires SOCKET__SENDTO from the SID of @sock to
 * the SID of @other, using @other's security class. @other's sock is
 * recorded in the audit data.
 */
static int selinux_socket_unix_may_send(struct socket *sock,
                                        struct socket *other)
{
        struct sk_security_struct *sender = sock->sk->sk_security;
        struct sk_security_struct *receiver = other->sk->sk_security;
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;

        net.sk = other->sk;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;

        return avc_has_perm(&selinux_state,
                            sender->sid, receiver->sid, receiver->sclass,
                            SOCKET__SENDTO, &ad);
}

/* Ingress checks for a peer label against the receiving interface and node.
 *
 * Looks up the SID of interface @ifindex in @ns and requires NETIF__INGRESS
 * for @peer_sid on it, then looks up the SID of the node address @addrp and
 * requires NODE__RECVFROM for @peer_sid on it. The first failing step ends
 * the sequence and its error is returned; 0 means everything was allowed.
 */
static int selinux_inet_sys_rcv_skb(struct net *ns, int ifindex,
                                    char *addrp, u16 family, u32 peer_sid,
                                    struct common_audit_data *ad)
{
        u32 if_sid, node_sid;
        int rc;

        rc = sel_netif_sid(ns, ifindex, &if_sid);
        if (!rc)
                rc = avc_has_perm(&selinux_state,
                                  peer_sid, if_sid,
                                  SECCLASS_NETIF, NETIF__INGRESS, ad);
        if (!rc)
                rc = sel_netnode_sid(addrp, family, &node_sid);
        if (!rc)
                rc = avc_has_perm(&selinux_state,
                                  peer_sid, node_sid,
                                  SECCLASS_NODE, NODE__RECVFROM, ad);

        return rc;
}

/* Compatibility receive path, used by selinux_socket_sock_rcv_skb() when
 * selinux_policycap_netpeer() is false.
 *
 * Parses @skb into the audit data, applies the SECMARK PACKET__RECV check
 * when SECMARK is enabled, then the NetLabel check, and finally returns the
 * result of the XFRM check. Any failing step returns its error immediately.
 */
static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
                                       u16 family)
{
        struct sk_security_struct *sksec = sk->sk_security;
        u32 sk_sid = sksec->sid;
        struct lsm_network_audit net = {0,};
        struct common_audit_data ad;
        char *addrp;
        int rc;

        net.netif = skb->skb_iif;
        net.family = family;
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;

        rc = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
        if (rc)
                return rc;

        if (selinux_secmark_enabled()) {
                rc = avc_has_perm(&selinux_state,
                                  sk_sid, skb->secmark, SECCLASS_PACKET,
                                  PACKET__RECV, &ad);
                if (rc)
                        return rc;
        }

        rc = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad);
        if (rc)
                return rc;

        return selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
}

/* Receive hook for a packet @skb arriving on socket @sk.
 *
 * Only PF_INET and PF_INET6 sockets are checked; other families return 0.
 * When selinux_policycap_netpeer() is false the work is delegated to
 * selinux_sock_rcv_skb_compat(). Otherwise, depending on whether peer
 * labeling and/or SECMARK are enabled, the peer label is checked against the
 * interface, node and socket (PEER__RECV) and the packet's secmark is checked
 * against the socket (PACKET__RECV).
 *
 * Returns 0 when the packet is allowed, otherwise a negative errno.
 */
static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err;
        struct sk_security_struct *sksec = sk->sk_security;
        u16 family = sk->sk_family;
        u32 sk_sid = sksec->sid;
        struct common_audit_data ad;
        struct lsm_network_audit net = {0,};
        char *addrp;
        u8 secmark_active;
        u8 peerlbl_active;

        if (family != PF_INET && family != PF_INET6)
                return 0;

        /* Handle mapped IPv4 packets arriving via IPv6 sockets */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;

        /* If any sort of compatibility mode is enabled then handoff processing
         * to the selinux_sock_rcv_skb_compat() function to deal with the
         * special handling.  We do this in an attempt to keep this function
         * as fast and as clean as possible. */
        if (!selinux_policycap_netpeer())
                return selinux_sock_rcv_skb_compat(sk, skb, family);

        /* Nothing to check when neither mechanism is in use. */
        secmark_active = selinux_secmark_enabled();
        peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return 0;

        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;
        ad.u.net->netif = skb->skb_iif;
        ad.u.net->family = family;
        err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
        if (err)
                return err;

        if (peerlbl_active) {
                u32 peer_sid;

                err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
                if (err)
                        return err;
                /* Interface and node ingress checks for the peer label;
                 * a denial is also reported through selinux_netlbl_err().
                 */
                err = selinux_inet_sys_rcv_skb(sock_net(sk), skb->skb_iif,
                                               addrp, family, peer_sid, &ad);
                if (err) {
                        selinux_netlbl_err(skb, family, err, 0);
                        return err;
                }
                /* The socket itself must be allowed to receive from the peer. */
                err = avc_has_perm(&selinux_state,
                                   sk_sid, peer_sid, SECCLASS_PEER,
                                   PEER__RECV, &ad);
                if (err) {
                        selinux_netlbl_err(skb, family, err, 0);
                        return err;
                }
        }

        if (secmark_active) {
                err = avc_has_perm(&selinux_state,
                                   sk_sid, skb->secmark, SECCLASS_PACKET,
                                   PACKET__RECV, &ad);
                if (err)
                        return err;
        }

        /* Every failing step returned above, so err is 0 here. */
        return err;
}

/* Copy the peer's security context string to @optval and its length to
 * @optlen.
 *
 * A peer SID is only taken from the socket for the UNIX stream, TCP and SCTP
 * socket classes; if no peer SID is available -ENOPROTOOPT is returned.
 * When the context does not fit in @len bytes the result is -ERANGE, but the
 * required length is still written to @optlen. A failed copy gives -EFAULT.
 */
static int selinux_socket_getpeersec_stream(struct socket *sock,
                                            sockptr_t optval, sockptr_t optlen,
                                            unsigned int len)
{
        struct sk_security_struct *sksec = sock->sk->sk_security;
        u32 peer_sid = SECSID_NULL;
        char *ctx = NULL;
        u32 ctx_len;
        int rc;

        switch (sksec->sclass) {
        case SECCLASS_UNIX_STREAM_SOCKET:
        case SECCLASS_TCP_SOCKET:
        case SECCLASS_SCTP_SOCKET:
                peer_sid = sksec->peer_sid;
                break;
        default:
                break;
        }
        if (peer_sid == SECSID_NULL)
                return -ENOPROTOOPT;

        rc = security_sid_to_context(&selinux_state, peer_sid, &ctx,
                                     &ctx_len);
        if (rc)
                return rc;

        if (ctx_len > len)
                rc = -ERANGE;
        else if (copy_to_sockptr(optval, ctx, ctx_len))
                rc = -EFAULT;

        /* The length is reported on every path that got this far. */
        if (copy_to_sockptr(optlen, &ctx_len, sizeof(ctx_len)))
                rc = -EFAULT;

        kfree(ctx);
        return rc;
}

/* Report the peer SID for a datagram in *@secid.
 *
 * The address family comes from @skb's protocol when it is IPv4 or IPv6,
 * otherwise from @sock. For a PF_UNIX @sock the SID of the socket's inode is
 * used; otherwise, when @skb is given, selinux_skb_peerlbl_sid() supplies it.
 *
 * *@secid is always written. Returns 0 when a peer SID was found and -EINVAL
 * when the result is SECSID_NULL.
 */
static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
{
        struct inode_security_struct *isec;
        u32 peer_secid = SECSID_NULL;
        u16 family;

        if (skb && skb->protocol == htons(ETH_P_IP)) {
                family = PF_INET;
        } else if (skb && skb->protocol == htons(ETH_P_IPV6)) {
                family = PF_INET6;
        } else if (sock) {
                family = sock->sk->sk_family;
        } else {
                /* No way to determine a family: no peer label. */
                *secid = SECSID_NULL;
                return -EINVAL;
        }

        if (sock && family == PF_UNIX) {
                isec = inode_security_novalidate(SOCK_INODE(sock));
                peer_secid = isec->sid;
        } else if (skb) {
                selinux_skb_peerlbl_sid(skb, family, &peer_secid);
        }

        *secid = peer_secid;
        return peer_secid == SECSID_NULL ? -EINVAL : 0;
}

/* Allocate and attach the SELinux security blob for @sk.
 *
 * The blob is zero-allocated with @priority, its SID and peer SID are set to
 * SECINITSID_UNLABELED and its class to SECCLASS_SOCKET, and its NetLabel
 * state is reset. @family is not used.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
{
        struct sk_security_struct *blob = kzalloc(sizeof(*blob), priority);

        if (!blob)
                return -ENOMEM;

        blob->sclass = SECCLASS_SOCKET;
        blob->sid = SECINITSID_UNLABELED;
        blob->peer_sid = SECINITSID_UNLABELED;
        selinux_netlbl_sk_security_reset(blob);

        sk->sk_security = blob;
        return 0;
}

/* Detach the security blob from @sk, release its NetLabel state and free it. */
static void selinux_sk_free_security(struct sock *sk)
{
        struct sk_security_struct *blob = sk->sk_security;

        /* Clear the pointer first so @sk no longer refers to the blob. */
        sk->sk_security = NULL;

        selinux_netlbl_sk_security_free(blob);
        kfree(blob);
}

/* Copy the SID, peer SID and security class from @sk's blob to @newsk's
 * blob, then reset the NetLabel state of the copy.
 */
static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk)
{
        struct sk_security_struct *src = sk->sk_security;
        struct sk_security_struct *dst = newsk->sk_security;

        dst->sclass = src->sclass;
        dst->sid = src->sid;
        dst->peer_sid = src->peer_sid;

        selinux_netlbl_sk_security_reset(dst);
}

/* Store the SID of @sk in *@secid, or SECINITSID_ANY_SOCKET when @sk is
 * NULL.
 */
static void selinux_sk_getsecid(struct sock *sk, u32 *secid)
{
        struct sk_security_struct *sksec;

        if (!sk) {
                *secid = SECINITSID_ANY_SOCKET;
                return;
        }

        sksec = sk->sk_security;
        *secid = sksec->sid;
}

/* Synchronise labels between @sk and the inode of its @parent socket.
 *
 * For PF_INET, PF_INET6 and PF_UNIX sockets the inode takes the SID of @sk.
 * In every case @sk takes its security class from the inode.
 */
static void selinux_sock_graft(struct sock *sk, struct socket *parent)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct inode_security_struct *isec;

        isec = inode_security_novalidate(SOCK_INODE(parent));

        switch (sk->sk_family) {
        case PF_INET:
        case PF_INET6:
        case PF_UNIX:
                isec->sid = sksec->sid;
                break;
        default:
                break;
        }

        sksec->sclass = isec->sclass;
}

/* Called whenever SCTP receives an INIT chunk. This happens on an incoming
 * connect(2), sctp_connectx(3) or sctp_sendmsg(3) (with no association
 * already present).
 *
 * Does nothing (returns 0) unless selinux_policycap_extsockclass() is true.
 * Otherwise it resolves the peer SID of @skb, records or checks it against
 * the peer SID stored on the endpoint's socket, computes the connection SID,
 * and stores both in @ep before handing off to NetLabel.
 *
 * Returns 0 on success or a negative errno.
 */
static int selinux_sctp_assoc_request(struct sctp_endpoint *ep,
                                      struct sk_buff *skb)
{
        struct sk_security_struct *sksec = ep->base.sk->sk_security;
        struct common_audit_data ad;
        struct lsm_network_audit net = {0,};
        u8 peerlbl_active;
        u32 peer_sid = SECINITSID_UNLABELED;
        u32 conn_sid;
        int err = 0;

        if (!selinux_policycap_extsockclass())
                return 0;

        peerlbl_active = selinux_peerlbl_enabled();

        if (peerlbl_active) {
                /* This will return peer_sid = SECSID_NULL if there are
                 * no peer labels, see security_net_peersid_resolve().
                 */
                err = selinux_skb_peerlbl_sid(skb, ep->base.sk->sk_family,
                                              &peer_sid);
                if (err)
                        return err;

                /* Treat "no peer label" as the unlabeled initial SID. */
                if (peer_sid == SECSID_NULL)
                        peer_sid = SECINITSID_UNLABELED;
        }

        if (sksec->sctp_assoc_state == SCTP_ASSOC_UNSET) {
                sksec->sctp_assoc_state = SCTP_ASSOC_SET;

                /* Here as first association on socket. As the peer SID
                 * was allowed by peer recv (and the netif/node checks),
                 * then it is approved by policy and used as the primary
                 * peer SID for getpeercon(3).
                 */
                sksec->peer_sid = peer_sid;
        } else if  (sksec->peer_sid != peer_sid) {
                /* Other association peer SIDs are checked to enforce
                 * consistency among the peer SIDs.
                 */
                ad.type = LSM_AUDIT_DATA_NET;
                ad.u.net = &net;
                ad.u.net->sk = ep->base.sk;
                err = avc_has_perm(&selinux_state,
                                   sksec->peer_sid, peer_sid, sksec->sclass,
                                   SCTP_SOCKET__ASSOCIATION, &ad);
                if (err)
                        return err;
        }

        /* Compute the MLS component for the connection and store
         * the information in ep. This will be used by SCTP TCP type
         * sockets and peeled off connections as they cause a new
         * socket to be generated. selinux_sctp_sk_clone() will then
         * plug this into the new socket.
         */
        err = selinux_conn_sid(sksec->sid, peer_sid, &conn_sid);
        if (err)
                return err;

        ep->secid = conn_sid;
        ep->peer_secid = peer_sid;

        /* Set any NetLabel labels including CIPSO/CALIPSO options. */
        return selinux_netlbl_sctp_assoc_request(ep, skb);
}

/* Check whether SCTP IPv4/IPv6 addresses are valid for binding or connecting,
 * as selected by @optname.
 *
 * @address points to @addrlen bytes holding one or more packed socket
 * addresses. Each one is sized from its sa_family and passed to the bind or
 * the connect checks. Does nothing (returns 0) unless
 * selinux_policycap_extsockclass() is true.
 *
 * Returns 0 if every address is allowed. Returns -EINVAL for a truncated
 * buffer, an unsupported address family or an @optname that is not handled
 * here; otherwise the error from the failing check is returned.
 */
static int selinux_sctp_bind_connect(struct sock *sk, int optname,
                                     struct sockaddr *address,
                                     int addrlen)
{
        int len, err = 0, walk_size = 0;
        void *addr_buf;
        struct sockaddr *addr;
        struct socket *sock;

        if (!selinux_policycap_extsockclass())
                return 0;

        /* Process one or more addresses that may be IPv4 or IPv6 */
        sock = sk->sk_socket;
        addr_buf = address;

        while (walk_size < addrlen) {
                /* There must be room for at least the family field. */
                if (walk_size + sizeof(sa_family_t) > addrlen)
                        return -EINVAL;

                addr = addr_buf;
                switch (addr->sa_family) {
                case AF_UNSPEC:
                case AF_INET:
                        len = sizeof(struct sockaddr_in);
                        break;
                case AF_INET6:
                        len = sizeof(struct sockaddr_in6);
                        break;
                default:
                        return -EINVAL;
                }

                /* The whole address must lie inside the buffer. */
                if (walk_size + len > addrlen)
                        return -EINVAL;

                /* Stays -EINVAL when @optname matches no case below. */
                err = -EINVAL;
                switch (optname) {
                /* Bind checks */
                case SCTP_PRIMARY_ADDR:
                case SCTP_SET_PEER_PRIMARY_ADDR:
                case SCTP_SOCKOPT_BINDX_ADD:
                        err = selinux_socket_bind(sock, addr, len);
                        break;
                /* Connect checks */
                case SCTP_SOCKOPT_CONNECTX:
                case SCTP_PARAM_SET_PRIMARY:
                case SCTP_PARAM_ADD_IP:
                case SCTP_SENDMSG_CONNECT:
                        err = selinux_socket_connect_helper(sock, addr, len);
                        if (err)
                                return err;

                        /* As selinux_sctp_bind_connect() is called by the
                         * SCTP protocol layer, the socket is already locked,
                         * therefore selinux_netlbl_socket_connect_locked()
                         * is called here. The situations handled are:
                         * sctp_connectx(3), sctp_sendmsg(3), sendmsg(2),
                         * whenever a new IP address is added or when a new
                         * primary address is selected.
                         * Note that an SCTP connect(2) call happens before
                         * the SCTP protocol layer and is handled via
                         * selinux_socket_connect().
                         */
                        err = selinux_netlbl_socket_connect_locked(sk, addr);
                        break;
                }

                if (err)
                        return err;

                /* Advance to the next packed address. */
                addr_buf += len;
                walk_size += len;
        }

        return 0;
}

/* Called whenever a new socket is created by accept(2) or sctp_peeloff(3).
 *
 * The new socket takes its SID and peer SID from the values stored in @ep
 * and its class from @sk, and its NetLabel state is cloned from @sk.
 */
static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk,
                                  struct sock *newsk)
{
        struct sk_security_struct *parent = sk->sk_security;
        struct sk_security_struct *child = newsk->sk_security;

        /* If policy does not support SECCLASS_SCTP_SOCKET then fall back
         * to the non-sctp clone version.
         */
        if (!selinux_policycap_extsockclass()) {
                selinux_sk_clone_security(sk, newsk);
                return;
        }

        child->sclass = parent->sclass;
        child->sid = ep->secid;
        child->peer_sid = ep->peer_secid;
        selinux_netlbl_sctp_sk_clone(sk, newsk);
}

/* Label an incoming connection request.
 *
 * Resolves the peer SID of @skb, derives the connection SID from the
 * listening socket's SID and the peer SID, stores both in @req and then lets
 * NetLabel process the request. @req is left untouched if either lookup
 * fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
                                     struct request_sock *req)
{
        struct sk_security_struct *sksec = sk->sk_security;
        u16 family = req->rsk_ops->family;
        u32 peer_sid, conn_sid;
        int rc;

        rc = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
        if (!rc)
                rc = selinux_conn_sid(sksec->sid, peer_sid, &conn_sid);
        if (rc)
                return rc;

        req->secid = conn_sid;
        req->peer_secid = peer_sid;

        return selinux_netlbl_inet_conn_request(req, family);
}

/* Transfer the labels stored in the request sock @req onto the newly created
 * child socket @newsk and let NetLabel do its part of the clone.
 */
static void selinux_inet_csk_clone(struct sock *newsk,
                                   const struct request_sock *req)
{
        struct sk_security_struct *child = newsk->sk_security;

        child->peer_sid = req->peer_secid;
        child->sid = req->secid;
        /* NOTE: Ideally the isec->sid of the new socket would be brought in
         * sync here as well, but the isec is not available yet. That is
         * deferred to sock_graft, by which time it will have been created
         * and be available.
         */

        /* No locking is needed here as we are the only thread with access
         * to the new socket's security blob.
         */
        selinux_netlbl_inet_csk_clone(newsk, req->rsk_ops->family);
}

/* Record the peer SID of @skb as the peer SID of @sk once the connection is
 * established. The return value of selinux_skb_peerlbl_sid() is not checked.
 */
static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
{
        struct sk_security_struct *sksec = sk->sk_security;
        u16 family = sk->sk_family;

        /* An IPv4 packet on an IPv6 socket is a mapped address: resolve the
         * label using the IPv4 family.
         */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;

        selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
}

/* Check that the current task's SID holds PACKET__RELABELTO on @sid. */
static int selinux_secmark_relabel_packet(u32 sid)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        u32 task_sid = tsec->sid;

        return avc_has_perm(&selinux_state,
                            task_sid, sid, SECCLASS_PACKET, PACKET__RELABELTO,
                            NULL);
}

/* Atomically increment the selinux_secmark_refcount counter. */
static void selinux_secmark_refcount_inc(void)
{
        atomic_inc(&selinux_secmark_refcount);
}

/* Atomically decrement the selinux_secmark_refcount counter. */
static void selinux_secmark_refcount_dec(void)
{
        atomic_dec(&selinux_secmark_refcount);
}

/* Copy the secid stored in the request sock @req into the flow's
 * flowic_secid field.
 */
static void selinux_req_classify_flow(const struct request_sock *req,
                                      struct flowi_common *flic)
{
        flic->flowic_secid = req->secid;
}

/* Allocate a TUN security blob labelled with the current task's SID and
 * return it through *@security.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails, in which case
 * *@security is not written.
 */
static int selinux_tun_dev_alloc_security(void **security)
{
        struct tun_security_struct *blob = kzalloc(sizeof(*blob), GFP_KERNEL);

        if (!blob)
                return -ENOMEM;

        blob->sid = current_sid();
        *security = blob;

        return 0;
}

/* Free the TUN security blob @security with kfree(). */
static void selinux_tun_dev_free_security(void *security)
{
        kfree(security);
}

/* Check that the current task may create a TUN socket: TUN_SOCKET__CREATE
 * with the task's SID as both source and target.
 */
static int selinux_tun_dev_create(void)
{
        const u32 task_sid = current_sid();

        /* The "sockcreate" SID is deliberately not taken into account: the
         * socket being created here is not a socket in the traditional
         * sense. It is a private sock, accessible only to the kernel, that
         * represents a wide range of network traffic spanning multiple
         * connections - check the TUN driver to get a better understanding
         * of why this socket is special.
         */
        return avc_has_perm(&selinux_state,
                            task_sid, task_sid,
                            SECCLASS_TUN_SOCKET, TUN_SOCKET__CREATE,
                            NULL);
}

static int selinux_tun_dev_attach_queue(void *security)
{
        struct tun_security_struct *tunsec = security;

        return avc_has_perm(&selinux_state,
                            current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET,
                            TUN_SOCKET__ATTACH_QUEUE, NULL);
}

/* Label @sk as a TUN socket carrying the SID stored in the TUN security blob
 * @security. Always returns 0.
 */
static int selinux_tun_dev_attach(struct sock *sk, void *security)
{
        struct sk_security_struct *sksec = sk->sk_security;
        struct tun_security_struct *tunsec = security;

        /* No NetLabel based labeling is performed here and it is not clear
         * that it would be wanted anyway. Labeling could be applied without
         * the support of the TUN user, but the resulting labeled traffic
         * from the other end of the connection would almost certainly
         * confuse a TUN user that had no idea network labeling protocols
         * were being used.
         */
        sksec->sclass = SECCLASS_TUN_SOCKET;
        sksec->sid = tunsec->sid;

        return 0;
}

/* Relabel the TUN security blob @security to the current task's SID.
 *
 * Requires TUN_SOCKET__RELABELFROM on the blob's present SID and
 * TUN_SOCKET__RELABELTO on the task's own SID. The blob is only updated when
 * both checks pass.
 *
 * Returns 0 on success or the error from the failing check.
 */
static int selinux_tun_dev_open(void *security)
{
        struct tun_security_struct *tunsec = security;
        u32 task_sid = current_sid();
        int rc;

        rc = avc_has_perm(&selinux_state,
                          task_sid, tunsec->sid, SECCLASS_TUN_SOCKET,
                          TUN_SOCKET__RELABELFROM, NULL);
        if (!rc)
                rc = avc_has_perm(&selinux_state,
                                  task_sid, task_sid, SECCLASS_TUN_SOCKET,
                                  TUN_SOCKET__RELABELTO, NULL);
        if (!rc)
                tunsec->sid = task_sid;

        return rc;
}

#ifdef CONFIG_NETFILTER

/* Common forwarding check for IPv4 and IPv6 packets.
 *
 * Returns NF_ACCEPT without checking anything when
 * selinux_policycap_netpeer() is false or when neither SECMARK nor peer
 * labeling is enabled. Otherwise the packet's peer label is checked against
 * the incoming interface and node (peer labeling) and against the packet's
 * secmark (PACKET__FORWARD_IN), and, when NetLabel is enabled, the peer SID
 * is applied to the packet. Any failure yields NF_DROP.
 */
static unsigned int selinux_ip_forward(struct sk_buff *skb,
                                       const struct net_device *indev,
                                       u16 family)
{
        int err;
        char *addrp;
        u32 peer_sid;
        struct common_audit_data ad;
        struct lsm_network_audit net = {0,};
        u8 secmark_active;
        u8 netlbl_active;
        u8 peerlbl_active;

        if (!selinux_policycap_netpeer())
                return NF_ACCEPT;

        secmark_active = selinux_secmark_enabled();
        netlbl_active = netlbl_enabled();
        peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return NF_ACCEPT;

        /* The peer SID is needed by every check below. */
        if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
                return NF_DROP;

        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;
        ad.u.net->netif = indev->ifindex;
        ad.u.net->family = family;
        if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0)
                return NF_DROP;

        if (peerlbl_active) {
                err = selinux_inet_sys_rcv_skb(dev_net(indev), indev->ifindex,
                                               addrp, family, peer_sid, &ad);
                if (err) {
                        /* Report the denial through NetLabel before dropping. */
                        selinux_netlbl_err(skb, family, err, 1);
                        return NF_DROP;
                }
        }

        if (secmark_active)
                if (avc_has_perm(&selinux_state,
                                 peer_sid, skb->secmark,
                                 SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
                        return NF_DROP;

        if (netlbl_active)
                /* we do this in the FORWARD path and not the POST_ROUTING
                 * path because we want to make sure we apply the necessary
                 * labeling before IPsec is applied so we can leverage AH
                 * protection */
                if (selinux_netlbl_skbuff_setsid(skb, family, peer_sid) != 0)
                        return NF_DROP;

        return NF_ACCEPT;
}

/* IPv4 forward hook: run selinux_ip_forward() on the incoming device with
 * family PF_INET. @priv is not used.
 */
static unsigned int selinux_ipv4_forward(void *priv,
                                         struct sk_buff *skb,
                                         const struct nf_hook_state *state)
{
        const struct net_device *indev = state->in;

        return selinux_ip_forward(skb, indev, PF_INET);
}

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 forward hook: run selinux_ip_forward() on the incoming device with
 * family PF_INET6. @priv is not used.
 */
static unsigned int selinux_ipv6_forward(void *priv,
                                         struct sk_buff *skb,
                                         const struct nf_hook_state *state)
{
        const struct net_device *indev = state->in;

        return selinux_ip_forward(skb, indev, PF_INET6);
}
#endif        /* IPV6 */

/* Common output labeling for IPv4 and IPv6 packets.
 *
 * Returns NF_ACCEPT immediately when NetLabel is disabled or when the packet
 * belongs to a listening socket. Otherwise the packet is labelled through
 * selinux_netlbl_skbuff_setsid() with the SID of its socket, or with
 * SECINITSID_KERNEL when it has no socket; a labeling failure gives NF_DROP.
 */
static unsigned int selinux_ip_output(struct sk_buff *skb,
                                      u16 family)
{
        struct sk_security_struct *sksec;
        struct sock *sk;
        u32 sid = SECINITSID_KERNEL;

        if (!netlbl_enabled())
                return NF_ACCEPT;

        /* This is done in the LOCAL_OUT path and not the POST_ROUTING path
         * so that the necessary labeling is applied before IPsec, which
         * lets us leverage AH protection.
         */
        sk = skb->sk;
        if (sk) {
                /* If the socket is in the listening state then this packet
                 * is a SYN-ACK packet, which means it needs to be labeled
                 * based on the connection/request_sock and not the parent
                 * socket. Unfortunately the request_sock cannot be looked
                 * up yet as it isn't queued on the parent socket until
                 * after the SYN-ACK is sent. The "solution" is to simply
                 * pass the packet as-is, as any IP option based labeling
                 * should be copied from the initial connection request (in
                 * the IP layer). It is far from ideal, but until we get a
                 * security label in the packet itself this is the best we
                 * can do.
                 */
                if (sk_listener(sk))
                        return NF_ACCEPT;

                /* standard practice, label using the parent socket */
                sksec = sk->sk_security;
                sid = sksec->sid;
        }

        if (selinux_netlbl_skbuff_setsid(skb, family, sid) != 0)
                return NF_DROP;

        return NF_ACCEPT;
}

static unsigned int selinux_ipv4_output(void *priv,
                                        struct sk_buff *skb,
                                        const struct nf_hook_state *state)
{
        return selinux_ip_output(skb, PF_INET);
}

#if IS_ENABLED(CONFIG_IPV6)
static unsigned int selinux_ipv6_output(void *priv,
                                        struct sk_buff *skb,
                                        const struct nf_hook_state *state)
{
        return selinux_ip_output(skb, PF_INET6);
}
#endif        /* IPV6 */

/*
 * Post-routing checks used by selinux_ip_postroute() when the netpeer
 * policy capability is not enabled.
 *
 * Packets without a (full) socket are accepted unchecked.  Otherwise the
 * socket SID is checked for PACKET__SEND against the packet's secmark (only
 * when secmark is enabled) and then handed to the XFRM post-route check.
 *
 * Returns NF_ACCEPT, NF_DROP when the packet cannot be parsed, or
 * NF_DROP_ERR(-ECONNREFUSED) when a permission check fails.
 */
static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
                                                int ifindex,
                                                u16 family)
{
        struct lsm_network_audit net_audit = {0,};
        struct common_audit_data audit;
        struct sk_security_struct *sock_sec;
        struct sock *full_sk;
        char *addr;
        u8 l4proto = 0;

        full_sk = skb_to_full_sk(skb);
        if (!full_sk)
                return NF_ACCEPT;
        sock_sec = full_sk->sk_security;

        /* describe the packet for any audit record generated below */
        net_audit.netif = ifindex;
        net_audit.family = family;
        audit.type = LSM_AUDIT_DATA_NET;
        audit.u.net = &net_audit;
        if (selinux_parse_skb(skb, &audit, &addr, 0, &l4proto))
                return NF_DROP;

        if (selinux_secmark_enabled() &&
            avc_has_perm(&selinux_state,
                         sock_sec->sid, skb->secmark,
                         SECCLASS_PACKET, PACKET__SEND, &audit))
                return NF_DROP_ERR(-ECONNREFUSED);

        if (selinux_xfrm_postroute_last(sock_sec->sid, skb, &audit, l4proto))
                return NF_DROP_ERR(-ECONNREFUSED);

        return NF_ACCEPT;
}

/*
 * Common post-routing access control for outgoing packets.
 *
 * Picks the SID that acts as the subject of the checks (peer_sid) and the
 * packet-class permission to test (secmark_perm) from one of three cases:
 * no socket, a listening socket, or any other local socket.  It then
 * checks, as enabled:
 *   - secmark:     peer_sid -> skb->secmark, SECCLASS_PACKET / secmark_perm
 *   - peer labels: peer_sid -> outgoing interface SID, NETIF__EGRESS
 *                  peer_sid -> node SID looked up from addrp, NODE__SENDTO
 *
 * Returns NF_ACCEPT when the packet may pass, NF_DROP when a label or the
 * packet itself cannot be resolved, and NF_DROP_ERR(-ECONNREFUSED) when a
 * permission check fails.
 */
static unsigned int selinux_ip_postroute(struct sk_buff *skb,
                                         const struct net_device *outdev,
                                         u16 family)
{
        u32 secmark_perm;
        u32 peer_sid;
        int ifindex = outdev->ifindex;
        struct sock *sk;
        struct common_audit_data ad;
        struct lsm_network_audit net = {0,};
        char *addrp;
        u8 secmark_active;
        u8 peerlbl_active;

        /* If any sort of compatibility mode is enabled then handoff processing
         * to the selinux_ip_postroute_compat() function to deal with the
         * special handling.  We do this in an attempt to keep this function
         * as fast and as clean as possible. */
        if (!selinux_policycap_netpeer())
                return selinux_ip_postroute_compat(skb, ifindex, family);

        /* Nothing to enforce when neither mechanism is active. */
        secmark_active = selinux_secmark_enabled();
        peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return NF_ACCEPT;

        sk = skb_to_full_sk(skb);

#ifdef CONFIG_XFRM
        /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
         * packet transformation so allow the packet to pass without any checks
         * since we'll have another chance to perform access control checks
         * when the packet is on its final way out.
         * NOTE: there appear to be some IPv6 multicast cases where skb->dst
         *       is NULL, in this case go ahead and apply access control.
         * NOTE: if this is a local socket (skb->sk != NULL) that is in the
         *       TCP listening state we cannot wait until the XFRM processing
         *       is done as we will miss out on the SA label if we do;
         *       unfortunately, this means more work, but it is only once per
         *       connection. */
        if (skb_dst(skb) != NULL && skb_dst(skb)->xfrm != NULL &&
            !(sk && sk_listener(sk)))
                return NF_ACCEPT;
#endif

        if (sk == NULL) {
                /* Without an associated socket the packet is either coming
                 * from the kernel or it is being forwarded; check the packet
                 * to determine which and if the packet is being forwarded
                 * query the packet directly to determine the security label. */
                if (skb->skb_iif) {
                        secmark_perm = PACKET__FORWARD_OUT;
                        if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
                                return NF_DROP;
                } else {
                        secmark_perm = PACKET__SEND;
                        peer_sid = SECINITSID_KERNEL;
                }
        } else if (sk_listener(sk)) {
                /* Locally generated packet but the associated socket is in the
                 * listening state which means this is a SYN-ACK packet.  In
                 * this particular case the correct security label is assigned
                 * to the connection/request_sock but unfortunately we can't
                 * query the request_sock as it isn't queued on the parent
                 * socket until after the SYN-ACK packet is sent; the only
                 * viable choice is to regenerate the label like we do in
                 * selinux_inet_conn_request().  See also selinux_ip_output()
                 * for similar problems. */
                u32 skb_sid;
                struct sk_security_struct *sksec;

                sksec = sk->sk_security;
                if (selinux_skb_peerlbl_sid(skb, family, &skb_sid))
                        return NF_DROP;
                /* At this point, if the returned skb peerlbl is SECSID_NULL
                 * and the packet has been through at least one XFRM
                 * transformation then we must be dealing with the "final"
                 * form of labeled IPsec packet; since we've already applied
                 * all of our access controls on this packet we can safely
                 * pass the packet. */
                if (skb_sid == SECSID_NULL) {
                        switch (family) {
                        case PF_INET:
                                if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
                                        return NF_ACCEPT;
                                break;
                        case PF_INET6:
                                if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
                                        return NF_ACCEPT;
                                break;
                        default:
                                /* any other family is refused outright */
                                return NF_DROP_ERR(-ECONNREFUSED);
                        }
                }
                /* Combine the listening socket's SID with the packet's peer
                 * label to obtain the SID used for the checks below. */
                if (selinux_conn_sid(sksec->sid, skb_sid, &peer_sid))
                        return NF_DROP;
                secmark_perm = PACKET__SEND;
        } else {
                /* Locally generated packet, fetch the security label from the
                 * associated socket. */
                struct sk_security_struct *sksec = sk->sk_security;
                peer_sid = sksec->sid;
                secmark_perm = PACKET__SEND;
        }

        /* Fill in the audit data; selinux_parse_skb() also sets addrp, which
         * is used for the node lookup further down. */
        ad.type = LSM_AUDIT_DATA_NET;
        ad.u.net = &net;
        ad.u.net->netif = ifindex;
        ad.u.net->family = family;
        if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL))
                return NF_DROP;

        if (secmark_active)
                if (avc_has_perm(&selinux_state,
                                 peer_sid, skb->secmark,
                                 SECCLASS_PACKET, secmark_perm, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);

        if (peerlbl_active) {
                u32 if_sid;
                u32 node_sid;

                /* egress check against the outgoing interface */
                if (sel_netif_sid(dev_net(outdev), ifindex, &if_sid))
                        return NF_DROP;
                if (avc_has_perm(&selinux_state,
                                 peer_sid, if_sid,
                                 SECCLASS_NETIF, NETIF__EGRESS, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);

                /* sendto check against the node identified by addrp */
                if (sel_netnode_sid(addrp, family, &node_sid))
                        return NF_DROP;
                if (avc_has_perm(&selinux_state,
                                 peer_sid, node_sid,
                                 SECCLASS_NODE, NODE__SENDTO, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);
        }

        return NF_ACCEPT;
}

/* IPv4 post-routing hook: runs selinux_ip_postroute() against the hook
 * state's output device; @priv is not used. */
static unsigned int selinux_ipv4_postroute(void *priv,
                                           struct sk_buff *skb,
                                           const struct nf_hook_state *state)
{
        const struct net_device *egress_dev = state->out;

        return selinux_ip_postroute(skb, egress_dev, PF_INET);
}

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 post-routing hook: runs selinux_ip_postroute() against the hook
 * state's output device; @priv is not used. */
static unsigned int selinux_ipv6_postroute(void *priv,
                                           struct sk_buff *skb,
                                           const struct nf_hook_state *state)
{
        const struct net_device *egress_dev = state->out;

        return selinux_ip_postroute(skb, egress_dev, PF_INET6);
}
#endif        /* IPV6 */

#endif        /* CONFIG_NETFILTER */

/*
 * Check every netlink message carried in @skb against the permission mapped
 * to its message type for the sending socket's class.
 *
 * Walks the buffer one nlmsghdr at a time.  Returns 0 when all messages are
 * allowed (or processing stops at a junk length field), otherwise the error
 * from the failed permission check or message-type lookup.
 */
static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        struct sk_security_struct *sksec = sk->sk_security;
        u16 sclass = sksec->sclass;
        unsigned char *cursor = skb->data;
        unsigned int remaining = skb->len;
        unsigned int step;
        struct nlmsghdr *hdr;
        u32 perm;
        int rc = 0;

        while (remaining >= nlmsg_total_size(0)) {
                hdr = (struct nlmsghdr *)cursor;

                /* NOTE: some netlink users do not set nlmsg_len reliably, so
                 *       skbs with bogus length fields cannot be rejected;
                 *       like netlink_rcv_skb() we simply stop processing at
                 *       a message whose length field is clearly junk
                 */
                if (hdr->nlmsg_len < NLMSG_HDRLEN ||
                    hdr->nlmsg_len > remaining)
                        return 0;

                rc = selinux_nlmsg_lookup(sclass, hdr->nlmsg_type, &perm);
                switch (rc) {
                case 0:
                        rc = sock_has_perm(sk, perm);
                        if (rc)
                                return rc;
                        break;
                case -EINVAL:
                        /* -EINVAL: no msg/perm mapping for this type */
                        pr_warn_ratelimited("SELinux: unrecognized netlink"
                                " message: protocol=%hu nlmsg_type=%hu sclass=%s"
                                " pid=%d comm=%s\n",
                                sk->sk_protocol, hdr->nlmsg_type,
                                secclass_map[sclass - 1].name,
                                task_pid_nr(current), current->comm);
                        if (enforcing_enabled(&selinux_state) &&
                            !security_get_allow_unknown(&selinux_state))
                                return rc;
                        rc = 0;
                        break;
                case -ENOENT:
                        /* -ENOENT: no socket/class mapping, ignore */
                        rc = 0;
                        break;
                default:
                        return rc;
                }

                /* advance past this message, including netlink padding */
                step = NLMSG_ALIGN(hdr->nlmsg_len);
                if (step >= remaining)
                        return 0;
                cursor += step;
                remaining -= step;
        }

        return rc;
}

/* Initialise an IPC security blob: record the object class and label the
 * object with the SID of the current task. */
static void ipc_init_security(struct ipc_security_struct *isec, u16 sclass)
{
        isec->sid = current_sid();
        isec->sclass = sclass;
}

/*
 * Check whether the current task holds @perms on the IPC object
 * @ipc_perms, using the class stored in the object's security blob.
 * The object's key is recorded as audit data.
 * Returns the result of avc_has_perm().
 */
static int ipc_has_perm(struct kern_ipc_perm *ipc_perms,
                        u32 perms)
{
        struct common_audit_data audit;
        u32 subject_sid = current_sid();
        struct ipc_security_struct *obj_sec = selinux_ipc(ipc_perms);

        audit.u.ipc_id = ipc_perms->key;
        audit.type = LSM_AUDIT_DATA_IPC;

        return avc_has_perm(&selinux_state,
                            subject_sid, obj_sec->sid, obj_sec->sclass,
                            perms, &audit);
}

/* Give a newly allocated message the unlabeled SID; the real label is
 * computed on first send (see selinux_msg_queue_msgsnd()).  Always
 * returns 0. */
static int selinux_msg_msg_alloc_security(struct msg_msg *msg)
{
        struct msg_security_struct *msg_sec = selinux_msg_msg(msg);

        msg_sec->sid = SECINITSID_UNLABELED;
        return 0;
}

/* message queue security operations */
static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        int rc;

        isec = selinux_ipc(msq);
        ipc_init_security(isec, SECCLASS_MSGQ);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_MSGQ,
                          MSGQ__CREATE, &ad);
        return rc;
}

static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();

        isec = selinux_ipc(msq);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        return avc_has_perm(&selinux_state,
                            sid, isec->sid, SECCLASS_MSGQ,
                            MSGQ__ASSOCIATE, &ad);
}

/*
 * Map a msgctl() command onto the message queue permission it needs and
 * check it for the current task.  Commands not listed here are allowed
 * without a check (returns 0).
 */
static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        int perms;

        /* No specific object, just general system-wide information. */
        if (cmd == IPC_INFO || cmd == MSG_INFO)
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);

        switch (cmd) {
        case IPC_STAT:
        case MSG_STAT:
        case MSG_STAT_ANY:
                perms = MSGQ__GETATTR | MSGQ__ASSOCIATE;
                break;
        case IPC_SET:
                perms = MSGQ__SETATTR;
                break;
        case IPC_RMID:
                perms = MSGQ__DESTROY;
                break;
        default:
                return 0;
        }

        return ipc_has_perm(msq, perms);
}

static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg)
{
        struct ipc_security_struct *isec;
        struct msg_security_struct *msec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        int rc;

        isec = selinux_ipc(msq);
        msec = selinux_msg_msg(msg);

        /*
         * First time through, need to assign label to the message
         */
        if (msec->sid == SECINITSID_UNLABELED) {
                /*
                 * Compute new sid based on current process and
                 * message queue this message will be stored in
                 */
                rc = security_transition_sid(&selinux_state, sid, isec->sid,
                                             SECCLASS_MSG, NULL, &msec->sid);
                if (rc)
                        return rc;
        }

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        /* Can this process write to the queue? */
        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_MSGQ,
                          MSGQ__WRITE, &ad);
        if (!rc)
                /* Can this process send the message */
                rc = avc_has_perm(&selinux_state,
                                  sid, msec->sid, SECCLASS_MSG,
                                  MSG__SEND, &ad);
        if (!rc)
                /* Can the message be put in the queue? */
                rc = avc_has_perm(&selinux_state,
                                  msec->sid, isec->sid, SECCLASS_MSGQ,
                                  MSGQ__ENQUEUE, &ad);

        return rc;
}

static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                                    struct task_struct *target,
                                    long type, int mode)
{
        struct ipc_security_struct *isec;
        struct msg_security_struct *msec;
        struct common_audit_data ad;
        u32 sid = task_sid(target);
        int rc;

        isec = selinux_ipc(msq);
        msec = selinux_msg_msg(msg);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid,
                          SECCLASS_MSGQ, MSGQ__READ, &ad);
        if (!rc)
                rc = avc_has_perm(&selinux_state,
                                  sid, msec->sid,
                                  SECCLASS_MSG, MSG__RECEIVE, &ad);
        return rc;
}

/* Shared Memory security operations */
static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        int rc;

        isec = selinux_ipc(shp);
        ipc_init_security(isec, SECCLASS_SHM);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = shp->key;

        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_SHM,
                          SHM__CREATE, &ad);
        return rc;
}

static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();

        isec = selinux_ipc(shp);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = shp->key;

        return avc_has_perm(&selinux_state,
                            sid, isec->sid, SECCLASS_SHM,
                            SHM__ASSOCIATE, &ad);
}

/*
 * Map a shmctl() command onto the shared memory permission it needs and
 * check it for the current task.  Commands not listed here are allowed
 * without a check (returns 0).
 *
 * Note, at this point, shp is locked down.
 */
static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        int perms;

        /* No specific object, just general system-wide information. */
        if (cmd == IPC_INFO || cmd == SHM_INFO)
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);

        switch (cmd) {
        case IPC_STAT:
        case SHM_STAT:
        case SHM_STAT_ANY:
                perms = SHM__GETATTR | SHM__ASSOCIATE;
                break;
        case IPC_SET:
                perms = SHM__SETATTR;
                break;
        case SHM_LOCK:
        case SHM_UNLOCK:
                perms = SHM__LOCK;
                break;
        case IPC_RMID:
                perms = SHM__DESTROY;
                break;
        default:
                return 0;
        }

        return ipc_has_perm(shp, perms);
}

/* Check attach permission on a shared memory segment: read access is always
 * required, write access as well unless SHM_RDONLY is set in @shmflg.
 * @shmaddr is not consulted. */
static int selinux_shm_shmat(struct kern_ipc_perm *shp,
                             char __user *shmaddr, int shmflg)
{
        u32 perms = SHM__READ;

        if (!(shmflg & SHM_RDONLY))
                perms |= SHM__WRITE;

        return ipc_has_perm(shp, perms);
}

/* Semaphore security operations */
static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        int rc;

        isec = selinux_ipc(sma);
        ipc_init_security(isec, SECCLASS_SEM);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = sma->key;

        rc = avc_has_perm(&selinux_state,
                          sid, isec->sid, SECCLASS_SEM,
                          SEM__CREATE, &ad);
        return rc;
}

static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        struct ipc_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();

        isec = selinux_ipc(sma);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = sma->key;

        return avc_has_perm(&selinux_state,
                            sid, isec->sid, SECCLASS_SEM,
                            SEM__ASSOCIATE, &ad);
}

/*
 * Map a semctl() command onto the semaphore permission it needs and check
 * it for the current task.  Commands not listed here are allowed without a
 * check (returns 0).
 *
 * Note, at this point, sma is locked down.
 */
static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        u32 perms;

        /* No specific object, just general system-wide information. */
        if (cmd == IPC_INFO || cmd == SEM_INFO)
                return avc_has_perm(&selinux_state,
                                    current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);

        switch (cmd) {
        case IPC_STAT:
        case SEM_STAT:
        case SEM_STAT_ANY:
                perms = SEM__GETATTR | SEM__ASSOCIATE;
                break;
        case GETPID:
        case GETNCNT:
        case GETZCNT:
                perms = SEM__GETATTR;
                break;
        case IPC_SET:
                perms = SEM__SETATTR;
                break;
        case GETVAL:
        case GETALL:
                perms = SEM__READ;
                break;
        case SETVAL:
        case SETALL:
                perms = SEM__WRITE;
                break;
        case IPC_RMID:
                perms = SEM__DESTROY;
                break;
        default:
                return 0;
        }

        return ipc_has_perm(sma, perms);
}

/* Check semaphore operation permission: read access is always required,
 * write access as well when @alter is non-zero.  @sops and @nsops are not
 * consulted. */
static int selinux_sem_semop(struct kern_ipc_perm *sma,
                             struct sembuf *sops, unsigned nsops, int alter)
{
        u32 perms = SEM__READ;

        if (alter)
                perms |= SEM__WRITE;

        return ipc_has_perm(sma, perms);
}

/* Translate the read/write bits in @flag into IPC__UNIX_READ and
 * IPC__UNIX_WRITE and check them on @ipcp.  Returns 0 without a check when
 * neither kind of bit is set. */
static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        u32 requested = 0;

        if (flag & S_IRUGO)
                requested |= IPC__UNIX_READ;
        if (flag & S_IWUGO)
                requested |= IPC__UNIX_WRITE;

        if (!requested)
                return 0;

        return ipc_has_perm(ipcp, requested);
}

/* Store the SID recorded in the IPC object's security blob into @secid. */
static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
{
        const struct ipc_security_struct *obj_sec = selinux_ipc(ipcp);

        *secid = obj_sec->sid;
}

/* Initialise the inode's security state via its dentry; nothing is done
 * when there is no inode. */
static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        if (!inode)
                return;

        inode_doinit_with_dentry(inode, dentry);
}

/*
 * Read one of task @p's SELinux attributes ("current", "prev", "exec",
 * "fscreate", "keycreate" or "sockcreate") as a context string.
 *
 * When @p is not the current task, PROCESS__GETATTR is checked first.
 * Returns the context length with the context stored in *@value, 0 when
 * the selected SID is unset, -EINVAL for an unknown @name, or the error
 * from the permission check or the SID-to-context conversion.
 */
static int selinux_getprocattr(struct task_struct *p,
                               char *name, char **value)
{
        const struct task_security_struct *task_sec;
        unsigned ctx_len;
        u32 attr_sid = 0;
        int rc = 0;

        /* the task's credentials are only read inside the RCU section */
        rcu_read_lock();
        task_sec = selinux_cred(__task_cred(p));

        if (current != p)
                rc = avc_has_perm(&selinux_state,
                                  current_sid(), task_sec->sid,
                                  SECCLASS_PROCESS, PROCESS__GETATTR, NULL);

        if (rc == 0) {
                if (strcmp(name, "current") == 0)
                        attr_sid = task_sec->sid;
                else if (strcmp(name, "prev") == 0)
                        attr_sid = task_sec->osid;
                else if (strcmp(name, "exec") == 0)
                        attr_sid = task_sec->exec_sid;
                else if (strcmp(name, "fscreate") == 0)
                        attr_sid = task_sec->create_sid;
                else if (strcmp(name, "keycreate") == 0)
                        attr_sid = task_sec->keycreate_sid;
                else if (strcmp(name, "sockcreate") == 0)
                        attr_sid = task_sec->sockcreate_sid;
                else
                        rc = -EINVAL;
        }
        rcu_read_unlock();

        if (rc)
                return rc;
        if (attr_sid == 0)
                return 0;

        rc = security_sid_to_context(&selinux_state, attr_sid, value, &ctx_len);
        if (rc)
                return rc;
        return ctx_len;
}

/*
 * Set one of the current task's SELinux attributes from a context string.
 *
 * @name:  "exec", "fscreate", "keycreate", "sockcreate" or "current"
 * @value: context string; empty, or starting with NUL or '\n', leaves the
 *         SID at 0.  A trailing '\n' is overwritten with NUL in place.
 * @size:  length of @value in bytes
 *
 * The matching PROCESS__SET* permission is checked first, then the context
 * is converted to a SID and written into a freshly prepared credential set,
 * which is committed on success and aborted on failure.
 *
 * Returns @size (reduced by one when a trailing newline was stripped) on
 * success, -EINVAL for an unknown @name, -ENOMEM when no credentials can be
 * prepared, or the error from a failed conversion or permission check.
 */
static int selinux_setprocattr(const char *name, void *value, size_t size)
{
        struct task_security_struct *tsec;
        struct cred *new;
        u32 mysid = current_sid(), sid = 0, ptsid;
        int error;
        char *str = value;

        /*
         * Basic control over ability to set these attributes at all.
         */
        if (!strcmp(name, "exec"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETEXEC, NULL);
        else if (!strcmp(name, "fscreate"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETFSCREATE, NULL);
        else if (!strcmp(name, "keycreate"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETKEYCREATE, NULL);
        else if (!strcmp(name, "sockcreate"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETSOCKCREATE, NULL);
        else if (!strcmp(name, "current"))
                error = avc_has_perm(&selinux_state,
                                     mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETCURRENT, NULL);
        else
                error = -EINVAL;
        if (error)
                return error;

        /* Obtain a SID for the context, if one was specified. */
        if (size && str[0] && str[0] != '\n') {
                /* strip a single trailing newline before conversion */
                if (str[size-1] == '\n') {
                        str[size-1] = 0;
                        size--;
                }
                error = security_context_to_sid(&selinux_state, value, size,
                                                &sid, GFP_KERNEL);
                /* Only "fscreate" may fall back to a forced conversion of an
                 * invalid context, and only when has_cap_mac_admin(true)
                 * succeeds; otherwise the invalid context is audited and
                 * the error returned. */
                if (error == -EINVAL && !strcmp(name, "fscreate")) {
                        if (!has_cap_mac_admin(true)) {
                                struct audit_buffer *ab;
                                size_t audit_size;

                                /* We strip a nul only if it is at the end, otherwise the
                                 * context contains a nul and we should audit that */
                                if (str[size - 1] == '\0')
                                        audit_size = size - 1;
                                else
                                        audit_size = size;
                                ab = audit_log_start(audit_context(),
                                                     GFP_ATOMIC,
                                                     AUDIT_SELINUX_ERR);
                                audit_log_format(ab, "op=fscreate invalid_context=");
                                audit_log_n_untrustedstring(ab, value, audit_size);
                                audit_log_end(ab);

                                return error;
                        }
                        error = security_context_to_sid_force(
                                                      &selinux_state,
                                                      value, size, &sid);
                }
                if (error)
                        return error;
        }

        /* From here on every failure must go through abort_change so the
         * prepared credentials are released. */
        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        /* Permission checking based on the specified context is
           performed during the actual operation (execve,
           open/mkdir/...), when we know the full context of the
           operation.  See selinux_bprm_creds_for_exec for the execve
           checks and may_create for the file creation checks. The
           operation will then fail if the context is not permitted. */
        tsec = selinux_cred(new);
        if (!strcmp(name, "exec")) {
                tsec->exec_sid = sid;
        } else if (!strcmp(name, "fscreate")) {
                tsec->create_sid = sid;
        } else if (!strcmp(name, "keycreate")) {
                /* a non-zero key creation SID needs KEY__CREATE up front */
                if (sid) {
                        error = avc_has_perm(&selinux_state, mysid, sid,
                                             SECCLASS_KEY, KEY__CREATE, NULL);
                        if (error)
                                goto abort_change;
                }
                tsec->keycreate_sid = sid;
        } else if (!strcmp(name, "sockcreate")) {
                tsec->sockcreate_sid = sid;
        } else if (!strcmp(name, "current")) {
                /* "current" cannot be reset to the zero SID */
                error = -EINVAL;
                if (sid == 0)
                        goto abort_change;

                /* Only allow single threaded processes to change context;
                 * a multi-threaded process must pass the bounded transition
                 * check instead.
                 * NOTE(review): the -EPERM assigned below is overwritten on
                 * every path before it can be returned. */
                error = -EPERM;
                if (!current_is_single_threaded()) {
                        error = security_bounded_transition(&selinux_state,
                                                            tsec->sid, sid);
                        if (error)
                                goto abort_change;
                }

                /* Check permissions for the transition. */
                error = avc_has_perm(&selinux_state,
                                     tsec->sid, sid, SECCLASS_PROCESS,
                                     PROCESS__DYNTRANSITION, NULL);
                if (error)
                        goto abort_change;

                /* Check for ptracing, and update the task SID if ok.
                   Otherwise, leave SID unchanged and fail. */
                ptsid = ptrace_parent_sid();
                if (ptsid != 0) {
                        error = avc_has_perm(&selinux_state,
                                             ptsid, sid, SECCLASS_PROCESS,
                                             PROCESS__PTRACE, NULL);
                        if (error)
                                goto abort_change;
                }

                tsec->sid = sid;
        } else {
                error = -EINVAL;
                goto abort_change;
        }

        commit_creds(new);
        return size;

abort_change:
        abort_creds(new);
        return error;
}

/* Return 1 when @name is exactly the SELinux xattr suffix, 0 otherwise. */
static int selinux_ismaclabel(const char *name)
{
        return !strcmp(name, XATTR_SELINUX_SUFFIX);
}

/* Convert @secid to its context string via security_sid_to_context();
 * the string and its length are returned through @secdata and @seclen. */
static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
{
        int rc;

        rc = security_sid_to_context(&selinux_state, secid, secdata, seclen);
        return rc;
}

/* Convert the context string @secdata of length @seclen to a SID stored in
 * @secid, using security_context_to_sid() with GFP_KERNEL. */
static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        rc = security_context_to_sid(&selinux_state, secdata, seclen,
                                     secid, GFP_KERNEL);
        return rc;
}

/* Free a context string with kfree(); @seclen is not needed for that. */
static void selinux_release_secctx(char *secdata, u32 seclen)
{
        char *context = secdata;

        kfree(context);
}

/* Mark the inode's security label as invalid, under the label's
 * spinlock. */
static void selinux_inode_invalidate_secctx(struct inode *inode)
{
        struct inode_security_struct *label = selinux_inode(inode);

        spin_lock(&label->lock);
        label->initialized = LABEL_INVALID;
        spin_unlock(&label->lock);
}

/*
 *        Update the in-core security label of @inode from context @ctx.
 *        -EOPNOTSUPP from selinux_inode_setsecurity() is reported as
 *        success; any other result is returned unchanged.
 *
 *        called with inode->i_mutex locked
 */
static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        int rc;

        rc = selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX,
                                       ctx, ctxlen, 0);

        /* Do not return error when suppressing label (SBLABEL_MNT not set). */
        if (rc == -EOPNOTSUPP)
                return 0;
        return rc;
}

/*
 *        Write context @ctx as the SELinux xattr of @dentry using
 *        __vfs_setxattr_noperm().
 *
 *        called with inode->i_mutex locked
 */
static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        int rc;

        rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0);
        return rc;
}

/* Fetch the inode's SELinux context into *@ctx and its length into
 * *@ctxlen.  Returns 0 on success or the negative value returned by
 * selinux_inode_getsecurity(). */
static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
        int ctx_size;

        ctx_size = selinux_inode_getsecurity(inode, XATTR_SELINUX_SUFFIX,
                                             ctx, true);
        if (ctx_size >= 0) {
                *ctxlen = ctx_size;
                return 0;
        }
        return ctx_size;
}
#ifdef CONFIG_KEYS

static int selinux_key_alloc(struct key *k, const struct cred *cred,
                             unsigned long flags)
{
        const struct task_security_struct *tsec;
        struct key_security_struct *ksec;

        ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL);
        if (!ksec)
                return -ENOMEM;

        tsec = selinux_cred(cred);
        if (tsec->keycreate_sid)
                ksec->sid = tsec->keycreate_sid;
        else
                ksec->sid = tsec->sid;

        k->security = ksec;
        return 0;
}

/*
 * Detach the SELinux security blob from key @k and free it.
 */
static void selinux_key_free(struct key *k)
{
        struct key_security_struct *ksec;

        /* Clear the key's pointer before the blob is released. */
        ksec = k->security;
        k->security = NULL;

        kfree(ksec);
}

/*
 * Check whether @cred may perform the operation @need_perm on the key
 * referenced by @key_ref.
 *
 * The request is mapped onto a KEY__* permission and checked with
 * avc_has_perm() between the SID of @cred and the SID stored on the key.
 * KEY_NEED_UNLINK and the override/deferred requests return 0 without an
 * AVC check; an unknown request triggers a warning and returns -EPERM.
 */
static int selinux_key_permission(key_ref_t key_ref,
                                  const struct cred *cred,
                                  enum key_need_perm need_perm)
{
        struct key_security_struct *ksec;
        u32 perm, sid;

        switch (need_perm) {
        /* Requests that are not subject to an AVC check. */
        case KEY_NEED_UNLINK:
        case KEY_SYSADMIN_OVERRIDE:
        case KEY_AUTHTOKEN_OVERRIDE:
        case KEY_DEFER_PERM_CHECK:
                return 0;

        case KEY_NEED_VIEW:
                perm = KEY__VIEW;
                break;
        case KEY_NEED_READ:
                perm = KEY__READ;
                break;
        case KEY_NEED_WRITE:
                perm = KEY__WRITE;
                break;
        case KEY_NEED_SEARCH:
                perm = KEY__SEARCH;
                break;
        case KEY_NEED_LINK:
                perm = KEY__LINK;
                break;
        case KEY_NEED_SETATTR:
                perm = KEY__SETATTR;
                break;

        default:
                WARN_ON(1);
                return -EPERM;
        }

        sid = cred_sid(cred);
        ksec = key_ref_to_ptr(key_ref)->security;

        return avc_has_perm(&selinux_state, sid, ksec->sid, SECCLASS_KEY,
                            perm, NULL);
}

/*
 * Produce the security context string for @key.
 *
 * The context pointer (NULL-initialised here, then filled in by
 * security_sid_to_context()) is always stored in @_buffer.  Returns the
 * context length when the conversion returns 0, otherwise the conversion's
 * return value.
 */
static int selinux_key_getsecurity(struct key *key, char **_buffer)
{
        struct key_security_struct *ksec = key->security;
        unsigned int ctx_len;
        char *ctx = NULL;
        int rc;

        rc = security_sid_to_context(&selinux_state, ksec->sid,
                                     &ctx, &ctx_len);
        if (rc == 0)
                rc = ctx_len;

        *_buffer = ctx;
        return rc;
}

#ifdef CONFIG_KEY_NOTIFICATIONS
/*
 * Check that the current task holds KEY__VIEW on @key, using the SID
 * stored in the key's security blob as the target.
 */
static int selinux_watch_key(struct key *key)
{
        struct key_security_struct *ksec;
        u32 sid;

        ksec = key->security;
        sid = current_sid();

        return avc_has_perm(&selinux_state, sid, ksec->sid, SECCLASS_KEY,
                            KEY__VIEW, NULL);
}
#endif
#endif

#ifdef CONFIG_SECURITY_INFINIBAND
/*
 * Check INFINIBAND_PKEY__ACCESS for the SID held in @ib_sec against the
 * SID that sel_ib_pkey_sid() reports for (@subnet_prefix, @pkey_val).
 *
 * Returns the lookup error if the SID cannot be resolved, otherwise the
 * result of avc_has_perm().
 */
static int selinux_ib_pkey_access(void *ib_sec, u64 subnet_prefix, u16 pkey_val)
{
        struct ib_security_struct *sec = ib_sec;
        struct lsm_ibpkey_audit ibpkey;
        struct common_audit_data ad;
        u32 pkey_sid = 0;
        int rc;

        rc = sel_ib_pkey_sid(subnet_prefix, pkey_val, &pkey_sid);
        if (rc)
                return rc;

        /* Audit record describing the pkey being accessed. */
        ibpkey.subnet_prefix = subnet_prefix;
        ibpkey.pkey = pkey_val;

        ad.type = LSM_AUDIT_DATA_IBPKEY;
        ad.u.ibpkey = &ibpkey;

        return avc_has_perm(&selinux_state, sec->sid, pkey_sid,
                            SECCLASS_INFINIBAND_PKEY,
                            INFINIBAND_PKEY__ACCESS, &ad);
}

/*
 * Check INFINIBAND_ENDPORT__MANAGE_SUBNET for the SID held in @ib_sec
 * against the SID that security_ib_endport_sid() reports for the end port
 * (@dev_name, @port_num).
 *
 * Returns the lookup error if the SID cannot be resolved, otherwise the
 * result of avc_has_perm().
 */
static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name,
                                            u8 port_num)
{
        struct common_audit_data ad;
        int err;
        u32 sid = 0;
        struct ib_security_struct *sec = ib_sec;
        struct lsm_ibendport_audit ibendport;

        err = security_ib_endport_sid(&selinux_state, dev_name, port_num,
                                      &sid);
        if (err)
                return err;

        ad.type = LSM_AUDIT_DATA_IBENDPORT;
        /*
         * strncpy() leaves the destination unterminated when the source is
         * at least as long as the buffer.  Copy one byte less than the
         * buffer holds and terminate explicitly so that the on-stack name
         * is always a valid C string.
         */
        strncpy(ibendport.dev_name, dev_name, sizeof(ibendport.dev_name) - 1);
        ibendport.dev_name[sizeof(ibendport.dev_name) - 1] = '\0';
        ibendport.port = port_num;
        ad.u.ibendport = &ibendport;
        return avc_has_perm(&selinux_state,
                            sec->sid, sid,
                            SECCLASS_INFINIBAND_ENDPORT,
                            INFINIBAND_ENDPORT__MANAGE_SUBNET, &ad);
}

static int selinux_ib_alloc_security(void **ib_sec)
{
        struct ib_security_struct *sec;

        sec = kzalloc(sizeof(*sec), GFP_KERNEL);
        if (!sec)
                return -ENOMEM;
        sec->sid = current_sid();

        *ib_sec = sec;
        return 0;
}

/*
 * Free the InfiniBand security blob @ib_sec with kfree().
 */
static void selinux_ib_free_security(void *ib_sec)
{
        kfree(ib_sec);
}
#endif

#ifdef CONFIG_BPF_SYSCALL
/*
 * Permission check for bpf commands.
 *
 * BPF_MAP_CREATE and BPF_PROG_LOAD are checked with avc_has_perm(), with
 * the current task's SID as both source and target.  Every other command
 * returns 0.  @attr and @size are not examined.
 */
static int selinux_bpf(int cmd, union bpf_attr *attr,
                                     unsigned int size)
{
        u32 sid = current_sid();
        u32 perm;

        switch (cmd) {
        case BPF_MAP_CREATE:
                perm = BPF__MAP_CREATE;
                break;
        case BPF_PROG_LOAD:
                perm = BPF__PROG_LOAD;
                break;
        default:
                return 0;
        }

        return avc_has_perm(&selinux_state, sid, sid, SECCLASS_BPF, perm,
                            NULL);
}

/*
 * Translate the FMODE_READ / FMODE_WRITE bits of @fmode into the matching
 * BPF__MAP_READ / BPF__MAP_WRITE access vector bits.
 */
static u32 bpf_map_fmode_to_av(fmode_t fmode)
{
        u32 rd = (fmode & FMODE_READ) ? BPF__MAP_READ : 0;
        u32 wr = (fmode & FMODE_WRITE) ? BPF__MAP_WRITE : 0;

        return rd | wr;
}

/* This function checks whether a file passed through a unix socket or binder
 * is a bpf related object and, if so, applies the corresponding checks based
 * on the object type. Unlike other files and sockets, bpf maps and programs
 * use a shared anonymous inode inside the kernel as their inode, so checking
 * that inode cannot tell whether the process has the privilege to access the
 * bpf object. That is why this additional check is needed in
 * selinux_file_receive and selinux_binder_transfer_file.
 *
 * Returns the avc_has_perm() result for a bpf map or program file, and 0 for
 * any other file.
 */
static int bpf_fd_pass(struct file *file, u32 sid)
{
        struct bpf_security_struct *bpfsec;

        if (file->f_op == &bpf_map_fops) {
                struct bpf_map *map = file->private_data;

                bpfsec = map->security;
                return avc_has_perm(&selinux_state, sid, bpfsec->sid,
                                    SECCLASS_BPF,
                                    bpf_map_fmode_to_av(file->f_mode), NULL);
        }

        if (file->f_op == &bpf_prog_fops) {
                struct bpf_prog *prog = file->private_data;

                bpfsec = prog->aux->security;
                return avc_has_perm(&selinux_state, sid, bpfsec->sid,
                                    SECCLASS_BPF, BPF__PROG_RUN, NULL);
        }

        return 0;
}

/*
 * Check that the current task may access @map with the access mode
 * @fmode, translated to BPF map permissions by bpf_map_fmode_to_av().
 */
static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        struct bpf_security_struct *bpfsec = map->security;
        u32 sid = current_sid();
        u32 requested = bpf_map_fmode_to_av(fmode);

        return avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF,
                            requested, NULL);
}

/*
 * Check that the current task holds BPF__PROG_RUN on @prog, using the SID
 * stored in the program's security blob as the target.
 */
static int selinux_bpf_prog(struct bpf_prog *prog)
{
        struct bpf_security_struct *bpfsec = prog->aux->security;
        u32 sid = current_sid();

        return avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF,
                            BPF__PROG_RUN, NULL);
}

static int selinux_bpf_map_alloc(struct bpf_map *map)
{
        struct bpf_security_struct *bpfsec;

        bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
        if (!bpfsec)
                return -ENOMEM;

        bpfsec->sid = current_sid();
        map->security = bpfsec;

        return 0;
}

/*
 * Detach the security blob from @map and free it.
 */
static void selinux_bpf_map_free(struct bpf_map *map)
{
        struct bpf_security_struct *bpfsec;

        /* Clear the map's pointer before the blob is released. */
        bpfsec = map->security;
        map->security = NULL;

        kfree(bpfsec);
}

static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
{
        struct bpf_security_struct *bpfsec;

        bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
        if (!bpfsec)
                return -ENOMEM;

        bpfsec->sid = current_sid();
        aux->security = bpfsec;

        return 0;
}

/*
 * Detach the security blob from @aux and free it.
 */
static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
{
        struct bpf_security_struct *bpfsec;

        /* Clear the aux pointer before the blob is released. */
        bpfsec = aux->security;
        aux->security = NULL;

        kfree(bpfsec);
}
#endif

/*
 * Lockdown permission check for the reason @what.
 *
 * Reasons at or below LOCKDOWN_NONE, equal to LOCKDOWN_INTEGRITY_MAX, or
 * at or above LOCKDOWN_CONFIDENTIALITY_MAX are invalid: they trigger a
 * warning and an audit record, and -EINVAL is returned.  Valid reasons
 * below LOCKDOWN_INTEGRITY_MAX are checked against LOCKDOWN__INTEGRITY and
 * the remaining ones against LOCKDOWN__CONFIDENTIALITY, with the current
 * task's SID as both source and target.
 */
static int selinux_lockdown(enum lockdown_reason what)
{
        struct common_audit_data ad;
        u32 sid = current_sid();
        u32 perm;
        int invalid_reason;

        invalid_reason = (what <= LOCKDOWN_NONE) ||
                         (what == LOCKDOWN_INTEGRITY_MAX) ||
                         (what >= LOCKDOWN_CONFIDENTIALITY_MAX);

        if (WARN(invalid_reason, "Invalid lockdown reason")) {
                audit_log(audit_context(),
                          GFP_ATOMIC, AUDIT_SELINUX_ERR,
                          "lockdown_reason=invalid");
                return -EINVAL;
        }

        ad.type = LSM_AUDIT_DATA_LOCKDOWN;
        ad.u.reason = what;

        /* LOCKDOWN_INTEGRITY_MAX itself has already been rejected above. */
        if (what <= LOCKDOWN_INTEGRITY_MAX)
                perm = LOCKDOWN__INTEGRITY;
        else
                perm = LOCKDOWN__CONFIDENTIALITY;

        return avc_has_perm(&selinux_state, sid, sid, SECCLASS_LOCKDOWN,
                            perm, &ad);
}

/*
 * Sizes of the SELinux per-object security blobs, one entry per object
 * kind.  Referenced by the DEFINE_LSM(selinux) descriptor through .blobs.
 */
struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
        .lbs_cred = sizeof(struct task_security_struct),
        .lbs_file = sizeof(struct file_security_struct),
        .lbs_inode = sizeof(struct inode_security_struct),
        .lbs_ipc = sizeof(struct ipc_security_struct),
        .lbs_msg_msg = sizeof(struct msg_security_struct),
};

#ifdef CONFIG_PERF_EVENTS
/*
 * Permission check for opening a perf event.
 *
 * @type selects the PERF_EVENT__* permission to check; an unrecognised
 * value returns -EINVAL.  The check uses the current task's SID as both
 * source and target.  @attr is not examined.
 */
static int selinux_perf_event_open(struct perf_event_attr *attr, int type)
{
        u32 requested, sid = current_sid();

        switch (type) {
        case PERF_SECURITY_OPEN:
                requested = PERF_EVENT__OPEN;
                break;
        case PERF_SECURITY_CPU:
                requested = PERF_EVENT__CPU;
                break;
        case PERF_SECURITY_KERNEL:
                requested = PERF_EVENT__KERNEL;
                break;
        case PERF_SECURITY_TRACEPOINT:
                requested = PERF_EVENT__TRACEPOINT;
                break;
        default:
                return -EINVAL;
        }

        return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT,
                            requested, NULL);
}

static int selinux_perf_event_alloc(struct perf_event *event)
{
        struct perf_event_security_struct *perfsec;

        perfsec = kzalloc(sizeof(*perfsec), GFP_KERNEL);
        if (!perfsec)
                return -ENOMEM;

        perfsec->sid = current_sid();
        event->security = perfsec;

        return 0;
}

/*
 * Detach the security blob from @event and free it.
 */
static void selinux_perf_event_free(struct perf_event *event)
{
        struct perf_event_security_struct *perfsec;

        /* Clear the event's pointer before the blob is released. */
        perfsec = event->security;
        event->security = NULL;

        kfree(perfsec);
}

/*
 * Check that the current task holds PERF_EVENT__READ on @event, using the
 * SID stored in the event's security blob as the target.
 */
static int selinux_perf_event_read(struct perf_event *event)
{
        struct perf_event_security_struct *perfsec;
        u32 sid;

        perfsec = event->security;
        sid = current_sid();

        return avc_has_perm(&selinux_state, sid, perfsec->sid,
                            SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL);
}

/*
 * Check that the current task holds PERF_EVENT__WRITE on @event, using the
 * SID stored in the event's security blob as the target.
 */
static int selinux_perf_event_write(struct perf_event *event)
{
        struct perf_event_security_struct *perfsec;
        u32 sid;

        perfsec = event->security;
        sid = current_sid();

        return avc_has_perm(&selinux_state, sid, perfsec->sid,
                            SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL);
}
#endif

/*
 * Table of the SELinux LSM hook implementations.  selinux_init() registers
 * it with security_add_hooks() and selinux_disable() removes it again with
 * security_delete_hooks().
 *
 * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order:
 * 1. any hooks that don't belong to (2.) or (3.) below,
 * 2. hooks that both access structures allocated by other hooks, and allocate
 *    structures that can be later accessed by other hooks (mostly "cloning"
 *    hooks),
 * 3. hooks that only allocate structures that can be later accessed by other
 *    hooks ("allocating" hooks).
 *
 * Please follow block comment delimiters in the list to keep this order.
 *
 * This ordering is needed for SELinux runtime disable to work at least somewhat
 * safely. Breaking the ordering rules above might lead to NULL pointer derefs
 * when disabling SELinux at runtime.
 */
static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
        /* Binder */
        LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
        LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
        LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder),
        LSM_HOOK_INIT(binder_transfer_file, selinux_binder_transfer_file),

        LSM_HOOK_INIT(ptrace_access_check, selinux_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, selinux_ptrace_traceme),
        LSM_HOOK_INIT(capget, selinux_capget),
        LSM_HOOK_INIT(capset, selinux_capset),
        LSM_HOOK_INIT(capable, selinux_capable),
        LSM_HOOK_INIT(quotactl, selinux_quotactl),
        LSM_HOOK_INIT(quota_on, selinux_quota_on),
        LSM_HOOK_INIT(syslog, selinux_syslog),
        LSM_HOOK_INIT(vm_enough_memory, selinux_vm_enough_memory),

        LSM_HOOK_INIT(netlink_send, selinux_netlink_send),

        /* Program execution */
        LSM_HOOK_INIT(bprm_creds_for_exec, selinux_bprm_creds_for_exec),
        LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds),
        LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds),

        /* Superblocks and mounts */
        LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security),
        LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts),
        LSM_HOOK_INIT(sb_remount, selinux_sb_remount),
        LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount),
        LSM_HOOK_INIT(sb_show_options, selinux_sb_show_options),
        LSM_HOOK_INIT(sb_statfs, selinux_sb_statfs),
        LSM_HOOK_INIT(sb_mount, selinux_mount),
        LSM_HOOK_INIT(sb_umount, selinux_umount),
        LSM_HOOK_INIT(sb_set_mnt_opts, selinux_set_mnt_opts),
        LSM_HOOK_INIT(sb_clone_mnt_opts, selinux_sb_clone_mnt_opts),

        LSM_HOOK_INIT(move_mount, selinux_move_mount),

        LSM_HOOK_INIT(dentry_init_security, selinux_dentry_init_security),
        LSM_HOOK_INIT(dentry_create_files_as, selinux_dentry_create_files_as),

        /* Inodes */
        LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security),
        LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security),
        LSM_HOOK_INIT(inode_create, selinux_inode_create),
        LSM_HOOK_INIT(inode_link, selinux_inode_link),
        LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink),
        LSM_HOOK_INIT(inode_symlink, selinux_inode_symlink),
        LSM_HOOK_INIT(inode_mkdir, selinux_inode_mkdir),
        LSM_HOOK_INIT(inode_rmdir, selinux_inode_rmdir),
        LSM_HOOK_INIT(inode_mknod, selinux_inode_mknod),
        LSM_HOOK_INIT(inode_rename, selinux_inode_rename),
        LSM_HOOK_INIT(inode_readlink, selinux_inode_readlink),
        LSM_HOOK_INIT(inode_follow_link, selinux_inode_follow_link),
        LSM_HOOK_INIT(inode_permission, selinux_inode_permission),
        LSM_HOOK_INIT(inode_setattr, selinux_inode_setattr),
        LSM_HOOK_INIT(inode_getattr, selinux_inode_getattr),
        LSM_HOOK_INIT(inode_setxattr, selinux_inode_setxattr),
        LSM_HOOK_INIT(inode_post_setxattr, selinux_inode_post_setxattr),
        LSM_HOOK_INIT(inode_getxattr, selinux_inode_getxattr),
        LSM_HOOK_INIT(inode_listxattr, selinux_inode_listxattr),
        LSM_HOOK_INIT(inode_removexattr, selinux_inode_removexattr),
        LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity),
        LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity),
        LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity),
        LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid),
        LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up),
        LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr),
        LSM_HOOK_INIT(path_notify, selinux_path_notify),

        LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security),

        /* Files */
        LSM_HOOK_INIT(file_permission, selinux_file_permission),
        LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
        LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
        LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
        LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
        LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
        LSM_HOOK_INIT(file_lock, selinux_file_lock),
        LSM_HOOK_INIT(file_fcntl, selinux_file_fcntl),
        LSM_HOOK_INIT(file_set_fowner, selinux_file_set_fowner),
        LSM_HOOK_INIT(file_send_sigiotask, selinux_file_send_sigiotask),
        LSM_HOOK_INIT(file_receive, selinux_file_receive),

        LSM_HOOK_INIT(file_open, selinux_file_open),

        /* Tasks and credentials */
        LSM_HOOK_INIT(task_alloc, selinux_task_alloc),
        LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare),
        LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer),
        LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid),
        LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as),
        LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as),
        LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request),
        LSM_HOOK_INIT(kernel_load_data, selinux_kernel_load_data),
        LSM_HOOK_INIT(kernel_read_file, selinux_kernel_read_file),
        LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid),
        LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid),
        LSM_HOOK_INIT(task_getsid, selinux_task_getsid),
        LSM_HOOK_INIT(task_getsecid, selinux_task_getsecid),
        LSM_HOOK_INIT(task_setnice, selinux_task_setnice),
        LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio),
        LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio),
        LSM_HOOK_INIT(task_prlimit, selinux_task_prlimit),
        LSM_HOOK_INIT(task_setrlimit, selinux_task_setrlimit),
        LSM_HOOK_INIT(task_setscheduler, selinux_task_setscheduler),
        LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler),
        LSM_HOOK_INIT(task_movememory, selinux_task_movememory),
        LSM_HOOK_INIT(task_kill, selinux_task_kill),
        LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode),

        /* IPC: message queues, shared memory, semaphores */
        LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission),
        LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid),

        LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate),
        LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl),
        LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd),
        LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv),

        LSM_HOOK_INIT(shm_associate, selinux_shm_associate),
        LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl),
        LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat),

        LSM_HOOK_INIT(sem_associate, selinux_sem_associate),
        LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl),
        LSM_HOOK_INIT(sem_semop, selinux_sem_semop),

        LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate),

        LSM_HOOK_INIT(getprocattr, selinux_getprocattr),
        LSM_HOOK_INIT(setprocattr, selinux_setprocattr),

        /* Security context handling */
        LSM_HOOK_INIT(ismaclabel, selinux_ismaclabel),
        LSM_HOOK_INIT(secctx_to_secid, selinux_secctx_to_secid),
        LSM_HOOK_INIT(release_secctx, selinux_release_secctx),
        LSM_HOOK_INIT(inode_invalidate_secctx, selinux_inode_invalidate_secctx),
        LSM_HOOK_INIT(inode_notifysecctx, selinux_inode_notifysecctx),
        LSM_HOOK_INIT(inode_setsecctx, selinux_inode_setsecctx),

        /* Networking */
        LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect),
        LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send),

        LSM_HOOK_INIT(socket_create, selinux_socket_create),
        LSM_HOOK_INIT(socket_post_create, selinux_socket_post_create),
        LSM_HOOK_INIT(socket_socketpair, selinux_socket_socketpair),
        LSM_HOOK_INIT(socket_bind, selinux_socket_bind),
        LSM_HOOK_INIT(socket_connect, selinux_socket_connect),
        LSM_HOOK_INIT(socket_listen, selinux_socket_listen),
        LSM_HOOK_INIT(socket_accept, selinux_socket_accept),
        LSM_HOOK_INIT(socket_sendmsg, selinux_socket_sendmsg),
        LSM_HOOK_INIT(socket_recvmsg, selinux_socket_recvmsg),
        LSM_HOOK_INIT(socket_getsockname, selinux_socket_getsockname),
        LSM_HOOK_INIT(socket_getpeername, selinux_socket_getpeername),
        LSM_HOOK_INIT(socket_getsockopt, selinux_socket_getsockopt),
        LSM_HOOK_INIT(socket_setsockopt, selinux_socket_setsockopt),
        LSM_HOOK_INIT(socket_shutdown, selinux_socket_shutdown),
        LSM_HOOK_INIT(socket_sock_rcv_skb, selinux_socket_sock_rcv_skb),
        LSM_HOOK_INIT(socket_getpeersec_stream,
                        selinux_socket_getpeersec_stream),
        LSM_HOOK_INIT(socket_getpeersec_dgram, selinux_socket_getpeersec_dgram),
        LSM_HOOK_INIT(sk_free_security, selinux_sk_free_security),
        LSM_HOOK_INIT(sk_clone_security, selinux_sk_clone_security),
        LSM_HOOK_INIT(sk_getsecid, selinux_sk_getsecid),
        LSM_HOOK_INIT(sock_graft, selinux_sock_graft),
        LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request),
        LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone),
        LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect),
        LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request),
        LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone),
        LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established),
        LSM_HOOK_INIT(secmark_relabel_packet, selinux_secmark_relabel_packet),
        LSM_HOOK_INIT(secmark_refcount_inc, selinux_secmark_refcount_inc),
        LSM_HOOK_INIT(secmark_refcount_dec, selinux_secmark_refcount_dec),
        LSM_HOOK_INIT(req_classify_flow, selinux_req_classify_flow),
        LSM_HOOK_INIT(tun_dev_free_security, selinux_tun_dev_free_security),
        LSM_HOOK_INIT(tun_dev_create, selinux_tun_dev_create),
        LSM_HOOK_INIT(tun_dev_attach_queue, selinux_tun_dev_attach_queue),
        LSM_HOOK_INIT(tun_dev_attach, selinux_tun_dev_attach),
        LSM_HOOK_INIT(tun_dev_open, selinux_tun_dev_open),
#ifdef CONFIG_SECURITY_INFINIBAND
        LSM_HOOK_INIT(ib_pkey_access, selinux_ib_pkey_access),
        LSM_HOOK_INIT(ib_endport_manage_subnet,
                      selinux_ib_endport_manage_subnet),
        LSM_HOOK_INIT(ib_free_security, selinux_ib_free_security),
#endif
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_free_security, selinux_xfrm_policy_free),
        LSM_HOOK_INIT(xfrm_policy_delete_security, selinux_xfrm_policy_delete),
        LSM_HOOK_INIT(xfrm_state_free_security, selinux_xfrm_state_free),
        LSM_HOOK_INIT(xfrm_state_delete_security, selinux_xfrm_state_delete),
        LSM_HOOK_INIT(xfrm_policy_lookup, selinux_xfrm_policy_lookup),
        LSM_HOOK_INIT(xfrm_state_pol_flow_match,
                        selinux_xfrm_state_pol_flow_match),
        LSM_HOOK_INIT(xfrm_decode_session, selinux_xfrm_decode_session),
#endif

#ifdef CONFIG_KEYS
        LSM_HOOK_INIT(key_free, selinux_key_free),
        LSM_HOOK_INIT(key_permission, selinux_key_permission),
        LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity),
#ifdef CONFIG_KEY_NOTIFICATIONS
        LSM_HOOK_INIT(watch_key, selinux_watch_key),
#endif
#endif

#ifdef CONFIG_AUDIT
        LSM_HOOK_INIT(audit_rule_known, selinux_audit_rule_known),
        LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match),
        LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free),
#endif

#ifdef CONFIG_BPF_SYSCALL
        LSM_HOOK_INIT(bpf, selinux_bpf),
        LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
        LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
        LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
        LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
#endif

#ifdef CONFIG_PERF_EVENTS
        LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open),
        LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free),
        LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read),
        LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write),
#endif

        LSM_HOOK_INIT(locked_down, selinux_lockdown),

        /*
         * PUT "CLONING" (ACCESSING + ALLOCATING) HOOKS HERE
         */
        LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup),
        LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param),
        LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts),
        LSM_HOOK_INIT(sb_add_mnt_opt, selinux_add_mnt_opt),
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_clone_security, selinux_xfrm_policy_clone),
#endif

        /*
         * PUT "ALLOCATING" HOOKS HERE
         */
        LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security),
        LSM_HOOK_INIT(msg_queue_alloc_security,
                      selinux_msg_queue_alloc_security),
        LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security),
        LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security),
        LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security),
        LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
        LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx),
        LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx),
        LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security),
        LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security),
#ifdef CONFIG_SECURITY_INFINIBAND
        LSM_HOOK_INIT(ib_alloc_security, selinux_ib_alloc_security),
#endif
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_alloc_security, selinux_xfrm_policy_alloc),
        LSM_HOOK_INIT(xfrm_state_alloc, selinux_xfrm_state_alloc),
        LSM_HOOK_INIT(xfrm_state_alloc_acquire,
                      selinux_xfrm_state_alloc_acquire),
#endif
#ifdef CONFIG_KEYS
        LSM_HOOK_INIT(key_alloc, selinux_key_alloc),
#endif
#ifdef CONFIG_AUDIT
        LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
#endif
#ifdef CONFIG_BPF_SYSCALL
        LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
        LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc),
#endif
#ifdef CONFIG_PERF_EVENTS
        LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
#endif
};

/*
 * Boot-time initialisation of SELinux; installed as the .init callback of
 * the DEFINE_LSM(selinux) descriptor.
 *
 * Resets selinux_state, applies the boot-time enforcing and checkreqprot
 * settings, initialises the AVC and the policy caches, registers
 * selinux_hooks and the AVC reset callbacks, and validates the filesystem
 * parameter description.  Failure to register an AVC callback panics.
 *
 * Always returns 0.
 */
static __init int selinux_init(void)
{
        pr_info("SELinux:  Initializing.\n");

        /* Start from a zeroed global state, then apply the boot settings. */
        memset(&selinux_state, 0, sizeof(selinux_state));
        enforcing_set(&selinux_state, selinux_enforcing_boot);
        checkreqprot_set(&selinux_state, selinux_checkreqprot_boot);
        selinux_avc_init(&selinux_state.avc);
        mutex_init(&selinux_state.status_lock);
        mutex_init(&selinux_state.policy_mutex);

        /* Set the security state for the initial task. */
        cred_init_security();

        /* True when the default data mapping flags do not include VM_EXEC. */
        default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC);

        avc_init();

        avtab_cache_init();

        ebitmap_cache_init();

        hashtab_cache_init();

        /* From here on the SELinux hooks are registered with the LSM core. */
        security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux");

        if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
                panic("SELinux: Unable to register AVC netcache callback\n");

        if (avc_add_callback(selinux_lsm_notifier_avc_callback, AVC_CALLBACK_RESET))
                panic("SELinux: Unable to register AVC LSM notifier callback\n");

        if (selinux_enforcing_boot)
                pr_debug("SELinux:  Starting in enforcing mode\n");
        else
                pr_debug("SELinux:  Starting in permissive mode\n");

        fs_validate_description("selinux", selinux_fs_parameters);

        return 0;
}

/*
 * iterate_supers() callback used by selinux_complete_init(): calls
 * selinux_set_mnt_opts() on @sb with no mount options.  @unused is ignored.
 */
static void delayed_superblock_init(struct super_block *sb, void *unused)
{
        selinux_set_mnt_opts(sb, NULL, 0, NULL);
}

/*
 * Finish SELinux initialisation by running delayed_superblock_init() on
 * every superblock visited by iterate_supers().
 */
void selinux_complete_init(void)
{
        pr_debug("SELinux:  Completing initialization.\n");

        /* Set up any superblocks initialized prior to the policy load. */
        pr_debug("SELinux:  Setting up existing superblocks.\n");
        iterate_supers(delayed_superblock_init, NULL);
}

/* SELinux requires early initialization in order to label
   all processes and objects when they are created. */
/*
 * LSM descriptor for SELinux: names the module, points at its boot-time
 * enable flag and blob sizes, and installs selinux_init() as the init
 * callback.
 */
DEFINE_LSM(selinux) = {
        .name = "selinux",
        .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
        .enabled = &selinux_enabled_boot,
        .blobs = &selinux_blob_sizes,
        .init = selinux_init,
};

#if defined(CONFIG_NETFILTER)

/*
 * Netfilter hooks installed by SELinux in each network namespace (see
 * selinux_nf_register()).  For each address family there is one hook on
 * post-routing, one on forward and one on local-out; the IPv6 set is only
 * built when CONFIG_IPV6 is enabled.
 */
static const struct nf_hook_ops selinux_nf_ops[] = {
        /* IPv4 post-routing */
        {
                .pf = NFPROTO_IPV4,
                .hooknum = NF_INET_POST_ROUTING,
                .priority = NF_IP_PRI_SELINUX_LAST,
                .hook = selinux_ipv4_postroute,
        },
        /* IPv4 forward */
        {
                .pf = NFPROTO_IPV4,
                .hooknum = NF_INET_FORWARD,
                .priority = NF_IP_PRI_SELINUX_FIRST,
                .hook = selinux_ipv4_forward,
        },
        /* IPv4 local-out */
        {
                .pf = NFPROTO_IPV4,
                .hooknum = NF_INET_LOCAL_OUT,
                .priority = NF_IP_PRI_SELINUX_FIRST,
                .hook = selinux_ipv4_output,
        },
#if IS_ENABLED(CONFIG_IPV6)
        /* IPv6 post-routing */
        {
                .pf = NFPROTO_IPV6,
                .hooknum = NF_INET_POST_ROUTING,
                .priority = NF_IP6_PRI_SELINUX_LAST,
                .hook = selinux_ipv6_postroute,
        },
        /* IPv6 forward */
        {
                .pf = NFPROTO_IPV6,
                .hooknum = NF_INET_FORWARD,
                .priority = NF_IP6_PRI_SELINUX_FIRST,
                .hook = selinux_ipv6_forward,
        },
        /* IPv6 local-out */
        {
                .pf = NFPROTO_IPV6,
                .hooknum = NF_INET_LOCAL_OUT,
                .priority = NF_IP6_PRI_SELINUX_FIRST,
                .hook = selinux_ipv6_output,
        },
#endif        /* IPV6 */
};

/*
 * Per-namespace init: register every entry of selinux_nf_ops on @net and
 * return the result of nf_register_net_hooks().
 */
static int __net_init selinux_nf_register(struct net *net)
{
        int rc;

        rc = nf_register_net_hooks(net, selinux_nf_ops,
                                   ARRAY_SIZE(selinux_nf_ops));
        return rc;
}

/*
 * Per-namespace exit: unregister every entry of selinux_nf_ops from @net.
 */
static void __net_exit selinux_nf_unregister(struct net *net)
{
        nf_unregister_net_hooks(net, selinux_nf_ops,
                                ARRAY_SIZE(selinux_nf_ops));
}

/*
 * Per-network-namespace operations that register and unregister the
 * SELinux netfilter hooks.
 */
static struct pernet_operations selinux_net_ops = {
        .init = selinux_nf_register,
        .exit = selinux_nf_unregister,
};

/*
 * Initcall that registers selinux_net_ops as a pernet subsystem.  Nothing
 * is done when SELinux is not enabled at boot; a registration failure
 * panics.  Always returns 0.
 */
static int __init selinux_nf_ip_init(void)
{
        int err;

        if (selinux_enabled_boot) {
                pr_debug("SELinux:  Registering netfilter hooks\n");

                err = register_pernet_subsys(&selinux_net_ops);
                if (err)
                        panic("SELinux: register_pernet_subsys: error %d\n",
                              err);
        }

        return 0;
}
__initcall(selinux_nf_ip_init);

#ifdef CONFIG_SECURITY_SELINUX_DISABLE
/*
 * Unregister the selinux_net_ops pernet subsystem.  Called from
 * selinux_disable().
 */
static void selinux_nf_ip_exit(void)
{
        pr_debug("SELinux:  Unregistering netfilter hooks\n");

        unregister_pernet_subsys(&selinux_net_ops);
}
#endif

#else /* CONFIG_NETFILTER */

#ifdef CONFIG_SECURITY_SELINUX_DISABLE
/* Without netfilter there are no hooks to unregister: expand to a no-op. */
#define selinux_nf_ip_exit() do { } while (0)
#endif

#endif /* CONFIG_NETFILTER */

#ifdef CONFIG_SECURITY_SELINUX_DISABLE
/*
 * Disable SELinux at runtime.
 *
 * Refused with -EINVAL once the initial policy has been loaded or when
 * @state is already marked disabled.  Otherwise @state is marked disabled,
 * the netfilter hooks and the LSM hooks are removed, the AVC is disabled
 * and selinuxfs is unregistered.
 *
 * Returns 0 on success.
 */
int selinux_disable(struct selinux_state *state)
{
        /*
         * Not permitted after initial policy load, and only done once:
         * a repeated request is rejected as well.
         */
        if (selinux_initialized(state) || selinux_disabled(state))
                return -EINVAL;

        selinux_mark_disabled(state);

        pr_info("SELinux:  Disabled at runtime.\n");

        /*
         * Unregister netfilter hooks.
         * Must be done before security_delete_hooks() to avoid breaking
         * runtime disable.
         */
        selinux_nf_ip_exit();

        security_delete_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks));

        /* Try to destroy the avc node cache */
        avc_disable();

        /* Unregister selinuxfs. */
        exit_sel_fs();

        return 0;
}
#endif





























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NET_SCM_H
#define __LINUX_NET_SCM_H

#include <linux/limits.h>
#include <linux/net.h>
#include <linux/cred.h>
#include <linux/security.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/sched/signal.h>

/*
 * Number of struct file pointers one scm_fp_list can hold; it sizes the
 * fp[] array of that structure.
 *
 * Well, we should have at least one descriptor open
 * to accept passed FDs 8)
 */
#define SCM_MAX_FD        253

/*
 * Credentials stored in a scm_cookie: a numeric pid plus kernel-internal
 * uid/gid.  scm_recv() converts them into a struct ucred when it emits an
 * SCM_CREDENTIALS control message.
 */
struct scm_creds {
        u32        pid;
        kuid_t        uid;
        kgid_t        gid;
};

/*
 * List of struct file pointers referenced from scm_cookie::fp.
 * fp[] has room for SCM_MAX_FD entries.
 */
struct scm_fp_list {
        short                        count;        /* presumably the number of fp[] slots in use - confirm */
        short                        max;        /* presumably the limit on count - confirm */
        struct user_struct        *user;
        struct file                *fp[SCM_MAX_FD];
};

/*
 * Control-message state for one message.  scm_send() zeroes the cookie and
 * fills it in; scm_recv() and scm_destroy() consume and release it.
 * The secid member only exists with CONFIG_SECURITY_NETWORK.
 */
struct scm_cookie {
        struct pid                *pid;                /* Skb credentials */
        struct scm_fp_list        *fp;                /* Passed files */
        struct scm_creds        creds;                /* Skb credentials */
#ifdef CONFIG_SECURITY_NETWORK
        u32                        secid;                /* Passed security ID */
#endif
};

/*
 * Out-of-line SCM helpers; this header only declares them.  The inline
 * wrappers in this header call __scm_send(), __scm_destroy() and
 * scm_detach_fds().
 */
void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm);
void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm);
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm);
void __scm_destroy(struct scm_cookie *scm);
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl);

#ifdef CONFIG_SECURITY_NETWORK
/*
 * Let security_socket_getpeersec_dgram() fill in scm->secid for @sock
 * (NULL is passed as its second argument).  The return value is ignored.
 */
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{
        security_socket_getpeersec_dgram(sock, NULL, &scm->secid);
}
#else
/* Without CONFIG_SECURITY_NETWORK scm_cookie has no secid member: no-op. */
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * Record credentials in @scm.
 *
 * scm->pid receives the result of get_pid(@pid); scm_destroy_cred() later
 * hands it to put_pid().  The numeric pid stored in scm->creds is
 * pid_vnr(@pid); @uid and @gid are copied unchanged.
 */
static __inline__ void scm_set_cred(struct scm_cookie *scm,
                                    struct pid *pid, kuid_t uid, kgid_t gid)
{
        scm->pid  = get_pid(pid);
        scm->creds.pid = pid_vnr(pid);
        scm->creds.uid = uid;
        scm->creds.gid = gid;
}

/*
 * Drop the struct pid held in @scm via put_pid() and clear the pointer so
 * that a second call does not put the same pid again.
 */
static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
{
        put_pid(scm->pid);
        scm->pid  = NULL;
}

/*
 * Release everything @scm holds: first the credential pid, then - only if
 * a file list is attached - the rest through __scm_destroy().
 */
static __inline__ void scm_destroy(struct scm_cookie *scm)
{
        scm_destroy_cred(scm);

        if (!scm->fp)
                return;

        __scm_destroy(scm);
}

/*
 * scm_send - initialise @scm for a message being sent on @sock
 * @sock: socket the message is sent on
 * @msg: message header, possibly carrying control data
 * @scm: cookie to fill in; any previous content is overwritten
 * @forcecreds: when true, record the current task's tgid, uid and gid
 *
 * Returns 0 when @msg carries no control data, otherwise the result of
 * __scm_send().
 */
static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
                               struct scm_cookie *scm, bool forcecreds)
{
        /* Start from a zeroed cookie whose uid/gid are marked invalid. */
        memset(scm, 0, sizeof(*scm));
        scm->creds.gid = INVALID_GID;
        scm->creds.uid = INVALID_UID;

        if (forcecreds)
                scm_set_cred(scm, task_tgid(current), current_uid(), current_gid());

        unix_get_peersec_dgram(sock, scm);

        /* Control data is only parsed when the caller supplied some. */
        if (msg->msg_controllen > 0)
                return __scm_send(sock, msg, scm);

        return 0;
}

#ifdef CONFIG_SECURITY_NETWORK
/*
 * Emit an SCM_SECURITY control message for scm->secid.
 *
 * Nothing happens unless SOCK_PASSSEC is set on @sock, and nothing is
 * emitted when the secid cannot be converted to a security context.  The
 * context is released again once it has been handed to put_cmsg(), whose
 * return value is not checked.
 */
static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
{
        char *ctx;
        u32 ctx_len;

        if (!test_bit(SOCK_PASSSEC, &sock->flags))
                return;

        if (security_secid_to_secctx(scm->secid, &ctx, &ctx_len) != 0)
                return;

        put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx_len, ctx);
        security_release_secctx(ctx, ctx_len);
}

/* True when SOCK_PASSSEC is set in @sock's flags. */
static inline bool scm_has_secdata(struct socket *sock)
{
        return test_bit(SOCK_PASSSEC, &sock->flags);
}
#else
/* Stub for builds without CONFIG_SECURITY_NETWORK: emits nothing. */
static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
{ }

/* Stub for builds without CONFIG_SECURITY_NETWORK: never any security data. */
static inline bool scm_has_secdata(struct socket *sock)
{
        return false;
}
#endif /* CONFIG_SECURITY_NETWORK */

static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
                                struct scm_cookie *scm, int flags)
{
        if (!msg->msg_control) {
                if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp ||
                    scm_has_secdata(sock))
                        msg->msg_flags |= MSG_CTRUNC;
                scm_destroy(scm);
                return;
        }

        if (test_bit(SOCK_PASSCRED, &sock->flags)) {
                struct user_namespace *current_ns = current_user_ns();
                struct ucred ucreds = {
                        .pid = scm->creds.pid,
                        .uid = from_kuid_munged(current_ns, scm->creds.uid),
                        .gid = from_kgid_munged(current_ns, scm->creds.gid),
                };
                put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
        }

        scm_destroy_cred(scm);

        scm_passec(sock, msg, scm);

        if (!scm->fp)
                return;
        
        scm_detach_fds(msg, scm);
}


#endif /* __LINUX_NET_SCM_H */

























































































































































































































































































































































































    1 


























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_STRING_H_
#define _LINUX_STRING_H_

#include <linux/compiler.h>        /* for inline */
#include <linux/types.h>        /* for size_t */
#include <linux/stddef.h>        /* for NULL */
#include <stdarg.h>
#include <uapi/linux/string.h>

extern char *strndup_user(const char __user *, long);
extern void *memdup_user(const void __user *, size_t);
extern void *vmemdup_user(const void __user *, size_t);
extern void *memdup_user_nul(const void __user *, size_t);

/*
 * Include machine specific inline routines
 */
#include <asm/string.h>

/*
 * String copy routines.  Each guarded declaration is skipped when the
 * matching __HAVE_ARCH_* macro is already defined, e.g. by the
 * <asm/string.h> included above.
 */
#ifndef __HAVE_ARCH_STRCPY
extern char * strcpy(char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRNCPY
extern char * strncpy(char *,const char *, __kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRLCPY
size_t strlcpy(char *, const char *, size_t);
#endif
#ifndef __HAVE_ARCH_STRSCPY
ssize_t strscpy(char *, const char *, size_t);
#endif

/* Wraps calls to strscpy()/memset(), no arch specific code required */
ssize_t strscpy_pad(char *dest, const char *src, size_t count);

/*
 * Concatenation, comparison and character-search routines.  A declaration
 * wrapped in #ifndef __HAVE_ARCH_* is skipped when that macro is already
 * defined; strnchrnul(), skip_spaces() and strim() have no such guard and
 * are always declared here.
 */
#ifndef __HAVE_ARCH_STRCAT
extern char * strcat(char *, const char *);
#endif
#ifndef __HAVE_ARCH_STRNCAT
extern char * strncat(char *, const char *, __kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRLCAT
extern size_t strlcat(char *, const char *, __kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRCMP
extern int strcmp(const char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRNCMP
extern int strncmp(const char *,const char *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRCASECMP
extern int strcasecmp(const char *s1, const char *s2);
#endif
#ifndef __HAVE_ARCH_STRNCASECMP
extern int strncasecmp(const char *s1, const char *s2, size_t n);
#endif
#ifndef __HAVE_ARCH_STRCHR
extern char * strchr(const char *,int);
#endif
#ifndef __HAVE_ARCH_STRCHRNUL
extern char * strchrnul(const char *,int);
#endif
extern char * strnchrnul(const char *, size_t, int);
#ifndef __HAVE_ARCH_STRNCHR
extern char * strnchr(const char *, size_t, int);
#endif
#ifndef __HAVE_ARCH_STRRCHR
extern char * strrchr(const char *,int);
#endif
extern char * __must_check skip_spaces(const char *);

extern char *strim(char *);

/*
 * strstrip - alternative name for strim()
 * @str: string handed to strim()
 *
 * Returns whatever strim() returns.  The function is marked __must_check.
 */
static inline __must_check char *strstrip(char *str)
{
        return strim(str);
}

/*
 * Substring search, length and tokenising routines.  Each declaration is
 * skipped when the matching __HAVE_ARCH_* macro is already defined.
 */
#ifndef __HAVE_ARCH_STRSTR
extern char * strstr(const char *, const char *);
#endif
#ifndef __HAVE_ARCH_STRNSTR
extern char * strnstr(const char *, const char *, size_t);
#endif
#ifndef __HAVE_ARCH_STRLEN
extern __kernel_size_t strlen(const char *);
#endif
#ifndef __HAVE_ARCH_STRNLEN
extern __kernel_size_t strnlen(const char *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_STRPBRK
extern char * strpbrk(const char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRSEP
extern char * strsep(char **,const char *);
#endif
#ifndef __HAVE_ARCH_STRSPN
extern __kernel_size_t strspn(const char *,const char *);
#endif
#ifndef __HAVE_ARCH_STRCSPN
extern __kernel_size_t strcspn(const char *,const char *);
#endif

/*
 * memset() and its fixed-width variants, which take a 16-, 32- or 64-bit
 * pattern and a pointer of the matching element type.  Each declaration
 * is skipped when the matching __HAVE_ARCH_* macro is already defined.
 */
#ifndef __HAVE_ARCH_MEMSET
extern void * memset(void *,int,__kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMSET16
extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMSET32
extern void *memset32(uint32_t *, uint32_t, __kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMSET64
extern void *memset64(uint64_t *, uint64_t, __kernel_size_t);
#endif

/*
 * memset_l - fill an array of unsigned long with one value
 * @p: start of the array
 * @v: value to store
 * @n: count passed through to memset32()/memset64()
 *
 * Dispatches on BITS_PER_LONG to the fixed-width routine matching the
 * size of unsigned long and returns its result.
 */
static inline void *memset_l(unsigned long *p, unsigned long v,
                __kernel_size_t n)
{
        return (BITS_PER_LONG == 32) ? memset32((uint32_t *)p, v, n)
                                     : memset64((uint64_t *)p, v, n);
}

/*
 * memset_p - fill an array of pointers with one pointer value
 * @p: start of the array
 * @v: pointer value to store
 * @n: count passed through to memset32()/memset64()
 *
 * The pointer is converted to uintptr_t and written with the fixed-width
 * routine selected by BITS_PER_LONG; that routine's result is returned.
 */
static inline void *memset_p(void **p, void *v, __kernel_size_t n)
{
        const uintptr_t fill = (uintptr_t)v;

        if (BITS_PER_LONG != 32)
                return memset64((uint64_t *)p, fill, n);

        return memset32((uint32_t *)p, fill, n);
}

extern void **__memcat_p(void **a, void **b);
/*
 * memcat_p - type-checked wrapper around __memcat_p()
 * @a: first pointer array
 * @b: second pointer array; *(@b) must have the same type as *(@a)
 *
 * Breaks the build with "type mismatch in memcat_p()" when the element
 * types differ.  Otherwise both arguments are passed to __memcat_p() as
 * void ** and the result is cast back to a pointer to the element type of
 * @a.  What __memcat_p() returns is defined by its implementation, which is
 * not in this header (presumably the concatenation of both arrays).
 *
 * Every use of a macro argument is parenthesized, including the one inside
 * typeof(), so that an argument such as "cond ? x : y" still yields the
 * element type rather than the type of a mis-parsed expression.
 */
#define memcat_p(a, b) ({                                        \
        BUILD_BUG_ON_MSG(!__same_type(*(a), *(b)),                \
                         "type mismatch in memcat_p()");        \
        (typeof(*(a)) *)__memcat_p((void **)(a), (void **)(b));        \
})

/*
 * Memory copy, scan and compare routines.  Each declaration is skipped
 * when the matching __HAVE_ARCH_* macro is already defined.
 */
#ifndef __HAVE_ARCH_MEMCPY
extern void * memcpy(void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMMOVE
extern void * memmove(void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMSCAN
extern void * memscan(void *,int,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCMP
extern int memcmp(const void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_BCMP
extern int bcmp(const void *,const void *,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCHR
extern void * memchr(const void *,int,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE
/*
 * Fallback used when __HAVE_ARCH_MEMCPY_FLUSHCACHE is not defined: a plain
 * memcpy() of @cnt bytes from @src to @dst with no additional cache
 * handling.
 */
static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
}
#endif

void *memchr_inv(const void *s, int c, size_t n);
char *strreplace(char *s, char old, char new);

extern void kfree_const(const void *x);

extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp);

extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
extern void argv_free(char **argv);

extern bool sysfs_streq(const char *s1, const char *s2);
int match_string(const char * const *array, size_t n, const char *string);
int __sysfs_match_string(const char * const *array, size_t n, const char *s);

/**
 * sysfs_match_string - matches given string in an array
 * @_a: array of strings
 * @_s: string to match with
 *
 * Helper for __sysfs_match_string(). Calculates the size of @_a automatically
 * with ARRAY_SIZE(), so @_a has to be an actual array rather than a pointer.
 * Note that @_a appears twice in the expansion.
 *
 * Evaluates to the return value of __sysfs_match_string().
 */
#define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s)

#ifdef CONFIG_BINARY_PRINTF
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args);
int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf);
int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4);
#endif

extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
                                       const void *from, size_t available);

int ptr_to_hashval(const void *ptr, unsigned long *hashval_out);

/**
 * strstarts - check whether a string begins with a given prefix
 * @str: string to examine
 * @prefix: prefix to look for
 *
 * Compares the first strlen(@prefix) characters of @str against @prefix.
 * An empty @prefix always matches.
 */
static inline bool strstarts(const char *str, const char *prefix)
{
        const size_t prefix_len = strlen(prefix);

        return !strncmp(str, prefix, prefix_len);
}

size_t memweight(const void *ptr, size_t bytes);

/**
 * memzero_explicit - Fill a region of memory (e.g. sensitive
 *                      keying data) with 0s.
 * @s: Pointer to the start of the area.
 * @count: The size of the area.
 *
 * Note: usually using memset() is just fine (!), but in cases
 * where clearing out _local_ data at the end of a scope is
 * necessary, memzero_explicit() should be used instead in
 * order to prevent the compiler from optimising away zeroing.
 *
 * memzero_explicit() doesn't need an arch-specific version as
 * it just invokes the one of memset() implicitly.
 */
static inline void memzero_explicit(void *s, size_t count)
{
        memset(s, 0, count);
        /* Keeps the compiler from dropping the memset() above as a dead store. */
        barrier_data(s);
}

/**
 * kbasename - return the last component of a pathname
 * @path: path to extract the filename from
 *
 * Returns a pointer into @path: the character after the last '/', or
 * @path itself when it contains no '/'.  A path ending in '/' therefore
 * yields an empty string.
 */
static inline const char *kbasename(const char *path)
{
        const char *slash = strrchr(path, '/');

        if (!slash)
                return path;

        return slash + 1;
}

/*
 * Building blocks for the fortified string functions:
 *
 * __FORTIFY_INLINE   - extern, always-inline, gnu_inline definition
 *                      qualifiers used for every fortified wrapper.
 * __RENAME(x)        - asm label that binds a declaration to symbol x.
 * fortify_panic()    - run-time failure path; does not return.
 * __read_overflow*() / __write_overflow()
 *                    - never meant to be called: each is declared with
 *                      __compiletime_error() carrying the message to show
 *                      when a call to it survives in the generated code.
 */
#define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline))
#define __RENAME(x) __asm__(#x)

void fortify_panic(const char *name) __noreturn __cold;
void __read_overflow(void) __compiletime_error("detected read beyond size of object passed as 1st parameter");
void __read_overflow2(void) __compiletime_error("detected read beyond size of object passed as 2nd parameter");
void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter");
void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter");

#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)

/*
 * Select the unfortified implementation each wrapper forwards to.
 * With CONFIG_KASAN the __underlying_*() names are declarations bound to
 * the real symbols through __RENAME(); otherwise they are macros for the
 * compiler builtins.
 */
#ifdef CONFIG_KASAN
extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
#else
#define __underlying_memchr        __builtin_memchr
#define __underlying_memcmp        __builtin_memcmp
#define __underlying_memcpy        __builtin_memcpy
#define __underlying_memmove        __builtin_memmove
#define __underlying_memset        __builtin_memset
#define __underlying_strcat        __builtin_strcat
#define __underlying_strcpy        __builtin_strcpy
#define __underlying_strlen        __builtin_strlen
#define __underlying_strncat        __builtin_strncat
#define __underlying_strncpy        __builtin_strncpy
#endif

/*
 * Fortified strncpy(): @size is checked against the destination size
 * reported by __builtin_object_size() before the real copy runs.  A
 * compile-time-constant @size that is too large becomes a build error via
 * __write_overflow(); any other oversized @size ends in fortify_panic().
 */
__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        if (__builtin_constant_p(size) && p_size < size)
                __write_overflow();
        if (p_size < size)
                fortify_panic(__func__);
        return __underlying_strncpy(p, q, size);
}

/*
 * Fortified strcat(): with an unknown destination size ((size_t)-1) the
 * call is forwarded unchanged.  Otherwise the append is done by strlcat()
 * bounded by p_size, and a return value >= p_size is treated as an
 * overflow and ends in fortify_panic().
 */
__FORTIFY_INLINE char *strcat(char *p, const char *q)
{
        size_t p_size = __builtin_object_size(p, 0);
        if (p_size == (size_t)-1)
                return __underlying_strcat(p, q);
        if (strlcat(p, q, p_size) >= p_size)
                fortify_panic(__func__);
        return p;
}

/*
 * Fortified strlen(): the plain implementation is used when the object
 * size is unknown, or when the last byte of the object is a compile-time
 * constant NUL.  Otherwise the scan is bounded by strnlen(p, p_size) and
 * calls fortify_panic() if no terminator was found inside the object
 * (ret >= p_size).
 */
__FORTIFY_INLINE __kernel_size_t strlen(const char *p)
{
        __kernel_size_t ret;
        size_t p_size = __builtin_object_size(p, 0);

        /* Work around gcc excess stack consumption issue */
        if (p_size == (size_t)-1 ||
            (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
                return __underlying_strlen(p);
        ret = strnlen(p, p_size);
        if (p_size <= ret)
                fortify_panic(__func__);
        return ret;
}

/*
 * Fortified strnlen(): __real_strnlen is the out-of-line strnlen symbol.
 * The scan is capped at the smaller of @maxlen and the object size.
 * Reaching the end of the object without a terminator (ret >= p_size)
 * calls fortify_panic(), unless that length is exactly the caller's own
 * limit (@maxlen == ret).
 */
extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
{
        size_t p_size = __builtin_object_size(p, 0);
        __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
        if (p_size <= ret && maxlen != ret)
                fortify_panic(__func__);
        return ret;
}

/*
 * Fortified strlcpy(); defined after fortified strlen to reuse it.
 *
 * When neither object size is known the out-of-line strlcpy is called.
 * Otherwise the source length comes from the fortified strlen() above.
 * For a non-zero @size the copy length is min(strlen(q), size - 1); if
 * that does not leave room for the terminator in the destination
 * (len >= p_size) the build fails for a constant len, or fortify_panic()
 * is called at run time.  Returns strlen(q), whether or not anything was
 * copied.
 */
extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
{
        size_t ret;
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        if (p_size == (size_t)-1 && q_size == (size_t)-1)
                return __real_strlcpy(p, q, size);
        ret = strlen(q);
        if (size) {
                size_t len = (ret >= size) ? size - 1 : ret;
                if (__builtin_constant_p(len) && len >= p_size)
                        __write_overflow();
                if (len >= p_size)
                        fortify_panic(__func__);
                __underlying_memcpy(p, q, len);
                p[len] = '\0';
        }
        return ret;
}

/*
 * Fortified strncat(); defined after fortified strlen and strnlen to reuse
 * them.
 *
 * When neither object size is known the call is forwarded unchanged.
 * Otherwise the current length of @p and the number of bytes to append
 * (at most @count) are measured with the fortified helpers, and
 * fortify_panic() is called unless the destination can hold both plus the
 * terminator.  The append itself is a memcpy followed by an explicit NUL.
 */
__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count)
{
        size_t p_len, copy_len;
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        if (p_size == (size_t)-1 && q_size == (size_t)-1)
                return __underlying_strncat(p, q, count);
        p_len = strlen(p);
        copy_len = strnlen(q, count);
        if (p_size < p_len + copy_len + 1)
                fortify_panic(__func__);
        __underlying_memcpy(p + p_len, q, copy_len);
        p[p_len + copy_len] = '\0';
        return p;
}

/*
 * Fortified memset(): a constant @size larger than the destination object
 * is a build error via __write_overflow(); any other oversized @size ends
 * in fortify_panic() before anything is written.
 */
__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        if (__builtin_constant_p(size) && p_size < size)
                __write_overflow();
        if (p_size < size)
                fortify_panic(__func__);
        return __underlying_memset(p, c, size);
}

/*
 * Fortified memcpy(): @size is checked against both the destination and
 * the source object.  With a constant @size an oversized write triggers
 * __write_overflow() and an oversized read __read_overflow2() at build
 * time; otherwise either violation ends in fortify_panic().
 */
__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        if (__builtin_constant_p(size)) {
                if (p_size < size)
                        __write_overflow();
                if (q_size < size)
                        __read_overflow2();
        }
        if (p_size < size || q_size < size)
                fortify_panic(__func__);
        return __underlying_memcpy(p, q, size);
}

/*
 * Fortified memmove(): same destination and source checks as the
 * fortified memcpy(), then forwards to the real memmove.
 */
__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        if (__builtin_constant_p(size)) {
                if (p_size < size)
                        __write_overflow();
                if (q_size < size)
                        __read_overflow2();
        }
        if (p_size < size || q_size < size)
                fortify_panic(__func__);
        return __underlying_memmove(p, q, size);
}

/*
 * Fortified memscan(): __real_memscan is the out-of-line memscan symbol.
 * @size is treated as a read bound on @p: a constant oversized @size
 * triggers __read_overflow() at build time, any other ends in
 * fortify_panic().
 */
extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        if (__builtin_constant_p(size) && p_size < size)
                __read_overflow();
        if (p_size < size)
                fortify_panic(__func__);
        return __real_memscan(p, c, size);
}

/*
 * Fortified memcmp(): both buffers are only read, so a constant oversized
 * @size triggers __read_overflow() for @p and __read_overflow2() for @q at
 * build time; otherwise either violation ends in fortify_panic().
 */
__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        if (__builtin_constant_p(size)) {
                if (p_size < size)
                        __read_overflow();
                if (q_size < size)
                        __read_overflow2();
        }
        if (p_size < size || q_size < size)
                fortify_panic(__func__);
        return __underlying_memcmp(p, q, size);
}

/*
 * Fortified memchr(): @size is a read bound on @p.  A constant oversized
 * @size triggers __read_overflow() at build time, any other ends in
 * fortify_panic().
 */
__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        if (__builtin_constant_p(size) && p_size < size)
                __read_overflow();
        if (p_size < size)
                fortify_panic(__func__);
        return __underlying_memchr(p, c, size);
}

/*
 * Fortified memchr_inv(): __real_memchr_inv is the out-of-line memchr_inv
 * symbol.  The read bound check on @p is the same as in the fortified
 * memchr().
 */
void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size)
{
        size_t p_size = __builtin_object_size(p, 0);
        if (__builtin_constant_p(size) && p_size < size)
                __read_overflow();
        if (p_size < size)
                fortify_panic(__func__);
        return __real_memchr_inv(p, c, size);
}

/*
 * Fortified kmemdup(): __real_kmemdup is the out-of-line kmemdup symbol.
 * Only the source is checked: a constant @size larger than the object at
 * @p triggers __read_overflow() at build time, any other oversized @size
 * ends in fortify_panic().
 */
extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup);
__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp)
{
        size_t p_size = __builtin_object_size(p, 0);
        if (__builtin_constant_p(size) && p_size < size)
                __read_overflow();
        if (p_size < size)
                fortify_panic(__func__);
        return __real_kmemdup(p, size, gfp);
}

/*
 * Fortified strcpy(); defined after fortified strlen and memcpy to reuse
 * them.
 *
 * When neither object size is known the call is forwarded unchanged.
 * Otherwise the copy is expressed as memcpy(p, q, strlen(q) + 1), so the
 * bounds checks are the ones performed by the fortified strlen() and
 * memcpy() defined earlier in this header.
 */
__FORTIFY_INLINE char *strcpy(char *p, const char *q)
{
        size_t p_size = __builtin_object_size(p, 0);
        size_t q_size = __builtin_object_size(q, 0);
        if (p_size == (size_t)-1 && q_size == (size_t)-1)
                return __underlying_strcpy(p, q);
        memcpy(p, q, strlen(q) + 1);
        return p;
}

/*
 * Don't use these outside the FORTIFY_SOURCE implementation: the
 * __underlying_*() names are removed again so that they are not visible
 * past this point.
 */
#undef __underlying_memchr
#undef __underlying_memcmp
#undef __underlying_memcpy
#undef __underlying_memmove
#undef __underlying_memset
#undef __underlying_strcat
#undef __underlying_strcpy
#undef __underlying_strlen
#undef __underlying_strncat
#undef __underlying_strncpy
#endif

/**
 * memcpy_and_pad - copy a buffer and pad the rest of the destination
 * @dest: Where to copy to
 * @dest_len: The destination buffer size
 * @src: Where to copy from
 * @count: The number of bytes to copy
 * @pad: Character to use for padding if space is left in destination.
 *
 * When @count is at least @dest_len only the first @dest_len bytes of @src
 * are copied and no padding is written.  Otherwise @count bytes are copied
 * and the remaining @dest_len - @count bytes are set to @pad.
 */
static inline void memcpy_and_pad(void *dest, size_t dest_len,
                                  const void *src, size_t count, int pad)
{
        if (dest_len <= count) {
                /* Source fills the whole destination: truncate, no padding. */
                memcpy(dest, src, dest_len);
                return;
        }

        memcpy(dest, src, count);
        memset((char *)dest + count, pad, dest_len - count);
}

/**
 * str_has_prefix - Test if a string has a given prefix
 * @str: The string to test
 * @prefix: The string to see if @str starts with
 *
 * The open-coded form of this test is usually written as
 *  strncmp(str, prefix, sizeof(prefix) - 1)
 * which breaks on a typo, or as soon as prefix is a pointer rather than a
 * constant.  str_has_prefix() takes the length from strlen(@prefix)
 * instead.
 *
 * Returns:
 * * strlen(@prefix) if @str starts with @prefix
 * * 0 if @str does not start with @prefix
 */
static __always_inline size_t str_has_prefix(const char *str, const char *prefix)
{
        const size_t prefix_len = strlen(prefix);

        if (strncmp(str, prefix, prefix_len) != 0)
                return 0;

        return prefix_len;
}

#endif /* _LINUX_STRING_H_ */








































































































































































































































































































































































































































































































































































    1 

















    1 



















































































    1 









    1 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
 *
 * (C) SGI 2006, Christoph Lameter
 *         Cleaned up and restructured to ease the addition of alternative
 *         implementations of SLAB allocators.
 * (C) Linux Foundation 2008-2013
 *      Unified interface for all slab allocators
 */

#ifndef _LINUX_SLAB_H
#define        _LINUX_SLAB_H

#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/percpu-refcount.h>


/*
 * Flags to pass to kmem_cache_create().
 * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
 *
 * Every flag in this group is a single-bit constant cast to slab_flags_t,
 * so several of them can be OR-ed together.
 */
/* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_CONSISTENCY_CHECKS        ((slab_flags_t __force)0x00000100U)
/* DEBUG: Red zone objs in a cache */
#define SLAB_RED_ZONE                ((slab_flags_t __force)0x00000400U)
/* DEBUG: Poison objects */
#define SLAB_POISON                ((slab_flags_t __force)0x00000800U)
/* Align objs on cache lines */
#define SLAB_HWCACHE_ALIGN        ((slab_flags_t __force)0x00002000U)
/* Use GFP_DMA memory */
#define SLAB_CACHE_DMA                ((slab_flags_t __force)0x00004000U)
/* Use GFP_DMA32 memory */
#define SLAB_CACHE_DMA32        ((slab_flags_t __force)0x00008000U)
/* DEBUG: Store the last owner for bug hunting */
#define SLAB_STORE_USER                ((slab_flags_t __force)0x00010000U)
/* Panic if kmem_cache_create() fails */
#define SLAB_PANIC                ((slab_flags_t __force)0x00040000U)
/*
 * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
 *
 * This delays freeing the SLAB page by a grace period, it does _NOT_
 * delay object freeing. This means that if you do kmem_cache_free()
 * that memory location is free to be reused at any time. Thus it may
 * be possible to see another object there in the same RCU grace period.
 *
 * This feature only ensures the memory location backing the object
 * stays valid, the trick to using this is relying on an independent
 * object validation pass. Something like:
 *
 *  rcu_read_lock()
 * again:
 *  obj = lockless_lookup(key);
 *  if (obj) {
 *    if (!try_get_ref(obj)) // might fail for free objects
 *      goto again;
 *
 *    if (obj->key != key) { // not the object we expected
 *      put_ref(obj);
 *      goto again;
 *    }
 *  }
 *  rcu_read_unlock();
 *
 * This is useful if we need to approach a kernel structure obliquely,
 * from its address obtained without the usual locking. We can lock
 * the structure to stabilize it and check it's still at the given address,
 * only if we can be sure that the memory has not been meanwhile reused
 * for some other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
 */
/* Defer freeing slabs to RCU */
#define SLAB_TYPESAFE_BY_RCU        ((slab_flags_t __force)0x00080000U)
/* Spread some memory over cpuset */
#define SLAB_MEM_SPREAD                ((slab_flags_t __force)0x00100000U)
/* Trace allocations and frees */
#define SLAB_TRACE                ((slab_flags_t __force)0x00200000U)

/* Flag to prevent checks on free */
#ifdef CONFIG_DEBUG_OBJECTS
# define SLAB_DEBUG_OBJECTS        ((slab_flags_t __force)0x00400000U)
#else
# define SLAB_DEBUG_OBJECTS        0
#endif

/* Avoid kmemleak tracing */
#define SLAB_NOLEAKTRACE        ((slab_flags_t __force)0x00800000U)

/* Fault injection mark */
#ifdef CONFIG_FAILSLAB
# define SLAB_FAILSLAB                ((slab_flags_t __force)0x02000000U)
#else
# define SLAB_FAILSLAB                0
#endif
/* Account to memcg */
#ifdef CONFIG_MEMCG_KMEM
# define SLAB_ACCOUNT                ((slab_flags_t __force)0x04000000U)
#else
# define SLAB_ACCOUNT                0
#endif

/*
 * Flag bit that only exists with CONFIG_KASAN. Without it the macro is 0,
 * so OR-ing it into a flags word or testing for it is a no-op.
 * NOTE(review): the consumer of this bit is not visible here - confirm.
 */
#ifdef CONFIG_KASAN
#define SLAB_KASAN                ((slab_flags_t __force)0x08000000U)
#else
#define SLAB_KASAN                0
#endif

/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
#define SLAB_RECLAIM_ACCOUNT        ((slab_flags_t __force)0x00020000U)
#define SLAB_TEMPORARY                SLAB_RECLAIM_ACCOUNT        /* Objects are short-lived */

/* Slab deactivation flag */
#define SLAB_DEACTIVATED        ((slab_flags_t __force)0x10000000U)

/*
 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
 *
 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
 *
 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
 * Both make kfree a no-op.
 */
#define ZERO_SIZE_PTR ((void *)16)

/*
 * True when x is NULL or ZERO_SIZE_PTR. Implemented as one unsigned
 * comparison, so every address value at or below ZERO_SIZE_PTR matches.
 */
#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
                                (unsigned long)ZERO_SIZE_PTR)

#include <linux/kasan.h>

struct mem_cgroup;
/*
 * struct kmem_cache related prototypes
 */
void __init kmem_cache_init(void);
bool slab_is_available(void);

extern bool usercopy_fallback;

struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
                        unsigned int align, slab_flags_t flags,
                        void (*ctor)(void *));
struct kmem_cache *kmem_cache_create_usercopy(const char *name,
                        unsigned int size, unsigned int align,
                        slab_flags_t flags,
                        unsigned int useroffset, unsigned int usersize,
                        void (*ctor)(void *));
void kmem_cache_destroy(struct kmem_cache *);
int kmem_cache_shrink(struct kmem_cache *);

/*
 * Please use this macro to create slab caches. Simply specify the
 * name of the structure and maybe some flags that are listed above.
 *
 * The alignment of the struct determines object alignment. If you
 * f.e. add ____cacheline_aligned_in_smp to the struct declaration
 * then the objects will be properly aligned in SMP configurations.
 */
#define KMEM_CACHE(__struct, __flags)                                        \
                kmem_cache_create(#__struct, sizeof(struct __struct),        \
                        __alignof__(struct __struct), (__flags), NULL)

/*
 * To whitelist a single field for copying to/from userspace, use this
 * macro instead of KMEM_CACHE() above.
 *
 * The usercopy window handed to kmem_cache_create_usercopy() is exactly
 * __field: its offset within struct __struct and its size. Name, object
 * size and alignment are derived from struct __struct; no constructor is
 * passed.
 */
#define KMEM_CACHE_USERCOPY(__struct, __flags, __field)                        \
                kmem_cache_create_usercopy(#__struct,                        \
                        sizeof(struct __struct),                        \
                        __alignof__(struct __struct), (__flags),        \
                        offsetof(struct __struct, __field),                \
                        sizeof_field(struct __struct, __field), NULL)

/*
 * Common kmalloc functions provided by all allocators
 */
void * __must_check krealloc(const void *, size_t, gfp_t);
void kfree(const void *);
void kfree_sensitive(const void *);
size_t __ksize(const void *);
size_t ksize(const void *);

#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
                        bool to_user);
#else
static inline void __check_heap_object(const void *ptr, unsigned long n,
                                       struct page *page, bool to_user) { }
#endif

/*
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that.
 */
#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
#else
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif

/*
 * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
 * Intended for arches that get misalignment faults even for 64 bit integer
 * aligned buffers.
 */
#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif

/*
 * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned
 * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN
 * aligned pointers.
 */
#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
#define __assume_page_alignment __assume_aligned(PAGE_SIZE)

/*
 * Kmalloc array related definitions
 */

#ifdef CONFIG_SLAB
/*
 * The largest kmalloc size supported by the SLAB allocators is
 * 32 megabytes (2^25) or the maximum allocatable page order if that is
 * less than 32 MB.
 *
 * WARNING: It's not easy to increase this value since the allocators have
 * to do various tricks to work around compiler limitations in order to
 * ensure proper constant folding.
 */
#define KMALLOC_SHIFT_HIGH        ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
                                (MAX_ORDER + PAGE_SHIFT - 1) : 25)
#define KMALLOC_SHIFT_MAX        KMALLOC_SHIFT_HIGH
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        5
#endif
#endif

#ifdef CONFIG_SLUB
/*
 * SLUB directly allocates requests fitting into an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
#define KMALLOC_SHIFT_HIGH        (PAGE_SHIFT + 1)
#define KMALLOC_SHIFT_MAX        (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        3
#endif
#endif

#ifdef CONFIG_SLOB
/*
 * SLOB passes all requests larger than one page to the page allocator.
 * No kmalloc array is necessary since objects of different sizes can
 * be allocated from the same page.
 */
#define KMALLOC_SHIFT_HIGH        PAGE_SHIFT
#define KMALLOC_SHIFT_MAX        (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        3
#endif
#endif

/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER        (KMALLOC_SHIFT_MAX - PAGE_SHIFT)

/*
 * Kmalloc subsystem.
 */
#ifndef KMALLOC_MIN_SIZE
#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
#endif

/*
 * This restriction comes from the byte sized index implementation.
 * Page size is normally 2^12 bytes and, in this case, if we want to use
 * a byte sized index which can represent 2^8 entries, the size of the
 * object should be greater than or equal to 2^12 / 2^8 = 2^4 = 16.
 * If the minimum size of kmalloc is less than 16, we use it as the minimum
 * object size and give up on using a byte sized index.
 */
#define SLAB_OBJ_MIN_SIZE      (KMALLOC_MIN_SIZE < 16 ? \
                               (KMALLOC_MIN_SIZE) : 16)

/*
 * Kinds of kmalloc caches, selected from the gfp flags by kmalloc_type().
 *
 * Whenever changing this, take care that kmalloc_type() and
 * create_kmalloc_caches() still work as intended.
 */
enum kmalloc_cache_type {
        /* neither __GFP_DMA nor __GFP_RECLAIMABLE requested */
        KMALLOC_NORMAL = 0,
        /* __GFP_RECLAIMABLE requested (and, with CONFIG_ZONE_DMA, no __GFP_DMA) */
        KMALLOC_RECLAIM,
#ifdef CONFIG_ZONE_DMA
        /* __GFP_DMA requested; only exists when CONFIG_ZONE_DMA is set */
        KMALLOC_DMA,
#endif
        /* number of cache types; first dimension of kmalloc_caches[][] */
        NR_KMALLOC_TYPES
};

#ifndef CONFIG_SLOB
extern struct kmem_cache *
kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];

/*
 * Map the gfp flags of a request to the kind of kmalloc cache serving it.
 * __GFP_DMA (when CONFIG_ZONE_DMA is set) wins over __GFP_RECLAIMABLE;
 * with neither flag the request is KMALLOC_NORMAL.
 */
static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
{
#ifdef CONFIG_ZONE_DMA
        /* Common case first: one branch rules out both special flags. */
        if (likely(!(flags & (__GFP_DMA | __GFP_RECLAIMABLE))))
                return KMALLOC_NORMAL;

        /* At least one flag is set here; DMA takes precedence if both are. */
        if (flags & __GFP_DMA)
                return KMALLOC_DMA;

        return KMALLOC_RECLAIM;
#else
        if (flags & __GFP_RECLAIMABLE)
                return KMALLOC_RECLAIM;

        return KMALLOC_NORMAL;
#endif
}

/*
 * Figure out which kmalloc slab an allocation of a certain size
 * belongs to.
 * 0 = zero alloc
 * 1 =  65 .. 96 bytes
 * 2 = 129 .. 192 bytes
 * n = 2^(n-1)+1 .. 2^n
 *
 * Any non-zero size up to KMALLOC_MIN_SIZE maps to KMALLOC_SHIFT_LOW.
 * The 96 and 192 byte slots (indices 1 and 2) are only selected when
 * KMALLOC_MIN_SIZE is at most 32 and 64 respectively. Sizes above
 * 64 MB end in BUG().
 */
static __always_inline unsigned int kmalloc_index(size_t size)
{
        if (!size)
                return 0;

        /* Everything up to the smallest cache shares that smallest cache. */
        if (size <= KMALLOC_MIN_SIZE)
                return KMALLOC_SHIFT_LOW;

        /* Non power-of-two slots must be tested before the ladder below. */
        if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
                return 1;
        if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
                return 2;
        /* Power-of-two ladder: the first bound that fits gives log2 of the slot. */
        if (size <=          8) return 3;
        if (size <=         16) return 4;
        if (size <=         32) return 5;
        if (size <=         64) return 6;
        if (size <=        128) return 7;
        if (size <=        256) return 8;
        if (size <=        512) return 9;
        if (size <=       1024) return 10;
        if (size <=   2 * 1024) return 11;
        if (size <=   4 * 1024) return 12;
        if (size <=   8 * 1024) return 13;
        if (size <=  16 * 1024) return 14;
        if (size <=  32 * 1024) return 15;
        if (size <=  64 * 1024) return 16;
        if (size <= 128 * 1024) return 17;
        if (size <= 256 * 1024) return 18;
        if (size <= 512 * 1024) return 19;
        if (size <= 1024 * 1024) return 20;
        if (size <=  2 * 1024 * 1024) return 21;
        if (size <=  4 * 1024 * 1024) return 22;
        if (size <=  8 * 1024 * 1024) return 23;
        if (size <=  16 * 1024 * 1024) return 24;
        if (size <=  32 * 1024 * 1024) return 25;
        if (size <=  64 * 1024 * 1024) return 26;
        BUG();

        /* Will never be reached. Needed because the compiler may complain */
        return -1;
}
#endif /* !CONFIG_SLOB */

void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
void kmem_cache_free(struct kmem_cache *, void *);

/*
 * Bulk allocation and freeing operations. These are accelerated in an
 * allocator specific way to avoid taking locks repeatedly or building
 * metadata structures unnecessarily.
 *
 * Note that interrupts must be enabled when calling these functions.
 */
void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);

/*
 * Free @size objects from the array @p in one call. This forwards to
 * kmem_cache_free_bulk() with a NULL cache pointer.
 *
 * Caller must not use kfree_bulk() on memory not originally allocated
 * by kmalloc(), because the SLOB allocator cannot handle this.
 */
static __always_inline void kfree_bulk(size_t size, void **p)
{
        kmem_cache_free_bulk(NULL, size, p);
}

#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
#else
/* !CONFIG_NUMA stub: @node is ignored and the request goes to __kmalloc(). */
static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
        return __kmalloc(size, flags);
}

/* !CONFIG_NUMA stub: @node is ignored and the request goes to kmem_cache_alloc(). */
static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node)
{
        return kmem_cache_alloc(s, flags);
}
#endif

#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;

#ifdef CONFIG_NUMA
extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
                                           gfp_t gfpflags,
                                           int node, size_t size) __assume_slab_alignment __malloc;
#else
/*
 * CONFIG_TRACING without CONFIG_NUMA: @node is dropped and the call is
 * forwarded to kmem_cache_alloc_trace().
 */
static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
                              gfp_t gfpflags,
                              int node, size_t size)
{
        return kmem_cache_alloc_trace(s, gfpflags, size);
}
#endif /* CONFIG_NUMA */

#else /* CONFIG_TRACING */
/*
 * !CONFIG_TRACING variant: allocate from @s, then pass the object through
 * kasan_kmalloc() together with the requested @size and return its result.
 */
static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
                gfp_t flags, size_t size)
{
        void *obj = kmem_cache_alloc(s, flags);

        return kasan_kmalloc(s, obj, size, flags);
}

/*
 * !CONFIG_TRACING variant: allocate from @s on @node, then pass the object
 * through kasan_kmalloc() together with the requested @size and return its
 * result.
 */
static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
                              gfp_t gfpflags,
                              int node, size_t size)
{
        void *obj = kmem_cache_alloc_node(s, gfpflags, node);

        return kasan_kmalloc(s, obj, size, gfpflags);
}
#endif /* CONFIG_TRACING */

extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;

#ifdef CONFIG_TRACING
extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
#else
/* !CONFIG_TRACING stub: plain forward to kmalloc_order(). */
static __always_inline void *
kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
        return kmalloc_order(size, flags, order);
}
#endif

/*
 * Allocate @size bytes via kmalloc_order_trace(), using the page order
 * that get_order() reports for @size.
 */
static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
{
        return kmalloc_order_trace(size, flags, get_order(size));
}

/**
 * kmalloc - allocate memory
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate.
 *
 * kmalloc is the normal method of allocating memory
 * for objects smaller than page size in the kernel.
 *
 * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
 * bytes. For @size of power of two bytes, the alignment is also guaranteed
 * to be at least to the size.
 *
 * The @flags argument may be one of the GFP flags defined at
 * include/linux/gfp.h and described at
 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
 *
 * The recommended usage of the @flags is described at
 * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>`
 *
 * Below is a brief outline of the most useful GFP flags
 *
 * %GFP_KERNEL
 *        Allocate normal kernel ram. May sleep.
 *
 * %GFP_NOWAIT
 *        Allocation will not sleep.
 *
 * %GFP_ATOMIC
 *        Allocation will not sleep.  May use emergency pools.
 *
 * %GFP_HIGHUSER
 *        Allocate memory from high memory on behalf of user.
 *
 * Also it is possible to set different flags by OR'ing
 * in one or more of the following additional @flags:
 *
 * %__GFP_HIGH
 *        This allocation has high priority and may use emergency pools.
 *
 * %__GFP_NOFAIL
 *        Indicate that this allocation is in no way allowed to fail
 *        (think twice before using).
 *
 * %__GFP_NORETRY
 *        If memory is not immediately available,
 *        then give up at once.
 *
 * %__GFP_NOWARN
 *        If allocation fails, don't issue any warnings.
 *
 * %__GFP_RETRY_MAYFAIL
 *        Try really hard to succeed the allocation but fail
 *        eventually.
 */
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
        /* Size known at compile time: choose the backend here, not in __kmalloc(). */
        if (__builtin_constant_p(size)) {
#ifndef CONFIG_SLOB
                unsigned int index;
#endif
                /* Too big for any kmalloc cache. */
                if (size > KMALLOC_MAX_CACHE_SIZE)
                        return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
                index = kmalloc_index(size);

                /* Index 0 is the zero-sized request. */
                if (!index)
                        return ZERO_SIZE_PTR;

                return kmem_cache_alloc_trace(
                                kmalloc_caches[kmalloc_type(flags)][index],
                                flags, size);
#endif
                /* With CONFIG_SLOB there is no kmalloc_caches[]; fall through. */
        }
        return __kmalloc(size, flags);
}

/*
 * Node-aware counterpart of kmalloc(). Unless CONFIG_SLOB is set, a
 * compile-time constant @size that fits a kmalloc cache is served directly
 * from that cache on @node (ZERO_SIZE_PTR for a zero size); every other
 * request goes through __kmalloc_node().
 */
static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
#ifndef CONFIG_SLOB
        if (__builtin_constant_p(size) &&
                size <= KMALLOC_MAX_CACHE_SIZE) {
                unsigned int i = kmalloc_index(size);

                /* Index 0 is the zero-sized request. */
                if (!i)
                        return ZERO_SIZE_PTR;

                return kmem_cache_alloc_node_trace(
                                kmalloc_caches[kmalloc_type(flags)][i],
                                                flags, node, size);
        }
#endif
        return __kmalloc_node(size, flags, node);
}

/**
 * kmalloc_array - allocate memory for an array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: NULL without attempting an allocation if @n * @size overflows,
 * otherwise whatever the underlying allocator returns.
 */
static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
{
        size_t total;

        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        /* Only fully constant dimensions may take the kmalloc() route. */
        if (!__builtin_constant_p(n) || !__builtin_constant_p(size))
                return __kmalloc(total, flags);

        return kmalloc(total, flags);
}

/**
 * kcalloc - allocate zeroed memory for an array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Same as kmalloc_array() with __GFP_ZERO added to @flags.
 */
static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return kmalloc_array(n, size, zeroing_flags);
}

/*
 * kmalloc_track_caller is a special version of kmalloc that records the
 * calling function of the routine calling it for slab leak tracking instead
 * of just the calling function (confusing, eh?).
 * It's useful when the call to kmalloc comes from a widely-used standard
 * allocator where we care about the real place the memory allocation
 * request comes from.
 */
extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
#define kmalloc_track_caller(size, flags) \
        __kmalloc_track_caller(size, flags, _RET_IP_)

/*
 * Node-aware kmalloc_array(): allocate @n elements of @size bytes on @node.
 * Returns NULL without attempting an allocation if @n * @size overflows.
 */
static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
                                       int node)
{
        size_t total;

        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        /* Only fully constant dimensions may take the kmalloc_node() route. */
        if (!__builtin_constant_p(n) || !__builtin_constant_p(size))
                return __kmalloc_node(total, flags, node);

        return kmalloc_node(total, flags, node);
}

/* Zeroing variant of kmalloc_array_node(): adds __GFP_ZERO to @flags. */
static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return kmalloc_array_node(n, size, zeroing_flags, node);
}


#ifdef CONFIG_NUMA
extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
#define kmalloc_node_track_caller(size, flags, node) \
        __kmalloc_node_track_caller(size, flags, node, \
                        _RET_IP_)

#else /* CONFIG_NUMA */

#define kmalloc_node_track_caller(size, flags, node) \
        kmalloc_track_caller(size, flags)

#endif /* CONFIG_NUMA */

/*
 * Shortcuts
 */
/* kmem_cache_alloc() with __GFP_ZERO added to @flags. */
static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return kmem_cache_alloc(k, zeroing_flags);
}

/**
 * kzalloc - allocate zeroed memory.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Same as kmalloc() with __GFP_ZERO added to @flags.
 */
static inline void *kzalloc(size_t size, gfp_t flags)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return kmalloc(size, zeroing_flags);
}

/**
 * kzalloc_node - allocate zeroed memory from a particular memory node.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 * @node: memory node from which to allocate
 *
 * Same as kmalloc_node() with __GFP_ZERO added to @flags.
 */
static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return kmalloc_node(size, zeroing_flags, node);
}

unsigned int kmem_cache_size(struct kmem_cache *s);
void __init kmem_cache_init_late(void);

/*
 * Per-CPU prepare/dead callbacks, declared only for SMP builds using
 * CONFIG_SLAB. In every other configuration both names expand to NULL.
 * NOTE(review): presumably registered as CPU hotplug callbacks - confirm
 * against the caller.
 */
#if defined(CONFIG_SMP) && defined(CONFIG_SLAB)
int slab_prepare_cpu(unsigned int cpu);
int slab_dead_cpu(unsigned int cpu);
#else
#define slab_prepare_cpu        NULL
#define slab_dead_cpu                NULL
#endif

#endif        /* _LINUX_SLAB_H */



































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/extents_status.h
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *        Allison Henderson <achender@linux.vnet.ibm.com>
 *        Zheng Liu <wenqing.lz@taobao.com>
 *
 */

#ifndef _EXT4_EXTENTS_STATUS_H
#define _EXT4_EXTENTS_STATUS_H

/*
 * Turn on ES_DEBUG__ to get lots of info about extent status operations.
 */
#ifdef ES_DEBUG__
#define es_debug(fmt, ...)        printk(fmt, ##__VA_ARGS__)
#else
#define es_debug(fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
 * checked with old map_block's result.
 */
#define ES_AGGRESSIVE_TEST__

/*
 * These flags live in the high bits of extent_status.es_pblk
 *
 * Each *_B constant is a bit index; the matching EXTENT_STATUS_* macro is
 * the corresponding single-bit mask.
 */
enum {
        ES_WRITTEN_B,                /* bit index for EXTENT_STATUS_WRITTEN */
        ES_UNWRITTEN_B,                /* bit index for EXTENT_STATUS_UNWRITTEN */
        ES_DELAYED_B,                /* bit index for EXTENT_STATUS_DELAYED */
        ES_HOLE_B,                /* bit index for EXTENT_STATUS_HOLE */
        ES_REFERENCED_B,        /* bit index for EXTENT_STATUS_REFERENCED */
        ES_FLAGS                /* number of flag bits; determines ES_SHIFT */
};

/* Bit position in es_pblk where the ES_FLAGS status bits start. */
#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
/* Mask covering all status bits in es_pblk; ~ES_MASK is the block number. */
#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)

/* Status values as seen after shifting es_pblk down by ES_SHIFT. */
#define EXTENT_STATUS_WRITTEN        (1 << ES_WRITTEN_B)
#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
#define EXTENT_STATUS_DELAYED        (1 << ES_DELAYED_B)
#define EXTENT_STATUS_HOLE        (1 << ES_HOLE_B)
#define EXTENT_STATUS_REFERENCED        (1 << ES_REFERENCED_B)

/*
 * In-place (unshifted) mask of the four extent type bits. The REFERENCED
 * bit is deliberately not part of it.
 */
#define ES_TYPE_MASK        ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
                          EXTENT_STATUS_UNWRITTEN | \
                          EXTENT_STATUS_DELAYED | \
                          EXTENT_STATUS_HOLE) << ES_SHIFT)

struct ext4_sb_info;
struct ext4_extent;

/*
 * One extent in the extent status tree. The top ES_FLAGS bits of es_pblk
 * carry the status flags; the remaining low bits are the physical block.
 * Use the ext4_es_*() accessors rather than touching es_pblk directly.
 */
struct extent_status {
        struct rb_node rb_node;        /* node linking this extent into a tree */
        ext4_lblk_t es_lblk;        /* first logical block extent covers */
        ext4_lblk_t es_len;        /* length of extent in blocks */
        ext4_fsblk_t es_pblk;        /* first physical block, plus status in high bits */
};

/* Extent status tree: a red-black tree root plus a one-entry lookup cache. */
struct ext4_es_tree {
        struct rb_root root;        /* root of the tree of struct extent_status */
        struct extent_status *cache_es;        /* recently accessed extent */
};

/*
 * Statistics kept for the extent status cache.
 * NOTE(review): field meanings below are inferred from their names only;
 * confirm against the code that updates them.
 */
struct ext4_es_stats {
        unsigned long es_stats_shrunk;        /* presumably: entries reclaimed by the shrinker */
        struct percpu_counter es_stats_cache_hits;
        struct percpu_counter es_stats_cache_misses;
        u64 es_stats_scan_time;                /* presumably: duration of a shrinker scan */
        u64 es_stats_max_scan_time;        /* presumably: longest scan seen so far */
        struct percpu_counter es_stats_all_cnt;
        struct percpu_counter es_stats_shk_cnt;
};

/*
 * Pending cluster reservations for bigalloc file systems
 *
 * A cluster with a pending reservation is a logical cluster shared by at
 * least one extent in the extents status tree with delayed and unwritten
 * status and at least one other written or unwritten extent.  The
 * reservation is said to be pending because a cluster reservation would
 * have to be taken in the event all blocks in the cluster shared with
 * written or unwritten extents were deleted while the delayed and
 * unwritten blocks remained.
 *
 * The set of pending cluster reservations is an auxiliary data structure
 * used with the extents status tree to implement reserved cluster/block
 * accounting for bigalloc file systems.  The set is kept in memory and
 * records all pending cluster reservations.
 *
 * Its primary function is to avoid the need to read extents from the
 * disk when invalidating pages as a result of a truncate, punch hole, or
 * collapse range operation.  Page invalidation requires a decrease in the
 * reserved cluster count if it results in the removal of all delayed
 * and unwritten extents (blocks) from a cluster that is not shared with a
 * written or unwritten extent, and no decrease otherwise.  Determining
 * whether the cluster is shared can be done by searching for a pending
 * reservation on it.
 *
 * Secondarily, it provides a potentially faster method for determining
 * whether the reserved cluster count should be increased when a physical
 * cluster is deallocated as a result of a truncate, punch hole, or
 * collapse range operation.  The necessary information is also present
 * in the extents status tree, but might be more rapidly accessed in
 * the pending reservation set in many cases due to smaller size.
 *
 * The pending cluster reservation set is implemented as a red-black tree
 * with the goal of minimizing per page search time overhead.
 */

/* One entry in the set of pending cluster reservations. */
struct pending_reservation {
        struct rb_node rb_node;        /* node linking this entry into the set */
        ext4_lblk_t lclu;        /* presumably the logical cluster number - confirm */
};

/* Red-black tree holding the set of pending cluster reservations. */
struct ext4_pending_tree {
        struct rb_root root;        /* root of the tree of struct pending_reservation */
};

extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);

extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                                  ext4_lblk_t len, ext4_fsblk_t pblk,
                                  unsigned int status);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len, ext4_fsblk_t pblk,
                                 unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len);
extern void ext4_es_find_extent_range(struct inode *inode,
                                      int (*match_fn)(struct extent_status *es),
                                      ext4_lblk_t lblk, ext4_lblk_t end,
                                      struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t *next_lblk,
                                 struct extent_status *es);
extern bool ext4_es_scan_range(struct inode *inode,
                               int (*matching_fn)(struct extent_status *es),
                               ext4_lblk_t lblk, ext4_lblk_t end);
extern bool ext4_es_scan_clu(struct inode *inode,
                             int (*matching_fn)(struct extent_status *es),
                             ext4_lblk_t lblk);

static inline unsigned int ext4_es_status(struct extent_status *es)
{
        return es->es_pblk >> ES_SHIFT;
}

static inline unsigned int ext4_es_type(struct extent_status *es)
{
        return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
}

/* Return 1 if @es has the WRITTEN type bit set, 0 otherwise. */
static inline int ext4_es_is_written(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_WRITTEN);
}

/* Return 1 if @es has the UNWRITTEN type bit set, 0 otherwise. */
static inline int ext4_es_is_unwritten(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN);
}

/* Return 1 if @es has the DELAYED type bit set, 0 otherwise. */
static inline int ext4_es_is_delayed(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_DELAYED);
}

/* Return 1 if @es has the HOLE type bit set, 0 otherwise. */
static inline int ext4_es_is_hole(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_HOLE);
}

/* Return 1 if @es is written or unwritten, 0 otherwise. */
static inline int ext4_es_is_mapped(struct extent_status *es)
{
        if (ext4_es_is_written(es))
                return 1;

        return ext4_es_is_unwritten(es) != 0;
}

/* Return 1 if @es is delayed and not also unwritten, 0 otherwise. */
static inline int ext4_es_is_delonly(struct extent_status *es)
{
        if (!ext4_es_is_delayed(es))
                return 0;

        return !ext4_es_is_unwritten(es);
}

static inline void ext4_es_set_referenced(struct extent_status *es)
{
        es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
}

static inline void ext4_es_clear_referenced(struct extent_status *es)
{
        es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
}

/* Return 1 if @es has the REFERENCED status bit set, 0 otherwise. */
static inline int ext4_es_is_referenced(struct extent_status *es)
{
        return !!(ext4_es_status(es) & EXTENT_STATUS_REFERENCED);
}

/* Return the physical block of @es with the status bits masked off. */
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
{
        ext4_fsblk_t block_bits = ~ES_MASK;

        return es->es_pblk & block_bits;
}

/*
 * Like ext4_es_pblock(), but an all-ones block field (every non-status
 * bit set) is reported as 0.
 */
static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es)
{
        ext4_fsblk_t pblk = ext4_es_pblock(es);

        if (pblk == ~ES_MASK)
                return 0;

        return pblk;
}

/* Replace the physical block of @es with @pb, keeping its status bits. */
static inline void ext4_es_store_pblock(struct extent_status *es,
                                        ext4_fsblk_t pb)
{
        ext4_fsblk_t kept_status = es->es_pblk & ES_MASK;

        es->es_pblk = (pb & ~ES_MASK) | kept_status;
}

static inline void ext4_es_store_status(struct extent_status *es,
                                        unsigned int status)
{
        es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
                      (es->es_pblk & ~ES_MASK);
}

/* Overwrite es_pblk of @es with physical block @pb and status @status. */
static inline void ext4_es_store_pblock_status(struct extent_status *es,
                                               ext4_fsblk_t pb,
                                               unsigned int status)
{
        ext4_fsblk_t new_status = ((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK;
        ext4_fsblk_t new_block = pb & ~ES_MASK;

        es->es_pblk = new_status | new_block;
}

extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);

extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);

extern int __init ext4_init_pending(void);
extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
                                        bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
                                        ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);

#endif /* _EXT4_EXTENTS_STATUS_H */



























































































































    1 




    1 


















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
        struct anon_vma *root;                /* Root of this anon_vma tree */
        /*
         * W: modification, R: walking the list.
         * The anon_vma_lock_*() / anon_vma_unlock_*() helpers in this
         * header always operate on root->rwsem, not on this field directly.
         */
        struct rw_semaphore rwsem;        /* W: modification, R: walking the list */
        /*
         * The refcount is taken on an anon_vma when there is no
         * guarantee that the vma of page tables will exist for
         * the duration of the operation. A caller that takes
         * the reference is responsible for cleaning up the
         * anon_vma if it is the last user on release:
         * put_anon_vma() calls __put_anon_vma() when the count
         * drops to zero.
         */
        atomic_t refcount;

        /*
         * Count of child anon_vmas. Equal to the number of anon_vmas whose
         * ->parent points to this one, including itself.
         *
         * This counter is used to decide whether to reuse an anon_vma
         * instead of forking a new one. See the comments in anon_vma_clone().
         */
        unsigned long num_children;
        /* Count of VMAs whose ->anon_vma pointer points to this object. */
        unsigned long num_active_vmas;

        struct anon_vma *parent;        /* Parent of this anon_vma */

        /*
         * NOTE: the LSB of the rb_root.rb_node is set by
         * mm_take_all_locks() _after_ taking the above lock. So the
         * rb_root must only be read/written after taking the above lock
         * to be sure to see a valid next pointer. The LSB bit itself
         * is serialized by a system wide lock only visible to
         * mm_take_all_locks() (mm_all_locks_mutex).
         */

        /* Interval tree of private "related" vmas */
        struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
/* One link between a VMA and an anon_vma (see the description above). */
struct anon_vma_chain {
        struct vm_area_struct *vma;        /* VMA end of this link */
        struct anon_vma *anon_vma;        /* anon_vma end of this link */
        struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
        struct rb_node rb;                        /* locked by anon_vma->rwsem */
        unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
        /* Only present with CONFIG_DEBUG_VM_RB. */
        unsigned long cached_vma_start, cached_vma_last;
#endif
};

/*
 * Flag bits accepted by try_to_unmap() through its 'flags' argument.
 * Each value is a distinct bit, so flags can be OR-ed together.
 */
enum ttu_flags {
        TTU_MIGRATION                = 0x1,        /* migration mode */
        TTU_MUNLOCK                = 0x2,        /* munlock mode */

        TTU_SPLIT_HUGE_PMD        = 0x4,        /* split huge PMD if any */
        TTU_IGNORE_MLOCK        = 0x8,        /* ignore mlock */
        TTU_SYNC                = 0x10,        /* avoid racy checks with PVMW_SYNC */
        TTU_IGNORE_HWPOISON        = 0x20,        /* corrupted page is recoverable */
        TTU_BATCH_FLUSH                = 0x40,        /* Batch TLB flushes where possible
                                         * and caller guarantees they will
                                         * do a final flush if necessary */
        TTU_RMAP_LOCKED                = 0x80,        /* do not grab rmap lock:
                                         * caller holds it */
        TTU_SPLIT_FREEZE        = 0x100,                /* freeze pte under splitting thp */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
        atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

/*
 * Drop one reference on @anon_vma; when the count reaches zero the
 * object is handed to __put_anon_vma().
 */
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
        /* Not the last reference: nothing more to do. */
        if (!atomic_dec_and_test(&anon_vma->refcount))
                return;

        __put_anon_vma(anon_vma);
}

/* Acquire the rwsem of @anon_vma's root for writing. */
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
        struct anon_vma *root = anon_vma->root;

        down_write(&root->rwsem);
}

/* Release the write hold on the rwsem of @anon_vma's root. */
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
        struct anon_vma *root = anon_vma->root;

        up_write(&root->rwsem);
}

/* Acquire the rwsem of @anon_vma's root for reading. */
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
        struct anon_vma *root = anon_vma->root;

        down_read(&root->rwsem);
}

/* Release the read hold on the rwsem of @anon_vma's root. */
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
        struct anon_vma *root = anon_vma->root;

        up_read(&root->rwsem);
}


/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);        /* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

/*
 * Fast-path wrapper around __anon_vma_prepare(): returns 0 immediately
 * when @vma already has an anon_vma, otherwise returns whatever
 * __anon_vma_prepare() returns.
 */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
        if (unlikely(!vma->anon_vma))
                return __anon_vma_prepare(vma);

        return 0;
}

/*
 * Asserts (VM_BUG_ON_VMA) that @vma and @next point at the same anon_vma,
 * then drops @next's anon_vma links via unlink_anon_vmas().
 */
static inline void anon_vma_merge(struct vm_area_struct *vma,
                                  struct vm_area_struct *next)
{
        VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
        unlink_anon_vmas(next);
}

struct anon_vma *page_get_anon_vma(struct page *page);

/* bitflags for do_page_add_anon_rmap() */
#define RMAP_EXCLUSIVE 0x01
#define RMAP_COMPOUND 0x02

/*
 * rmap interfaces called when adding or removing pte of page
 */
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
                unsigned long, bool);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
                           unsigned long, int);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
                unsigned long, bool);
void page_add_file_rmap(struct page *, bool);
void page_remove_rmap(struct page *, bool);

void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
                            unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
                                unsigned long);

/*
 * Bump the map count of @page by one: the compound mapcount when
 * @compound is true, otherwise the page's own _mapcount.
 */
static inline void page_dup_rmap(struct page *page, bool compound)
{
        if (compound)
                atomic_inc(compound_mapcount_ptr(page));
        else
                atomic_inc(&page->_mapcount);
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int page_referenced(struct page *, int is_locked,
                        struct mem_cgroup *memcg, unsigned long *vm_flags);

bool try_to_unmap(struct page *, enum ttu_flags flags);

/* Avoid racy checks */
#define PVMW_SYNC                (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION                (1 << 1)

/*
 * State of a page_vma_mapped_walk() iteration.
 * page_vma_mapped_walk_done() releases whatever ->pte and ->ptl still hold.
 */
struct page_vma_mapped_walk {
        struct page *page;
        struct vm_area_struct *vma;
        unsigned long address;
        pmd_t *pmd;
        pte_t *pte;                /* if set, unmapped by _done() unless page is HugeTLB */
        spinlock_t *ptl;        /* if set, unlocked by _done() */
        unsigned int flags;        /* presumably PVMW_* bits defined above -- confirm */
};

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
        /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
        if (pvmw->pte && !PageHuge(pvmw->page))
                pte_unmap(pvmw->pte);
        if (pvmw->ptl)
                spin_unlock(pvmw->ptl);
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);

/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int page_mkclean(struct page *);

/*
 * called in munlock()/munmap() path to check for other vmas holding
 * the page mlocked.
 */
void try_to_munlock(struct page *);

void remove_migration_ptes(struct page *old, struct page *new, bool locked);

/*
 * Called by memory-failure.c to kill processes.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page);
void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
 * rmap_walk_control: controls an rmap traversal for a specific caller
 *
 * arg: passed to rmap_one() and invalid_vma()
 * rmap_one: executed on each vma where the page is mapped
 * done: checks whether the traversal can terminate
 * anon_lock: takes the anon_vma lock in an optimized way instead of the
 *            default one
 * invalid_vma: used to skip vmas the caller is not interested in
 */
struct rmap_walk_control {
        void *arg;
        /*
         * Return false if page table scanning in rmap_walk should be stopped.
         * Otherwise, return true.
         */
        bool (*rmap_one)(struct page *page, struct vm_area_struct *vma,
                                        unsigned long addr, void *arg);
        int (*done)(struct page *page);
        struct anon_vma *(*anon_lock)(struct page *page);
        bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct page *page, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);

#else        /* !CONFIG_MMU */

/* !CONFIG_MMU: the anon_vma helpers compile away to no-ops / success. */
#define anon_vma_init()                do {} while (0)
#define anon_vma_prepare(vma)        (0)
#define anon_vma_link(vma)        do {} while (0)

/*
 * !CONFIG_MMU stub: clears *@vm_flags and always returns 0.
 * @vm_flags is dereferenced unconditionally, so it must not be NULL.
 */
static inline int page_referenced(struct page *page, int is_locked,
                                  struct mem_cgroup *memcg,
                                  unsigned long *vm_flags)
{
        *vm_flags = 0;
        return 0;
}

/* !CONFIG_MMU stub: always evaluates to false; arguments are not evaluated. */
#define try_to_unmap(page, refs) false

/* !CONFIG_MMU stub: always returns 0. */
static inline int page_mkclean(struct page *page)
{
        return 0;
}


#endif        /* CONFIG_MMU */

#endif        /* _LINUX_RMAP_H */


























































































































































    1 


























































    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/lockref.h>

#if USE_CMPXCHG_LOCKREF

/*
 * CMPXCHG_LOOP(CODE, SUCCESS) - lockless fast path shared by the helpers
 * below. It expects a variable named 'lockref' in the caller's scope.
 *
 * The combined 64-bit lock+count word is snapshotted into 'old'. While the
 * spinlock part of that snapshot looks unlocked, 'new' and 'prev' are set
 * to copies of 'old', CODE runs (it edits 'new', and may 'return' or
 * 'break' out), and a relaxed 64-bit cmpxchg tries to install 'new'.
 * If the value returned by the cmpxchg equals 'prev', the update took
 * effect and SUCCESS runs. Otherwise the loop retries, at most 100 times.
 *
 * Falling out of the macro (lock observed held, retries exhausted, or a
 * 'break' in CODE) means the caller must use its spinlock slow path.
 *
 * Note that the "cmpxchg()" reloads the "old" value for the
 * failure case.
 */
#define CMPXCHG_LOOP(CODE, SUCCESS) do {                                        \
        int retry = 100;                                                        \
        struct lockref old;                                                        \
        BUILD_BUG_ON(sizeof(old) != 8);                                                \
        old.lock_count = READ_ONCE(lockref->lock_count);                        \
        while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {          \
                struct lockref new = old, prev = old;                                \
                CODE                                                                \
                old.lock_count = cmpxchg64_relaxed(&lockref->lock_count,        \
                                                   old.lock_count,                \
                                                   new.lock_count);                \
                if (likely(old.lock_count == prev.lock_count)) {                \
                        SUCCESS;                                                \
                }                                                                \
                if (!--retry)                                                        \
                        break;                                                        \
        }                                                                        \
} while (0)

#else

/* No cmpxchg fast path: expands to nothing, callers go straight to the lock. */
#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)

#endif

/**
 * lockref_get - Increments reference count unconditionally
 * @lockref: pointer to lockref structure
 *
 * Only valid when the caller already holds a reference, so the count
 * is known to be non-zero. Tries the lockless cmpxchg path first and
 * falls back to taking the spinlock.
 */
void lockref_get(struct lockref *lockref)
{
        CMPXCHG_LOOP(new.count++;, return;);

        /* Slow path: bump the count under the lock. */
        spin_lock(&lockref->lock);
        lockref->count++;
        spin_unlock(&lockref->lock);
}
EXPORT_SYMBOL(lockref_get);

/**
 * lockref_get_not_zero - Increments count unless the count is 0 or dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count was zero
 */
int lockref_get_not_zero(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count <= 0)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count > 0) {
                lockref->count++;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_zero);

/**
 * lockref_put_not_zero - Decrements count unless count <= 1 before decrement
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count would become zero
 */
int lockref_put_not_zero(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 1)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count > 1) {
                lockref->count--;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_put_not_zero);

/**
 * lockref_get_or_lock - Increments count unless the count is 0 or dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count was zero
 * and we got the lock instead.
 */
int lockref_get_or_lock(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count++;
                if (old.count <= 0)
                        break;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        if (lockref->count <= 0)
                return 0;
        lockref->count++;
        spin_unlock(&lockref->lock);
        return 1;
}
EXPORT_SYMBOL(lockref_get_or_lock);

/**
 * lockref_put_return - Decrement reference count if possible
 * @lockref: pointer to lockref structure
 *
 * Decrement the reference count and return the new value.
 * If the lockref was dead or locked, return an error.
 */
int lockref_put_return(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 0)
                        return -1;
        ,
                return new.count;
        );
        return -1;
}
EXPORT_SYMBOL(lockref_put_return);

/**
 * lockref_put_or_lock - decrements count unless count <= 1 before decrement
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
 */
int lockref_put_or_lock(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 1)
                        break;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        if (lockref->count <= 1)
                return 0;
        lockref->count--;
        spin_unlock(&lockref->lock);
        return 1;
}
EXPORT_SYMBOL(lockref_put_or_lock);

/**
 * lockref_mark_dead - mark lockref dead
 * @lockref: pointer to lockref structure
 *
 * Must be called with @lockref->lock held (checked by assert_spin_locked()).
 * Sets the count to -128, a negative value that the "not zero" / "not dead"
 * helpers in this file treat as dead.
 */
void lockref_mark_dead(struct lockref *lockref)
{
        assert_spin_locked(&lockref->lock);
        lockref->count = -128;
}
EXPORT_SYMBOL(lockref_mark_dead);

/**
 * lockref_get_not_dead - Increments count unless the ref is dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if lockref was dead
 */
int lockref_get_not_dead(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count < 0)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count >= 0) {
                lockref->count++;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_dead);









































    1 











































































    1 

    1 
    1 



    1 

    1 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/kasan-checks.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>

#include <asm/byteorder.h>
#include <asm/word-at-a-time.h>

/*
 * IS_UNALIGNED(src, dst) - non-zero when either address is not a multiple
 * of sizeof(long). With CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS it is the
 * constant 0 and the arguments are not evaluated.
 *
 * Both arguments are parenthesized before the cast so that expression
 * arguments (e.g. "p + 1") are converted as a whole; an unparenthesized
 * cast would bind to the first operand only.
 */
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
#define IS_UNALIGNED(src, dst)        0
#else
#define IS_UNALIGNED(src, dst)        \
        (((long)(dst) | (long)(src)) & (sizeof(long) - 1))
#endif

/*
 * Do a strncpy, return length of string without final '\0'.
 * 'count' is the user-supplied count (return 'count' if we
 * hit it), 'max' is the address space maximum (and we return
 * -EFAULT if we hit it).
 *
 * The copy runs a word at a time while at least sizeof(unsigned long)
 * bytes of 'max' remain, then finishes (or restarts after a faulting
 * word read) one byte at a time. A fault in the byte loop is reported
 * as -EFAULT.
 */
static inline long do_strncpy_from_user(char *dst, const char __user *src,
                                        unsigned long count, unsigned long max)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long res = 0;        /* bytes copied so far; offset into src and dst */

        /* Word-sized accesses are skipped entirely for unaligned pointers. */
        if (IS_UNALIGNED(src, dst))
                goto byte_at_a_time;

        while (max >= sizeof(unsigned long)) {
                unsigned long c, data, mask;

                /* Fall back to byte-at-a-time if we get a page fault */
                unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time);

                /*
                 * Note that we mask out the bytes following the NUL. This is
                 * important to do because string oblivious code may read past
                 * the NUL. For those routines, we don't want to give them
                 * potentially random bytes after the NUL in `src`.
                 *
                 * One example of such code is BPF map keys. BPF treats map keys
                 * as an opaque set of bytes. Without the post-NUL mask, any BPF
                 * maps keyed by strings returned from strncpy_from_user() may
                 * have multiple entries for semantically identical strings.
                 */
                if (has_zero(c, &data, &constants)) {
                        data = prep_zero_mask(c, data, &constants);
                        data = create_zero_mask(data);
                        mask = zero_bytemask(data);
                        *(unsigned long *)(dst+res) = c & mask;
                        /* Length = full words copied + offset of the NUL in this word. */
                        return res + find_zero(data);
                }

                /* No NUL in this word: store it whole and advance. */
                *(unsigned long *)(dst+res) = c;

                res += sizeof(unsigned long);
                max -= sizeof(unsigned long);
        }

byte_at_a_time:
        while (max) {
                char c;

                unsafe_get_user(c,src+res, efault);
                /* The terminating NUL is stored too, but not counted in the result. */
                dst[res] = c;
                if (!c)
                        return res;
                res++;
                max--;
        }

        /*
         * Uhhuh. We hit 'max'. But was that the user-specified maximum
         * too? If so, that's ok - we got as much as the user asked for.
         */
        if (res >= count)
                return res;

        /*
         * Nope: we hit the address space limit, and we still had more
         * characters the caller would have wanted. That's an EFAULT.
         */
efault:
        return -EFAULT;
}

/**
 * strncpy_from_user: - Copy a NUL terminated string from userspace.
 * @dst:   Destination address, in kernel space.  This buffer must be at
 *         least @count bytes long.
 * @src:   Source address, in user space.
 * @count: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Copies a NUL-terminated string from userspace to kernel space.
 *
 * On success, returns the length of the string (not including the trailing
 * NUL).
 *
 * If access to userspace fails, returns -EFAULT (some data may have been
 * copied).
 *
 * If @count is smaller than the length of the string, copies @count bytes
 * and returns @count.
 *
 * A @count of zero or less returns 0 without touching @dst.
 */
long strncpy_from_user(char *dst, const char __user *src, long count)
{
        unsigned long limit, start, span;
        long copied;

        might_fault();
        if (should_fail_usercopy())
                return -EFAULT;
        if (unlikely(count <= 0))
                return 0;

        limit = user_addr_max();
        start = (unsigned long)untagged_addr(src);
        if (unlikely(start >= limit))
                return -EFAULT;

        /*
         * Clamp the distance to the end of the user address range to the
         * caller's limit, so the copy loop has a single bound to check.
         */
        span = limit - start;
        if (span > count)
                span = count;

        kasan_check_write(dst, count);
        check_object_size(dst, count, false);
        if (!user_read_access_begin(src, span))
                return -EFAULT;

        copied = do_strncpy_from_user(dst, src, count, span);
        user_read_access_end();
        return copied;
}
EXPORT_SYMBOL(strncpy_from_user);














































































































































    1 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H

#include <linux/mm.h>
#include <linux/percpu_counter.h>

#include <linux/atomic.h>
#include <uapi/linux/mman.h>

/*
 * Arrange for legacy / undefined architecture specific flags to be
 * ignored by mmap handling code.
 *
 * Any flag not already defined at this point is defined as 0, so OR-ing
 * it into a mask or testing it against a flags word has no effect.
 */
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif

/*
 * The historical set of flags that every mmap implementation is taken to
 * support implicitly when its file_operations provides no
 * ->mmap_validate() op.
 */
#define LEGACY_MAP_MASK                                                \
        (MAP_SHARED |                                                \
         MAP_PRIVATE |                                                \
         MAP_FIXED |                                                \
         MAP_ANONYMOUS |                                        \
         MAP_DENYWRITE |                                        \
         MAP_EXECUTABLE |                                        \
         MAP_UNINITIALIZED |                                        \
         MAP_GROWSDOWN |                                        \
         MAP_LOCKED |                                                \
         MAP_NORESERVE |                                        \
         MAP_POPULATE |                                                \
         MAP_NONBLOCK |                                                \
         MAP_STACK |                                                \
         MAP_HUGETLB |                                                \
         MAP_32BIT |                                                \
         MAP_HUGE_2MB |                                                \
         MAP_HUGE_1GB)

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
extern struct percpu_counter vm_committed_as;

/*
 * Batch size passed to percpu_counter_add_batch() for vm_committed_as.
 * Without CONFIG_SMP the batch is the constant 0 and mm_compute_batch()
 * is an empty stub.
 */
#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
static inline void mm_compute_batch(int overcommit_policy)
{
}
#endif

unsigned long vm_memory_committed(void);

/*
 * Add @pages (may be negative) to the vm_committed_as per-cpu counter,
 * using vm_committed_as_batch as the batch size.
 */
static inline void vm_acct_memory(long pages)
{
        percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
}

/* Subtract @pages from vm_committed_as; the inverse of vm_acct_memory(). */
static inline void vm_unacct_memory(long pages)
{
        vm_acct_memory(-pages);
}

/*
 * Allow architectures to handle additional protection and flag bits. The
 * overriding macros must be defined in the arch-specific asm/mman.h file.
 *
 * The defaults below contribute no extra bits: the two calc hooks
 * evaluate to 0 and arch_vm_get_page_prot() to __pgprot(0).
 */

#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif

#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(flags) 0
#endif

#ifndef arch_vm_get_page_prot
#define arch_vm_get_page_prot(vm_flags) __pgprot(0)
#endif

#ifndef arch_validate_prot
/*
 * Default protection check, called from mprotect() after PROT_GROWSDOWN
 * and PROT_GROWSUP have been masked out.
 *
 * Returns true when @prot contains nothing beyond PROT_READ, PROT_WRITE,
 * PROT_EXEC and PROT_SEM. @addr is not used by this default.
 */
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
{
        const unsigned long known = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM;

        return !(prot & ~known);
}
#define arch_validate_prot arch_validate_prot
#endif

#ifndef arch_validate_flags
/*
 * This is called from mmap() and mprotect() with the updated vma->vm_flags.
 *
 * Returns true if the VM_* flags are valid. This default accepts every
 * value; an architecture overrides it by defining arch_validate_flags
 * before this point.
 */
static inline bool arch_validate_flags(unsigned long flags)
{
        return true;
}
#define arch_validate_flags arch_validate_flags
#endif

/*
 * Optimisation macro: moves one flag bit to another bit position without
 * branching on the value of x. It is equivalent to:
 *      (x & bit1) ? bit2 : 0
 * ("bit1" and "bit2" must be single bits). If either bit is 0 the result
 * is 0. Otherwise the masked bit is scaled up by bit2/bit1, or down by
 * bit1/bit2 when bit1 is the larger one.
 */
#define _calc_vm_trans(x, bit1, bit2)                                \
        (((bit1) && (bit2))                                        \
         ? (((bit1) > (bit2))                                        \
            ? ((x) & (bit1)) / ((bit1) / (bit2))                \
            : ((x) & (bit1)) * ((bit2) / (bit1)))                \
         : 0)

/*
 * Combine the mmap "prot" argument into "vm_flags" used internally:
 * PROT_READ/WRITE/EXEC become VM_READ/WRITE/EXEC, plus whatever bits
 * arch_calc_vm_prot_bits() contributes for @prot and @pkey.
 */
static inline unsigned long
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
{
        unsigned long vm_bits = 0;

        vm_bits |= _calc_vm_trans(prot, PROT_READ,  VM_READ );
        vm_bits |= _calc_vm_trans(prot, PROT_WRITE, VM_WRITE);
        vm_bits |= _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC);
        vm_bits |= arch_calc_vm_prot_bits(prot, pkey);

        return vm_bits;
}

/*
 * Combine the mmap "flags" argument into "vm_flags" used internally:
 * MAP_GROWSDOWN, MAP_DENYWRITE, MAP_LOCKED and MAP_SYNC become their VM_*
 * counterparts, plus whatever bits arch_calc_vm_flag_bits() contributes.
 */
static inline unsigned long
calc_vm_flag_bits(unsigned long flags)
{
        unsigned long vm_bits = 0;

        vm_bits |= _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN );
        vm_bits |= _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE );
        vm_bits |= _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
        vm_bits |= _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      );
        vm_bits |= arch_calc_vm_flag_bits(flags);

        return vm_bits;
}

unsigned long vm_commit_limit(void);
#endif /* _LINUX_MMAN_H */


































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * workqueue.h --- work queue handling for Linux.
 */

#ifndef _LINUX_WORKQUEUE_H
#define _LINUX_WORKQUEUE_H

#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/rcupdate.h>

/* Only forward-declared in this header; used here through pointers. */
struct workqueue_struct;

/* Forward declaration so that work_func_t can name it; defined below. */
struct work_struct;
/* Type of the callback stored in work_struct.func. */
typedef void (*work_func_t)(struct work_struct *work);
/* Timer callback installed in delayed_work.timer by the initializers below. */
void delayed_work_timer_fn(struct timer_list *t);

/*
 * The first word is the work queue pointer and the flags rolled into
 * one.  work_data_bits() exposes that word (@work->data) as a plain
 * 'unsigned long *' so that bit helpers such as test_bit() (see
 * work_pending()) can operate on it.
 */
#define work_data_bits(work) ((unsigned long *)(&(work)->data))

/*
 * Layout of work_struct.data and related constants.  The *_BIT values are
 * bit numbers; the matching names without _BIT are the corresponding masks.
 */
enum {
        WORK_STRUCT_PENDING_BIT        = 0,        /* work item is pending execution */
        WORK_STRUCT_INACTIVE_BIT= 1,        /* work item is inactive */
        WORK_STRUCT_PWQ_BIT        = 2,        /* data points to pwq */
        WORK_STRUCT_LINKED_BIT        = 3,        /* next work is linked to this one */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC_BIT        = 4,        /* static initializer (debugobjects) */
        WORK_STRUCT_COLOR_SHIFT        = 5,        /* color for workqueue flushing */
#else
        WORK_STRUCT_COLOR_SHIFT        = 4,        /* color for workqueue flushing */
#endif

        /* width, in bits, of the flush color field starting at COLOR_SHIFT */
        WORK_STRUCT_COLOR_BITS        = 4,

        WORK_STRUCT_PENDING        = 1 << WORK_STRUCT_PENDING_BIT,
        WORK_STRUCT_INACTIVE        = 1 << WORK_STRUCT_INACTIVE_BIT,
        WORK_STRUCT_PWQ                = 1 << WORK_STRUCT_PWQ_BIT,
        WORK_STRUCT_LINKED        = 1 << WORK_STRUCT_LINKED_BIT,
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC        = 1 << WORK_STRUCT_STATIC_BIT,
#else
        /* no bit is reserved without debugobjects, so the mask is empty */
        WORK_STRUCT_STATIC        = 0,
#endif

        /*
         * The last color is no color used for works which don't
         * participate in workqueue flushing.
         */
        WORK_NR_COLORS                = (1 << WORK_STRUCT_COLOR_BITS) - 1,
        WORK_NO_COLOR                = WORK_NR_COLORS,

        /* not bound to any CPU, prefer the local CPU */
        WORK_CPU_UNBOUND        = NR_CPUS,

        /*
         * Reserve 8 bits off of pwq pointer w/ debugobjects turned off.
         * This makes pwqs aligned to 256 bytes and allows 15 workqueue
         * flush colors.
         */
        WORK_STRUCT_FLAG_BITS        = WORK_STRUCT_COLOR_SHIFT +
                                  WORK_STRUCT_COLOR_BITS,

        /* data contains off-queue information when !WORK_STRUCT_PWQ */
        WORK_OFFQ_FLAG_BASE        = WORK_STRUCT_COLOR_SHIFT,

        /* bit number; the mask is WORK_OFFQ_CANCELING, defined below the enum */
        __WORK_OFFQ_CANCELING        = WORK_OFFQ_FLAG_BASE,

        /*
         * When a work item is off queue, its high bits point to the last
         * pool it was on.  Cap at 31 bits and use the highest number to
         * indicate that no pool is associated.
         */
        WORK_OFFQ_FLAG_BITS        = 1,
        WORK_OFFQ_POOL_SHIFT        = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS,
        WORK_OFFQ_LEFT                = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT,
        WORK_OFFQ_POOL_BITS        = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31,

        /* bit mask for work_busy() return values */
        WORK_BUSY_PENDING        = 1 << 0,
        WORK_BUSY_RUNNING        = 1 << 1,

        /* maximum string length for set_worker_desc() */
        WORKER_DESC_LEN                = 24,
};

/* Convenience constants - of type 'unsigned long', not 'enum'! */
#define WORK_OFFQ_CANCELING        (1ul << __WORK_OFFQ_CANCELING)
#define WORK_OFFQ_POOL_NONE        ((1ul << WORK_OFFQ_POOL_BITS) - 1)
#define WORK_STRUCT_NO_POOL        (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT)

#define WORK_STRUCT_FLAG_MASK    ((1ul << WORK_STRUCT_FLAG_BITS) - 1)
#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)

/* A single unit of deferred work. */
struct work_struct {
        /* flags plus pwq pointer / off-queue info in one word, see work_data_bits() */
        atomic_long_t data;
        /* list linkage; the initializers below set it up as an empty list head */
        struct list_head entry;
        /* callback associated with this work item */
        work_func_t func;
#ifdef CONFIG_LOCKDEP
        /* lockdep state, set up by __WORK_INIT_LOCKDEP_MAP() / __INIT_WORK() */
        struct lockdep_map lockdep_map;
#endif
};

/*
 * Initial values for work_struct.data: "no pool associated", with the
 * WORK_STRUCT_STATIC flag added for statically initialized items (that flag
 * is 0 unless CONFIG_DEBUG_OBJECTS_WORK is set).
 */
#define WORK_DATA_INIT()        ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
#define WORK_DATA_STATIC_INIT()        \
        ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))

/* A work item paired with the timer that queues it later. */
struct delayed_work {
        /* the embedded work item; to_delayed_work() maps it back to this struct */
        struct work_struct work;
        /* timer whose callback is delayed_work_timer_fn() */
        struct timer_list timer;

        /* target workqueue and CPU ->timer uses to queue ->work */
        struct workqueue_struct *wq;
        int cpu;
};

/* A work item paired with an rcu_head, queued with queue_rcu_work(). */
struct rcu_work {
        /* the embedded work item; to_rcu_work() maps it back to this struct */
        struct work_struct work;
        struct rcu_head rcu;

        /* target workqueue ->rcu uses to queue ->work */
        struct workqueue_struct *wq;
};

/**
 * struct workqueue_attrs - A struct for workqueue attributes.
 *
 * This can be used to change attributes of an unbound workqueue.
 * Instances are handled through alloc_workqueue_attrs(),
 * apply_workqueue_attrs() and free_workqueue_attrs(), declared below.
 */
struct workqueue_attrs {
        /**
         * @nice: nice level
         */
        int nice;

        /**
         * @cpumask: allowed CPUs
         */
        cpumask_var_t cpumask;

        /**
         * @no_numa: disable NUMA affinity
         *
         * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It
         * only modifies how :c:func:`apply_workqueue_attrs` selects pools and thus
         * doesn't participate in pool hash calculations or equality comparisons.
         */
        bool no_numa;
};

/**
 * to_delayed_work - get the delayed_work that embeds a work item
 * @work: pointer to the ->work member of a struct delayed_work
 *
 * Return: the enclosing struct delayed_work, computed with container_of().
 * The result is only meaningful if @work really is embedded in one.
 */
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
{
        return container_of(work, struct delayed_work, work);
}

/**
 * to_rcu_work - get the rcu_work that embeds a work item
 * @work: pointer to the ->work member of a struct rcu_work
 *
 * Return: the enclosing struct rcu_work, computed with container_of().
 * The result is only meaningful if @work really is embedded in one.
 */
static inline struct rcu_work *to_rcu_work(struct work_struct *work)
{
        return container_of(work, struct rcu_work, work);
}

/* Wrapper around a work item, taken by execute_in_process_context(). */
struct execute_work {
        struct work_struct work;
};

#ifdef CONFIG_LOCKDEP
/*
 * NB: because we have to copy the lockdep_map, setting _key
 * here is required, otherwise it could get initialised to the
 * copy of the lockdep_map!
 *
 * Expands to a designated initializer for work_struct.lockdep_map
 * (including the trailing comma) with name @n and key @k; expands to
 * nothing when lockdep is disabled.
 */
#define __WORK_INIT_LOCKDEP_MAP(n, k) \
        .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k),
#else
#define __WORK_INIT_LOCKDEP_MAP(n, k)
#endif

/*
 * Static initializer for the work_struct named @n with callback @f:
 * data is set to WORK_DATA_STATIC_INIT(), the list entry points at itself
 * (an empty list), and the lockdep map, if any, is named after @n and keyed
 * on its address.
 */
#define __WORK_INITIALIZER(n, f) {                                        \
        .data = WORK_DATA_STATIC_INIT(),                                \
        .entry        = { &(n).entry, &(n).entry },                                \
        .func = (f),                                                        \
        __WORK_INIT_LOCKDEP_MAP(#n, &(n))                                \
        }

/*
 * Static initializer for the delayed_work named @n with callback @f.
 * The timer is set up to call delayed_work_timer_fn() and always gets
 * TIMER_IRQSAFE in addition to the caller's @tflags.
 */
#define __DELAYED_WORK_INITIALIZER(n, f, tflags) {                        \
        .work = __WORK_INITIALIZER((n).work, (f)),                        \
        .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\
                                     (tflags) | TIMER_IRQSAFE),                \
        }

/* Define and statically initialize a work_struct named @n running @f. */
#define DECLARE_WORK(n, f)                                                \
        struct work_struct n = __WORK_INITIALIZER(n, f)

/* Same for a delayed_work, with no extra timer flags. */
#define DECLARE_DELAYED_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0)

/* Same for a delayed_work whose timer is created with TIMER_DEFERRABLE. */
#define DECLARE_DEFERRABLE_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE)

/*
 * Debugobjects hooks for work items.  With CONFIG_DEBUG_OBJECTS_WORK they
 * are real functions defined elsewhere; without it they are empty inline
 * stubs and work_static() always returns 0.
 */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
extern void __init_work(struct work_struct *work, int onstack);
extern void destroy_work_on_stack(struct work_struct *work);
extern void destroy_delayed_work_on_stack(struct delayed_work *work);
/* Non-zero if @work has the WORK_STRUCT_STATIC flag set in its data word. */
static inline unsigned int work_static(struct work_struct *work)
{
        return *work_data_bits(work) & WORK_STRUCT_STATIC;
}
#else
static inline void __init_work(struct work_struct *work, int onstack) { }
static inline void destroy_work_on_stack(struct work_struct *work) { }
static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { }
static inline unsigned int work_static(struct work_struct *work) { return 0; }
#endif

/*
 * initialize all of a work item in one go
 *
 * NOTE! No point in using "atomic_long_set()": using a direct
 * assignment of the work data initializer allows the compiler
 * to generate better code.
 *
 * @_work is a pointer to the work_struct, @_func its callback and
 * @_onstack is passed through to __init_work().  The lockdep variant
 * additionally registers a lockdep map keyed on a static key that is
 * unique to each expansion site of the macro.
 */
#ifdef CONFIG_LOCKDEP
#define __INIT_WORK(_work, _func, _onstack)                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                                                                        \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#else
#define __INIT_WORK(_work, _func, _onstack)                                \
        do {                                                                \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#endif

/* Runtime initialization of a work item; passes onstack == 0 to __INIT_WORK(). */
#define INIT_WORK(_work, _func)                                                \
        __INIT_WORK((_work), (_func), 0)

/* Same, but passes onstack == 1 for work items that live on the stack. */
#define INIT_WORK_ONSTACK(_work, _func)                                        \
        __INIT_WORK((_work), (_func), 1)

/*
 * Runtime initialization of a delayed_work: initialize the embedded work
 * item, then set up the timer to call delayed_work_timer_fn() with
 * TIMER_IRQSAFE OR-ed into @_tflags.
 */
#define __INIT_DELAYED_WORK(_work, _func, _tflags)                        \
        do {                                                                \
                INIT_WORK(&(_work)->work, (_func));                        \
                __init_timer(&(_work)->timer,                                \
                             delayed_work_timer_fn,                        \
                             (_tflags) | TIMER_IRQSAFE);                \
        } while (0)

/* On-stack counterpart: uses INIT_WORK_ONSTACK() and __init_timer_on_stack(). */
#define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags)                \
        do {                                                                \
                INIT_WORK_ONSTACK(&(_work)->work, (_func));                \
                __init_timer_on_stack(&(_work)->timer,                        \
                                      delayed_work_timer_fn,                \
                                      (_tflags) | TIMER_IRQSAFE);        \
        } while (0)

/* Delayed work with no extra timer flags. */
#define INIT_DELAYED_WORK(_work, _func)                                        \
        __INIT_DELAYED_WORK(_work, _func, 0)

#define INIT_DELAYED_WORK_ONSTACK(_work, _func)                                \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0)

/* Delayed work whose timer is initialized with TIMER_DEFERRABLE. */
#define INIT_DEFERRABLE_WORK(_work, _func)                                \
        __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE)

#define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func)                        \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE)

/* rcu_work: only the embedded work item is initialized here. */
#define INIT_RCU_WORK(_work, _func)                                        \
        INIT_WORK(&(_work)->work, (_func))

#define INIT_RCU_WORK_ONSTACK(_work, _func)                                \
        INIT_WORK_ONSTACK(&(_work)->work, (_func))

/**
 * work_pending - Find out whether a work item is currently pending
 * @work: The work item in question
 *
 * Tests %WORK_STRUCT_PENDING_BIT in the work item's data word.
 */
#define work_pending(work) \
        test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))

/**
 * delayed_work_pending - Find out whether a delayable work item is currently
 * pending
 * @w: The work item in question
 *
 * Applies work_pending() to the work item embedded in @w.
 */
#define delayed_work_pending(w) \
        work_pending(&(w)->work)

/*
 * Workqueue flags and constants.  For details, please refer to
 * Documentation/core-api/workqueue.rst.
 */
enum {
        WQ_UNBOUND                = 1 << 1, /* not bound to any cpu */
        WQ_FREEZABLE                = 1 << 2, /* freeze during suspend */
        WQ_MEM_RECLAIM                = 1 << 3, /* may be used for memory reclaim */
        WQ_HIGHPRI                = 1 << 4, /* high priority */
        WQ_CPU_INTENSIVE        = 1 << 5, /* cpu intensive workqueue */
        WQ_SYSFS                = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */

        /*
         * Per-cpu workqueues are generally preferred because they tend to
         * show better performance thanks to cache locality.  Per-cpu
         * workqueues exclude the scheduler from choosing the CPU to
         * execute the worker threads, which has an unfortunate side effect
         * of increasing power consumption.
         *
         * The scheduler considers a CPU idle if it doesn't have any task
         * to execute and tries to keep idle cores idle to conserve power;
         * however, for example, a per-cpu work item scheduled from an
         * interrupt handler on an idle CPU will force the scheduler to
         * execute the work item on that CPU breaking the idleness, which in
         * turn may lead to more scheduling choices which are sub-optimal
         * in terms of power consumption.
         *
         * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
         * but become unbound if workqueue.power_efficient kernel param is
         * specified.  Per-cpu workqueues which are found to contribute
         * significantly to power consumption are marked with this flag,
         * and enabling the power_efficient mode leads to noticeable power
         * saving at the cost of a small performance disadvantage.
         *
         * http://thread.gmane.org/gmane.linux.kernel/1480396
         */
        WQ_POWER_EFFICIENT        = 1 << 7,

        __WQ_DRAINING                = 1 << 16, /* internal: workqueue is draining */
        __WQ_ORDERED                = 1 << 17, /* internal: workqueue is ordered */
        __WQ_LEGACY                = 1 << 18, /* internal: create*_workqueue() */
        __WQ_ORDERED_EXPLICIT        = 1 << 19, /* internal: alloc_ordered_workqueue() */

        WQ_MAX_ACTIVE                = 512,          /* I like 512, better ideas? */
        WQ_MAX_UNBOUND_PER_CPU        = 4,          /* 4 * #cpus for unbound wq */
        WQ_DFL_ACTIVE                = WQ_MAX_ACTIVE / 2, /* half of WQ_MAX_ACTIVE */
};

/*
 * unbound wq's aren't per-cpu, scale max_active according to #cpus:
 * the larger of WQ_MAX_ACTIVE and WQ_MAX_UNBOUND_PER_CPU times the number
 * of possible CPUs, compared as int.
 */
#define WQ_UNBOUND_MAX_ACTIVE        \
        max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU)

/*
 * System-wide workqueues which are always present.
 *
 * system_wq is the one used by schedule[_delayed]_work[_on]().
 * Multi-CPU multi-threaded.  There are users which expect relatively
 * short queue flush time.  Don't queue works which can run for too
 * long.
 *
 * system_highpri_wq is similar to system_wq but for work items which
 * require WQ_HIGHPRI.
 *
 * system_long_wq is similar to system_wq but may host long running
 * works.  Queue flushing might take relatively long.
 *
 * system_unbound_wq is an unbound workqueue.  Workers are not bound to
 * any specific CPU, not concurrency managed, and all queued works are
 * executed immediately as long as max_active limit is not reached and
 * resources are available.
 *
 * system_freezable_wq is equivalent to system_wq except that it's
 * freezable.
 *
 * *_power_efficient_wq are inclined towards saving power and converted
 * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
 * they are the same as their non-power-efficient counterparts - e.g.
 * system_power_efficient_wq is identical to system_wq if
 * 'wq_power_efficient' is disabled.  See WQ_POWER_EFFICIENT for more info.
 */
extern struct workqueue_struct *system_wq;
extern struct workqueue_struct *system_highpri_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_freezable_wq;
extern struct workqueue_struct *system_power_efficient_wq;
extern struct workqueue_struct *system_freezable_power_efficient_wq;

/**
 * alloc_workqueue - allocate a workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags
 * @max_active: max in-flight work items, 0 for default
 * @...: args for @fmt
 *
 * Allocate a workqueue with the specified parameters.  For detailed
 * information on WQ_* flags, please refer to
 * Documentation/core-api/workqueue.rst.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
struct workqueue_struct *alloc_workqueue(const char *fmt,
                                         unsigned int flags,
                                         int max_active, ...);

/**
 * alloc_ordered_workqueue - allocate an ordered workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
 * @args...: args for @fmt
 *
 * Allocate an ordered workqueue.  An ordered workqueue executes at
 * most one work item at any given time in the queued order.  They are
 * implemented as unbound workqueues with @max_active of one: the macro
 * calls alloc_workqueue() with %WQ_UNBOUND, %__WQ_ORDERED and
 * %__WQ_ORDERED_EXPLICIT OR-ed into @flags and a max_active of 1.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
#define alloc_ordered_workqueue(fmt, flags, args...)                        \
        alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED |                \
                        __WQ_ORDERED_EXPLICIT | (flags), 1, ##args)

/*
 * Legacy creation helpers.  Each one marks the workqueue __WQ_LEGACY and
 * WQ_MEM_RECLAIM and uses @name verbatim as its name (through a "%s"
 * format, so '%' characters in @name are not interpreted).
 *
 * create_workqueue():              max_active of 1
 * create_freezable_workqueue():    additionally WQ_FREEZABLE | WQ_UNBOUND,
 *                                  max_active of 1
 * create_singlethread_workqueue(): an ordered workqueue, see
 *                                  alloc_ordered_workqueue()
 */
#define create_workqueue(name)                                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
#define create_freezable_workqueue(name)                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND |        \
                        WQ_MEM_RECLAIM, 1, (name))
#define create_singlethread_workqueue(name)                                \
        alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)

/* Counterpart of alloc_workqueue(); defined outside this header. */
extern void destroy_workqueue(struct workqueue_struct *wq);

/* Allocation, release and application of struct workqueue_attrs. */
struct workqueue_attrs *alloc_workqueue_attrs(void);
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs);
int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);

/*
 * Out-of-line queueing primitives; the static inline wrappers further down
 * in this header are implemented on top of these.
 */
extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
                        struct work_struct *work);
extern bool queue_work_node(int node, struct workqueue_struct *wq,
                            struct work_struct *work);
extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *work, unsigned long delay);
extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *dwork, unsigned long delay);
extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork);

/* Whole-workqueue operations. */
extern void flush_workqueue(struct workqueue_struct *wq);
extern void drain_workqueue(struct workqueue_struct *wq);

extern int schedule_on_each_cpu(work_func_t func);

int execute_in_process_context(work_func_t fn, struct execute_work *);

/* Flush/cancel for plain work items. */
extern bool flush_work(struct work_struct *work);
extern bool cancel_work(struct work_struct *work);
extern bool cancel_work_sync(struct work_struct *work);

/* Flush/cancel for delayed work items. */
extern bool flush_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work_sync(struct delayed_work *dwork);

extern bool flush_rcu_work(struct rcu_work *rwork);

/* Tuning, introspection and debugging helpers. */
extern void workqueue_set_max_active(struct workqueue_struct *wq,
                                     int max_active);
extern struct work_struct *current_work(void);
extern bool current_is_workqueue_rescuer(void);
extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
/* returns a mask of WORK_BUSY_* bits */
extern unsigned int work_busy(struct work_struct *work);
/* description length is limited by WORKER_DESC_LEN */
extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
extern void show_workqueue_state(void);
extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task);

/**
 * queue_work - queue work on a workqueue
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Returns %false if @work was already on a queue, %true otherwise.
 *
 * We queue the work to the CPU on which it was submitted, but if the CPU dies
 * it can be processed by another CPU.
 *
 * This is queue_work_on() called with %WORK_CPU_UNBOUND as the CPU.
 *
 * Memory-ordering properties:  If it returns %true, guarantees that all stores
 * preceding the call to queue_work() in the program order will be visible from
 * the CPU which will execute @work by the time such work executes, e.g.,
 *
 * { x is initially 0 }
 *
 *   CPU0                                CPU1
 *
 *   WRITE_ONCE(x, 1);                        [ @work is being executed ]
 *   r0 = queue_work(wq, work);                  r1 = READ_ONCE(x);
 *
 * Forbids: r0 == true && r1 == 0
 */
static inline bool queue_work(struct workqueue_struct *wq,
                              struct work_struct *work)
{
        return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}

/**
 * queue_delayed_work - queue work on a workqueue after delay
 * @wq: workqueue to use
 * @dwork: delayable work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Equivalent to queue_delayed_work_on() but tries to use the local CPU
 * (it passes %WORK_CPU_UNBOUND as the CPU).
 *
 * Return: whatever queue_delayed_work_on() returns.
 */
static inline bool queue_delayed_work(struct workqueue_struct *wq,
                                      struct delayed_work *dwork,
                                      unsigned long delay)
{
        return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}

/**
 * mod_delayed_work - modify delay of or queue a delayed work
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * mod_delayed_work_on() on local CPU: the call passes %WORK_CPU_UNBOUND,
 * i.e. not bound to any CPU but preferring the local one.
 *
 * Return: whatever mod_delayed_work_on() returns.
 */
static inline bool mod_delayed_work(struct workqueue_struct *wq,
                                    struct delayed_work *dwork,
                                    unsigned long delay)
{
        return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}

/**
 * schedule_work_on - put work task on a specific cpu
 * @cpu: cpu to put the work task on
 * @work: job to be done
 *
 * This puts a job on a specific cpu, using the kernel-global workqueue
 * (system_wq).
 *
 * Return: whatever queue_work_on() returns.
 */
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
        return queue_work_on(cpu, system_wq, work);
}

/**
 * schedule_work - put work task in global workqueue
 * @work: job to be done
 *
 * Returns %false if @work was already on the kernel-global workqueue and
 * %true otherwise.
 *
 * This puts a job in the kernel-global workqueue if it was not already
 * queued and leaves it in the same position on the kernel-global
 * workqueue otherwise.
 *
 * Implemented as queue_work() on system_wq.
 *
 * Shares the same memory-ordering properties of queue_work(), cf. the
 * DocBook header of queue_work().
 */
static inline bool schedule_work(struct work_struct *work)
{
        return queue_work(system_wq, work);
}

/**
 * flush_scheduled_work - ensure that any scheduled work has run to completion.
 *
 * Forces execution of the kernel-global workqueue and blocks until its
 * completion.  Implemented as flush_workqueue() on system_wq.
 *
 * Think twice before calling this function!  It's very easy to get into
 * trouble if you don't take great care.  Either of the following situations
 * will lead to deadlock:
 *
 *        One of the work items currently on the workqueue needs to acquire
 *        a lock held by your code or its caller.
 *
 *        Your code is running in the context of a work routine.
 *
 * They will be detected by lockdep when they occur, but the first might not
 * occur very often.  It depends on what work items are on the workqueue and
 * what locks they need, which you have no control over.
 *
 * In most situations flushing the entire workqueue is overkill; you merely
 * need to know that a particular work item isn't queued and isn't running.
 * In such cases you should use cancel_delayed_work_sync() or
 * cancel_work_sync() instead.
 */
static inline void flush_scheduled_work(void)
{
        flush_workqueue(system_wq);
}

/**
 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
 * @cpu: cpu to use
 * @dwork: job to be done
 * @delay: number of jiffies to wait
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue on the specified CPU.
 *
 * Return: whatever queue_delayed_work_on() returns for system_wq.
 */
static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
                                            unsigned long delay)
{
        return queue_delayed_work_on(cpu, system_wq, dwork, delay);
}

/**
 * schedule_delayed_work - put work task in global workqueue after delay
 * @dwork: job to be done
 * @delay: number of jiffies to wait or 0 for immediate execution
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue.
 *
 * Return: whatever queue_delayed_work() returns for system_wq.
 */
static inline bool schedule_delayed_work(struct delayed_work *dwork,
                                         unsigned long delay)
{
        return queue_delayed_work(system_wq, dwork, delay);
}

/*
 * work_on_cpu() / work_on_cpu_safe(): run @fn(@arg) and return its result.
 * Without CONFIG_SMP both are inline stubs that call @fn directly in the
 * caller's context and ignore @cpu; with CONFIG_SMP they are out-of-line
 * functions defined elsewhere.
 */
#ifndef CONFIG_SMP
static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
        return fn(arg);
}
static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
        return fn(arg);
}
#else
long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
#endif /* CONFIG_SMP */

/* Freezer hooks; only declared when CONFIG_FREEZER is enabled. */
#ifdef CONFIG_FREEZER
extern void freeze_workqueues_begin(void);
extern bool freeze_workqueues_busy(void);
extern void thaw_workqueues(void);
#endif /* CONFIG_FREEZER */

/* Without CONFIG_SYSFS, registration is a no-op stub that returns 0. */
#ifdef CONFIG_SYSFS
int workqueue_sysfs_register(struct workqueue_struct *wq);
#else        /* CONFIG_SYSFS */
static inline int workqueue_sysfs_register(struct workqueue_struct *wq)
{ return 0; }
#endif        /* CONFIG_SYSFS */

/* Without CONFIG_WQ_WATCHDOG, touching the watchdog is an empty stub. */
#ifdef CONFIG_WQ_WATCHDOG
void wq_watchdog_touch(int cpu);
#else        /* CONFIG_WQ_WATCHDOG */
static inline void wq_watchdog_touch(int cpu) { }
#endif        /* CONFIG_WQ_WATCHDOG */

/* Per-CPU prepare/online/offline callbacks; only declared on SMP builds. */
#ifdef CONFIG_SMP
int workqueue_prepare_cpu(unsigned int cpu);
int workqueue_online_cpu(unsigned int cpu);
int workqueue_offline_cpu(unsigned int cpu);
#endif

/* Initialization entry points; both are marked __init. */
void __init workqueue_init_early(void);
void __init workqueue_init(void);

#endif

























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SIGNAL_H
#define _LINUX_SIGNAL_H

#include <linux/bug.h>
#include <linux/signal_types.h>
#include <linux/string.h>

/* Forward declaration only; the structure is defined elsewhere. */
struct task_struct;

/* for sysctl */
extern int print_fatal_signals;

/*
 * Copy the kernel siginfo @from into @to.  The copy is a memcpy() of
 * sizeof(*to) bytes, i.e. the whole object byte for byte.
 */
static inline void copy_siginfo(kernel_siginfo_t *to,
                                const kernel_siginfo_t *from)
{
        memcpy(to, from, sizeof(*to));
}

/* Zero every byte of the kernel siginfo @info. */
static inline void clear_siginfo(kernel_siginfo_t *info)
{
        memset(info, 0, sizeof(*info));
}

/* Size difference between the external struct siginfo and struct kernel_siginfo. */
#define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo))

/*
 * Fill the external siginfo @to from the kernel-internal @from: the first
 * sizeof(*from) bytes are copied verbatim, and the SI_EXPANSION_SIZE bytes
 * located sizeof(struct kernel_siginfo) into @to are zeroed.
 */
static inline void copy_siginfo_to_external(siginfo_t *to,
                                            const kernel_siginfo_t *from)
{
        char *expansion = (char *)to + sizeof(struct kernel_siginfo);

        memcpy(to, from, sizeof(*from));
        memset(expansion, 0, SI_EXPANSION_SIZE);
}

/* Transfer a siginfo between a kernel_siginfo_t and a __user siginfo_t. */
int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from);
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from);

/*
 * Classification returned by siginfo_layout().
 * NOTE(review): presumably each value names the set of siginfo fields that
 * is in use for a given signal/si_code pair - confirm against the
 * definition of siginfo_layout().
 */
enum siginfo_layout {
        SIL_KILL,
        SIL_TIMER,
        SIL_POLL,
        SIL_FAULT,
        SIL_FAULT_MCEERR,
        SIL_FAULT_BNDERR,
        SIL_FAULT_PKUERR,
        SIL_CHLD,
        SIL_RT,
        SIL_SYS,
};

/* Map a signal number and si_code to one of the layouts above. */
enum siginfo_layout siginfo_layout(unsigned sig, int si_code);

/*
 * Define some primitives to manipulate sigset_t.
 */

#ifndef __HAVE_ARCH_SIG_BITOPS
#include <linux/bitops.h>

/*
 * The sigset bit helpers are open-coded rather than built on the atomic
 * operations from <linux/bitops.h> because there is no need to be atomic.
 *
 * sigaddset - mark signal @_sig as a member of @set.  Signal numbers are
 * 1-based, so signal @_sig lives at bit (_sig - 1).
 */
static inline void sigaddset(sigset_t *set, int _sig)
{
        const unsigned long bit = _sig - 1;

        if (_NSIG_WORDS == 1) {
                set->sig[0] |= 1UL << bit;
                return;
        }

        set->sig[bit / _NSIG_BPW] |= 1UL << (bit % _NSIG_BPW);
}

/*
 * sigdelset - remove signal @_sig from @set (non-atomic).  Signal numbers
 * are 1-based, so signal @_sig lives at bit (_sig - 1).
 */
static inline void sigdelset(sigset_t *set, int _sig)
{
        const unsigned long bit = _sig - 1;

        if (_NSIG_WORDS == 1) {
                set->sig[0] &= ~(1UL << bit);
                return;
        }

        set->sig[bit / _NSIG_BPW] &= ~(1UL << (bit % _NSIG_BPW));
}

static inline int sigismember(sigset_t *set, int _sig)
{
        unsigned long sig = _sig - 1;
        if (_NSIG_WORDS == 1)
                return 1 & (set->sig[0] >> sig);
        else
                return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW));
}

#endif /* __HAVE_ARCH_SIG_BITOPS */

static inline int sigisemptyset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 4:
                return (set->sig[3] | set->sig[2] |
                        set->sig[1] | set->sig[0]) == 0;
        case 2:
                return (set->sig[1] | set->sig[0]) == 0;
        case 1:
                return set->sig[0] == 0;
        default:
                BUILD_BUG();
                return 0;
        }
}

/*
 * sigequalsets - compare two signal sets word by word.
 *
 * Returns non-zero when every word of @set1 equals the matching word of
 * @set2.  Only 1, 2 and 4 words per set are handled; for any other
 * _NSIG_WORDS the function returns 0.
 */
static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
{
        if (_NSIG_WORDS == 4)
                return (set1->sig[3] == set2->sig[3]) &&
                       (set1->sig[2] == set2->sig[2]) &&
                       (set1->sig[1] == set2->sig[1]) &&
                       (set1->sig[0] == set2->sig[0]);

        if (_NSIG_WORDS == 2)
                return (set1->sig[1] == set2->sig[1]) &&
                       (set1->sig[0] == set2->sig[0]);

        if (_NSIG_WORDS == 1)
                return set1->sig[0] == set2->sig[0];

        return 0;
}

/*
 * Single-word bit mask for signal number @sig: signal 1 maps to bit 0.
 * The shift is only defined while (sig - 1) is smaller than the number of
 * bits in an unsigned long.
 */
#define sigmask(sig)        (1UL << ((sig) - 1))

#ifndef __HAVE_ARCH_SIG_SETOPS
#include <linux/string.h>

/*
 * Generate "void name(r, a, b)", which stores op(a, b) into @r word by word.
 * The switch enters at the case matching _NSIG_WORDS and falls through the
 * lower cases, so 4 words handle sig[3..0], 2 words sig[1..0] and 1 word
 * sig[0]; any other word count fails the build via BUILD_BUG().  Both source
 * words are loaded into locals before the result word is written.
 */
#define _SIG_SET_BINOP(name, op)                                        \
static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \
{                                                                        \
        unsigned long a0, a1, a2, a3, b0, b1, b2, b3;                        \
                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:                                                                \
                a3 = a->sig[3]; a2 = a->sig[2];                                \
                b3 = b->sig[3]; b2 = b->sig[2];                                \
                r->sig[3] = op(a3, b3);                                        \
                r->sig[2] = op(a2, b2);                                        \
                fallthrough;                                                \
        case 2:                                                                \
                a1 = a->sig[1]; b1 = b->sig[1];                                \
                r->sig[1] = op(a1, b1);                                        \
                fallthrough;                                                \
        case 1:                                                                \
                a0 = a->sig[0]; b0 = b->sig[0];                                \
                r->sig[0] = op(a0, b0);                                        \
                break;                                                        \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

#define _sig_or(x,y)        ((x) | (y))
_SIG_SET_BINOP(sigorsets, _sig_or)

#define _sig_and(x,y)        ((x) & (y))
_SIG_SET_BINOP(sigandsets, _sig_and)

#define _sig_andn(x,y)        ((x) & ~(y))
_SIG_SET_BINOP(sigandnsets, _sig_andn)

#undef _SIG_SET_BINOP
#undef _sig_or
#undef _sig_and
#undef _sig_andn

/*
 * _SIG_SET_OP(name, op) - generate a function 'name' that replaces every
 * word of *set with op(word), in place.
 *
 * The switch on _NSIG_WORDS falls through from the 4-word case down to the
 * 1-word case, so every word of the set is processed; any other word count
 * hits BUILD_BUG().
 */
#define _SIG_SET_OP(name, op)                                                \
static inline void name(sigset_t *set)                                        \
{                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:        set->sig[3] = op(set->sig[3]);                                \
                set->sig[2] = op(set->sig[2]);                                \
                fallthrough;                                                \
        case 2:        set->sig[1] = op(set->sig[1]);                                \
                fallthrough;                                                \
        case 1:        set->sig[0] = op(set->sig[0]);                                \
                    break;                                                \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

/* signotset(set): complement every word of the set in place. */
#define _sig_not(x)        (~(x))
_SIG_SET_OP(signotset, _sig_not)

/* The generator and its operator helper are only needed for the definition above. */
#undef _SIG_SET_OP
#undef _sig_not

/*
 * sigemptyset - clear every word of @set.
 *
 * One- and two-word sets are cleared with plain stores; any other word
 * count falls back to memset() over the whole sigset_t.
 */
static inline void sigemptyset(sigset_t *set)
{
        if (_NSIG_WORDS == 2) {
                set->sig[1] = 0;
                set->sig[0] = 0;
        } else if (_NSIG_WORDS == 1) {
                set->sig[0] = 0;
        } else {
                memset(set, 0, sizeof(sigset_t));
        }
}

/*
 * sigfillset - set every bit of every word of @set.
 *
 * One- and two-word sets are filled with plain stores of -1; any other
 * word count falls back to memset() over the whole sigset_t.
 */
static inline void sigfillset(sigset_t *set)
{
        if (_NSIG_WORDS == 2) {
                set->sig[1] = -1;
                set->sig[0] = -1;
        } else if (_NSIG_WORDS == 1) {
                set->sig[0] = -1;
        } else {
                memset(set, -1, sizeof(sigset_t));
        }
}

/* Some extensions for manipulating the low 32 signals in particular.  */

/* sigaddsetmask - OR the bits of @mask into the first word of @set. */
static inline void sigaddsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] |= mask;
}

/* sigdelsetmask - clear the bits of @mask from the first word of @set. */
static inline void sigdelsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] &= ~mask;
}

/*
 * sigtestsetmask - return 1 if any bit of @mask is set in the first word
 * of @set, 0 otherwise.
 */
static inline int sigtestsetmask(sigset_t *set, unsigned long mask)
{
        return (set->sig[0] & mask) != 0;
}

/*
 * siginitset - initialise @set so that its first word equals @mask and
 * every remaining word is zero.
 */
static inline void siginitset(sigset_t *set, unsigned long mask)
{
        set->sig[0] = mask;

        /* A single-word set has nothing beyond word 0 to initialise. */
        if (_NSIG_WORDS == 1)
                return;

        if (_NSIG_WORDS == 2)
                set->sig[1] = 0;
        else
                memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1));
}

/*
 * siginitsetinv - initialise @set so that its first word equals ~@mask and
 * every remaining word has all bits set.
 */
static inline void siginitsetinv(sigset_t *set, unsigned long mask)
{
        set->sig[0] = ~mask;

        /* A single-word set has nothing beyond word 0 to initialise. */
        if (_NSIG_WORDS == 1)
                return;

        if (_NSIG_WORDS == 2)
                set->sig[1] = -1;
        else
                memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1));
}

#endif /* __HAVE_ARCH_SIG_SETOPS */

/*
 * init_sigpending - initialise a struct sigpending: empty its signal set
 * and initialise its list head.
 */
static inline void init_sigpending(struct sigpending *sig)
{
        sigemptyset(&sig->signal);
        INIT_LIST_HEAD(&sig->list);
}

extern void flush_sigqueue(struct sigpending *queue);

/*
 * valid_signal - test whether 'sig' is a valid signal number.
 *
 * Use this instead of comparing against _NSIG directly.  Returns 1 for any
 * value from 0 up to and including _NSIG, and 0 for anything larger.
 */
static inline int valid_signal(unsigned long sig)
{
        return sig <= _NSIG;
}

struct timespec;
struct pt_regs;
enum pid_type;

extern int next_signal(struct sigpending *pending, sigset_t *mask);
extern int do_send_sig_info(int sig, struct kernel_siginfo *info,
                                struct task_struct *p, enum pid_type type);
extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
                               struct task_struct *p, enum pid_type type);
extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern int sigprocmask(int, sigset_t *, sigset_t *);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;

extern bool get_signal(struct ksignal *ksig);
extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
extern void exit_signals(struct task_struct *tsk);
extern void kernel_sigaction(int, __sighandler_t);

/*
 * Handler values passed to kernel_sigaction() by allow_signal() and
 * allow_kernel_signal() respectively.
 */
#define SIG_KTHREAD ((__force __sighandler_t)2)
#define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3)

/* allow_signal - install SIG_KTHREAD as the action for @sig via kernel_sigaction(). */
static inline void allow_signal(int sig)
{
        /*
         * Kernel threads handle their own signals. Let the signal code
         * know it'll be handled, so that they don't get converted to
         * SIGKILL or just silently dropped.
         */
        kernel_sigaction(sig, SIG_KTHREAD);
}

/* allow_kernel_signal - install SIG_KTHREAD_KERNEL as the action for @sig via kernel_sigaction(). */
static inline void allow_kernel_signal(int sig)
{
        /*
         * Kernel threads handle their own signals. Let the signal code
         * know signals sent by the kernel will be handled, so that they
         * don't get silently dropped.
         */
        kernel_sigaction(sig, SIG_KTHREAD_KERNEL);
}

/* disallow_signal - set the action for @sig to SIG_IGN via kernel_sigaction(). */
static inline void disallow_signal(int sig)
{
        kernel_sigaction(sig, SIG_IGN);
}

extern struct kmem_cache *sighand_cachep;

extern bool unhandled_signal(struct task_struct *tsk, int sig);

/*
 * In POSIX a signal is sent either to a specific thread (Linux task)
 * or to the process as a whole (Linux thread group).  How the signal
 * is sent determines whether it's to one thread or the whole group,
 * which determines which signal mask(s) are involved in blocking it
 * from being delivered until later.  When the signal is delivered,
 * either it's caught or ignored by a user handler or it has a default
 * effect that applies to the whole thread group (POSIX process).
 *
 * The possible effects an unblocked signal set to SIG_DFL can have are:
 *   ignore        - Nothing Happens
 *   terminate        - kill the process, i.e. all threads in the group,
 *                   similar to exit_group.  The group leader (only) reports
 *                  WIFSIGNALED status to its parent.
 *   coredump        - write a core dump file describing all threads using
 *                  the same mm and then kill all those threads
 *   stop         - stop all the threads in the group, i.e. TASK_STOPPED state
 *
 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
 * Other signals, when not blocked and set to SIG_DFL, behave as follows.
 * The job control signals also have other special effects.
 *
 *        +--------------------+------------------+
 *        |  POSIX signal      |  default action  |
 *        +--------------------+------------------+
 *        |  SIGHUP            |  terminate        |
 *        |  SIGINT            |        terminate        |
 *        |  SIGQUIT           |        coredump         |
 *        |  SIGILL            |        coredump         |
 *        |  SIGTRAP           |        coredump         |
 *        |  SIGABRT/SIGIOT    |        coredump         |
 *        |  SIGBUS            |        coredump         |
 *        |  SIGFPE            |        coredump         |
 *        |  SIGKILL           |        terminate(+)        |
 *        |  SIGUSR1           |        terminate        |
 *        |  SIGSEGV           |        coredump         |
 *        |  SIGUSR2           |        terminate        |
 *        |  SIGPIPE           |        terminate        |
 *        |  SIGALRM           |        terminate        |
 *        |  SIGTERM           |        terminate        |
 *        |  SIGCHLD           |        ignore           |
 *        |  SIGCONT           |        ignore(*)        |
 *        |  SIGSTOP           |        stop(*)(+)          |
 *        |  SIGTSTP           |        stop(*)          |
 *        |  SIGTTIN           |        stop(*)          |
 *        |  SIGTTOU           |        stop(*)          |
 *        |  SIGURG            |        ignore           |
 *        |  SIGXCPU           |        coredump         |
 *        |  SIGXFSZ           |        coredump         |
 *        |  SIGVTALRM         |        terminate        |
 *        |  SIGPROF           |        terminate        |
 *        |  SIGPOLL/SIGIO     |        terminate        |
 *        |  SIGSYS/SIGUNUSED  |        coredump         |
 *        |  SIGSTKFLT         |        terminate        |
 *        |  SIGWINCH          |        ignore           |
 *        |  SIGPWR            |        terminate        |
 *        |  SIGRTMIN-SIGRTMAX |        terminate       |
 *        +--------------------+------------------+
 *        |  non-POSIX signal  |  default action  |
 *        +--------------------+------------------+
 *        |  SIGEMT            |  coredump        |
 *        +--------------------+------------------+
 *
 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
 * (*) Special job control effects:
 * When SIGCONT is sent, it resumes the process (all threads in the group)
 * from TASK_STOPPED state and also clears any pending/queued stop signals
 * (any of those marked with "stop(*)").  This happens regardless of blocking,
 * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears
 * any pending/queued SIGCONT signals; this happens regardless of blocking,
 * catching, or ignoring the stop signal, though (except for SIGSTOP) the
 * default action of stopping the process may happen later or never.
 */

/* Mask bit for SIGEMT when that signal is defined, otherwise 0. */
#ifdef SIGEMT
#define SIGEMT_MASK        rt_sigmask(SIGEMT)
#else
#define SIGEMT_MASK        0
#endif

/*
 * rt_sigmask(sig) - mask bit for signal 'sig'.  When SIGRTMIN is larger
 * than BITS_PER_LONG the bit is built from 1ULL instead of going through
 * sigmask(), which shifts 1UL.
 */
#if SIGRTMIN > BITS_PER_LONG
#define rt_sigmask(sig)        (1ULL << ((sig)-1))
#else
#define rt_sigmask(sig)        sigmask(sig)
#endif

/*
 * siginmask(sig, mask) - true when 0 < sig < SIGRTMIN and the bit for 'sig'
 * is set in 'mask'.  Note that 'sig' is evaluated more than once.
 */
#define siginmask(sig, mask) \
        ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask)))

/* SIGKILL and SIGSTOP: the signals that cannot be caught, blocked, or ignored. */
#define SIG_KERNEL_ONLY_MASK (\
        rt_sigmask(SIGKILL)   |  rt_sigmask(SIGSTOP))

/* The stop signals: SIGSTOP, SIGTSTP, SIGTTIN and SIGTTOU. */
#define SIG_KERNEL_STOP_MASK (\
        rt_sigmask(SIGSTOP)   |  rt_sigmask(SIGTSTP)   | \
        rt_sigmask(SIGTTIN)   |  rt_sigmask(SIGTTOU)   )

/* Signals whose default action is a core dump (plus SIGEMT where defined). */
#define SIG_KERNEL_COREDUMP_MASK (\
        rt_sigmask(SIGQUIT)   |  rt_sigmask(SIGILL)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGABRT)   | \
        rt_sigmask(SIGFPE)    |  rt_sigmask(SIGSEGV)   | \
        rt_sigmask(SIGBUS)    |  rt_sigmask(SIGSYS)    | \
        rt_sigmask(SIGXCPU)   |  rt_sigmask(SIGXFSZ)   | \
        SIGEMT_MASK                                       )

/* Signals whose default action is to be ignored. */
#define SIG_KERNEL_IGNORE_MASK (\
        rt_sigmask(SIGCONT)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGWINCH)  |  rt_sigmask(SIGURG)    )

/*
 * NOTE(review): presumably the signals that carry signal-specific si_code
 * values - confirm against the users of sig_specific_sicodes().
 */
#define SIG_SPECIFIC_SICODES_MASK (\
        rt_sigmask(SIGILL)    |  rt_sigmask(SIGFPE)    | \
        rt_sigmask(SIGSEGV)   |  rt_sigmask(SIGBUS)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGPOLL)   |  rt_sigmask(SIGSYS)    | \
        SIGEMT_MASK                                    )

/* Membership tests of 'sig' against each of the masks above. */
#define sig_kernel_only(sig)                siginmask(sig, SIG_KERNEL_ONLY_MASK)
#define sig_kernel_coredump(sig)        siginmask(sig, SIG_KERNEL_COREDUMP_MASK)
#define sig_kernel_ignore(sig)                siginmask(sig, SIG_KERNEL_IGNORE_MASK)
#define sig_kernel_stop(sig)                siginmask(sig, SIG_KERNEL_STOP_MASK)
#define sig_specific_sicodes(sig)        siginmask(sig, SIG_SPECIFIC_SICODES_MASK)

/*
 * sig_fatal(t, signr) - true when 'signr' is in neither the ignore mask nor
 * the stop mask and task 't' has its handler for 'signr' set to SIG_DFL.
 */
#define sig_fatal(t, signr) \
        (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
         (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)

void signals_init(void);

int restore_altstack(const stack_t __user *);
int __save_altstack(stack_t __user *, unsigned long);

/*
 * unsafe_save_altstack - store current's alternate signal stack settings
 * (sas_ss_sp, sas_ss_flags, sas_ss_size) into the user-space stack_t at @uss.
 *
 * @uss:   user pointer to the stack_t to fill in
 * @sp:    not used by this macro
 * @label: passed through to each unsafe_put_user() call (presumably the
 *         label jumped to on a faulting store - confirm against
 *         unsafe_put_user())
 *
 * If SS_AUTODISARM is set in current's sas_ss_flags, sas_ss_reset() is
 * called on current after the three stores.
 *
 * The expansion deliberately has no trailing semicolon: the caller's own
 * ';' completes the do { } while (0), so "unsafe_save_altstack(...);" is
 * exactly one statement and can be the unbraced body of an if/else.
 */
#define unsafe_save_altstack(uss, sp, label) do { \
        stack_t __user *__uss = (uss); \
        struct task_struct *t = current; \
        unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \
        unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
        unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
        if (t->sas_ss_flags & SS_AUTODISARM) \
                sas_ss_reset(t); \
} while (0)

#ifdef CONFIG_PROC_FS
struct seq_file;
extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
#endif

#endif /* _LINUX_SIGNAL_H */









































































































































    1 











    1 





























































































    1 


    1 






    1 
    1 

























    1 










    1 















    1 
    1 

    1 

























































































    1 
    1 


    1 













    1 
    1 







    1 


























































































































































































































































    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex        (while writing or truncating, not reading or faulting)
 *   mm->mmap_lock
 *     page->flags PG_locked (lock_page)   * (see hugetlbfs below)
 *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
 *         mapping->i_mmap_rwsem
 *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *           anon_vma->rwsem
 *             mm->page_table_lock or pte_lock
 *               pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
 *               swap_lock (in swap_duplicate, swap_info_get)
 *                 mmlist_lock (in mmput, drain_mmlist and others)
 *                 mapping->private_lock (in __set_page_dirty_buffers)
 *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
 *                     i_pages lock (widely used)
 *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                   sb_lock (within inode_lock in fs/fs-writeback.c)
 *                   i_pages lock (widely used, in set_page_dirty,
 *                             in arch-dependent flush_dcache_mmap_lock,
 *                             within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 *
 * * hugetlbfs PageHuge() pages take locks in this order:
 *         mapping->i_mmap_rwsem
 *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *             page->flags PG_locked (lock_page)
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>

#include <asm/tlbflush.h>

#include <trace/events/tlb.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
        struct anon_vma *anon_vma;

        anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
        if (anon_vma) {
                atomic_set(&anon_vma->refcount, 1);
                anon_vma->num_children = 0;
                anon_vma->num_active_vmas = 0;
                anon_vma->parent = anon_vma;
                /*
                 * Initialise the anon_vma root to point to itself. If called
                 * from fork, the root will be reset to the parents anon_vma.
                 */
                anon_vma->root = anon_vma;
        }

        return anon_vma;
}

/*
 * Return an anon_vma to anon_vma_cachep.
 *
 * The refcount must already be zero (checked with VM_BUG_ON).  May sleep:
 * if the root's rwsem is observed locked, the write lock is taken and
 * released once before the object is freed.
 */
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
        VM_BUG_ON(atomic_read(&anon_vma->refcount));

        /*
         * Synchronize against page_lock_anon_vma_read() such that
         * we can safely hold the lock without the anon_vma getting
         * freed.
         *
         * Relies on the full mb implied by the atomic_dec_and_test() from
         * put_anon_vma() against the acquire barrier implied by
         * down_read_trylock() from page_lock_anon_vma_read(). This orders:
         *
         * page_lock_anon_vma_read()        VS        put_anon_vma()
         *   down_read_trylock()                  atomic_dec_and_test()
         *   LOCK                                  MB
         *   atomic_read()                          rwsem_is_locked()
         *
         * LOCK should suffice since the actual taking of the lock must
         * happen _before_ what follows.
         */
        might_sleep();
        if (rwsem_is_locked(&anon_vma->root->rwsem)) {
                /* Lock/unlock pair: wait for whoever currently holds the rwsem. */
                anon_vma_lock_write(anon_vma);
                anon_vma_unlock_write(anon_vma);
        }

        kmem_cache_free(anon_vma_cachep, anon_vma);
}

/*
 * Allocate an anon_vma_chain from anon_vma_chain_cachep using the caller's
 * @gfp flags.  Returns whatever kmem_cache_alloc() returns; the object is
 * not initialised here.
 */
static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
        return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

/* Return an anon_vma_chain to anon_vma_chain_cachep. */
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
        kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

/*
 * Tie @vma and @anon_vma together through the chain entry @avc: record both
 * in @avc, add @avc to the vma's anon_vma_chain list and insert it into the
 * anon_vma's interval tree.
 *
 * NOTE(review): no locking is done here; the visible caller holds the
 * anon_vma write lock and mm->page_table_lock - confirm for other callers.
 */
static void anon_vma_chain_link(struct vm_area_struct *vma,
                                struct anon_vma_chain *avc,
                                struct anon_vma *anon_vma)
{
        avc->vma = vma;
        avc->anon_vma = anon_vma;
        list_add(&avc->same_vma, &vma->anon_vma_chain);
        anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 * and that may actually touch the rwsem even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_lock held for reading.
 *
 * Return: 0 on success, including when vma->anon_vma turned out to be set
 * already once the locks were taken; -ENOMEM if the anon_vma_chain or the
 * anon_vma could not be allocated.
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        struct anon_vma *anon_vma, *allocated;
        struct anon_vma_chain *avc;

        might_sleep();

        /* Allocate the chain entry up front, before any lock is taken. */
        avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_enomem;

        /* Prefer a mergeable anon_vma; allocate a new one only if none is found. */
        anon_vma = find_mergeable_anon_vma(vma);
        allocated = NULL;
        if (!anon_vma) {
                anon_vma = anon_vma_alloc();
                if (unlikely(!anon_vma))
                        goto out_enomem_free_avc;
                anon_vma->num_children++; /* self-parent link for new root */
                allocated = anon_vma;
        }

        anon_vma_lock_write(anon_vma);
        /* page_table_lock to protect against threads */
        spin_lock(&mm->page_table_lock);
        /*
         * Re-check under the locks: if nothing is attached yet, install
         * ours and mark both allocations as consumed.
         */
        if (likely(!vma->anon_vma)) {
                vma->anon_vma = anon_vma;
                anon_vma_chain_link(vma, avc, anon_vma);
                anon_vma->num_active_vmas++;
                allocated = NULL;
                avc = NULL;
        }
        spin_unlock(&mm->page_table_lock);
        anon_vma_unlock_write(anon_vma);

        /* vma->anon_vma was already set: release whatever was not used. */
        if (unlikely(allocated))
                put_anon_vma(allocated);
        if (unlikely(avc))
                anon_vma_chain_free(avc);

        return 0;

 out_enomem_free_avc:
        anon_vma_chain_free(avc);
 out_enomem:
        return -ENOMEM;
}

/*
 * Helper for walking vma->anon_vma_chain: make sure the root of @anon_vma
 * is write-locked and return it, given the root (@root, possibly NULL)
 * that the caller currently holds locked.
 *
 * All anon_vmas chained to one vma are expected to share a single root,
 * so a whole traversal should need just one down_write().  Having to
 * switch from one non-NULL root to another is unexpected and triggers a
 * one-time warning; the old root is unlocked before the new one is taken.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
        struct anon_vma *new_root = anon_vma->root;

        /* Already holding the right lock: nothing to do. */
        if (new_root == root)
                return root;

        if (WARN_ON_ONCE(root))
                up_write(&root->rwsem);

        down_write(&new_root->rwsem);
        return new_root;
}

/*
 * Counterpart of lock_anon_vma_root(): release the write lock on @root.
 * A NULL @root (nothing was locked) is accepted and ignored.
 */
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
        if (!root)
                return;

        up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
 * anon_vma_fork(). The first three want an exact copy of src, while the last
 * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
 * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
 * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
 *
 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
 * and reuse existing anon_vma which has no vmas and only one child anon_vma.
 * This prevents degradation of anon_vma hierarchy to endless linear chain in
 * case of constantly forking task. On the other hand, an anon_vma with more
 * than one child isn't reused even if there was no alive vma, thus rmap
 * walker has a good chance of avoiding scanning the whole hierarchy when it
 * searches where page is mapped.
 *
 * On failure every chain entry already linked to dst is torn down again
 * with unlink_anon_vmas() and dst->anon_vma is left NULL.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
        struct anon_vma_chain *avc, *pavc;
        struct anon_vma *root = NULL;

        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma;

                /*
                 * The root lock may be held here, so first try an allocation
                 * that does not wait; only if that fails drop the lock and
                 * retry with GFP_KERNEL.
                 */
                avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!avc)) {
                        unlock_anon_vma_root(root);
                        root = NULL;
                        avc = anon_vma_chain_alloc(GFP_KERNEL);
                        if (!avc)
                                goto enomem_failure;
                }
                anon_vma = pavc->anon_vma;
                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_chain_link(dst, avc, anon_vma);

                /*
                 * Reuse existing anon_vma if it has no vma and only one
                 * anon_vma child.
                 *
                 * Root anon_vma is never reused:
                 * it has self-parent reference and at least one child.
                 */
                if (!dst->anon_vma && src->anon_vma &&
                    anon_vma->num_children < 2 &&
                    anon_vma->num_active_vmas == 0)
                        dst->anon_vma = anon_vma;
        }
        /* Account dst as an active user of whichever anon_vma it ended up with. */
        if (dst->anon_vma)
                dst->anon_vma->num_active_vmas++;
        unlock_anon_vma_root(root);
        return 0;

 enomem_failure:
        /*
         * dst->anon_vma is dropped here otherwise its num_active_vmas can be
         * incorrectly decremented in unlink_anon_vmas() (the increment above
         * has not happened on this path).
         * We can safely do this because callers of anon_vma_clone() don't care
         * about dst->anon_vma if anon_vma_clone() failed.
         */
        dst->anon_vma = NULL;
        unlink_anon_vmas(dst);
        return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 *
 * @vma is the child's VMA, @pvma the matching VMA of the parent.
 *
 * Returns 0 on success (including the cases where the parent has no
 * anon_vma or an existing anon_vma could be reused), the error from
 * anon_vma_clone() if cloning fails, or -ENOMEM if the child's own
 * anon_vma or its chain entry cannot be allocated.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
        struct anon_vma_chain *avc;
        struct anon_vma *anon_vma;
        int error;

        /* Don't bother if the parent process has no anon_vma here. */
        if (!pvma->anon_vma)
                return 0;

        /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
        vma->anon_vma = NULL;

        /*
         * First, attach the new VMA to the parent VMA's anon_vmas,
         * so rmap can find non-COWed pages in child processes.
         */
        error = anon_vma_clone(vma, pvma);
        if (error)
                return error;

        /* An existing anon_vma has been reused, all done then. */
        if (vma->anon_vma)
                return 0;

        /* Then add our own anon_vma. */
        anon_vma = anon_vma_alloc();
        if (!anon_vma)
                goto out_error;
        /* This vma will be the new anon_vma's (only) active user. */
        anon_vma->num_active_vmas++;
        avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_error_free_anon_vma;

        /*
         * The root anon_vma's rwsem is the lock actually used when we
         * lock any of the anon_vmas in this anon_vma tree.
         */
        anon_vma->root = pvma->anon_vma->root;
        anon_vma->parent = pvma->anon_vma;
        /*
         * With refcounts, an anon_vma can stay around longer than the
         * process it belongs to. The root anon_vma needs to be pinned until
         * this anon_vma is freed, because the lock lives in the root.
         */
        get_anon_vma(anon_vma->root);
        /* Mark this anon_vma as the one where our new (COWed) pages go. */
        vma->anon_vma = anon_vma;
        anon_vma_lock_write(anon_vma);
        anon_vma_chain_link(vma, avc, anon_vma);
        anon_vma->parent->num_children++;
        anon_vma_unlock_write(anon_vma);

        return 0;

 out_error_free_anon_vma:
        put_anon_vma(anon_vma);
 out_error:
        /* Undo the chain entries that anon_vma_clone() linked to vma. */
        unlink_anon_vmas(vma);
        return -ENOMEM;
}

/*
 * Detach @vma from every anon_vma on its anon_vma_chain and free the
 * chain entries.
 *
 * Pass one runs under the root anon_vma's write lock: each chain entry is
 * removed from its anon_vma's interval tree.  Entries whose anon_vma still
 * has other users are freed right away; entries whose anon_vma became
 * empty stay on the list.  Pass two runs after the lock is dropped and
 * releases those empty anon_vmas with put_anon_vma().
 *
 * On return the chain is empty and vma->anon_vma is NULL.
 */
void unlink_anon_vmas(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc, *next;
        struct anon_vma *root = NULL;

        /*
         * Unlink each anon_vma chained to the VMA.  This list is ordered
         * from newest to oldest, ensuring the root anon_vma gets freed last.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma = avc->anon_vma;

                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

                /*
                 * Leave empty anon_vmas on the list - we'll need
                 * to free them outside the lock.
                 */
                if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
                        anon_vma->parent->num_children--;
                        continue;
                }

                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
        if (vma->anon_vma) {
                vma->anon_vma->num_active_vmas--;

                /*
                 * The anon_vma may be released by the put_anon_vma() calls
                 * below.  Do not leave a stale pointer behind: a vma that
                 * outlives this call must look like it has no anon_vma, and
                 * a repeated call must not decrement the counter again.
                 */
                vma->anon_vma = NULL;
        }
        unlock_anon_vma_root(root);

        /*
         * Iterate the list once more, it now only contains empty and unlinked
         * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
         * needing to write-acquire the anon_vma->root->rwsem.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma = avc->anon_vma;

                VM_WARN_ON(anon_vma->num_children);
                VM_WARN_ON(anon_vma->num_active_vmas);
                put_anon_vma(anon_vma);

                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
}

/*
 * Slab constructor for anon_vma objects (passed to kmem_cache_create() in
 * anon_vma_init()): gives a new object an initialised rwsem, a zero
 * refcount and an empty interval tree.
 */
static void anon_vma_ctor(void *data)
{
        struct anon_vma *av = data;

        av->rb_root = RB_ROOT_CACHED;
        atomic_set(&av->refcount, 0);
        init_rwsem(&av->rwsem);
}

/*
 * Boot-time setup of the two slab caches used by the anon rmap code.
 *
 * Both caches are created with SLAB_PANIC, so no error is returned or
 * checked here, and with SLAB_ACCOUNT.  The anon_vma cache additionally
 * uses SLAB_TYPESAFE_BY_RCU together with the anon_vma_ctor() constructor;
 * the lockless lookups in page_get_anon_vma() and
 * page_lock_anon_vma_read() depend on that.
 */
void __init anon_vma_init(void)
{
        anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
                        0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
                        anon_vma_ctor);
        anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
                        SLAB_PANIC|SLAB_ACCOUNT);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization what so ever against page_remove_rmap()
 * the best this function can do is return a locked anon_vma that might
 * have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 *
 * Returns the anon_vma with an extra reference taken on its refcount, or
 * NULL if the page is not an anon page (per the PAGE_MAPPING_FLAGS bits),
 * is not mapped, or the anon_vma's refcount had already dropped to zero.
 * No lock is held on return.
 */
struct anon_vma *page_get_anon_vma(struct page *page)
{
        struct anon_vma *anon_vma = NULL;
        unsigned long anon_mapping;

        rcu_read_lock();
        /* Snapshot pointer and type bits together in a single read. */
        anon_mapping = (unsigned long)READ_ONCE(page->mapping);
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!page_mapped(page))
                goto out;

        /* Strip the PAGE_MAPPING_ANON tag to recover the pointer. */
        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
                goto out;
        }

        /*
         * If this page is still mapped, then its anon_vma cannot have been
         * freed.  But if it has been unmapped, we have no security against the
         * anon_vma structure being freed and reused (for another anon_vma:
         * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
         * above cannot corrupt).
         */
        if (!page_mapped(page)) {
                /* Leave the RCU section first, then drop our reference. */
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
        }
out:
        rcu_read_unlock();

        return anon_vma;
}

/*
 * Similar to page_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with page_get_anon_vma() and then block on the rwsem.
 *
 * Returns the anon_vma locked for reading (release it with
 * page_unlock_anon_vma_read()), or NULL if the page is not a mapped anon
 * page or the anon_vma could not be pinned.  On the slow path the
 * temporary reference is dropped again before returning, so no extra
 * reference is held on return.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
        struct anon_vma *anon_vma = NULL;
        struct anon_vma *root_anon_vma;
        unsigned long anon_mapping;

        rcu_read_lock();
        /* Snapshot pointer and type bits together in a single read. */
        anon_mapping = (unsigned long)READ_ONCE(page->mapping);
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!page_mapped(page))
                goto out;

        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        root_anon_vma = READ_ONCE(anon_vma->root);
        /* Fast path: a single trylock on the root's rwsem. */
        if (down_read_trylock(&root_anon_vma->rwsem)) {
                /*
                 * If the page is still mapped, then this anon_vma is still
                 * its anon_vma, and holding the rwsem ensures that it will
                 * not go away, see anon_vma_free().
                 */
                if (!page_mapped(page)) {
                        up_read(&root_anon_vma->rwsem);
                        anon_vma = NULL;
                }
                goto out;
        }

        /* trylock failed, we got to sleep */
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
                goto out;
        }

        if (!page_mapped(page)) {
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
        }

        /* we pinned the anon_vma, its safe to sleep */
        rcu_read_unlock();
        anon_vma_lock_read(anon_vma);

        /* The lock now keeps it alive; give back the temporary reference. */
        if (atomic_dec_and_test(&anon_vma->refcount)) {
                /*
                 * Oops, we held the last refcount, release the lock
                 * and bail -- can't simply use put_anon_vma() because
                 * we'll deadlock on the anon_vma_lock_write() recursion.
                 */
                anon_vma_unlock_read(anon_vma);
                __put_anon_vma(anon_vma);
                anon_vma = NULL;
        }

        return anon_vma;

out:
        rcu_read_unlock();
        return anon_vma;
}

/*
 * Release the read lock on an anon_vma returned by
 * page_lock_anon_vma_read().
 */
void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
        anon_vma_unlock_read(anon_vma);
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush TLB entries for recently unmapped pages from remote CPUs. It is
 * important if a PTE was dirty when it was unmapped that it's flushed
 * before any IO is initiated on the page to prevent lost writes. Similarly,
 * it must be flushed before freeing to prevent data leakage.
 *
 * Operates on the current task's batch (current->tlb_ubc); does nothing
 * when no flush is pending.  After flushing, both the flush_required and
 * writable flags of the batch are cleared.
 */
void try_to_unmap_flush(void)
{
        struct tlbflush_unmap_batch *batch = &current->tlb_ubc;

        if (batch->flush_required) {
                arch_tlbbatch_flush(&batch->arch);
                batch->flush_required = false;
                batch->writable = false;
        }
}

/*
 * Flush iff there are potentially writable TLB entries that can race with IO.
 * The current task's batch is consulted; without its writable flag set
 * nothing is flushed.
 */
void try_to_unmap_flush_dirty(void)
{
        if (!current->tlb_ubc.writable)
                return;

        try_to_unmap_flush();
}

/*
 * Record in the current task's unmap batch that a TLB flush for @mm is
 * pending, and mark @mm as having had a batched flush.
 *
 * @writable: the caller's indication that the unmapped PTE may have been
 *            writable (see the comment on the final check below); it is
 *            latched into the batch and only cleared by
 *            try_to_unmap_flush().
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

        arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
        tlb_ubc->flush_required = true;

        /*
         * Ensure compiler does not re-order the setting of tlb_flush_batched
         * before the PTE is cleared.
         */
        barrier();
        mm->tlb_flush_batched = true;

        /*
         * If the PTE was dirty then it's best to assume it's writable. The
         * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
         * before the page is queued for IO.
         */
        if (writable)
                tlb_ubc->writable = true;
}

/*
 * Returns true if the TLB flush should be deferred to the end of a batch of
 * unmap operations to reduce IPIs.
 *
 * Deferral is only considered when the caller passed TTU_BATCH_FLUSH, and
 * only chosen when @mm's cpumask contains some CPU other than the one we
 * are running on.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        bool defer;
        int this_cpu;

        if (!(flags & TTU_BATCH_FLUSH))
                return false;

        /* Defer (batch) the flush only if a remote CPU would need flushing. */
        this_cpu = get_cpu();
        defer = cpumask_any_but(mm_cpumask(mm), this_cpu) < nr_cpu_ids;
        put_cpu();

        return defer;
}

/*
 * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
 * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 * operation such as mprotect or munmap to race between reclaim unmapping
 * the page and flushing the page. If this race occurs, it potentially allows
 * access to data via a stale TLB entry. Tracking all mm's that have TLB
 * batching in flight would be expensive during reclaim so instead track
 * whether TLB batching occurred in the past and if so then do a flush here
 * if required. This will cost one additional flush per reclaim cycle paid
 * by the first operation at risk such as mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
        /* The flag is read without a marked access; data_race() annotates that. */
        if (data_race(mm->tlb_flush_batched)) {
                flush_tlb_mm(mm);

                /*
                 * Do not allow the compiler to re-order the clearing of
                 * tlb_flush_batched before the tlb is flushed.
                 */
                barrier();
                mm->tlb_flush_batched = false;
        }
}
#else
/*
 * Stub for builds without CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH:
 * there is no batch to record anything in, so this does nothing.
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}

/*
 * Stub for builds without CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH:
 * flushes are never deferred.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 *
 * Returns the result of vma_address(), or -EFAULT (as an unsigned long)
 * when the page cannot belong to @vma: an anon page whose anon_vma root
 * differs from the vma's (or either side has no anon_vma), or a non-anon
 * page whose mapping is not the mapping of the vma's file.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
        if (PageAnon(page)) {
                struct anon_vma *page_av = page_anon_vma(page);

                /*
                 * Note: swapoff's unuse_vma() is more efficient with this
                 * check, and needs it to match anon_vma when KSM is active.
                 */
                if (!vma->anon_vma || !page_av)
                        return -EFAULT;
                if (vma->anon_vma->root != page_av->root)
                        return -EFAULT;
        } else {
                if (!vma->vm_file ||
                    vma->vm_file->f_mapping != compound_head(page)->mapping)
                        return -EFAULT;
        }

        return vma_address(page, vma);
}

/*
 * Walk @mm's page tables down to the pmd entry covering @address.
 *
 * Returns a pointer to that pmd entry, or NULL if any upper level is not
 * present, or if the pmd itself is not present or is a transparent huge
 * pmd (i.e. there is no pte table to be found beneath it).
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pmd_t pmde;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                return NULL;

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return NULL;

        pmd = pmd_offset(pud, address);
        /*
         * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
         * without holding anon_vma lock for write.  So when looking for a
         * genuine pmde (in which to find pte), test present and !THP together,
         * on one snapshot of the entry.
         */
        pmde = *pmd;
        barrier();
        if (pmd_present(pmde) && !pmd_trans_huge(pmde))
                return pmd;

        return NULL;
}

/*
 * State shared between page_referenced() and its per-vma callbacks
 * during one rmap walk.
 */
struct page_referenced_arg {
        /* mappings still to visit; starts at total_mapcount(page) */
        int mapcount;
        /* number of vmas in which a reference to the page was found */
        int referenced;
        /* OR of the vm_flags of referencing vmas (plus VM_LOCKED if seen) */
        unsigned long vm_flags;
        /* target memory cgroup used to filter vmas; may be NULL */
        struct mem_cgroup *memcg;
};
/*
 * rmap_one callback of page_referenced(): test and clear the young/accessed
 * state of every mapping of @page found in @vma.
 *
 * @arg points to the struct page_referenced_arg of the walk.
 *
 * Returns true to continue the rmap walk, false to stop it: either a
 * VM_LOCKED vma was found (VM_LOCKED is then recorded in pra->vm_flags),
 * or the remaining mapcount reached zero.
 */
static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        unsigned long address, void *arg)
{
        struct page_referenced_arg *pra = arg;
        struct page_vma_mapped_walk pvmw = {
                .page = page,
                .vma = vma,
                .address = address,
        };
        int referenced = 0;

        while (page_vma_mapped_walk(&pvmw)) {
                address = pvmw.address;

                if (vma->vm_flags & VM_LOCKED) {
                        /* Leaving the walk early: finish it explicitly. */
                        page_vma_mapped_walk_done(&pvmw);
                        pra->vm_flags |= VM_LOCKED;
                        return false; /* To break the loop */
                }

                if (pvmw.pte) {
                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte)) {
                                /*
                                 * Don't treat a reference through
                                 * a sequentially read mapping as such.
                                 * If the page has been used in another mapping,
                                 * we will catch it; if this other mapping is
                                 * already gone, the unmap path will have set
                                 * PG_referenced or activated the page.
                                 */
                                if (likely(!(vma->vm_flags & VM_SEQ_READ)))
                                        referenced++;
                        }
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                        /* No pte in the walk state: handled as a pmd mapping. */
                        if (pmdp_clear_flush_young_notify(vma, address,
                                                pvmw.pmd))
                                referenced++;
                } else {
                        /* unexpected pmd-mapped page? */
                        WARN_ON_ONCE(1);
                }

                pra->mapcount--;
        }

        if (referenced)
                clear_page_idle(page);
        if (test_and_clear_page_young(page))
                referenced++;

        /* Counted once per vma, however many mappings were young. */
        if (referenced) {
                pra->referenced++;
                pra->vm_flags |= vma->vm_flags;
        }

        if (!pra->mapcount)
                return false; /* To break the loop */

        return true;
}

/*
 * invalid_vma callback used by page_referenced() when a target memcg is
 * given: returns true (skip this vma) when the vma's mm does not match the
 * memcg stored in the walk's struct page_referenced_arg.
 */
static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
        struct page_referenced_arg *pra = arg;

        return !mm_match_cgroup(vma->vm_mm, pra->memcg);
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @memcg: target memory cgroup
 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
 *
 * Quick test_and_clear_referenced for all mappings to a page.
 *
 * *@vm_flags is always written (zero when nothing was walked).
 *
 * Return: the number of vmas found to have referenced the page; 0 if the
 * page has no mappings or no rmapping; 1 if the page lock was needed but
 * could not be taken with trylock_page().
 */
int page_referenced(struct page *page,
                    int is_locked,
                    struct mem_cgroup *memcg,
                    unsigned long *vm_flags)
{
        struct page_referenced_arg pra = {
                .mapcount = total_mapcount(page),
                .memcg = memcg,
        };
        struct rmap_walk_control rwc = {
                .rmap_one = page_referenced_one,
                .arg = (void *)&pra,
                .anon_lock = page_lock_anon_vma_read,
        };
        int locked_here = 0;

        *vm_flags = 0;

        if (!pra.mapcount || !page_rmapping(page))
                return 0;

        /* Non-anon and KSM pages are walked with the page lock held. */
        if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
                locked_here = trylock_page(page);
                if (!locked_here)
                        return 1;
        }

        /*
         * If we are reclaiming on behalf of a cgroup, skip
         * counting on behalf of references from different
         * cgroups
         */
        if (memcg)
                rwc.invalid_vma = invalid_page_referenced_vma;

        rmap_walk(page, &rwc);
        *vm_flags = pra.vm_flags;

        /* Only drop the page lock if it was taken above. */
        if (locked_here)
                unlock_page(page);

        return pra.referenced;
}

/*
 * rmap_one callback of page_mkclean(): write-protect and clean every
 * mapping of @page found in @vma.
 *
 * @arg points to an int counter that is incremented once for each pte/pmd
 * that was actually changed.
 *
 * Always returns true, so the rmap walk continues with the next vma.
 */
static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                            unsigned long address, void *arg)
{
        struct page_vma_mapped_walk pvmw = {
                .page = page,
                .vma = vma,
                .address = address,
                .flags = PVMW_SYNC,
        };
        struct mmu_notifier_range range;
        int *cleaned = arg;

        /*
         * We have to assume the worse case ie pmd for invalidation. Note that
         * the page can not be free from this function.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                0, vma, vma->vm_mm, address,
                                vma_address_end(page, vma));
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
                int ret = 0;

                address = pvmw.address;
                if (pvmw.pte) {
                        pte_t entry;
                        pte_t *pte = pvmw.pte;

                        /* Already clean and read-only: nothing to do. */
                        if (!pte_dirty(*pte) && !pte_write(*pte))
                                continue;

                        /* Clear+flush first, then reinstall a clean, read-only pte. */
                        flush_cache_page(vma, address, pte_pfn(*pte));
                        entry = ptep_clear_flush(vma, address, pte);
                        entry = pte_wrprotect(entry);
                        entry = pte_mkclean(entry);
                        set_pte_at(vma->vm_mm, address, pte, entry);
                        ret = 1;
                } else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        pmd_t *pmd = pvmw.pmd;
                        pmd_t entry;

                        /* Already clean and read-only: nothing to do. */
                        if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
                                continue;

                        /* Same sequence as the pte case, at pmd level. */
                        flush_cache_page(vma, address, page_to_pfn(page));
                        entry = pmdp_invalidate(vma, address, pmd);
                        entry = pmd_wrprotect(entry);
                        entry = pmd_mkclean(entry);
                        set_pmd_at(vma->vm_mm, address, pmd, entry);
                        ret = 1;
#else
                        /* unexpected pmd-mapped page? */
                        WARN_ON_ONCE(1);
#endif
                }

                /*
                 * No need to call mmu_notifier_invalidate_range() as we are
                 * downgrading page table protection not changing it to point
                 * to a new page.
                 *
                 * See Documentation/vm/mmu_notifier.rst
                 */
                if (ret)
                        (*cleaned)++;
        }

        mmu_notifier_invalidate_range_end(&range);

        return true;
}

/*
 * invalid_vma callback of page_mkclean(): only VM_SHARED vmas are
 * processed, so return true (skip) for every vma without that flag.
 * @arg is unused.
 */
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
        return !(vma->vm_flags & VM_SHARED);
}

/*
 * Write-protect and clean the mappings of @page in all VM_SHARED vmas
 * that map it.
 *
 * The page must be locked by the caller (BUG otherwise).
 *
 * Returns the number of ptes/pmds that were changed; 0 if the page is not
 * mapped or has no mapping.
 */
int page_mkclean(struct page *page)
{
        int nr_cleaned = 0;
        struct rmap_walk_control rwc = {
                .arg = (void *)&nr_cleaned,
                .rmap_one = page_mkclean_one,
                .invalid_vma = invalid_mkclean_vma,
        };

        BUG_ON(!PageLocked(page));

        /* Nothing to walk unless the page is mapped and has a mapping. */
        if (page_mapped(page) && page_mapping(page))
                rmap_walk(page, &rwc);

        return nr_cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);

/**
 * page_move_anon_rmap - move a page to our anon_vma
 * @page:        the page to move to our anon_vma
 * @vma:        the vma the page belongs to
 *
 * When a page belongs exclusively to one process after a COW event,
 * that page can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling
 * processes.
 *
 * Operates on the compound head of @page, which must be locked; @vma must
 * have an anon_vma.
 */
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;
        void *tagged;

        page = compound_head(page);

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_VMA(!anon_vma, vma);

        /* Encode the anon_vma pointer together with the anon type tag. */
        tagged = (void *) anon_vma + PAGE_MAPPING_ANON;

        /*
         * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
         * simultaneously, so a concurrent reader (eg page_referenced()'s
         * PageAnon()) will not see one without the other.
         */
        WRITE_ONCE(page->mapping, (struct address_space *) tagged);
}

/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page:        Page or Hugepage to add to rmap
 * @vma:        VM area to add page to.
 * @address:        User virtual address of the mapping
 * @exclusive:        the page is exclusively owned by the current process
 *
 * Does nothing if the page is already PageAnon().  Otherwise page->mapping
 * is pointed at the vma's anon_vma (or at that anon_vma's root when the
 * page is not exclusive), tagged with PAGE_MAPPING_ANON, and page->index
 * is set from @vma and @address.  @vma must have an anon_vma (BUG
 * otherwise).
 */
static void __page_set_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, int exclusive)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        BUG_ON(!anon_vma);

        if (PageAnon(page))
                return;

        /*
         * If the page isn't exclusively mapped into this vma,
         * we must use the _oldest_ possible anon_vma for the
         * page mapping!
         */
        if (!exclusive)
                anon_vma = anon_vma->root;

        /*
         * page->mapping is read locklessly elsewhere in this file
         * (READ_ONCE() in page_get_anon_vma() and page_lock_anon_vma_read()).
         * Store the pointer and the PAGE_MAPPING_ANON tag with a single
         * marked write, as page_move_anon_rmap() does, so the compiler
         * cannot split the store and a reader cannot see one without the
         * other.
         */
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
        page->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page:        the page to add the mapping to
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 *
 * Asserts that the page's anon_vma shares its root with @vma's anon_vma
 * and that the page's offset matches the one computed from @vma and
 * @address.  Both checks are VM_BUG_ON_PAGE() assertions; nothing is
 * modified.
 */
static void __page_check_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        /*
         * The page's anon-rmap details (mapping and index) are guaranteed to
         * be set up correctly at this point.
         *
         * We have exclusion against page_add_anon_rmap because the caller
         * always holds the page locked, except if called from page_dup_rmap,
         * in which case the page is already known to be setup.
         *
         * We have exclusion against page_add_new_anon_rmap because those pages
         * are initially only visible via the pagetables, and the pte is locked
         * over the call to page_add_new_anon_rmap.
         */
        VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
        VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
                       page);
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page:        the page to add the mapping to
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 * @compound:        charge the page as compound or small page
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 *
 * Thin wrapper around do_page_add_anon_rmap() that translates @compound
 * into the RMAP_COMPOUND flag and passes no other flags.
 */
void page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, bool compound)
{
        int flags = 0;

        if (compound)
                flags = RMAP_COMPOUND;

        do_page_add_anon_rmap(page, vma, address, flags);
}

/*
 * Special version of the above for do_swap_page, which often runs
 * into pages that are exclusively owned by the current process.
 * Everybody else should continue to use page_add_anon_rmap above.
 *
 * @flags: RMAP_COMPOUND selects the compound mapcount instead of the
 *         per-page one; RMAP_EXCLUSIVE is forwarded to
 *         __page_set_anon_rmap() as its 'exclusive' argument.
 *
 * The relevant mapcount is incremented; on the first mapping the
 * NR_ANON_MAPPED (and, for compound, NR_ANON_THPS) statistics are updated
 * and, for non-KSM pages, the anon rmap is set up.  For later mappings of
 * non-KSM pages the existing rmap is only sanity checked.
 */
void do_page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, int flags)
{
        bool compound = flags & RMAP_COMPOUND;
        bool first;

        /* KSM pages take the memcg page lock here; other pages must be locked. */
        if (unlikely(PageKsm(page)))
                lock_page_memcg(page);
        else
                VM_BUG_ON_PAGE(!PageLocked(page), page);

        /* 'first' is set when the chosen mapcount's increment yields zero. */
        if (compound) {
                atomic_t *mapcount;
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                mapcount = compound_mapcount_ptr(page);
                first = atomic_inc_and_test(mapcount);
        } else {
                first = atomic_inc_and_test(&page->_mapcount);
        }

        if (first) {
                int nr = compound ? thp_nr_pages(page) : 1;
                /*
                 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
                 * these counters are not modified in interrupt context, and
                 * pte lock(a spinlock) is held, which implies preemption
                 * disabled.
                 */
                if (compound)
                        __inc_lruvec_page_state(page, NR_ANON_THPS);
                __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
        }

        /* KSM pages: only counts are updated, mapping/index are left alone. */
        if (unlikely(PageKsm(page))) {
                unlock_page_memcg(page);
                return;
        }

        /* address might be in next vma when migration races vma_adjust */
        if (first)
                __page_set_anon_rmap(page, vma, address,
                                flags & RMAP_EXCLUSIVE);
        else
                __page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page:        the page to add the mapping to
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 * @compound:        charge the page as compound or small page
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * Because the page is new, the mapcount is stored directly instead of
 * going through an atomic inc-and-test.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, bool compound)
{
        int nr_mapped = compound ? thp_nr_pages(page) : 1;

        VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
        __SetPageSwapBacked(page);

        if (!compound) {
                /* Anon THP always mapped first with PMD */
                VM_BUG_ON_PAGE(PageTransCompound(page), page);
                /* mapcount starts at -1: storing 0 records one mapping */
                atomic_set(&page->_mapcount, 0);
        } else {
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                /* mapcount starts at -1: storing 0 records one mapping */
                atomic_set(compound_mapcount_ptr(page), 0);
                if (hpage_pincount_available(page))
                        atomic_set(compound_pincount_ptr(page), 0);

                __inc_lruvec_page_state(page, NR_ANON_THPS);
        }

        __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr_mapped);
        /* A new page is always set up as exclusive (last argument is 1). */
        __page_set_anon_rmap(page, vma, address, 1);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 * @compound: charge the page as compound or small page
 *
 * For a compound mapping, every subpage's _mapcount and the compound
 * mapcount are incremented; for a small mapping only @page's _mapcount is.
 * NR_FILE_MAPPED is increased by the number of (sub)pages whose _mapcount
 * went from -1 to 0, and only when the mapcount taken last (compound
 * mapcount or @page's _mapcount) also made that transition.
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page, bool compound)
{
        int i, nr = 1;

        VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
        lock_page_memcg(page);
        if (compound && PageTransHuge(page)) {
                /* The subpage count is loop-invariant: read it once. */
                int nr_pages = thp_nr_pages(page);

                /* Count the subpages that become mapped for the first time. */
                for (i = 0, nr = 0; i < nr_pages; i++) {
                        if (atomic_inc_and_test(&page[i]._mapcount))
                                nr++;
                }
                /* Already PMD-mapped elsewhere: no statistics to update. */
                if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
                        goto out;
                if (PageSwapBacked(page))
                        __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
                else
                        __inc_node_page_state(page, NR_FILE_PMDMAPPED);
        } else {
                if (PageTransCompound(page) && page_mapping(page)) {
                        VM_WARN_ON_ONCE(!PageLocked(page));

                        /* A THP subpage is being mapped by pte. */
                        SetPageDoubleMap(compound_head(page));
                        if (PageMlocked(page))
                                clear_page_mlock(compound_head(page));
                }
                /* Page was already mapped: no statistics to update. */
                if (!atomic_inc_and_test(&page->_mapcount))
                        goto out;
        }
        __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
        unlock_page_memcg(page);
}

/*
 * page_remove_file_rmap - drop one mapping of a file (non-anon) page
 * @page: the page losing a mapping
 * @compound: true when a compound mapping is being removed
 *
 * Decrements the relevant mapcount(s) and, when the mapcount checked last
 * (compound mapcount or @page's _mapcount) drops below zero, lowers
 * NR_FILE_MAPPED by the number of (sub)pages whose _mapcount dropped below
 * zero and clears PG_mlocked if it was set. hugetlb pages only have their
 * compound mapcount decremented.
 */
static void page_remove_file_rmap(struct page *page, bool compound)
{
        int i, nr = 1;

        VM_BUG_ON_PAGE(compound && !PageHead(page), page);

        /* Hugepages are not counted in NR_FILE_MAPPED for now. */
        if (unlikely(PageHuge(page))) {
                /* hugetlb pages are always mapped with pmds */
                atomic_dec(compound_mapcount_ptr(page));
                return;
        }

        /* page still mapped by someone else? */
        if (compound && PageTransHuge(page)) {
                /* The subpage count is loop-invariant: read it once. */
                int nr_pages = thp_nr_pages(page);

                /* Count the subpages that lose their last mapping. */
                for (i = 0, nr = 0; i < nr_pages; i++) {
                        if (atomic_add_negative(-1, &page[i]._mapcount))
                                nr++;
                }
                if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
                        return;
                if (PageSwapBacked(page))
                        __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
                else
                        __dec_node_page_state(page, NR_FILE_PMDMAPPED);
        } else {
                if (!atomic_add_negative(-1, &page->_mapcount))
                        return;
        }

        /*
         * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
         * these counters are not modified in interrupt context, and
         * pte lock(a spinlock) is held, which implies preemption disabled.
         */
        __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);

        if (unlikely(PageMlocked(page)))
                clear_page_mlock(page);
}

/*
 * page_remove_anon_compound_rmap - drop one compound mapping of an anon page
 * @page: the compound page losing a mapping
 *
 * Decrements the compound mapcount. Only when it drops below zero (and the
 * page is a THP, not hugetlb) are NR_ANON_THPS and NR_ANON_MAPPED lowered,
 * PG_mlocked cleared, and - if some but not all subpages remain pte-mapped -
 * the page queued for deferred split.
 */
static void page_remove_anon_compound_rmap(struct page *page)
{
        int i, nr, nr_pages;

        if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
                return;

        /* Hugepages are not counted in NR_ANON_PAGES for now. */
        if (unlikely(PageHuge(page)))
                return;

        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return;

        __dec_lruvec_page_state(page, NR_ANON_THPS);

        /* The subpage count does not change below: read it once. */
        nr_pages = thp_nr_pages(page);

        if (TestClearPageDoubleMap(page)) {
                /*
                 * Subpages can be mapped with PTEs too. Check how many of
                 * them are still mapped.
                 */
                for (i = 0, nr = 0; i < nr_pages; i++) {
                        if (atomic_add_negative(-1, &page[i]._mapcount))
                                nr++;
                }

                /*
                 * Queue the page for deferred split if at least one small
                 * page of the compound page is unmapped, but at least one
                 * small page is still mapped.
                 */
                if (nr && nr < nr_pages)
                        deferred_split_huge_page(page);
        } else {
                /* No pte mappings existed: every subpage is now unmapped. */
                nr = nr_pages;
        }

        if (unlikely(PageMlocked(page)))
                clear_page_mlock(page);

        if (nr)
                __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page:        page to remove mapping from
 * @compound:        uncharge the page as compound or small page
 *
 * Dispatches to page_remove_file_rmap() for non-anon pages and to
 * page_remove_anon_compound_rmap() for compound anon mappings; a small anon
 * mapping is handled inline. Everything runs under lock_page_memcg().
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, bool compound)
{
        lock_page_memcg(page);

        if (!PageAnon(page)) {
                page_remove_file_rmap(page, compound);
        } else if (compound) {
                page_remove_anon_compound_rmap(page);
        } else if (atomic_add_negative(-1, &page->_mapcount)) {
                /*
                 * The mapcount dropped below zero, so nobody else maps the
                 * page any more.
                 *
                 * The irq-unsafe __dec_lruvec_page_state is used because
                 * these counters are not modified in interrupt context, and
                 * pte lock(a spinlock) is held, which implies preemption
                 * disabled.
                 */
                __dec_lruvec_page_state(page, NR_ANON_MAPPED);

                if (unlikely(PageMlocked(page)))
                        clear_page_mlock(page);

                if (PageTransCompound(page))
                        deferred_split_huge_page(compound_head(page));

                /*
                 * It would be tidy to reset the PageAnon mapping here,
                 * but that might overwrite a racing page_add_anon_rmap
                 * which increments mapcount after us but sets mapping
                 * before us: so leave the reset to free_unref_page,
                 * and remember that it's only reliable while mapped.
                 * Leaving it set also helps swapoff to reinstate ptes
                 * faster for those pages still in swapcache.
                 */
        }

        unlock_page_memcg(page);
}

/*
 * try_to_unmap_one - rmap_one callback: remove the mappings of @page in @vma
 * @page: the page being unmapped
 * @vma: the vma to search for mappings of @page
 * @address: address in @vma at which the page table walk starts
 * @arg: enum ttu_flags will be passed to this argument
 *
 * Walks the mappings of @page in @vma with page_vma_mapped_walk(). For each
 * pte found, the pte is cleared and replaced according to the flags and the
 * page type (hwpoison entry, migration entry, swap entry, or nothing at all),
 * the mm counters are adjusted, and the rmap and the page reference for that
 * mapping are dropped.
 *
 * Return: false when the walk was abandoned because a mapping had to be
 * left in place (or restored); true otherwise.
 */
static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        struct mm_struct *mm = vma->vm_mm;
        struct page_vma_mapped_walk pvmw = {
                .page = page,
                .vma = vma,
                .address = address,
        };
        pte_t pteval;
        struct page *subpage;
        bool ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;

        /*
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and page_remove_rmap(),
         * try_to_unmap() may return false when it is about to become true,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        if (flags & TTU_SYNC)
                pvmw.flags = PVMW_SYNC;

        /* munlock has nothing to gain from examining un-locked vmas */
        if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
                return true;

        if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
            is_zone_device_page(page) && !is_device_private_page(page))
                return true;

        if (flags & TTU_SPLIT_HUGE_PMD) {
                split_huge_pmd_address(vma, address,
                                flags & TTU_SPLIT_FREEZE, page);
        }

        /*
         * For THP, we have to assume the worst case ie pmd for invalidation.
         * For hugetlb, it could be much worse if we need to do pud
         * invalidation in the case of pmd sharing.
         *
         * Note that the page can not be free in this function as call of
         * try_to_unmap() must hold a reference on the page.
         */
        range.end = PageKsm(page) ?
                        address + PAGE_SIZE : vma_address_end(page, vma);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
                                address, range.end);
        if (PageHuge(page)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
                 */
                adjust_range_if_pmd_sharing_possible(vma, &range.start,
                                                     &range.end);
        }
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                /* PMD-mapped THP migration entry */
                if (!pvmw.pte && (flags & TTU_MIGRATION)) {
                        VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);

                        set_pmd_migration_entry(&pvmw, page);
                        continue;
                }
#endif

                /*
                 * If the page is mlock()d, we cannot swap it out.
                 * If it's recently referenced (perhaps page_referenced
                 * skipped over this mm) then we should reactivate it.
                 */
                if (!(flags & TTU_IGNORE_MLOCK)) {
                        if (vma->vm_flags & VM_LOCKED) {
                                /* PTE-mapped THP are never mlocked */
                                if (!PageTransCompound(page)) {
                                        /*
                                         * Holding pte lock, we do *not* need
                                         * mmap_lock here
                                         */
                                        mlock_vma_page(page);
                                }
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        if (flags & TTU_MUNLOCK)
                                continue;
                }

                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_PAGE(!pvmw.pte, page);

                /* The subpage of @page that this particular pte maps. */
                subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
                address = pvmw.address;

                if (PageHuge(page) && !PageAnon(page)) {
                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
                         */
                        VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
                        if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
                                /*
                                 * huge_pmd_unshare unmapped an entire PMD
                                 * page.  There is no way of knowing exactly
                                 * which PMDs may be cached for this mm, so
                                 * we must flush them all.  start/end were
                                 * already adjusted above to cover this range.
                                 */
                                flush_cache_range(vma, range.start, range.end);
                                flush_tlb_range(vma, range.start, range.end);
                                mmu_notifier_invalidate_range(mm, range.start,
                                                              range.end);

                                /*
                                 * The ref count of the PMD page was dropped
                                 * which is part of the way map counting
                                 * is done for shared PMDs.  Return 'true'
                                 * here.  When there is no other sharing,
                                 * huge_pmd_unshare returns false and we will
                                 * unmap the actual page and drop map count
                                 * to zero.
                                 */
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                }

                if (IS_ENABLED(CONFIG_MIGRATION) &&
                    (flags & TTU_MIGRATION) &&
                    is_zone_device_page(page)) {
                        swp_entry_t entry;
                        pte_t swp_pte;

                        pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);

                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        entry = make_migration_entry(page, 0);
                        swp_pte = swp_entry_to_pte(entry);

                        /*
                         * pteval maps a zone device page and is therefore
                         * a swap pte.
                         */
                        if (pte_swp_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_swp_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize
                         * against the special swap migration pte.
                         *
                         * The assignment to subpage above was computed from a
                         * swap PTE which results in an invalid pointer.
                         * Since only PAGE_SIZE pages can currently be
                         * migrated, just set it to page. This will need to be
                         * changed when hugepage migrations to device private
                         * memory are supported.
                         */
                        subpage = page;
                        goto discard;
                }

                /* Nuke the page table entry. */
                flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
                if (should_defer_flush(mm, flags)) {
                        /*
                         * We clear the PTE but do not flush so potentially
                         * a remote CPU could still be writing to the page.
                         * If the entry was previously clean then the
                         * architecture must guarantee that a clear->dirty
                         * transition on a cached TLB entry is written through
                         * and traps if the PTE is unmapped.
                         */
                        pteval = ptep_get_and_clear(mm, address, pvmw.pte);

                        set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
                } else {
                        pteval = ptep_clear_flush(vma, address, pvmw.pte);
                }

                /* Move the dirty bit to the page. Now the pte is gone. */
                if (pte_dirty(pteval))
                        set_page_dirty(page);

                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);

                if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
                        /* Replace the mapping with a hwpoison swap entry. */
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
                        if (PageHuge(page)) {
                                hugetlb_count_sub(compound_nr(page), mm);
                                set_huge_swap_pte_at(mm, address,
                                                     pvmw.pte, pteval,
                                                     vma_mmu_pagesize(vma));
                        } else {
                                dec_mm_counter(mm, mm_counter(page));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }

                } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
                         * will take care of the rest.
                         * A future reference will then fault in a new zero
                         * page. When userfaultfd is active, we must not drop
                         * this page though, as its main user (postcopy
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
                        dec_mm_counter(mm, mm_counter(page));
                        /* We have to invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,
                                                      address + PAGE_SIZE);
                } else if (IS_ENABLED(CONFIG_MIGRATION) &&
                                (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
                        swp_entry_t entry;
                        pte_t swp_pte;

                        /* On failure, put the original pte back and give up. */
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        entry = make_migration_entry(subpage,
                                        pte_write(pteval));
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize
                         * against the special swap migration pte.
                         */
                } else if (PageAnon(page)) {
                        swp_entry_t entry = { .val = page_private(subpage) };
                        pte_t swp_pte;
                        /*
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
                        if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
                                WARN_ON_ONCE(1);
                                ret = false;
                                /* We have to invalidate as we cleared the pte */
                                mmu_notifier_invalidate_range(mm, address,
                                                        address + PAGE_SIZE);
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /* MADV_FREE page check */
                        if (!PageSwapBacked(page)) {
                                int ref_count, map_count;

                                /*
                                 * Synchronize with gup_pte_range():
                                 * - clear PTE; barrier; read refcount
                                 * - inc refcount; barrier; read PTE
                                 */
                                smp_mb();

                                ref_count = page_ref_count(page);
                                map_count = page_mapcount(page);

                                /*
                                 * Order reads for page refcount and dirty flag
                                 * (see comments in __remove_mapping()).
                                 */
                                smp_rmb();

                                /*
                                 * The only page refs must be one from isolation
                                 * plus the rmap(s) (dropped by discard:).
                                 */
                                if (ref_count == 1 + map_count &&
                                    !PageDirty(page)) {
                                        /* Invalidate as we cleared the pte */
                                        mmu_notifier_invalidate_range(mm,
                                                address, address + PAGE_SIZE);
                                        dec_mm_counter(mm, MM_ANONPAGES);
                                        goto discard;
                                }

                                /*
                                 * If the page was redirtied, it cannot be
                                 * discarded. Remap the page to page table.
                                 */
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                SetPageSwapBacked(page);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /* Take the swap reference the new swap pte will hold. */
                        if (swap_duplicate(entry) < 0) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                /*
                                 * The swap pte is not going to be installed
                                 * after all: undo the swap_duplicate() above
                                 * so the swap reference is not leaked.
                                 */
                                swap_free(entry);
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        /* Unlocked check first, repeated under mmlist_lock. */
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
                                        list_add(&mm->mmlist, &init_mm.mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, MM_ANONPAGES);
                        inc_mm_counter(mm, MM_SWAPENTS);
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /* Invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,
                                                      address + PAGE_SIZE);
                } else {
                        /*
                         * This is a locked file-backed page, thus it cannot
                         * be removed from the page cache and replaced by a new
                         * page before mmu_notifier_invalidate_range_end, so no
                         * concurrent thread might update its page table to
                         * point at new page while a device still is using this
                         * page.
                         *
                         * See Documentation/vm/mmu_notifier.rst
                         */
                        dec_mm_counter(mm, mm_counter_file(page));
                }
discard:
                /*
                 * No need to call mmu_notifier_invalidate_range() it has been
                 * done above for all cases requiring it to happen under page
                 * table lock before mmu_notifier_invalidate_range_end()
                 *
                 * See Documentation/vm/mmu_notifier.rst
                 */
                page_remove_rmap(subpage, PageHuge(page));
                put_page(page);
        }

        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/*
 * invalid_vma callback for the rmap walk: report a vma as one to skip when
 * it is a temporary stack. @arg is unused.
 */
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
        bool skip_vma = vma_is_temporary_stack(vma);

        return skip_vma;
}

/*
 * done callback for the rmap walk: returns 1 once page_mapped() reports
 * that @page has no mappings left, 0 while it is still mapped.
 */
static int page_not_mapped(struct page *page)
{
        if (page_mapped(page))
                return 0;

        return 1;
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 *
 * The rmap walk calls try_to_unmap_one() on each vma and stops early once
 * page_not_mapped() reports the page unmapped.
 *
 * If unmap is successful, return true. Otherwise, false.
 */
bool try_to_unmap(struct page *page, enum ttu_flags flags)
{
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
                .arg = (void *)flags,
                .done = page_not_mapped,
                .anon_lock = page_lock_anon_vma_read,
        };
        bool for_migration = flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE);

        /*
         * During exec, a temporary VMA is setup and later moved.
         * The VMA is moved under the anon_vma lock but not the
         * page tables leading to a race where migration cannot
         * find the migration ptes. Rather than increasing the
         * locking requirements of exec(), migration skips
         * temporary VMAs until after exec() completes.
         */
        if (for_migration && !PageKsm(page) && PageAnon(page))
                rwc.invalid_vma = invalid_migration_vma;

        /* TTU_RMAP_LOCKED selects the walk variant for a pre-locked rmap. */
        if (!(flags & TTU_RMAP_LOCKED))
                rmap_walk(page, &rwc);
        else
                rmap_walk_locked(page, &rwc);

        /*
         * Success is judged from the page's mapcount after the walk.
         *
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and page_remove_rmap(),
         * try_to_unmap() may return false when it is about to become true,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        return page_mapcount(page) == 0;
}

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 *
 * Implemented as an rmap walk that runs try_to_unmap_one() with
 * TTU_MUNLOCK on each vma.
 */
void try_to_munlock(struct page *page)
{
        struct rmap_walk_control rwc = {
                .arg = (void *)TTU_MUNLOCK,
                .rmap_one = try_to_unmap_one,
                .anon_lock = page_lock_anon_vma_read,
                .done = page_not_mapped,
        };

        /* The page must be locked and must not be on the LRU ... */
        VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
        /* ... and a compound page must not be double-mapped. */
        VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

        rmap_walk(page, &rwc);
}

/*
 * __put_anon_vma - free @anon_vma and drop its reference on the root
 * @anon_vma: the anon_vma to free
 *
 * When @anon_vma is not its own root, the root's refcount is decremented
 * as well and the root is freed if that was its last reference.
 */
void __put_anon_vma(struct anon_vma *anon_vma)
{
        /* Read the root pointer before @anon_vma is freed. */
        struct anon_vma *root = anon_vma->root;

        anon_vma_free(anon_vma);

        /* @anon_vma was the root itself: nothing further to release. */
        if (root == anon_vma)
                return;

        if (atomic_dec_and_test(&root->refcount))
                anon_vma_free(root);
}

/*
 * rmap_walk_anon_lock - obtain the read-locked anon_vma for an rmap walk
 * @page: the page being walked
 * @rwc: walk control; its anon_lock hook, when set, takes precedence
 *
 * Returns whatever rwc->anon_lock() returns when that hook is set.
 * Otherwise returns page_anon_vma(@page) with its read lock taken, or NULL
 * when the page has no anon_vma.
 */
static struct anon_vma *rmap_walk_anon_lock(struct page *page,
                                        struct rmap_walk_control *rwc)
{
        struct anon_vma *anon_vma;

        if (rwc->anon_lock)
                return rwc->anon_lock(page);

        /*
         * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
         * because that depends on page_mapped(); but not all its usages
         * are holding mmap_lock. Users without mmap_lock are required to
         * take a reference count to prevent the anon_vma disappearing
         */
        anon_vma = page_anon_vma(page);
        if (anon_vma)
                anon_vma_lock_read(anon_vma);

        return anon_vma;
}

/*
 * rmap_walk_anon - do something to anonymous page using the object-based
 * rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 * @locked: true when the caller already holds the anon_vma lock; the lock
 *          is then neither taken nor released here
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to. For each vma not rejected
 * by rwc->invalid_vma, rwc->rmap_one is called; the walk stops as soon as
 * rmap_one returns false or rwc->done returns non-zero.
 *
 * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
                bool locked)
{
        struct anon_vma *anon_vma;
        pgoff_t first_pgoff, last_pgoff;
        struct anon_vma_chain *avc;

        if (!locked) {
                anon_vma = rmap_walk_anon_lock(page, rwc);
        } else {
                anon_vma = page_anon_vma(page);
                /* anon_vma disappear under us? */
                VM_BUG_ON_PAGE(!anon_vma, page);
        }
        if (!anon_vma)
                return;

        /* Inclusive range of page offsets covered by the page. */
        first_pgoff = page_to_pgoff(page);
        last_pgoff = first_pgoff + thp_nr_pages(page) - 1;

        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
                        first_pgoff, last_pgoff) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long address = vma_address(page, vma);

                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                /* Stop on rmap_one failure, or once done() is satisfied. */
                if (!rwc->rmap_one(page, vma, address, rwc->arg) ||
                    (rwc->done && rwc->done(page)))
                        break;
        }

        if (!locked)
                anon_vma_unlock_read(anon_vma);
}

/*
 * rmap_walk_file - do something to file page using the object-based rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 * @locked: true when the caller already holds the relevant rmap lock; the
 *          mapping's i_mmap lock is then neither taken nor released here
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * For each vma found, @rwc->invalid_vma (if set) may skip it, @rwc->rmap_one
 * is called and a false return ends the walk, and @rwc->done (if set) may end
 * the walk early.
 *
 * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
                bool locked)
{
        struct address_space *mapping = page_mapping(page);
        pgoff_t pgoff_start, pgoff_end;
        struct vm_area_struct *vma;

        /*
         * The page lock not only makes sure that page->mapping cannot
         * suddenly be NULLified by truncation, it makes sure that the
         * structure at mapping cannot be freed and reused yet,
         * so we can safely take mapping->i_mmap_rwsem.
         */
        VM_BUG_ON_PAGE(!PageLocked(page), page);

        if (!mapping)
                return;

        /* Offset range covered by the page (thp_nr_pages() subpages). */
        pgoff_start = page_to_pgoff(page);
        pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
        if (!locked)
                i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap,
                        pgoff_start, pgoff_end) {
                unsigned long address = vma_address(page, vma);

                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                if (!rwc->rmap_one(page, vma, address, rwc->arg))
                        goto done;
                if (rwc->done && rwc->done(page))
                        goto done;
        }

done:
        /* Only drop the lock if it was taken here. */
        if (!locked)
                i_mmap_unlock_read(mapping);
}

/*
 * Dispatch a reverse-map walk to the KSM, anonymous or file-backed walker,
 * depending on the page type. The walker takes the rmap lock itself.
 */
void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
        if (unlikely(PageKsm(page))) {
                rmap_walk_ksm(page, rwc);
                return;
        }

        if (PageAnon(page)) {
                rmap_walk_anon(page, rwc, false);
                return;
        }

        rmap_walk_file(page, rwc, false);
}

/*
 * Same dispatch as rmap_walk(), except that the caller already holds the
 * relevant rmap lock, so the walkers are told not to take it.
 */
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
        /* no ksm support for now */
        VM_BUG_ON_PAGE(PageKsm(page), page);

        if (!PageAnon(page)) {
                rmap_walk_file(page, rwc, true);
                return;
        }

        rmap_walk_anon(page, rwc, true);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following two functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
void hugepage_add_anon_rmap(struct page *page,
                            struct vm_area_struct *vma, unsigned long address)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        /* The page must be locked and the vma must already have an anon_vma. */
        BUG_ON(!PageLocked(page));
        BUG_ON(!anon_vma);

        /*
         * address might be in next vma when migration races vma_adjust.
         * The anon rmap is set up only when the increment of the compound
         * mapcount reports the first mapping.
         */
        if (atomic_inc_and_test(compound_mapcount_ptr(page)))
                __page_set_anon_rmap(page, vma, address, 0);
}

/*
 * Set up the anon rmap for a hugepage at @address in @vma: the compound
 * mapcount is set to 0, the pincount (when available) is reset to 0, and
 * __page_set_anon_rmap() is called with its last argument set to 1.
 * @address must lie inside [vma->vm_start, vma->vm_end).
 */
void hugepage_add_new_anon_rmap(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
{
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        atomic_set(compound_mapcount_ptr(page), 0);
        if (hpage_pincount_available(page))
                atomic_set(compound_pincount_ptr(page), 0);

        __page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SPECIAL_INSNS_H
#define _ASM_X86_SPECIAL_INSNS_H


#ifdef __KERNEL__

#include <asm/nops.h>
#include <asm/processor-flags.h>
#include <linux/irqflags.h>
#include <linux/jump_label.h>

/*
 * The compiler should not reorder volatile asm statements with respect to each
 * other: they should execute in program order. However GCC 4.9.x and 5.x have
 * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder
 * volatile asm. The write functions are not affected since they have memory
 * clobbers preventing reordering. To prevent reads from being reordered with
 * respect to writes, use a dummy memory operand.
 *
 * __FORCE_ORDER is that dummy operand: an "m" input constraint that the
 * control-register read helpers append to their asm statements. Their asm
 * templates never reference it.
 */

#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL)

void native_write_cr0(unsigned long val);

/* Return the current value of CR0. __FORCE_ORDER is the dummy memory input. */
static inline unsigned long native_read_cr0(void)
{
        unsigned long val;
        asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/* Return the current value of CR2. __FORCE_ORDER is the dummy memory input. */
static __always_inline unsigned long native_read_cr2(void)
{
        unsigned long val;
        asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/* Load @val into CR2; the "memory" clobber stops the compiler reordering it. */
static __always_inline void native_write_cr2(unsigned long val)
{
        asm volatile("mov %0,%%cr2": : "r" (val) : "memory");
}

/* Return the raw value of CR3. __FORCE_ORDER is the dummy memory input. */
static inline unsigned long __native_read_cr3(void)
{
        unsigned long val;
        asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/* Load @val into CR3; the "memory" clobber stops the compiler reordering it. */
static inline void native_write_cr3(unsigned long val)
{
        asm volatile("mov %0,%%cr3": : "r" (val) : "memory");
}

/*
 * Return the current value of CR4. On 32-bit builds the read is covered by
 * an exception-table entry and @val is pre-loaded with 0, so a faulting read
 * yields 0.
 */
static inline unsigned long native_read_cr4(void)
{
        unsigned long val;
#ifdef CONFIG_X86_32
        /*
         * This could fault if CR4 does not exist.  Non-existent CR4
         * is functionally equivalent to CR4 == 0.  Keep it simple and pretend
         * that CR4 == 0 on CPUs that don't have CR4.
         */
        asm volatile("1: mov %%cr4, %0\n"
                     "2:\n"
                     _ASM_EXTABLE(1b, 2b)
                     : "=r" (val) : "0" (0), __FORCE_ORDER);
#else
        /* CR4 always exists on x86_64. */
        asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER);
#endif
        return val;
}

void native_write_cr4(unsigned long val);

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/* Read and return the PKRU register using a byte-encoded RDPKRU. */
static inline u32 rdpkru(void)
{
        u32 ecx = 0;
        u32 edx, pkru;

        /*
         * "rdpkru" instruction.  Places PKRU contents in to EAX,
         * clears EDX and requires that ecx=0.
         */
        asm volatile(".byte 0x0f,0x01,0xee\n\t"
                     : "=a" (pkru), "=d" (edx)
                     : "c" (ecx));
        /* edx is only listed as an output because the instruction writes it. */
        return pkru;
}

/* Write @pkru to the PKRU register using a byte-encoded WRPKRU. */
static inline void wrpkru(u32 pkru)
{
        u32 ecx = 0, edx = 0;

        /*
         * "wrpkru" instruction.  Loads contents in EAX to PKRU,
         * requires that ecx = edx = 0.
         */
        asm volatile(".byte 0x0f,0x01,0xef\n\t"
                     : : "a" (pkru), "c"(ecx), "d"(edx));
}

/* Update PKRU to @pkru, skipping the write when the value is already set. */
static inline void __write_pkru(u32 pkru)
{
        /*
         * WRPKRU costs noticeably more than RDPKRU, so read first and
         * only issue the write when the register would actually change.
         */
        if (pkru != rdpkru())
                wrpkru(pkru);
}

#else
/* Stub for builds without CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS: always 0. */
static inline u32 rdpkru(void)
{
        return 0;
}

/* Stub for builds without CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS: no-op. */
static inline void __write_pkru(u32 pkru)
{
}
#endif

/* Execute WBINVD; the "memory" clobber stops the compiler reordering it. */
static inline void native_wbinvd(void)
{
        asm volatile("wbinvd": : :"memory");
}

extern asmlinkage void asm_load_gs_index(unsigned int selector);

/*
 * Call asm_load_gs_index(@selector) between local_irq_save() and
 * local_irq_restore(), so the caller's interrupt state is put back afterwards.
 */
static inline void native_load_gs_index(unsigned int selector)
{
        unsigned long flags;

        local_irq_save(flags);
        asm_load_gs_index(selector);
        local_irq_restore(flags);
}

/* Return CR4 via native_read_cr4(); defined regardless of CONFIG_PARAVIRT_XXL. */
static inline unsigned long __read_cr4(void)
{
        return native_read_cr4();
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else

/* Non-paravirt build: read CR0 through the native accessor. */
static inline unsigned long read_cr0(void)
{
        return native_read_cr0();
}

/* Non-paravirt build: write @x to CR0 through native_write_cr0(). */
static inline void write_cr0(unsigned long x)
{
        native_write_cr0(x);
}

/* Non-paravirt build: read CR2 through the native accessor. */
static __always_inline unsigned long read_cr2(void)
{
        return native_read_cr2();
}

/* Non-paravirt build: write @x to CR2 through the native accessor. */
static __always_inline void write_cr2(unsigned long x)
{
        native_write_cr2(x);
}

/*
 * Careful!  CR3 contains more than just an address.  You probably want
 * read_cr3_pa() instead.
 *
 * Non-paravirt build: returns the raw CR3 value from __native_read_cr3().
 */
static inline unsigned long __read_cr3(void)
{
        return __native_read_cr3();
}

/* Non-paravirt build: write @x to CR3 through the native accessor. */
static inline void write_cr3(unsigned long x)
{
        native_write_cr3(x);
}

/* Non-paravirt build: write @x to CR4 through native_write_cr4(). */
static inline void __write_cr4(unsigned long x)
{
        native_write_cr4(x);
}

/* Non-paravirt build: execute WBINVD through native_wbinvd(). */
static inline void wbinvd(void)
{
        native_wbinvd();
}

#ifdef CONFIG_X86_64

/* Non-paravirt 64-bit build: load the GS selector via native_load_gs_index(). */
static inline void load_gs_index(unsigned int selector)
{
        native_load_gs_index(selector);
}

#endif

#endif /* CONFIG_PARAVIRT_XXL */

/*
 * Issue CLFLUSH on the byte at @__p. The "+m" operand tells the compiler
 * that the memory at @__p is both read and written by the instruction.
 */
static inline void clflush(volatile void *__p)
{
        asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
}

/*
 * Flush the cache line containing @__p. Two encodings are handed to
 * alternative_io(): a NOP_DS_PREFIX-prefixed clflush and a 0x66-prefixed
 * clflush, the latter tied to X86_FEATURE_CLFLUSHOPT.
 * NOTE(review): presumably the 0x66 form is the one used when the CPU has
 * CLFLUSHOPT - confirm against alternative_io().
 */
static inline void clflushopt(volatile void *__p)
{
        alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0",
                       ".byte 0x66; clflush %P0",
                       X86_FEATURE_CLFLUSHOPT,
                       "+m" (*(volatile char __force *)__p));
}

/*
 * Cache-line write-back for the line at @__p. Three encodings are handed to
 * ALTERNATIVE_2(): a prefixed clflush, clflushopt (tied to
 * X86_FEATURE_CLFLUSHOPT) and clwb (tied to X86_FEATURE_CLWB). The pointer
 * is passed in the "a" register because the byte-encoded forms address
 * memory through it.
 */
static inline void clwb(volatile void *__p)
{
        /* View the target as a 64-byte object so "+m" covers that many bytes. */
        volatile struct { char x[64]; } *p = __p;

        asm volatile(ALTERNATIVE_2(
                ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
                ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
                X86_FEATURE_CLFLUSHOPT,
                ".byte 0x66, 0x0f, 0xae, 0x30",  /* clwb (%%rax) */
                X86_FEATURE_CLWB)
                : [p] "+m" (*p)
                : [pax] "a" (p));
}

/* Emit a single NOP instruction that the compiler may not drop (volatile asm). */
#define nop() asm volatile ("nop")

/*
 * Execute the SERIALIZE instruction, emitted as raw bytes so that older
 * assemblers can build it; the "memory" clobber stops compiler reordering.
 */
static inline void serialize(void)
{
        /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */
        asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory");
}

/*
 * Issue MOVDIR64B to move the 64 bytes at @src to @dst.
 * The dst parameter must be 64-bytes aligned
 */
static inline void movdir64b(void *dst, const void *src)
{
        /* 64-byte views so the "m" operands describe the full access size. */
        const struct { char _[64]; } *__src = src;
        struct { char _[64]; } *__dst = dst;

        /*
         * MOVDIR64B (%rdx), %rax - source is addressed through RDX and the
         * destination address is in RAX, matching the "d"/"a" constraints.
         *
         * Both __src and __dst must be memory constraints in order to tell the
         * compiler that no other memory accesses should be reordered around
         * this one.
         *
         * Also, both must be supplied as lvalues because this tells
         * the compiler what the object is (its size) the instruction accesses.
         * I.e., not the pointers but what they point to, thus the deref'ing '*'.
         */
        asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
                     : "+m" (*__dst)
                     :  "m" (*__src), "a" (__dst), "d" (__src));
}

/**
 * enqcmds - Enqueue a command in supervisor (CPL0) mode
 * @dst: destination, in MMIO space (must be 512-bit aligned)
 * @src: 512 bits memory operand
 *
 * The ENQCMDS instruction allows software to write a 512-bit command to
 * a 512-bit-aligned special MMIO region that supports the instruction.
 * A return status is loaded into the ZF flag in the RFLAGS register.
 * ZF = 0 equates to success, and ZF = 1 indicates retry or error.
 *
 * This function issues the ENQCMDS instruction to submit data from
 * kernel space to MMIO space, in a unit of 512 bits. Order of data access
 * is not guaranteed, nor is a memory barrier performed afterwards. It
 * returns 0 on success and -EAGAIN on failure.
 *
 * Warning: Do not use this helper unless your driver has checked that the
 * ENQCMDS instruction is supported on the platform and the device accepts
 * ENQCMDS.
 */
static inline int enqcmds(void __iomem *dst, const void *src)
{
        /* 64-byte views so the "m" operands describe the full access size. */
        const struct { char _[64]; } *__src = src;
        struct { char _[64]; } __iomem *__dst = dst;
        bool zf;

        /*
         * ENQCMDS (%rdx), %rax - source is addressed through RDX and the
         * destination address is in RAX, matching the "d"/"a" constraints.
         *
         * See movdir64b()'s comment on operand specification.
         *
         * NOTE(review): the trailing 0x66, 0x90 bytes look like a two-byte
         * NOP appended after the instruction - confirm why it is needed.
         */
        asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90"
                     CC_SET(z)
                     : CC_OUT(z) (zf), "+m" (*__dst)
                     : "m" (*__src), "a" (__dst), "d" (__src));

        /* Submission failure is indicated via EFLAGS.ZF=1 */
        if (zf)
                return -EAGAIN;

        return 0;
}

#endif /* __KERNEL__ */

#endif /* _ASM_X86_SPECIAL_INSNS_H */









































    1 





























    1 





























    1 
















    1 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This header provides generic wrappers for memory access instrumentation that
 * the compiler cannot emit for: KASAN, KCSAN.
 */
#ifndef _LINUX_INSTRUMENTED_H
#define _LINUX_INSTRUMENTED_H

#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>
#include <linux/types.h>

/**
 * instrument_read - instrument regular read access
 *
 * Instrument a regular read access. The instrumentation should be inserted
 * before the actual read happens.
 *
 * @v: address of access
 * @size: size of access
 */
static __always_inline void instrument_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_read(v, size);
}

/**
 * instrument_write - instrument regular write access
 *
 * Instrument a regular write access. The instrumentation should be inserted
 * before the actual write happens.
 *
 * @v: address of access
 * @size: size of access
 */
static __always_inline void instrument_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_write(v, size);
}

/**
 * instrument_read_write - instrument regular read-write access
 *
 * Instrument a regular read-write access. The instrumentation should be
 * inserted before the actual access happens. KASAN is given a write check
 * and KCSAN a combined read-write check.
 *
 * @v: address of access
 * @size: size of access
 */
static __always_inline void instrument_read_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_read_write(v, size);
}

/**
 * instrument_atomic_read - instrument atomic read access
 *
 * Instrument an atomic read access. The instrumentation should be inserted
 * before the actual read happens.
 *
 * @v: address of access
 * @size: size of access
 */
static __always_inline void instrument_atomic_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_atomic_read(v, size);
}

/**
 * instrument_atomic_write - instrument atomic write access
 *
 * Instrument an atomic write access. The instrumentation should be inserted
 * before the actual write happens.
 *
 * @v: address of access
 * @size: size of access
 */
static __always_inline void instrument_atomic_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_atomic_write(v, size);
}

/**
 * instrument_atomic_read_write - instrument atomic read-write access
 *
 * Instrument an atomic read-write access. The instrumentation should be
 * inserted before the actual write happens. KASAN is given a write check
 * and KCSAN a combined atomic read-write check.
 *
 * @v: address of access
 * @size: size of access
 */
static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_atomic_read_write(v, size);
}

/**
 * instrument_copy_to_user - instrument reads of copy_to_user
 *
 * Instrument reads from kernel memory, that are due to copy_to_user (and
 * variants). The instrumentation must be inserted before the accesses.
 * Only the kernel-side source buffer is checked; @to is not inspected.
 *
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 */
static __always_inline void
instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        kasan_check_read(from, n);
        kcsan_check_read(from, n);
}

/**
 * instrument_copy_from_user - instrument writes of copy_from_user
 *
 * Instrument writes to kernel memory, that are due to copy_from_user (and
 * variants). The instrumentation should be inserted before the accesses.
 * Only the kernel-side destination buffer is checked; @from is not inspected.
 *
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 */
static __always_inline void
instrument_copy_from_user(const void *to, const void __user *from, unsigned long n)
{
        kasan_check_write(to, n);
        kcsan_check_write(to, n);
}

#endif /* _LINUX_INSTRUMENTED_H */


































































































































































































































































































































































































































































































































































































































































































































































































































    1 










































    1 


    1 















    1 





    1 
    1 





































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *        (jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

/*
 * Direct I/O is refused for encrypted inodes (when fs encryption is built
 * in), inodes with fs-verity active, inodes that journal their data, and
 * inodes holding inline data. The checks run in that order and stop at the
 * first one that rules direct I/O out.
 */
static bool ext4_dio_supported(struct inode *inode)
{
        return !(IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) &&
               !fsverity_active(inode) &&
               !ext4_should_journal_data(inode) &&
               !ext4_has_inline_data(inode);
}

/*
 * Direct I/O read. Takes the inode lock shared (try-lock only under
 * IOCB_NOWAIT, returning -EAGAIN on contention) and either runs the read
 * through iomap_dio_rw() or, if direct I/O is unsupported for this inode,
 * drops the lock and falls back to the buffered read path.
 */
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t nread;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                inode_lock_shared(inode);
        else if (!inode_trylock_shared(inode))
                return -EAGAIN;

        if (ext4_dio_supported(inode)) {
                nread = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
                                     is_sync_kiocb(iocb));
                inode_unlock_shared(inode);

                file_accessed(iocb->ki_filp);
                return nread;
        }

        inode_unlock_shared(inode);
        /*
         * Direct I/O cannot service this inode, so hand the request to
         * the buffered path. IOCB_DIRECT has to be cleared first,
         * otherwise generic_file_read_iter() would take its own direct
         * I/O branch.
         */
        iocb->ki_flags &= ~IOCB_DIRECT;
        return generic_file_read_iter(iocb, to);
}

#ifdef CONFIG_FS_DAX
/*
 * DAX read. Takes the inode lock shared (try-lock only under IOCB_NOWAIT,
 * returning -EAGAIN on contention), rechecks IS_DAX() under the lock and
 * either reads through dax_iomap_rw() or falls back to the buffered path.
 */
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t nread;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                inode_lock_shared(inode);
        else if (!inode_trylock_shared(inode))
                return -EAGAIN;

        /*
         * Recheck under inode lock - at this point we are sure it cannot
         * change anymore
         */
        if (IS_DAX(inode)) {
                nread = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
                inode_unlock_shared(inode);

                file_accessed(iocb->ki_filp);
                return nread;
        }

        inode_unlock_shared(inode);
        /* Fallback to buffered IO in case we cannot support DAX */
        return generic_file_read_iter(iocb, to);
}
#endif

/*
 * Read entry point: fails with -EIO after a forced shutdown, returns 0 for
 * empty reads, then picks the DAX, direct or buffered read path.
 */
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        /* Nothing to read: return early, which also skips the atime update. */
        if (iov_iter_count(to) == 0)
                return 0;

#ifdef CONFIG_FS_DAX
        if (IS_DAX(inode))
                return ext4_dax_read_iter(iocb, to);
#endif
        return (iocb->ki_flags & IOCB_DIRECT) ?
                ext4_dio_read_iter(iocb, to) :
                generic_file_read_iter(iocb, to);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 *
 * Always returns 0.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
        /* Allocate delalloc blocks now if that was requested for close time. */
        if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
                ext4_alloc_da_blocks(inode);
                ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
        }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
                        (atomic_read(&inode->i_writecount) == 1) &&
                        !EXT4_I(inode)->i_reserved_data_blocks) {
                /* Preallocations are discarded with i_data_sem held for write. */
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode, 0);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        /* Free the htree directory info hanging off filp->private_data, if any. */
        if (is_dx(inode) && filp->private_data)
                ext4_htree_free_dir_info(filp->private_data);

        return 0;
}

/*
 * Report whether an I/O is not aligned to the filesystem block size, i.e.
 * whether the file position or the iterator's alignment has any bit set
 * below the block size.
 *
 * Ext4 fills holes under direct IO with unwritten extents that only become
 * written once the IO completes. Until then the blocks still look like
 * holes, so dio_zero_block() assumes it must zero the head and/or tail of
 * the block. Two AIO threads working on the same unwritten block therefore
 * have to be synchronized, or one will zero the other's data and corrupt it.
 */
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
        unsigned long blockmask = inode->i_sb->s_blocksize - 1;

        return ((pos | iov_iter_alignment(from)) & blockmask) != 0;
}

/*
 * Report whether an I/O of @len bytes at @offset ends beyond the in-memory
 * inode size or beyond the ext4 i_disksize.
 */
static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
        return offset + len > i_size_read(inode) ||
               offset + len > EXT4_I(inode)->i_disksize;
}

/*
 * Is IO overwriting allocated and initialized blocks?
 *
 * Returns false when the range [pos, pos + len) ends beyond i_size.
 * Otherwise looks up the block mapping (no handle, flags 0) and returns true
 * only if the lookup covered every block in the range and the mapping carries
 * EXT4_MAP_MAPPED.
 */
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
{
        struct ext4_map_blocks map;
        unsigned int blkbits = inode->i_blkbits;
        int err, blklen;

        if (pos + len > i_size_read(inode))
                return false;

        map.m_lblk = pos >> blkbits;
        map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
        /* Remember the requested block count to compare against the result. */
        blklen = map.m_len;

        err = ext4_map_blocks(NULL, inode, &map, 0);
        /*
         * 'err == blklen' means that all of the blocks have been preallocated,
         * regardless of whether they have been initialized or not. To exclude
         * unwritten extents, we need to check m_flags.
         */
        return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}

/*
 * Generic write checks plus the ext4-specific ones: -EPERM on immutable
 * inodes, and for inodes without the extents flag the write is capped at
 * s_bitmap_maxbytes (-EFBIG if it starts at or past that limit).
 * Returns the number of bytes left to write, or a value <= 0 from
 * generic_write_checks().
 */
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
                                         struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        struct ext4_sb_info *sbi;
        ssize_t ret;

        if (unlikely(IS_IMMUTABLE(inode)))
                return -EPERM;

        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                return ret;

        /* Extent-mapped files are only bounded by the generic checks. */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return iov_iter_count(from);

        /*
         * Bitmap-format file: its size limit is smaller than s_maxbytes,
         * which applies to extent-mapped files.
         */
        sbi = EXT4_SB(inode->i_sb);
        if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
                return -EFBIG;
        iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);

        return iov_iter_count(from);
}

/*
 * Run ext4_generic_write_checks() and then file_modified(). Returns the
 * byte count from the checks, a value <= 0 from them, or the non-zero
 * result of file_modified().
 */
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t nbytes, err;

        nbytes = ext4_generic_write_checks(iocb, from);
        if (nbytes <= 0)
                return nbytes;

        err = file_modified(iocb->ki_filp);

        return err ? err : nbytes;
}

/*
 * Buffered write path.  IOCB_NOWAIT is not supported here and yields
 * -EOPNOTSUPP.  The write itself runs under the exclusive inode lock;
 * on success ki_pos is advanced and generic_write_sync() is invoked.
 *
 * Returns bytes written (as adjusted by generic_write_sync()), 0, or a
 * negative errno.
 */
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
                                        struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t written;

        if (iocb->ki_flags & IOCB_NOWAIT)
                return -EOPNOTSUPP;

        inode_lock(inode);
        written = ext4_write_checks(iocb, from);
        if (written > 0) {
                current->backing_dev_info = inode_to_bdi(inode);
                written = generic_perform_write(iocb->ki_filp, from,
                                                iocb->ki_pos);
                current->backing_dev_info = NULL;
        }
        inode_unlock(inode);

        if (unlikely(written <= 0))
                return written;

        iocb->ki_pos += written;
        return generic_write_sync(iocb, written);
}

/*
 * Post-process a write that was set up as extending: update the inode size,
 * take the inode off the orphan list again, and trim blocks that were
 * allocated beyond what was actually written.
 *
 * @inode:   inode that was written to
 * @offset:  file position at which the write started
 * @written: bytes written by the I/O, or a negative errno from it
 * @count:   bytes the caller intended to write
 *
 * Returns @written as passed in when nothing fails here (so a negative I/O
 * error is handed back unchanged), or the negative errno from starting the
 * journal handle or marking the inode dirty.
 */
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
                                           ssize_t written, size_t count)
{
        handle_t *handle;
        bool truncate = false;
        u8 blkbits = inode->i_blkbits;
        /*
         * These hold block-aligned *byte* offsets, so they must be loff_t.
         * A 32-bit ext4_lblk_t would silently truncate them for writes at or
         * beyond 4 GiB and could make the comparison below miss a truncate.
         */
        loff_t written_end, io_end;
        int ret;

        /*
         * Note that EXT4_I(inode)->i_disksize can get extended up to
         * inode->i_size while the I/O was running due to writeback of delalloc
         * blocks. But, the code in ext4_iomap_alloc() is careful to use
         * zeroed/unwritten extents if this is possible; thus we won't leave
         * uninitialized blocks in a file even if we didn't succeed in writing
         * as much as we intended.
         */
        WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
        if (offset + count <= EXT4_I(inode)->i_disksize) {
                /*
                 * We need to ensure that the inode is removed from the orphan
                 * list if it has been added prematurely, due to writeback of
                 * delalloc blocks.
                 */
                if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
                        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

                        if (IS_ERR(handle)) {
                                /* No handle: only the in-memory list entry. */
                                ext4_orphan_del(NULL, inode);
                                return PTR_ERR(handle);
                        }

                        ext4_orphan_del(handle, inode);
                        ext4_journal_stop(handle);
                }

                return written;
        }

        /* The I/O itself failed: just clean up what was allocated. */
        if (written < 0)
                goto truncate;

        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle)) {
                written = PTR_ERR(handle);
                goto truncate;
        }

        if (ext4_update_inode_size(inode, offset + written)) {
                ret = ext4_mark_inode_dirty(handle, inode);
                if (unlikely(ret)) {
                        written = ret;
                        ext4_journal_stop(handle);
                        goto truncate;
                }
        }

        /*
         * We may need to truncate allocated but not written blocks beyond EOF.
         */
        written_end = ALIGN(offset + written, 1 << blkbits);
        io_end = ALIGN(offset + count, 1 << blkbits);
        if (written_end < io_end && ext4_can_truncate(inode))
                truncate = true;

        /*
         * Remove the inode from the orphan list if it has been extended and
         * everything went OK.
         */
        if (!truncate && inode->i_nlink)
                ext4_orphan_del(handle, inode);
        ext4_journal_stop(handle);

        if (truncate) {
truncate:
                ext4_truncate_failed_write(inode);
                /*
                 * If the truncate operation failed early, then the inode may
                 * still be on the orphan list. In that case, we need to try
                 * to remove the inode from the in-memory linked list.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return written;
}

/*
 * iomap direct I/O completion callback for writes.
 *
 * Passes an incoming @error straight through.  Otherwise converts unwritten
 * extents covering the written range when IOMAP_DIO_UNWRITTEN is set, and
 * bumps the in-core i_size if the write ended beyond it.
 *
 * Returns 0 on success or a negative errno.
 */
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
                                 int error, unsigned int flags)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        loff_t start = iocb->ki_pos;
        loff_t end;
        int err;

        if (error)
                return error;

        if ((flags & IOMAP_DIO_UNWRITTEN) && size) {
                err = ext4_convert_unwritten_extents(NULL, inode, start, size);
                if (err < 0)
                        return err;
        }

        /*
         * For an extending write i_size has to be updated right here, before
         * iomap_dio_rw() invalidates the page cache; otherwise a racing
         * buffered read could zero out too much of a page cache page.  The
         * on-disk size is updated later in ext4_dio_write_iter(), which has
         * the information needed for orphan list handling as well.
         *
         * Extending writes are always performed synchronously with i_rwsem
         * held exclusively, so updating i_size here is safe in that case.
         * For a non-extending write we can never observe end > i_size,
         * because anything that shrinks i_size (e.g. truncate) waits for all
         * outstanding DIO before touching i_size.
         */
        end = start + size;
        if (end > i_size_read(inode))
                i_size_write(inode, end);

        return 0;
}

/*
 * Direct I/O hooks that ext4_dio_write_iter() hands to iomap_dio_rw();
 * only the end_io completion callback is provided.
 */
static const struct iomap_dio_ops ext4_dio_write_ops = {
        .end_io = ext4_dio_write_end_io,
};

/*
 * The intention here is to start with shared lock acquired then see if any
 * condition requires an exclusive inode lock. If yes, then we restart the
 * whole operation by releasing the shared lock and acquiring exclusive lock.
 *
 * - For unaligned_io we never take shared lock as it may cause data corruption
 *   when two unaligned IO tries to modify the same block e.g. while zeroing.
 *
 * - For extending writes case we don't take the shared lock, since it requires
 *   updating inode i_disksize and/or orphan handling with exclusive lock.
 *
 * - shared locking will only be true mostly with overwrites. Otherwise we will
 *   switch to exclusive i_rwsem lock.
 *
 * @iocb:         kiocb of the write; ki_pos and ki_flags are consulted.
 * @from:         data to write; may be truncated by the generic checks.
 * @ilock_shared: in/out; true while i_rwsem is held shared, cleared when this
 *                function switches to the exclusive lock.
 * @extend:       out; set to true when ext4_extending_io() reports an
 *                extending write (never cleared here).
 *
 * Must be entered with i_rwsem held in the mode given by *@ilock_shared.
 * Returns the byte count to write (> 0) with the lock still held, or a value
 * <= 0 after the lock has been released.
 */
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
                                     bool *ilock_shared, bool *extend)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        loff_t offset;
        size_t count;
        ssize_t ret;

        /*
         * The checks are repeated from here after a lock upgrade, because
         * i_rwsem was released in between.
         */
restart:
        ret = ext4_generic_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        offset = iocb->ki_pos;
        count = ret;
        if (ext4_extending_io(inode, offset, count))
                *extend = true;
        /*
         * Determine whether the IO operation will overwrite allocated
         * and initialized blocks.
         * We need exclusive i_rwsem for changing security info
         * in file_modified().
         */
        if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
             !ext4_overwrite_io(inode, offset, count))) {
                /* Upgrading would block in inode_lock(); NOWAIT bails out. */
                if (iocb->ki_flags & IOCB_NOWAIT) {
                        ret = -EAGAIN;
                        goto out;
                }
                inode_unlock_shared(inode);
                *ilock_shared = false;
                inode_lock(inode);
                goto restart;
        }

        ret = file_modified(file);
        if (ret < 0)
                goto out;

        /* Success: the lock stays held for the caller. */
        return count;
out:
        /* Failure or nothing to write: drop whichever lock mode is held. */
        if (*ilock_shared)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);
        return ret;
}

/*
 * Direct I/O write path.
 *
 * Chooses shared or exclusive i_rwsem, falls back to the buffered path when
 * ext4_dio_supported() refuses the inode, runs ext4_dio_write_checks(), puts
 * the inode on the orphan list for extending writes, and issues the I/O via
 * iomap_dio_rw().  Data still left in @from afterwards is written through
 * ext4_buffered_write_iter() and the affected page cache range is then
 * written back and invalidated.
 *
 * Returns the number of bytes written, 0, or a negative errno.
 */
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t ret;
        handle_t *handle;
        struct inode *inode = file_inode(iocb->ki_filp);
        loff_t offset = iocb->ki_pos;
        size_t count = iov_iter_count(from);
        const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
        bool extend = false, unaligned_io = false;
        bool ilock_shared = true;

        /*
         * We initially start with shared inode lock unless it is
         * unaligned IO which needs exclusive lock anyways.
         */
        if (ext4_unaligned_io(inode, from, offset)) {
                unaligned_io = true;
                ilock_shared = false;
        }
        /*
         * Quick check here without any i_rwsem lock to see if it is extending
         * IO. A more reliable check is done in ext4_dio_write_checks() with
         * proper locking in place.
         */
        if (offset + count > i_size_read(inode))
                ilock_shared = false;

        /* NOWAIT callers only try the lock and give up with -EAGAIN. */
        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (ilock_shared) {
                        if (!inode_trylock_shared(inode))
                                return -EAGAIN;
                } else {
                        if (!inode_trylock(inode))
                                return -EAGAIN;
                }
        } else {
                if (ilock_shared)
                        inode_lock_shared(inode);
                else
                        inode_lock(inode);
        }

        /* Fallback to buffered I/O if the inode does not support direct I/O. */
        if (!ext4_dio_supported(inode)) {
                if (ilock_shared)
                        inode_unlock_shared(inode);
                else
                        inode_unlock(inode);
                return ext4_buffered_write_iter(iocb, from);
        }

        ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
        /* On this path ext4_dio_write_checks() has already dropped i_rwsem. */
        if (ret <= 0)
                return ret;

        /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
        if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
                ret = -EAGAIN;
                goto out;
        }
        /*
         * Make sure inline data cannot be created anymore since we are going
         * to allocate blocks for DIO. We know the inode does not have any
         * inline data now because ext4_dio_supported() checked for that.
         */
        ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);

        /* The checks may have moved ki_pos and shortened the iterator. */
        offset = iocb->ki_pos;
        count = ret;

        /*
         * Unaligned direct IO must be serialized among each other as zeroing
         * of partial blocks of two competing unaligned IOs can result in data
         * corruption.
         *
         * So we make sure we don't allow any unaligned IO in flight.
         * For IOs where we need not wait (like unaligned non-AIO DIO),
         * below inode_dio_wait() may anyway become a no-op, since we start
         * with exclusive lock.
         */
        if (unaligned_io)
                inode_dio_wait(inode);

        /*
         * Extending write: put the inode on the orphan list before the I/O;
         * ext4_handle_inode_extension() below deals with taking it off again.
         */
        if (extend) {
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }

                ret = ext4_orphan_add(handle, inode);
                if (ret) {
                        ext4_journal_stop(handle);
                        goto out;
                }

                ext4_journal_stop(handle);
        }

        /*
         * Still holding the shared lock means the range passed the overwrite
         * test in ext4_dio_write_checks(), so the overwrite ops are used.
         */
        if (ilock_shared)
                iomap_ops = &ext4_iomap_overwrite_ops;
        ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
                           is_sync_kiocb(iocb) || unaligned_io || extend);
        /*
         * -ENOTBLK is mapped to 0 so that whatever remains in @from is
         * handled by the buffered fallback below.
         */
        if (ret == -ENOTBLK)
                ret = 0;

        if (extend)
                ret = ext4_handle_inode_extension(inode, offset, ret, count);

out:
        if (ilock_shared)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);

        /* Complete any remainder through the page cache. */
        if (ret >= 0 && iov_iter_count(from)) {
                ssize_t err;
                loff_t endbyte;

                offset = iocb->ki_pos;
                err = ext4_buffered_write_iter(iocb, from);
                if (err < 0)
                        return err;

                /*
                 * We need to ensure that the pages within the page cache for
                 * the range covered by this I/O are written to disk and
                 * invalidated. This is in attempt to preserve the expected
                 * direct I/O semantics in the case we fallback to buffered I/O
                 * to complete off the I/O request.
                 */
                ret += err;
                endbyte = offset + err - 1;
                err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
                                                   offset, endbyte);
                if (!err)
                        invalidate_mapping_pages(iocb->ki_filp->f_mapping,
                                                 offset >> PAGE_SHIFT,
                                                 endbyte >> PAGE_SHIFT);
        }

        return ret;
}

#ifdef CONFIG_FS_DAX
/*
 * DAX write path.  Runs entirely under the exclusive inode lock (only
 * try-locked for IOCB_NOWAIT).  A write reaching past i_disksize first puts
 * the inode on the orphan list and is finished off by
 * ext4_handle_inode_extension().
 *
 * Returns bytes written (as adjusted by generic_write_sync()), 0, or a
 * negative errno.
 */
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        bool extend = false;
        handle_t *handle;
        loff_t pos;
        size_t len;
        ssize_t ret;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                inode_lock(inode);
        else if (!inode_trylock(inode))
                return -EAGAIN;

        ret = ext4_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        pos = iocb->ki_pos;
        len = iov_iter_count(from);

        if (pos + len > EXT4_I(inode)->i_disksize) {
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }

                /* The handle is stopped whether or not the add succeeded. */
                ret = ext4_orphan_add(handle, inode);
                ext4_journal_stop(handle);
                if (ret)
                        goto out;

                extend = true;
        }

        ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

        if (extend)
                ret = ext4_handle_inode_extension(inode, pos, ret, len);
out:
        inode_unlock(inode);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        return ret;
}
#endif

/*
 * ->write_iter entry point: refuses with -EIO after a forced shutdown, then
 * dispatches to the DAX, direct or buffered write implementation.
 */
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        if (unlikely(ext4_forced_shutdown(sbi)))
                return -EIO;

#ifdef CONFIG_FS_DAX
        /* DAX takes precedence over the IOCB_DIRECT test below. */
        if (IS_DAX(inode))
                return ext4_dax_write_iter(iocb, from);
#endif
        if (iocb->ki_flags & IOCB_DIRECT)
                return ext4_dio_write_iter(iocb, from);

        return ext4_buffered_write_iter(iocb, from);
}

#ifdef CONFIG_FS_DAX
/*
 * Handle a DAX fault of size @pe_size described by @vmf.
 *
 * Write faults on shared mappings are bracketed by sb_start_pagefault() /
 * sb_end_pagefault(), update the file times and run inside a journal handle;
 * every other fault only takes i_mmap_sem for reading.  A write fault that
 * fails with -ENOSPC is retried for as long as ext4_should_retry_alloc()
 * agrees.
 *
 * Returns the vm_fault_t produced by dax_iomap_fault() (or by
 * dax_finish_sync_fault() for synchronous faults), or VM_FAULT_SIGBUS when
 * the journal handle cannot be started.
 */
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
                enum page_entry_size pe_size)
{
        int error = 0;
        vm_fault_t result;
        int retries = 0;
        handle_t *handle = NULL;
        struct inode *inode = file_inode(vmf->vma->vm_file);
        struct super_block *sb = inode->i_sb;

        /*
         * We have to distinguish real writes from writes which will result in a
         * COW page; COW writes should *not* poke the journal (the file will not
         * be changed). Doing so would cause unintended failures when mounted
         * read-only.
         *
         * We check for VM_SHARED rather than vmf->cow_page since the latter is
         * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
         * other sizes, dax_iomap_fault will handle splitting / fallback so that
         * we eventually come back with a COW page.
         */
        bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
                (vmf->vma->vm_flags & VM_SHARED);
        pfn_t pfn;

        if (write) {
                sb_start_pagefault(sb);
                file_update_time(vmf->vma->vm_file);
                down_read(&EXT4_I(inode)->i_mmap_sem);
                /*
                 * i_mmap_sem stays held across retries; only the journal
                 * handle is restarted.
                 */
retry:
                handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                               EXT4_DATA_TRANS_BLOCKS(sb));
                if (IS_ERR(handle)) {
                        up_read(&EXT4_I(inode)->i_mmap_sem);
                        sb_end_pagefault(sb);
                        return VM_FAULT_SIGBUS;
                }
        } else {
                down_read(&EXT4_I(inode)->i_mmap_sem);
        }
        result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
        if (write) {
                ext4_journal_stop(handle);

                /* Out of space: retry while the allocator says it may help. */
                if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
                    ext4_should_retry_alloc(sb, &retries))
                        goto retry;
                /* Handling synchronous page fault? */
                if (result & VM_FAULT_NEEDDSYNC)
                        result = dax_finish_sync_fault(vmf, pe_size, pfn);
                up_read(&EXT4_I(inode)->i_mmap_sem);
                sb_end_pagefault(sb);
        } else {
                up_read(&EXT4_I(inode)->i_mmap_sem);
        }

        return result;
}

/* PTE-sized entry point: forwards to ext4_dax_huge_fault() with PE_SIZE_PTE. */
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
        return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}

/*
 * VM operations installed by ext4_file_mmap() for DAX inodes.  Every hook
 * ends up in ext4_dax_huge_fault(); the PTE-sized ones go through
 * ext4_dax_fault().
 */
static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault                = ext4_dax_fault,
        .huge_fault        = ext4_dax_huge_fault,
        .page_mkwrite        = ext4_dax_fault,
        .pfn_mkwrite        = ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops        ext4_file_vm_ops
#endif

/*
 * VM operations installed by ext4_file_mmap() for non-DAX inodes.  When
 * CONFIG_FS_DAX is disabled, ext4_dax_vm_ops is an alias for this table.
 */
static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault                = ext4_filemap_fault,
        .map_pages        = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
};

/*
 * ->mmap handler.  Fails with -EIO after a forced shutdown and with
 * -EOPNOTSUPP when daxdev_mapping_supported() rejects the mapping;
 * otherwise marks the file accessed and installs the DAX or regular
 * vm_ops (DAX mappings additionally get VM_HUGEPAGE).
 */
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        if (unlikely(ext4_forced_shutdown(sbi)))
                return -EIO;

        /*
         * Synchronous mappings are not supported for non-DAX files, nor for
         * DAX files whose underlying dax_device is not synchronous.
         */
        if (!daxdev_mapping_supported(vma, sbi->s_daxdev))
                return -EOPNOTSUPP;

        file_accessed(file);

        if (!IS_DAX(file_inode(file))) {
                vma->vm_ops = &ext4_file_vm_ops;
                return 0;
        }

        vma->vm_ops = &ext4_dax_vm_ops;
        vma->vm_flags |= VM_HUGEPAGE;
        return 0;
}

static int ext4_sample_last_mounted(struct super_block *sb,
                                    struct vfsmount *mnt)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct path path;
        char buf[64], *cp;
        handle_t *handle;
        int err;

        if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
                return 0;

        if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
                return 0;

        ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
        /*
         * Sample where the filesystem has been mounted and
         * store it in the superblock for sysadmin convenience
         * when trying to sort through large numbers of block
         * devices or filesystem images.
         */
        memset(buf, 0, sizeof(buf));
        path.mnt = mnt;
        path.dentry = mnt->mnt_root;
        cp = d_path(&path, buf, sizeof(buf));
        err = 0;
        if (IS_ERR(cp))
                goto out;

        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
        err = PTR_ERR(handle);
        if (IS_ERR(handle))
                goto out;
        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
        err = ext4_journal_get_write_access(handle, sbi->s_sbh);
        if (err)
                goto out_journal;
        strncpy(sbi->s_es->s_last_mounted, cp,
                sizeof(sbi->s_es->s_last_mounted));
        ext4_handle_dirty_super(handle, sb);
out_journal:
        ext4_journal_stop(handle);
out:
        sb_end_intwrite(sb);
        return err;
}

/*
 * ->open handler.  Runs, in order: the forced-shutdown check, mount point
 * sampling, the fscrypt and fsverity open hooks and, for writers, attaching
 * the jbd2 inode.  Finally advertises FMODE_NOWAIT | FMODE_BUF_RASYNC and
 * hands over to dquot_file_open().
 *
 * Returns 0 or a negative errno from the first step that fails.
 */
static int ext4_file_open(struct inode *inode, struct file *filp)
{
        int err;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;

        /* Each step only runs if every previous one succeeded. */
        err = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
        if (!err)
                err = fscrypt_file_open(inode, filp);
        if (!err)
                err = fsverity_file_open(inode, filp);
        if (err)
                return err;

        /*
         * Set up the jbd2_inode if we are opening the inode for
         * writing and the journal is present
         */
        if (filp->f_mode & FMODE_WRITE) {
                err = ext4_inode_attach_jinode(inode);
                if (err < 0)
                        return err;
        }

        filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
        return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() copes with both block-mapped and extent-mapped files by
 * picking the matching maxbytes value.  SEEK_HOLE and SEEK_DATA are resolved
 * through iomap under the shared inode lock; every other whence value is
 * delegated to generic_file_llseek_size().
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxbytes;

        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                maxbytes = inode->i_sb->s_maxbytes;
        else
                maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;

        if (whence != SEEK_HOLE && whence != SEEK_DATA)
                return generic_file_llseek_size(file, offset, whence,
                                                maxbytes, i_size_read(inode));

        inode_lock_shared(inode);
        if (whence == SEEK_HOLE)
                offset = iomap_seek_hole(inode, offset,
                                         &ext4_iomap_report_ops);
        else
                offset = iomap_seek_data(inode, offset,
                                         &ext4_iomap_report_ops);
        inode_unlock_shared(inode);

        /* A negative offset is an errno from the iomap seek helpers. */
        if (offset < 0)
                return offset;
        return vfs_setpos(file, offset, maxbytes);
}

/*
 * file_operations table exported from this file.  It wires up the ext4
 * llseek/read/write/mmap/open/release/fsync/fallocate entry points together
 * with generic splice, iopoll and THP helpers; MAP_SYNC is the only extra
 * mmap flag advertised.
 */
const struct file_operations ext4_file_operations = {
        .llseek                = ext4_llseek,
        .read_iter        = ext4_file_read_iter,
        .write_iter        = ext4_file_write_iter,
        .iopoll                = iomap_dio_iopoll,
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl        = ext4_compat_ioctl,
#endif
        .mmap                = ext4_file_mmap,
        .mmap_supported_flags = MAP_SYNC,
        .open                = ext4_file_open,
        .release        = ext4_release_file,
        .fsync                = ext4_sync_file,
        .get_unmapped_area = thp_get_unmapped_area,
        .splice_read        = generic_file_splice_read,
        .splice_write        = iter_file_splice_write,
        .fallocate        = ext4_fallocate,
};

/*
 * inode_operations companion to ext4_file_operations: attribute, xattr
 * listing, ACL and fiemap handlers.
 */
const struct inode_operations ext4_file_inode_operations = {
        .setattr        = ext4_setattr,
        .getattr        = ext4_file_getattr,
        .listxattr        = ext4_listxattr,
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
        .fiemap                = ext4_fiemap,
};























    1 


    1 








    1 

    1 



    1 












    1 













































    1 















    1 












    1 







    1 

















    1 
    1 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

/*
 * Recompute how much of @c's @usage is covered by its min and low settings
 * and push any change into the parent's children_min_usage /
 * children_low_usage totals.  Counters without a parent are skipped.
 */
static void propagate_protected_usage(struct page_counter *c,
                                      unsigned long usage)
{
        unsigned long covered, prev_covered;
        unsigned long limit;
        long diff;

        if (!c->parent)
                return;

        /* min: only do work if a limit is set or a stale value remains. */
        limit = READ_ONCE(c->min);
        if (limit || atomic_long_read(&c->min_usage)) {
                covered = min(usage, limit);
                prev_covered = atomic_long_xchg(&c->min_usage, covered);
                diff = covered - prev_covered;
                if (diff)
                        atomic_long_add(diff, &c->parent->children_min_usage);
        }

        /* low: same scheme as for min. */
        limit = READ_ONCE(c->low);
        if (limit || atomic_long_read(&c->low_usage)) {
                covered = min(usage, limit);
                prev_covered = atomic_long_xchg(&c->low_usage, covered);
                diff = covered - prev_covered;
                if (diff)
                        atomic_long_add(diff, &c->parent->children_low_usage);
        }
}

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 *
 * Does not walk the parent chain: only @counter->usage is decremented.
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
        long remaining = atomic_long_sub_return(nr_pages, &counter->usage);

        propagate_protected_usage(counter, remaining);
        /* Going negative means more was uncharged than was ever charged. */
        WARN_ON_ONCE(remaining < 0);
}

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * Charges @counter and each of its ancestors in turn.
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        while (c) {
                long usage = atomic_long_add_return(nr_pages, &c->usage);

                propagate_protected_usage(c, usage);
                /*
                 * The watermark update is racy; some inaccuracy in it
                 * is acceptable.
                 */
                if (usage > READ_ONCE(c->watermark))
                        WRITE_ONCE(c->watermark, usage);

                c = c->parent;
        }
}

/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: set to point to the first counter that hit its limit, if any
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.  On failure, charges
 * already applied to counters below @fail are cancelled again.
 */
bool page_counter_try_charge(struct page_counter *counter,
                             unsigned long nr_pages,
                             struct page_counter **fail)
{
        struct page_counter *c;

        for (c = counter; c; c = c->parent) {
                long new;
                /*
                 * Charge speculatively to avoid an expensive CAS.  If
                 * a bigger charge fails, it might falsely lock out a
                 * racing smaller charge and send it into reclaim
                 * early, but the error is limited to the difference
                 * between the two sizes, which is less than 2M/4M in
                 * case of a THP locking out a regular page charge.
                 *
                 * The atomic_long_add_return() implies a full memory
                 * barrier between incrementing the count and reading
                 * the limit.  When racing with page_counter_set_max(),
                 * we either see the new limit or the setter sees the
                 * counter has changed and retries.
                 */
                new = atomic_long_add_return(nr_pages, &c->usage);
                if (new > c->max) {
                        /* Over the limit: take the speculative charge back. */
                        atomic_long_sub(nr_pages, &c->usage);
                        /*
                         * NOTE(review): @new still includes the charge that
                         * was just rolled back -- confirm this is intended.
                         */
                        propagate_protected_usage(c, new);
                        /*
                         * This is racy, but we can live with some
                         * inaccuracy in the failcnt which is only used
                         * to report stats.
                         */
                        data_race(c->failcnt++);
                        *fail = c;
                        goto failed;
                }
                propagate_protected_usage(c, new);
                /*
                 * Just like with failcnt, we can live with some
                 * inaccuracy in the watermark.
                 */
                if (new > READ_ONCE(c->watermark))
                        WRITE_ONCE(c->watermark, new);
        }
        return true;

failed:
        /*
         * Unwind every counter charged before the failing one; *fail itself
         * was already corrected above.
         */
        for (c = counter; c != *fail; c = c->parent)
                page_counter_cancel(c, nr_pages);

        return false;
}

/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 *
 * Cancels @nr_pages on @counter and on each of its ancestors.
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        while (c) {
                page_counter_cancel(c, nr_pages);
                c = c->parent;
        }
}

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
        for (;;) {
                unsigned long old;
                long usage;

                /*
                 * Update the limit while making sure that it's not
                 * below the concurrently-changing counter value.
                 *
                 * The xchg implies two full memory barriers before
                 * and after, so the read-swap-read is ordered and
                 * ensures coherency with page_counter_try_charge():
                 * that function modifies the count before checking
                 * the limit, so if it sees the old limit, we see the
                 * modified counter and retry.
                 */
                usage = atomic_long_read(&counter->usage);

                if (usage > nr_pages)
                        return -EBUSY;

                old = xchg(&counter->max, nr_pages);

                /* Usage did not grow past the snapshot: the limit stands. */
                if (atomic_long_read(&counter->usage) <= usage)
                        return 0;

                /*
                 * Usage grew while the new limit was being installed:
                 * restore the previous limit and start over.
                 */
                counter->max = old;
                cond_resched();
        }
}

/**
 * page_counter_set_min - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * Stores the new min value and then refreshes the protected-usage
 * bookkeeping of @counter and every ancestor.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        WRITE_ONCE(counter->min, nr_pages);

        while (c) {
                propagate_protected_usage(c, atomic_long_read(&c->usage));
                c = c->parent;
        }
}

/**
 * page_counter_set_low - set the amount of protected memory
 * @counter: counter whose ->low is updated
 * @nr_pages: value to set
 *
 * Stores @nr_pages in @counter->low, then calls
 * propagate_protected_usage() on @counter and on each ancestor, passing
 * that level's current usage.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *level = counter;

        WRITE_ONCE(counter->low, nr_pages);

        while (level) {
                propagate_protected_usage(level,
                                          atomic_long_read(&level->usage));
                level = level->parent;
        }
}

/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * If @buf equals @max, @nr_pages is set to %PAGE_COUNTER_MAX.  Otherwise
 * @buf is handed to memparse(); anything left unparsed after the number
 * makes the call fail.  The byte count is divided by PAGE_SIZE and
 * capped at %PAGE_COUNTER_MAX.
 *
 * Returns -EINVAL, or 0 and @nr_pages on success.
 */
int page_counter_memparse(const char *buf, const char *max,
                          unsigned long *nr_pages)
{
        char *tail;
        u64 parsed;

        if (strcmp(buf, max) == 0) {
                *nr_pages = PAGE_COUNTER_MAX;
                return 0;
        }

        parsed = memparse(buf, &tail);
        if (*tail)
                return -EINVAL;

        *nr_pages = min(parsed / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
        return 0;
}

























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/prandom.h
 *
 * Include file for the fast pseudo-random 32-bit
 * generation.
 */
#ifndef _LINUX_PRANDOM_H
#define _LINUX_PRANDOM_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/siphash.h>

u32 prandom_u32(void);
void prandom_bytes(void *buf, size_t nbytes);
void prandom_seed(u32 seed);
void prandom_reseed_late(void);

DECLARE_PER_CPU(unsigned long, net_rand_noise);

/*
 * Convenience wrapper around prandom_u32_add_noise(): casts each of the
 * four arguments to unsigned long so callers can pass pointers or
 * integers of any width.
 */
#define PRANDOM_ADD_NOISE(a, b, c, d) \
        prandom_u32_add_noise((unsigned long)(a), (unsigned long)(b), \
                              (unsigned long)(c), (unsigned long)(d))

/*
 * Select the permutation and key constants by word size.  Any other
 * BITS_PER_LONG is rejected at compile time.
 */
#if BITS_PER_LONG == 64
/*
 * The core SipHash round function.  Each line can be executed in
 * parallel given enough CPU resources.
 */
#define PRND_SIPROUND(v0, v1, v2, v3) SIPHASH_PERMUTATION(v0, v1, v2, v3)

/* Each key constant is the XOR of two SIPHASH_CONST_* values. */
#define PRND_K0 (SIPHASH_CONST_0 ^ SIPHASH_CONST_2)
#define PRND_K1 (SIPHASH_CONST_1 ^ SIPHASH_CONST_3)

#elif BITS_PER_LONG == 32
/*
 * On 32-bit machines, we use HSipHash, a reduced-width version of SipHash.
 * This is weaker, but 32-bit machines are not used for high-traffic
 * applications, so there is less output for an attacker to analyze.
 */
#define PRND_SIPROUND(v0, v1, v2, v3) HSIPHASH_PERMUTATION(v0, v1, v2, v3)
/* Each key constant is the XOR of two HSIPHASH_CONST_* values. */
#define PRND_K0 (HSIPHASH_CONST_0 ^ HSIPHASH_CONST_2)
#define PRND_K1 (HSIPHASH_CONST_1 ^ HSIPHASH_CONST_3)

#else
#error Unsupported BITS_PER_LONG
#endif

/*
 * Fold four words into this CPU's net_rand_noise value.
 *
 * The current per-CPU value is XORed into @a, one PRND_SIPROUND() is run
 * over (a, b, c, d), and the resulting @d is written back as the new
 * per-CPU value.  This is not used cryptographically; the round is just
 * a convenient 4-word hash function.
 */
static inline void prandom_u32_add_noise(unsigned long a, unsigned long b,
                                         unsigned long c, unsigned long d)
{
        unsigned long prev = raw_cpu_read(net_rand_noise);

        a ^= prev;
        PRND_SIPROUND(a, b, c, d);
        raw_cpu_write(net_rand_noise, d);
}

/*
 * Generator state: four 32-bit words.  prandom_seed_state() fills all
 * four from a single seed.
 */
struct rnd_state {
        __u32 s1, s2, s3, s4;
};

u32 prandom_u32_state(struct rnd_state *state);
void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes);
void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state);

/*
 * Seed a per-CPU state via prandom_seed_full_state(), wrapped in
 * DO_ONCE() (presumably so the seeding happens only on the first call;
 * confirm against the DO_ONCE() definition).
 */
#define prandom_init_once(pcpu_state)                        \
        DO_ONCE(prandom_seed_full_state, (pcpu_state))

/**
 * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro)
 * @ep_ro: right open interval endpoint
 *
 * Multiplies one prandom_u32() output by @ep_ro in 64-bit arithmetic and
 * keeps the upper 32 bits of the product.  The quality of the result
 * therefore depends on prandom_u32() being well distributed over the
 * whole u32 range.  Useful for picking a random index into an array of
 * @ep_ro elements, for example.
 *
 * Returns: pseudo-random number in interval [0, ep_ro)
 */
static inline u32 prandom_u32_max(u32 ep_ro)
{
        u64 scaled = (u64)prandom_u32() * ep_ro;

        return (u32)(scaled >> 32);
}

/*
 * Handle minimum values for seeds: a value below @m is bumped by @m,
 * anything else is returned unchanged.
 */
static inline u32 __seed(u32 x, u32 m)
{
        if (x < m)
                return x + m;

        return x;
}

/**
 * prandom_seed_state - set seed for prandom_u32_state().
 * @state: pointer to state structure to receive the seed.
 * @seed: arbitrary 64-bit value to use as a seed.
 *
 * The seed is folded to 32 bits, and each state word receives that value
 * after __seed() has applied the word's minimum (2, 8, 16 and 128).  The
 * state pointer and the folded value are then passed to
 * PRANDOM_ADD_NOISE().
 */
static inline void prandom_seed_state(struct rnd_state *state, u64 seed)
{
        u64 folded = (seed >> 32) ^ (seed << 10) ^ seed;
        u32 word = folded & 0xffffffffUL;

        state->s1 = __seed(word,   2U);
        state->s2 = __seed(word,   8U);
        state->s3 = __seed(word,  16U);
        state->s4 = __seed(word, 128U);
        PRANDOM_ADD_NOISE(state, word, 0, 0);
}

/*
 * Pseudo random number generator from numerical recipes: one step of the
 * recurrence seed * 1664525 + 1013904223, truncated to 32 bits.
 */
static inline u32 next_pseudo_random32(u32 seed)
{
        u32 scaled = seed * 1664525;

        return scaled + 1013904223;
}

#endif

















































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BITMAP_H
#define __LINUX_BITMAP_H

#ifndef __ASSEMBLY__

#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/types.h>

struct device;

/*
 * bitmaps provide bit arrays that consume one or more unsigned
 * longs.  The bitmap interface and available operations are listed
 * here, in bitmap.h
 *
 * Function implementations generic to all architectures are in
 * lib/bitmap.c.  Function implementations that are architecture
 * specific are in various include/asm-<arch>/bitops.h headers
 * and other arch/<arch> specific files.
 *
 * See lib/bitmap.c for more details.
 */

/**
 * DOC: bitmap overview
 *
 * The available bitmap operations and their rough meaning in the
 * case that the bitmap is a single unsigned long are thus:
 *
 * The generated code is more efficient when nbits is known at
 * compile-time and at most BITS_PER_LONG.
 *
 * ::
 *
 *  bitmap_zero(dst, nbits)                     *dst = 0UL
 *  bitmap_fill(dst, nbits)                     *dst = ~0UL
 *  bitmap_copy(dst, src, nbits)                *dst = *src
 *  bitmap_and(dst, src1, src2, nbits)          *dst = *src1 & *src2
 *  bitmap_or(dst, src1, src2, nbits)           *dst = *src1 | *src2
 *  bitmap_xor(dst, src1, src2, nbits)          *dst = *src1 ^ *src2
 *  bitmap_andnot(dst, src1, src2, nbits)       *dst = *src1 & ~(*src2)
 *  bitmap_complement(dst, src, nbits)          *dst = ~(*src)
 *  bitmap_equal(src1, src2, nbits)             Are *src1 and *src2 equal?
 *  bitmap_intersects(src1, src2, nbits)        Do *src1 and *src2 overlap?
 *  bitmap_subset(src1, src2, nbits)            Is *src1 a subset of *src2?
 *  bitmap_empty(src, nbits)                    Are all bits zero in *src?
 *  bitmap_full(src, nbits)                     Are all bits set in *src?
 *  bitmap_weight(src, nbits)                   Hamming Weight: number of set bits
 *  bitmap_set(dst, pos, nbits)                 Set specified bit area
 *  bitmap_clear(dst, pos, nbits)               Clear specified bit area
 *  bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
 *  bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off)  as above
 *  bitmap_next_clear_region(map, &start, &end, nbits)  Find next clear region
 *  bitmap_next_set_region(map, &start, &end, nbits)  Find next set region
 *  bitmap_for_each_clear_region(map, rs, re, start, end)
 *                                                  Iterate over all clear regions
 *  bitmap_for_each_set_region(map, rs, re, start, end)
 *                                                  Iterate over all set regions
 *  bitmap_shift_right(dst, src, n, nbits)      *dst = *src >> n
 *  bitmap_shift_left(dst, src, n, nbits)       *dst = *src << n
 *  bitmap_cut(dst, src, first, n, nbits)       Cut n bits from first, copy rest
 *  bitmap_replace(dst, old, new, mask, nbits)  *dst = (*old & ~(*mask)) | (*new & *mask)
 *  bitmap_remap(dst, src, old, new, nbits)     *dst = map(old, new)(src)
 *  bitmap_bitremap(oldbit, old, new, nbits)    newbit = map(old, new)(oldbit)
 *  bitmap_onto(dst, orig, relmap, nbits)       *dst = orig relative to relmap
 *  bitmap_fold(dst, orig, sz, nbits)           dst bits = orig bits mod sz
 *  bitmap_parse(buf, buflen, dst, nbits)       Parse bitmap dst from kernel buf
 *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user buf
 *  bitmap_parselist(buf, dst, nbits)           Parse bitmap dst from kernel buf
 *  bitmap_parselist_user(buf, dst, nbits)      Parse bitmap dst from user buf
 *  bitmap_find_free_region(bitmap, bits, order)  Find and allocate bit region
 *  bitmap_release_region(bitmap, pos, order)   Free specified bit region
 *  bitmap_allocate_region(bitmap, pos, order)  Allocate specified bit region
 *  bitmap_from_arr32(dst, buf, nbits)          Copy nbits from u32[] buf to dst
 *  bitmap_to_arr32(buf, src, nbits)            Copy nbits from src to u32[] buf
 *  bitmap_get_value8(map, start)               Get 8bit value from map at start
 *  bitmap_set_value8(map, value, start)        Set 8bit value to map at start
 *
 * Note, bitmap_zero() and bitmap_fill() operate over the region of
 * unsigned longs, that is, bits behind bitmap till the unsigned long
 * boundary will be zeroed or filled as well. Consider to use
 * bitmap_clear() or bitmap_set() to make explicit zeroing or filling
 * respectively.
 */

/**
 * DOC: bitmap bitops
 *
 * Also the following operations in asm/bitops.h apply to bitmaps.::
 *
 *  set_bit(bit, addr)                  *addr |= bit
 *  clear_bit(bit, addr)                *addr &= ~bit
 *  change_bit(bit, addr)               *addr ^= bit
 *  test_bit(bit, addr)                 Is bit set in *addr?
 *  test_and_set_bit(bit, addr)         Set bit and return old value
 *  test_and_clear_bit(bit, addr)       Clear bit and return old value
 *  test_and_change_bit(bit, addr)      Change bit and return old value
 *  find_first_zero_bit(addr, nbits)    Position first zero bit in *addr
 *  find_first_bit(addr, nbits)         Position first set bit in *addr
 *  find_next_zero_bit(addr, nbits, bit)
 *                                      Position next zero bit in *addr >= bit
 *  find_next_bit(addr, nbits, bit)     Position next set bit in *addr >= bit
 *  find_next_and_bit(addr1, addr2, nbits, bit)
 *                                      Same as find_next_bit, but in
 *                                      (*addr1 & *addr2)
 *
 */

/**
 * DOC: declare bitmap
 * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used
 * to declare an array named 'name' of just enough unsigned longs to
 * contain all bit positions from 0 to 'bits' - 1.
 */

/*
 * Allocation and deallocation of bitmap.
 * Provided in lib/bitmap.c to avoid circular dependency.
 */
extern unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
extern unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
extern void bitmap_free(const unsigned long *bitmap);

/* Managed variants of the above. */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags);
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags);

/*
 * lib/bitmap.c provides these functions:
 */

extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits);
extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits);
extern int __bitmap_equal(const unsigned long *bitmap1,
                          const unsigned long *bitmap2, unsigned int nbits);
extern bool __pure __bitmap_or_equal(const unsigned long *src1,
                                     const unsigned long *src2,
                                     const unsigned long *src3,
                                     unsigned int nbits);
extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
                        unsigned int nbits);
extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                                unsigned int shift, unsigned int nbits);
extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                                unsigned int shift, unsigned int nbits);
extern void bitmap_cut(unsigned long *dst, const unsigned long *src,
                       unsigned int first, unsigned int cut,
                       unsigned int nbits);
extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern void __bitmap_replace(unsigned long *dst,
                        const unsigned long *old, const unsigned long *new,
                        const unsigned long *mask, unsigned int nbits);
extern int __bitmap_intersects(const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern int __bitmap_subset(const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int nbits);
extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
extern void __bitmap_set(unsigned long *map, unsigned int start, int len);
extern void __bitmap_clear(unsigned long *map, unsigned int start, int len);

extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                                    unsigned long size,
                                                    unsigned long start,
                                                    unsigned int nr,
                                                    unsigned long align_mask,
                                                    unsigned long align_offset);

/**
 * bitmap_find_next_zero_area - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 *
 * Thin wrapper that forwards to bitmap_find_next_zero_area_off() with an
 * alignment offset of 0.
 *
 * The @align_mask should be one less than a power of 2; the effect is that
 * the bit offset of all zero areas this function finds is multiples of that
 * power of 2. A @align_mask of 0 means no alignment is required.
 */
static inline unsigned long
bitmap_find_next_zero_area(unsigned long *map,
                           unsigned long size,
                           unsigned long start,
                           unsigned int nr,
                           unsigned long align_mask)
{
        const unsigned long no_offset = 0;

        return bitmap_find_next_zero_area_off(map, size, start, nr,
                                              align_mask, no_offset);
}

extern int bitmap_parse(const char *buf, unsigned int buflen,
                        unsigned long *dst, int nbits);
extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
                        unsigned long *dst, int nbits);
extern int bitmap_parselist(const char *buf, unsigned long *maskp,
                        int nmaskbits);
extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
                        unsigned long *dst, int nbits);
extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new, unsigned int nbits);
extern int bitmap_bitremap(int oldbit,
                const unsigned long *old, const unsigned long *new, int bits);
extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                const unsigned long *relmap, unsigned int bits);
extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                unsigned int sz, unsigned int nbits);
extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);

#ifdef __BIG_ENDIAN
extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
#else
#define bitmap_copy_le bitmap_copy
#endif
extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
extern int bitmap_print_to_pagebuf(bool list, char *buf,
                                   const unsigned long *maskp, int nmaskbits);

/*
 * Single-word masks for the partial words at either end of a bit range.
 *
 * BITMAP_FIRST_WORD_MASK(start): all ones shifted left by
 * start modulo BITS_PER_LONG, i.e. the bits from that position up.
 *
 * BITMAP_LAST_WORD_MASK(nbits): all ones shifted right by
 * (-nbits) modulo BITS_PER_LONG, i.e. the low nbits % BITS_PER_LONG bits,
 * or the whole word when nbits is a multiple of BITS_PER_LONG.
 *
 * Both rely on BITS_PER_LONG being a power of two for the "& (n - 1)"
 * reduction.
 */
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))

/*
 * The static inlines below do not handle constant nbits==0 correctly,
 * so make such users (should any ever turn up) call the out-of-line
 * versions.
 *
 * small_const_nbits() is therefore true only when nbits is a
 * compile-time constant in the range 1..BITS_PER_LONG, which is the case
 * in which the inlines operate on a single unsigned long.
 */
#define small_const_nbits(nbits) \
        (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)

/*
 * Clear every unsigned long that holds any of the first @nbits bits of
 * @dst; bits past @nbits in the last word are cleared as well.
 */
static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
{
        unsigned int nbytes = BITS_TO_LONGS(nbits) * sizeof(unsigned long);

        memset(dst, 0, nbytes);
}

/*
 * Set every unsigned long that holds any of the first @nbits bits of
 * @dst to all ones; bits past @nbits in the last word are set as well.
 */
static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
{
        unsigned int nbytes = BITS_TO_LONGS(nbits) * sizeof(unsigned long);

        memset(dst, 0xff, nbytes);
}

/*
 * Copy the unsigned longs covering the first @nbits bits from @src to
 * @dst.  Whole words are copied, so the tail of the last word comes
 * along too.
 */
static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
                        unsigned int nbits)
{
        unsigned int nbytes = BITS_TO_LONGS(nbits) * sizeof(unsigned long);

        memcpy(dst, src, nbytes);
}

/*
 * Copy bitmap and clear tail bits in last word.
 *
 * When @nbits is not a multiple of BITS_PER_LONG, the bits above @nbits
 * in the final destination word are masked off after the copy.
 */
static inline void bitmap_copy_clear_tail(unsigned long *dst,
                const unsigned long *src, unsigned int nbits)
{
        unsigned int tail_bits = nbits % BITS_PER_LONG;

        bitmap_copy(dst, src, nbits);

        if (tail_bits != 0)
                dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits);
}

/*
 * On 32-bit systems bitmaps are represented as u32 arrays internally, and
 * therefore conversion is not needed when copying data from/to arrays of u32.
 *
 * 64-bit builds get out-of-line functions; other builds map both names
 * onto bitmap_copy_clear_tail() with the u32 buffer cast to
 * unsigned long *.
 */
#if BITS_PER_LONG == 64
extern void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
                                                        unsigned int nbits);
extern void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap,
                                                        unsigned int nbits);
#else
#define bitmap_from_arr32(bitmap, buf, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *) (bitmap),        \
                        (const unsigned long *) (buf), (nbits))
#define bitmap_to_arr32(buf, bitmap, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *) (buf),                \
                        (const unsigned long *) (bitmap), (nbits))
#endif

/*
 * *dst = *src1 & *src2 over @nbits bits.
 *
 * For a small constant @nbits the single word is computed inline, masked
 * to @nbits, and the return value is 1 if the stored word is non-zero,
 * 0 otherwise.  All other cases return whatever __bitmap_and() returns.
 */
static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_and(dst, src1, src2, nbits);

        *dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits);
        return *dst != 0;
}

/*
 * *dst = *src1 | *src2 over @nbits bits.
 *
 * The single-word fast path ORs the whole word without masking, so bits
 * above @nbits in that word are written too.  Larger or non-constant
 * sizes go through __bitmap_or().
 */
static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_or(dst, src1, src2, nbits);
                return;
        }

        *dst = *src1 | *src2;
}

/*
 * *dst = *src1 ^ *src2 over @nbits bits.
 *
 * The single-word fast path XORs the whole word without masking, so bits
 * above @nbits in that word are written too.  Larger or non-constant
 * sizes go through __bitmap_xor().
 */
static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_xor(dst, src1, src2, nbits);
                return;
        }

        *dst = *src1 ^ *src2;
}

/*
 * *dst = *src1 & ~(*src2) over @nbits bits.
 *
 * For a small constant @nbits the single word is computed inline, masked
 * to @nbits, and the return value is 1 if the stored word is non-zero,
 * 0 otherwise.  All other cases return whatever __bitmap_andnot()
 * returns.
 */
static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_andnot(dst, src1, src2, nbits);

        *dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits);
        return *dst != 0;
}

/*
 * *dst = ~(*src) over @nbits bits.
 *
 * The single-word fast path inverts the whole word without masking;
 * larger or non-constant sizes go through __bitmap_complement().
 */
static inline void bitmap_complement(unsigned long *dst, const unsigned long *src,
                        unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_complement(dst, src, nbits);
                return;
        }

        *dst = ~(*src);
}

/*
 * Bit granularity at which the inlines below may use byte-wise
 * memset()/memcmp() instead of bit operations: 8 bits when
 * __LITTLE_ENDIAN is defined, a full unsigned long otherwise.
 * BITMAP_MEM_MASK is the matching low-bits mask.
 */
#ifdef __LITTLE_ENDIAN
#define BITMAP_MEM_ALIGNMENT 8
#else
#define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long))
#endif
#define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1)

/*
 * Are the first @nbits bits of @src1 and @src2 equal?
 *
 * Three strategies, tried in order:
 *  - small constant @nbits: compare one masked word inline;
 *  - @nbits known at compile time to be a multiple of
 *    BITMAP_MEM_ALIGNMENT: memcmp() over nbits / 8 bytes;
 *  - otherwise: __bitmap_equal().
 */
static inline int bitmap_equal(const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                unsigned long diff = *src1 ^ *src2;

                return (diff & BITMAP_LAST_WORD_MASK(nbits)) == 0;
        }

        if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT))
                return memcmp(src1, src2, nbits / 8) == 0;

        return __bitmap_equal(src1, src2, nbits);
}

/**
 * bitmap_or_equal - Check whether the or of two bitmaps is equal to a third
 * @src1:        Pointer to bitmap 1
 * @src2:        Pointer to bitmap 2 will be or'ed with bitmap 1
 * @src3:        Pointer to bitmap 3. Compare to the result of *@src1 | *@src2
 * @nbits:        number of bits in each of these bitmaps
 *
 * A small constant @nbits is handled inline on one masked word; every
 * other size is delegated to __bitmap_or_equal().
 *
 * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise
 */
static inline bool bitmap_or_equal(const unsigned long *src1,
                                   const unsigned long *src2,
                                   const unsigned long *src3,
                                   unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                unsigned long mismatch = (*src1 | *src2) ^ *src3;

                return !(mismatch & BITMAP_LAST_WORD_MASK(nbits));
        }

        return __bitmap_or_equal(src1, src2, src3, nbits);
}

/*
 * Do @src1 and @src2 share any set bit within the first @nbits bits?
 * Inline single-word test for a small constant @nbits, otherwise
 * __bitmap_intersects().
 */
static inline int bitmap_intersects(const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_intersects(src1, src2, nbits);

        return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
}

/*
 * Is every bit set in @src1 also set in @src2, within the first @nbits
 * bits?  Inline single-word test for a small constant @nbits, otherwise
 * __bitmap_subset().
 */
static inline int bitmap_subset(const unsigned long *src1,
                        const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_subset(src1, src2, nbits);

        return ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)) == 0;
}

/*
 * Are all of the first @nbits bits of @src zero?  Inline masked-word
 * test for a small constant @nbits; otherwise the bitmap is empty when
 * find_first_bit() returns @nbits.
 */
static inline int bitmap_empty(const unsigned long *src, unsigned nbits)
{
        if (!small_const_nbits(nbits))
                return find_first_bit(src, nbits) == nbits;

        return (*src & BITMAP_LAST_WORD_MASK(nbits)) == 0;
}

/*
 * Are all of the first @nbits bits of @src set?  Inline masked-word test
 * for a small constant @nbits; otherwise the bitmap is full when
 * find_first_zero_bit() returns @nbits.
 */
static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return find_first_zero_bit(src, nbits) == nbits;

        return (~(*src) & BITMAP_LAST_WORD_MASK(nbits)) == 0;
}

/*
 * Count the set bits among the first @nbits bits of @src: hweight_long()
 * on the masked word for a small constant @nbits, __bitmap_weight()
 * otherwise.
 */
static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_weight(src, nbits);

        return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
}

/*
 * Set @nbits bits of @map starting at bit @start.
 *
 *  - constant @nbits == 1: a single __set_bit();
 *  - @start and @nbits both known at compile time to be multiples of
 *    BITMAP_MEM_ALIGNMENT: memset() of nbits / 8 bytes with 0xff at byte
 *    offset start / 8;
 *  - otherwise: __bitmap_set().
 */
static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
                unsigned int nbits)
{
        if (__builtin_constant_p(nbits) && nbits == 1) {
                __set_bit(start, map);
                return;
        }

        if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
            IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
            __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) {
                memset((char *)map + start / 8, 0xff, nbits / 8);
                return;
        }

        __bitmap_set(map, start, nbits);
}

/*
 * Clear @nbits bits of @map starting at bit @start.
 *
 *  - constant @nbits == 1: a single __clear_bit();
 *  - @start and @nbits both known at compile time to be multiples of
 *    BITMAP_MEM_ALIGNMENT: memset() of nbits / 8 bytes with 0 at byte
 *    offset start / 8;
 *  - otherwise: __bitmap_clear().
 */
static __always_inline void bitmap_clear(unsigned long *map, unsigned int start,
                unsigned int nbits)
{
        if (__builtin_constant_p(nbits) && nbits == 1) {
                __clear_bit(start, map);
                return;
        }

        if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
            IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
            __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) {
                memset((char *)map + start / 8, 0, nbits / 8);
                return;
        }

        __bitmap_clear(map, start, nbits);
}

/*
 * *dst = *src >> shift over @nbits bits.  The single-word path masks the
 * source to @nbits before shifting; other sizes use
 * __bitmap_shift_right().
 */
static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                                unsigned int shift, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_shift_right(dst, src, shift, nbits);
                return;
        }

        *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift;
}

/*
 * *dst = *src << shift over @nbits bits.  The single-word path masks the
 * shifted result to @nbits; other sizes use __bitmap_shift_left().
 */
static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                                unsigned int shift, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_shift_left(dst, src, shift, nbits);
                return;
        }

        *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits);
}

/*
 * *dst = (*old & ~(*mask)) | (*new & *mask): take bits from @new where
 * @mask is set and from @old elsewhere.  Single-word inline form for a
 * small constant @nbits, __bitmap_replace() otherwise.
 */
static inline void bitmap_replace(unsigned long *dst,
                                  const unsigned long *old,
                                  const unsigned long *new,
                                  const unsigned long *mask,
                                  unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_replace(dst, old, new, mask, nbits);
                return;
        }

        *dst = (*old & ~(*mask)) | (*new & *mask);
}

/*
 * Locate the next clear region at or after *@rs, searching below @end.
 * On return *@rs is the first zero bit found and *@re is the first set
 * bit after it, each as reported by the find_next_*() helpers.
 */
static inline void bitmap_next_clear_region(unsigned long *bitmap,
                                            unsigned int *rs, unsigned int *re,
                                            unsigned int end)
{
        unsigned int first = find_next_zero_bit(bitmap, end, *rs);

        *rs = first;
        *re = find_next_bit(bitmap, end, first + 1);
}

/*
 * Locate the next set region at or after *@rs, searching below @end.
 * On return *@rs is the first set bit found and *@re is the first zero
 * bit after it, each as reported by the find_next_*() helpers.
 */
static inline void bitmap_next_set_region(unsigned long *bitmap,
                                          unsigned int *rs, unsigned int *re,
                                          unsigned int end)
{
        unsigned int first = find_next_bit(bitmap, end, *rs);

        *rs = first;
        *re = find_next_zero_bit(bitmap, end, first + 1);
}

/*
 * Bitmap region iterators.  Iterates over the bitmap between [@start, @end).
 * @rs and @re should be integer variables and will be set to start and end
 * index of the current clear or set region.
 *
 * Each step resumes the search at @re + 1 and the loop ends once
 * @rs is no longer below @re.  The address of @rs and @re is passed to
 * bitmap_next_{clear,set}_region(), which take unsigned int pointers.
 */
#define bitmap_for_each_clear_region(bitmap, rs, re, start, end)             \
        for ((rs) = (start),                                                     \
             bitmap_next_clear_region((bitmap), &(rs), &(re), (end));             \
             (rs) < (re);                                                     \
             (rs) = (re) + 1,                                                     \
             bitmap_next_clear_region((bitmap), &(rs), &(re), (end)))

#define bitmap_for_each_set_region(bitmap, rs, re, start, end)                     \
        for ((rs) = (start),                                                     \
             bitmap_next_set_region((bitmap), &(rs), &(re), (end));             \
             (rs) < (re);                                                     \
             (rs) = (re) + 1,                                                     \
             bitmap_next_set_region((bitmap), &(rs), &(re), (end)))

/**
 * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
 * @n: u64 value
 *
 * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit
 * integers in 32-bit environment, and 64-bit integers in 64-bit one.
 *
 * There are four combinations of endianness and length of the word in linux
 * ABIs: LE64, BE64, LE32 and BE32.
 *
 * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in
 * bitmaps and therefore don't require any special handling.
 *
 * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory
 * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the
 * other hand is represented as an array of 32-bit words and the position of
 * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that
 * word.  For example, bit #42 is located at 10th position of 2nd word.
 * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit
 * values in memory as it usually does. But for BE we need to swap hi and lo
 * words manually.
 *
 * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and
 * lo parts of u64.  For LE32 it does nothing, and for BE environment it swaps
 * hi and lo words, as is expected by bitmap.
 */
#if __BITS_PER_LONG == 64
/* A single unsigned long holds the whole value: pass @n through unchanged. */
#define BITMAP_FROM_U64(n) (n)
#else
/*
 * Expands to TWO comma-separated expressions: first the bits of @n selected
 * by ULONG_MAX, then @n >> 32.  @n is therefore evaluated twice.
 */
#define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \
                                ((unsigned long) ((u64)(n) >> 32))
#endif

/**
 * bitmap_from_u64 - store a u64 into a bitmap, low bits first
 * @dst:  destination bitmap
 * @mask: u64 value to store
 *
 * The bits of @mask selected by ULONG_MAX are written to @dst[0].  When u64
 * is wider than unsigned long, @mask >> 32 is additionally written to
 * @dst[1], so @dst must then provide two words.
 *
 * Background: in a 32-bit Big Endian kernel, reading a u64 through
 * ``(u32 *)(&val)[*]`` yields the wrong word - ``(u32 *)(&val)[0]`` gets
 * the upper 32 bits while the lower 32 bits are expected.
 */
static inline void bitmap_from_u64(unsigned long *dst, u64 mask)
{
        /* compile-time constant: only true when one long cannot hold a u64 */
        if (sizeof(unsigned long) < sizeof(mask))
                dst[1] = mask >> 32;

        dst[0] = mask & ULONG_MAX;
}

/**
 * bitmap_get_value8 - get an 8-bit value within a memory region
 * @map: address to the bitmap memory region
 * @start: bit offset of the 8-bit value; must be a multiple of 8
 *
 * Returns the 8-bit value located at the @start bit offset within the @map
 * memory region.
 */
static inline unsigned long bitmap_get_value8(const unsigned long *map,
                                              unsigned long start)
{
        /* word holding the byte, and the byte's bit position inside it */
        const unsigned long word = map[BIT_WORD(start)];
        const unsigned long shift = start % BITS_PER_LONG;

        return (word >> shift) & 0xFF;
}

/**
 * bitmap_set_value8 - set an 8-bit value within a memory region
 * @map: address to the bitmap memory region
 * @value: the 8-bit value; values wider than 8 bits may clobber bitmap
 * @start: bit offset of the 8-bit value; must be a multiple of 8
 *
 * The target byte is first cleared and then OR-ed with @value; @value is
 * not masked here, hence the warning on @value above.
 */
static inline void bitmap_set_value8(unsigned long *map, unsigned long value,
                                     unsigned long start)
{
        unsigned long *word = &map[BIT_WORD(start)];
        const unsigned long shift = start % BITS_PER_LONG;

        *word &= ~(0xFFUL << shift);
        *word |= value << shift;
}

#endif /* __ASSEMBLY__ */

#endif /* __LINUX_BITMAP_H */























































































    1 










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>

/*
 * struct mnt_namespace - state kept per mount namespace.
 *
 * .count is the reference counter bumped by get_mnt_ns(); .seq == 0 is what
 * is_anon_ns() tests for.  The layout is randomized (__randomize_layout), so
 * nothing may depend on the field order.
 */
struct mnt_namespace {
        atomic_t                count;
        struct ns_common        ns;
        struct mount *        root;
        /*
         * Traversal and modification of .list is protected by either
         * - taking namespace_sem for write, OR
         * - taking namespace_sem for read AND taking .ns_lock.
         */
        struct list_head        list;
        spinlock_t                ns_lock;
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        u64                        seq;        /* Sequence number to prevent loops */
        /* NOTE(review): poll/event look like a change-notification pair - confirm */
        wait_queue_head_t poll;
        u64 event;
        unsigned int                mounts; /* # of mounts in the namespace */
        unsigned int                pending_mounts;
} __randomize_layout;

/*
 * Counter pair for a mount.  With CONFIG_SMP struct mount holds a __percpu
 * pointer to one of these; otherwise it embeds the same two ints directly.
 */
struct mnt_pcp {
        int mnt_count;
        int mnt_writers;
};

/*
 * struct mountpoint - referenced from struct mount via ->mnt_mp ("where is
 * it mounted").
 * NOTE(review): the per-field notes below are inferred from names and types
 * only - confirm against the users of this struct.
 */
struct mountpoint {
        struct hlist_node m_hash;       /* presumably: hash-table linkage */
        struct dentry *m_dentry;        /* presumably: the dentry mounted upon */
        struct hlist_head m_list;       /* presumably: anchors mount::mnt_mp_list */
        int m_count;                    /* presumably: reference count */
};

/*
 * struct mount - wrapper around a struct vfsmount.
 *
 * The vfsmount is embedded as ->mnt; real_mount() converts a vfsmount
 * pointer back into its containing struct mount.  ->mnt_parent pointing at
 * the mount itself is how mnt_has_parent() recognises a mount without a
 * parent.  The layout is randomized (__randomize_layout).
 */
struct mount {
        struct hlist_node mnt_hash;
        struct mount *mnt_parent;
        struct dentry *mnt_mountpoint;
        struct vfsmount mnt;
        union {
                struct rcu_head mnt_rcu;
                struct llist_node mnt_llist;
        };
        /* counters: per-CPU (struct mnt_pcp) on SMP, plain ints otherwise */
#ifdef CONFIG_SMP
        struct mnt_pcp __percpu *mnt_pcp;
#else
        int mnt_count;
        int mnt_writers;
#endif
        struct list_head mnt_mounts;        /* list of children, anchored here */
        struct list_head mnt_child;        /* and going through their mnt_child */
        struct list_head mnt_instance;        /* mount instance on sb->s_mounts */
        const char *mnt_devname;        /* Name of device e.g. /dev/dsk/hda1 */
        struct list_head mnt_list;
        struct list_head mnt_expire;        /* link in fs-specific expiry list */
        struct list_head mnt_share;        /* circular list of shared mounts */
        struct list_head mnt_slave_list;/* list of slave mounts */
        struct list_head mnt_slave;        /* slave list entry */
        struct mount *mnt_master;        /* slave is on master->mnt_slave_list */
        struct mnt_namespace *mnt_ns;        /* containing namespace */
        struct mountpoint *mnt_mp;        /* where is it mounted */
        union {
                struct hlist_node mnt_mp_list;        /* list mounts with the same mountpoint */
                struct hlist_node mnt_umount;
        };
        struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
        struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
        __u32 mnt_fsnotify_mask;
#endif
        int mnt_id;                        /* mount identifier */
        int mnt_group_id;                /* peer group identifier */
        int mnt_expiry_mark;                /* true if marked for expiry */
        struct hlist_head mnt_pins;
        struct hlist_head mnt_stuck_children;
} __randomize_layout;

/*
 * Sentinel ->mnt_ns value.  Being an error pointer, it is rejected by the
 * IS_ERR_OR_NULL() test in is_mounted().
 */
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */

/* Map a vfsmount pointer to the struct mount that embeds it as ->mnt. */
static inline struct mount *real_mount(struct vfsmount *mnt)
{
        struct mount *outer = container_of(mnt, struct mount, mnt);

        return outer;
}

/* Non-zero unless @mnt is its own ->mnt_parent. */
static inline int mnt_has_parent(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;

        return parent != mnt;
}

/*
 * Non-zero when the mount behind @mnt has a usable namespace pointer, i.e.
 * one that is neither NULL nor an error pointer.
 */
static inline int is_mounted(struct vfsmount *mnt)
{
        struct mnt_namespace *ns = real_mount(mnt)->mnt_ns;

        /* neither detached nor internal? */
        return !IS_ERR_OR_NULL(ns);
}

extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);

extern int __legitimize_mnt(struct vfsmount *, unsigned);
extern bool legitimize_mnt(struct vfsmount *, unsigned);

/*
 * True when __lookup_mnt() finds a mount for @path and that mount does not
 * carry MNT_SYNC_UMOUNT.
 */
static inline bool __path_is_mountpoint(const struct path *path)
{
        struct mount *hit = __lookup_mnt(path->mnt, path->dentry);

        if (!hit)
                return false;

        return likely(!(hit->mnt.mnt_flags & MNT_SYNC_UMOUNT));
}

extern void __detach_mounts(struct dentry *dentry);

/* Call __detach_mounts() only for dentries that d_mountpoint() accepts. */
static inline void detach_mounts(struct dentry *dentry)
{
        if (d_mountpoint(dentry))
                __detach_mounts(dentry);
}

/* Take a reference on @ns by atomically incrementing ns->count. */
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
        atomic_inc(&ns->count);
}

extern seqlock_t mount_lock;

/* Enter the write side of the mount_lock seqlock. */
static inline void lock_mount_hash(void)
{
        write_seqlock(&mount_lock);
}

/* Leave the write side of the mount_lock seqlock. */
static inline void unlock_mount_hash(void)
{
        write_sequnlock(&mount_lock);
}

/*
 * struct proc_mounts - bundle of a namespace, a root path, a per-mount
 * ->show() callback writing to a seq_file, and an embedded cursor mount.
 * NOTE(review): presumably the private state of the mounts seq_file
 * iterator (see mounts_op) - confirm against its users.
 */
struct proc_mounts {
        struct mnt_namespace *ns;
        struct path root;
        int (*show)(struct seq_file *, struct vfsmount *);
        struct mount cursor;
};

extern const struct seq_operations mounts_op;

extern bool __is_local_mountpoint(struct dentry *dentry);
/*
 * Cheap d_mountpoint() filter in front of __is_local_mountpoint(): the
 * out-of-line check only runs for dentries that pass the filter.
 */
static inline bool is_local_mountpoint(struct dentry *dentry)
{
        return d_mountpoint(dentry) && __is_local_mountpoint(dentry);
}

/* A namespace counts as anonymous when its sequence number is zero. */
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
        return !ns->seq;
}

extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);















    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H

#include <linux/compiler.h>
#include <asm/percpu.h>

#ifndef __ASSEMBLY__
struct task_struct;

DECLARE_PER_CPU(struct task_struct *, current_task);

/*
 * Return the task_struct pointer stored in this CPU's current_task per-CPU
 * variable, read with this_cpu_read_stable().
 */
static __always_inline struct task_struct *get_current(void)
{
        return this_cpu_read_stable(current_task);
}

#define current get_current()

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_CURRENT_H */







































































    1 










    1 









    1 


    1 

    1 





    1 

















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>

/*
 * Initially, a percpu refcount is just a set of percpu counters. At that
 * stage we don't try to detect the ref hitting 0 - which means that get/put
 * can just increment or decrement the local counter. Note that the counter
 * on a particular cpu can (and will) wrap - this is fine: when we go to
 * shut down, the percpu counters will all sum to the correct value.
 *
 * (More precisely: because modular arithmetic is commutative the sum of all the
 * percpu_count vars will be equal to what it would have been if all the gets
 * and puts were done to a single integer, even if some of the percpu integers
 * overflow or underflow).
 *
 * The real trick to implementing percpu refcounts is shutdown. We can't detect
 * the ref hitting 0 on every put - this would require global synchronization
 * and defeat the whole purpose of using percpu refs.
 *
 * What we do is require the user to keep track of the initial refcount; we know
 * the ref can't hit 0 before the user drops the initial ref, so as long as we
 * convert to non percpu mode before the initial ref is dropped everything
 * works.
 *
 * Converting to non percpu mode is done with some RCUish stuff in
 * percpu_ref_kill. Additionally, we need a bias value so that the
 * atomic_long_t can't hit 0 before we've added up all the percpu refs.
 */

/*
 * Top bit of an unsigned long.  Added to the atomic count while a ref is in
 * percpu mode (percpu_ref_init(), __percpu_ref_switch_to_percpu()) and
 * removed again in percpu_ref_switch_to_atomic_rcu().
 */
#define PERCPU_COUNT_BIAS        (1LU << (BITS_PER_LONG - 1))

/* Global lock taken (irqsave) around every mode switch, kill, exit and zero test. */
static DEFINE_SPINLOCK(percpu_ref_switch_lock);
/* Woken by percpu_ref_call_confirm_rcu() once ->confirm_switch is cleared. */
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);

/*
 * Return ->percpu_count_ptr with the __PERCPU_REF_ATOMIC_DEAD flag bits
 * masked off, as a percpu pointer.
 */
static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
{
        unsigned long addr = ref->percpu_count_ptr;

        addr &= ~__PERCPU_REF_ATOMIC_DEAD;

        return (unsigned long __percpu *)addr;
}

/**
 * percpu_ref_init - initialize a percpu refcount
 * @ref: percpu_ref to initialize
 * @release: function which will be called when refcount hits 0
 * @flags: PERCPU_REF_INIT_* flags
 * @gfp: allocation mask to use
 *
 * Sets up @ref.  By default @ref begins in percpu mode holding one
 * reference.  PERCPU_REF_INIT_ATOMIC and PERCPU_REF_INIT_DEAD both make it
 * begin in atomic mode instead, and PERCPU_REF_INIT_DEAD additionally makes
 * the initial refcount 0.  See the definitions of PERCPU_REF_INIT_* flags
 * for flag behaviors.
 *
 * Note that @release must not sleep - it may potentially be called from RCU
 * callback context by percpu_ref_kill().
 *
 * Returns 0 on success or -ENOMEM if either allocation fails; in the failure
 * case nothing stays allocated and @ref->percpu_count_ptr is left at 0.
 */
int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
                    unsigned int flags, gfp_t gfp)
{
        /* the low flag bits of the pointer must be free, hence the alignment */
        size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS,
                             __alignof__(unsigned long));
        const bool init_dead = flags & PERCPU_REF_INIT_DEAD;
        const bool init_atomic = init_dead || (flags & PERCPU_REF_INIT_ATOMIC);
        unsigned long start_count = 0;
        struct percpu_ref_data *data;

        ref->percpu_count_ptr = (unsigned long)
                __alloc_percpu_gfp(sizeof(unsigned long), align, gfp);
        if (!ref->percpu_count_ptr)
                return -ENOMEM;

        data = kzalloc(sizeof(*ref->data), gfp);
        if (!data) {
                free_percpu((void __percpu *)ref->percpu_count_ptr);
                ref->percpu_count_ptr = 0;
                return -ENOMEM;
        }

        data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC;
        /* starting out atomic or dead always permits a later reinit */
        data->allow_reinit = init_atomic || (flags & PERCPU_REF_ALLOW_REINIT);
        data->release = release;
        data->confirm_switch = NULL;
        data->ref = ref;

        if (init_atomic)
                ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
        else
                start_count = PERCPU_COUNT_BIAS;

        if (init_dead)
                ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
        else
                start_count += 1;

        atomic_long_set(&data->count, start_count);

        ref->data = data;
        return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);

static void __percpu_ref_exit(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);

        if (percpu_count) {
                /* non-NULL confirm_switch indicates switching in progress */
                WARN_ON_ONCE(ref->data && ref->data->confirm_switch);
                free_percpu(percpu_count);
                ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
        }
}

/**
 * percpu_ref_exit - undo percpu_ref_init()
 * @ref: percpu_ref to exit
 *
 * This function exits @ref.  The caller is responsible for ensuring that
 * @ref is no longer in active use.  The usual places to invoke this
 * function from are the @ref->release() callback or in init failure path
 * where percpu_ref_init() succeeded but other parts of the initialization
 * of the embedding object failed.
 */
void percpu_ref_exit(struct percpu_ref *ref)
{
        /* sampled before taking the lock; NULL means nothing is left to free */
        struct percpu_ref_data *data = ref->data;
        unsigned long flags;

        /* release the percpu counters; does nothing if they are already gone */
        __percpu_ref_exit(ref);

        if (!data)
                return;

        /*
         * Park the final atomic count in the bits above the flag bits of
         * ->percpu_count_ptr so that percpu_ref_is_zero() can still read it
         * once ->data has been cleared.
         */
        spin_lock_irqsave(&percpu_ref_switch_lock, flags);
        ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) <<
                __PERCPU_REF_FLAG_BITS;
        ref->data = NULL;
        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);

        kfree(data);
}
EXPORT_SYMBOL_GPL(percpu_ref_exit);

/*
 * Final stage of a switch to atomic mode: run the pending ->confirm_switch
 * callback, clear it to mark the switch as finished and wake everyone
 * sleeping on percpu_ref_switch_waitq.  Unless reinit is allowed the percpu
 * counters are released here.  Last, the extra reference taken when the
 * switch was started is dropped.
 */
static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
{
        struct percpu_ref_data *data = container_of(rcu,
                        struct percpu_ref_data, rcu);
        struct percpu_ref *ref = data->ref;

        data->confirm_switch(ref);
        /* NULL ->confirm_switch == no switch in progress any more */
        data->confirm_switch = NULL;
        wake_up_all(&percpu_ref_switch_waitq);

        if (!data->allow_reinit)
                __percpu_ref_exit(ref);

        /* drop ref from percpu_ref_switch_to_atomic() */
        percpu_ref_put(ref);
}

/*
 * RCU callback queued by __percpu_ref_switch_to_atomic(): sum the counters
 * of all possible CPUs, fold that sum into the atomic count while removing
 * PERCPU_COUNT_BIAS in the same operation, warn once if the result is not
 * positive, then continue in percpu_ref_call_confirm_rcu().
 */
static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
{
        struct percpu_ref_data *data = container_of(rcu,
                        struct percpu_ref_data, rcu);
        struct percpu_ref *ref = data->ref;
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
        unsigned long count = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                count += *per_cpu_ptr(percpu_count, cpu);

        pr_debug("global %lu percpu %lu\n",
                 atomic_long_read(&data->count), count);

        /*
         * It's crucial that we sum the percpu counters _before_ adding the sum
         * to &ref->count; since gets could be happening on one cpu while puts
         * happen on another, adding a single cpu's count could cause
         * @ref->count to hit 0 before we've got a consistent value - but the
         * sum of all the counts will be consistent and correct.
         *
         * Subtracting the bias value then has to happen _after_ adding count to
         * &ref->count; we need the bias value to prevent &ref->count from
         * reaching 0 before we add the percpu counts. But doing it at the same
         * time is equivalent and saves us atomic operations:
         */
        atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count);

        WARN_ONCE(atomic_long_read(&data->count) <= 0,
                  "percpu ref (%ps) <= 0 (%ld) after switching to atomic",
                  data->release, atomic_long_read(&data->count));

        /* @ref is viewed as dead on all CPUs, send out switch confirmation */
        percpu_ref_call_confirm_rcu(rcu);
}

/*
 * Do-nothing confirmation callback.  Installed by
 * __percpu_ref_switch_to_atomic() when the caller passes no callback, so
 * that ->confirm_switch is still non-NULL while a switch is in flight.
 */
static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
{
}

/*
 * Start moving @ref to atomic mode.  If __PERCPU_REF_ATOMIC is already set,
 * only @confirm_switch (when given) is invoked, synchronously.  Otherwise
 * the flag is set, ->confirm_switch is armed, an extra reference is taken
 * and the remaining work is deferred to percpu_ref_switch_to_atomic_rcu()
 * through call_rcu().
 */
static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                          percpu_ref_func_t *confirm_switch)
{
        if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
                if (confirm_switch)
                        confirm_switch(ref);
                return;
        }

        /* switching from percpu to atomic */
        ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;

        /*
         * Non-NULL ->confirm_switch is used to indicate that switching is
         * in progress.  Use noop one if unspecified.
         */
        ref->data->confirm_switch = confirm_switch ?:
                percpu_ref_noop_confirm_switch;

        percpu_ref_get(ref);        /* put after confirmation */
        call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
}

/*
 * Return @ref to percpu mode.  BUGs if the percpu counters are gone, does
 * nothing if @ref is not in atomic mode, and warns once and bails out if
 * reinit is not allowed.  Otherwise the bias is added back to the atomic
 * count, every possible CPU's counter is zeroed and __PERCPU_REF_ATOMIC is
 * cleared with a release store.
 */
static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
        int cpu;

        BUG_ON(!percpu_count);

        if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
                return;

        if (WARN_ON_ONCE(!ref->data->allow_reinit))
                return;

        atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count);

        /*
         * Restore per-cpu operation.  smp_store_release() is paired
         * with READ_ONCE() in __ref_is_percpu() and guarantees that the
         * zeroing is visible to all percpu accesses which can see the
         * following __PERCPU_REF_ATOMIC clearing.
         */
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(percpu_count, cpu) = 0;

        smp_store_release(&ref->percpu_count_ptr,
                          ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
}

/*
 * Bring @ref into the mode its current state asks for: atomic when
 * ->force_atomic is set or @ref is marked dead, percpu otherwise.  Must be
 * called with percpu_ref_switch_lock held.
 */
static void __percpu_ref_switch_mode(struct percpu_ref *ref,
                                     percpu_ref_func_t *confirm_switch)
{
        struct percpu_ref_data *data = ref->data;

        lockdep_assert_held(&percpu_ref_switch_lock);

        /*
         * A previous switch to ATOMIC may still be in flight; if so, sleep
         * until it has completed.  Callers which guarantee that no such
         * switch is pending may call this from any context.
         */
        wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch,
                            percpu_ref_switch_lock);

        if (!data->force_atomic &&
            !(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
                __percpu_ref_switch_to_percpu(ref);
                return;
        }

        __percpu_ref_switch_to_atomic(ref, confirm_switch);
}

/**
 * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
 * @ref: percpu_ref to switch to atomic mode
 * @confirm_switch: optional confirmation callback
 *
 * There's no reason to use this function for the usual reference counting.
 * Use percpu_ref_kill[_and_confirm]().
 *
 * Schedule switching of @ref to atomic mode.  All its percpu counts will
 * be collected to the main atomic counter.  On completion, when all CPUs
 * are guaranteed to be in atomic mode, @confirm_switch, which may not
 * block, is invoked.  This function may be invoked concurrently with all
 * the get/put operations and can safely be mixed with kill and reinit
 * operations.  Note that @ref will stay in atomic mode across kill/reinit
 * cycles until percpu_ref_switch_to_percpu() is called.
 *
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_switch)
{
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /* sticky request, consulted by __percpu_ref_switch_mode() */
        ref->data->force_atomic = true;
        __percpu_ref_switch_mode(ref, confirm_switch);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);

/**
 * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
 * @ref: percpu_ref to switch to atomic mode
 *
 * Schedule switching the ref to atomic mode, and wait for the
 * switch to complete.  Caller must ensure that no other thread
 * will switch back to percpu mode.
 */
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
{
        percpu_ref_switch_to_atomic(ref, NULL);
        /*
         * ->confirm_switch is cleared by percpu_ref_call_confirm_rcu(),
         * which also wakes this waitqueue.
         */
        wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);

/**
 * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
 * @ref: percpu_ref to switch to percpu mode
 *
 * There's no reason to use this function for the usual reference counting.
 * To re-use an expired ref, use percpu_ref_reinit().
 *
 * Switch @ref to percpu mode.  This function may be invoked concurrently
 * with all the get/put operations and can safely be mixed with kill and
 * reinit operations.  This function reverses the sticky atomic state set
 * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic().  If @ref is
 * dying or dead, the actual switching takes place on the following
 * percpu_ref_reinit().
 *
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /* withdraw the sticky atomic request before re-evaluating the mode */
        ref->data->force_atomic = false;
        __percpu_ref_switch_mode(ref, NULL);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);

/**
 * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
 * @ref: percpu_ref to kill
 * @confirm_kill: optional confirmation callback
 *
 * Equivalent to percpu_ref_kill() but also schedules kill confirmation if
 * @confirm_kill is not NULL.  @confirm_kill, which may not block, will be
 * called after @ref is seen as dead from all CPUs at which point all
 * further invocations of percpu_ref_tryget_live() will fail.  See
 * percpu_ref_tryget_live() for details.
 *
 * This function normally doesn't block and can be called from any context
 * but it may block if @confirm_kill is specified and @ref is in the
 * process of switching to atomic mode by percpu_ref_switch_to_atomic().
 *
 * There are no implied RCU grace periods between kill and release.
 */
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill)
{
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD,
                  "%s called more than once on %ps!", __func__,
                  ref->data->release);

        /* with DEAD set, __percpu_ref_switch_mode() picks the atomic path */
        ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
        __percpu_ref_switch_mode(ref, confirm_kill);
        /* drop the initial reference */
        percpu_ref_put(ref);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);

/**
 * percpu_ref_is_zero - test whether a percpu refcount reached zero
 * @ref: percpu_ref to test
 *
 * Returns %true if @ref reached zero.  A ref that is still in percpu mode
 * is never reported as zero.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
bool percpu_ref_is_zero(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        unsigned long remaining, flags;

        if (__ref_is_percpu(ref, &percpu_count))
                return false;

        /* protect us from being destroyed */
        spin_lock_irqsave(&percpu_ref_switch_lock, flags);
        /* after exit the count lives above the flag bits of the pointer */
        remaining = ref->data ?
                (unsigned long)atomic_long_read(&ref->data->count) :
                ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS;
        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);

        return !remaining;
}
EXPORT_SYMBOL_GPL(percpu_ref_is_zero);

/**
 * percpu_ref_reinit - re-initialize a percpu refcount
 * @ref: percpu_ref to re-initialize
 *
 * Re-initialize @ref so that it's in the same state as when it finished
 * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD.  @ref must have been
 * initialized successfully and reached 0 but not exited.
 *
 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
 * this function is in progress.
 */
void percpu_ref_reinit(struct percpu_ref *ref)
{
        /* only a diagnostic: the resurrect below runs regardless */
        WARN_ON_ONCE(!percpu_ref_is_zero(ref));

        percpu_ref_resurrect(ref);
}
EXPORT_SYMBOL_GPL(percpu_ref_reinit);

/**
 * percpu_ref_resurrect - modify a percpu refcount from dead to live
 * @ref: percpu_ref to resurrect
 *
 * Modify @ref so that it's in the same state as before percpu_ref_kill() was
 * called. @ref must be dead but must not yet have exited.
 *
 * If @ref->release() frees @ref then the caller is responsible for
 * guaranteeing that @ref->release() does not get called while this
 * function is in progress.
 *
 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
 * this function is in progress.
 */
void percpu_ref_resurrect(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /* sanity checks: @ref is expected to be dead and in atomic mode */
        WARN_ON_ONCE(!(ref->percpu_count_ptr & __PERCPU_REF_DEAD));
        WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count));

        ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
        /* retake the reference that percpu_ref_kill_and_confirm() dropped */
        percpu_ref_get(ref);
        __percpu_ref_switch_mode(ref, NULL);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_resurrect);






















































    1 







































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_BL_H
#define _LINUX_LIST_BL_H

#include <linux/list.h>
#include <linux/bit_spinlock.h>

/*
 * Special version of lists, where head of the list has a lock in the lowest
 * bit. This is useful for scalable hash tables without increasing memory
 * footprint overhead.
 *
 * For modification operations, the 0 bit of hlist_bl_head->first
 * pointer must be set.
 *
 * With some small modifications, this can easily be adapted to store several
 * arbitrary bits (not just a single lock bit), if the need arises to store
 * some fast and compact auxiliary data.
 */

/*
 * Mask of the lock bit kept in hlist_bl_head->first.  It is bit 0 when
 * CONFIG_SMP or CONFIG_DEBUG_SPINLOCK is set and empty otherwise, in which
 * case all the masking below leaves pointers unchanged.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define LIST_BL_LOCKMASK        1UL
#else
#define LIST_BL_LOCKMASK        0UL
#endif

/*
 * Sanity check that is a BUG_ON() with CONFIG_DEBUG_LIST and expands to
 * nothing otherwise, so its argument must not have side effects.
 */
#ifdef CONFIG_DEBUG_LIST
#define LIST_BL_BUG_ON(x) BUG_ON(x)
#else
#define LIST_BL_BUG_ON(x)
#endif


/*
 * List head.  ->first carries the lock bit (LIST_BL_LOCKMASK) in its lowest
 * bit, so read it through hlist_bl_first() rather than directly.
 */
struct hlist_bl_head {
        struct hlist_bl_node *first;
};

/*
 * List node.  ->pprev holds the address of the pointer that refers to this
 * node: either the previous node's ->next or the head's ->first.
 */
struct hlist_bl_node {
        struct hlist_bl_node *next;
        struct hlist_bl_node **pprev;
};
/* Make the head at @ptr empty; ->first becomes NULL, lock bit included. */
#define INIT_HLIST_BL_HEAD(ptr) \
        ((ptr)->first = NULL)

/*
 * Reset @h to the detached state: both links become NULL, which is what
 * hlist_bl_unhashed() tests for through ->pprev.
 */
static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{
        h->pprev = NULL;
        h->next = NULL;
}

/* Get the @type object that embeds the node @ptr as its field @member. */
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)

/* True when @h has no back link, i.e. ->pprev is NULL. */
static inline bool  hlist_bl_unhashed(const struct hlist_bl_node *h)
{
        return h->pprev == NULL;
}

/* Return the first node of @h with the lock bit stripped from the pointer. */
static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)h->first;

        return (struct hlist_bl_node *)(raw & ~LIST_BL_LOCKMASK);
}

/*
 * Make @n the first node of @h, storing it with the lock bit set.  In debug
 * builds this checks that @n has a clear low bit and that @h currently has
 * the lock bit set.
 */
static inline void hlist_bl_set_first(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        unsigned long tagged = (unsigned long)n | LIST_BL_LOCKMASK;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                                                        LIST_BL_LOCKMASK);

        h->first = (struct hlist_bl_node *)tagged;
}

/* True when @h holds no node; the lock bit is ignored for the test. */
static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)READ_ONCE(h->first);

        return (raw & ~LIST_BL_LOCKMASK) == 0;
}

/*
 * Insert @n at the front of @h.  The head is updated through
 * hlist_bl_set_first(), which stores the pointer with the lock bit set.
 */
static inline void hlist_bl_add_head(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        struct hlist_bl_node *old_first = hlist_bl_first(h);

        n->pprev = &h->first;
        n->next = old_first;
        if (old_first)
                old_first->pprev = &n->next;

        hlist_bl_set_first(h, n);
}

static inline void hlist_bl_add_before(struct hlist_bl_node *n,
                                       struct hlist_bl_node *next)
{
        struct hlist_bl_node **pprev = next->pprev;

        n->pprev = pprev;
        n->next = next;
        next->pprev = &n->next;

        /* pprev may be `first`, so be careful not to lose the lock bit */
        WRITE_ONCE(*pprev,
                   (struct hlist_bl_node *)
                        ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
}

/* Insert @n immediately after @prev. */
static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
                                       struct hlist_bl_node *prev)
{
        struct hlist_bl_node *after = prev->next;

        n->next = after;
        n->pprev = &prev->next;
        prev->next = n;

        if (after)
                after->pprev = &n->next;
}

/*
 * Unlink @n from its list without touching @n's own next/pprev fields.
 *
 * The slot referencing @n is rewritten to reference @n's successor, and the
 * successor (if any) gets @n's back-pointer.
 */
static inline void __hlist_bl_del(struct hlist_bl_node *n)
{
        struct hlist_bl_node *next = n->next;
        struct hlist_bl_node **pprev = n->pprev;

        /* A node pointer must not carry any LIST_BL_LOCKMASK bits. */
        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);

        /*
         * pprev may be `first`, so be careful not to lose the lock bit:
         * the LIST_BL_LOCKMASK bits currently in *pprev are carried over
         * into the value that is stored.
         */
        WRITE_ONCE(*pprev,
                   (struct hlist_bl_node *)
                        ((unsigned long)next |
                         ((unsigned long)*pprev & LIST_BL_LOCKMASK)));
        if (next)
                next->pprev = pprev;
}

/*
 * Remove @n from its list and overwrite both of its link fields with the
 * LIST_POISON1 / LIST_POISON2 markers.
 */
static inline void hlist_bl_del(struct hlist_bl_node *n)
{
        __hlist_bl_del(n);

        /* The node is off the list; mark its links as stale. */
        n->next = LIST_POISON1;
        n->pprev = LIST_POISON2;
}

/*
 * Remove @n from its list, if it is on one, and reinitialise it with
 * INIT_HLIST_BL_NODE().  A node that is already unhashed is left alone.
 */
static inline void hlist_bl_del_init(struct hlist_bl_node *n)
{
        if (hlist_bl_unhashed(n))
                return;

        __hlist_bl_del(n);
        INIT_HLIST_BL_NODE(n);
}

/* Take the bit spinlock held in bit 0 of list head @b. */
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
        unsigned long *word = (unsigned long *)b;

        bit_spin_lock(0, word);
}

/* Release the bit spinlock held in bit 0 of list head @b. */
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
        unsigned long *word = (unsigned long *)b;

        __bit_spin_unlock(0, word);
}

/* Report whether the bit spinlock in bit 0 of list head @b is held. */
static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
{
        unsigned long *word = (unsigned long *)b;

        return bit_spin_is_locked(0, word);
}

/**
 * hlist_bl_for_each_entry        - iterate over list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * @pos starts at hlist_bl_first(@head) and follows ->next until it is NULL.
 * Before each pass through the loop body, @tpos is set to the entry that
 * embeds @pos.  @pos->next is read after the body runs, so the body must not
 * invalidate @pos.
 */
#define hlist_bl_for_each_entry(tpos, pos, head, member)                \
        for (pos = hlist_bl_first(head);                                \
             pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

/**
 * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @n:                another &struct hlist_bl_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The successor of @pos is saved in @n before the loop body runs and the
 * loop advances through @n, so the body never has @pos->next re-read after
 * it has executed.
 */
#define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member)         \
        for (pos = hlist_bl_first(head);                                 \
             pos && ({ n = pos->next; 1; }) &&                                  \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = n)

#endif












































































    1 
    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 





    1 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
// SPDX-License-Identifier: GPL-2.0-or-later
/* Common capabilities, needed by capability.o.
 */

#include <linux/capability.h>
#include <linux/audit.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/lsm_hooks.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/ptrace.h>
#include <linux/xattr.h>
#include <linux/hugetlb.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/securebits.h>
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
#include <linux/personality.h>

/*
 * When a non-root user runs a setuid-root binary in !secure(SECURE_NOROOT)
 * mode, capabilities are normally raised.  If fE is set on the file as well,
 * the intent is taken to be that only the file capabilities apply, and that
 * the setuid-root bit was left on either to change the uid (plausible) or to
 * obtain full privilege on a kernel without file capabilities support.  In
 * that case capabilities are not raised.
 *
 * This helper logs a warning naming @fname the first time that happens;
 * every later call is silent.
 */
static void warn_setuid_and_fcaps_mixed(const char *fname)
{
        static int warned;

        if (warned)
                return;

        printk(KERN_INFO "warning: `%s' has both setuid-root and"
                " effective capabilities. Therefore not raising all"
                " capabilities.\n", fname);
        warned = 1;
}

/**
 * cap_capable - Determine whether a task has a particular effective capability
 * @cred: The credentials to use
 * @targ_ns: The user namespace in which we need the capability
 * @cap: The capability to check for
 * @opts: Bitmask of options defined in include/linux/security.h
 *        (not consulted by this function)
 *
 * Determine whether the nominated credentials have the specified capability
 * amongst their effective set with respect to @targ_ns, returning 0 if they
 * do, -EPERM if they do not.
 *
 * NOTE WELL: cap_capable() cannot be used like the kernel's capable()
 * and has_capability() functions.  That is, it has the reverse semantics:
 * cap_capable() returns 0 when a task has a capability, but the
 * kernel's capable() and has_capability() returns 1 for this case.
 */
int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
                int cap, unsigned int opts)
{
        struct user_namespace *walk;

        /*
         * Examine the target user namespace and then each of its parents
         * in turn.  Every path through the loop body either returns or
         * moves one level up, so the loop has no exit condition of its own.
         */
        for (walk = targ_ns; ; walk = walk->parent) {
                /* Reached the creds' own namespace: the effective set decides. */
                if (walk == cred->user_ns) {
                        if (cap_raised(cred->cap_effective, cap))
                                return 0;
                        return -EPERM;
                }

                /*
                 * Already at or above the level of the creds' namespace
                 * without having met it: nothing further up can match.
                 */
                if (walk->level <= cred->user_ns->level)
                        return -EPERM;

                /*
                 * The owner of a user namespace, acting from that
                 * namespace's parent, has all caps in it.
                 */
                if (walk->parent == cred->user_ns &&
                    uid_eq(walk->owner, cred->euid))
                        return 0;

                /*
                 * Otherwise go up a level: a capability held in a parent
                 * user namespace also covers all of its children.
                 */
        }

        /* Not reached. */
}

/**
 * cap_settime - Determine whether the current process may set the system clock
 * @ts: The time to set
 * @tz: The timezone to set
 *
 * The decision depends only on whether the current process is
 * capable(CAP_SYS_TIME); @ts and @tz are not inspected.  Returns 0 if
 * permission is granted, -EPERM if denied.
 */
int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
{
        return capable(CAP_SYS_TIME) ? 0 : -EPERM;
}

/**
 * cap_ptrace_access_check - Determine whether the current process may access
 *                           another
 * @child: The process to be accessed
 * @mode: The mode of attachment.
 *
 * Access is allowed when either:
 *  - the caller and @child share a user namespace and @child's permitted
 *    set is a subset of the caller's capabilities (the effective set when
 *    @mode has PTRACE_MODE_FSCREDS, the permitted set otherwise); or
 *  - the caller has CAP_SYS_PTRACE with respect to @child's user namespace.
 *
 * Returns 0 if permission is granted, -EPERM if denied.
 */
int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        const struct cred *tracer, *tracee;
        const kernel_cap_t *tracer_caps;
        int rc = -EPERM;

        rcu_read_lock();
        tracer = current_cred();
        tracee = __task_cred(child);

        tracer_caps = (mode & PTRACE_MODE_FSCREDS) ? &tracer->cap_effective
                                                   : &tracer->cap_permitted;

        /* ns_capable() is only consulted when the subset rule did not apply. */
        if (tracer->user_ns == tracee->user_ns &&
            cap_issubset(tracee->cap_permitted, *tracer_caps))
                rc = 0;
        else if (ns_capable(tracee->user_ns, CAP_SYS_PTRACE))
                rc = 0;

        rcu_read_unlock();
        return rc;
}

/**
 * cap_ptrace_traceme - Determine whether another process may trace the current
 * @parent: The task proposed to be the tracer
 *
 * Tracing is allowed when either:
 *  - @parent and the current process share a user namespace and current's
 *    permitted set is a subset of @parent's permitted set; or
 *  - @parent has CAP_SYS_PTRACE with respect to current's user namespace.
 *
 * Returns 0 if permission is granted, -EPERM if denied.
 */
int cap_ptrace_traceme(struct task_struct *parent)
{
        const struct cred *tracer, *self;
        int rc = -EPERM;

        rcu_read_lock();
        tracer = __task_cred(parent);
        self = current_cred();

        /* The capability lookup only happens when the subset rule fails. */
        if (tracer->user_ns == self->user_ns &&
            cap_issubset(self->cap_permitted, tracer->cap_permitted))
                rc = 0;
        else if (has_ns_capability(parent, self->user_ns, CAP_SYS_PTRACE))
                rc = 0;

        rcu_read_unlock();
        return rc;
}

/**
 * cap_capget - Retrieve a task's capability sets
 * @target: The task from which to retrieve the capability sets
 * @effective: The place to record the effective set
 * @inheritable: The place to record the inheritable set
 * @permitted: The place to record the permitted set
 *
 * Copies the three capability sets out of @target's credentials while
 * holding the RCU read lock.  Always returns 0.
 */
int cap_capget(struct task_struct *target, kernel_cap_t *effective,
               kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        const struct cred *tcred;

        /* Derived from kernel/capability.c:sys_capget. */
        rcu_read_lock();
        tcred = __task_cred(target);

        *effective = tcred->cap_effective;
        *inheritable = tcred->cap_inheritable;
        *permitted = tcred->cap_permitted;

        rcu_read_unlock();
        return 0;
}

/*
 * Determine whether the inheritable capabilities are limited to the old
 * permitted set.  Returns 1 if they are limited, 0 if they are not.
 */
static inline int cap_inh_is_capped(void)
{
        /* they are so limited unless the current task has the CAP_SETPCAP
         * capability
         */
        if (cap_capable(current_cred(), current_cred()->user_ns,
                        CAP_SETPCAP, CAP_OPT_NONE) == 0)
                return 0;
        return 1;
}

/**
 * cap_capset - Validate and apply proposed changes to current's capabilities
 * @new: The proposed new credentials; alterations should be made here
 * @old: The current task's current credentials
 * @effective: A pointer to the proposed new effective capabilities set
 * @inheritable: A pointer to the proposed new inheritable capabilities set
 * @permitted: A pointer to the proposed new permitted capabilities set
 *
 * This function validates and applies a proposed mass change to the current
 * process's capability sets.  The changes are made to the proposed new
 * credentials, and assuming no error, will be committed by the caller of LSM.
 *
 * Returns 0 on success, -EPERM if any of the proposed sets breaks a rule,
 * or -EINVAL if the resulting ambient set fails its invariant check.
 */
int cap_capset(struct cred *new,
               const struct cred *old,
               const kernel_cap_t *effective,
               const kernel_cap_t *inheritable,
               const kernel_cap_t *permitted)
{
        /*
         * When the inheritable set is capped, the new pI must fit inside
         * (old pI | old pP); otherwise the caller is incapable of using it.
         */
        if (cap_inh_is_capped()) {
                kernel_cap_t inh_limit = cap_combine(old->cap_inheritable,
                                                     old->cap_permitted);

                if (!cap_issubset(*inheritable, inh_limit))
                        return -EPERM;
        }

        /* No new pI capabilities outside the bounding set. */
        if (!cap_issubset(*inheritable,
                          cap_combine(old->cap_inheritable, old->cap_bset)))
                return -EPERM;

        /* The new permitted set may only shrink. */
        if (!cap_issubset(*permitted, old->cap_permitted))
                return -EPERM;

        /* The new effective set must lie within the new permitted set. */
        if (!cap_issubset(*effective, *permitted))
                return -EPERM;

        new->cap_effective = *effective;
        new->cap_inheritable = *inheritable;
        new->cap_permitted = *permitted;

        /*
         * Drop ambient bits that are no longer both permitted and
         * inheritable.
         */
        new->cap_ambient = cap_intersect(new->cap_ambient,
                                         cap_intersect(*permitted,
                                                       *inheritable));

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EINVAL;

        return 0;
}

/**
 * cap_inode_need_killpriv - Determine if inode change affects privileges
 * @dentry: The inode/dentry being changed with change marked ATTR_KILL_PRIV
 *
 * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
 * affects the security markings on that inode, and if it is, should
 * inode_killpriv() be invoked or the change rejected.
 *
 * Returns 1 if security.capability has a value, meaning inode_killpriv()
 * is required, 0 otherwise, meaning inode_killpriv() is not required.
 */
int cap_inode_need_killpriv(struct dentry *dentry)
{
        int len;

        /* A NULL/0 buffer makes this a size query; no data is copied out. */
        len = __vfs_getxattr(dentry, d_backing_inode(dentry),
                             XATTR_NAME_CAPS, NULL, 0);

        return len > 0;
}

/**
 * cap_inode_killpriv - Erase the security markings on an inode
 * @dentry: The inode/dentry to alter
 *
 * Erase the privilege-enhancing security markings on an inode.
 *
 * Returns 0 if successful, -ve on error.
 */
int cap_inode_killpriv(struct dentry *dentry)
{
        int error;

        error = __vfs_removexattr(dentry, XATTR_NAME_CAPS);
        if (error == -EOPNOTSUPP)
                error = 0;
        return error;
}

/*
 * Report whether @kroot maps to uid 0 in the current user namespace or in
 * any of its ancestors, up to and including init_user_ns.  An invalid
 * @kroot never matches.
 */
static bool rootid_owns_currentns(kuid_t kroot)
{
        struct user_namespace *ns;

        if (!uid_valid(kroot))
                return false;

        ns = current_user_ns();
        while (from_kuid(ns, kroot) != 0) {
                /* init_user_ns is the last namespace to be examined. */
                if (ns == &init_user_ns)
                        return false;
                ns = ns->parent;
        }

        return true;
}

/* Return magic word @m with the VFS_CAP_FLAGS_EFFECTIVE bit(s) cleared. */
static __u32 sansflags(__u32 m)
{
        __u32 stripped = m;

        stripped &= ~VFS_CAP_FLAGS_EFFECTIVE;
        return stripped;
}

/*
 * Report whether an xattr value of @size bytes starting at @cap is a v2
 * capability: exactly XATTR_CAPS_SZ_2 bytes long with a magic word that,
 * flags aside, equals VFS_CAP_REVISION_2.  @cap is only read when the size
 * matches.
 */
static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_2 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
}

/*
 * Report whether an xattr value of @size bytes starting at @cap is a v3
 * capability: exactly XATTR_CAPS_SZ_3 bytes long with a magic word that,
 * flags aside, equals VFS_CAP_REVISION_3.  @cap is only read when the size
 * matches.
 */
static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_3 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
}

/*
 * getsecurity: We are called for security.* before any attempt to read the
 * xattr from the inode itself.
 *
 * This gives us a chance to read the on-disk value and convert it.  If we
 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
 *
 * Note we are not called by vfs_getxattr_alloc(), but that is only called
 * by the integrity subsystem, which really wants the unconverted values -
 * so that's good.
 *
 * @inode:  inode whose security.capability xattr is being read
 * @name:   xattr name suffix; only "capability" is handled here
 * @buffer: when @alloc is true and the call succeeds, receives the buffer
 *          holding the (possibly converted) value
 * @alloc:  when false, only the resulting size is reported and @buffer is
 *          not written
 *
 * Returns sizeof(struct vfs_ns_cap_data) when the value is presented as v3,
 * sizeof(struct vfs_cap_data) when it is presented as v2, or a negative
 * errno: -EOPNOTSUPP for any other @name, -EINVAL when no dentry alias is
 * found or the stored value is neither v2 nor v3, -ENOMEM when a conversion
 * buffer cannot be allocated, -EOVERFLOW when the stored root id does not
 * own the current user namespace.  If vfs_getxattr_alloc() fails or yields
 * no buffer, its return value is passed through.
 */
int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
                          bool alloc)
{
        int size, ret;
        kuid_t kroot;
        u32 nsmagic, magic;
        uid_t root, mappedroot;
        char *tmpbuf = NULL;
        struct vfs_cap_data *cap;
        struct vfs_ns_cap_data *nscap = NULL;
        struct dentry *dentry;
        struct user_namespace *fs_ns;

        if (strcmp(name, "capability") != 0)
                return -EOPNOTSUPP;

        dentry = d_find_any_alias(inode);
        if (!dentry)
                return -EINVAL;

        /* Fetch the raw stored value into tmpbuf. */
        size = sizeof(struct vfs_ns_cap_data);
        ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
                                 &tmpbuf, size, GFP_NOFS);
        dput(dentry);

        if (ret < 0 || !tmpbuf) {
                size = ret;
                goto out_free;
        }

        /*
         * Work out the root id recorded in the value: v2 carries none and
         * is treated as 0, v3 stores it explicitly.  Anything else is
         * rejected.
         */
        fs_ns = inode->i_sb->s_user_ns;
        cap = (struct vfs_cap_data *) tmpbuf;
        if (is_v2header((size_t) ret, cap)) {
                root = 0;
        } else if (is_v3header((size_t) ret, cap)) {
                nscap = (struct vfs_ns_cap_data *) tmpbuf;
                root = le32_to_cpu(nscap->rootid);
        } else {
                size = -EINVAL;
                goto out_free;
        }

        /* The stored root id is interpreted in the filesystem's user ns. */
        kroot = make_kuid(fs_ns, root);

        /* If the root kuid maps to a valid uid in current ns, then return
         * this as a nscap. */
        mappedroot = from_kuid(current_user_ns(), kroot);
        if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
                size = sizeof(struct vfs_ns_cap_data);
                if (alloc) {
                        if (!nscap) {
                                /* v2 -> v3 conversion */
                                nscap = kzalloc(size, GFP_ATOMIC);
                                if (!nscap) {
                                        size = -ENOMEM;
                                        goto out_free;
                                }
                                nsmagic = VFS_CAP_REVISION_3;
                                magic = le32_to_cpu(cap->magic_etc);
                                if (magic & VFS_CAP_FLAGS_EFFECTIVE)
                                        nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
                                memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                                nscap->magic_etc = cpu_to_le32(nsmagic);
                        } else {
                                /*
                                 * use allocated v3 buffer: nscap aliases
                                 * tmpbuf, so clear tmpbuf to keep the
                                 * kfree() at out_free from releasing the
                                 * buffer being handed out.
                                 */
                                tmpbuf = NULL;
                        }
                        /* Report the root id as seen from the current ns. */
                        nscap->rootid = cpu_to_le32(mappedroot);
                        *buffer = nscap;
                }
                goto out_free;
        }

        if (!rootid_owns_currentns(kroot)) {
                size = -EOVERFLOW;
                goto out_free;
        }

        /* This comes from a parent namespace.  Return as a v2 capability */
        size = sizeof(struct vfs_cap_data);
        if (alloc) {
                if (nscap) {
                        /* v3 -> v2 conversion */
                        cap = kzalloc(size, GFP_ATOMIC);
                        if (!cap) {
                                size = -ENOMEM;
                                goto out_free;
                        }
                        magic = VFS_CAP_REVISION_2;
                        nsmagic = le32_to_cpu(nscap->magic_etc);
                        if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
                                magic |= VFS_CAP_FLAGS_EFFECTIVE;
                        memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                        cap->magic_etc = cpu_to_le32(magic);
                } else {
                        /*
                         * use unconverted v2: cap aliases tmpbuf, so clear
                         * tmpbuf to keep it from being freed below.
                         */
                        tmpbuf = NULL;
                }
                *buffer = cap;
        }
out_free:
        /* Frees the raw value unless it was handed out (tmpbuf == NULL). */
        kfree(tmpbuf);
        return size;
}

/*
 * Build the kuid of the root user named by a capability xattr value, with
 * the id interpreted in @task_ns.  A value of XATTR_CAPS_SZ_3 bytes carries
 * an explicit rootid field; any other size is treated as root id 0 and
 * @value is not read.
 */
static kuid_t rootid_from_xattr(const void *value, size_t size,
                                struct user_namespace *task_ns)
{
        const struct vfs_ns_cap_data *nscap = value;

        if (size != XATTR_CAPS_SZ_3)
                return make_kuid(task_ns, 0);

        return make_kuid(task_ns, le32_to_cpu(nscap->rootid));
}

/* Accept an xattr value only if it is a well-formed v2 or v3 capability. */
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
        if (is_v2header(size, cap))
                return true;

        return is_v3header(size, cap);
}

/*
 * User requested a write of security.capability.  If needed, update the
 * xattr to change from v2 to v3, or to fixup the v3 rootid.
 *
 * @dentry: dentry of the inode the xattr is being written to
 * @ivalue: in/out pointer to the value buffer; on a converting success the
 *          old buffer is released with kvfree() and *@ivalue is replaced by
 *          a newly kmalloc'ed struct vfs_ns_cap_data
 * @size:   size in bytes of the incoming value
 *
 * If all is ok, we return the new size, on error return < 0:
 * -EINVAL for a NULL value, a header that is neither v2 nor v3, or a root
 * id that cannot be mapped; -EPERM when the caller lacks CAP_SETFCAP with
 * respect to the inode; -ENOMEM when the new buffer cannot be allocated.
 */
int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
{
        struct vfs_ns_cap_data *nscap;
        uid_t nsrootid;
        const struct vfs_cap_data *cap = *ivalue;
        __u32 magic, nsmagic;
        struct inode *inode = d_backing_inode(dentry);
        struct user_namespace *task_ns = current_user_ns(),
                *fs_ns = inode->i_sb->s_user_ns;
        kuid_t rootid;
        size_t newsize;

        if (!*ivalue)
                return -EINVAL;
        if (!validheader(size, cap))
                return -EINVAL;
        if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
                return -EPERM;
        /* A v2 value from a caller privileged over the filesystem's user
         * ns is stored unchanged; *ivalue is left untouched. */
        if (size == XATTR_CAPS_SZ_2)
                if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
                        /* user is privileged, just write the v2 */
                        return size;

        /* The root id in the value is expressed in the writer's user ns. */
        rootid = rootid_from_xattr(*ivalue, size, task_ns);
        if (!uid_valid(rootid))
                return -EINVAL;

        /* Re-express it in the filesystem's user ns for storage. */
        nsrootid = from_kuid(fs_ns, rootid);
        if (nsrootid == -1)
                return -EINVAL;

        /* Build the v3 value that will replace the caller's buffer. */
        newsize = sizeof(struct vfs_ns_cap_data);
        nscap = kmalloc(newsize, GFP_ATOMIC);
        if (!nscap)
                return -ENOMEM;
        nscap->rootid = cpu_to_le32(nsrootid);
        nsmagic = VFS_CAP_REVISION_3;
        magic = le32_to_cpu(cap->magic_etc);
        if (magic & VFS_CAP_FLAGS_EFFECTIVE)
                nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
        nscap->magic_etc = cpu_to_le32(nsmagic);
        memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);

        /* cap pointed into the old buffer; it must not be used past here. */
        kvfree(*ivalue);
        *ivalue = nscap;
        return newsize;
}

/*
 * Calculate the new process capability sets from the capability sets attached
 * to a file.
 *
 * @caps:      file capability data, already in CPU byte order
 * @bprm:      exec parameters; bprm->cred->cap_permitted is rewritten
 * @effective: set to true when the file has VFS_CAP_FLAGS_EFFECTIVE; left
 *             untouched otherwise
 * @has_fcap:  set to true when the file's magic carries a revision; left
 *             untouched otherwise
 *
 * Returns -EPERM when *@effective is true and some file-permitted capability
 * could not be granted, 0 otherwise.
 */
static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
                                          struct linux_binprm *bprm,
                                          bool *effective,
                                          bool *has_fcap)
{
        struct cred *new = bprm->cred;
        unsigned i;
        int ret = 0;

        if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
                *effective = true;

        if (caps->magic_etc & VFS_CAP_REVISION_MASK)
                *has_fcap = true;

        CAP_FOR_EACH_U32(i) {
                const __u32 file_p = caps->permitted.cap[i];
                const __u32 file_i = caps->inheritable.cap[i];
                /*
                 * pP' = (X & fP) | (pI & fI)
                 * The addition of pA' is handled later.
                 */
                const __u32 new_p = (new->cap_bset.cap[i] & file_p) |
                                    (new->cap_inheritable.cap[i] & file_i);

                new->cap_permitted.cap[i] = new_p;

                /* A forced (file-permitted) bit was not granted. */
                if (file_p & ~new_p)
                        ret = -EPERM;
        }

        /*
         * For legacy apps, with no internal support for recognizing they
         * do not have enough capabilities, we return an error if they are
         * missing some "forced" (aka file-permitted) capabilities.
         */
        if (!*effective)
                return 0;

        return ret;
}

/*
 * Extract the on-exec-apply capability sets for an executable file.
 *
 * @dentry:   dentry of the file whose security.capability xattr is read
 * @cpu_caps: output; zeroed first, then filled with the permitted and
 *            inheritable sets, the magic word and the root kuid, all in
 *            CPU byte order
 *
 * Returns 0 on success.  Returns -ENODATA when there is no backing inode,
 * the xattr is absent or unsupported, or the recorded root id does not own
 * the current user namespace.  Returns -EINVAL when the value is too short,
 * has an unknown revision, or its size does not match its revision.  Any
 * other negative value from __vfs_getxattr() is passed through.
 */
int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
{
        struct inode *inode = d_backing_inode(dentry);
        __u32 magic_etc;
        unsigned tocopy, i;
        int size;
        /* One buffer, viewed either as the v3 layout or the v1/v2 layout. */
        struct vfs_ns_cap_data data, *nscaps = &data;
        struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
        kuid_t rootkuid;
        struct user_namespace *fs_ns;

        memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

        if (!inode)
                return -ENODATA;

        fs_ns = inode->i_sb->s_user_ns;
        size = __vfs_getxattr((struct dentry *)dentry, inode,
                              XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
        if (size == -ENODATA || size == -EOPNOTSUPP)
                /* no data, that's ok */
                return -ENODATA;

        if (size < 0)
                return size;

        /* size is non-negative here; the magic word must be fully present. */
        if (size < sizeof(magic_etc))
                return -EINVAL;

        cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);

        /* Default root id is 0 in the filesystem's user ns; v3 overrides it. */
        rootkuid = make_kuid(fs_ns, 0);
        /* Each revision has exactly one valid size and one word count. */
        switch (magic_etc & VFS_CAP_REVISION_MASK) {
        case VFS_CAP_REVISION_1:
                if (size != XATTR_CAPS_SZ_1)
                        return -EINVAL;
                tocopy = VFS_CAP_U32_1;
                break;
        case VFS_CAP_REVISION_2:
                if (size != XATTR_CAPS_SZ_2)
                        return -EINVAL;
                tocopy = VFS_CAP_U32_2;
                break;
        case VFS_CAP_REVISION_3:
                if (size != XATTR_CAPS_SZ_3)
                        return -EINVAL;
                tocopy = VFS_CAP_U32_3;
                rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
                break;

        default:
                return -EINVAL;
        }
        /* Limit the caps to the mounter of the filesystem
         * or the more limited uid specified in the xattr.
         */
        if (!rootid_owns_currentns(rootkuid))
                return -ENODATA;

        /* Copy only the words this revision provides; the rest stay zero. */
        CAP_FOR_EACH_U32(i) {
                if (i >= tocopy)
                        break;
                cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
                cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
        }

        /* Keep only the bits CAP_LAST_U32_VALID_MASK allows in the last word. */
        cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
        cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;

        cpu_caps->rootid = rootkuid;

        return 0;
}

/*
 * Attempt to get the on-exec apply capability sets for an executable file from
 * its xattrs and, if present, apply them to the proposed credentials being
 * constructed by execve().
 *
 * The proposed permitted set is cleared up front, and cleared again whenever
 * a non-zero result is returned.  Missing file capabilities (-ENODATA) count
 * as success; -EINVAL is logged and returned.
 */
static int get_file_caps(struct linux_binprm *bprm, struct file *file,
                         bool *effective, bool *has_fcap)
{
        struct cpu_vfs_cap_data vcaps;
        int rc;

        cap_clear(bprm->cred->cap_permitted);

        /*
         * Nothing to apply when file capabilities are disabled or the mount
         * may not confer privilege.  The final check is redundant with
         * mnt_may_suid() but is kept to make explicit that capability bits
         * are limited to s_user_ns and its descendants.
         */
        if (!file_caps_enabled ||
            !mnt_may_suid(file->f_path.mnt) ||
            !current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
                return 0;

        rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps);
        if (rc >= 0)
                rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);
        else if (rc == -EINVAL)
                printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
                                bprm->filename);
        else if (rc == -ENODATA)
                rc = 0;

        if (rc)
                cap_clear(bprm->cred->cap_permitted);

        return rc;
}

/* True unless SECURE_NOROOT is set in the current securebits. */
static inline bool root_privileged(void)
{
        return !issecure(SECURE_NOROOT);
}

/* True when the real UID recorded in @cred equals @uid. */
static inline bool __is_real(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->uid, uid);
}

/* True when the effective UID recorded in @cred equals @uid. */
static inline bool __is_eff(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->euid, uid);
}

/* True when @cred has @uid as its effective UID but not as its real UID. */
static inline bool __is_suid(kuid_t uid, struct cred *cred)
{
        if (__is_real(uid, cred))
                return false;
        return __is_eff(uid, cred);
}

/*
 * handle_privileged_root - Handle case of privileged root
 * @bprm: The execution parameters, including the proposed creds
 * @has_fcap: Are any file capabilities set?
 * @effective: Set to true when the proposed effective UID is root
 * @root_uid: This namespace's root UID WRT initial USER namespace
 *
 * Nothing is done when SECURE_NOROOT has neutered root.  Otherwise:
 *  - a set-UID-root binary that also carries file capabilities only triggers
 *    a warning and the creds are left alone;
 *  - if the proposed real or effective UID is root, cap_permitted is replaced
 *    by the union of the old bounding and inheritable sets;
 *  - if the proposed effective UID is root, *@effective is set.
 */
static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
                                   bool *effective, kuid_t root_uid)
{
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool euid_is_root, ruid_is_root;

        if (!root_privileged())
                return;

        euid_is_root = __is_eff(root_uid, new);
        ruid_is_root = __is_real(root_uid, new);

        /*
         * Legacy file capabilities on a setuid-root binary run by a
         * non-root user: do not hand out root's privileges.  A real root
         * user still gets them, to cause least surprise to an admin.
         */
        if (has_fcap && euid_is_root && !ruid_is_root) {
                warn_setuid_and_fcaps_mixed(bprm->filename);
                return;
        }

        /*
         * Compatibility mode: inheritance of root permissions and suid-root
         * executables override the capability sets of the file.
         *
         * pP' = (cap_bset & ~0) | (pI & ~0)
         */
        if (euid_is_root || ruid_is_root)
                new->cap_permitted = cap_combine(old->cap_bset,
                                                 old->cap_inheritable);

        /* With only the real uid being 0 the effective bit stays untouched. */
        if (euid_is_root)
                *effective = true;
}

/*
 * Capability-set comparison helpers.  Each argument that names a cred is
 * parenthesized so that any pointer expression (e.g. &cred) can be passed,
 * and each expansion is wrapped so it composes safely in larger expressions.
 *
 * __cap_gained(field, target, source):
 *      target's cap_<field> is not a subset of source's cap_<field>.
 * __cap_grew(target, source, cred):
 *      within one cred, cap_<target> is not a subset of cap_<source>.
 * __cap_full(field, cred):
 *      CAP_FULL_SET is a subset of cred's cap_<field>.
 */
#define __cap_gained(field, target, source) \
        (!cap_issubset((target)->cap_##field, (source)->cap_##field))
#define __cap_grew(target, source, cred) \
        (!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
#define __cap_full(field, cred) \
        (cap_issubset(CAP_FULL_SET, (cred)->cap_##field))

/* True when the effective UID of @new differs from the real UID of @old. */
static inline bool __is_setuid(struct cred *new, const struct cred *old)
{
        return !uid_eq(new->euid, old->uid);
}

/* True when the effective GID of @new differs from the real GID of @old. */
static inline bool __is_setgid(struct cred *new, const struct cred *old)
{
        return !gid_eq(new->egid, old->gid);
}

/*
 * Decide whether this exec must be audit-logged.  Any one of the following
 * makes it a candidate:
 *
 * 1) The new effective set holds something that the new ambient set does
 *    not.  This is NOT logged when all three of these hold, since that is
 *    just a normal root exec'ing a process:
 *      a) cap_effective has all caps
 *      b) we became root *OR* were already root
 *      c) root is privileged (SECURE_NOROOT is not set)
 *    (a) can fail when the bounding set is not full, but that is considered
 *    interesting information to audit.
 * 2) something prevented setuid root getting all caps
 * 3) non-setuid root gets fcaps
 * 4) non-setuid root gets ambient
 */
static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
                                     kuid_t root, bool has_fcap)
{
        /* Case 1: pE' grew beyond pA' and this is not a plain root exec. */
        if (__cap_grew(effective, ambient, new) &&
            !(__cap_full(effective, new) &&
              (__is_eff(root, new) || __is_real(root, new)) &&
              root_privileged()))
                return true;

        /* Case 2: setuid-root that ended up without a full effective set. */
        if (root_privileged() && __is_suid(root, new) &&
            !__cap_full(effective, new))
                return true;

        /* Cases 3 and 4 only apply when the exec is not a setuid one. */
        if (__is_setuid(new, old))
                return false;

        if (has_fcap && __cap_gained(permitted, new, old))
                return true;

        return __cap_gained(ambient, new, old);
}

/**
 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 * @bprm: The execution parameters, including the proposed creds
 * @file: The file to pull the credentials from
 *
 * Set up the proposed credentials for a new execution context being
 * constructed by execve().  The proposed creds in @bprm->cred are altered,
 * which won't take effect immediately.
 *
 * Besides the capability sets, this function rewrites the saved and
 * filesystem UID/GID of the proposed creds, clears SECURE_KEEP_CAPS in their
 * securebits, may add PER_CLEAR_ON_SETID to @bprm->per_clear and may set
 * @bprm->secureexec.
 *
 * Return: 0 if successful, -EPERM if the ambient capability invariant does
 * not hold for the old or the new creds, or the -ve error reported by
 * get_file_caps() or audit_log_bprm_fcaps().
 */
int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
{
        /* Process setpcap binaries and capabilities for uid 0 */
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool effective = false, has_fcap = false, is_setid;
        int ret;
        kuid_t root_uid;

        /* Refuse to build on top of creds that are already inconsistent. */
        if (WARN_ON(!cap_ambient_invariant_ok(old)))
                return -EPERM;

        /* Start from the file capabilities, if the executable has any. */
        ret = get_file_caps(bprm, file, &effective, &has_fcap);
        if (ret < 0)
                return ret;

        root_uid = make_kuid(new->user_ns, 0);

        /* Layer the legacy "root has everything" rules on top. */
        handle_privileged_root(bprm, has_fcap, &effective, root_uid);

        /* if we have fs caps, clear dangerous personality flags */
        if (__cap_gained(permitted, new, old))
                bprm->per_clear |= PER_CLEAR_ON_SETID;

        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
         * credentials unless they have the appropriate permit.
         *
         * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
         */
        is_setid = __is_setuid(new, old) || __is_setgid(new, old);

        if ((is_setid || __cap_gained(permitted, new, old)) &&
            ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
             !ptracer_capable(current, new->user_ns))) {
                /* downgrade; they get no more than they had, and maybe less */
                if (!ns_capable(new->user_ns, CAP_SETUID) ||
                    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
                        new->euid = new->uid;
                        new->egid = new->gid;
                }
                new->cap_permitted = cap_intersect(new->cap_permitted,
                                                   old->cap_permitted);
        }

        /* Saved and filesystem IDs follow the (possibly downgraded) effective IDs. */
        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;

        /* File caps or setid cancels ambient. */
        if (has_fcap || is_setid)
                cap_clear(new->cap_ambient);

        /*
         * Now that we've computed pA', update pP' to give:
         *   pP' = (X & fP) | (pI & fI) | pA'
         */
        new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);

        /*
         * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
         * this is the same as pE' = (fE ? pP' : 0) | pA'.
         */
        if (effective)
                new->cap_effective = new->cap_permitted;
        else
                new->cap_effective = new->cap_ambient;

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Audit the exec when nonroot_raised_pE() flags it as noteworthy. */
        if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
                ret = audit_log_bprm_fcaps(bprm, new, old);
                if (ret < 0)
                        return ret;
        }

        /* SECURE_KEEP_CAPS does not survive an exec. */
        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Check for privilege-elevated exec. */
        if (is_setid ||
            (!__is_real(root_uid, new) &&
             (effective ||
              __cap_grew(permitted, ambient, new))))
                bprm->secureexec = 1;

        return 0;
}

/**
 * cap_inode_setxattr - Determine whether an xattr may be altered
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 * @value: The value that the xattr will be changed to
 * @size: The size of value
 * @flags: The replacement flag
 *
 * Guards the security.* xattr namespace against being set or updated by
 * tasks that lack the privilege to do so.  @value, @size and @flags are not
 * examined here.
 *
 * Return: 0 if permission is granted, -EPERM if denied.
 */
int cap_inode_setxattr(struct dentry *dentry, const char *name,
                       const void *value, size_t size, int flags)
{
        struct user_namespace *sb_userns = dentry->d_sb->s_user_ns;

        /* Anything outside the security.* namespace is not our business. */
        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN) != 0)
                return 0;

        /*
         * XATTR_NAME_CAPS is vetted in cap_convert_nscap(), which setxattr()
         * calls, so no check is made here.
         */
        if (!strcmp(name, XATTR_NAME_CAPS))
                return 0;

        /* Every other security.* xattr needs CAP_SYS_ADMIN over the sb's ns. */
        return ns_capable(sb_userns, CAP_SYS_ADMIN) ? 0 : -EPERM;
}

/**
 * cap_inode_removexattr - Determine whether an xattr may be removed
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 *
 * Guards the security.* xattr namespace against removal by tasks that lack
 * the privilege to do so.
 *
 * Return: 0 if permission is granted, -EPERM if denied, -EINVAL if @dentry
 * has no backing inode while security.capability is being removed.
 */
int cap_inode_removexattr(struct dentry *dentry, const char *name)
{
        struct user_namespace *sb_userns = dentry->d_sb->s_user_ns;
        struct inode *inode;

        /* Anything outside the security.* namespace is not our business. */
        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN) != 0)
                return 0;

        /* Generic security.* xattrs need CAP_SYS_ADMIN over the sb's ns. */
        if (strcmp(name, XATTR_NAME_CAPS) != 0)
                return ns_capable(sb_userns, CAP_SYS_ADMIN) ? 0 : -EPERM;

        /* security.capability gets namespaced */
        inode = d_backing_inode(dentry);
        if (!inode)
                return -EINVAL;

        return capable_wrt_inode_uidgid(inode, CAP_SETFCAP) ? 0 : -EPERM;
}

/*
 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
 * a process after a call to setuid, setreuid, or setresuid.
 *
 *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
 *  {r,e,s}uid != 0, the permitted and effective capabilities are
 *  cleared.
 *
 *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
 *  capabilities of the process are cleared.
 *
 *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
 *  capabilities are set to the permitted capabilities.
 *
 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
 *  never happen.
 *
 *  -astor
 *
 * cevans - New behaviour, Oct '99
 * A process may, via prctl(), elect to keep its capabilities when it
 * calls setuid() and switches away from uid==0. Both permitted and
 * effective sets will be retained.
 * Without this change, it was impossible for a daemon to drop only some
 * of its privilege. The call to setuid(!=0) would drop all privileges!
 * Keeping uid 0 is not an option because uid 0 owns too many vital
 * files..
 * Thanks to Olaf Kirch and Peter Benie for spotting this.
 */
static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
{
        kuid_t root_uid = make_kuid(old->user_ns, 0);
        /* Was / is any of the real, effective or saved UIDs root? */
        bool had_root_id = uid_eq(old->uid, root_uid) ||
                           uid_eq(old->euid, root_uid) ||
                           uid_eq(old->suid, root_uid);
        bool has_root_id = uid_eq(new->uid, root_uid) ||
                           uid_eq(new->euid, root_uid) ||
                           uid_eq(new->suid, root_uid);
        bool old_euid_root = uid_eq(old->euid, root_uid);
        bool new_euid_root = uid_eq(new->euid, root_uid);

        /* Rule 1: every trace of root is being given up. */
        if (had_root_id && !has_root_id) {
                /* SECURE_KEEP_CAPS lets permitted and effective survive. */
                if (!issecure(SECURE_KEEP_CAPS)) {
                        cap_clear(new->cap_permitted);
                        cap_clear(new->cap_effective);
                }

                /*
                 * Programs written before ambient capabilities existed rely
                 * on setresuid to nonroot followed by exec dropping all
                 * capabilities, so ambient is cleared unconditionally.
                 */
                cap_clear(new->cap_ambient);
        }

        /* Rules 2 and 3: the effective set follows euid across the root edge. */
        if (old_euid_root && !new_euid_root)
                cap_clear(new->cap_effective);
        else if (!old_euid_root && new_euid_root)
                new->cap_effective = new->cap_permitted;
}

/**
 * cap_task_fix_setuid - Fix up the results of setuid() call
 * @new: The proposed credentials
 * @old: The current task's current credentials
 * @flags: Indications of what has changed
 *
 * Adjust the capability sets in @new to match the UID change described by
 * @flags, before the credential changes are actually applied.  No adjustment
 * is made while SECURE_NO_SETUID_FIXUP is set.
 *
 * Return: 0 to grant the changes, -EINVAL if @flags is not one of the
 * LSM_SETID_* values handled here.
 */
int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
{
        kuid_t root_uid;
        bool fs_was_root, fs_is_root;

        switch (flags) {
        case LSM_SETID_RE:
        case LSM_SETID_ID:
        case LSM_SETID_RES:
                /*
                 * juggle the capabilities to follow [RES]UID changes unless
                 * otherwise suppressed
                 */
                if (issecure(SECURE_NO_SETUID_FIXUP))
                        return 0;
                cap_emulate_setxuid(new, old);
                return 0;

        case LSM_SETID_FS:
                /*
                 * juggle the capabilities to follow FSUID changes, unless
                 * otherwise suppressed
                 *
                 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
                 *          if not, we might be a bit too harsh here.
                 */
                if (issecure(SECURE_NO_SETUID_FIXUP))
                        return 0;

                root_uid = make_kuid(old->user_ns, 0);
                fs_was_root = uid_eq(old->fsuid, root_uid);
                fs_is_root = uid_eq(new->fsuid, root_uid);

                /* Leaving fsuid root: drop the fs capabilities from pE. */
                if (fs_was_root && !fs_is_root)
                        new->cap_effective =
                                cap_drop_fs_set(new->cap_effective);

                /* Becoming fsuid root: raise the permitted fs capabilities. */
                if (!fs_was_root && fs_is_root)
                        new->cap_effective =
                                cap_raise_fs_set(new->cap_effective,
                                                 new->cap_permitted);
                return 0;

        default:
                return -EINVAL;
        }
}

/*
 * Rationale: the callers of task_setscheduler, task_setioprio and
 * task_setnice assume that
 *   . if capable(cap_sys_nice), the action should be allowed
 *   . if not capable(cap_sys_nice) but acting on your own processes,
 *           the action should be allowed
 * That is no longer sufficient, because code can run without suid and yet
 * with increased caps.  So the target process is checked for increased caps:
 * the action is allowed when the target's permitted set is a subset of the
 * caller's, or when the caller has CAP_SYS_NICE over the target's user
 * namespace.
 *
 * Returns 0 when allowed, -EPERM otherwise.
 */
static int cap_safe_nice(struct task_struct *p)
{
        int ret = -EPERM;

        /* The target's creds are only looked at under the RCU read lock. */
        rcu_read_lock();
        if (cap_issubset(__task_cred(p)->cap_permitted,
                         current_cred()->cap_permitted) ||
            ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
                ret = 0;
        rcu_read_unlock();

        return ret;
}

/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated entirely to cap_safe_nice().
 */
int cap_task_setscheduler(struct task_struct *p)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined by this function)
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated entirely to cap_safe_nice().
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined by this function)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated entirely to cap_safe_nice().
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
        return cap_safe_nice(p);
}

/*
 * Implement PR_CAPBSET_DROP: remove capability @cap from the bounding set of
 * the current task.
 *
 * Returns -EPERM when the caller lacks CAP_SETPCAP in its own user
 * namespace, -EINVAL when @cap is not a valid capability, -ENOMEM when no
 * new creds could be prepared, and otherwise the result of commit_creds().
 */
static int cap_prctl_drop(unsigned long cap)
{
        struct cred *creds;

        /* The privilege check comes first, before @cap is even validated. */
        if (!ns_capable(current_user_ns(), CAP_SETPCAP))
                return -EPERM;

        if (!cap_valid(cap))
                return -EINVAL;

        creds = prepare_creds();
        if (!creds)
                return -ENOMEM;

        cap_lower(creds->cap_bset, cap);

        return commit_creds(creds);
}

/**
 * cap_task_prctl - Implement process control functions for this security module
 * @option: The process control function requested
 * @arg2: The argument data for this function
 * @arg3: The argument data for this function
 * @arg4: The argument data for this function
 * @arg5: The argument data for this function
 *
 * Allow process control functions (sys_prctl()) to alter capabilities; may
 * also deny access to other functions not otherwise implemented here.
 *
 * Options handled: PR_CAPBSET_READ, PR_CAPBSET_DROP, PR_SET_SECUREBITS,
 * PR_GET_SECUREBITS, PR_GET_KEEPCAPS, PR_SET_KEEPCAPS and PR_CAP_AMBIENT.
 * Options that modify state do so on a copy from prepare_creds() which is
 * then passed to commit_creds().
 *
 * Returns 0 or +ve on success, -ENOSYS if this function is not implemented
 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
 * modules will consider performing the function.
 */
int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                   unsigned long arg4, unsigned long arg5)
{
        const struct cred *old = current_cred();
        struct cred *new;

        switch (option) {
        case PR_CAPBSET_READ:
                /* Report (as 0 or 1) whether capability @arg2 is in the bounding set. */
                if (!cap_valid(arg2))
                        return -EINVAL;
                return !!cap_raised(old->cap_bset, arg2);

        case PR_CAPBSET_DROP:
                return cap_prctl_drop(arg2);

        /*
         * The next four prctl's remain to assist with transitioning a
         * system from legacy UID=0 based privilege (when filesystem
         * capabilities are not in use) to a system using filesystem
         * capabilities only - as the POSIX.1e draft intended.
         *
         * Note:
         *
         *  PR_SET_SECUREBITS =
         *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
         *    | issecure_mask(SECURE_NOROOT)
         *    | issecure_mask(SECURE_NOROOT_LOCKED)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
         *
         * will ensure that the current process and all of its
         * children will be locked into a pure
         * capability-based-privilege environment.
         */
        case PR_SET_SECUREBITS:
                if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
                     & (old->securebits ^ arg2))                        /*[1]*/
                    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))        /*[2]*/
                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))        /*[3]*/
                    || (cap_capable(current_cred(),
                                    current_cred()->user_ns,
                                    CAP_SETPCAP,
                                    CAP_OPT_NONE) != 0)                        /*[4]*/
                        /*
                         * [1] no changing of bits that are locked
                         * [2] no unlocking of locks
                         * [3] no setting of unsupported bits
                         * [4] doing anything requires privilege (go read about
                         *     the "sendmail capabilities bug")
                         */
                    )
                        /* cannot change a locked bit */
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                new->securebits = arg2;
                return commit_creds(new);

        case PR_GET_SECUREBITS:
                return old->securebits;

        case PR_GET_KEEPCAPS:
                return !!issecure(SECURE_KEEP_CAPS);

        case PR_SET_KEEPCAPS:
                if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
                        return -EINVAL;
                /* The keep-caps bit cannot be changed once it is locked. */
                if (issecure(SECURE_KEEP_CAPS_LOCKED))
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                if (arg2)
                        new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
                else
                        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
                return commit_creds(new);

        case PR_CAP_AMBIENT:
                /* CLEAR_ALL takes no further arguments; all of them must be zero. */
                if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
                        if (arg3 | arg4 | arg5)
                                return -EINVAL;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        cap_clear(new->cap_ambient);
                        return commit_creds(new);
                }

                /* The other sub-options take a capability in @arg3 and nothing else. */
                if (((!cap_valid(arg3)) | arg4 | arg5))
                        return -EINVAL;

                if (arg2 == PR_CAP_AMBIENT_IS_SET) {
                        return !!cap_raised(current_cred()->cap_ambient, arg3);
                } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
                           arg2 != PR_CAP_AMBIENT_LOWER) {
                        return -EINVAL;
                } else {
                        /*
                         * Raising requires the capability to be both permitted
                         * and inheritable, and SECURE_NO_CAP_AMBIENT_RAISE to
                         * be unset.  Lowering has no precondition.
                         */
                        if (arg2 == PR_CAP_AMBIENT_RAISE &&
                            (!cap_raised(current_cred()->cap_permitted, arg3) ||
                             !cap_raised(current_cred()->cap_inheritable,
                                         arg3) ||
                             issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
                                return -EPERM;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        if (arg2 == PR_CAP_AMBIENT_RAISE)
                                cap_raise(new->cap_ambient, arg3);
                        else
                                cap_lower(new->cap_ambient, arg3);
                        return commit_creds(new);
                }

        default:
                /* No functionality available - continue with default */
                return -ENOSYS;
        }
}

/**
 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
 * @mm: The VM space in which the new mapping is to be made (not examined)
 * @pages: The size of the mapping (not examined)
 *
 * Checks, without generating an audit record, whether the current task has
 * CAP_SYS_ADMIN in the initial user namespace.
 *
 * Return: 1 if the capability check succeeds, 0 if not.
 */
int cap_vm_enough_memory(struct mm_struct *mm, long pages)
{
        return cap_capable(current_cred(), &init_user_ns,
                           CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0;
}

/*
 * cap_mmap_addr - check if able to map given addr
 * @addr: address attempting to be mapped
 *
 * A process that attempts to map memory below dac_mmap_min_addr needs
 * CAP_SYS_RAWIO in the initial user namespace.  Addresses at or above that
 * limit are always allowed.
 *
 * Returns 0 if this mapping should be allowed, otherwise the error reported
 * by cap_capable().
 */
int cap_mmap_addr(unsigned long addr)
{
        int ret;

        if (addr >= dac_mmap_min_addr)
                return 0;

        ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
                          CAP_OPT_NONE);
        /* set PF_SUPERPRIV if it turns out we allow the low mmap */
        if (ret == 0)
                current->flags |= PF_SUPERPRIV;

        return ret;
}

/*
 * cap_mmap_file - mmap_file check of the capability module
 *
 * None of the arguments are examined; 0 is returned unconditionally.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
                  unsigned long prot, unsigned long flags)
{
        return 0;
}

#ifdef CONFIG_SECURITY

/*
 * Table pairing each LSM hook name with the cap_* function in this module
 * that implements it.  capability_init() hands the whole table to
 * security_add_hooks().
 * NOTE(review): __lsm_ro_after_init presumably makes the table read-only
 * once init has finished - confirm against its definition.
 */
static struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(capable, cap_capable),
        LSM_HOOK_INIT(settime, cap_settime),
        LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
        LSM_HOOK_INIT(capget, cap_capget),
        LSM_HOOK_INIT(capset, cap_capset),
        LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
        LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
        LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
        LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
        LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
        LSM_HOOK_INIT(mmap_file, cap_mmap_file),
        LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
        LSM_HOOK_INIT(task_prctl, cap_task_prctl),
        LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
        LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
        LSM_HOOK_INIT(task_setnice, cap_task_setnice),
        LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};

/*
 * Register every entry of capability_hooks[] with the LSM framework under
 * the name "capability".  Always returns 0.
 */
static int __init capability_init(void)
{
        security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
                                "capability");
        return 0;
}

/*
 * LSM descriptor for the capability module: named "capability", initialised
 * through capability_init().
 * NOTE(review): LSM_ORDER_FIRST presumably asks for this module to be
 * ordered ahead of the other LSMs - confirm against the LSM framework.
 */
DEFINE_LSM(capability) = {
        .name = "capability",
        .order = LSM_ORDER_FIRST,
        .init = capability_init,
};

#endif /* CONFIG_SECURITY */


















    1 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1

#ifndef __ASSEMBLY__

/*
 * Tell whether a page table entry value has to be stored inverted: that is
 * the case for every non-zero value that does not have _PAGE_PRESENT set.
 *
 * A clear (all zero) pte is special and never gets inverted.  Users that
 * pass only a pgprot_t rather than a full pte will not hit the zero case
 * by accident, because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
 * set.  The all-zero case is therefore limited to the cleared page table
 * entry.
 */
static inline bool __pte_needs_invert(u64 val)
{
        if (!val)
                return false;

        return (val & _PAGE_PRESENT) == 0;
}

/*
 * Return the mask to xor with a page table entry to obtain the correct pfn:
 * all ones when the entry is stored inverted, zero otherwise.
 */
static inline u64 protnone_mask(u64 val)
{
        if (__pte_needs_invert(val))
                return ~0ull;

        return 0;
}

/*
 * When a PTE transitions from NONE to !NONE or vice-versa, invert the bits
 * of @val selected by @mask (the PFN part) to stop speculation; pte_pfn
 * undoes this when needed.  If @oldval and @val agree on whether inversion
 * is needed, @val is returned unchanged.
 */
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
        if (__pte_needs_invert(oldval) == __pte_needs_invert(val))
                return val;

        /* Keep the bits outside @mask and complement the bits inside it. */
        return val ^ mask;
}

#endif /* __ASSEMBLY__ */

#endif






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
// SPDX-License-Identifier: GPL-2.0
/*
 *  ext4.h
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/include/linux/minix_fs.h
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#ifndef _EXT4_H
#define _EXT4_H

#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
#include <linux/jbd2.h>
#include <linux/quota.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
#include <crypto/hash.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#include <linux/fiemap.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif

#include <linux/fscrypt.h>
#include <linux/fsverity.h>

#include <linux/compiler.h>

/*
 * The fourth extended filesystem constants/structures
 */

/*
 * With AGGRESSIVE_CHECK defined, the allocator runs consistency checks
 * over its structures. These checks slow things down a lot.
 *
 * NOTE(review): the name defined below carries a trailing "__", so
 * AGGRESSIVE_CHECK itself is not defined by this line -- presumably this
 * is how the option is kept switched off; confirm before relying on it.
 */
#define AGGRESSIVE_CHECK__

/*
 * With DOUBLE_CHECK defined, mballoc creates persistent in-core
 * bitmaps, and maintains and uses them to check for double allocations.
 * As above, only DOUBLE_CHECK__ (trailing underscores) is defined here.
 */
#define DOUBLE_CHECK__

/*
 * Define EXT4FS_DEBUG to produce debug messages.  It is explicitly
 * undefined here, which selects the no_printk() variant of ext4_debug()
 * below.
 */
#undef EXT4FS_DEBUG

/*
 * ext4_debug() - printf-style debug message.
 *
 * Without EXT4FS_DEBUG the call collapses to no_printk(), which still
 * receives the format and arguments.  With EXT4FS_DEBUG it issues two
 * KERN_DEBUG printk() calls: a "file, line, function" prefix followed by
 * the caller's message.  @fmt must be a string literal because it is
 * pasted after KERN_DEBUG.
 */
#ifndef EXT4FS_DEBUG
#define ext4_debug(fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#else
#define ext4_debug(fmt, ...)                                                \
        do {                                                                \
                printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",        \
                       __FILE__, __LINE__, __func__);                        \
                printk(KERN_DEBUG fmt, ##__VA_ARGS__);                        \
        } while (0)
#endif

 /*
  * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c.
  * Only EXT_DEBUG__ (with trailing underscores) is defined here, so
  * EXT_DEBUG itself is not defined by this line.
  */
#define EXT_DEBUG__

/*
 * Dynamic printk for controlled extents debugging.
 *
 * @ino: pointer expression; with CONFIG_EXT4_DEBUG it is dereferenced
 *       twice (->i_sb->s_id and ->i_ino), so it must be free of side
 *       effects.  It is parenthesized in the expansion so that any
 *       pointer-valued expression (e.g. a conditional or a cast) can be
 *       passed safely.  Without CONFIG_EXT4_DEBUG it is not evaluated.
 * @fmt: string literal appended to the fixed prefix
 *       "[comm/pid] EXT4-fs (dev): ino N: (file, line): func:".
 * @...: arguments for @fmt.
 */
#ifdef CONFIG_EXT4_DEBUG
#define ext_debug(ino, fmt, ...)                                        \
        pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt,        \
                 current->comm, task_pid_nr(current),                        \
                 (ino)->i_sb->s_id, (ino)->i_ino, __FILE__, __LINE__,        \
                 __func__, ##__VA_ARGS__)
#else
#define ext_debug(ino, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/* Block offset within a block group (a signed int). */
typedef int ext4_grpblk_t;

/* Filesystem-wide block number (unsigned long long). */
typedef unsigned long long ext4_fsblk_t;

/* Logical block number within a file (32 bits). */
typedef __u32 ext4_lblk_t;

/* Block group number (unsigned int). */
typedef unsigned int ext4_group_t;

/*
 * Direction selector for a shift operation.
 * NOTE(review): the users are outside this chunk; presumably this picks
 * the direction in which extents are shifted -- confirm against callers.
 */
enum SHIFT_DIRECTION {
        SHIFT_LEFT  = 0,
        SHIFT_RIGHT = 1,
};

/*
 * Flags used in mballoc's allocation_context flags field.
 *
 * Also used to show what's going on for debugging purposes when the
 * flag field is exported via the tracepoint interface.
 *
 * Each flag is a distinct bit (0x0001 .. 0x4000), so flags can be OR-ed
 * together.
 */

/*
 * prefer goal again. length
 * NOTE(review): the wording above is the original and is cryptic;
 * confirm the exact meaning of this hint against mballoc before use.
 */
#define EXT4_MB_HINT_MERGE                0x0001
/* blocks already reserved */
#define EXT4_MB_HINT_RESERVED                0x0002
/* metadata is being allocated */
#define EXT4_MB_HINT_METADATA                0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST                0x0008
/* search for the best chunk */
#define EXT4_MB_HINT_BEST                0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA                0x0020
/* don't preallocate (for tails) */
#define EXT4_MB_HINT_NOPREALLOC                0x0040
/* allocate for locality group */
#define EXT4_MB_HINT_GROUP_ALLOC        0x0080
/* allocate goal blocks or none */
#define EXT4_MB_HINT_GOAL_ONLY                0x0100
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL                0x0200
/* blocks already pre-reserved by delayed allocation */
#define EXT4_MB_DELALLOC_RESERVED        0x0400
/* We are doing stream allocation */
#define EXT4_MB_STREAM_ALLOC                0x0800
/* Use reserved root blocks if needed */
#define EXT4_MB_USE_ROOT_BLOCKS                0x1000
/* Use blocks from reserved pool */
#define EXT4_MB_USE_RESERVED                0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK                0x4000

/*
 * Description of one block allocation request: which inode and logical
 * block the blocks are for, how many are wanted, the nearest already
 * allocated neighbours on either side (logical and physical), a physical
 * goal hint, and EXT4_MB_* flags.
 */
struct ext4_allocation_request {
        /* target inode for block we're allocating */
        struct inode *inode;
        /* how many blocks we want to allocate */
        unsigned int len;
        /* logical block in target inode */
        ext4_lblk_t logical;
        /* the closest logical allocated block to the left */
        ext4_lblk_t lleft;
        /* the closest logical allocated block to the right */
        ext4_lblk_t lright;
        /* phys. target (a hint) */
        ext4_fsblk_t goal;
        /* phys. block for the closest logical allocated block to the left */
        ext4_fsblk_t pleft;
        /* phys. block for the closest logical allocated block to the right */
        ext4_fsblk_t pright;
        /* flags. see above EXT4_MB_HINT_* */
        unsigned int flags;
};

/*
 * Logical to physical block mapping, used by ext4_map_blocks()
 *
 * This structure is used to pass requests into ext4_map_blocks() as
 * well as to store the information returned by ext4_map_blocks().  It
 * takes less room on the stack than a struct buffer_head.
 *
 * The EXT4_MAP_* flag values are built from the buffer_head state bit
 * numbers (BIT(BH_New), BIT(BH_Mapped), ...); EXT4_MAP_FLAGS is the OR
 * of all four.
 */
#define EXT4_MAP_NEW                BIT(BH_New)
#define EXT4_MAP_MAPPED                BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN        BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY        BIT(BH_Boundary)
#define EXT4_MAP_FLAGS                (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
                                 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)

struct ext4_map_blocks {
        ext4_fsblk_t m_pblk;        /* filesystem-wide (physical) block number */
        ext4_lblk_t m_lblk;        /* logical block number within the file */
        unsigned int m_len;        /* presumably length of the mapping in blocks -- confirm */
        unsigned int m_flags;        /* presumably EXT4_MAP_* flags -- confirm */
};

/*
 * Block validity checking, system zone rbtree.
 *
 * NOTE(review): the rcu_head member suggests instances are released via
 * RCU; the code doing so is outside this chunk -- confirm.
 */
struct ext4_system_blocks {
        struct rb_root root;        /* root of the system zone rbtree */
        struct rcu_head rcu;
};

/*
 * Flags for ext4_io_end->flag
 */
#define        EXT4_IO_END_UNWRITTEN        0x0001

/*
 * One extent (file offset + size) belonging to an ext4_io_end; entries
 * are chained on ext4_io_end.list_vec through ->list.
 */
struct ext4_io_end_vec {
        struct list_head list;                /* list of io_end_vec */
        loff_t offset;                        /* offset in the file */
        ssize_t size;                        /* size of the extent */
};

/*
 * For converting unwritten extents on a work queue. 'handle' is used for
 * buffered writeback.
 *
 * 'flag' holds EXT4_IO_END_* bits (currently EXT4_IO_END_UNWRITTEN);
 * 'count' is a reference counter; 'list_vec' heads the list of
 * ext4_io_end_vec extents covered by this io_end.
 */
typedef struct ext4_io_end {
        struct list_head        list;                /* per-file finished IO list */
        handle_t                *handle;        /* handle reserved for extent
                                                 * conversion */
        struct inode                *inode;                /* file being written to */
        struct bio                *bio;                /* Linked list of completed
                                                 * bios covering the extent */
        unsigned int                flag;                /* unwritten or not */
        atomic_t                count;                /* reference counter */
        struct list_head        list_vec;        /* list of ext4_io_end_vec */
} ext4_io_end_t;

/*
 * State carried while submitting I/O.
 * NOTE(review): the users of this structure are outside this chunk; the
 * field roles below are inferred from names and types -- confirm.
 */
struct ext4_io_submit {
        struct writeback_control *io_wbc;        /* presumably the writeback request being served */
        struct bio                *io_bio;        /* presumably the bio currently being built */
        ext4_io_end_t                *io_end;        /* presumably the io_end tied to that bio */
        sector_t                io_next_block;        /* presumably the next expected block */
};

/*
 * Special inode numbers (1 through 8 are assigned below)
 */
#define        EXT4_BAD_INO                 1        /* Bad blocks inode */
#define EXT4_ROOT_INO                 2        /* Root inode */
#define EXT4_USR_QUOTA_INO         3        /* User quota inode */
#define EXT4_GRP_QUOTA_INO         4        /* Group quota inode */
#define EXT4_BOOT_LOADER_INO         5        /* Boot loader inode */
#define EXT4_UNDEL_DIR_INO         6        /* Undelete directory inode */
#define EXT4_RESIZE_INO                 7        /* Reserved group descriptors inode */
#define EXT4_JOURNAL_INO         8        /* Journal inode */

/*
 * First non-reserved inode for old ext4 filesystems.  The non-kernel
 * EXT4_FIRST_INO() returns this when s_rev_level == EXT4_GOOD_OLD_REV.
 */
#define EXT4_GOOD_OLD_FIRST_INO        11

/*
 * Maximum number of links to a file
 */
#define EXT4_LINK_MAX                65000

/*
 * Block-size limits and accessors.
 *
 * Limits: block sizes run from 1024 (2^10) to 65536 (2^16) bytes; the
 * cluster log size is capped at 30.
 *
 * The accessors exist in two flavours.  Under __KERNEL__ they read
 * values from (sb)->s_blocksize / s_blocksize_bits and from EXT4_SB(sb).
 * Otherwise they derive them from the s_log_block_size, s_rev_level,
 * s_inode_size and s_first_ino fields of the structure passed in; that
 * flavour provides neither EXT4_CLUSTER_BITS() nor
 * EXT4_ADDR_PER_BLOCK_BITS().
 */
#define EXT4_MIN_BLOCK_LOG_SIZE                10
#define EXT4_MAX_BLOCK_LOG_SIZE                16
#define EXT4_MAX_CLUSTER_LOG_SIZE        30
#define EXT4_MIN_BLOCK_SIZE                1024
#define EXT4_MAX_BLOCK_SIZE                65536

#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE(sb)                ((sb)->s_blocksize)
# define EXT4_BLOCK_SIZE_BITS(sb)        ((sb)->s_blocksize_bits)
# define EXT4_CLUSTER_BITS(sb)                (EXT4_SB(sb)->s_cluster_bits)
# define EXT4_ADDR_PER_BLOCK_BITS(sb)        (EXT4_SB(sb)->s_addr_per_block_bits)
# define EXT4_INODE_SIZE(sb)                (EXT4_SB(sb)->s_inode_size)
# define EXT4_FIRST_INO(sb)                (EXT4_SB(sb)->s_first_ino)
#else /* !__KERNEL__ */
# define EXT4_BLOCK_SIZE(sb)                                                \
        (EXT4_MIN_BLOCK_SIZE << (sb)->s_log_block_size)
# define EXT4_BLOCK_SIZE_BITS(sb)                                        \
        ((sb)->s_log_block_size + EXT4_MIN_BLOCK_LOG_SIZE)
/* Old-revision filesystems use fixed values for these two. */
# define EXT4_INODE_SIZE(sb)                                                \
        (((sb)->s_rev_level == EXT4_GOOD_OLD_REV) ?                        \
         EXT4_GOOD_OLD_INODE_SIZE : (sb)->s_inode_size)
# define EXT4_FIRST_INO(sb)                                                \
        (((sb)->s_rev_level == EXT4_GOOD_OLD_REV) ?                        \
         EXT4_GOOD_OLD_FIRST_INO : (sb)->s_first_ino)
#endif /* __KERNEL__ */

/* Number of 32-bit block addresses that fit in one block. */
#define EXT4_ADDR_PER_BLOCK(sb)                (EXT4_BLOCK_SIZE(sb) / sizeof(__u32))
/* Cluster size in bytes: block size shifted left by s_cluster_bits. */
#define EXT4_CLUSTER_SIZE(sb)                                                \
        (EXT4_BLOCK_SIZE(sb) << EXT4_SB(sb)->s_cluster_bits)
/*
 * EXT4_BLOCK_ALIGN - pass @size to ALIGN() with an alignment of
 * (1 << @blkbits) bytes.  ALIGN() is defined outside this file; it is
 * assumed to round up to a multiple of the alignment.
 */
#define EXT4_BLOCK_ALIGN(size, blkbits)                ALIGN((size), (1 << (blkbits)))
/*
 * EXT4_MAX_BLOCKS - upper bound on the number of (1 << @blkbits)-byte
 * blocks touched by the byte range that starts at @offset and is @size
 * bytes long: the block-aligned end of the range, in blocks, minus the
 * block index containing @offset.
 *
 * Every argument is parenthesized in the expansion so that expressions
 * such as "a | b" or "c ? x : y" are handled correctly.  @offset and
 * @blkbits are expanded more than once, so they must not have side
 * effects.
 */
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
        ((EXT4_BLOCK_ALIGN((size) + (offset), (blkbits)) >> (blkbits)) - \
         ((offset) >> (blkbits)))

/*
 * Block <-> cluster helpers.  @sbi is a pointer to a structure that
 * provides s_cluster_bits and s_cluster_ratio.  The mask, fill and
 * offset macros compute (s_cluster_ratio - 1) after casting the ratio to
 * ext4_fsblk_t (PBLK variants) or ext4_lblk_t (LBLK variants).
 */

/* Block number -> cluster number */
#define EXT4_B2C(sbi, b)                                                \
        ((b) >> (sbi)->s_cluster_bits)
/* Cluster number -> number of its first block */
#define EXT4_C2B(sbi, c)                                                \
        ((c) << (sbi)->s_cluster_bits)
/* Block count -> cluster count, rounding up */
#define EXT4_NUM_B2C(sbi, n)                                                \
        (((n) + (sbi)->s_cluster_ratio - 1) >> (sbi)->s_cluster_bits)

/* Clear the low bits: first block of the cluster containing the block */
#define EXT4_PBLK_CMASK(sbi, b)                                                \
        ((b) & ~((ext4_fsblk_t) (sbi)->s_cluster_ratio - 1))
#define EXT4_LBLK_CMASK(sbi, b)                                                \
        ((b) & ~((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))

/* Set the low bits: last block of the cluster containing the block */
#define EXT4_LBLK_CFILL(sbi, b)                                                \
        ((b) | ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))

/* Keep only the low bits: offset of the block inside its cluster */
#define EXT4_PBLK_COFF(sbi, b)                                                \
        ((b) & ((ext4_fsblk_t) (sbi)->s_cluster_ratio - 1))
#define EXT4_LBLK_COFF(sbi, b)                                                \
        ((b) & ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))

/*
 * Structure of a blocks group descriptor
 *
 * Every field except bg_reserved is declared little-endian
 * (__le16/__le32).  Wider quantities are split into a _lo and a _hi
 * member.  The members up to and including bg_checksum add up to 32
 * bytes and the whole structure to 64 bytes (assuming no padding), which
 * matches EXT4_MIN_DESC_SIZE and EXT4_MIN_DESC_SIZE_64BIT.
 *
 * NOTE(review): the "LE"/"BE" tags on the bitmap checksum members
 * presumably mean the low and high halves of the crc32c value (matching
 * the _lo/_hi names), not byte order -- confirm.
 */
struct ext4_group_desc
{
        __le32        bg_block_bitmap_lo;        /* Blocks bitmap block */
        __le32        bg_inode_bitmap_lo;        /* Inodes bitmap block */
        __le32        bg_inode_table_lo;        /* Inodes table block */
        __le16        bg_free_blocks_count_lo;/* Free blocks count */
        __le16        bg_free_inodes_count_lo;/* Free inodes count */
        __le16        bg_used_dirs_count_lo;        /* Directories count */
        __le16        bg_flags;                /* EXT4_BG_flags (INODE_UNINIT, etc) */
        __le32  bg_exclude_bitmap_lo;   /* Exclude bitmap for snapshots */
        __le16  bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
        __le16  bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
        __le16  bg_itable_unused_lo;        /* Unused inodes count */
        __le16  bg_checksum;                /* crc16(sb_uuid+group+desc) */
        __le32        bg_block_bitmap_hi;        /* Blocks bitmap block MSB */
        __le32        bg_inode_bitmap_hi;        /* Inodes bitmap block MSB */
        __le32        bg_inode_table_hi;        /* Inodes table block MSB */
        __le16        bg_free_blocks_count_hi;/* Free blocks count MSB */
        __le16        bg_free_inodes_count_hi;/* Free inodes count MSB */
        __le16        bg_used_dirs_count_hi;        /* Directories count MSB */
        __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
        __le32  bg_exclude_bitmap_hi;   /* Exclude bitmap block MSB */
        __le16  bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
        __le16  bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
        __u32   bg_reserved;
};

/*
 * Byte offset, within struct ext4_group_desc, of the first byte after
 * bg_inode_bitmap_csum_hi.
 */
#define EXT4_BG_INODE_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
         sizeof(__le16))
/*
 * Byte offset, within struct ext4_group_desc, of the first byte after
 * bg_block_bitmap_csum_hi.
 */
#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
         sizeof(__le16))

/*
 * Structure of a flex block group info
 *
 * Three atomic counters; going by the field names they track free
 * clusters, free inodes and used directories (the code updating them is
 * outside this chunk).
 */

struct flex_groups {
        atomic64_t        free_clusters;
        atomic_t        free_inodes;
        atomic_t        used_dirs;
};

/* Bit values for ext4_group_desc.bg_flags */
#define EXT4_BG_INODE_UNINIT        0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT        0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED        0x0004 /* On-disk itable initialized to zero */

/*
 * Macro-instructions used to manage group descriptors
 *
 * Descriptor size limits: 32 bytes minimum, 64 bytes minimum for the
 * 64-bit variant, and at most EXT4_MIN_BLOCK_SIZE.
 *
 * Under __KERNEL__ the per-group values are read from EXT4_SB(s).  The
 * non-kernel branch reads s_blocks_per_group / s_inodes_per_group from
 * the structure passed in, computes EXT4_DESC_PER_BLOCK() by division,
 * and defines neither EXT4_CLUSTERS_PER_GROUP() nor
 * EXT4_DESC_PER_BLOCK_BITS().
 */
#define EXT4_MIN_DESC_SIZE                32
#define EXT4_MIN_DESC_SIZE_64BIT        64
#define        EXT4_MAX_DESC_SIZE                EXT4_MIN_BLOCK_SIZE
#define EXT4_DESC_SIZE(s)                (EXT4_SB(s)->s_desc_size)
#ifdef __KERNEL__
# define EXT4_BLOCKS_PER_GROUP(s)        (EXT4_SB(s)->s_blocks_per_group)
# define EXT4_CLUSTERS_PER_GROUP(s)        (EXT4_SB(s)->s_clusters_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_SB(s)->s_desc_per_block)
# define EXT4_INODES_PER_GROUP(s)        (EXT4_SB(s)->s_inodes_per_group)
# define EXT4_DESC_PER_BLOCK_BITS(s)        (EXT4_SB(s)->s_desc_per_block_bits)
#else
# define EXT4_BLOCKS_PER_GROUP(s)        ((s)->s_blocks_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s))
# define EXT4_INODES_PER_GROUP(s)        ((s)->s_inodes_per_group)
#endif

/*
 * Constants relative to the data blocks
 *
 * Each index is derived from the previous one, so the values evaluate to
 * 12 (first indirect slot), 13, 14 and finally 15 slots in total.
 * EXT4_N_BLOCKS sizes the i_block[] array of struct ext4_inode.
 */
#define        EXT4_NDIR_BLOCKS                12
#define        EXT4_IND_BLOCK                        EXT4_NDIR_BLOCKS
#define        EXT4_DIND_BLOCK                        (EXT4_IND_BLOCK + 1)
#define        EXT4_TIND_BLOCK                        (EXT4_DIND_BLOCK + 1)
#define        EXT4_N_BLOCKS                        (EXT4_TIND_BLOCK + 1)

/*
 * Inode flags
 *
 * Each EXT4_xxx_FL value is a single-bit mask.  Most of them have a
 * same-named EXT4_INODE_xxx bit number in the enum further down, and the
 * two must stay in sync: ext4_check_flag_values() verifies the pairs at
 * build time.  Bits 0x00800000, 0x01000000, 0x04000000 and 0x08000000
 * have no definition in this list.
 */
#define        EXT4_SECRM_FL                        0x00000001 /* Secure deletion */
#define        EXT4_UNRM_FL                        0x00000002 /* Undelete */
#define        EXT4_COMPR_FL                        0x00000004 /* Compress file */
#define EXT4_SYNC_FL                        0x00000008 /* Synchronous updates */
#define EXT4_IMMUTABLE_FL                0x00000010 /* Immutable file */
#define EXT4_APPEND_FL                        0x00000020 /* writes to file may only append */
#define EXT4_NODUMP_FL                        0x00000040 /* do not dump file */
#define EXT4_NOATIME_FL                        0x00000080 /* do not update atime */
/* Reserved for compression usage... */
#define EXT4_DIRTY_FL                        0x00000100
#define EXT4_COMPRBLK_FL                0x00000200 /* One or more compressed clusters */
#define EXT4_NOCOMPR_FL                        0x00000400 /* Don't compress */
        /* nb: was previously EXT2_ECOMPR_FL */
#define EXT4_ENCRYPT_FL                        0x00000800 /* encrypted file */
/* End compression flags --- maybe not all used */
#define EXT4_INDEX_FL                        0x00001000 /* hash-indexed directory */
#define EXT4_IMAGIC_FL                        0x00002000 /* AFS directory */
#define EXT4_JOURNAL_DATA_FL                0x00004000 /* file data should be journaled */
#define EXT4_NOTAIL_FL                        0x00008000 /* file tail should not be merged */
#define EXT4_DIRSYNC_FL                        0x00010000 /* dirsync behaviour (directories only) */
#define EXT4_TOPDIR_FL                        0x00020000 /* Top of directory hierarchies */
#define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL                        0x00080000 /* Inode uses extents */
#define EXT4_VERITY_FL                        0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL                0x00200000 /* Inode used for large EA */
/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */

#define EXT4_DAX_FL                        0x02000000 /* Inode is DAX */

#define EXT4_INLINE_DATA_FL                0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL                0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL                0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL                0x80000000 /* reserved for ext4 lib */

/*
 * User modifiable flags
 *
 * The former EXT4_EOFBLOCKS_FL bit is still part of this mask; it is
 * spelled as the literal 0x00400000 because the symbolic name no longer
 * exists.
 */
#define EXT4_FL_USER_MODIFIABLE                (EXT4_SECRM_FL | \
                                         EXT4_UNRM_FL | \
                                         EXT4_COMPR_FL | \
                                         EXT4_SYNC_FL | \
                                         EXT4_IMMUTABLE_FL | \
                                         EXT4_APPEND_FL | \
                                         EXT4_NODUMP_FL | \
                                         EXT4_NOATIME_FL | \
                                         EXT4_JOURNAL_DATA_FL | \
                                         EXT4_NOTAIL_FL | \
                                         EXT4_DIRSYNC_FL | \
                                         EXT4_TOPDIR_FL | \
                                         EXT4_EXTENTS_FL | \
                                         0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
                                         EXT4_DAX_FL | \
                                         EXT4_PROJINHERIT_FL | \
                                         EXT4_CASEFOLD_FL)

/*
 * User visible flags: everything that is user modifiable plus the
 * additional bits listed here.
 */
#define EXT4_FL_USER_VISIBLE                (EXT4_FL_USER_MODIFIABLE | \
                                         EXT4_DIRTY_FL | \
                                         EXT4_COMPRBLK_FL | \
                                         EXT4_NOCOMPR_FL | \
                                         EXT4_ENCRYPT_FL | \
                                         EXT4_INDEX_FL | \
                                         EXT4_VERITY_FL | \
                                         EXT4_INLINE_DATA_FL)

/* Flags we can manipulate through FS_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE                (EXT4_SYNC_FL | \
                                         EXT4_IMMUTABLE_FL | \
                                         EXT4_APPEND_FL | \
                                         EXT4_NODUMP_FL | \
                                         EXT4_NOATIME_FL | \
                                         EXT4_PROJINHERIT_FL | \
                                         EXT4_DAX_FL)

/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
                           EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
                           EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
                           EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
                           EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
                           EXT4_DAX_FL)

/*
 * Flags that are appropriate for regular files (all but dir-specific ones).
 * This is an inverted mask: the listed bits are the ones that get cleared
 * when the mask is ANDed with a flag word.
 */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
                           EXT4_PROJINHERIT_FL))

/*
 * Flags that are appropriate for non-directories/regular files.
 * Unlike EXT4_REG_FLMASK this is an allow-list: only these bits survive.
 */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)

/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)

/* Flags which are mutually exclusive to DAX */
#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
                           EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL)

/*
 * Strip the flags that do not apply to an inode of the given type.
 *
 * @mode:  file mode; only its file-type bits are examined
 * @flags: candidate EXT4_*_FL flag word
 *
 * Directories keep every flag, regular files keep whatever
 * EXT4_REG_FLMASK lets through, and every other file type keeps only the
 * bits in EXT4_OTHER_FLMASK.
 */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
        if (S_ISDIR(mode))
                return flags;

        if (S_ISREG(mode))
                return flags & EXT4_REG_FLMASK;

        return flags & EXT4_OTHER_FLMASK;
}

/*
 * Inode flags used for atomic set/get
 *
 * These are bit numbers, not masks: EXT4_INODE_xxx == n corresponds to
 * EXT4_xxx_FL == (1U << n).  Bit numbers 22-24, 26 and 27 are not
 * assigned in this enum.
 */
enum {
        EXT4_INODE_SECRM        = 0,        /* Secure deletion */
        EXT4_INODE_UNRM                = 1,        /* Undelete */
        EXT4_INODE_COMPR        = 2,        /* Compress file */
        EXT4_INODE_SYNC                = 3,        /* Synchronous updates */
        EXT4_INODE_IMMUTABLE        = 4,        /* Immutable file */
        EXT4_INODE_APPEND        = 5,        /* writes to file may only append */
        EXT4_INODE_NODUMP        = 6,        /* do not dump file */
        EXT4_INODE_NOATIME        = 7,        /* do not update atime */
/* Reserved for compression usage... */
        EXT4_INODE_DIRTY        = 8,
        EXT4_INODE_COMPRBLK        = 9,        /* One or more compressed clusters */
        EXT4_INODE_NOCOMPR        = 10,        /* Don't compress */
        EXT4_INODE_ENCRYPT        = 11,        /* Encrypted file */
/* End compression flags --- maybe not all used */
        EXT4_INODE_INDEX        = 12,        /* hash-indexed directory */
        EXT4_INODE_IMAGIC        = 13,        /* AFS directory */
        EXT4_INODE_JOURNAL_DATA        = 14,        /* file data should be journaled */
        EXT4_INODE_NOTAIL        = 15,        /* file tail should not be merged */
        EXT4_INODE_DIRSYNC        = 16,        /* dirsync behaviour (directories only) */
        EXT4_INODE_TOPDIR        = 17,        /* Top of directory hierarchies */
        EXT4_INODE_HUGE_FILE        = 18,        /* Set to each huge file */
        EXT4_INODE_EXTENTS        = 19,        /* Inode uses extents */
        EXT4_INODE_VERITY        = 20,        /* Verity protected inode */
        EXT4_INODE_EA_INODE        = 21,        /* Inode used for large EA */
/* 22 was formerly EXT4_INODE_EOFBLOCKS */
        EXT4_INODE_DAX                = 25,        /* Inode is DAX */
        EXT4_INODE_INLINE_DATA        = 28,        /* Data in inode. */
        EXT4_INODE_PROJINHERIT        = 29,        /* Create with parents projid */
        EXT4_INODE_CASEFOLD        = 30,        /* Casefolded directory */
        EXT4_INODE_RESERVED        = 31,        /* reserved for ext4 lib */
};

/*
 * Since it's pretty easy to mix up bit numbers and hex values, we use a
 * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
 * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
 * any extra space in the compiled kernel image, otherwise, the build will fail.
 * It's important that these values are the same, since we are using
 * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
 * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
 * values found in ext2, ext3 and ext4 filesystems, and of course the values
 * defined in e2fsprogs.
 *
 * It's not paranoia if the Murphy's Law really *is* out to get you.  :-)
 */
/*
 * TEST_FLAG_VALUE(X) is true when the mask EXT4_X_FL equals
 * 1U << EXT4_INODE_X; CHECK_FLAG_VALUE(X) turns a false result into a
 * build failure through BUILD_BUG_ON().
 */
#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))

static inline void ext4_check_flag_values(void)
{
        CHECK_FLAG_VALUE(SECRM);
        CHECK_FLAG_VALUE(UNRM);
        CHECK_FLAG_VALUE(COMPR);
        CHECK_FLAG_VALUE(SYNC);
        CHECK_FLAG_VALUE(IMMUTABLE);
        CHECK_FLAG_VALUE(APPEND);
        CHECK_FLAG_VALUE(NODUMP);
        CHECK_FLAG_VALUE(NOATIME);
        CHECK_FLAG_VALUE(DIRTY);
        CHECK_FLAG_VALUE(COMPRBLK);
        CHECK_FLAG_VALUE(NOCOMPR);
        CHECK_FLAG_VALUE(ENCRYPT);
        CHECK_FLAG_VALUE(INDEX);
        CHECK_FLAG_VALUE(IMAGIC);
        CHECK_FLAG_VALUE(JOURNAL_DATA);
        CHECK_FLAG_VALUE(NOTAIL);
        CHECK_FLAG_VALUE(DIRSYNC);
        CHECK_FLAG_VALUE(TOPDIR);
        CHECK_FLAG_VALUE(HUGE_FILE);
        CHECK_FLAG_VALUE(EXTENTS);
        CHECK_FLAG_VALUE(VERITY);
        CHECK_FLAG_VALUE(EA_INODE);
        CHECK_FLAG_VALUE(INLINE_DATA);
        CHECK_FLAG_VALUE(PROJINHERIT);
        CHECK_FLAG_VALUE(CASEFOLD);
        CHECK_FLAG_VALUE(RESERVED);
}

/*
 * Used to pass group descriptor data when online resize is done.
 * This struct is the argument type encoded into EXT4_IOC_GROUP_ADD, so
 * its layout must not change.
 */
struct ext4_new_group_input {
        __u32 group;                /* Group number for this data */
        __u64 block_bitmap;        /* Absolute block number of block bitmap */
        __u64 inode_bitmap;        /* Absolute block number of inode bitmap */
        __u64 inode_table;        /* Absolute block number of inode table start */
        __u32 blocks_count;        /* Total number of blocks in this group */
        __u16 reserved_blocks;        /* Number of reserved blocks in this group */
        __u16 unused;
};

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * Field-for-field counterpart of struct ext4_new_group_input in which the
 * 64-bit members use compat_u64; it is the argument type encoded into
 * EXT4_IOC32_GROUP_ADD.
 */
struct compat_ext4_new_group_input {
        u32 group;
        compat_u64 block_bitmap;
        compat_u64 inode_bitmap;
        compat_u64 inode_table;
        u32 blocks_count;
        u16 reserved_blocks;
        u16 unused;
};
#endif

/*
 * The struct ext4_new_group_input in kernel space: the same leading fields,
 * with mdata_blocks in place of the unused slot and free_clusters_count
 * appended.
 */
struct ext4_new_group_data {
        __u32 group;
        __u64 block_bitmap;
        __u64 inode_bitmap;
        __u64 inode_table;
        __u32 blocks_count;
        __u16 reserved_blocks;
        __u16 mdata_blocks;
        __u32 free_clusters_count;
};

/*
 * Indexes used to index group tables in ext4_new_group_data.
 * GROUP_TABLE_COUNT is the number of indexes (3), not an index itself.
 */
enum {
        BLOCK_BITMAP = 0,        /* block bitmap */
        INODE_BITMAP,                /* inode bitmap */
        INODE_TABLE,                /* inode tables */
        GROUP_TABLE_COUNT,
};

/*
 * Flags used by ext4_map_blocks()
 *
 * Single-bit values are combined into the composite flags defined next to
 * them.  Bit 0x0080 is not assigned in this list.
 */
        /* Allocate any needed blocks and/or convert an unwritten
           extent to be an initialized extent */
#define EXT4_GET_BLOCKS_CREATE                        0x0001
        /* Request the creation of an unwritten extent */
#define EXT4_GET_BLOCKS_UNWRIT_EXT                0x0002
#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT        (EXT4_GET_BLOCKS_UNWRIT_EXT|\
                                                 EXT4_GET_BLOCKS_CREATE)
        /* Caller is from the delayed allocation writeout path
         * finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE        0x0004
        /* Caller is from the direct IO path: request the creation of
           unwritten extents if not allocated, or split the unwritten
           extent if blocks have been preallocated already */
#define EXT4_GET_BLOCKS_PRE_IO                        0x0008
#define EXT4_GET_BLOCKS_CONVERT                        0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT                (EXT4_GET_BLOCKS_PRE_IO|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT                (EXT4_GET_BLOCKS_CONVERT|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Eventual metadata allocation (due to growing extent tree)
         * should not fail, so try to use reserved blocks for that. */
#define EXT4_GET_BLOCKS_METADATA_NOFAIL                0x0020
        /* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE                0x0040
        /* Convert written extents to unwritten */
#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN        0x0100
        /* Write zeros to newly created written extents */
#define EXT4_GET_BLOCKS_ZERO                        0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO                (EXT4_GET_BLOCKS_CREATE |\
                                        EXT4_GET_BLOCKS_ZERO)
        /* Caller will submit data before dropping transaction handle. This
         * allows jbd2 to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT                0x0400

/*
 * The bit position of these flags must not overlap with any of the
 * EXT4_GET_BLOCKS_*.  They are used by ext4_find_extent(),
 * read_extent_tree_block(), ext4_split_extent_at(),
 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
 * EXT4_EX_NOCACHE is used to indicate that we shouldn't be
 * caching the extents when reading from the extent tree while a
 * truncate or punch hole operation is in progress.
 */
#define EXT4_EX_NOCACHE                                0x40000000
#define EXT4_EX_FORCE_CACHE                        0x20000000
#define EXT4_EX_NOFAIL                                0x10000000

/*
 * Flags used by ext4_free_blocks
 *
 * Each value is a distinct single bit, so the flags can be ORed together.
 */
#define EXT4_FREE_BLOCKS_METADATA                0x0001
#define EXT4_FREE_BLOCKS_FORGET                        0x0002
#define EXT4_FREE_BLOCKS_VALIDATED                0x0004
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE                0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER        0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER        0x0020
#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER      0x0040

/*
 * ioctl commands
 *
 * The direction, type character, number and argument type of each entry
 * are all encoded into the command value, so none of them may be altered
 * without changing the user-visible ABI.
 */
#define        EXT4_IOC_GETVERSION                _IOR('f', 3, long)
#define        EXT4_IOC_SETVERSION                _IOW('f', 4, long)
#define        EXT4_IOC_GETVERSION_OLD                FS_IOC_GETVERSION
#define        EXT4_IOC_SETVERSION_OLD                FS_IOC_SETVERSION
#define EXT4_IOC_GETRSVSZ                _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ                _IOW('f', 6, long)
#define EXT4_IOC_GROUP_EXTEND                _IOW('f', 7, unsigned long)
#define EXT4_IOC_GROUP_ADD                _IOW('f', 8, struct ext4_new_group_input)
#define EXT4_IOC_MIGRATE                _IO('f', 9)
 /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
#define EXT4_IOC_ALLOC_DA_BLKS                _IO('f', 12)
#define EXT4_IOC_MOVE_EXT                _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS                _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT                _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS        _IO('f', 18)
/* ioctl codes 19--39 are reserved for fscrypt */
#define EXT4_IOC_CLEAR_ES_CACHE                _IO('f', 40)
/*
 * NOTE(review): declared with _IOW although the EXT4_STATE_FLAG_* values
 * are described as being returned by this ioctl; the encoding is ABI, so
 * keep it as is.
 */
#define EXT4_IOC_GETSTATE                _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE                _IOWR('f', 42, struct fiemap)

/* Uses the 'X' type character rather than 'f'; takes an EXT4_GOING_FLAGS_* value. */
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)

/*
 * Flags for going down operation
 *
 * These are plain values (0, 1, 2), not independent bits.
 */
#define EXT4_GOING_FLAGS_DEFAULT                0x0        /* going down */
#define EXT4_GOING_FLAGS_LOGFLUSH                0x1        /* flush log but not data */
#define EXT4_GOING_FLAGS_NOLOGFLUSH                0x2        /* don't flush log nor data */

/*
 * Flags returned by EXT4_IOC_GETSTATE
 *
 * We only expose to userspace a subset of the state flags in
 * i_state_flags.  Each value is a distinct single bit.
 */
#define EXT4_STATE_FLAG_EXT_PRECACHED        0x00000001
#define EXT4_STATE_FLAG_NEW                0x00000002
#define EXT4_STATE_FLAG_NEWENTRY        0x00000004
#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE        0x00000008

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * ioctl commands in 32 bit emulation
 *
 * Same type character and numbers as the native commands, but encoded
 * with int / unsigned int / the compat struct in place of long /
 * unsigned long / struct ext4_new_group_input.
 */
#define EXT4_IOC32_GETVERSION                _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION                _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ                _IOR('f', 5, int)
#define EXT4_IOC32_SETRSVSZ                _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND                _IOW('f', 7, unsigned int)
#define EXT4_IOC32_GROUP_ADD                _IOW('f', 8, struct compat_ext4_new_group_input)
#define EXT4_IOC32_GETVERSION_OLD        FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD        FS_IOC32_SETVERSION
#endif

/*
 * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
 * It indicates that the entry in extent status cache is for a hole.
 */
#define EXT4_FIEMAP_EXTENT_HOLE                0x08000000

/* Max physical block we can address w/o extents (largest 32-bit value) */
#define EXT4_MAX_BLOCK_FILE_PHYS        0xFFFFFFFF

/* Max logical block we can support (one less than the largest 32-bit value) */
#define EXT4_MAX_LOGICAL_BLOCK                0xFFFFFFFE

/*
 * Structure of an inode on the disk
 *
 * Multi-byte fields are declared little-endian (__le16/__le32) except for
 * a few members of the hurd/masix variants.  The fields from
 * i_extra_isize onward are the "extra" fields; whether a given one is
 * present in a particular inode is decided by EXT4_FITS_IN_INODE().
 */
struct ext4_inode {
        __le16        i_mode;                /* File mode */
        __le16        i_uid;                /* Low 16 bits of Owner Uid */
        __le32        i_size_lo;        /* Size in bytes */
        __le32        i_atime;        /* Access time */
        __le32        i_ctime;        /* Inode Change time */
        __le32        i_mtime;        /* Modification time */
        __le32        i_dtime;        /* Deletion Time */
        __le16        i_gid;                /* Low 16 bits of Group Id */
        __le16        i_links_count;        /* Links count */
        __le32        i_blocks_lo;        /* Blocks count */
        __le32        i_flags;        /* File flags */
        union {
                struct {
                        __le32  l_i_version;
                } linux1;
                struct {
                        __u32  h_i_translator;
                } hurd1;
                struct {
                        __u32  m_i_reserved1;
                } masix1;
        } osd1;                                /* OS dependent 1 */
        __le32        i_block[EXT4_N_BLOCKS];/* Pointers to blocks */
        __le32        i_generation;        /* File version (for NFS) */
        __le32        i_file_acl_lo;        /* File ACL */
        __le32        i_size_high;        /* NOTE(review): presumably the upper half pairing with i_size_lo */
        __le32        i_obso_faddr;        /* Obsoleted fragment address */
        union {
                struct {
                        __le16        l_i_blocks_high; /* were l_i_reserved1 */
                        __le16        l_i_file_acl_high;
                        __le16        l_i_uid_high;        /* these 2 fields */
                        __le16        l_i_gid_high;        /* were reserved2[0] */
                        __le16        l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
                        __le16        l_i_reserved;
                } linux2;
                struct {
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __u16        h_i_mode_high;
                        __u16        h_i_uid_high;
                        __u16        h_i_gid_high;
                        __u32        h_i_author;
                } hurd2;
                struct {
                        /* Note: carries the h_ prefix even though it sits in the masix variant. */
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __le16        m_i_file_acl_high;
                        __u32        m_i_reserved2[2];
                } masix2;
        } osd2;                                /* OS dependent 2 */
        __le16        i_extra_isize;        /* Added to EXT4_GOOD_OLD_INODE_SIZE by EXT4_FITS_IN_INODE() */
        __le16        i_checksum_hi;        /* crc32c(uuid+inum+inode) BE */
        __le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
        __le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
        __le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
        __le32  i_crtime;       /* File Creation time */
        __le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
        __le32  i_version_hi;        /* high 32 bits for 64-bit version */
        __le32        i_projid;        /* Project ID */
};

/* Argument structure encoded into EXT4_IOC_MOVE_EXT (an _IOWR command). */
struct move_extent {
        __u32 reserved;                /* should be zero */
        __u32 donor_fd;                /* donor file descriptor */
        __u64 orig_start;        /* logical start offset in block for orig */
        __u64 donor_start;        /* logical start offset in block for donor */
        __u64 len;                /* block length to be moved */
        __u64 moved_len;        /* moved block length */
};

/*
 * Layout of the *_extra timestamp words: the low EXT4_EPOCH_BITS bits hold
 * the epoch extension (mask 0x3) and the remaining upper bits hold the
 * nanoseconds, shifted left by EXT4_EPOCH_BITS.
 */
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
#define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)

/*
 * Extended fields will fit into an inode if the filesystem was formatted
 * with large inodes (-I 256 or larger) and there are not currently any EAs
 * consuming all of the available space. For new inodes we always reserve
 * enough space for the kernel's known extended fields, but for inodes
 * created with an old kernel this might not have been the case. None of
 * the extended inode fields is critical for correct filesystem operation.
 * This macro checks if a certain field fits in the inode. Note that
 * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
 *
 * @ext4_inode: pointer whose pointee type declares @field (used only for
 *              offsetof()/sizeof(), never dereferenced at run time)
 * @einode:     pointer to an object providing i_extra_isize
 * @field:      member name to test
 *
 * Evaluates to true when the last byte of @field lies within
 * EXT4_GOOD_OLD_INODE_SIZE + (einode)->i_extra_isize bytes.  Every pointer
 * argument is parenthesized, and the definition ends on its last line
 * instead of continuing onto the following one.
 */
#define EXT4_FITS_IN_INODE(ext4_inode, einode, field)        \
        ((offsetof(typeof(*(ext4_inode)), field) +        \
          sizeof((ext4_inode)->field))                        \
        <= (EXT4_GOOD_OLD_INODE_SIZE +                        \
            (einode)->i_extra_isize))

/*
 * We use an encoding that preserves the times for extra epoch "00":
 *
 * extra  msb of                         adjust for signed
 * epoch  32-bit                         32-bit tv_sec to
 * bits   time    decoded 64-bit tv_sec  64-bit tv_sec      valid time range
 * 0 0    1    -0x80000000..-0x00000001  0x000000000 1901-12-13..1969-12-31
 * 0 0    0    0x000000000..0x07fffffff  0x000000000 1970-01-01..2038-01-19
 * 0 1    1    0x080000000..0x0ffffffff  0x100000000 2038-01-19..2106-02-07
 * 0 1    0    0x100000000..0x17fffffff  0x100000000 2106-02-07..2174-02-25
 * 1 0    1    0x180000000..0x1ffffffff  0x200000000 2174-02-25..2242-03-16
 * 1 0    0    0x200000000..0x27fffffff  0x200000000 2242-03-16..2310-04-04
 * 1 1    1    0x280000000..0x2ffffffff  0x300000000 2310-04-04..2378-04-22
 * 1 1    0    0x300000000..0x37fffffff  0x300000000 2378-04-22..2446-05-10
 *
 * Note that previous versions of the kernel on 64-bit systems would
 * incorrectly use extra epoch bits 1,1 for dates between 1901 and
 * 1970.  e2fsck will correct this, assuming that it is run on the
 * affected filesystem before 2242.
 */

/*
 * Build the on-disk "extra" timestamp word for @time.
 *
 * The low EXT4_EPOCH_BITS bits receive the epoch extension, i.e. bits
 * 32 and up of the difference between tv_sec and its sign-extended low
 * 32 bits; the bits above them receive tv_nsec shifted left by
 * EXT4_EPOCH_BITS.  The result is returned in little-endian form.
 */
static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
{
        u32 epoch_bits;

        epoch_bits = ((time->tv_sec - (s32)time->tv_sec) >> 32) &
                     EXT4_EPOCH_MASK;

        return cpu_to_le32(epoch_bits |
                           (time->tv_nsec << EXT4_EPOCH_BITS));
}

/*
 * Apply an on-disk "extra" timestamp word to @time.
 *
 * @time:  timestamp whose tv_sec already holds the 32-bit seconds value;
 *         tv_sec is extended in place and tv_nsec is overwritten
 * @extra: little-endian extra word (nsec << EXT4_EPOCH_BITS | epoch)
 *
 * A non-zero epoch field is added to tv_sec shifted up by 32 bits; the
 * nanosecond field is always extracted into tv_nsec.
 */
static inline void ext4_decode_extra_time(struct timespec64 *time,
                                          __le32 extra)
{
        u32 word = le32_to_cpu(extra);
        u32 epoch = word & EXT4_EPOCH_MASK;

        /* Epoch bits are usually zero, so the extension is the rare path. */
        if (unlikely(epoch))
                time->tv_sec += (u64)epoch << 32;

        time->tv_nsec = (word & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}

/*
 * Store (inode)->xtime into the raw inode.
 *
 * When the matching xtime_extra field does not fit in the inode, only the
 * 32-bit seconds field is written, with tv_sec clamped to the s32 range.
 * Otherwise the seconds field gets tv_sec converted with cpu_to_le32() and
 * the extra field gets the ext4_encode_extra_time() encoding.
 */
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)                                \
do {                                                                                \
        if (!EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {        \
                (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t,                \
                                (inode)->xtime.tv_sec, S32_MIN, S32_MAX));        \
        } else {                                                                \
                (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec);        \
                (raw_inode)->xtime ## _extra =                                        \
                                ext4_encode_extra_time(&(inode)->xtime);        \
        }                                                                        \
} while (0)

/*
 * Store (einode)->xtime into the raw inode.  The seconds field and the
 * extra field are each written only if that field fits in the inode; the
 * two checks are independent of one another.
 */
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)                               \
do {                                                                               \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) {                       \
                (raw_inode)->xtime =                                               \
                        cpu_to_le32((einode)->xtime.tv_sec);                       \
        }                                                                       \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) {               \
                (raw_inode)->xtime ## _extra =                                       \
                        ext4_encode_extra_time(&(einode)->xtime);               \
        }                                                                       \
} while (0)

/*
 * Load (inode)->xtime from the raw inode.
 *
 * tv_sec is first set from the 32-bit seconds field, interpreted as a
 * signed value.  If the matching xtime_extra field fits in the inode it is
 * decoded with ext4_decode_extra_time(), which extends tv_sec and sets
 * tv_nsec; otherwise tv_nsec is set to 0.
 *
 * raw_inode is parenthesized at every use so that any pointer expression
 * may be passed.
 */
#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)                                \
do {                                                                                \
        (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime);        \
        if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {        \
                ext4_decode_extra_time(&(inode)->xtime,                                \
                                       (raw_inode)->xtime ## _extra);                \
        } else {                                                                \
                (inode)->xtime.tv_nsec = 0;                                        \
        }                                                                        \
} while (0)


/*
 * Load (einode)->xtime from the raw inode.
 *
 * tv_sec comes from the seconds field (interpreted as signed) when that
 * field fits in the inode, and is 0 otherwise.  Independently, the extra
 * field is decoded with ext4_decode_extra_time() when it fits, and tv_nsec
 * is set to 0 otherwise.
 *
 * raw_inode is parenthesized at every use so that any pointer expression
 * may be passed.
 */
#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)                               \
do {                                                                               \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))                       \
                (einode)->xtime.tv_sec =                                        \
                        (signed)le32_to_cpu((raw_inode)->xtime);               \
        else                                                                       \
                (einode)->xtime.tv_sec = 0;                                       \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))               \
                ext4_decode_extra_time(&(einode)->xtime,                       \
                                       (raw_inode)->xtime ## _extra);               \
        else                                                                       \
                (einode)->xtime.tv_nsec = 0;                                       \
} while (0)

/*
 * Short aliases for the OS-dependent members of struct ext4_inode.  Each
 * alias expands to a path through the osd1/osd2 unions; which set is
 * defined depends on the build target selected below.
 */
#define i_disk_version osd1.linux1.l_i_version

#if defined(__KERNEL__) || defined(__linux__)
/*
 * NOTE(review): the linux1 variant of osd1 declares only l_i_version, so
 * this alias names a member that struct ext4_inode does not declare and
 * any use of it would fail to compile.  Confirm whether it can be dropped.
 */
#define i_reserved1        osd1.linux1.l_i_reserved1
#define i_file_acl_high        osd2.linux2.l_i_file_acl_high
#define i_blocks_high        osd2.linux2.l_i_blocks_high
#define i_uid_low        i_uid
#define i_gid_low        i_gid
#define i_uid_high        osd2.linux2.l_i_uid_high
#define i_gid_high        osd2.linux2.l_i_gid_high
#define i_checksum_lo        osd2.linux2.l_i_checksum_lo

#elif defined(__GNU__)

#define i_translator        osd1.hurd1.h_i_translator
#define i_uid_high        osd2.hurd2.h_i_uid_high
#define i_gid_high        osd2.hurd2.h_i_gid_high
#define i_author        osd2.hurd2.h_i_author

#elif defined(__masix__)

#define i_reserved1        osd1.masix1.m_i_reserved1
#define i_file_acl_high        osd2.masix2.m_i_file_acl_high
#define i_reserved2        osd2.masix2.m_i_reserved2

#endif /* defined(__KERNEL__) || defined(__linux__) */

#include "extents_status.h"
#include "fast_commit.h"

/*
 * Lock subclasses for i_data_sem in the ext4_inode_info structure.
 *
 * These are needed to avoid lockdep false positives when we need to
 * allocate blocks to the quota inode during ext4_map_blocks(), while
 * holding i_data_sem for a normal (non-quota) inode.  Since we don't
 * do quota tracking for the quota inode, this avoids deadlock (as
 * well as infinite recursion, since it isn't turtles all the way
 * down...)
 *
 *  I_DATA_SEM_NORMAL - Used for most inodes
 *  I_DATA_SEM_OTHER  - Used by move_inode.c for the second normal inode
 *                          where the second inode has larger inode number
 *                          than the first
 *  I_DATA_SEM_QUOTA  - Used for quota inodes only
 *  I_DATA_SEM_EA     - Used for ea_inodes only
 */
/* Subclass numbers 0..3, assigned in declaration order. */
enum {
        I_DATA_SEM_NORMAL = 0,        /* most inodes */
        I_DATA_SEM_OTHER,        /* second normal inode, the one with the larger inode number */
        I_DATA_SEM_QUOTA,        /* quota inodes only */
        I_DATA_SEM_EA                /* ea_inodes only */
};


/*
 * fourth extended file system inode data in memory
 *
 * The generic VFS inode is embedded as the vfs_inode member; EXT4_I()
 * converts a struct inode pointer back to the containing ext4_inode_info
 * with container_of().
 */
struct ext4_inode_info {
        __le32        i_data[15];        /* unconverted */
        __u32        i_dtime;
        ext4_fsblk_t        i_file_acl;

        /*
         * i_block_group is the number of the block group which contains
         * this file's inode.  Constant across the lifetime of the inode,
         * it is used for making block allocation decisions - we try to
         * place a file's data blocks near its inode block, and new inodes
         * near to their parent directory's inode.
         */
        ext4_group_t        i_block_group;
        ext4_lblk_t        i_dir_start_lookup;
        /* This member only exists when BITS_PER_LONG < 64. */
#if (BITS_PER_LONG < 64)
        unsigned long        i_state_flags;                /* Dynamic state flags */
#endif
        unsigned long        i_flags;

        /*
         * Extended attributes can be read independently of the main file
         * data. Taking i_mutex even when reading would cause contention
         * between readers of EAs and writers of regular file data, so
         * instead we synchronize on xattr_sem when reading or changing
         * EAs.
         */
        struct rw_semaphore xattr_sem;

        struct list_head i_orphan;        /* unlinked but open inodes */

        /* Fast commit related info */

        struct list_head i_fc_list;        /*
                                         * inodes that need fast commit
                                         * protected by sbi->s_fc_lock.
                                         */

        /* Start of lblk range that needs to be committed in this fast commit */
        ext4_lblk_t i_fc_lblk_start;

        /*
         * End of lblk range that needs to be committed in this fast commit
         *
         * NOTE(review): the member is named *_len, so it presumably holds
         * the length of the range rather than its end - confirm against the
         * fast commit code.
         */
        ext4_lblk_t i_fc_lblk_len;

        /* Number of ongoing updates on this inode */
        atomic_t  i_fc_updates;

        /* Fast commit wait queue for this inode */
        wait_queue_head_t i_fc_wait;

        /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
        struct mutex i_fc_lock;

        /*
         * i_disksize keeps track of what the inode size is ON DISK, not
         * in memory.  During truncate, i_size is set to the new size by
         * the VFS prior to calling ext4_truncate(), but the filesystem won't
         * set i_disksize to 0 until the truncate is actually under way.
         *
         * The intent is that i_disksize always represents the blocks which
         * are used by this file.  This allows recovery to restart truncate
         * on orphans if we crash during truncate.  We actually write i_disksize
         * into the on-disk inode when writing inodes out, instead of i_size.
         *
         * The only time when i_disksize and i_size may be different is when
         * a truncate is in progress.  The only things which change i_disksize
         * are ext4_get_block (growth) and ext4_truncate (shrinkth).
         */
        loff_t        i_disksize;

        /*
         * i_data_sem is for serialising ext4_truncate() against
         * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
         * data tree are chopped off during truncate. We can't do that in
         * ext4 because whenever we perform intermediate commits during
         * truncate, the inode and all the metadata blocks *must* be in a
         * consistent state which allows truncation of the orphans to restart
         * during recovery.  Hence we must fix the get_block-vs-truncate race
         * by other means, so we have i_data_sem.
         */
        struct rw_semaphore i_data_sem;
        /*
         * i_mmap_sem is for serializing page faults with truncate / punch hole
         * operations. We have to make sure that new page cannot be faulted in
         * a section of the inode that is being punched. We cannot easily use
         * i_data_sem for this since we need protection for the whole punch
         * operation and i_data_sem ranks below transaction start so we have
         * to occasionally drop it.
         */
        struct rw_semaphore i_mmap_sem;
        /* Embedded VFS inode; EXT4_I() relies on this member's name. */
        struct inode vfs_inode;
        struct jbd2_inode *jinode;

        spinlock_t i_raw_lock;        /* protects updates to the raw inode */

        /*
         * File creation time. Its function is same as that of
         * struct timespec64 i_{a,c,m}time in the generic inode.
         */
        struct timespec64 i_crtime;

        /* mballoc */
        atomic_t i_prealloc_active;
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;

        /* extents status tree */
        struct ext4_es_tree i_es_tree;
        rwlock_t i_es_lock;
        struct list_head i_es_list;
        unsigned int i_es_all_nr;        /* protected by i_es_lock */
        unsigned int i_es_shk_nr;        /* protected by i_es_lock */
        ext4_lblk_t i_es_shrink_lblk;        /* Offset where we start searching for
                                           extents to shrink. Protected by
                                           i_es_lock  */

        /* ialloc */
        ext4_group_t        i_last_alloc_group;

        /* allocation reservation info for delalloc */
        /* In case of bigalloc, this refers to clusters rather than blocks */
        unsigned int i_reserved_data_blocks;

        /* pending cluster reservations for bigalloc file systems */
        struct ext4_pending_tree i_pending_tree;

        /* on-disk additional length */
        __u16 i_extra_isize;

        /* Indicate the inline data space. */
        u16 i_inline_off;
        u16 i_inline_size;

#ifdef CONFIG_QUOTA
        /* quota space reservation, managed internally by quota code */
        qsize_t i_reserved_quota;
#endif

        /* Lock protecting lists below */
        spinlock_t i_completed_io_lock;
        /*
         * Completed IOs that need unwritten extents handling and have
         * transaction reserved
         */
        struct list_head i_rsv_conversion_list;
        struct work_struct i_rsv_conversion_work;
        atomic_t i_unwritten; /* Nr. of inflight conversions pending */

        spinlock_t i_block_reservation_lock;

        /*
         * Transactions that contain inode's metadata needed to complete
         * fsync and fdatasync, respectively.
         */
        tid_t i_sync_tid;
        tid_t i_datasync_tid;

#ifdef CONFIG_QUOTA
        struct dquot *i_dquot[MAXQUOTAS];
#endif

        /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
        __u32 i_csum_seed;

        kprojid_t i_projid;
};

/*
 * File system states
 *
 * Each value is a distinct single bit, so states can be combined in one
 * mask.  Presumably these are the bits kept in the superblock's s_state
 * field ("File system state") - confirm against the users.
 */
#define        EXT4_VALID_FS                        0x0001        /* Unmounted cleanly */
#define        EXT4_ERROR_FS                        0x0002        /* Errors detected */
#define        EXT4_ORPHAN_FS                        0x0004        /* Orphans being recovered */
#define EXT4_FC_REPLAY                        0x0020        /* Fast commit replay ongoing */

/*
 * Misc. filesystem flags
 *
 * Single-bit values.  Presumably stored in the superblock's s_flags field
 * ("Miscellaneous flags") - confirm against the users.
 */
#define EXT2_FLAGS_SIGNED_HASH                0x0001  /* Signed dirhash in use */
#define EXT2_FLAGS_UNSIGNED_HASH        0x0002  /* Unsigned dirhash in use */
#define EXT2_FLAGS_TEST_FILESYS                0x0004        /* to test development code */

/*
 * Mount flags set via mount options or defaults
 *
 * These are the EXT4_MOUNT_<opt> values that set_opt()/clear_opt()/test_opt()
 * apply to ext4_sb_info.s_mount_opt.  Most are independent single bits; two
 * groups are multi-bit fields:
 *  - EXT4_MOUNT_ERRORS_MASK (0x00070) covers the three ERRORS_* bits.
 *  - EXT4_MOUNT_DATA_FLAGS (0x00C00) is a two-bit field whose values are
 *    JOURNAL_DATA, ORDERED_DATA and WRITEBACK_DATA.  WRITEBACK_DATA has both
 *    bits set, so a mode must be compared against the masked field rather
 *    than tested as a single bit.
 */
#define EXT4_MOUNT_NO_MBCACHE                0x00001 /* Do not use mbcache */
#define EXT4_MOUNT_GRPID                0x00004        /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG                0x00008        /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT                0x00010        /* Continue on errors */
#define EXT4_MOUNT_ERRORS_RO                0x00020        /* Remount fs ro on errors */
#define EXT4_MOUNT_ERRORS_PANIC                0x00040        /* Panic on errors */
#define EXT4_MOUNT_ERRORS_MASK                0x00070
#define EXT4_MOUNT_MINIX_DF                0x00080        /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD                0x00100        /* Don't use existing journal*/
/*
 * Without CONFIG_FS_DAX the flag is defined as 0, so test_opt(sb, DAX_ALWAYS)
 * is always false and set_opt()/clear_opt() on it change nothing.
 */
#ifdef CONFIG_FS_DAX
#define EXT4_MOUNT_DAX_ALWAYS                0x00200        /* Direct Access */
#else
#define EXT4_MOUNT_DAX_ALWAYS                0
#endif
#define EXT4_MOUNT_DATA_FLAGS                0x00C00        /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA                0x00400        /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA                0x00800        /* Flush data before commit */
#define EXT4_MOUNT_WRITEBACK_DATA        0x00C00        /* No data ordering */
#define EXT4_MOUNT_UPDATE_JOURNAL        0x01000        /* Update the journal format */
#define EXT4_MOUNT_NO_UID32                0x02000  /* Disable 32-bit UIDs */
#define EXT4_MOUNT_XATTR_USER                0x04000        /* Extended user attributes */
#define EXT4_MOUNT_POSIX_ACL                0x08000        /* POSIX Access Control Lists */
#define EXT4_MOUNT_NO_AUTO_DA_ALLOC        0x10000        /* No auto delalloc mapping */
#define EXT4_MOUNT_BARRIER                0x20000 /* Use block barriers */
#define EXT4_MOUNT_QUOTA                0x40000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA                0x80000 /* "old" user quota,
                                                 * enable enforcement for hidden
                                                 * quota files */
#define EXT4_MOUNT_GRPQUOTA                0x100000 /* "old" group quota, enable
                                                  * enforcement for hidden quota
                                                  * files */
#define EXT4_MOUNT_PRJQUOTA                0x200000 /* Enable project quota
                                                  * enforcement */
#define EXT4_MOUNT_DIOREAD_NOLOCK        0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM        0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR        0x2000000 /* Trigger WARN_ON on error */
/* NOTE(review): undocumented; by its name, prefetch of block bitmaps - confirm */
#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC                0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT        0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY        0x20000000 /* Block validity checking */
#define EXT4_MOUNT_DISCARD                0x40000000 /* Issue DISCARD requests */
#define EXT4_MOUNT_INIT_INODE_TABLE        0x80000000 /* Initialize uninitialized itables */

/*
 * Mount flags set either automatically (could not be set by mount option)
 * based on per file system feature or property or in special cases such as
 * distinguishing between explicit mount option definition and default.
 *
 * These are the EXT4_MOUNT2_<opt> values that set_opt2()/clear_opt2()/
 * test_opt2() apply to ext4_sb_info.s_mount_opt2.  Each is a single bit.
 */
#define EXT4_MOUNT2_EXPLICIT_DELALLOC        0x00000001 /* User explicitly
                                                      specified delalloc */
#define EXT4_MOUNT2_STD_GROUP_SIZE        0x00000002 /* We have standard group
                                                      size of blocksize * 8
                                                      blocks */
#define EXT4_MOUNT2_HURD_COMPAT                0x00000004 /* Support HURD-castrated
                                                      file systems */
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM        0x00000008 /* User explicitly
                                                specified journal checksum */

#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT        0x00000010 /* Journal fast commit */
#define EXT4_MOUNT2_DAX_NEVER                0x00000020 /* Do not allow Direct Access */
#define EXT4_MOUNT2_DAX_INODE                0x00000040 /* For printing options only */


/*
 * Accessors for the mount-option bitmasks kept in struct ext4_sb_info.
 *
 * @sb:  super_block pointer, handed to EXT4_SB()
 * @opt: flag name WITHOUT its prefix; it is token-pasted onto EXT4_MOUNT_
 *       (for s_mount_opt) or EXT4_MOUNT2_ (for s_mount_opt2).
 *
 * set_opt*() and clear_opt*() update the mask in place and evaluate to the
 * new mask value; test_opt*() evaluates to the masked bits, i.e. non-zero
 * when the option is set.
 *
 * Every expansion is fully parenthesised.  Without the outer parentheses a
 * use such as "set_opt(sb, X) & m" would parse as "mask |= (X & m)" and
 * silently set the wrong bits.
 */
#define clear_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt &= \
                                                ~EXT4_MOUNT_##opt)
#define set_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt |= \
                                                EXT4_MOUNT_##opt)
#define test_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt & \
                                         EXT4_MOUNT_##opt)

#define clear_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 &= \
                                                ~EXT4_MOUNT2_##opt)
#define set_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 |= \
                                                EXT4_MOUNT2_##opt)
#define test_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 & \
                                         EXT4_MOUNT2_##opt)

/*
 * ext4 bitmap primitives.  These are plain aliases: most map to the
 * little-endian (*_le) bit operations, and the *_atomic variants map to the
 * ext2_*_bit_atomic names.
 */
#define ext4_test_and_set_bit                __test_and_set_bit_le
#define ext4_set_bit                        __set_bit_le
#define ext4_set_bit_atomic                ext2_set_bit_atomic
#define ext4_test_and_clear_bit                __test_and_clear_bit_le
#define ext4_clear_bit                        __clear_bit_le
#define ext4_clear_bit_atomic                ext2_clear_bit_atomic
#define ext4_test_bit                        test_bit_le
#define ext4_find_next_zero_bit                find_next_zero_bit_le
#define ext4_find_next_bit                find_next_bit_le

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably sets `len` consecutive bits starting at bit `cur`
 * in the bitmap `bm` - confirm against the definition.
 */
extern void ext4_set_bits(void *bm, int cur, int len);

/*
 * Maximal mount counts between two filesystem checks
 *
 * NOTE(review): "DFL" presumably means "default"; the superblock has matching
 * s_max_mnt_count and s_checkinterval fields - confirm these are their
 * defaults.
 */
#define EXT4_DFL_MAX_MNT_COUNT                20        /* Allow 20 mounts */
#define EXT4_DFL_CHECKINTERVAL                0        /* Don't use interval check */

/*
 * Behaviour when detecting errors
 *
 * These are enumerated values (1, 2, 3), not bit flags.  Presumably they are
 * the values stored in the superblock's s_errors field - confirm against the
 * users.  EXT4_ERRORS_DEFAULT is the same value as EXT4_ERRORS_CONTINUE.
 */
#define EXT4_ERRORS_CONTINUE                1        /* Continue execution */
#define EXT4_ERRORS_RO                        2        /* Remount fs read-only */
#define EXT4_ERRORS_PANIC                3        /* Panic */
#define EXT4_ERRORS_DEFAULT                EXT4_ERRORS_CONTINUE

/*
 * Metadata checksum algorithm codes
 *
 * Only crc32c is defined.  Presumably this is the value held in the
 * superblock's s_checksum_type field - confirm against the users.
 */
#define EXT4_CRC32C_CHKSUM                1

/*
 * Structure of the super block
 *
 * Multi-byte integer fields are declared with explicit little-endian types
 * (__le16/__le32/__le64).  The hexadecimal markers at the start of some lines
 * are the byte offset of that field from the start of the structure.  Do not
 * reorder, resize or insert fields: that would shift every later offset.
 *
 * EXT4_S_ERR_START and EXT4_S_ERR_END, defined inside the structure body, are
 * the offsets of s_error_count and s_mount_opts; together they delimit the
 * error-information fields.
 */
struct ext4_super_block {
/*00*/        __le32        s_inodes_count;                /* Inodes count */
        __le32        s_blocks_count_lo;        /* Blocks count */
        __le32        s_r_blocks_count_lo;        /* Reserved blocks count */
        __le32        s_free_blocks_count_lo;        /* Free blocks count */
/*10*/        __le32        s_free_inodes_count;        /* Free inodes count */
        __le32        s_first_data_block;        /* First Data Block */
        __le32        s_log_block_size;        /* Block size */
        __le32        s_log_cluster_size;        /* Allocation cluster size */
/*20*/        __le32        s_blocks_per_group;        /* # Blocks per group */
        __le32        s_clusters_per_group;        /* # Clusters per group */
        __le32        s_inodes_per_group;        /* # Inodes per group */
        __le32        s_mtime;                /* Mount time */
/*30*/        __le32        s_wtime;                /* Write time */
        __le16        s_mnt_count;                /* Mount count */
        __le16        s_max_mnt_count;        /* Maximal mount count */
        __le16        s_magic;                /* Magic signature */
        __le16        s_state;                /* File system state */
        __le16        s_errors;                /* Behaviour when detecting errors */
        __le16        s_minor_rev_level;        /* minor revision level */
/*40*/        __le32        s_lastcheck;                /* time of last check */
        __le32        s_checkinterval;        /* max. time between checks */
        __le32        s_creator_os;                /* OS */
        __le32        s_rev_level;                /* Revision level */
/*50*/        __le16        s_def_resuid;                /* Default uid for reserved blocks */
        __le16        s_def_resgid;                /* Default gid for reserved blocks */
        /*
         * These fields are for EXT4_DYNAMIC_REV superblocks only.
         *
         * Note: the difference between the compatible feature set and
         * the incompatible feature set is that if there is a bit set
         * in the incompatible feature set that the kernel doesn't
         * know about, it should refuse to mount the filesystem.
         *
         * e2fsck's requirements are more strict; if it doesn't know
         * about a feature in either the compatible or incompatible
         * feature set, it must abort and not try to meddle with
         * things it doesn't understand...
         */
        __le32        s_first_ino;                /* First non-reserved inode */
        __le16  s_inode_size;                /* size of inode structure */
        __le16        s_block_group_nr;        /* block group # of this superblock */
        __le32        s_feature_compat;        /* compatible feature set */
/*60*/        __le32        s_feature_incompat;        /* incompatible feature set */
        __le32        s_feature_ro_compat;        /* readonly-compatible feature set */
/*68*/        __u8        s_uuid[16];                /* 128-bit uuid for volume */
/*78*/        char        s_volume_name[16];        /* volume name */
/*88*/        char        s_last_mounted[64] __nonstring;        /* directory where last mounted */
/*C8*/        __le32        s_algorithm_usage_bitmap; /* For compression */
        /*
         * Performance hints.  Directory preallocation should only
         * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
         */
        __u8        s_prealloc_blocks;        /* Nr of blocks to try to preallocate*/
        __u8        s_prealloc_dir_blocks;        /* Nr to preallocate for dirs */
        __le16        s_reserved_gdt_blocks;        /* Per group desc for online growth */
        /*
         * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
         */
/*D0*/        __u8        s_journal_uuid[16];        /* uuid of journal superblock */
/*E0*/        __le32        s_journal_inum;                /* inode number of journal file */
        __le32        s_journal_dev;                /* device number of journal file */
        __le32        s_last_orphan;                /* start of list of inodes to delete */
        __le32        s_hash_seed[4];                /* HTREE hash seed */
        __u8        s_def_hash_version;        /* Default hash version to use */
        __u8        s_jnl_backup_type;
        __le16  s_desc_size;                /* size of group descriptor */
/*100*/        __le32        s_default_mount_opts;
        __le32        s_first_meta_bg;        /* First metablock block group */
        __le32        s_mkfs_time;                /* When the filesystem was created */
        __le32        s_jnl_blocks[17];        /* Backup of the journal inode */
        /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
/*150*/        __le32        s_blocks_count_hi;        /* Blocks count */
        __le32        s_r_blocks_count_hi;        /* Reserved blocks count */
        __le32        s_free_blocks_count_hi;        /* Free blocks count */
        __le16        s_min_extra_isize;        /* All inodes have at least # bytes */
        __le16        s_want_extra_isize;         /* New inodes should reserve # bytes */
        __le32        s_flags;                /* Miscellaneous flags */
        __le16  s_raid_stride;                /* RAID stride */
        __le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
        __u8        s_log_groups_per_flex;  /* FLEX_BG group size */
        __u8        s_checksum_type;        /* metadata checksum algorithm used */
        __u8        s_encryption_level;        /* versioning level for encryption */
        __u8        s_reserved_pad;                /* Padding to next 32bits */
        __le64        s_kbytes_written;        /* nr of lifetime kilobytes written */
        __le32        s_snapshot_inum;        /* Inode number of active snapshot */
        __le32        s_snapshot_id;                /* sequential ID of active snapshot */
        __le64        s_snapshot_r_blocks_count; /* reserved blocks for active
                                              snapshot's future use */
        __le32        s_snapshot_list;        /* inode number of the head of the
                                           on-disk snapshot list */
        /* Offset of the first error-information field. */
#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
        __le32        s_error_count;                /* number of fs errors */
        __le32        s_first_error_time;        /* first time an error happened */
        __le32        s_first_error_ino;        /* inode involved in first error */
        __le64        s_first_error_block;        /* block involved in first error */
        __u8        s_first_error_func[32] __nonstring;        /* function where the error happened */
        __le32        s_first_error_line;        /* line number where error happened */
        __le32        s_last_error_time;        /* most recent time of an error */
        __le32        s_last_error_ino;        /* inode involved in last error */
        __le32        s_last_error_line;        /* line number where error happened */
        __le64        s_last_error_block;        /* block involved in last error */
        __u8        s_last_error_func[32] __nonstring;        /* function where the error happened */
        /* Offset just past the last error-information field. */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
        __u8        s_mount_opts[64];
        __le32        s_usr_quota_inum;        /* inode for tracking user quota */
        __le32        s_grp_quota_inum;        /* inode for tracking group quota */
        __le32        s_overhead_clusters;        /* overhead blocks/clusters in fs */
        __le32        s_backup_bgs[2];        /* groups with sparse_super2 SBs */
        __u8        s_encrypt_algos[4];        /* Encryption algorithms in use  */
        __u8        s_encrypt_pw_salt[16];        /* Salt used for string2key algorithm */
        __le32        s_lpf_ino;                /* Location of the lost+found inode */
        __le32        s_prj_quota_inum;        /* inode for tracking project quota */
        __le32        s_checksum_seed;        /* crc32c(uuid) if csum_seed set */
        /*
         * NOTE(review): the *_hi bytes below presumably extend the 32-bit
         * time fields of the same name - confirm against the users.
         */
        __u8        s_wtime_hi;
        __u8        s_mtime_hi;
        __u8        s_mkfs_time_hi;
        __u8        s_lastcheck_hi;
        __u8        s_first_error_time_hi;
        __u8        s_last_error_time_hi;
        __u8        s_first_error_errcode;
        __u8    s_last_error_errcode;
        __le16  s_encoding;                /* Filename charset encoding */
        __le16  s_encoding_flags;        /* Filename charset encoding flags */
        __le32        s_reserved[95];                /* Padding to the end of the block */
        __le32        s_checksum;                /* crc32c(superblock) */
};

/*
 * Size in bytes of the superblock's error-information region: from
 * s_error_count (EXT4_S_ERR_START) up to, but not including, s_mount_opts
 * (EXT4_S_ERR_END).
 */
#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)

#ifdef __KERNEL__

/*
 * Number of quota types we support; sizes the s_qf_names[] array in
 * struct ext4_sb_info.
 */
#define EXT4_MAXQUOTAS 3

/*
 * NOTE(review): presumably the value of the superblock's s_encoding field
 * ("Filename charset encoding") that selects UTF-8 12.1 - confirm.
 */
#define EXT4_ENC_UTF8_12_1        1

/*
 * fourth extended-fs super-block data in memory
 *
 * One instance hangs off super_block.s_fs_info and is reached with EXT4_SB().
 * Unlike struct ext4_super_block this is not an on-disk layout; s_es points
 * at the on-disk superblock inside the s_sbh buffer.
 */
struct ext4_sb_info {
        unsigned long s_desc_size;        /* Size of a group descriptor in bytes */
        unsigned long s_inodes_per_block;/* Number of inodes per block */
        unsigned long s_blocks_per_group;/* Number of blocks in a group */
        unsigned long s_clusters_per_group; /* Number of clusters in a group */
        unsigned long s_inodes_per_group;/* Number of inodes in a group */
        unsigned long s_itb_per_group;        /* Number of inode table blocks per group */
        unsigned long s_gdb_count;        /* Number of group descriptor blocks */
        unsigned long s_desc_per_block;        /* Number of group descriptors per block */
        ext4_group_t s_groups_count;        /* Number of groups in the fs */
        ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
        unsigned long s_overhead;  /* # of fs overhead clusters */
        unsigned int s_cluster_ratio;        /* Number of blocks per cluster */
        unsigned int s_cluster_bits;        /* log2 of s_cluster_ratio */
        loff_t s_bitmap_maxbytes;        /* max bytes for bitmap files */
        struct buffer_head * s_sbh;        /* Buffer containing the super block */
        struct ext4_super_block *s_es;        /* Pointer to the super block in the buffer */
        /* RCU-protected array; read elements with sbi_array_rcu_deref(). */
        struct buffer_head * __rcu *s_group_desc;
        /* EXT4_MOUNT_* bits; use set_opt()/clear_opt()/test_opt(). */
        unsigned int s_mount_opt;
        /* EXT4_MOUNT2_* bits; use set_opt2()/clear_opt2()/test_opt2(). */
        unsigned int s_mount_opt2;
        /*
         * Run-time mount flags, indexed by bit number (EXT4_MF_*); use
         * ext4_set_mount_flag()/ext4_clear_mount_flag()/ext4_test_mount_flag().
         */
        unsigned long s_mount_flags;
        unsigned int s_def_mount_opt;
        ext4_fsblk_t s_sb_block;
        atomic64_t s_resv_clusters;
        kuid_t s_resuid;
        kgid_t s_resgid;
        unsigned short s_mount_state;
        unsigned short s_pad;
        int s_addr_per_block_bits;
        int s_desc_per_block_bits;
        int s_inode_size;
        int s_first_ino;
        unsigned int s_inode_readahead_blks;
        unsigned int s_inode_goal;
        u32 s_hash_seed[4];
        int s_def_hash_version;
        /*
         * NOTE(review): the comment below says "signed" while the member is
         * named s_hash_unsigned - one of the two is presumably wrong; confirm
         * against the code that assigns this field.
         */
        int s_hash_unsigned;        /* 3 if hash should be signed, 0 if not */
        struct percpu_counter s_freeclusters_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
        struct percpu_counter s_dirtyclusters_counter;
        struct percpu_counter s_sra_exceeded_retry_limit;
        struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
        struct kobject s_kobj;
        struct completion s_kobj_unregister;
        struct super_block *s_sb;
        struct buffer_head *s_mmp_bh;

        /* Journaling */
        struct journal_s *s_journal;
        struct list_head s_orphan;
        struct mutex s_orphan_lock;
        unsigned long s_ext4_flags;                /* Ext4 superblock flags */
        unsigned long s_commit_interval;
        u32 s_max_batch_time;
        u32 s_min_batch_time;
        struct block_device *s_journal_bdev;
#ifdef CONFIG_QUOTA
        /* Names of quota files with journalled quota */
        char __rcu *s_qf_names[EXT4_MAXQUOTAS];
        int s_jquota_fmt;                        /* Format of quota to use */
#endif
        unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
        struct ext4_system_blocks __rcu *s_system_blks;

#ifdef EXTENTS_STATS
        /* ext4 extents stats */
        unsigned long s_ext_min;
        unsigned long s_ext_max;
        unsigned long s_depth_max;
        spinlock_t s_ext_stats_lock;
        unsigned long s_ext_blocks;
        unsigned long s_ext_extents;
#endif

        /* for buddy allocator */
        /* RCU-protected array; read elements with sbi_array_rcu_deref(). */
        struct ext4_group_info ** __rcu *s_group_info;
        struct inode *s_buddy_cache;
        spinlock_t s_md_lock;
        unsigned short *s_mb_offsets;
        unsigned int *s_mb_maxs;
        unsigned int s_group_info_size;
        unsigned int s_mb_free_pending;
        struct list_head s_freed_data_list;        /* List of blocks to be freed
                                                   after commit completed */

        /* tunables */
        unsigned long s_stripe;
        unsigned int s_mb_stream_request;
        unsigned int s_mb_max_to_scan;
        unsigned int s_mb_min_to_scan;
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
        unsigned int s_mb_group_prealloc;
        unsigned int s_mb_max_inode_prealloc;
        unsigned int s_max_dir_size_kb;
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
        unsigned long s_mb_last_start;
        unsigned int s_mb_prefetch;
        unsigned int s_mb_prefetch_limit;

        /* stats for buddy allocator */
        atomic_t s_bal_reqs;        /* number of reqs with len > 1 */
        atomic_t s_bal_success;        /* we found long enough chunks */
        atomic_t s_bal_allocated;        /* in blocks */
        atomic_t s_bal_ex_scanned;        /* total extents scanned */
        atomic_t s_bal_groups_scanned;        /* number of groups scanned */
        atomic_t s_bal_goals;        /* goal hits */
        atomic_t s_bal_breaks;        /* too long searches */
        atomic_t s_bal_2orders;        /* 2^order hits */
        atomic64_t s_bal_cX_groups_considered[4];
        atomic64_t s_bal_cX_hits[4];
        atomic64_t s_bal_cX_failed[4];                /* cX loop didn't find blocks */
        atomic_t s_mb_buddies_generated;        /* number of buddies generated */
        atomic64_t s_mb_generation_time;
        atomic_t s_mb_lost_chunks;
        atomic_t s_mb_preallocated;
        atomic_t s_mb_discarded;
        atomic_t s_lock_busy;

        /* locality groups */
        struct ext4_locality_group __percpu *s_locality_groups;

        /* for write statistics */
        unsigned long s_sectors_written_start;
        u64 s_kbytes_written;

        /* the size of zero-out chunk */
        unsigned int s_extent_max_zeroout_kb;

        unsigned int s_log_groups_per_flex;
        /* RCU-protected array; read elements with sbi_array_rcu_deref(). */
        struct flex_groups * __rcu *s_flex_groups;
        ext4_group_t s_flex_groups_allocated;

        /* workqueue for reserved extent conversions (buffered io) */
        struct workqueue_struct *rsv_conversion_wq;

        /* timer for periodic error stats printing */
        struct timer_list s_err_report;

        /* Lazy inode table initialization info */
        struct ext4_li_request *s_li_request;
        /* Wait multiplier for lazy initialization thread */
        unsigned int s_li_wait_mult;

        /* Kernel thread for multiple mount protection */
        struct task_struct *s_mmp_tsk;

        /* record the last minlen when FITRIM is called. */
        unsigned long s_last_trim_minblks;

        /* Reference to checksum algorithm driver via cryptoapi */
        struct crypto_shash *s_chksum_driver;

        /* Precomputed FS UUID checksum for seeding other checksums */
        __u32 s_csum_seed;

        /* Reclaim extents from extent status tree */
        struct shrinker s_es_shrinker;
        struct list_head s_es_list;        /* List of inodes with reclaimable extents */
        long s_es_nr_inode;
        struct ext4_es_stats s_es_stats;
        struct mb_cache *s_ea_block_cache;
        struct mb_cache *s_ea_inode_cache;
        spinlock_t s_es_lock ____cacheline_aligned_in_smp;

        /* Ratelimit ext4 messages. */
        struct ratelimit_state s_err_ratelimit_state;
        struct ratelimit_state s_warning_ratelimit_state;
        struct ratelimit_state s_msg_ratelimit_state;
        atomic_t s_warning_count;
        atomic_t s_msg_count;

        /* Encryption policy for '-o test_dummy_encryption' */
        struct fscrypt_dummy_policy s_dummy_enc_policy;

        /*
         * Barrier between writepages ops and changing any inode's JOURNAL_DATA
         * or EXTENTS flag.
         */
        struct percpu_rw_semaphore s_writepages_rwsem;
        struct dax_device *s_daxdev;
#ifdef CONFIG_EXT4_DEBUG
        /*
         * Pending EXT4_SIM_* failure code; ext4_simulate_fail() resets it to
         * 0 when it matches the code being checked.
         */
        unsigned long s_simulate_fail;
#endif
        /* Record the errseq of the backing block device */
        errseq_t s_bdev_wb_err;
        spinlock_t s_bdev_wb_lock;

        /* Ext4 fast commit stuff */
        atomic_t s_fc_subtid;
        atomic_t s_fc_ineligible_updates;
        /*
         * After commit starts, the main queue gets locked, and the further
         * updates get added in the staging queue.
         *
         * NOTE(review): FC_Q_MAIN / FC_Q_STAGING are presumably the indices
         * into the two-element s_fc_q[] and s_fc_dentry_q[] arrays - confirm.
         */
#define FC_Q_MAIN        0
#define FC_Q_STAGING        1
        struct list_head s_fc_q[2];        /* Inodes staged for fast commit
                                         * that have data changes in them.
                                         */
        struct list_head s_fc_dentry_q[2];        /* directory entry updates */
        unsigned int s_fc_bytes;
        /*
         * Main fast commit lock. This lock protects accesses to the
         * following fields:
         * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
         */
        spinlock_t s_fc_lock;
        struct buffer_head *s_fc_bh;
        struct ext4_fc_stats s_fc_stats;
        u64 s_fc_avg_commit_time;
#ifdef CONFIG_EXT4_DEBUG
        int s_fc_debug_max_replay;
#endif
        struct ext4_fc_replay_state s_fc_replay_state;
};

/*
 * Return the ext4 private superblock info of @sb, i.e. the pointer stored
 * in sb->s_fs_info.
 */
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
{
        struct ext4_sb_info *sbi = sb->s_fs_info;

        return sbi;
}
/*
 * Return the ext4_inode_info that embeds @inode as its vfs_inode member.
 */
static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
{
        struct ext4_inode_info *ei;

        ei = container_of(inode, struct ext4_inode_info, vfs_inode);
        return ei;
}

/*
 * Check whether @ino is an acceptable inode number for @sb.
 *
 * Returns 1 if @ino is the root inode, or lies in the inclusive range
 * [EXT4_FIRST_INO(sb), s_inodes_count] taken from the superblock; returns 0
 * otherwise.
 */
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
        /* The root inode is always accepted, whatever the range below says. */
        if (ino == EXT4_ROOT_INO)
                return 1;

        if (ino < EXT4_FIRST_INO(sb))
                return 0;

        return ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
}

/*
 * Returns: sbi->field[index]
 * Used to access an array element from the following sbi fields which require
 * rcu protection to avoid dereferencing an invalid pointer due to reassignment
 * - s_group_desc
 * - s_group_info
 * - s_flex_groups
 *
 * This is a statement expression: it declares a temporary _v of the array's
 * element type, loads the array pointer with rcu_dereference() inside an
 * rcu_read_lock()/rcu_read_unlock() pair, copies element [index] into _v and
 * evaluates to _v.  Only the load of the array pointer and the element read
 * happen inside the RCU read-side section; the returned element value is used
 * by the caller after rcu_read_unlock().
 */
#define sbi_array_rcu_deref(sbi, field, index)                                   \
({                                                                           \
        typeof(*((sbi)->field)) _v;                                           \
        rcu_read_lock();                                                   \
        _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index];           \
        rcu_read_unlock();                                                   \
        _v;                                                                   \
})

/*
 * run-time mount flags
 *
 * These are bit numbers in sbi->s_mount_flags; see
 * ext4_{set,clear,test}_mount_flag().
 */
enum {
        EXT4_MF_MNTDIR_SAMPLED,
        EXT4_MF_FS_ABORTED,        /* Fatal error detected */
        EXT4_MF_FC_INELIGIBLE,        /* Fast commit ineligible */
        EXT4_MF_FC_COMMITTING        /* File system undergoing a fast
                                 * commit.
                                 */
};

static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
{
        set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
{
        clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
{
        return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}


/*
 * Simulate_fail codes
 *
 * Values for the @code argument of ext4_simulate_fail(), which compares
 * them against sbi->s_simulate_fail when CONFIG_EXT4_DEBUG is enabled.
 */
#define EXT4_SIM_BBITMAP_EIO        1
#define EXT4_SIM_BBITMAP_CRC        2
#define EXT4_SIM_IBITMAP_EIO        3
#define EXT4_SIM_IBITMAP_CRC        4
#define EXT4_SIM_INODE_EIO        5
#define EXT4_SIM_INODE_CRC        6
#define EXT4_SIM_DIRBLOCK_EIO        7
#define EXT4_SIM_DIRBLOCK_CRC        8

/*
 * Report whether the failure identified by @code should be simulated now.
 *
 * With CONFIG_EXT4_DEBUG, returns true when sbi->s_simulate_fail equals
 * @code and resets s_simulate_fail to 0 so the match is consumed.
 * Without CONFIG_EXT4_DEBUG this always returns false.
 */
static inline bool ext4_simulate_fail(struct super_block *sb,
                                     unsigned long code)
{
#ifdef CONFIG_EXT4_DEBUG
        struct ext4_sb_info *info = EXT4_SB(sb);

        if (!unlikely(info->s_simulate_fail == code))
                return false;

        /* One-shot: disarm before reporting the hit. */
        info->s_simulate_fail = 0;
        return true;
#else
        return false;
#endif
}

/*
 * If @bh is not an error pointer and ext4_simulate_fail() reports a hit for
 * @code, clear the buffer's uptodate state.
 */
static inline void ext4_simulate_fail_bh(struct super_block *sb,
                                         struct buffer_head *bh,
                                         unsigned long code)
{
        if (IS_ERR(bh))
                return;
        if (!ext4_simulate_fail(sb, code))
                return;

        clear_buffer_uptodate(bh);
}

/*
 * Error number codes for s_{first,last}_error_errno
 *
 * Linux errno numbers are architecture specific, so we need to translate
 * them into something which is architecture independent.   We don't define
 * codes for all errno's; just the ones which are most likely to be the cause
 * of an ext4_error() call.
 */
#define EXT4_ERR_UNKNOWN         1
#define EXT4_ERR_EIO                 2
#define EXT4_ERR_ENOMEM                 3
#define EXT4_ERR_EFSBADCRC         4
#define EXT4_ERR_EFSCORRUPTED         5
#define EXT4_ERR_ENOSPC                 6
#define EXT4_ERR_ENOKEY                 7
#define EXT4_ERR_EROFS                 8
#define EXT4_ERR_EFBIG                 9
#define EXT4_ERR_EEXIST                10
#define EXT4_ERR_ERANGE                11
#define EXT4_ERR_EOVERFLOW        12
#define EXT4_ERR_EBUSY                13
#define EXT4_ERR_ENOTDIR        14
#define EXT4_ERR_ENOTEMPTY        15
#define EXT4_ERR_ESHUTDOWN        16
#define EXT4_ERR_EFAULT                17

/*
 * Inode dynamic state flags
 *
 * These are bit numbers for ext4_{test,set,clear}_inode_state().  When
 * BITS_PER_LONG < 64 they index i_state_flags directly; otherwise they are
 * offset by 32 and index i_flags.
 */
enum {
        EXT4_STATE_JDATA,                /* journaled data exists */
        EXT4_STATE_NEW,                        /* inode is newly created */
        EXT4_STATE_XATTR,                /* has in-inode xattrs */
        EXT4_STATE_NO_EXPAND,                /* No space for expansion */
        EXT4_STATE_DA_ALLOC_CLOSE,        /* Alloc DA blks on close */
        EXT4_STATE_EXT_MIGRATE,                /* Inode is migrating */
        EXT4_STATE_NEWENTRY,                /* File just added to dir */
        EXT4_STATE_MAY_INLINE_DATA,        /* may have in-inode data */
        EXT4_STATE_EXT_PRECACHED,        /* extents have been precached */
        EXT4_STATE_LUSTRE_EA_INODE,        /* Lustre-style ea_inode */
        EXT4_STATE_VERITY_IN_PROGRESS,        /* building fs-verity Merkle tree */
        EXT4_STATE_FC_COMMITTING,        /* Fast commit ongoing */
};

/*
 * Generate ext4_test_inode_<name>(), ext4_set_inode_<name>() and
 * ext4_clear_inode_<name>().  Each helper applies test_bit(), set_bit() or
 * clear_bit() to bit number (bit + offset) of EXT4_I(inode)->i_<field>.
 */
#define EXT4_INODE_BIT_FNS(name, field, offset)                                \
static inline int ext4_test_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);        \
}                                                                        \
static inline void ext4_set_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        set_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}                                                                        \
static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
{                                                                        \
        clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}

/*
 * Add these declarations here only so that these functions can be
 * found by name.  Otherwise, they are very hard to locate.  The bodies
 * come from the EXT4_INODE_BIT_FNS() expansion below, which operates on
 * EXT4_I(inode)->i_flags with a bit offset of 0.
 */
static inline int ext4_test_inode_flag(struct inode *inode, int bit);
static inline void ext4_set_inode_flag(struct inode *inode, int bit);
static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
EXT4_INODE_BIT_FNS(flag, flags, 0)

/*
 * Add these declarations here only so that these functions can be
 * found by name.  Otherwise, they are very hard to locate.  The bodies
 * come from whichever EXT4_INODE_BIT_FNS() expansion below is selected
 * by BITS_PER_LONG.
 */
static inline int ext4_test_inode_state(struct inode *inode, int bit);
static inline void ext4_set_inode_state(struct inode *inode, int bit);
static inline void ext4_clear_inode_state(struct inode *inode, int bit);
#if (BITS_PER_LONG < 64)
/* Longs narrower than 64 bits: state bits use their own word, i_state_flags. */
EXT4_INODE_BIT_FNS(state, state_flags, 0)

/* Reset every dynamic state bit of @ei by zeroing i_state_flags. */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        (ei)->i_state_flags = 0;
}
#else
/* Otherwise the state bits share i_flags, starting at bit 32. */
EXT4_INODE_BIT_FNS(state, flags, 32)

/* Deliberately a no-op in this configuration; see the comment inside. */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        /* We depend on the fact that callers will set i_flags */
}
#endif
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
 * macros from user land. */
#define EXT4_SB(sb)        (sb)
#endif

static inline bool ext4_verity_in_progress(struct inode *inode)
{
        return IS_ENABLED(CONFIG_FS_VERITY) &&
               ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
}

/*
 * Expands to the i_dtime member of the ext4_inode_info for @inode.
 * NOTE(review): the name suggests i_dtime doubles as an orphan-list link;
 * confirm against the callers.
 */
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime

/*
 * Codes for operating systems
 */
#define EXT4_OS_LINUX                0
#define EXT4_OS_HURD                1
#define EXT4_OS_MASIX                2
#define EXT4_OS_FREEBSD                3
#define EXT4_OS_LITES                4

/*
 * Revision levels
 */
#define EXT4_GOOD_OLD_REV        0        /* The good old (original) format */
#define EXT4_DYNAMIC_REV        1        /* V2 format w/ dynamic inode sizes */

#define EXT4_CURRENT_REV        EXT4_GOOD_OLD_REV
#define EXT4_MAX_SUPP_REV        EXT4_DYNAMIC_REV

#define EXT4_GOOD_OLD_INODE_SIZE 128

#define EXT4_EXTRA_TIMESTAMP_MAX        (((s64)1 << 34) - 1  + S32_MIN)
#define EXT4_NON_EXTRA_TIMESTAMP_MAX        S32_MAX
#define EXT4_TIMESTAMP_MIN                S32_MIN

/*
 * Feature set definitions
 */

#define EXT4_FEATURE_COMPAT_DIR_PREALLOC        0x0001
#define EXT4_FEATURE_COMPAT_IMAGIC_INODES        0x0002
#define EXT4_FEATURE_COMPAT_HAS_JOURNAL                0x0004
#define EXT4_FEATURE_COMPAT_EXT_ATTR                0x0008
#define EXT4_FEATURE_COMPAT_RESIZE_INODE        0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX                0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2        0x0200
/*
 * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes
 * incompatible only if fast commit blocks are present in the FS. Since we
 * clear the journal (and thus the fast commit blocks), we don't mark FS as
 * incompatible. We also have a JBD2 incompat feature, which gets set when
 * there are fast commit blocks present in the journal.
 */
#define EXT4_FEATURE_COMPAT_FAST_COMMIT                0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES        0x0800

#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER        0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE        0x0002
#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR        0x0004
#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE        0x0008
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM                0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK        0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE        0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA                0x0100
#define EXT4_FEATURE_RO_COMPAT_BIGALLOC                0x0200
/*
 * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM).  When
 * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
 * all other data structures' checksums.  However, the METADATA_CSUM and
 * GDT_CSUM bits are mutually exclusive.
 */
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM        0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY                0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT                0x2000
#define EXT4_FEATURE_RO_COMPAT_VERITY                0x8000

#define EXT4_FEATURE_INCOMPAT_COMPRESSION        0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE                0x0002
#define EXT4_FEATURE_INCOMPAT_RECOVER                0x0004 /* Needs recovery */
#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV        0x0008 /* Journal device */
#define EXT4_FEATURE_INCOMPAT_META_BG                0x0010
#define EXT4_FEATURE_INCOMPAT_EXTENTS                0x0040 /* extents support */
#define EXT4_FEATURE_INCOMPAT_64BIT                0x0080
#define EXT4_FEATURE_INCOMPAT_MMP               0x0100
#define EXT4_FEATURE_INCOMPAT_FLEX_BG                0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE                0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA                0x1000 /* data in dirent */
#define EXT4_FEATURE_INCOMPAT_CSUM_SEED                0x2000
#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA        0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT                0x10000
#define EXT4_FEATURE_INCOMPAT_CASEFOLD                0x20000

extern void ext4_update_dynamic_rev(struct super_block *sb);

/*
 * Generate ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>() for the EXT4_FEATURE_COMPAT_<flagname> bit of
 * s_es->s_feature_compat.  The flag constant is passed through cpu_to_le32()
 * before it is applied to the field.  Only the set helper calls
 * ext4_update_dynamic_rev() first.
 */
#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_compat |= \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
}

/*
 * Generate ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>() for the EXT4_FEATURE_RO_COMPAT_<flagname> bit
 * of s_es->s_feature_ro_compat.  The flag constant is passed through
 * cpu_to_le32() before it is applied to the field.  Only the set helper
 * calls ext4_update_dynamic_rev() first.
 */
#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_ro_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
}

/*
 * Generate ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>() for the EXT4_FEATURE_INCOMPAT_<flagname> bit
 * of s_es->s_feature_incompat.  The flag constant is passed through
 * cpu_to_le32() before it is applied to the field.  Only the set helper
 * calls ext4_update_dynamic_rev() first.
 */
#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_incompat |= \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_incompat &= \
                ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
}

/*
 * Instantiate the per-feature helpers.  The first argument is the suffix of
 * the generated function names (for example ext4_has_feature_journal(),
 * ext4_set_feature_journal(), ext4_clear_feature_journal()); the second is
 * the suffix of the matching EXT4_FEATURE_{COMPAT,RO_COMPAT,INCOMPAT}_
 * flag constant.
 */
EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc,                DIR_PREALLOC)
EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes,        IMAGIC_INODES)
EXT4_FEATURE_COMPAT_FUNCS(journal,                HAS_JOURNAL)
EXT4_FEATURE_COMPAT_FUNCS(xattr,                EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode,                RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index,                DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,        SPARSE_SUPER2)
EXT4_FEATURE_COMPAT_FUNCS(fast_commit,                FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes,        STABLE_INODES)

/* Read-only-compatible features (s_feature_ro_compat). */
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,        SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,        LARGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir,                BTREE_DIR)
EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file,                HUGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum,                GDT_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink,                DIR_NLINK)
EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize,        EXTRA_ISIZE)
EXT4_FEATURE_RO_COMPAT_FUNCS(quota,                QUOTA)
EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc,                BIGALLOC)
EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,        METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,                READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project,                PROJECT)
EXT4_FEATURE_RO_COMPAT_FUNCS(verity,                VERITY)

/* Incompatible features (s_feature_incompat). */
EXT4_FEATURE_INCOMPAT_FUNCS(compression,        COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype,                FILETYPE)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery,        RECOVER)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev,        JOURNAL_DEV)
EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg,                META_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(extents,                EXTENTS)
EXT4_FEATURE_INCOMPAT_FUNCS(64bit,                64BIT)
EXT4_FEATURE_INCOMPAT_FUNCS(mmp,                MMP)
EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg,                FLEX_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode,                EA_INODE)
EXT4_FEATURE_INCOMPAT_FUNCS(dirdata,                DIRDATA)
EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed,                CSUM_SEED)
EXT4_FEATURE_INCOMPAT_FUNCS(largedir,                LARGEDIR)
EXT4_FEATURE_INCOMPAT_FUNCS(inline_data,        INLINE_DATA)
EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,                ENCRYPT)
EXT4_FEATURE_INCOMPAT_FUNCS(casefold,                CASEFOLD)

#define EXT2_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT2_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

#define EXT3_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT3_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT3_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

#define EXT4_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG| \
                                         EXT4_FEATURE_INCOMPAT_EXTENTS| \
                                         EXT4_FEATURE_INCOMPAT_64BIT| \
                                         EXT4_FEATURE_INCOMPAT_FLEX_BG| \
                                         EXT4_FEATURE_INCOMPAT_EA_INODE| \
                                         EXT4_FEATURE_INCOMPAT_MMP | \
                                         EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
                                         EXT4_FEATURE_INCOMPAT_ENCRYPT | \
                                         EXT4_FEATURE_INCOMPAT_CASEFOLD | \
                                         EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
                                         EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
                                         EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
                                         EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
                                         EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
                                         EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
                                         EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
                                         EXT4_FEATURE_RO_COMPAT_QUOTA |\
                                         EXT4_FEATURE_RO_COMPAT_PROJECT |\
                                         EXT4_FEATURE_RO_COMPAT_VERITY)

/*
 * Generate ext4_has_unknown_ext<ver>_compat_features(),
 * ext4_has_unknown_ext<ver>_ro_compat_features() and
 * ext4_has_unknown_ext<ver>_incompat_features().  Each returns true when the
 * corresponding superblock feature word has at least one bit set that is
 * absent from the matching EXT<ver>_FEATURE_*_SUPP mask.
 */
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \
}

/* Instantiate the checks against the EXT2, EXT3 and EXT4 *_SUPP masks. */
EXTN_FEATURE_FUNCS(2)
EXTN_FEATURE_FUNCS(3)
EXTN_FEATURE_FUNCS(4)

static inline bool ext4_has_compat_features(struct super_block *sb)
{
        return (EXT4_SB(sb)->s_es->s_feature_compat != 0);
}
static inline bool ext4_has_ro_compat_features(struct super_block *sb)
{
        return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0);
}
static inline bool ext4_has_incompat_features(struct super_block *sb)
{
        return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
}

/*
 * Superblock flags
 *
 * These are bit numbers; ext4_forced_shutdown() tests EXT4_FLAGS_SHUTDOWN
 * in sbi->s_ext4_flags with test_bit().
 */
#define EXT4_FLAGS_RESIZING        0
#define EXT4_FLAGS_SHUTDOWN        1
#define EXT4_FLAGS_BDEV_IS_DAX        2

/* Return nonzero when EXT4_FLAGS_SHUTDOWN is set in sbi->s_ext4_flags. */
static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
{
        int shutdown = test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);

        return shutdown;
}


/*
 * Default values for user and/or group using reserved blocks
 */
#define        EXT4_DEF_RESUID                0
#define        EXT4_DEF_RESGID                0

/*
 * Default project ID
 */
#define        EXT4_DEF_PROJID                0

#define EXT4_DEF_INODE_READAHEAD_BLKS        32

/*
 * Default mount options
 */
#define EXT4_DEFM_DEBUG                0x0001
#define EXT4_DEFM_BSDGROUPS        0x0002
#define EXT4_DEFM_XATTR_USER        0x0004
#define EXT4_DEFM_ACL                0x0008
#define EXT4_DEFM_UID16                0x0010
#define EXT4_DEFM_JMODE                0x0060
#define EXT4_DEFM_JMODE_DATA        0x0020
#define EXT4_DEFM_JMODE_ORDERED        0x0040
#define EXT4_DEFM_JMODE_WBACK        0x0060
#define EXT4_DEFM_NOBARRIER        0x0100
#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
#define EXT4_DEFM_DISCARD        0x0400
#define EXT4_DEFM_NODELALLOC        0x0800

/*
 * Default journal batch times
 */
#define EXT4_DEF_MIN_BATCH_TIME        0
#define EXT4_DEF_MAX_BATCH_TIME        15000 /* 15ms */

/*
 * Minimum number of groups in a flexgroup before we separate out
 * directories into the first block group of a flexgroup
 */
#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME        4

/*
 * Structure of a directory entry
 */
#define EXT4_NAME_LEN 255
/*
 * Base length of the ext4 directory entry excluding the name length
 */
#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)

/*
 * Directory entry layout with a 16-bit name_len field.  The integer fields
 * are declared little-endian (__le32/__le16).
 */
struct ext4_dir_entry {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __le16        name_len;                /* Name length */
        char        name[EXT4_NAME_LEN];        /* File name */
};

/*
 * The new version of the directory entry.  Since EXT4 structures are
 * stored in little-endian (Intel) byte order, and the name_len field could
 * never be bigger than 255 chars, it's safe to reclaim the extra byte for
 * the file_type field.  The fields before name[] occupy 8 bytes.
 */
struct ext4_dir_entry_2 {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __u8        name_len;                /* Name length */
        __u8        file_type;                /* See file type macros EXT4_FT_* below */
        char        name[EXT4_NAME_LEN];        /* File name */
};

/*
 * This is a bogus directory entry at the end of each leaf block that
 * records checksums.  Its first four fields have the same types and order
 * as the inode, rec_len, name_len and file_type fields of
 * struct ext4_dir_entry_2; the checksum follows, for 12 bytes in total.
 */
struct ext4_dir_entry_tail {
        __le32        det_reserved_zero1;        /* Pretend to be unused */
        __le16        det_rec_len;                /* 12 */
        __u8        det_reserved_zero2;        /* Zero name length */
        __u8        det_reserved_ft;        /* 0xDE, fake file type */
        __le32        det_checksum;                /* crc32c(uuid+inum+dirblock) */
};

/*
 * Yield a pointer to the struct ext4_dir_entry_tail occupying the last
 * sizeof(struct ext4_dir_entry_tail) bytes of the @blocksize-byte buffer
 * that starts at @block.
 */
#define EXT4_DIRENT_TAIL(block, blocksize) \
        ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
                                        ((blocksize) - \
                                         sizeof(struct ext4_dir_entry_tail))))

/*
 * Ext4 directory file types.  Only the low 3 bits are used.  The
 * other bits are reserved for now.
 */
#define EXT4_FT_UNKNOWN                0
#define EXT4_FT_REG_FILE        1
#define EXT4_FT_DIR                2
#define EXT4_FT_CHRDEV                3
#define EXT4_FT_BLKDEV                4
#define EXT4_FT_FIFO                5
#define EXT4_FT_SOCK                6
#define EXT4_FT_SYMLINK                7

#define EXT4_FT_MAX                8

#define EXT4_FT_DIR_CSUM        0xDE

/*
 * Directory entries start on EXT4_DIR_PAD-byte boundaries.
 *
 * NOTE: EXT4_DIR_PAD must be a multiple of 4
 *
 * EXT4_DIR_REC_LEN() adds 8 to the name length and rounds the sum up to the
 * next EXT4_DIR_PAD boundary.  EXT4_MAX_REC_LEN is the largest value a
 * 16-bit record length can hold.
 */
#define EXT4_DIR_PAD                        4
#define EXT4_DIR_ROUND                        (EXT4_DIR_PAD - 1)
#define EXT4_DIR_REC_LEN(name_len)        ((EXT4_DIR_ROUND + 8 + (name_len)) & \
                                         ~EXT4_DIR_ROUND)
#define EXT4_MAX_REC_LEN                0xffff

/*
 * Decode a directory entry's on-disk rec_len.
 *
 * The encoded form is only interpreted when PAGE_SIZE >= 65536; otherwise
 * the 16-bit value is returned unchanged.  If we ever get support for fs
 * block sizes > page_size, the #if in this function and in
 * ext4_rec_len_to_disk() will need to be removed...
 */
static inline unsigned int
ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
        unsigned raw = le16_to_cpu(dlen);

#if (PAGE_SIZE < 65536)
        return raw;
#else
        /* Both 0 and EXT4_MAX_REC_LEN stand for "the whole block". */
        if (raw == 0 || raw == EXT4_MAX_REC_LEN)
                return blocksize;
        /* The low two bits carry bits 16-17 of the real length. */
        return ((raw & 3) << 16) | (raw & 65532);
#endif
}

/*
 * Encode a record length for storage in a directory entry.
 *
 * BUG()s if @len exceeds @blocksize, if @blocksize exceeds 1 << 18, or if
 * @len is not a multiple of 4.  When PAGE_SIZE < 65536 the value is stored
 * unchanged.  Otherwise lengths of 65536 and above are encoded: a length
 * equal to the block size becomes EXT4_MAX_REC_LEN for 65536-byte blocks
 * and 0 for larger ones, and any other length has bits 16-17 folded into
 * the low two bits.
 */
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
        if ((len & 3) || (blocksize > (1 << 18)) || (len > blocksize))
                BUG();
#if (PAGE_SIZE < 65536)
        return cpu_to_le16(len);
#else
        if (len < 65536)
                return cpu_to_le16(len);
        if (len != blocksize)
                return cpu_to_le16(((len >> 16) & 3) | (len & 65532));
        if (blocksize == 65536)
                return cpu_to_le16(EXT4_MAX_REC_LEN);
        return cpu_to_le16(0);
#endif
}

/*
 * Hash Tree Directory indexing
 * (c) Daniel Phillips, 2001
 */

/*
 * is_dx(): true when the filesystem has the dir_index feature and the
 * directory inode has the EXT4_INODE_INDEX flag set.
 * Note that @dir is evaluated more than once by these macros.
 */
#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \
                    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
/*
 * EXT4_DIR_LINK_MAX(): true when i_nlink has reached EXT4_LINK_MAX, unless
 * the filesystem has the dir_nlink feature and the directory is_dx().
 */
#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \
                    !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir)))
/* EXT4_DIR_LINK_EMPTY(): true when i_nlink is 2 or 1. */
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)

/* Legal values for the dx_root hash_version field: */

#define DX_HASH_LEGACY                        0
#define DX_HASH_HALF_MD4                1
#define DX_HASH_TEA                        2
#define DX_HASH_LEGACY_UNSIGNED                3
#define DX_HASH_HALF_MD4_UNSIGNED        4
#define DX_HASH_TEA_UNSIGNED                5

/*
 * Feed @length bytes at @address into the checksum transform
 * sbi->s_chksum_driver, starting from the running value @crc, and return
 * the updated value.
 *
 * The shash descriptor is built on the stack with a 4-byte context area
 * that is written and read back directly as a u32, so the driver's
 * descriptor size must be exactly sizeof(desc.ctx); this is enforced with
 * BUG_ON().  A non-zero return from crypto_shash_update() also trips
 * BUG_ON().
 */
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
                              const void *address, unsigned int length)
{
        struct {
                struct shash_desc shash;
                char ctx[4];
        } desc;

        BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx));

        desc.shash.tfm = sbi->s_chksum_driver;
        /* Seed the transform state with the caller's running checksum. */
        *(u32 *)desc.ctx = crc;

        BUG_ON(crypto_shash_update(&desc.shash, address, length));

        /* The updated state is the new checksum value. */
        return *(u32 *)desc.ctx;
}

#ifdef __KERNEL__

/*
 * hash info structure used by the directory hash
 *
 * NOTE(review): hash_version presumably holds one of the DX_HASH_* values
 * defined above, and seed presumably points at the hash seed words;
 * confirm against the hashing code.
 */
struct dx_hash_info
{
        u32                hash;
        u32                minor_hash;
        int                hash_version;
        u32                *seed;
};


/* 32 and 64 bit signed EOF for dx directories */
#define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
#define EXT4_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)


/*
 * Control parameters used by ext4_htree_next_block
 */
#define HASH_NB_ALWAYS                1

/*
 * A file name together with its directory hash information.
 *
 * crypto_buf exists only when CONFIG_FS_ENCRYPTION is set and cf_name only
 * when CONFIG_UNICODE is set.
 * NOTE(review): usr_fname looks like the name as supplied by the caller and
 * disk_name the form stored in the directory; confirm against the users.
 */
struct ext4_filename {
        const struct qstr *usr_fname;
        struct fscrypt_str disk_name;
        struct dx_hash_info hinfo;
#ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_str crypto_buf;
#endif
#ifdef CONFIG_UNICODE
        struct fscrypt_str cf_name;
#endif
};

/* Accessors for the name pointer and length held in disk_name. */
#define fname_name(p) ((p)->disk_name.name)
#define fname_len(p)  ((p)->disk_name.len)

/*
 * Describe an inode's exact location on disk and in memory
 *
 * ext4_raw_inode() finds the raw inode at bh->b_data + offset, so offset
 * is a byte offset into the buffer's data.
 */
struct ext4_iloc
{
        struct buffer_head *bh;
        unsigned long offset;
        ext4_group_t block_group;
};

static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
{
        return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
}

static inline bool ext4_is_quota_file(struct inode *inode)
{
        return IS_NOQUOTA(inode) &&
               !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
}

/*
 * This structure is stuffed into the struct file's private_data field
 * for directories.  It is where we put information so that we can do
 * readdir operations in hash tree order.
 */
struct dir_private_info {
        struct rb_root        root;
        struct rb_node        *curr_node;
        struct fname        *extra_fname;
        loff_t                last_pos;
        __u32                curr_hash;
        __u32                curr_minor_hash;
        __u32                next_hash;
};

/*
 * Compute the first block number of group @group_no: the group index times
 * the blocks-per-group count, plus the superblock's s_first_data_block.
 * The multiplication is carried out in ext4_fsblk_t.
 */
static inline ext4_fsblk_t
ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
{
        ext4_fsblk_t per_group = EXT4_BLOCKS_PER_GROUP(sb);
        ext4_fsblk_t first_data;

        first_data = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
        return group_no * per_group + first_data;
}

/*
 * Special error return code only used by dx_probe() and its callers.
 */
#define ERR_BAD_DX_DIR        (-(MAX_ERRNO - 1))

/*
 * htree levels for ext4
 *
 * ext4_dir_htree_level() returns EXT4_HTREE_LEVEL when the largedir feature
 * is set and EXT4_HTREE_LEVEL_COMPAT otherwise.
 */
#define        EXT4_HTREE_LEVEL_COMPAT        2
#define        EXT4_HTREE_LEVEL        3

/*
 * Return the htree level for this filesystem: EXT4_HTREE_LEVEL when the
 * largedir feature is set, EXT4_HTREE_LEVEL_COMPAT otherwise.
 */
static inline int ext4_dir_htree_level(struct super_block *sb)
{
        if (ext4_has_feature_largedir(sb))
                return EXT4_HTREE_LEVEL;

        return EXT4_HTREE_LEVEL_COMPAT;
}

/*
 * Timeout and state flag for lazy initialization inode thread.
 */
#define EXT4_DEF_LI_WAIT_MULT                        10
#define EXT4_DEF_LI_MAX_START_DELAY                5
#define EXT4_LAZYINIT_QUIT                        0x0001
#define EXT4_LAZYINIT_RUNNING                        0x0002

/*
 * Lazy inode table initialization info
 */
struct ext4_lazy_init {
        unsigned long                li_state;
        struct list_head        li_request_list;
        struct mutex                li_list_mtx;
};

enum ext4_li_mode {
        EXT4_LI_MODE_PREFETCH_BBITMAP,
        EXT4_LI_MODE_ITABLE,
};

struct ext4_li_request {
        struct super_block        *lr_super;
        enum ext4_li_mode        lr_mode;
        ext4_group_t                lr_first_not_zeroed;
        ext4_group_t                lr_next_group;
        struct list_head        lr_request;
        unsigned long                lr_next_sched;
        unsigned long                lr_timeout;
};

struct ext4_features {
        struct kobject f_kobj;
        struct completion f_kobj_unregister;
};

/*
 * This structure will be used for multiple mount protection. It will be
 * written into the block number saved in the s_mmp_block field in the
 * superblock. Programs that check MMP should assume that if
 * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
 * to use the filesystem, regardless of how old the timestamp is.
 */
#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */

/*
 * Layout of the MMP block.  The integer fields are declared little-endian
 * and the declared members add up to 1024 bytes
 * (8 + 8 + 64 + 32 + 4 + 904 + 4).
 */
struct mmp_struct {
        __le32        mmp_magic;                /* Magic number for MMP */
        __le32        mmp_seq;                /* Sequence no. updated periodically */

        /*
         * mmp_time, mmp_nodename & mmp_bdevname are only used for information
         * purposes and do not affect the correctness of the algorithm
         */
        __le64        mmp_time;                /* Time last updated */
        char        mmp_nodename[64];        /* Node which last updated MMP block */
        char        mmp_bdevname[32];        /* Bdev which last updated MMP block */

        /*
         * mmp_check_interval is used to verify if the MMP block has been
         * updated on the block device. The value is updated based on the
         * maximum time to write the MMP block during an update cycle.
         */
        __le16        mmp_check_interval;

        __le16        mmp_pad1;
        __le32        mmp_pad2[226];
        __le32        mmp_checksum;                /* crc32c(uuid+mmp_block) */
};

/* arguments passed to the mmp thread */
struct mmpd_data {
        struct buffer_head *bh; /* bh from initial read_mmp_block() */
        struct super_block *sb;  /* super block of the fs */
};

/*
 * Check interval multiplier
 * The MMP block is written every update interval and initially checked every
 * update interval x the multiplier (the value is then adapted based on the
 * write latency). The reason is that writes can be delayed under load and we
 * don't want readers to incorrectly assume that the filesystem is no longer
 * in use.
 */
#define EXT4_MMP_CHECK_MULT                2UL

/*
 * Minimum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MIN_CHECK_INTERVAL        5UL

/*
 * Maximum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MAX_CHECK_INTERVAL        300UL

/*
 * Function prototypes
 */

/*
 * Ok, these declarations are also in <linux/kernel.h> but none of the
 * ext4 source programs needs to include it so they are duplicated here.
 */
# define NORET_TYPE        /**/
# define ATTRIB_NORET        __attribute__((noreturn))
# define NORET_AND        noreturn,

/*
 * bitmap.c
 *
 * ext4_count_free() takes a bitmap and a length in chars and returns an
 * unsigned count. The remaining four prototypes come in "set"/"verify"
 * pairs for the inode bitmap and the block bitmap of a group: the *_set
 * functions return nothing, the *_verify functions return int.
 * Note the asymmetry: the inode-bitmap variants take an extra int sz
 * argument that the block-bitmap variants do not (presumably the bitmap
 * size -- confirm in bitmap.c).
 */
extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh, int sz);
int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh, int sz);
void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh);
int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh);

/*
 * balloc.c
 *
 * Block-group level declarations. Only the prototypes are visible here;
 * the notes below describe the signatures, not the implementations.
 */

/*
 * Returns void and takes two non-const pointers (blockgrpp, offsetp) --
 * presumably the group number and in-group offset of blocknr are stored
 * through them; confirm in balloc.c.
 */
extern void ext4_get_group_no_and_offset(struct super_block *sb,
                                         ext4_fsblk_t blocknr,
                                         ext4_group_t *blockgrpp,
                                         ext4_grpblk_t *offsetp);
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
                                          ext4_fsblk_t block);

extern unsigned int ext4_block_group(struct super_block *sb,
                        ext4_fsblk_t blocknr);
extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
                        ext4_fsblk_t blocknr);
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
/*
 * count and errp are pointers -- presumably in/out block count and an
 * error code output respectively; verify against the definition.
 */
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                                         ext4_fsblk_t goal,
                                         unsigned int flags,
                                         unsigned long *count,
                                         int *errp);
/* nclusters is a signed 64-bit quantity (s64). */
extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
                                    s64 nclusters, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
extern void ext4_check_blocks_bitmap(struct super_block *);
/* bh is a pointer to a buffer_head pointer. */
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head ** bh);
extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
                                                   ext4_group_t group);
/* retries is passed by pointer, so the callee can update the counter. */
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);

/*
 * Block bitmap reading is declared in three flavours: a "nowait" read that
 * returns a buffer head, a separate wait step that takes that kind of
 * buffer head, and a plain read.
 */
extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
                                                ext4_group_t block_group,
                                                bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
                                  ext4_group_t block_group,
                                  struct buffer_head *bh);
extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                                  ext4_group_t block_group);
extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                              ext4_group_t block_group,
                                              struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);

#ifdef CONFIG_UNICODE
/*
 * Only declared when CONFIG_UNICODE is enabled. The inline filename helpers
 * in this header call it with the address of an ext4_filename's cf_name
 * member as the third argument. It returns void, so any failure is not
 * reported to the caller through a return value.
 */
extern void ext4_fname_setup_ci_filename(struct inode *dir,
                                         const struct qstr *iname,
                                         struct fscrypt_str *fname);
#endif

#ifdef CONFIG_FS_ENCRYPTION
/*
 * Fill an ext4_filename from an fscrypt_name.
 *
 * @dst: destination; completely overwritten.
 * @src: source; only read.
 */
static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
                                                const struct fscrypt_name *src)
{
        /*
         * Zero the whole destination first so that every member that is
         * not assigned below reads as zero.
         */
        memset(dst, 0, sizeof *dst);

        /* Both hash values go into the embedded hinfo member. */
        dst->hinfo.minor_hash = src->minor_hash;
        dst->hinfo.hash = src->hash;

        /* Plain assignments: the name descriptors are copied by value. */
        dst->crypto_buf = src->crypto_buf;
        dst->disk_name = src->disk_name;
        dst->usr_fname = src->usr_fname;
}

static inline int ext4_fname_setup_filename(struct inode *dir,
                                            const struct qstr *iname,
                                            int lookup,
                                            struct ext4_filename *fname)
{
        struct fscrypt_name name;
        int err;

        err = fscrypt_setup_filename(dir, iname, lookup, &name);
        if (err)
                return err;

        ext4_fname_from_fscrypt_name(fname, &name);

#ifdef CONFIG_UNICODE
        ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
#endif
        return 0;
}

static inline int ext4_fname_prepare_lookup(struct inode *dir,
                                            struct dentry *dentry,
                                            struct ext4_filename *fname)
{
        struct fscrypt_name name;
        int err;

        err = fscrypt_prepare_lookup(dir, dentry, &name);
        if (err)
                return err;

        ext4_fname_from_fscrypt_name(fname, &name);

#ifdef CONFIG_UNICODE
        ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
#endif
        return 0;
}

static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
        struct fscrypt_name name;

        name.crypto_buf = fname->crypto_buf;
        fscrypt_free_filename(&name);

        fname->crypto_buf.name = NULL;
        fname->usr_fname = NULL;
        fname->disk_name.name = NULL;

#ifdef CONFIG_UNICODE
        kfree(fname->cf_name.name);
        fname->cf_name.name = NULL;
#endif
}
#else /* !CONFIG_FS_ENCRYPTION */
/*
 * Variant used without CONFIG_FS_ENCRYPTION: point @fname at @iname.
 *
 * No copy is made: disk_name refers to the same bytes as @iname, with the
 * const qualifier cast away. @lookup is not used by this variant, and @dir
 * is only used when CONFIG_UNICODE is enabled.
 *
 * Always returns 0.
 */
static inline int ext4_fname_setup_filename(struct inode *dir,
                                            const struct qstr *iname,
                                            int lookup,
                                            struct ext4_filename *fname)
{
        fname->disk_name.len = iname->len;
        fname->disk_name.name = (unsigned char *) iname->name;
        fname->usr_fname = iname;

#ifdef CONFIG_UNICODE
        ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
#endif

        return 0;
}

/*
 * Variant used without CONFIG_FS_ENCRYPTION: a lookup is just
 * ext4_fname_setup_filename() on the dentry's name with lookup set to 1.
 * Returns whatever that call returns.
 */
static inline int ext4_fname_prepare_lookup(struct inode *dir,
                                            struct dentry *dentry,
                                            struct ext4_filename *fname)
{
        const struct qstr *iname = &dentry->d_name;

        return ext4_fname_setup_filename(dir, iname, 1, fname);
}

/*
 * Variant used without CONFIG_FS_ENCRYPTION: the only thing to release is
 * the cf_name buffer, and only when CONFIG_UNICODE is enabled. Without
 * CONFIG_UNICODE this function does nothing.
 */
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
#ifdef CONFIG_UNICODE
        struct fscrypt_str *ci_name = &fname->cf_name;

        kfree(ci_name->name);
        /* Clear the pointer so a repeated call does not free it again. */
        ci_name->name = NULL;
#endif
}
#endif /* !CONFIG_FS_ENCRYPTION */

/*
 * dir.c
 *
 * __ext4_check_dir_entry() takes the caller's function name and line number
 * as its first two arguments; use the ext4_check_dir_entry() wrapper below
 * rather than calling it directly.
 */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
                                  struct file *,
                                  struct ext4_dir_entry_2 *,
                                  struct buffer_head *, char *, int,
                                  unsigned int);
/*
 * Supplies __func__ and __LINE__ automatically and wraps the call in
 * unlikely(), i.e. a non-zero result is hinted to the compiler as the rare
 * case.
 */
#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset)        \
        unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
                                        (de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                __u32 minor_hash,
                                struct ext4_dir_entry_2 *dirent,
                                struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
/*
 * dest_de is a pointer to a directory entry pointer -- presumably where the
 * located entry is reported; confirm in dir.c/namei.c.
 */
extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
                             struct buffer_head *bh,
                             void *buf, int buf_size,
                             struct ext4_filename *fname,
                             struct ext4_dir_entry_2 **dest_de);
void ext4_insert_dentry(struct inode *inode,
                        struct ext4_dir_entry_2 *de,
                        int buf_size,
                        struct ext4_filename *fname);
/*
 * Clear EXT4_INODE_INDEX on @inode when the filesystem does not have the
 * dir_index feature. Nothing happens if the feature is present or the flag
 * is not set.
 */
static inline void ext4_update_dx_flag(struct inode *inode)
{
        if (ext4_has_feature_dir_index(inode->i_sb))
                return;
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INDEX))
                return;

        /* ext4_iget() should have caught this... */
        WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
        ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
/*
 * Lookup table indexed by a directory entry file type code; each slot holds
 * the matching DT_* value. get_dtype() below is the bounds-checked accessor.
 */
static const unsigned char ext4_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};

/*
 * get_dtype - translate a file type code into a DT_* value.
 * @sb:       super block; consulted for the filetype feature.
 * @filetype: file type code used as index into ext4_filetype_table.
 *
 * Returns DT_UNKNOWN when the filesystem lacks the filetype feature or when
 * @filetype is outside [0, EXT4_FT_MAX); otherwise the table entry.
 */
static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
{
        if (!ext4_has_feature_filetype(sb))
                return DT_UNKNOWN;

        /*
         * @filetype is a signed int, so the lower bound has to be checked
         * too: a negative value would otherwise index before the table.
         */
        if (filetype < 0 || filetype >= EXT4_FT_MAX)
                return DT_UNKNOWN;

        return ext4_filetype_table[filetype];
}
/* Takes a buffer and its size alongside the directory inode and its bh. */
extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
                             void *buf, int buf_size);

/*
 * fsync.c
 *
 * NOTE(review): the parameters are unnamed; the two loff_t values are
 * presumably a byte range and the int a datasync flag -- confirm in fsync.c.
 */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);

/*
 * hash.c
 *
 * Takes a name and its length and a dx_hash_info pointer; dir and name are
 * const, hinfo is not (so it may be written by the callee).
 */
extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
                          struct dx_hash_info *hinfo);

/*
 * ialloc.c
 *
 * __ext4_new_inode() is the full-argument form; callers normally go through
 * the ext4_new_inode() / ext4_new_inode_start_handle() macros, which fill in
 * the trailing handle_type, line_no and nblocks arguments.
 */
extern int ext4_mark_inode_used(struct super_block *sb, int ino);
extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
                                      const struct qstr *qstr, __u32 goal,
                                      uid_t *owner, __u32 i_flags,
                                      int handle_type, unsigned int line_no,
                                      int nblocks);

/*
 * Convenience wrappers around __ext4_new_inode().
 *
 * ext4_new_inode() is for callers that already hold a handle: the trailing
 * handle_type, line_no and nblocks arguments are all passed as 0.
 *
 * ext4_new_inode_start_handle() passes a NULL handle and 0 for i_flags, and
 * forwards the handle type, the caller's __LINE__ and nblocks instead.
 */
#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags)        \
        __ext4_new_inode((handle), (dir), (mode), (qstr), (goal),        \
                         (owner), (i_flags), 0, 0, 0)
#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner,        \
                                    type, nblocks)                        \
        __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal),                \
                         (owner), 0, (type), __LINE__, (nblocks))


/*
 * Remaining ialloc.c declarations. Parameter names are mostly omitted in
 * these prototypes; see ialloc.c for their meaning.
 */
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
extern void ext4_check_inodes_bitmap(struct super_block *);
/* Operates on a bit range [start_bit, end_bit) or similar -- TODO confirm. */
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
                                 ext4_group_t group, int barrier);
/* Has the (bh, uptodate) shape of a buffer-head end-of-I/O callback. */
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);

/*
 * fast_commit.c
 *
 * Fast-commit interface. Only prototypes are visible here. The tracking
 * functions come in two forms: ext4_fc_track_{unlink,link,create}() take a
 * handle and a dentry, while the double-underscore variants additionally
 * take the inode explicitly.
 */
int ext4_fc_info_show(struct seq_file *seq, void *v);
void ext4_fc_init(struct super_block *sb, journal_t *journal);
void ext4_fc_init_inode(struct inode *inode);
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end);
void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                            struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
/*
 * The "ineligible" calls take an int reason; start/stop are declared as a
 * pair, with stop taking only the super block.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
void ext4_fc_start_ineligible(struct super_block *sb, int reason);
void ext4_fc_stop_ineligible(struct super_block *sb);
void ext4_fc_start_update(struct inode *inode);
void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
/* Marked __init; paired with the destroy function declared next. */
int __init ext4_fc_init_dentry_cache(void);
void ext4_fc_destroy_dentry_cache(void);
int ext4_fc_record_regions(struct super_block *sb, int ino,
                           ext4_lblk_t lblk, ext4_fsblk_t pblk,
                           int len, int replay);

/*
 * mballoc.c
 *
 * Multiblock allocator declarations. ext4_mb_seq_groups_ops is a const
 * seq_operations table; ext4_mb_stats and ext4_mb_max_to_scan are
 * long-typed globals. All three are only declared here and defined
 * elsewhere.
 */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
/*
 * The trailing int pointer is unnamed -- presumably an error code output;
 * confirm in mballoc.c.
 */
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
extern int ext4_mb_reserve_blocks(struct super_block *, int);
extern void ext4_discard_preallocations(struct inode *, unsigned int);
/* Module-level setup (marked __init) and its teardown counterpart. */
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
/* Returns an ext4_group_t; cnt is passed by pointer. */
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
                                     ext4_group_t group,
                                     unsigned int nr, int *cnt);
extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
                                  unsigned int nr);

extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh, ext4_fsblk_t block,
                             unsigned long count, int flags);
extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
                                   ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
                       int len, int state);

/*
 * inode.c
 *
 * Block mapping and buffer helpers. ext4_get_block_unwritten(),
 * ext4_get_block() and ext4_da_get_block_prep() share the same
 * (inode, iblock, bh, create) signature.
 */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
                         struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
/* bhs is an array of bh_count buffer head pointers supplied by the caller. */
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
                     bool wait, struct buffer_head **bhs);
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create);
/*
 * Takes a callback fn(handle, bh) together with a from/to range and an
 * int *partial -- presumably fn is applied to the buffers in that range;
 * confirm in inode.c.
 */
int ext4_walk_page_buffers(handle_t *handle,
                           struct buffer_head *head,
                           unsigned from,
                           unsigned to,
                           int *partial,
                           int (*fn)(handle_t *handle,
                                     struct buffer_head *bh));
/* Has the same (handle, bh) shape as the callback parameter above. */
int do_journal_get_write_access(handle_t *handle,
                                struct buffer_head *bh);
/*
 * Two small integer constants (1 and 2). NOTE(review): their users are not
 * visible in this part of the file.
 */
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA         2

/*
 * Flags accepted by __ext4_iget() / ext4_iget(). Apart from
 * EXT4_IGET_NORMAL (0) each value is a distinct single bit, so the flags
 * can be OR-ed together.
 */
typedef enum {
        EXT4_IGET_NORMAL =        0,
        EXT4_IGET_SPECIAL =        0x0001, /* OK to iget a system inode */
        EXT4_IGET_HANDLE =         0x0002,        /* Inode # is from a handle */
        EXT4_IGET_BAD =                0x0004, /* Allow to iget a bad inode */
        EXT4_IGET_EA_INODE =        0x0008        /* Inode should contain an EA value */
} ext4_iget_flags;

/*
 * __ext4_iget() takes the caller's function name and line number as its
 * last two arguments. Use the ext4_iget() macro below, which fills them in
 * from __func__ and __LINE__.
 */
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
                                 ext4_iget_flags flags, const char *function,
                                 unsigned int line);

/* ext4_iget(sb, ino, flags): __ext4_iget() with the call site recorded. */
#define ext4_iget(sb, ino, flags) \
        __ext4_iget((sb), (ino), (flags), __func__, __LINE__)

/*
 * Further inode.c declarations. Several of these have unnamed parameters;
 * ext4_getattr() and ext4_file_getattr() share one signature.
 */
extern int  ext4_write_inode(struct inode *, struct writeback_control *);
extern int  ext4_setattr(struct dentry *, struct iattr *);
extern int  ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int  ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int  ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
/* Both take an ext4_iloc pointer; the second looks up by (sb, ino). */
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
                          struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
/* Page-fault entry points: both take a vm_fault and return vm_fault_t. */
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
/* Returns a pointer to a qsize_t rather than a value. */
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
                              ext4_fsblk_t pblk, ext4_lblk_t len);

/*
 * indirect.c
 *
 * Declarations whose names carry the "ind" infix; the implementations are
 * not visible here.
 */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
                                 ext4_lblk_t start, ext4_lblk_t end);

/*
 * ioctl.c
 *
 * ext4_ioctl() and ext4_compat_ioctl() share the same
 * (file, unsigned int, unsigned long) signature and long return type.
 */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
extern void ext4_reset_inode_seed(struct inode *inode);

/* migrate.c: two migration entry points, each taking just the inode. */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);

/*
 * namei.c
 *
 * Directory and orphan-list declarations; only the prototypes are visible
 * here.
 */
extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
                             struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
                                     struct buffer_head *bh);
/* Orphan add/del are declared as a pair with identical signatures. */
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
/* next_hash is passed by pointer, so the callee can report a value back. */
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                                __u32 start_minor_hash, __u32 *next_hash);
/*
 * res_dir is a pointer to a directory entry pointer -- presumably where a
 * match is reported; confirm in namei.c.
 */
extern int ext4_search_dir(struct buffer_head *bh,
                           char *search_buf,
                           int buf_size,
                           struct inode *dir,
                           struct ext4_filename *fname,
                           unsigned int offset,
                           struct ext4_dir_entry_2 **res_dir);
extern int ext4_generic_delete_entry(struct inode *dir,
                                     struct ext4_dir_entry_2 *de_del,
                                     struct buffer_head *bh,
                                     void *entry_buf,
                                     int buf_size,
                                     int csum_size);
/* Returns bool, unlike most of the int-returning functions above. */
extern bool ext4_empty_dir(struct inode *inode);

/*
 * resize.c
 *
 * ext4_group_extend() and ext4_resize_fs() both take the new block count as
 * an ext4_fsblk_t; ext4_group_add() takes an ext4_new_group_data instead.
 */
extern void ext4_kvfree_array_rcu(void *to_free);
extern int ext4_group_add(struct super_block *sb,
                                struct ext4_new_group_data *input);
extern int ext4_group_extend(struct super_block *sb,
                                struct ext4_super_block *es,
                                ext4_fsblk_t n_blocks_count);
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);

/*
 * super.c
 *
 * Buffer-head read helpers followed by miscellaneous super block
 * declarations. The ext4_read_bh*() functions take op_flags; the two that
 * take an end_io argument receive a bh_end_io_t callback pointer.
 */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
                                         sector_t block, int op_flags);
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
                                                   sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
                                bh_end_io_t *end_io);
extern int ext4_read_bh(struct buffer_head *bh, int op_flags,
                        bh_end_io_t *end_io);
extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
                                    ext4_group_t ngroup);
/*
 * nbuf is declared as char[16]; as a function parameter this is a plain
 * char pointer, so the 16 is documentation of the expected buffer size only.
 */
extern const char *ext4_decode_error(struct super_block *sb, int errno,
                                     char nbuf[16]);
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                             ext4_group_t block_group,
                                             unsigned int flags);
extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
                                              ext4_group_t block_group);

/*
 * Low-level reporting functions. Each __printf(m, n) annotation states that
 * argument m is a printf-style format string and that the variadic
 * arguments start at position n, which lets the compiler type-check the
 * format arguments. These functions take the reporting function name and
 * line number explicitly; callers use the wrapper macros defined further
 * down, which supply them.
 */
extern __printf(6, 7)
void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64,
                  const char *, ...);
extern __printf(6, 7)
void __ext4_error_inode(struct inode *, const char *, unsigned int,
                        ext4_fsblk_t, int, const char *, ...);
extern __printf(5, 6)
void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
                     const char *, ...);
/* Not variadic: takes a function name, a line number and an int. */
extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
extern __printf(5, 6)
void __ext4_abort(struct super_block *, const char *, unsigned int, int,
                  const char *, ...);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
                    const char *, ...);
extern __printf(4, 5)
void __ext4_warning_inode(const struct inode *inode, const char *function,
                          unsigned int line, const char *fmt, ...);
/* The second argument is passed as "level" by the ext4_msg() wrapper. */
extern __printf(3, 4)
void __ext4_msg(struct super_block *, const char *, const char *, ...);
/* Not variadic: the final argument is a plain message string. */
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
                           const char *, unsigned int, const char *);
/* Unlike the others, function name and line come first here. */
extern __printf(7, 8)
void __ext4_grp_locked_error(const char *, unsigned int,
                             struct super_block *, ext4_group_t,
                             unsigned long, ext4_fsblk_t,
                             const char *, ...);

/*
 * Call-site convenience macros: each one supplies __func__ and __LINE__.
 *
 * EXT4_ERROR_INODE       - inode error with block 0 (via ext4_error_inode).
 * EXT4_ERROR_INODE_ERR   - inode error with block 0 and an explicit err.
 * ext4_error_inode_block - inode error with explicit block and err.
 * EXT4_ERROR_FILE        - file error with explicit block (via
 *                          ext4_error_file).
 */
#define EXT4_ERROR_INODE(inode, fmt, a...) \
        ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)

#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...)                        \
        __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a)

#define ext4_error_inode_block(inode, block, err, fmt, a...)                \
        __ext4_error_inode((inode), __func__, __LINE__, (block), (err),        \
                           (fmt), ## a)

#define EXT4_ERROR_FILE(file, block, fmt, a...)                                \
        ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)

#ifdef CONFIG_PRINTK

/*
 * CONFIG_PRINTK variants: every wrapper forwards the format string and its
 * arguments to the matching double-underscore function. Where the wrapper
 * has no func/line parameters of its own, __func__ and __LINE__ of the call
 * site are supplied. ext4_error_inode() and ext4_error() pass 0 for the
 * error code; the *_err forms pass the caller's err instead.
 */
#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
        __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
        __ext4_error_inode((inode), (func), (line), (block),                 \
                           (err), (fmt), ##__VA_ARGS__)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
        __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...)                                        \
        __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__)
#define ext4_error_err(sb, err, fmt, ...)                                \
        __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__)
#define ext4_abort(sb, err, fmt, ...)                                        \
        __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...)                                        \
        __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...)                                \
        __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_msg(sb, level, fmt, ...)                                \
        __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
        __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
                                fmt, ##__VA_ARGS__)

#else

/*
 * Variants used when CONFIG_PRINTK is not set. Each wrapper still calls the
 * real double-underscore function, but with an empty function name, line 0
 * and a constant " " format instead of the caller's message. The caller's
 * format and arguments are handed to no_printk() so they remain visible to
 * the compiler (presumably for format checking without emitting the
 * strings -- confirm against no_printk()'s definition).
 * dump_mmp_msg() has no format arguments and simply passes empty strings.
 */
#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, 0, " ");                \
} while (0)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, err, " ");                \
} while (0)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_file(file, "", 0, block, " ");                        \
} while (0)
#define ext4_error(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, 0, 0, " ");                                \
} while (0)
#define ext4_error_err(sb, err, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, err, 0, " ");                                \
} while (0)
#define ext4_abort(sb, err, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_abort(sb, "", 0, err, " ");                                \
} while (0)
#define ext4_warning(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning(sb, "", 0, " ");                                        \
} while (0)
#define ext4_warning_inode(inode, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning_inode(inode, "", 0, " ");                        \
} while (0)
#define ext4_msg(sb, level, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_msg(sb, "", " ");                                        \
} while (0)
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, "", 0, "")
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                \
        __ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");        \
} while (0)

#endif

/*
 * Group descriptor accessors. The getters take (sb, bg) and return either
 * an ext4_fsblk_t (bitmap/table locations) or a __u32 (counters); each has
 * a matching *_set function further down that takes the new value as a
 * third argument.
 */
extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                                     struct ext4_group_desc *bg);
extern __u32 ext4_free_group_clusters(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern __u32 ext4_free_inodes_count(struct super_block *sb,
                                 struct ext4_group_desc *bg);
extern __u32 ext4_used_dirs_count(struct super_block *sb,
                                struct ext4_group_desc *bg);
extern __u32 ext4_itable_unused_count(struct super_block *sb,
                                   struct ext4_group_desc *bg);
/* Setters: block-number fields first, then the __u32 counters. */
extern void ext4_block_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
                                 struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_free_group_clusters_set(struct super_block *sb,
                                         struct ext4_group_desc *bg,
                                         __u32 count);
extern void ext4_free_inodes_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_used_dirs_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_itable_unused_set(struct super_block *sb,
                                   struct ext4_group_desc *bg, __u32 count);
/* Group descriptor checksum: a verify (returns int) / set (void) pair. */
extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
                                       struct ext4_group_desc *gdp);
extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
                                     struct ext4_group_desc *gdp);
extern int ext4_register_li_request(struct super_block *sb,
                                    ext4_group_t first_not_zeroed);

/*
 * Returns 1 when the metadata_csum feature is set on @sb and a checksum
 * driver is attached to the in-memory super block info, 0 otherwise.
 *
 * Having the feature set without a checksum driver is unexpected and
 * triggers a one-time warning; the function still returns 0 in that case.
 */
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
        if (!ext4_has_feature_metadata_csum(sb))
                return 0;

        WARN_ON_ONCE(!EXT4_SB(sb)->s_chksum_driver);

        return EXT4_SB(sb)->s_chksum_driver != NULL;
}

/*
 * Report whether group descriptors on @sb carry a checksum: true when
 * either the gdt_csum feature is set or metadata checksums are usable.
 */
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
        if (ext4_has_feature_gdt_csum(sb))
                return 1;

        return ext4_has_metadata_csum(sb) != 0;
}

/*
 * Read a 64-bit superblock value that is stored as two 32-bit
 * little-endian halves, <name>_lo and <name>_hi, in @es.
 *
 * The high half is only taken into account when the 64BIT incompat
 * feature flag is set in @es->s_feature_incompat; otherwise the result
 * is just the low 32 bits.
 *
 * @es is parenthesized at every use so that any pointer expression
 * (e.g. "p + 1") can be passed.  It is evaluated more than once, so it
 * must not have side effects.
 */
#define ext4_read_incompat_64bit_val(es, name) \
        (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \
                ? (ext4_fsblk_t)le32_to_cpu((es)->name##_hi) << 32 : 0) | \
                le32_to_cpu((es)->name##_lo))

/* Read the s_blocks_count value (lo/hi halves combined) from @es. */
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t total;

        total = ext4_read_incompat_64bit_val(es, s_blocks_count);
        return total;
}

/* Read the s_r_blocks_count value (lo/hi halves combined) from @es. */
static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t reserved;

        reserved = ext4_read_incompat_64bit_val(es, s_r_blocks_count);
        return reserved;
}

/* Read the s_free_blocks_count value (lo/hi halves combined) from @es. */
static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t nr_free;

        nr_free = ext4_read_incompat_64bit_val(es, s_free_blocks_count);
        return nr_free;
}

/*
 * Store the 64-bit value @blk into @es as two 32-bit halves:
 * s_blocks_count_lo gets the low 32 bits, s_blocks_count_hi the rest.
 */
static inline void ext4_blocks_count_set(struct ext4_super_block *es,
                                         ext4_fsblk_t blk)
{
        const ext4_fsblk_t upper = blk >> 32;

        es->s_blocks_count_lo = cpu_to_le32((u32)blk);
        es->s_blocks_count_hi = cpu_to_le32(upper);
}

/*
 * Store the 64-bit value @blk into @es as two 32-bit halves:
 * s_free_blocks_count_lo gets the low 32 bits, s_free_blocks_count_hi
 * the rest.
 */
static inline void ext4_free_blocks_count_set(struct ext4_super_block *es,
                                              ext4_fsblk_t blk)
{
        const ext4_fsblk_t upper = blk >> 32;

        es->s_free_blocks_count_lo = cpu_to_le32((u32)blk);
        es->s_free_blocks_count_hi = cpu_to_le32(upper);
}

/*
 * Store the 64-bit value @blk into @es as two 32-bit halves:
 * s_r_blocks_count_lo gets the low 32 bits, s_r_blocks_count_hi the rest.
 */
static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
                                           ext4_fsblk_t blk)
{
        const ext4_fsblk_t upper = blk >> 32;

        es->s_r_blocks_count_lo = cpu_to_le32((u32)blk);
        es->s_r_blocks_count_hi = cpu_to_le32(upper);
}

static inline loff_t ext4_isize(struct super_block *sb,
                                struct ext4_inode *raw_inode)
{
        if (ext4_has_feature_largedir(sb) ||
            S_ISREG(le16_to_cpu(raw_inode->i_mode)))
                return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
                        le32_to_cpu(raw_inode->i_size_lo);

        return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
}

/*
 * Encode @i_size into the on-disk inode @raw_inode: the low 32 bits go
 * to i_size_lo and the bits above them to i_size_high.
 */
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
{
        const loff_t upper = i_size >> 32;

        raw_inode->i_size_lo = cpu_to_le32(i_size);
        raw_inode->i_size_high = cpu_to_le32(upper);
}

/*
 * Return the current number of block groups of @sb.
 *
 * Reading s_groups_count requires using smp_rmb() afterwards.  See
 * the locking protocol documented in the comments of ext4_group_add()
 * in resize.c
 *
 * The value is loaded into a local first so that the read barrier sits
 * between the load and anything the caller does with the result.
 */
static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
{
        ext4_group_t        ngroups = EXT4_SB(sb)->s_groups_count;

        smp_rmb();
        return ngroups;
}

/*
 * Map @block_group to its flex group index by shifting it right by
 * sbi->s_log_groups_per_flex.
 */
static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
                                           ext4_group_t block_group)
{
        ext4_group_t flex = block_group;

        flex >>= sbi->s_log_groups_per_flex;
        return flex;
}

/*
 * Return the number of block groups per flex group, i.e.
 * 2^(sbi->s_log_groups_per_flex).
 *
 * The shift is done on an unsigned constant: the result type is unsigned
 * int, and shifting a signed 1 by 31 would overflow int, which is
 * undefined behaviour in C.
 */
static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
{
        return 1U << sbi->s_log_groups_per_flex;
}

/*
 * Forward a non-zero @errno to __ext4_std_error() together with the
 * calling function name and line number; do nothing when @errno is zero.
 *
 * Note: @errno is evaluated twice (once for the test, once as the
 * argument), so callers must pass a side-effect-free expression.
 */
#define ext4_std_error(sb, errno)                                \
do {                                                                \
        if ((errno))                                                \
                __ext4_std_error((sb), __func__, __LINE__, (errno));        \
} while (0)

#ifdef CONFIG_SMP
/*
 * Each CPU can accumulate percpu_counter_batch clusters in its local
 * counter, so we need to make sure we have more free clusters than
 * percpu_counter_batch * nr_cpu_ids.  A safety window of 4 times that
 * amount is used.
 */
#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
#else
/* Without CONFIG_SMP no watermark is applied. */
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif

/*
 * Grow the ext4 i_disksize of @inode to @newsize; it is never shrunk here.
 * Requires i_mutex to avoid races with truncate: a one-time warning is
 * emitted if a regular file's inode is not locked by the caller.  The
 * compare-and-store itself runs under i_data_sem held for writing.
 */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
        WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));

        down_write(&EXT4_I(inode)->i_data_sem);
        if (EXT4_I(inode)->i_disksize < newsize)
                WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
        up_write(&EXT4_I(inode)->i_data_sem);
}

/*
 * Grow i_size and i_disksize of @inode to @newsize where they are smaller.
 * Requires i_mutex to avoid races with truncate.
 *
 * Returns a bit mask describing what was changed: bit 0 (value 1) when
 * i_size was updated, bit 1 (value 2) when i_disksize was updated, 0 when
 * neither needed to change.
 */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
        int mask = 0;

        if (inode->i_size < newsize) {
                i_size_write(inode, newsize);
                mask |= 1;
        }

        if (EXT4_I(inode)->i_disksize < newsize) {
                ext4_update_i_disksize(inode, newsize);
                mask |= 2;
        }

        return mask;
}

int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
                                      loff_t len);

/*
 * Per-block-group bookkeeping record.  The trailing bb_counters[] is a
 * flexible array member, so the structure has to be allocated with room
 * for the required number of orders.
 */
struct ext4_group_info {
        /* Bit flags; bit numbers are the EXT4_GROUP_INFO_*_BIT constants */
        unsigned long   bb_state;
#ifdef AGGRESSIVE_CHECK
        unsigned long        bb_check_counter;
#endif
        struct rb_root  bb_free_root;
        ext4_grpblk_t        bb_first_free;        /* first free block */
        ext4_grpblk_t        bb_free;        /* total free blocks */
        ext4_grpblk_t        bb_fragments;        /* nr of freespace fragments */
        ext4_grpblk_t        bb_largest_free_order;/* order of largest frag in BG */
        struct          list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
        void            *bb_bitmap;
#endif
        struct rw_semaphore alloc_sem;
        ext4_grpblk_t        bb_counters[];        /* Nr of free power-of-two-block
                                         * regions, index is order.
                                         * bb_counters[3] = 5 means
                                         * 5 free 8-block regions. */
};

#define EXT4_GROUP_INFO_NEED_INIT_BIT                0
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT                1
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT        2
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT        3
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_BBITMAP_READ_BIT        4

#define EXT4_MB_GRP_NEED_INIT(grp)        \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp)        \
        (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)        \
        (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))

#define EXT4_MB_GRP_WAS_TRIMMED(grp)        \
        (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_SET_TRIMMED(grp)        \
        (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)        \
        (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_TEST_AND_SET_READ(grp)        \
        (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))

#define EXT4_MAX_CONTENTION                8
#define EXT4_CONTENTION_THRESHOLD        2

/*
 * Look up the spinlock that covers block group @group in the
 * s_blockgroup_lock set of @sb.
 */
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
                                              ext4_group_t group)
{
        spinlock_t *lock;

        lock = bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
        return lock;
}

/*
 * Returns true if the filesystem is busy enough that attempts to
 * access the block group locks have run into contention, i.e. the
 * s_lock_busy counter is above EXT4_CONTENTION_THRESHOLD.
 */
static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
{
        int contention = atomic_read(&sbi->s_lock_busy);

        return contention > EXT4_CONTENTION_THRESHOLD;
}

/*
 * Take the spinlock protecting block group @group of @sb, keeping the
 * s_lock_busy contention counter up to date along the way.
 */
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
        spinlock_t *lock = ext4_group_lock_ptr(sb, group);

        if (spin_trylock(lock)) {
                /*
                 * We're able to grab the lock right away, so drop the
                 * lock contention counter (unless it is already 0).
                 */
                atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
                return;
        }

        /*
         * The lock is busy, so bump the contention counter (unless it
         * already sits at EXT4_MAX_CONTENTION), and then wait on the
         * spin lock.
         */
        atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION);
        spin_lock(lock);
}

/* Release the spinlock protecting block group @group of @sb. */
static inline void ext4_unlock_group(struct super_block *sb,
                                     ext4_group_t group)
{
        spinlock_t *lock = ext4_group_lock_ptr(sb, group);

        spin_unlock(lock);
}

/*
 * Block validity checking
 *
 * Both macros forward to ext4_check_blockref(), supplying the calling
 * function name and line number.
 */

/*
 * Check the block references held in buffer @bh: its data is treated as
 * an array of __le32 with EXT4_ADDR_PER_BLOCK(sb) entries.
 */
#define ext4_check_indirect_blockref(inode, bh)                                \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            (__le32 *)(bh)->b_data,                        \
                            EXT4_ADDR_PER_BLOCK((inode)->i_sb))

/* Check the first EXT4_NDIR_BLOCKS entries of the inode's own i_data. */
#define ext4_ind_check_inode(inode)                                        \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            EXT4_I(inode)->i_data,                        \
                            EXT4_NDIR_BLOCKS)

/*
 * Inodes and files operations
 */

/* dir.c */
extern const struct file_operations ext4_dir_operations;

#ifdef CONFIG_UNICODE
extern const struct dentry_operations ext4_dentry_ops;
#endif

/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);

/* inline.c */
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
                                 unsigned int len);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);

extern int ext4_readpage_inline(struct inode *inode, struct page *page);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
                                         struct inode *inode,
                                         loff_t pos, unsigned len,
                                         unsigned flags,
                                         struct page **pagep);
extern int ext4_write_inline_data_end(struct inode *inode,
                                      loff_t pos, unsigned len,
                                      unsigned copied,
                                      struct page *page);
extern struct buffer_head *
ext4_journalled_write_inline_data(struct inode *inode,
                                  unsigned len,
                                  struct page *page);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
                                           struct inode *inode,
                                           loff_t pos, unsigned len,
                                           unsigned flags,
                                           struct page **pagep,
                                           void **fsdata);
extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
                                         unsigned len, unsigned copied,
                                         struct page *page);
extern int ext4_try_add_inline_entry(handle_t *handle,
                                     struct ext4_filename *fname,
                                     struct inode *dir, struct inode *inode);
extern int ext4_try_create_inline_dir(handle_t *handle,
                                      struct inode *parent,
                                      struct inode *inode);
extern int ext4_read_inline_dir(struct file *filp,
                                struct dir_context *ctx,
                                int *has_inline_data);
extern int ext4_inlinedir_to_tree(struct file *dir_file,
                                  struct inode *dir, ext4_lblk_t block,
                                  struct dx_hash_info *hinfo,
                                  __u32 start_hash, __u32 start_minor_hash,
                                  int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
                                        struct ext4_filename *fname,
                                        struct ext4_dir_entry_2 **res_dir,
                                        int *has_inline_data);
extern int ext4_delete_inline_entry(handle_t *handle,
                                    struct inode *dir,
                                    struct ext4_dir_entry_2 *de_del,
                                    struct buffer_head *bh,
                                    int *has_inline_data);
extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
                                        struct ext4_dir_entry_2 **parent_de,
                                        int *retval);
extern int ext4_inline_data_fiemap(struct inode *inode,
                                   struct fiemap_extent_info *fieinfo,
                                   int *has_inline, __u64 start, __u64 len);

struct iomap;
extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);

extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline);

extern int ext4_convert_inline_data(struct inode *inode);

/*
 * Report whether @inode currently has inline data: the
 * EXT4_INODE_INLINE_DATA flag must be set and i_inline_off must be
 * non-zero.  Returns 1 or 0.
 */
static inline int ext4_has_inline_data(struct inode *inode)
{
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                return 0;

        return EXT4_I(inode)->i_inline_off != 0;
}

/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
                                 struct ext4_dir_entry_2 *de,
                                 int blocksize, int csum_size,
                                 unsigned int parent_ino, int dotdot_real_len);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
                                        unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
                                      struct buffer_head *bh);
extern int ext4_ci_compare(const struct inode *parent,
                           const struct qstr *fname,
                           const struct qstr *entry, bool quick);
extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
                         struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
                       struct dentry *dentry);

/* Shift that brings the S_IFMT bits of a mode down to a small table index */
#define S_SHIFT 12
/*
 * Lookup table from (mode & S_IFMT) >> S_SHIFT to the EXT4_FT_* directory
 * entry file type.  Slots without an initializer are zero.
 */
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
        [S_IFREG >> S_SHIFT]        = EXT4_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]        = EXT4_FT_DIR,
        [S_IFCHR >> S_SHIFT]        = EXT4_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]        = EXT4_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]        = EXT4_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]        = EXT4_FT_SOCK,
        [S_IFLNK >> S_SHIFT]        = EXT4_FT_SYMLINK,
};

/*
 * Fill in de->file_type from @mode via ext4_type_by_mode[].  Nothing is
 * written when the filesystem lacks the filetype feature.
 */
static inline void ext4_set_de_type(struct super_block *sb,
                                    struct ext4_dir_entry_2 *de,
                                    umode_t mode)
{
        if (!ext4_has_feature_filetype(sb))
                return;

        de->file_type = ext4_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}

/* readpages.c */
extern int ext4_mpage_readpages(struct inode *inode,
                struct readahead_control *rac, struct page *page);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);

/* symlink.c */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;

/* sysfs.c */
extern int ext4_register_sysfs(struct super_block *sb);
extern void ext4_unregister_sysfs(struct super_block *sb);
extern int __init ext4_init_sysfs(void);
extern void ext4_exit_sysfs(void);

/* block_validity */
extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
extern int ext4_inode_block_valid(struct inode *inode,
                                  ext4_fsblk_t start_blk,
                                  unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
                               struct inode *, __le32 *, unsigned int);
extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
                                ext4_fsblk_t start_blk, unsigned int count);


/* extents.c */
struct ext4_ext_path;
struct ext4_extent;

/*
 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
 * __le32.
 */
#define EXT_MAX_BLOCKS        0xffffffff

extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                               struct ext4_map_blocks *map, int flags);
extern int ext4_ext_truncate(handle_t *, struct inode *);
extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
                                 ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                          loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                          loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
                                             ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
                                                   struct ext4_ext_path *path);
extern int ext4_ext_insert_extent(handle_t *, struct inode *,
                                  struct ext4_ext_path **,
                                  struct ext4_extent *, int);
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
                                              struct ext4_ext_path **,
                                              int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
extern int ext4_get_es_cache(struct inode *inode,
                             struct fiemap_extent_info *fieinfo,
                             __u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
                                struct inode *inode2, ext4_lblk_t lblk1,
                             ext4_lblk_t lblk2,  ext4_lblk_t count,
                             int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
                                       int check_cred, int restart_cred,
                                       int revoke_cred);
extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
extern int ext4_ext_replay_set_iblocks(struct inode *inode);
extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
                int len, int unwritten, ext4_fsblk_t pblk);
extern int ext4_ext_clear_bb(struct inode *inode);


/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
                                            struct inode *second);
extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
                                          struct inode *donor_inode);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);

/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
extern int ext4_put_io_end(ext4_io_end_t *io_end);
extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
                                struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
                               int len,
                               struct writeback_control *wbc,
                               bool keep_towrite);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);

/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);

/* verity.c */
extern const struct fsverity_operations ext4_verityops;

/*
 * Buffer-head state bit used to test whether block and inode bitmaps are
 * properly initialized. With uninit_bg, reading the block from disk is not
 * enough to mark the bitmap uptodate; the bitmap also needs to be
 * zeroed out.
 */
#define BH_BITMAP_UPTODATE BH_JBDPrivateStart

/*
 * Return 1 when @bh is uptodate as a buffer AND has the
 * BH_BITMAP_UPTODATE state bit set; 0 otherwise.
 */
static inline int bitmap_uptodate(struct buffer_head *bh)
{
        if (!buffer_uptodate(bh))
                return 0;

        return test_bit(BH_BITMAP_UPTODATE, &bh->b_state) != 0;
}
/* Set the BH_BITMAP_UPTODATE state bit on @bh. */
static inline void set_bitmap_uptodate(struct buffer_head *bh)
{
        unsigned long *state = &bh->b_state;

        set_bit(BH_BITMAP_UPTODATE, state);
}

/*
 * True when @b lies in the closed interval [first, first + len - 1].
 * @b and @first are each evaluated twice, so arguments must be free of
 * side effects.  NOTE(review): no guard against first + len wrapping or
 * len == 0 - confirm callers never pass such values.
 */
#define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)

/* For ioend & aio unwritten conversion wait queues */
/* Number of wait queue heads in the ext4__ioend_wq[] hash table */
#define EXT4_WQ_HASH_SZ                37
/* Pick the wait queue head for @v: its value modulo the table size */
#define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
                                            EXT4_WQ_HASH_SZ])
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];

extern int ext4_resize_begin(struct super_block *sb);
extern void ext4_resize_end(struct super_block *sb);

/*
 * Mark @io_end as EXT4_IO_END_UNWRITTEN.  On the first transition (flag
 * not yet set) the i_unwritten counter of @inode is incremented; repeated
 * calls on an already-flagged io_end do nothing.
 */
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
                                              struct ext4_io_end *io_end)
{
        if (io_end->flag & EXT4_IO_END_UNWRITTEN)
                return;

        io_end->flag |= EXT4_IO_END_UNWRITTEN;
        atomic_inc(&EXT4_I(inode)->i_unwritten);
}

/*
 * Clear EXT4_IO_END_UNWRITTEN on @io_end if it is set and drop the
 * i_unwritten counter of io_end->inode.  When the counter reaches zero,
 * everyone sleeping on the inode's ioend wait queue is woken.
 */
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
        struct inode *inode = io_end->inode;

        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
                return;

        io_end->flag &= ~EXT4_IO_END_UNWRITTEN;

        /* Wake up anyone waiting on unwritten extent conversion */
        if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
                wake_up_all(ext4_ioend_wq(inode));
}

extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;

/*
 * Return whether @bh can be treated as uptodate.
 *
 * A buffer that is not uptodate but carries the write I/O error flag is
 * promoted to uptodate first: the write-out of its data failed, and
 * re-reading the block could successfully return the old on-disk
 * contents instead.
 */
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
        const int stale = !buffer_uptodate(bh);

        if (stale && buffer_write_io_error(bh))
                set_buffer_uptodate(bh);

        return buffer_uptodate(bh);
}

#endif        /* __KERNEL__ */

#define EFSBADCRC        EBADMSG                /* Bad CRC detected */
#define EFSCORRUPTED        EUCLEAN                /* Filesystem is corrupted */

#endif        /* _EXT4_H */




























































































































    1 




    1 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/bug.h>
#include <linux/atomic.h>
#include <linux/errseq.h>

/*
 * An errseq_t is a way of recording errors in one place, and allowing any
 * number of "subscribers" to tell whether it has changed since a previous
 * point where it was sampled.
 *
 * It's implemented as an unsigned 32-bit value. The low order bits are
 * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits
 * are used as a counter. This is done with atomics instead of locking so that
 * these functions can be called from any context.
 *
 * The general idea is for consumers to sample an errseq_t value. That value
 * can later be used to tell whether any new errors have occurred since that
 * sampling was done.
 *
 * Note that there is a risk of collisions if new errors are being recorded
 * frequently, since we have so few bits to use as a counter.
 *
 * To mitigate this, one bit is used as a flag to tell whether the value has
 * been sampled since a new value was recorded. That allows us to avoid bumping
 * the counter if no one has sampled it since the last time an error was
 * recorded.
 *
 * A new errseq_t should always be zeroed out.  An errseq_t value of all zeroes
 * is the special (but common) case where there has never been an error. An all
 * zero value thus serves as the "epoch" if one wishes to know whether there
 * has ever been an error set since it was first initialized.
 */

/*
 * The low bits are designated for error code (max of MAX_ERRNO);
 * ERRSEQ_SHIFT is the number of bits reserved for it.
 */
#define ERRSEQ_SHIFT                ilog2(MAX_ERRNO + 1)

/*
 * This bit is used as a flag to indicate whether the value has been seen.
 * It sits directly above the error code bits.
 */
#define ERRSEQ_SEEN                (1 << ERRSEQ_SHIFT)

/* The lowest bit of the counter, i.e. the bit directly above ERRSEQ_SEEN */
#define ERRSEQ_CTR_INC                (1 << (ERRSEQ_SHIFT + 1))

/**
 * errseq_set - set an errseq_t for later reporting
 * @eseq: errseq_t field that should be set
 * @err: error to set (must be between -1 and -MAX_ERRNO)
 *
 * This function sets the error in @eseq, and increments the sequence counter
 * if the last sequence was sampled at some point in the past.
 *
 * Any error set will always overwrite an existing error.
 *
 * An @err of zero or outside the -1..-MAX_ERRNO range triggers a warning and
 * leaves @eseq untouched.
 *
 * Return: The previous value, primarily for debugging purposes. The
 * return value should not be used as a previously sampled value in later
 * calls as it will not have the SEEN flag set.
 */
errseq_t errseq_set(errseq_t *eseq, int err)
{
        errseq_t cur, old;

        /* MAX_ERRNO must be able to serve as a mask */
        BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1);

        old = READ_ONCE(*eseq);

        /*
         * Ensure the error code actually fits where we want it to go. If it
         * doesn't then just throw a warning and don't record anything. We
         * also don't accept zero here as that would effectively clear a
         * previous error.
         *
         * The negation is done in unsigned arithmetic so that even
         * err == INT_MIN is rejected without a signed overflow.
         */
        if (WARN(unlikely(err == 0 || -(unsigned int)err > MAX_ERRNO),
                                "err = %d\n", err))
                return old;

        for (;;) {
                errseq_t new;

                /* Clear out error bits and set new error */
                new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err;

                /* Only increment if someone has looked at it */
                if (old & ERRSEQ_SEEN)
                        new += ERRSEQ_CTR_INC;

                /* If there would be no change, then call it done */
                if (new == old) {
                        cur = new;
                        break;
                }

                /* Try to swap the new value into place */
                cur = cmpxchg(eseq, old, new);

                /*
                 * Call it success if we did the swap or someone else beat us
                 * to it for the same value.
                 */
                if (likely(cur == old || cur == new))
                        break;

                /* Raced with an update, try again */
                old = cur;
        }
        return cur;
}
EXPORT_SYMBOL(errseq_set);

/**
 * errseq_sample() - Grab current errseq_t value.
 * @eseq: Pointer to errseq_t to be sampled.
 *
 * Lets a caller initialise its own errseq_t cursor.  When the current
 * value has already been marked "seen", that value is handed back, so the
 * caller will not be told about the old error.  When it has not been seen
 * yet, 0 is handed back instead, so the caller will observe the pending
 * error the next time it checks.
 *
 * Context: Any context.
 * Return: The current errseq value, or 0 if it has not been seen yet.
 */
errseq_t errseq_sample(errseq_t *eseq)
{
        errseq_t snapshot = READ_ONCE(*eseq);

        /* An unseen value is reported as 0 so that we can be the first. */
        return (snapshot & ERRSEQ_SEEN) ? snapshot : 0;
}
EXPORT_SYMBOL(errseq_sample);

/**
 * errseq_check() - Has an error occurred since a particular sample point?
 * @eseq: Pointer to errseq_t value to be checked.
 * @since: Previously-sampled errseq_t from which to check.
 *
 * Read the value @eseq points to and compare it with @since.  Neither
 * @eseq nor the caller's sample is modified, so nothing is marked as seen.
 *
 * Return: The latest error set in the errseq_t or 0 if it hasn't changed.
 */
int errseq_check(errseq_t *eseq, errseq_t since)
{
        errseq_t now = READ_ONCE(*eseq);

        /* Unchanged is the expected case; otherwise report the error bits. */
        return likely(now == since) ? 0 : -(now & MAX_ERRNO);
}
EXPORT_SYMBOL(errseq_check);

/**
 * errseq_check_and_advance() - Check an errseq_t and advance to current value.
 * @eseq: Pointer to value being checked and reported.
 * @since: Pointer to previously-sampled errseq_t to check against and advance.
 *
 * Read the eseq value and compare it with what @since points to.  If they
 * match, nothing has happened and 0 is returned.
 *
 * Otherwise the value has changed: set the "seen" flag, try to swap the
 * flagged value into @eseq, store it as the new "since" value, and return
 * whatever the error portion is set to.
 *
 * Note that no locking is provided here for concurrent updates to the "since"
 * value. The caller must provide that if necessary. Because of this, callers
 * may want to do a lockless errseq_check before taking the lock and calling
 * this.
 *
 * Return: Negative errno if one has been stored, or 0 if no new error has
 * occurred.
 */
int errseq_check_and_advance(errseq_t *eseq, errseq_t *since)
{
        errseq_t cur, seen;

        /*
         * Most callers will want to use the inline wrapper to check this,
         * so that the common case of no error is handled without needing
         * to take the lock that protects the "since" value.
         */
        cur = READ_ONCE(*eseq);
        if (cur == *since)
                return 0;

        /*
         * Set the flag and try to swap it into place if it has changed.
         *
         * The outcome of the swap is deliberately ignored. If it does not
         * happen, then the value was either updated by a writer (bumping
         * the counter or resetting the error) or by another reader that is
         * just setting the "seen" flag. Either outcome is OK, and we can
         * advance "since" and return an error based on what we have.
         */
        seen = cur | ERRSEQ_SEEN;
        if (seen != cur)
                cmpxchg(eseq, cur, seen);

        *since = seen;
        return -(seen & MAX_ERRNO);
}
EXPORT_SYMBOL(errseq_check_and_advance);







































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_KPROBES_H
#define _LINUX_KPROBES_H
/*
 *  Kernel Probes (KProbes)
 *  include/linux/kprobes.h
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct        Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *                Probes initial implementation ( includes suggestions from
 *                Rusty Russell).
 * 2004-July        Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *                interface to access function arguments.
 * 2005-May        Hien Nguyen <hien@us.ibm.com> and Jim Keniston
 *                <jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
 *                <prasanna@in.ibm.com> added function-return probes.
 */
#include <linux/compiler.h>
#include <linux/linkage.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
#include <asm/kprobes.h>

#ifdef CONFIG_KPROBES

/*
 * kprobe_status settings.
 * Each constant below is a distinct single-bit value.
 */
#define KPROBE_HIT_ACTIVE        0x00000001
#define KPROBE_HIT_SS                0x00000002
#define KPROBE_REENTER                0x00000004
#define KPROBE_HIT_SSDONE        0x00000008

#else /* CONFIG_KPROBES */
#include <asm-generic/kprobes.h>
/*
 * Placeholder definitions for builds without CONFIG_KPROBES: struct kprobe
 * has a kprobe_opcode_t member and a struct arch_specific_insn member, so
 * both types must exist for it to compile.
 */
typedef int kprobe_opcode_t;
struct arch_specific_insn {
        int dummy;
};
#endif /* CONFIG_KPROBES */

struct kprobe;
struct pt_regs;
struct kretprobe;
struct kretprobe_instance;
/* Type of kprobe.pre_handler: called before the probed address is executed. */
typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *);
/* Type of kprobe.post_handler: called after the probed address is executed. */
typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *,
                                       unsigned long flags);
/*
 * Type of kprobe.fault_handler: called if executing the probed address
 * causes a fault; receives the trap number.
 */
typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *,
                                       int trapnr);
/* Type of the kretprobe.handler and kretprobe.entry_handler callbacks. */
typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
                                    struct pt_regs *);

/*
 * Descriptor for a single kernel probe: where it is placed (addr, or
 * symbol_name plus offset), the callbacks to run around the probed
 * instruction, and the saved instruction data.
 */
struct kprobe {
        /* NOTE(review): presumably links the probe into a lookup hash table - confirm */
        struct hlist_node hlist;

        /* List of kprobes for multi-handler support */
        struct list_head list;

        /* Count of the number of times this probe was temporarily disarmed */
        unsigned long nmissed;

        /* Location of the probe point */
        kprobe_opcode_t *addr;

        /* Allows the user to indicate the symbol name of the probe point */
        const char *symbol_name;

        /* Offset into the symbol */
        unsigned int offset;

        /* Called before addr is executed. */
        kprobe_pre_handler_t pre_handler;

        /* Called after addr is executed, unless... */
        kprobe_post_handler_t post_handler;

        /*
         * ... called if executing addr causes a fault (e.g. page fault).
         * Return 1 if it handled the fault, otherwise the kernel will see it.
         */
        kprobe_fault_handler_t fault_handler;

        /* Saved opcode (which has been replaced with a breakpoint) */
        kprobe_opcode_t opcode;

        /* Copy of the original instruction */
        struct arch_specific_insn ainsn;

        /*
         * Status flags (KPROBE_FLAG_* values).
         * Protected by kprobe_mutex after this kprobe is registered.
         */
        u32 flags;
};

/*
 * Kprobe status flags: bit values kept in kprobe.flags and tested by the
 * kprobe_gone()/kprobe_disabled()/kprobe_optimized()/kprobe_ftrace() helpers.
 */
#define KPROBE_FLAG_GONE        1 /* breakpoint has already gone */
#define KPROBE_FLAG_DISABLED        2 /* probe is temporarily disabled */
#define KPROBE_FLAG_OPTIMIZED        4 /*
                                   * probe is really optimized.
                                   * NOTE:
                                   * this flag is only for optimized_kprobe.
                                   */
#define KPROBE_FLAG_FTRACE        8 /* probe is using ftrace */

/* Has this kprobe gone ? */
static inline int kprobe_gone(struct kprobe *p)
{
        return p->flags & KPROBE_FLAG_GONE;
}

/* Is this kprobe disabled ? */
static inline int kprobe_disabled(struct kprobe *p)
{
        return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE);
}

/* Is this kprobe really running optimized path ? */
static inline int kprobe_optimized(struct kprobe *p)
{
        return p->flags & KPROBE_FLAG_OPTIMIZED;
}

/* Is this kprobe uses ftrace ? */
static inline int kprobe_ftrace(struct kprobe *p)
{
        return p->flags & KPROBE_FLAG_FTRACE;
}

/*
 * Function-return probe.
 *
 * The user needs to provide a handler function and initialize maxactive.
 *
 * maxactive - the maximum number of instances of the probed function that
 *             can be active concurrently.
 * nmissed   - the number of times the probed function's return was ignored
 *             because maxactive was too low.
 */
struct kretprobe {
        struct kprobe kp;
        kretprobe_handler_t handler;
        kretprobe_handler_t entry_handler;
        int maxactive;
        int nmissed;
        /* NOTE(review): presumably the size of kretprobe_instance.data[] - confirm */
        size_t data_size;
        struct hlist_head free_instances;
        raw_spinlock_t lock;
};

/* NOTE(review): presumably the upper bound accepted for data_size - confirm */
#define KRETPROBE_MAX_DATA_SIZE        4096

/*
 * Per-invocation state for a kretprobe. Handlers of type
 * kretprobe_handler_t receive a pointer to one of these.
 */
struct kretprobe_instance {
        /* The list node and the RCU head share storage. */
        union {
                struct hlist_node hlist;
                struct rcu_head rcu;
        };
        /* The kretprobe this instance belongs to */
        struct kretprobe *rp;
        kprobe_opcode_t *ret_addr;
        struct task_struct *task;
        void *fp;
        /* Flexible array member: variable-length trailing data */
        char data[];
};

/* Element type of the kretprobe_blacklist[] array: a name and an address. */
struct kretprobe_blackpoint {
        const char *name;
        void *addr;
};

/* One list-linked kprobe blacklist entry describing an address range. */
struct kprobe_blacklist_entry {
        struct list_head list;
        unsigned long start_addr;
        /* NOTE(review): whether end_addr is inclusive is not visible here - confirm */
        unsigned long end_addr;
};

#ifdef CONFIG_KPROBES
DECLARE_PER_CPU(struct kprobe *, current_kprobe);
DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);

/*
 * For #ifdef avoidance: lets callers test for kprobes support with a plain
 * C condition. This is the CONFIG_KPROBES variant and always returns 1.
 */
static inline int kprobes_built_in(void)
{
        return 1;
}

extern void kprobe_busy_begin(void);
extern void kprobe_busy_end(void);

#ifdef CONFIG_KRETPROBES
extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
                                   struct pt_regs *regs);
extern int arch_trampoline_kprobe(struct kprobe *p);

/* If the trampoline handler is called from a kprobe, use this version */
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
                                void *trampoline_address,
                                void *frame_pointer);

static nokprobe_inline
unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
                                void *trampoline_address,
                                void *frame_pointer)
{
        unsigned long result;

        /*
         * Wrap the real handler in kprobe_busy_begin()/kprobe_busy_end():
         * a dummy kprobe is set to avoid kretprobe recursion. Since a
         * kretprobe never runs in a kprobe handler, no kprobe must be
         * running at this point.
         */
        kprobe_busy_begin();
        result = __kretprobe_trampoline_handler(regs, trampoline_address,
                                                frame_pointer);
        kprobe_busy_end();

        return result;
}

#else /* CONFIG_KRETPROBES */
/*
 * No-op stub used when CONFIG_KRETPROBES is not set.
 * NOTE(review): the first parameter here is a struct kretprobe *, whereas
 * the CONFIG_KRETPROBES prototype takes a struct kretprobe_instance *;
 * confirm whether the mismatch is intentional.
 */
static inline void arch_prepare_kretprobe(struct kretprobe *rp,
                                        struct pt_regs *regs)
{
}
/* Stub used when CONFIG_KRETPROBES is not set: always returns 0. */
static inline int arch_trampoline_kprobe(struct kprobe *p)
{
        return 0;
}
#endif /* CONFIG_KRETPROBES */

extern struct kretprobe_blackpoint kretprobe_blacklist[];

#ifdef CONFIG_KPROBES_SANITY_TEST
extern int init_test_probes(void);
#else
/* Stub used when CONFIG_KPROBES_SANITY_TEST is not set: always returns 0. */
static inline int init_test_probes(void)
{
        return 0;
}
#endif /* CONFIG_KPROBES_SANITY_TEST */

extern int arch_prepare_kprobe(struct kprobe *p);
extern void arch_arm_kprobe(struct kprobe *p);
extern void arch_disarm_kprobe(struct kprobe *p);
extern int arch_init_kprobes(void);
extern void kprobes_inc_nmissed_count(struct kprobe *p);
extern bool arch_within_kprobe_blacklist(unsigned long addr);
extern int arch_populate_kprobe_blacklist(void);
extern bool arch_kprobe_on_func_entry(unsigned long offset);
extern int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);

extern bool within_kprobe_blacklist(unsigned long addr);
extern int kprobe_add_ksym_blacklist(unsigned long entry);
extern int kprobe_add_area_blacklist(unsigned long start, unsigned long end);

/*
 * A cache of instruction slots carved out of instruction pages. The
 * __get_insn_slot()/__free_insn_slot()/__is_insn_slot_addr() functions
 * operate on one of these.
 */
struct kprobe_insn_cache {
        /* NOTE(review): presumably serializes access to this cache - confirm */
        struct mutex mutex;
        void *(*alloc)(void);        /* allocate insn page */
        void (*free)(void *);        /* free insn page */
        const char *sym;        /* symbol for insn pages */
        struct list_head pages; /* list of kprobe_insn_page */
        size_t insn_size;        /* size of instruction slot */
        /* NOTE(review): presumably a count of slots awaiting reclaim - confirm */
        int nr_garbage;
};

#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c);
extern void __free_insn_slot(struct kprobe_insn_cache *c,
                             kprobe_opcode_t *slot, int dirty);
/* sleep-less address checking routine  */
extern bool __is_insn_slot_addr(struct kprobe_insn_cache *c,
                                unsigned long addr);

/*
 * Declare the cache object kprobe_<__name>_slots and generate three inline
 * wrappers bound to it:
 *   get_<__name>_slot()        -> __get_insn_slot()
 *   free_<__name>_slot()       -> __free_insn_slot()
 *   is_kprobe_<__name>_slot()  -> __is_insn_slot_addr()
 */
#define DEFINE_INSN_CACHE_OPS(__name)                                        \
extern struct kprobe_insn_cache kprobe_##__name##_slots;                \
                                                                        \
static inline kprobe_opcode_t *get_##__name##_slot(void)                \
{                                                                        \
        return __get_insn_slot(&kprobe_##__name##_slots);                \
}                                                                        \
                                                                        \
static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\
{                                                                        \
        __free_insn_slot(&kprobe_##__name##_slots, slot, dirty);        \
}                                                                        \
                                                                        \
static inline bool is_kprobe_##__name##_slot(unsigned long addr)        \
{                                                                        \
        return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);        \
}
#define KPROBE_INSN_PAGE_SYM                "kprobe_insn_page"
#define KPROBE_OPTINSN_PAGE_SYM                "kprobe_optinsn_page"
int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
                             unsigned long *value, char *type, char *sym);
#else /* __ARCH_WANT_KPROBES_INSN_SLOT */
/*
 * Variant used when __ARCH_WANT_KPROBES_INSN_SLOT is not defined: only the
 * is_kprobe_<__name>_slot() predicate is generated, and it reports false for
 * every address. The bool literal is used (rather than 0) to match the other
 * bool-returning stubs in this header.
 */
#define DEFINE_INSN_CACHE_OPS(__name)                                        \
static inline bool is_kprobe_##__name##_slot(unsigned long addr)        \
{                                                                        \
        return false;                                                        \
}
#endif

DEFINE_INSN_CACHE_OPS(insn);

#ifdef CONFIG_OPTPROBES
/*
 * Internal structure for direct jump optimized probe
 */
/* A kprobe extended with the state needed for direct jump optimization. */
struct optimized_kprobe {
        struct kprobe kp;        /* the embedded base kprobe */
        struct list_head list;        /* list for optimizing queue */
        struct arch_optimized_insn optinsn;        /* architecture-specific data */
};

/* Architecture dependent functions for direct jump optimization */
extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn);
extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
                                         struct kprobe *orig);
extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
extern void arch_optimize_kprobes(struct list_head *oplist);
extern void arch_unoptimize_kprobes(struct list_head *oplist,
                                    struct list_head *done_list);
extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
extern int arch_within_optimized_kprobe(struct optimized_kprobe *op,
                                        unsigned long addr);

extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs);

DEFINE_INSN_CACHE_OPS(optinsn);

#ifdef CONFIG_SYSCTL
extern int sysctl_kprobes_optimization;
extern int proc_kprobes_optimization_handler(struct ctl_table *table,
                                             int write, void *buffer,
                                             size_t *length, loff_t *ppos);
#endif
extern void wait_for_kprobe_optimizer(void);
bool optprobe_queued_unopt(struct optimized_kprobe *op);
bool kprobe_disarmed(struct kprobe *p);
#else
static inline void wait_for_kprobe_optimizer(void) { }
#endif /* CONFIG_OPTPROBES */
#ifdef CONFIG_KPROBES_ON_FTRACE
extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
                                  struct ftrace_ops *ops, struct pt_regs *regs);
extern int arch_prepare_kprobe_ftrace(struct kprobe *p);
#endif

int arch_check_ftrace_location(struct kprobe *p);

/* Get the kprobe at this addr (if any) - called with preemption disabled */
struct kprobe *get_kprobe(void *addr);

/* Return the value of the current_kprobe per-CPU variable on this CPU. */
static inline struct kprobe *kprobe_running(void)
{
        struct kprobe *cur = __this_cpu_read(current_kprobe);

        return cur;
}

/* Set this CPU's current_kprobe to NULL, so kprobe_running() returns NULL. */
static inline void reset_current_kprobe(void)
{
        __this_cpu_write(current_kprobe, NULL);
}

/* Return a pointer to this CPU's instance of the per-CPU kprobe_ctlblk. */
static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
{
        return this_cpu_ptr(&kprobe_ctlblk);
}

kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset);
int register_kprobe(struct kprobe *p);
void unregister_kprobe(struct kprobe *p);
int register_kprobes(struct kprobe **kps, int num);
void unregister_kprobes(struct kprobe **kps, int num);
unsigned long arch_deref_entry_point(void *);

int register_kretprobe(struct kretprobe *rp);
void unregister_kretprobe(struct kretprobe *rp);
int register_kretprobes(struct kretprobe **rps, int num);
void unregister_kretprobes(struct kretprobe **rps, int num);

void kprobe_flush_task(struct task_struct *tk);

void kprobe_free_init_mem(void);

int disable_kprobe(struct kprobe *kp);
int enable_kprobe(struct kprobe *kp);

void dump_kprobe(struct kprobe *kp);

void *alloc_insn_page(void);
void free_insn_page(void *page);

int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                       char *sym);

int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
                            char *type, char *sym);
#else /* !CONFIG_KPROBES: */

/*
 * Stubs for builds without CONFIG_KPROBES: kprobes are reported as not
 * built in, no fault is ever handled, and no kprobe is ever found or
 * running.
 */
static inline int kprobes_built_in(void)
{
        return 0;
}
static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
        return 0;
}
static inline struct kprobe *get_kprobe(void *addr)
{
        return NULL;
}
static inline struct kprobe *kprobe_running(void)
{
        return NULL;
}
/*
 * Registration stubs for builds without CONFIG_KPROBES: every register
 * call fails with -ENOSYS and every unregister call is a no-op.
 */
static inline int register_kprobe(struct kprobe *p)
{
        return -ENOSYS;
}
static inline int register_kprobes(struct kprobe **kps, int num)
{
        return -ENOSYS;
}
static inline void unregister_kprobe(struct kprobe *p)
{
}
static inline void unregister_kprobes(struct kprobe **kps, int num)
{
}
static inline int register_kretprobe(struct kretprobe *rp)
{
        return -ENOSYS;
}
static inline int register_kretprobes(struct kretprobe **rps, int num)
{
        return -ENOSYS;
}
static inline void unregister_kretprobe(struct kretprobe *rp)
{
}
static inline void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
/*
 * Remaining stubs for builds without CONFIG_KPROBES: the flush/free hooks
 * do nothing and enable/disable fail with -ENOSYS.
 */
static inline void kprobe_flush_task(struct task_struct *tk)
{
}
static inline void kprobe_free_init_mem(void)
{
}
static inline int disable_kprobe(struct kprobe *kp)
{
        return -ENOSYS;
}
static inline int enable_kprobe(struct kprobe *kp)
{
        return -ENOSYS;
}

/* Without kprobes, every address is reported as blacklisted. */
static inline bool within_kprobe_blacklist(unsigned long addr)
{
        return true;
}
/* Without kprobes there are no kprobe symbols: always returns -ERANGE. */
static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
                                     char *type, char *sym)
{
        return -ERANGE;
}
#endif /* CONFIG_KPROBES */
/*
 * Disable the kprobe embedded in @rp by forwarding to disable_kprobe();
 * returns whatever disable_kprobe() returns.
 */
static inline int disable_kretprobe(struct kretprobe *rp)
{
        return disable_kprobe(&rp->kp);
}
/*
 * Enable the kprobe embedded in @rp by forwarding to enable_kprobe();
 * returns whatever enable_kprobe() returns.
 */
static inline int enable_kretprobe(struct kretprobe *rp)
{
        return enable_kprobe(&rp->kp);
}

#ifndef CONFIG_KPROBES
/* Without CONFIG_KPROBES no address is a kprobe instruction slot. */
static inline bool is_kprobe_insn_slot(unsigned long addr)
{
        return false;
}
#endif
#ifndef CONFIG_OPTPROBES
/* Without CONFIG_OPTPROBES no address is an optimized-insn slot. */
static inline bool is_kprobe_optinsn_slot(unsigned long addr)
{
        return false;
}
#endif

/*
 * Give kprobes a chance to handle a page fault.
 * Returns true if kprobes handled the fault.
 */
static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
                                              unsigned int trap)
{
        /* Nothing to do without kprobes support or for user-mode faults. */
        if (!kprobes_built_in() || user_mode(regs))
                return false;
        /*
         * kprobe_running() may only be called while non-preemptible, and a
         * fault can only belong to a kprobe under that same condition, so
         * check preemptibility first and then for a running kprobe.
         */
        if (preemptible() || !kprobe_running())
                return false;
        return kprobe_fault_handler(regs, trap);
}

#endif /* _LINUX_KPROBES_H */




























































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
















































    1 


















































































































































































    1 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

















































































    1 





















































































    1 





    1 












    1 























    1 











    1 
    1 



































    1 



























































































































































































































































































































































































    1 








































































    1 


    1 








    1 











    1 































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_H
#define _LINUX_MM_H

#include <linux/errno.h>

#ifdef __KERNEL__

#include <linux/mmdebug.h>
#include <linux/gfp.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
#include <linux/sched.h>
#include <linux/pgtable.h>

struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
struct file_ra_state;
struct user_struct;
struct writeback_control;
struct bdi_writeback;
struct pt_regs;

extern int sysctl_page_lock_unfairness;

void init_mm_internals(void);

#ifndef CONFIG_NEED_MULTIPLE_NODES        /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;

/* Record @limit in the global max_mapnr variable. */
static inline void set_max_mapnr(unsigned long limit)
{
        max_mapnr = limit;
}
#else
/* max_mapnr is not declared with CONFIG_NEED_MULTIPLE_NODES: nothing to record. */
static inline void set_max_mapnr(unsigned long limit) { }
#endif

extern atomic_long_t _totalram_pages;
/*
 * Read the global _totalram_pages counter atomically and hand it back
 * converted to unsigned long.
 */
static inline unsigned long totalram_pages(void)
{
        long nr_pages = atomic_long_read(&_totalram_pages);

        return (unsigned long)nr_pages;
}

/* Atomically add one to the global _totalram_pages counter. */
static inline void totalram_pages_inc(void)
{
        atomic_long_inc(&_totalram_pages);
}

/* Atomically subtract one from the global _totalram_pages counter. */
static inline void totalram_pages_dec(void)
{
        atomic_long_dec(&_totalram_pages);
}

/*
 * Atomically add @count to the global _totalram_pages counter.
 * @count is signed, so a negative value lowers the counter.
 */
static inline void totalram_pages_add(long count)
{
        atomic_long_add(count, &_totalram_pages);
}

extern void * high_memory;
extern int page_cluster;

#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
extern const int mmap_rnd_bits_max;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
extern const int mmap_rnd_compat_bits_min;
extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif

#include <asm/page.h>
#include <asm/processor.h>

/*
 * Architectures that support memory tagging (assigning tags to memory regions,
 * embedding these tags into addresses that point to these memory regions, and
 * checking that the memory and the pointer tags match on memory accesses)
 * redefine this macro to strip tags from pointers.
 * It's defined as a no-op for architectures that don't support memory tagging.
 */
#ifndef untagged_addr
#define untagged_addr(addr) (addr)
#endif

#ifndef __pa_symbol
#define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
#endif

#ifndef page_to_virt
#define page_to_virt(x)        __va(PFN_PHYS(page_to_pfn(x)))
#endif

#ifndef lm_alias
#define lm_alias(x)        __va(__pa_symbol(x))
#endif

/*
 * To prevent common memory management code establishing
 * a zero page mapping on a read fault.
 * This macro should be defined within <asm/pgtable.h>.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)        (0)
#endif

/*
 * On some architectures it is expensive to call memset() for small sizes.
 * If an architecture decides to implement their own version of
 * mm_zero_struct_page they should wrap the defines below in a #ifndef and
 * define their own version of this macro in <asm/pgtable.h>
 */
#if BITS_PER_LONG == 64
/* This function must be updated when the size of struct page grows above 80
 * or reduces below 56. The idea is that the compiler optimizes out the
 * switch() statement, and only leaves move/store instructions. Also, the
 * compiler can combine write statements if they are both assignments and can
 * be reordered; this can result in several of the writes here being dropped.
 */
#define        mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
/*
 * Zero a struct page one unsigned long word at a time.
 *
 * This variant is only built when BITS_PER_LONG == 64, so each _pp[] slot
 * covers 8 bytes of the structure.
 */
static inline void __mm_zero_struct_page(struct page *page)
{
        /* View the page descriptor as an array of machine words. */
        unsigned long *_pp = (void *)page;

        /* Check that struct page is either 56, 64, 72, or 80 bytes */
        BUILD_BUG_ON(sizeof(struct page) & 7);
        BUILD_BUG_ON(sizeof(struct page) < 56);
        BUILD_BUG_ON(sizeof(struct page) > 80);

        /*
         * The switch operand is a compile-time constant. Each case clears
         * the highest word for that size and falls through to the smaller
         * sizes, ending with the seven words common to every layout.
         */
        switch (sizeof(struct page)) {
        case 80:
                _pp[9] = 0;
                fallthrough;
        case 72:
                _pp[8] = 0;
                fallthrough;
        case 64:
                _pp[7] = 0;
                fallthrough;
        case 56:
                _pp[6] = 0;
                _pp[5] = 0;
                _pp[4] = 0;
                _pp[3] = 0;
                _pp[2] = 0;
                _pp[1] = 0;
                _pp[0] = 0;
        }
}
#else
#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
#endif

/*
 * Default maximum number of active map areas, this limits the number of vmas
 * per mm struct. Users can overwrite this number by sysctl but there is a
 * problem.
 *
 * When a program's coredump is generated as ELF format, a section is created
 * per a vma. In ELF, the number of sections is represented in unsigned short.
 * This means the number of sections should be smaller than 65535 at coredump.
 * Because the kernel adds some informative sections to a image of program at
 * generating coredump, we need some margin. The number of extra sections is
 * 1-3 now and depends on arch. We use "5" as safe margin, here.
 *
 * ELF extended numbering allows more than 65535 sections, so 16-bit bound is
 * not a hard limit any more. Although some userspace tools can be surprised by
 * that.
 */
#define MAPCOUNT_ELF_CORE_MARGIN        (5)
#define DEFAULT_MAX_MAP_COUNT        (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)

extern int sysctl_max_map_count;

extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;

int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);

#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))

/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr)        IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)

#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))

/*
 * Linux kernel virtual memory manager primitives.
 * The idea being to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);

#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
extern struct rw_semaphore nommu_region_sem;

extern unsigned int kobjsize(const void *objp);
#endif

/*
 * vm_flags in vm_area_struct, see mm_types.h.
 * When changing, update also include/trace/events/mmflags.h
 */
#define VM_NONE                0x00000000

#define VM_READ                0x00000001        /* currently active flags */
#define VM_WRITE        0x00000002
#define VM_EXEC                0x00000004
#define VM_SHARED        0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD        0x00000010        /* limits for mprotect() etc */
#define VM_MAYWRITE        0x00000020
#define VM_MAYEXEC        0x00000040
#define VM_MAYSHARE        0x00000080

#define VM_GROWSDOWN        0x00000100        /* general info on the segment */
#define VM_UFFD_MISSING        0x00000200        /* missing pages tracking */
#define VM_PFNMAP        0x00000400        /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE        0x00000800        /* ETXTBSY on write attempts.. */
#define VM_UFFD_WP        0x00001000        /* wrprotect pages tracking */

#define VM_LOCKED        0x00002000
#define VM_IO           0x00004000        /* Memory mapped I/O or similar */

                                        /* Used by sys_madvise() */
#define VM_SEQ_READ        0x00008000        /* App will access data sequentially */
#define VM_RAND_READ        0x00010000        /* App will not benefit from clustered reads */

#define VM_DONTCOPY        0x00020000      /* Do not copy this vma on fork */
#define VM_DONTEXPAND        0x00040000        /* Cannot expand with mremap() */
#define VM_LOCKONFAULT        0x00080000        /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT        0x00100000        /* Is a VM accounted object */
#define VM_NORESERVE        0x00200000        /* should the VM suppress accounting */
#define VM_HUGETLB        0x00400000        /* Huge TLB Page VM */
#define VM_SYNC                0x00800000        /* Synchronous page faults */
#define VM_ARCH_1        0x01000000        /* Architecture-specific flag */
#define VM_WIPEONFORK        0x02000000        /* Wipe VMA contents in child. */
#define VM_DONTDUMP        0x04000000        /* Do not include in the core dump */

#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY        0x08000000        /* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY        0
#endif

#define VM_MIXEDMAP        0x10000000        /* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE        0x20000000        /* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE        0x40000000        /* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE        0x80000000        /* KSM may merge identical pages */

#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0        32        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1        33        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2        34        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3        35        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4        36        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0        BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1        BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2        BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3        BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4        BIT(VM_HIGH_ARCH_BIT_4)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

#ifdef CONFIG_ARCH_HAS_PKEYS
# define VM_PKEY_SHIFT        VM_HIGH_ARCH_BIT_0
# define VM_PKEY_BIT0        VM_HIGH_ARCH_0        /* A protection key is a 4-bit value */
# define VM_PKEY_BIT1        VM_HIGH_ARCH_1        /* on x86 and 5-bit value on ppc64   */
# define VM_PKEY_BIT2        VM_HIGH_ARCH_2
# define VM_PKEY_BIT3        VM_HIGH_ARCH_3
#ifdef CONFIG_PPC
# define VM_PKEY_BIT4  VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4  0
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */

#if defined(CONFIG_X86)
# define VM_PAT                VM_ARCH_1        /* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC)
# define VM_SAO                VM_ARCH_1        /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP        VM_ARCH_1
#elif defined(CONFIG_IA64)
# define VM_GROWSUP        VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI        VM_ARCH_1        /* Uses ADI tag for access control */
# define VM_ARCH_CLEAR        VM_SPARC_ADI
#elif defined(CONFIG_ARM64)
# define VM_ARM64_BTI        VM_ARCH_1        /* BTI guarded page, a.k.a. GP bit */
# define VM_ARCH_CLEAR        VM_ARM64_BTI
#elif !defined(CONFIG_MMU)
# define VM_MAPPED_COPY        VM_ARCH_1        /* T if mapped copy of data (nommu mmap) */
#endif

#if defined(CONFIG_ARM64_MTE)
# define VM_MTE                VM_HIGH_ARCH_0        /* Use Tagged memory for access control */
# define VM_MTE_ALLOWED        VM_HIGH_ARCH_1        /* Tagged memory permitted */
#else
# define VM_MTE                VM_NONE
# define VM_MTE_ALLOWED        VM_NONE
#endif

#ifndef VM_GROWSUP
# define VM_GROWSUP        VM_NONE
#endif

/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP        (VM_RAND_READ | VM_SEQ_READ)

#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)

/* Common data flag combinations */
#define VM_DATA_FLAGS_TSK_EXEC        (VM_READ | VM_WRITE | TASK_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_NON_EXEC        (VM_READ | VM_WRITE | VM_MAYREAD | \
                                 VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_EXEC        (VM_READ | VM_WRITE | VM_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

#ifndef VM_DATA_DEFAULT_FLAGS                /* arch can override this */
#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
#endif

#ifndef VM_STACK_DEFAULT_FLAGS                /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK        VM_GROWSUP
#else
#define VM_STACK        VM_GROWSDOWN
#endif

#define VM_STACK_FLAGS        (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)

/* VMA basic access permission flags */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)


/*
 * Special vmas that are non-mergeable, non-mlock()able.
 */
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)

/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK        VM_NOHUGEPAGE

/* This mask is used to clear all the VMA flags used by mlock */
#define VM_LOCKED_CLEAR_MASK        (~(VM_LOCKED | VM_LOCKONFAULT))

/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR        VM_NONE
#endif
#define VM_FLAGS_CLEAR        (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)

/*
 * mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask..
 */
extern pgprot_t protection_map[16];

/**
 * Fault flag definitions.
 *
 * @FAULT_FLAG_WRITE: Fault was a write fault.
 * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
 * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
 * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
 * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
 * @FAULT_FLAG_TRIED: The fault has been tried once.
 * @FAULT_FLAG_USER: The fault originated in userspace.
 * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
 * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
 * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
 *
 * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
 * whether we would allow page faults to retry by specifying these two
 * fault flags correctly.  Currently there can be three legal combinations:
 *
 * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
 *                              this is the first try
 *
 * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
 *                              we've already tried at least once
 *
 * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
 *
 * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
 * be used.  Note that page faults can be allowed to retry for multiple times,
 * in which case we'll have an initial fault with flags (a) then later on
 * continuous faults with flags (b).  We should always try to detect pending
 * signals before a retry to make sure the continuous page faults can still be
 * interrupted if necessary.
 */
#define FAULT_FLAG_WRITE                        0x01
#define FAULT_FLAG_MKWRITE                        0x02
#define FAULT_FLAG_ALLOW_RETRY                        0x04
#define FAULT_FLAG_RETRY_NOWAIT                        0x08
#define FAULT_FLAG_KILLABLE                        0x10
#define FAULT_FLAG_TRIED                        0x20
#define FAULT_FLAG_USER                                0x40
#define FAULT_FLAG_REMOTE                        0x80
#define FAULT_FLAG_INSTRUCTION                  0x100
#define FAULT_FLAG_INTERRUPTIBLE                0x200

/*
 * The default fault flags that should be used by most of the
 * arch-specific page fault handlers.
 */
#define FAULT_FLAG_DEFAULT  (FAULT_FLAG_ALLOW_RETRY | \
                             FAULT_FLAG_KILLABLE | \
                             FAULT_FLAG_INTERRUPTIBLE)

/**
 * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
 *
 * This is mostly used for places where we want to try to avoid taking
 * the mmap_lock for too long a time when waiting for another condition
 * to change, in which case we can try to be polite to release the
 * mmap_lock in the first round to avoid potential starvation of other
 * processes that would also want the mmap_lock.
 *
 * Return: true if the page fault allows retry and this is the first
 * attempt of the fault handling; false otherwise.
 */
static inline bool fault_flag_allow_retry_first(unsigned int flags)
{
        /* A fault that may not be retried can never be a "first retry". */
        if (!(flags & FAULT_FLAG_ALLOW_RETRY))
                return false;

        /* Retry is allowed: it is the first attempt only if TRIED is clear. */
        return !(flags & FAULT_FLAG_TRIED);
}

#define FAULT_FLAG_TRACE \
        { FAULT_FLAG_WRITE,                "WRITE" }, \
        { FAULT_FLAG_MKWRITE,                "MKWRITE" }, \
        { FAULT_FLAG_ALLOW_RETRY,        "ALLOW_RETRY" }, \
        { FAULT_FLAG_RETRY_NOWAIT,        "RETRY_NOWAIT" }, \
        { FAULT_FLAG_KILLABLE,                "KILLABLE" }, \
        { FAULT_FLAG_TRIED,                "TRIED" }, \
        { FAULT_FLAG_USER,                "USER" }, \
        { FAULT_FLAG_REMOTE,                "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,        "INSTRUCTION" }, \
        { FAULT_FLAG_INTERRUPTIBLE,        "INTERRUPTIBLE" }

/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
 * ->fault function. The vma's ->fault is responsible for returning a bitmask
 * of VM_FAULT_xxx flags that give details about how the fault was handled.
 *
 * MM layer fills up gfp_mask for page allocations but fault handler might
 * alter it if its implementation requires a different allocation context.
 *
 * pgoff should be used in favour of the faulting virtual address ('address'),
 * if possible.
 */
struct vm_fault {
        struct vm_area_struct *vma;        /* Target VMA */
        unsigned int flags;                /* FAULT_FLAG_xxx flags */
        gfp_t gfp_mask;                        /* gfp mask to be used for allocations */
        pgoff_t pgoff;                        /* Logical page offset based on vma */
        unsigned long address;                /* Faulting virtual address */
        pmd_t *pmd;                        /* Pointer to pmd entry matching
                                         * the 'address'
                                         */
        pud_t *pud;                        /* Pointer to pud entry matching
                                         * the 'address'
                                         */
        pte_t orig_pte;                        /* Value of PTE at the time of fault */

        struct page *cow_page;                /* Page handler may use for COW fault */
        struct page *page;                /* ->fault handlers should return a
                                         * page here, unless VM_FAULT_NOPAGE
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
        /* These three entries are valid only while holding ptl lock */
        pte_t *pte;                        /* Pointer to pte entry matching
                                         * the 'address'. NULL if the page
                                         * table hasn't been allocated.
                                         */
        spinlock_t *ptl;                /* Page table lock.
                                         * Protects pte page table if 'pte'
                                         * is not NULL, otherwise pmd.
                                         */
        pgtable_t prealloc_pte;                /* Pre-allocated pte page table.
                                         * vm_ops->map_pages() calls
                                         * alloc_set_pte() from atomic context.
                                         * do_fault_around() pre-allocates
                                         * page table to avoid allocation from
                                         * atomic context.
                                         */
};

/* page entry size for vm_ops->huge_fault() */
enum page_entry_size {
        PE_SIZE_PTE = 0,        /* PTE-sized entry */
        PE_SIZE_PMD,                /* PMD-sized entry */
        PE_SIZE_PUD,                /* PUD-sized entry */
};

/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 */
struct vm_operations_struct {
        void (*open)(struct vm_area_struct * area);
        void (*close)(struct vm_area_struct * area);
        int (*split)(struct vm_area_struct * area, unsigned long addr);
        int (*mremap)(struct vm_area_struct * area);

        /*
         * Handle the fault described by @vmf; the return value is a bitmask
         * of VM_FAULT_xxx flags describing how the fault was handled.
         */
        vm_fault_t (*fault)(struct vm_fault *vmf);

        /* As ->fault, with @pe_size giving the page entry size. */
        vm_fault_t (*huge_fault)(struct vm_fault *vmf,
                        enum page_entry_size pe_size);

        /*
         * Operates on the page offsets @start_pgoff to @end_pgoff.
         * NOTE(review): per the prealloc_pte comment in struct vm_fault this
         * path calls alloc_set_pte() from atomic context.
         */
        void (*map_pages)(struct vm_fault *vmf,
                        pgoff_t start_pgoff, pgoff_t end_pgoff);
        unsigned long (*pagesize)(struct vm_area_struct * area);

        /*
         * Notification that a previously read-only page is about to become
         * writable; if an error is returned it will cause a SIGBUS.
         */
        vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

        /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
        vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

        /*
         * Called by access_process_vm when get_user_pages() fails, typically
         * for use by special VMAs that can switch between memory and hardware.
         */
        int (*access)(struct vm_area_struct *vma, unsigned long addr,
                      void *buf, int len, int write);

        /*
         * Called by the /proc/PID/maps code to ask the vma whether it
         * has a special name.  Returning non-NULL will also cause this
         * vma to be dumped unconditionally.
         */
        const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
        /*
         * set_policy() op must add a reference to any non-NULL @new mempolicy
         * to hold the policy upon return.  Caller should pass NULL @new to
         * remove a policy and fall back to surrounding context--i.e. do not
         * install a MPOL_DEFAULT policy, nor the task or system default
         * mempolicy.
         */
        int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

        /*
         * get_policy() op must add reference [mpol_get()] to any policy at
         * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
         * in mm/mempolicy.c will do this automatically.
         * get_policy() must NOT add a ref if the policy at (vma,addr) is not
         * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
         * If no [shared/vma] mempolicy exists at the addr, get_policy() op
         * must return NULL--i.e., do not "fallback" to task or system default
         * policy.
         */
        struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                        unsigned long addr);
#endif
        /*
         * Called by vm_normal_page() for special PTEs to find the
         * page for @addr.  This is useful if the default behavior
         * (using pte_page()) would not find the correct page.
         */
        struct page *(*find_special_page)(struct vm_area_struct *vma,
                                          unsigned long addr);
};

/*
 * Initialise @vma from scratch: clear the whole structure, attach it to @mm,
 * install an empty operations table and set up its anon_vma_chain list head.
 */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
        /*
         * Empty but non-NULL ops table: vma_is_anonymous() treats a NULL
         * vm_ops as "anonymous", so a freshly initialised vma is not
         * reported as anonymous until vma_set_anonymous() is called.
         */
        static const struct vm_operations_struct dummy_vm_ops = {};

        /* Must come first: it wipes every field, including those set below. */
        memset(vma, 0, sizeof(*vma));
        vma->vm_mm = mm;
        vma->vm_ops = &dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
}

/* Mark @vma as anonymous by clearing its operations table pointer. */
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
        vma->vm_ops = NULL;
}

/* A vma counts as anonymous exactly when it has no vm_ops table. */
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
        return vma->vm_ops == NULL;
}

/*
 * Report whether @vma is a stack whose setup has not completed yet.
 *
 * Returns true only when the vma can grow (VM_GROWSDOWN or VM_GROWSUP is set)
 * and every bit of VM_STACK_INCOMPLETE_SETUP is set in vm_flags.
 */
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
{
        /*
         * Test the flags at their native width instead of parking the masked
         * value in an int, which could drop flag bits if vm_flags is wider
         * than int.
         */
        if (!(vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP)))
                return false;

        return (vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
                                                VM_STACK_INCOMPLETE_SETUP;
}

/*
 * A vma is foreign when the current task has no mm at all, or when the vma
 * belongs to an mm other than the current task's.
 */
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
        return !current->mm || current->mm != vma->vm_mm;
}

/* True when at least one of the VM_ACCESS_FLAGS bits is set on @vma. */
static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_ACCESS_FLAGS) != 0;
}

#ifdef CONFIG_SHMEM
/*
 * vma_is_shmem() is not inline because it is used only by slow
 * paths in userfault.
 */
bool vma_is_shmem(struct vm_area_struct *vma);
#else
/* Stub for !CONFIG_SHMEM builds: always reports false. */
static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
#endif

/* Prototype only; the implementation lives outside this header. */
int vma_is_stack_for_current(struct vm_area_struct *vma);

/*
 * flush_tlb_range() takes a vma, not a mm, and can care about flags.
 * This expands to a designated-initializer list that sets only .vm_mm and
 * .vm_flags.
 */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }

/* Forward declarations of types that are defined elsewhere. */
struct mmu_gather;
struct inode;

#include <linux/huge_mm.h>

/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping   (page->mapping)
 * - private data    (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users).
 * The result is whatever page_ref_dec_and_test() reports.
 */
static inline int put_page_testzero(struct page *page)
{
        /* Dropping a reference when the count is already 0 is a caller bug. */
        VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
        return page_ref_dec_and_test(page);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 * This can be called when MMU is off so it must not access
 * any of the virtual mappings.
 */
static inline int get_page_unless_zero(struct page *page)
{
        /* Add 1 to the refcount, except when it currently equals 0. */
        return page_ref_add_unless(page, 1, 0);
}

/* Prototype only; implemented outside this header. */
extern int page_is_ram(unsigned long pfn);

/* NOTE(review): presumably the possible results of region_intersects() - confirm. */
enum {
        REGION_INTERSECTS,
        REGION_DISJOINT,
        REGION_MIXED,
};

int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
                      unsigned long desc);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
 * is no special casing required.
 */

/*
 * Fallback used only when is_ioremap_addr is not already defined: an address
 * is then classified with is_vmalloc_addr().
 */
#ifndef is_ioremap_addr
#define is_ioremap_addr(x) is_vmalloc_addr(x)
#endif

#ifdef CONFIG_MMU
/* With an MMU both checks are implemented out of line. */
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
#else
/* !CONFIG_MMU stubs: no address is ever reported as vmalloc or module. */
static inline bool is_vmalloc_addr(const void *x)
{
        return false;
}
static inline int is_vmalloc_or_module_addr(const void *x)
{
        return 0;
}
#endif

extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
/* kvmalloc_node() without a node preference (NUMA_NO_NODE). */
static inline void *kvmalloc(size_t size, gfp_t flags)
{
        return kvmalloc_node(size, flags, NUMA_NO_NODE);
}
/* kvmalloc_node() with __GFP_ZERO added to @flags. */
static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
{
        return kvmalloc_node(size, flags | __GFP_ZERO, node);
}
/* kvmalloc() with __GFP_ZERO added to @flags. */
static inline void *kvzalloc(size_t size, gfp_t flags)
{
        return kvmalloc(size, flags | __GFP_ZERO);
}

/*
 * Allocate room for @n elements of @size bytes each through kvmalloc().
 * Returns NULL, without calling the allocator, when n * size overflows size_t.
 */
static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
{
        size_t total;

        if (likely(!check_mul_overflow(n, size, &total)))
                return kvmalloc(total, flags);

        return NULL;
}

/* kvmalloc_array() with __GFP_ZERO added to @flags. */
static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
{
        return kvmalloc_array(n, size, flags | __GFP_ZERO);
}

/* Prototypes only; the implementations live outside this header. */
extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
                gfp_t flags);
extern void kvfree(const void *addr);
extern void kvfree_sensitive(const void *addr, size_t len);

/*
 * Read the compound mapcount counter of @head and add 1.
 * NOTE(review): the +1 suggests the counter is stored biased by -1, like
 * page->_mapcount - confirm.
 */
static inline int head_compound_mapcount(struct page *head)
{
        return atomic_read(compound_mapcount_ptr(head)) + 1;
}

/*
 * Mapcount of a compound page as a whole; mapped sub-pages are not included.
 *
 * Must be called only for compound pages or any of their tail sub-pages.
 */
static inline int compound_mapcount(struct page *page)
{
        VM_BUG_ON_PAGE(!PageCompound(page), page);

        /* The counter is read through the head page. */
        return head_compound_mapcount(compound_head(page));
}

/*
 * page->_mapcount is an atomic counter that starts at -1, so that transitions
 * both away from and back to that value can be detected with
 * atomic_inc_and_test and atomic_add_negative(-1).  Put it back to -1.
 */
static inline void page_mapcount_reset(struct page *page)
{
        atomic_set(&page->_mapcount, -1);
}

int __page_mapcount(struct page *page);

/*
 * Mapcount of a 0-order page; for a compound sub-page the result also
 * includes compound_mapcount().
 *
 * Result is undefined for pages which cannot be mapped into userspace.
 * For example SLAB or special types of pages. See function page_has_type().
 * They use this place in struct page differently.
 */
static inline int page_mapcount(struct page *page)
{
        /* Common case: _mapcount is stored biased by -1. */
        if (likely(!PageCompound(page)))
                return atomic_read(&page->_mapcount) + 1;

        /* Compound pages take the out-of-line path. */
        return __page_mapcount(page);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* With transparent huge pages both helpers are implemented out of line. */
int total_mapcount(struct page *page);
int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
#else
/* Without transparent huge pages the total is just page_mapcount(). */
static inline int total_mapcount(struct page *page)
{
        return page_mapcount(page);
}

/*
 * Returns page_mapcount(@page) and, when @total_mapcount is non-NULL, stores
 * the same value through it.
 */
static inline int page_trans_huge_mapcount(struct page *page,
                                           int *total_mapcount)
{
        const int nr_maps = page_mapcount(page);

        if (total_mapcount != NULL)
                *total_mapcount = nr_maps;

        return nr_maps;
}
#endif

/* Map the address @x to its struct page, then return that page's compound head. */
static inline struct page *virt_to_head_page(const void *x)
{
        return compound_head(virt_to_page(x));
}

/* Prototypes only; the implementations live outside this header. */
void __put_page(struct page *page);

void put_pages_list(struct list_head *pages);

void split_page(struct page *page, unsigned int order);

/*
 * Compound pages have a destructor function.  Provide a
 * prototype for that function and accessor functions.
 * These are _only_ valid on the head of a compound page.
 */
typedef void compound_page_dtor(struct page *);

/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
enum compound_dtor_id {
        NULL_COMPOUND_DTOR,
        COMPOUND_PAGE_DTOR,
#ifdef CONFIG_HUGETLB_PAGE
        HUGETLB_PAGE_DTOR,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        TRANSHUGE_PAGE_DTOR,
#endif
        /* Number of ids above; also the size of compound_page_dtors[]. */
        NR_COMPOUND_DTORS,
};
/* Destructor table, indexed by enum compound_dtor_id. */
extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS];

/*
 * Record which destructor applies to the compound page headed by @page.
 * The id is kept in page[1], the struct page that follows the head.
 */
static inline void set_compound_page_dtor(struct page *page,
                enum compound_dtor_id compound_dtor)
{
        /* The id must be a valid index into compound_page_dtors[]. */
        VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);
        page[1].compound_dtor = compound_dtor;
}

/*
 * Call the destructor selected by the id stored in page[1].compound_dtor,
 * passing it @page.
 */
static inline void destroy_compound_page(struct page *page)
{
        /* Guard the table lookup below against an out-of-range id. */
        VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
        compound_page_dtors[page[1].compound_dtor](page);
}

/*
 * Order of the compound page headed by @page, read from page[1].
 * Anything that is not a head page reports order 0.
 */
static inline unsigned int compound_order(struct page *page)
{
        return PageHead(page) ? page[1].compound_order : 0;
}

/*
 * Can the page->hpage_pinned_refcount field be used?  That field is in the
 * 3rd page of the compound page, so the smallest (2-page) compound pages
 * cannot support it: the compound order has to be greater than 1.
 */
static inline bool hpage_pincount_available(struct page *page)
{
        struct page *head = compound_head(page);

        if (!PageCompound(head))
                return false;

        return compound_order(head) > 1;
}

/* Read the pin counter reached through compound_pincount_ptr(@head). */
static inline int head_compound_pincount(struct page *head)
{
        return atomic_read(compound_pincount_ptr(head));
}

/*
 * Pin count of the compound page that @page belongs to.  Only valid when
 * hpage_pincount_available() holds for the page.
 */
static inline int compound_pincount(struct page *page)
{
        VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);

        /* The counter is read through the head page. */
        return head_compound_pincount(compound_head(page));
}

/*
 * Record @order for the compound page headed by @page.  Both the order and
 * the matching page count (1 << order) are stored in page[1].
 */
static inline void set_compound_order(struct page *page, unsigned int order)
{
        page[1].compound_order = order;
        page[1].compound_nr = 1U << order;
}

/*
 * Returns the number of pages in this potentially compound page: the value
 * cached in page[1] for a head page, 1 for anything else.
 */
static inline unsigned long compound_nr(struct page *page)
{
        return PageHead(page) ? page[1].compound_nr : 1;
}

/*
 * Returns the number of bytes in this potentially compound page, i.e.
 * PAGE_SIZE scaled by 2^compound_order(page).
 */
static inline unsigned long page_size(struct page *page)
{
        return PAGE_SIZE << compound_order(page);
}

/*
 * Returns the number of bits needed for the number of bytes in a page, i.e.
 * PAGE_SHIFT plus compound_order(page).
 */
static inline unsigned int page_shift(struct page *page)
{
        return PAGE_SHIFT + compound_order(page);
}

/*
 * Prototype only; implemented outside this header.
 * NOTE(review): presumably the destructor behind COMPOUND_PAGE_DTOR - confirm.
 */
void free_compound_page(struct page *page);

#ifdef CONFIG_MMU
/*
 * Apply pte_mkwrite(), but only if the vma says VM_WRITE.  This is done when
 * servicing faults for write access.  In the normal case we always want
 * pte_mkwrite(), but get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        /* Rare case: the mapping is not writable, hand the pte back untouched. */
        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return pte;

        return pte_mkwrite(pte);
}

/* Prototypes only (CONFIG_MMU builds); implemented outside this header. */
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
vm_fault_t finish_fault(struct vm_fault *vmf);
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif

/*
 * Multiple processes may "see" the same page. E.g. for untouched
 * mappings of /dev/null, all processes see the same page full of
 * zeroes, and text pages of executables and shared libraries have
 * only one copy in memory, at most, normally.
 *
 * For the non-reserved pages, page_count(page) denotes a reference count.
 *   page_count() == 0 means the page is free. page->lru is then used for
 *   freelist management in the buddy allocator.
 *   page_count() > 0  means the page has been allocated.
 *
 * Pages are allocated by the slab allocator in order to provide memory
 * to kmalloc and kmem_cache_alloc. In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A pagecache page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular list of
 * the page's disk buffers. PG_private must be set to tell the VM to call
 * into the filesystem to release these pages.
 *
 * A page may belong to an inode's memory mapping. In this case, page->mapping
 * is the pointer to the inode, and page->index is the file offset of the page,
 * in units of PAGE_SIZE.
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. Each user mapping also has a reference to the page.
 *
 * The pagecache pages are stored in a per-mapping radix tree, which is
 * rooted at mapping->i_pages, and indexed by offset.
 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
 * lists, we instead now tag pages as dirty/writeback in the radix tree.
 *
 * All pagecache pages may be subject to I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written back to the inode on disk,
 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
 *   modified may need to be swapped out to swap space and (later) to be read
 *   back into memory.
 */

/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */

/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
/*
 * Bit offset of each field.  Fields are packed downwards from the most
 * significant bit of an unsigned long: each *_PGOFF is the previous offset
 * minus the width of the field itself.
 */
#define SECTIONS_PGOFF                ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF                (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF                (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF        (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF                (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)

/*
 * Define the bit shifts to access each section.  For non-existent
 * sections we define the shift as 0; that plus a 0 mask ensures
 * the compiler will optimise away reference to them.
 */
#define SECTIONS_PGSHIFT        (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT                (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT                (ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT        (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
#define KASAN_TAG_PGSHIFT        (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))

/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
/* ZONEID_PGOFF is the lower of the two offsets of the fields forming the id. */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT                (SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((SECTIONS_PGOFF < ZONES_PGOFF)? \
                                                SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT                (NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((NODES_PGOFF < ZONES_PGOFF)? \
                                                NODES_PGOFF : ZONES_PGOFF)
#endif

#define ZONEID_PGSHIFT                (ZONEID_PGOFF * (ZONEID_SHIFT != 0))

/*
 * Right-aligned masks for each field; apply them after shifting page->flags
 * down by the matching *_PGSHIFT.
 * NOTE(review): LAST_CPUPID_MASK is built from LAST_CPUPID_SHIFT whereas the
 * ZONES/NODES/SECTIONS/KASAN_TAG masks use *_WIDTH - presumably intentional;
 * confirm.
 */
#define ZONES_MASK                ((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK                ((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK                ((1UL << SECTIONS_WIDTH) - 1)
#define LAST_CPUPID_MASK        ((1UL << LAST_CPUPID_SHIFT) - 1)
#define KASAN_TAG_MASK                ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK                ((1UL << ZONEID_SHIFT) - 1)

/* Extract the zone number stored in the ZONES field of page->flags. */
static inline enum zone_type page_zonenum(const struct page *page)
{
        /* NOTE(review): presumably a concurrency-checker annotation for the zone bits - confirm. */
        ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
        return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}

#ifdef CONFIG_ZONE_DEVICE
/* True when the zone number encoded in @page's flags is ZONE_DEVICE. */
static inline bool is_zone_device_page(const struct page *page)
{
        return page_zonenum(page) == ZONE_DEVICE;
}
/* Prototype only; implemented outside this header. */
extern void memmap_init_zone_device(struct zone *, unsigned long,
                                    unsigned long, struct dev_pagemap *);
#else
/* Stub for !CONFIG_ZONE_DEVICE builds: always reports false. */
static inline bool is_zone_device_page(const struct page *page)
{
        return false;
}
#endif

#ifdef CONFIG_DEV_PAGEMAP_OPS
void free_devmap_managed_page(struct page *page);
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);

/*
 * Is @page a devmap managed page?  That requires the devmap_managed_key
 * static branch to be enabled, the page to be a ZONE_DEVICE page, and its
 * pgmap type to be MEMORY_DEVICE_PRIVATE or MEMORY_DEVICE_FS_DAX.
 */
static inline bool page_is_devmap_managed(struct page *page)
{
        if (!static_branch_unlikely(&devmap_managed_key))
                return false;
        if (!is_zone_device_page(page))
                return false;

        return page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
               page->pgmap->type == MEMORY_DEVICE_FS_DAX;
}

void put_devmap_managed_page(struct page *page);

#else /* CONFIG_DEV_PAGEMAP_OPS */
/* Stub for !CONFIG_DEV_PAGEMAP_OPS builds: no page is devmap managed. */
static inline bool page_is_devmap_managed(struct page *page)
{
        return false;
}

/* Stub for !CONFIG_DEV_PAGEMAP_OPS builds: nothing to do. */
static inline void put_devmap_managed_page(struct page *page)
{
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */

/*
 * True for a ZONE_DEVICE page whose pgmap type is MEMORY_DEVICE_PRIVATE.
 * Always false unless both CONFIG_DEV_PAGEMAP_OPS and CONFIG_DEVICE_PRIVATE
 * are enabled.
 */
static inline bool is_device_private_page(const struct page *page)
{
        if (!IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) ||
            !IS_ENABLED(CONFIG_DEVICE_PRIVATE))
                return false;

        if (!is_zone_device_page(page))
                return false;

        return page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

/*
 * True for a ZONE_DEVICE page whose pgmap type is MEMORY_DEVICE_PCI_P2PDMA.
 * Always false unless both CONFIG_DEV_PAGEMAP_OPS and CONFIG_PCI_P2PDMA are
 * enabled.
 */
static inline bool is_pci_p2pdma_page(const struct page *page)
{
        if (!IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) ||
            !IS_ENABLED(CONFIG_PCI_P2PDMA))
                return false;

        if (!is_zone_device_page(page))
                return false;

        return page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}

/*
 * 127: arbitrary random number, small enough to assemble well.
 *
 * In unsigned arithmetic "count + 127 <= 127" holds exactly when the signed
 * count lies in [-127, 0]: either zero or a small negative value.
 */
#define page_ref_zero_or_close_to_overflow(page) \
        ((unsigned int) page_ref_count(page) + 127u <= 127u)

/*
 * Take an additional reference on @page.  The reference is always taken on
 * the compound head, so tail pages are redirected first.
 */
static inline void get_page(struct page *page)
{
        page = compound_head(page);
        /*
         * Getting a normal page or the head of a compound page
         * requires to already have an elevated page->_refcount.
         */
        VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
        page_ref_inc(page);
}

/* Prototype only; implemented outside this header. */
bool __must_check try_grab_page(struct page *page, unsigned int flags);

/*
 * Like get_page(), but refuses (with a one-time warning) instead of bumping
 * the count when the head page's refcount is zero or negative.
 * Returns true if the reference was taken.
 */
static inline __must_check bool try_get_page(struct page *page)
{
        page = compound_head(page);
        if (WARN_ON_ONCE(page_ref_count(page) <= 0))
                return false;
        page_ref_inc(page);
        return true;
}

/*
 * Drop one reference on @page (accounted on its compound head) and hand the
 * page to __put_page() once the count reaches zero.
 */
static inline void put_page(struct page *page)
{
        struct page *head = compound_head(page);

        /*
         * Devmap managed pages are special: the refcount transition from 2 to
         * 1 has to be caught, because a refcount of one means the page is
         * free and the device driver must be informed through a callback.
         * See include/linux/memremap.h and HMM for details.
         */
        if (page_is_devmap_managed(head)) {
                put_devmap_managed_page(head);
                return;
        }

        if (put_page_testzero(head))
                __put_page(head);
}

/*
 * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
 * the page's refcount so that two separate items are tracked: the original page
 * reference count, and also a new count of how many pin_user_pages() calls were
 * made against the page. ("gup-pinned" is another term for the latter).
 *
 * With this scheme, pin_user_pages() becomes special: such pages are marked as
 * distinct from normal pages. As such, the unpin_user_page() call (and its
 * variants) must be used in order to release gup-pinned pages.
 *
 * Choice of value:
 *
 * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
 * counts with respect to pin_user_pages() and unpin_user_page() becomes
 * simpler, due to the fact that adding an even power of two to the page
 * refcount has the effect of using only the upper N bits, for the code that
 * counts up using the bias value. This means that the lower bits are left for
 * the exclusive use of the original code that increments and decrements by one
 * (or at least, by much smaller values than the bias value).
 *
 * Of course, once the lower bits overflow into the upper bits (and this is
 * OK, because subtraction recovers the original values), then visual inspection
 * no longer suffices to directly view the separate counts. However, for normal
 * applications that don't have huge page reference counts, this won't be an
 * issue.
 *
 * Locking: the lockless algorithm described in page_cache_get_speculative()
 * and page_cache_gup_pin_speculative() provides safe operation for
 * get_user_pages and page_mkclean and other calls that race to set up page
 * table entries.
 */
/* Amount (2^10 = 1024) by which a pin is counted in the page refcount. */
#define GUP_PIN_COUNTING_BIAS (1U << 10)

/*
 * Release helpers for gup-pinned pages (see the comment above
 * GUP_PIN_COUNTING_BIAS).  Prototypes only; implemented outside this header.
 */
void unpin_user_page(struct page *page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);

/**
 * page_maybe_dma_pinned() - report if a page is pinned for DMA.
 * @page:        pointer to page to be queried.
 *
 * This function checks if a page has been pinned via a call to
 * pin_user_pages*().
 *
 * For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
 * because it means "definitely not pinned for DMA", but true means "probably
 * pinned for DMA, but possibly a false positive due to having at least
 * GUP_PIN_COUNTING_BIAS worth of normal page references".
 *
 * False positives are OK, because: a) it's unlikely for a page to get that many
 * refcounts, and b) all the callers of this routine are expected to be able to
 * deal gracefully with a false positive.
 *
 * For huge pages, the result will be exactly correct. That's because we have
 * more tracking data available: the 3rd struct page in the compound page is
 * used to track the pincount (instead of using the GUP_PIN_COUNTING_BIAS
 * scheme).
 *
 * For more information, please see Documentation/core-api/pin_user_pages.rst.
 *
 * Return: True, if it is likely that the page has been "dma-pinned".
 *         False, if the page is definitely not dma-pinned.
 */
static inline bool page_maybe_dma_pinned(struct page *page)
{
        unsigned int refs;

        /* Exact answer when the dedicated pin counter exists. */
        if (hpage_pincount_available(page))
                return compound_pincount(page) > 0;

        /*
         * page_ref_count() is signed.  If that refcount overflows, then
         * page_ref_count() returns a negative value, and callers will avoid
         * further incrementing the refcount.
         *
         * For that overflow case, reinterpret the value as unsigned so the
         * sign bit lets us count a little bit higher, and thus still get an
         * accurate result.
         */
        refs = (unsigned int)page_ref_count(compound_head(page));

        return refs >= GUP_PIN_COUNTING_BIAS;
}

/*
 * SECTION_IN_PAGE_FLAGS is defined only for CONFIG_SPARSEMEM builds that do
 * not also enable CONFIG_SPARSEMEM_VMEMMAP.
 */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif

/*
 * The identification function is mainly used by the buddy allocator for
 * determining if two pages could be buddies. We are not really identifying
 * the zone since we could be using the section number id if we do not have
 * node id available in page flags.
 * We only guarantee that it will return the same value for two combinable
 * pages in a zone.
 */
static inline int page_zone_id(struct page *page)
{
        /* Shift the id field down to bit 0, then keep ZONEID_SHIFT bits of it. */
        return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}

#ifdef NODE_NOT_IN_PAGE_FLAGS
/* The node id is not kept in page->flags here; the lookup is out of line. */
extern int page_to_nid(const struct page *page);
#else
/* Extract the node id stored in the NODES field of page->flags. */
static inline int page_to_nid(const struct page *page)
{
        /* NOTE(review): const is cast away presumably because PF_POISONED_CHECK() wants a non-const page - confirm. */
        struct page *p = (struct page *)page;

        return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif

#ifdef CONFIG_NUMA_BALANCING
/*
 * Pack @cpu and @pid into one cpupid value: the masked cpu sits above
 * LAST__PID_SHIFT, the masked pid occupies the low bits.
 */
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
        const int cpu_bits = (cpu & LAST__CPU_MASK) << LAST__PID_SHIFT;
        const int pid_bits = pid & LAST__PID_MASK;

        return cpu_bits | pid_bits;
}

/* Extract the pid bits (the low LAST__PID_MASK bits) from @cpupid. */
static inline int cpupid_to_pid(int cpupid)
{
        return cpupid & LAST__PID_MASK;
}

/* Extract the cpu bits, stored above LAST__PID_SHIFT, from @cpupid. */
static inline int cpupid_to_cpu(int cpupid)
{
        return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
}

/* Node of the cpu encoded in @cpupid, as reported by cpu_to_node(). */
static inline int cpupid_to_nid(int cpupid)
{
        const int cpu = cpupid_to_cpu(cpupid);

        return cpu_to_node(cpu);
}

/* True when the pid field of @cpupid is all ones, i.e. equals a masked -1. */
static inline bool cpupid_pid_unset(int cpupid)
{
        return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
}

/* True when the cpu field of @cpupid is all ones, i.e. equals a masked -1. */
static inline bool cpupid_cpu_unset(int cpupid)
{
        return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
}

/*
 * Compare @task_pid with the pid stored in @cpupid.  Only the bits covered by
 * LAST__PID_MASK take part in the comparison.
 */
static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
{
        return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
}

/*
 * Check whether @task's pid matches the pid bits encoded in @cpupid.  @task is
 * parenthesized so that any pointer expression (e.g. "p + 1") expands safely.
 */
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid((task)->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
/* Variant that keeps the last cpupid in the separate page->_last_cpupid field. */

/* Store the masked @cpupid with xchg() and return what xchg() hands back. */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
        return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
}

/* Read the stored last cpupid. */
static inline int page_cpupid_last(struct page *page)
{
        return page->_last_cpupid;
}
/* Reset the field to all ones (a masked -1). */
static inline void page_cpupid_reset_last(struct page *page)
{
        page->_last_cpupid = -1 & LAST_CPUPID_MASK;
}
#else
/* Variant that keeps the last cpupid inside page->flags. */

/* Extract the LAST_CPUPID field from page->flags. */
static inline int page_cpupid_last(struct page *page)
{
        return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}

/* Out of line in this variant. */
extern int page_cpupid_xchg_last(struct page *page, int cpupid);

/* Set every bit of the LAST_CPUPID field in page->flags. */
static inline void page_cpupid_reset_last(struct page *page)
{
        page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
#else /* !CONFIG_NUMA_BALANCING */
/*
 * Stubs for !CONFIG_NUMA_BALANCING builds.  The two page based accessors
 * report the page's node id; the conversion helpers return -1; nothing is
 * stored anywhere.
 */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
        return page_to_nid(page); /* XXX */
}

static inline int page_cpupid_last(struct page *page)
{
        return page_to_nid(page); /* XXX */
}

static inline int cpupid_to_nid(int cpupid)
{
        return -1;
}

static inline int cpupid_to_pid(int cpupid)
{
        return -1;
}

static inline int cpupid_to_cpu(int cpupid)
{
        return -1;
}

static inline int cpu_pid_to_cpupid(int nid, int pid)
{
        return -1;
}

/* With no cpupid tracking the pid always counts as unset. */
static inline bool cpupid_pid_unset(int cpupid)
{
        return true;
}

static inline void page_cpupid_reset_last(struct page *page)
{
}

/* A function here, whereas the CONFIG_NUMA_BALANCING variant is a macro. */
static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
        return false;
}
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_KASAN_SW_TAGS

/*
 * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid
 * setting tags for all pages to native kernel tag value 0xff, as the default
 * value 0x00 maps to 0xff.
 */

/*
 * Read the KASAN tag of @page.  The field in page->flags holds the tag
 * xor'ed with 0xff, so the stored value is flipped back before returning.
 */
static inline u8 page_kasan_tag(const struct page *page)
{
        const u8 stored = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;

        return stored ^ 0xff;
}

/*
 * Store @tag as the KASAN tag of @page.  The tag is xor'ed with 0xff before
 * it is written into the KASAN_TAG field of page->flags.
 */
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
        const u8 stored = tag ^ 0xff;

        /* Clear the old field contents, then merge in the new value. */
        page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
        page->flags |= (stored & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
}

/*
 * Set the tag of @page to 0xff.  Because tags are stored xor'ed with 0xff,
 * this leaves the field in page->flags holding 0.
 */
static inline void page_kasan_tag_reset(struct page *page)
{
        page_kasan_tag_set(page, 0xff);
}
#else
/*
 * Stubs for !CONFIG_KASAN_SW_TAGS builds: every page reports tag 0xff, and
 * setting or resetting a tag does nothing.
 */
static inline u8 page_kasan_tag(const struct page *page)
{
        return 0xff;
}

static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
static inline void page_kasan_tag_reset(struct page *page) { }
#endif

static inline struct zone *page_zone(const struct page *page)
{
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}

/* Node data (NODE_DATA()) of the node that @page belongs to. */
static inline pg_data_t *page_pgdat(const struct page *page)
{
        return NODE_DATA(page_to_nid(page));
}

#ifdef SECTION_IN_PAGE_FLAGS
/* Replace the SECTIONS field of page->flags with @section (masked). */
static inline void set_page_section(struct page *page, unsigned long section)
{
        /* Clear the old field contents, then merge in the new value. */
        page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
        page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}

/* Extract the section number stored in the SECTIONS field of page->flags. */
static inline unsigned long page_to_section(const struct page *page)
{
        return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}
#endif

/* Replace the ZONES field of page->flags with @zone (masked). */
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
        /* Clear the old field contents, then merge in the new value. */
        page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
        page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}

/* Replace the NODES field of page->flags with @node (masked). */
static inline void set_page_node(struct page *page, unsigned long node)
{
        /* Clear the old field contents, then merge in the new value. */
        page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
        page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}

/*
 * Record the zone and node of @page in page->flags.  When
 * SECTION_IN_PAGE_FLAGS is defined, the section number derived from @pfn is
 * recorded as well; otherwise @pfn is unused.
 */
static inline void set_page_links(struct page *page, enum zone_type zone,
        unsigned long node, unsigned long pfn)
{
        set_page_zone(page, zone);
        set_page_node(page, node);
#ifdef SECTION_IN_PAGE_FLAGS
        set_page_section(page, pfn_to_section_nr(pfn));
#endif
}

#ifdef CONFIG_MEMCG
/* Plain read of the memory cgroup pointer stored in @page. */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
        return page->mem_cgroup;
}
/*
 * Same pointer, read with READ_ONCE(); warns once if the RCU read lock is
 * not held.
 */
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
        WARN_ON_ONCE(!rcu_read_lock_held());
        return READ_ONCE(page->mem_cgroup);
}
#else
/* Stubs for !CONFIG_MEMCG builds: there is never a memory cgroup to report. */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
        return NULL;
}
/* The RCU locking check is kept even though the result is always NULL. */
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
        WARN_ON_ONCE(!rcu_read_lock_held());
        return NULL;
}
#endif

/*
 * Some inline functions in vmstat.h depend on page_zone()
 */
#include <linux/vmstat.h>

/* Address of @page as computed by page_to_virt(). */
static __always_inline void *lowmem_page_address(const struct page *page)
{
        return page_to_virt(page);
}

/*
 * page_address(), set_page_address() and page_address_init() come in three
 * flavours, selected below:
 *  - WANT_PAGE_VIRTUAL:   the address is kept in page->virtual;
 *  - HASHED_PAGE_VIRTUAL: (CONFIG_HIGHMEM without WANT_PAGE_VIRTUAL) all
 *                         three are implemented out of line;
 *  - neither:             page_address() is lowmem_page_address() and the
 *                         other two expand to empty statements.
 */
#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

#if defined(WANT_PAGE_VIRTUAL)
static inline void *page_address(const struct page *page)
{
        return page->virtual;
}
static inline void set_page_address(struct page *page, void *address)
{
        page->virtual = address;
}
#define page_address_init()  do { } while(0)
#endif

#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(const struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address)  do { } while(0)
#define page_address_init()  do { } while(0)
#endif

/* Prototypes only; the implementations live outside this header. */
extern void *page_rmapping(struct page *page);
extern struct anon_vma *page_anon_vma(struct page *page);
extern struct address_space *page_mapping(struct page *page);

/* Slow path used by page_file_mapping() for swapcache pages. */
extern struct address_space *__page_file_mapping(struct page *);

/*
 * Return the address_space of @page: page->mapping for regular pages,
 * whatever __page_file_mapping() reports for swapcache pages.
 */
static inline
struct address_space *page_file_mapping(struct page *page)
{
        if (likely(!PageSwapCache(page)))
                return page->mapping;

        return __page_file_mapping(page);
}

extern pgoff_t __page_file_index(struct page *page);

/*
 * Return the pagecache index of the passed page.  Regular pagecache pages
 * use ->index whereas swapcache pages use swp_offset(->private), which is
 * obtained through __page_file_index().
 */
static inline pgoff_t page_index(struct page *page)
{
        if (likely(!PageSwapCache(page)))
                return page->index;

        return __page_file_index(page);
}

/*
 * Prototypes only; the implementations live outside this header.
 * page_mapping() is not repeated here: it is already declared (as extern)
 * earlier in this header.
 */
bool page_mapped(struct page *page);
struct address_space *page_mapping_file(struct page *page);

/*
 * Return true only if the page has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 *
 * The marker is page->index == -1UL, the value written by
 * set_page_pfmemalloc().
 */
static inline bool page_is_pfmemalloc(struct page *page)
{
        /*
         * Page index cannot be this large so this must be
         * a pfmemalloc page.
         */
        return page->index == -1UL;
}

/*
 * Only to be called by the page allocator on a freshly allocated
 * page.  Writes the -1UL marker that page_is_pfmemalloc() tests for into
 * page->index.
 */
static inline void set_page_pfmemalloc(struct page *page)
{
        page->index = -1UL;
}

/* Remove the pfmemalloc marker by resetting page->index to 0. */
static inline void clear_page_pfmemalloc(struct page *page)
{
        page->index = 0;
}

/*
 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
 */
extern void pagefault_out_of_memory(void);

/* Keep only the bits of @p that PAGE_MASK clears. */
#define offset_in_page(p)        ((unsigned long)(p) & ~PAGE_MASK)
/* Same idea relative to thp_size(page): keep the bits of @p below that size. */
#define offset_in_thp(page, p)        ((unsigned long)(p) & (thp_size(page) - 1))

/*
 * Flags passed to show_mem() and show_free_areas() to suppress output in
 * various contexts.
 */
#define SHOW_MEM_FILTER_NODES                (0x0001u)        /* disallowed nodes */

/*
 * Prototype only; implemented outside this header.
 * NOTE(review): @flags presumably takes SHOW_MEM_* filter bits - confirm.
 */
extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);

#ifdef CONFIG_MMU
/* Implemented out of line when an MMU is present. */
extern bool can_do_mlock(void);
#else
/* Stub for !CONFIG_MMU builds: always reports false. */
static inline bool can_do_mlock(void) { return false; }
#endif
/* Prototypes only; implemented outside this header. */
extern int user_shm_lock(size_t, struct user_struct *);
extern void user_shm_unlock(size_t, struct user_struct *);

/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 * Per its own comment, check_mapping only takes effect when it is set.
 */
struct zap_details {
        struct address_space *check_mapping;        /* Check page->mapping if set */
        pgoff_t        first_index;                        /* Lowest page->index to unmap */
        pgoff_t last_index;                        /* Highest page->index to unmap */
        struct page *single_page;                /* Locked page to be unmapped */
};

struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd);

void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                  unsigned long size);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
                    unsigned long size);
void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long start, unsigned long end);

struct mmu_notifier_range;

void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
                          struct mmu_notifier_range *range, pte_t **ptepp,
                          pmd_t **pmdpp, spinlock_t **ptlp);
int follow_pte(struct mm_struct *mm, unsigned long address,
               pte_t **ptepp, spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
        unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
                unsigned int flags, unsigned long *prot, resource_size_t *phys);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);

extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
int truncate_inode_page(struct address_space *mapping, struct page *page);
int generic_error_remove_page(struct address_space *mapping, struct page *page);
int invalidate_inode_page(struct page *page);

#ifdef CONFIG_MMU
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                  unsigned long address, unsigned int flags,
                                  struct pt_regs *regs);
extern int fixup_user_fault(struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
void unmap_mapping_page(struct page *page);
void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows);
#else
/*
 * !CONFIG_MMU stub.  Calling it is treated as a bug; VM_FAULT_SIGBUS is
 * the value handed back should BUG() return.
 */
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                         unsigned long address, unsigned int flags,
                                         struct pt_regs *regs)
{
        /* should never happen if there's no MMU */
        BUG();
        return VM_FAULT_SIGBUS;
}
/*
 * !CONFIG_MMU stub.  Calling it is treated as a bug; -EFAULT is the value
 * handed back should BUG() return.
 */
static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
                unsigned int fault_flags, bool *unlocked)
{
        /* should never happen if there's no MMU */
        BUG();
        return -EFAULT;
}
/* !CONFIG_MMU: the unmap_mapping_*() helpers are empty no-ops. */
static inline void unmap_mapping_page(struct page *page) { }
static inline void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif

/*
 * Convenience wrapper: unmap_mapping_range() over the hole described by
 * @holebegin/@holelen in @mapping, with the even_cows argument fixed at 0.
 */
static inline void unmap_shared_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen)
{
        const int even_cows = 0;

        unmap_mapping_range(mapping, holebegin, holelen, even_cows);
}

extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long addr, void *buf, int len, unsigned int gup_flags);

long get_user_pages_remote(struct mm_struct *mm,
                            unsigned long start, unsigned long nr_pages,
                            unsigned int gup_flags, struct page **pages,
                            struct vm_area_struct **vmas, int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           struct vm_area_struct **vmas, int *locked);
long get_user_pages(unsigned long start, unsigned long nr_pages,
                            unsigned int gup_flags, struct page **pages,
                            struct vm_area_struct **vmas);
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages,
                    struct vm_area_struct **vmas);
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages, int *locked);
long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages, int *locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);

int get_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);

int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim);

/* Container for pinned pfns / pages */
struct frame_vector {
        unsigned int nr_allocated;        /* Number of frames we have space for */
        unsigned int nr_frames;        /* Number of frames stored in ptrs array */
        bool got_ref;                /* Did we pin pages by getting page ref? */
        bool is_pfns;                /* Does array contain pages or pfns? */
        void *ptrs[];                /* Array of pinned pfns / pages. Use
                                 * frame_vector_pages() or frame_vector_pfns()
                                 * for access */
};

struct frame_vector *frame_vector_create(unsigned int nr_frames);
void frame_vector_destroy(struct frame_vector *vec);
int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
                     unsigned int gup_flags, struct frame_vector *vec);
void put_vaddr_frames(struct frame_vector *vec);
int frame_vector_to_pages(struct frame_vector *vec);
void frame_vector_to_pfns(struct frame_vector *vec);

/* Number of frames currently stored in @vec (its nr_frames member). */
static inline unsigned int frame_vector_count(struct frame_vector *vec)
{
        return vec->nr_frames;
}

/*
 * View the contents of @vec as an array of struct page pointers.
 *
 * If the vector currently holds pfns it is first converted with
 * frame_vector_to_pages(); when that conversion reports an error the
 * error is returned encoded with ERR_PTR().
 */
static inline struct page **frame_vector_pages(struct frame_vector *vec)
{
        int ret;

        /* Already holding pages: nothing to convert. */
        if (!vec->is_pfns)
                return (struct page **)(vec->ptrs);

        ret = frame_vector_to_pages(vec);
        if (ret)
                return ERR_PTR(ret);

        return (struct page **)(vec->ptrs);
}

/*
 * View the contents of @vec as an array of pfns, converting it with
 * frame_vector_to_pfns() first when it currently holds pages.
 */
static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
{
        unsigned long *pfns;

        if (!vec->is_pfns)
                frame_vector_to_pfns(vec);

        pfns = (unsigned long *)(vec->ptrs);
        return pfns;
}

struct kvec;
int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
                        struct page **pages);
int get_kernel_page(unsigned long start, int write, struct page **pages);
struct page *get_dump_page(unsigned long addr);

extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
extern void do_invalidatepage(struct page *page, unsigned int offset,
                              unsigned int length);

void __set_page_dirty(struct page *, struct address_space *, int warn);
int __set_page_dirty_nobuffers(struct page *page);
int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
                                struct page *page);
void account_page_dirtied(struct page *page, struct address_space *mapping);
void account_page_cleaned(struct page *page, struct address_space *mapping,
                          struct bdi_writeback *wb);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
void __cancel_dirty_page(struct page *page);
/*
 * Cancel the dirty state of @page via __cancel_dirty_page(), but only when
 * PageDirty() says the page is dirty.
 */
static inline void cancel_dirty_page(struct page *page)
{
        /* Clean page: skip the atomic ops, locking, etc. entirely. */
        if (!PageDirty(page))
                return;

        __cancel_dirty_page(page);
}
int clear_page_dirty_for_io(struct page *page);

int get_cmdline(struct task_struct *task, char *buffer, int buflen);

extern unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len,
                bool need_rmap_locks);

/*
 * Flags used by change_protection().  For now we make it a bitmap so
 * that we can pass in multiple flags just like parameters.  However,
 * for now all the callers use only one of the flags at a time.
 */
/* Whether we should allow dirty bit accounting */
#define  MM_CP_DIRTY_ACCT                  (1UL << 0)
/* Whether this protection change is for NUMA hints */
#define  MM_CP_PROT_NUMA                   (1UL << 1)
/* Whether this change is for write protecting */
#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
                                            MM_CP_UFFD_WP_RESOLVE)

extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
                              unsigned long end, pgprot_t newprot,
                              unsigned long cp_flags);
extern int mprotect_fixup(struct vm_area_struct *vma,
                          struct vm_area_struct **pprev, unsigned long start,
                          unsigned long end, unsigned long newflags);

/*
 * The *_fast_only() variants don't attempt to fault and will return short.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages);

/*
 * Single-page form of get_user_pages_fast_only(): true only when exactly
 * one page was obtained for @addr and stored through @pagep.
 */
static inline bool get_user_page_fast_only(unsigned long addr,
                        unsigned int gup_flags, struct page **pagep)
{
        int nr_got = get_user_pages_fast_only(addr, 1, gup_flags, pagep);

        return nr_got == 1;
}
/*
 * Per-process (per-mm_struct) statistics.
 *
 * Read counter @member of @mm->rss_stat.
 */
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
        long count = atomic_long_read(&mm->rss_stat.count[member]);

#ifdef SPLIT_RSS_COUNTING
        /*
         * The counter is updated asynchronously and may transiently go
         * negative.  Users never expect such a value, so clamp it to 0.
         */
        count = (count < 0) ? 0 : count;
#endif
        return (unsigned long)count;
}

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);

/*
 * Atomically add @value to counter @member of @mm->rss_stat and pass the
 * resulting total to mm_trace_rss_stat().
 */
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
        long new_total;

        new_total = atomic_long_add_return(value, &mm->rss_stat.count[member]);
        mm_trace_rss_stat(mm, member, new_total);
}

/*
 * Atomically increment counter @member of @mm->rss_stat and pass the
 * resulting total to mm_trace_rss_stat().
 */
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
        long new_total;

        new_total = atomic_long_inc_return(&mm->rss_stat.count[member]);
        mm_trace_rss_stat(mm, member, new_total);
}

/*
 * Atomically decrement counter @member of @mm->rss_stat and pass the
 * resulting total to mm_trace_rss_stat().
 */
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
        long new_total;

        new_total = atomic_long_dec_return(&mm->rss_stat.count[member]);
        mm_trace_rss_stat(mm, member, new_total);
}

/*
 * Counter selection for a page already known not to be PageAnon:
 * MM_SHMEMPAGES for swap-backed pages, MM_FILEPAGES otherwise.
 */
static inline int mm_counter_file(struct page *page)
{
        return PageSwapBacked(page) ? MM_SHMEMPAGES : MM_FILEPAGES;
}

/*
 * Pick the rss counter @page is accounted to: MM_ANONPAGES for anonymous
 * pages, otherwise whatever mm_counter_file() selects.
 */
static inline int mm_counter(struct page *page)
{
        return PageAnon(page) ? MM_ANONPAGES : mm_counter_file(page);
}

/* Sum of the MM_FILEPAGES, MM_ANONPAGES and MM_SHMEMPAGES counters of @mm. */
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
        unsigned long file_pages = get_mm_counter(mm, MM_FILEPAGES);
        unsigned long anon_pages = get_mm_counter(mm, MM_ANONPAGES);
        unsigned long shmem_pages = get_mm_counter(mm, MM_SHMEMPAGES);

        return file_pages + anon_pages + shmem_pages;
}

/* Larger of the recorded @mm->hiwater_rss and the current get_mm_rss(). */
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long recorded = mm->hiwater_rss;
        unsigned long current_rss = get_mm_rss(mm);

        return max(recorded, current_rss);
}

/* Larger of the recorded @mm->hiwater_vm and the current @mm->total_vm. */
static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
{
        unsigned long recorded = mm->hiwater_vm;
        unsigned long current_vm = mm->total_vm;

        return max(recorded, current_vm);
}

/* Raise @mm->hiwater_rss to the current rss if the current rss is larger. */
static inline void update_hiwater_rss(struct mm_struct *mm)
{
        unsigned long current_rss = get_mm_rss(mm);

        if (current_rss > mm->hiwater_rss)
                mm->hiwater_rss = current_rss;
}

/* Raise @mm->hiwater_vm to @mm->total_vm if total_vm is larger. */
static inline void update_hiwater_vm(struct mm_struct *mm)
{
        unsigned long current_vm = mm->total_vm;

        if (current_vm > mm->hiwater_vm)
                mm->hiwater_vm = current_vm;
}

/* Restart high-water tracking: set @mm->hiwater_rss to the current rss. */
static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long current_rss = get_mm_rss(mm);

        mm->hiwater_rss = current_rss;
}

/*
 * Fold the rss high-water mark of @mm into *@maxrss: *@maxrss is raised to
 * get_mm_hiwater_rss(mm) when that value is larger, and left alone otherwise.
 */
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
                                         struct mm_struct *mm)
{
        unsigned long peak = get_mm_hiwater_rss(mm);

        if (peak > *maxrss)
                *maxrss = peak;
}

#if defined(SPLIT_RSS_COUNTING)
void sync_mm_rss(struct mm_struct *mm);
#else
/* Without SPLIT_RSS_COUNTING, sync_mm_rss() is an empty no-op. */
static inline void sync_mm_rss(struct mm_struct *mm)
{
}
#endif

#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
/* Fallback without CONFIG_ARCH_HAS_PTE_SPECIAL: always returns 0. */
static inline int pte_special(pte_t pte)
{
        return 0;
}

/* Fallback without CONFIG_ARCH_HAS_PTE_SPECIAL: returns @pte unchanged. */
static inline pte_t pte_mkspecial(pte_t pte)
{
        return pte;
}
#endif

#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
/* Fallback without CONFIG_ARCH_HAS_PTE_DEVMAP: always returns 0. */
static inline int pte_devmap(pte_t pte)
{
        return 0;
}
#endif

int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);

extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                               spinlock_t **ptl);
/*
 * Wrapper that returns the result of __get_locked_pte(mm, addr, ptl).
 * The call is made inside __cond_lock(*ptl, ...), presumably so that lock
 * checkers see *ptl as conditionally acquired -- confirm against the
 * definition of __cond_lock.
 */
static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                                    spinlock_t **ptl)
{
        pte_t *ptep;
        __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
        return ptep;
}

#ifdef __PAGETABLE_P4D_FOLDED
/* With __PAGETABLE_P4D_FOLDED this is a stub that always returns 0. */
static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                                                unsigned long address)
{
        return 0;
}
#else
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
#endif

#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
/* Folded PUD level or no MMU: stub that always returns 0. */
static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                                                unsigned long address)
{
        return 0;
}
/* Folded PUD level or no MMU: the PUD accounting helpers are empty. */
static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
static inline void mm_dec_nr_puds(struct mm_struct *mm) {}

#else
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);

/*
 * Add the size of one PUD table (PTRS_PER_PUD entries of pud_t) to
 * @mm->pgtables_bytes, unless mm_pud_folded() reports the level as folded.
 */
static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_add(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}

/*
 * Subtract the size of one PUD table (PTRS_PER_PUD entries of pud_t) from
 * @mm->pgtables_bytes, unless mm_pud_folded() reports the level as folded.
 */
static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}
#endif

#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
/* Folded PMD level or no MMU: stub that always returns 0. */
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
                                                unsigned long address)
{
        return 0;
}

/* Folded PMD level or no MMU: the PMD accounting helpers are empty. */
static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}

#else
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);

/*
 * Add the size of one PMD table (PTRS_PER_PMD entries of pmd_t) to
 * @mm->pgtables_bytes, unless mm_pmd_folded() reports the level as folded.
 */
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}

/*
 * Subtract the size of one PMD table (PTRS_PER_PMD entries of pmd_t) from
 * @mm->pgtables_bytes, unless mm_pmd_folded() reports the level as folded.
 */
static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}
#endif

#ifdef CONFIG_MMU
/* Start @mm's page-table byte accounting at zero. */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->pgtables_bytes, 0);
}

/* Current value of @mm->pgtables_bytes. */
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return atomic_long_read(&mm->pgtables_bytes);
}

/* Account one PTE table (PTRS_PER_PTE entries of pte_t) to @mm. */
static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
        atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}

/* Remove one PTE table (PTRS_PER_PTE entries of pte_t) from @mm's accounting. */
static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
        atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}
#else

/* !CONFIG_MMU: page-table byte accounting is stubbed out; the total reads as 0. */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return 0;
}

static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
#endif

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
int __pte_alloc_kernel(pmd_t *pmd);

#if defined(CONFIG_MMU)

/*
 * Return the p4d entry for @address under @pgd.  When *@pgd is none,
 * __p4d_alloc() is called first; a non-zero result from it makes this
 * function return NULL.
 */
static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                unsigned long address)
{
        if (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address))
                return NULL;

        return p4d_offset(pgd, address);
}

/*
 * Return the pud entry for @address under @p4d.  When *@p4d is none,
 * __pud_alloc() is called first; a non-zero result from it makes this
 * function return NULL.
 */
static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                unsigned long address)
{
        if (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address))
                return NULL;

        return pud_offset(p4d, address);
}

/*
 * Return the pmd entry for @address under @pud.  When *@pud is none,
 * __pmd_alloc() is called first; a non-zero result from it makes this
 * function return NULL.
 */
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        if (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))
                return NULL;

        return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
extern bool ptlock_alloc(struct page *page);
extern void ptlock_free(struct page *page);

/* ALLOC_SPLIT_PTLOCKS: page->ptl is itself the pointer to the lock. */
static inline spinlock_t *ptlock_ptr(struct page *page)
{
        return page->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
/*
 * !ALLOC_SPLIT_PTLOCKS: the lock is addressed as &page->ptl, so the
 * cache-init, alloc and free hooks below have nothing to do.
 */
static inline void ptlock_cache_init(void)
{
}

/* Nothing to allocate; always reports true. */
static inline bool ptlock_alloc(struct page *page)
{
        return true;
}

/* Nothing to free. */
static inline void ptlock_free(struct page *page)
{
}

/* The lock is the ptl member of @page itself. */
static inline spinlock_t *ptlock_ptr(struct page *page)
{
        return &page->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */

/*
 * Split PTE locks: the lock is the one attached to the page returned by
 * pmd_page(*pmd).  @mm is not used in this variant.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return ptlock_ptr(pmd_page(*pmd));
}

/*
 * Set up the split page-table lock of @page.
 *
 * Returns false when ptlock_alloc() fails; otherwise the lock is
 * initialised with spin_lock_init() and true is returned.
 */
static inline bool ptlock_init(struct page *page)
{
        /*
         * prep_new_page() initializes page->private (and therefore
         * page->ptl) with 0.  Make sure nobody started using it in between.
         *
         * That can happen if an arch tries to use slab for page table
         * allocation: slab code uses page->slab_cache, which shares
         * storage with page->ptl.
         */
        VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);

        if (ptlock_alloc(page)) {
                spin_lock_init(ptlock_ptr(page));
                return true;
        }

        return false;
}

#else        /* !USE_SPLIT_PTE_PTLOCKS */
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
 * The ptlock_*() hooks are therefore empty, and ptlock_init() always
 * reports true.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct page *page) { return true; }
static inline void ptlock_free(struct page *page) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */

/* Run ptlock_cache_init() followed by pgtable_cache_init(). */
static inline void pgtable_init(void)
{
        ptlock_cache_init();
        pgtable_cache_init();
}

/*
 * Constructor for a PTE page-table @page: initialise its lock, mark it
 * with __SetPageTable() and bump the NR_PAGETABLE zone counter.
 *
 * Returns false, without touching the page flags or the counter, when
 * ptlock_init() fails.
 */
static inline bool pgtable_pte_page_ctor(struct page *page)
{
        if (ptlock_init(page)) {
                __SetPageTable(page);
                inc_zone_page_state(page, NR_PAGETABLE);
                return true;
        }

        return false;
}

/*
 * Destructor for a PTE page-table @page: releases its lock with
 * ptlock_free(), clears the page-table mark and decrements the
 * NR_PAGETABLE zone counter.
 */
static inline void pgtable_pte_page_dtor(struct page *page)
{
        ptlock_free(page);
        __ClearPageTable(page);
        dec_zone_page_state(page, NR_PAGETABLE);
}

/*
 * Map the PTE for @address under @pmd and take its page-table lock.
 * The lock pointer from pte_lockptr() is stored through @ptlp before the
 * lock is acquired; the expression evaluates to the mapped pte_t pointer.
 * Note that @pmd is expanded twice.
 */
#define pte_offset_map_lock(mm, pmd, address, ptlp)        \
({                                                        \
        spinlock_t *__ptl = pte_lockptr(mm, pmd);        \
        pte_t *__pte = pte_offset_map(pmd, address);        \
        *(ptlp) = __ptl;                                \
        spin_lock(__ptl);                                \
        __pte;                                                \
})

/* Release @ptl with spin_unlock(), then pte_unmap() @pte, in that order. */
#define pte_unmap_unlock(pte, ptl)        do {                \
        spin_unlock(ptl);                                \
        pte_unmap(pte);                                        \
} while (0)

/*
 * True only when *@pmd is none and __pte_alloc() returns non-zero
 * (presumably an allocation failure); the wrappers below turn that into NULL.
 */
#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

/* pte_alloc(), then pte_offset_map(); NULL when pte_alloc() is true. */
#define pte_alloc_map(mm, pmd, address)                        \
        (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))

/* pte_alloc(), then pte_offset_map_lock(); NULL when pte_alloc() is true. */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)        \
        (pte_alloc(mm, pmd) ?                        \
                 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))

/*
 * Kernel page-table counterpart: uses __pte_alloc_kernel() when *@pmd is
 * none and evaluates to pte_offset_kernel(), or NULL when that call
 * returns non-zero.
 */
#define pte_alloc_kernel(pmd, address)                        \
        ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
                NULL: pte_offset_kernel(pmd, address))

#if USE_SPLIT_PMD_PTLOCKS

/*
 * Return the struct page for the table that contains the entry @pmd.
 *
 * The entry's address is rounded down to a multiple of the table size
 * (PTRS_PER_PMD * sizeof(pmd_t)) and the result is handed to
 * virt_to_page().
 *
 * Declared inline like every other helper in this header, so that
 * translation units which include the header do not each carry an
 * out-of-line static copy.
 */
static inline struct page *pmd_to_page(pmd_t *pmd)
{
        unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);

        return virt_to_page((void *)((unsigned long) pmd & mask));
}

/*
 * Split PMD locks: the lock is the one attached to the page found by
 * pmd_to_page(pmd).  @mm is not used in this variant.
 */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return ptlock_ptr(pmd_to_page(pmd));
}

/*
 * Prepare the split lock of a PMD page-table @page.  With
 * CONFIG_TRANSPARENT_HUGEPAGE, page->pmd_huge_pte is reset to NULL first.
 * Returns the result of ptlock_init().
 */
static inline bool pmd_ptlock_init(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        page->pmd_huge_pte = NULL;
#endif
        return ptlock_init(page);
}

/*
 * Release the split lock of a PMD page-table @page via ptlock_free().
 * With CONFIG_TRANSPARENT_HUGEPAGE, a non-NULL page->pmd_huge_pte at this
 * point trips VM_BUG_ON_PAGE().
 */
static inline void pmd_ptlock_free(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
#endif
        ptlock_free(page);
}

/* Split PMD locks: the slot lives in the page found by pmd_to_page(pmd); @mm is unused. */
#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte)

#else

/* No split PMD locks: every PMD is guarded by mm->page_table_lock. */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}

/* Nothing to set up or tear down per page in this configuration. */
static inline bool pmd_ptlock_init(struct page *page) { return true; }
static inline void pmd_ptlock_free(struct page *page) {}

/* The slot is kept in the mm itself; @pmd is unused. */
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)

#endif

/*
 * Acquire the lock that pmd_lockptr() designates for @pmd and return it,
 * so the caller can release it later.
 */
static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
        spinlock_t *lock = pmd_lockptr(mm, pmd);

        spin_lock(lock);

        return lock;
}

/*
 * Constructor for a PMD page-table @page: initialise its lock, mark it
 * with __SetPageTable() and bump the NR_PAGETABLE zone counter.
 *
 * Returns false, without touching the page flags or the counter, when
 * pmd_ptlock_init() fails.
 */
static inline bool pgtable_pmd_page_ctor(struct page *page)
{
        if (pmd_ptlock_init(page)) {
                __SetPageTable(page);
                inc_zone_page_state(page, NR_PAGETABLE);
                return true;
        }

        return false;
}

/*
 * Destructor for a PMD page-table @page: releases its lock with
 * pmd_ptlock_free(), clears the page-table mark and decrements the
 * NR_PAGETABLE zone counter.
 */
static inline void pgtable_pmd_page_dtor(struct page *page)
{
        pmd_ptlock_free(page);
        __ClearPageTable(page);
        dec_zone_page_state(page, NR_PAGETABLE);
}

/*
 * No scalability reason to split PUD locks yet, but follow the same pattern
 * as the PMD locks to make it easier if we decide to.  The VM should not be
 * considered ready to switch to split PUD locks yet; there may be places
 * which need to be converted from page_table_lock.
 *
 * For now the lock for any @pud is simply mm->page_table_lock.
 */
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
        return &mm->page_table_lock;
}

/*
 * Acquire the lock that pud_lockptr() designates for @pud and return it,
 * so the caller can release it later.
 */
static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
{
        spinlock_t *lock = pud_lockptr(mm, pud);

        spin_lock(lock);

        return lock;
}

extern void __init pagecache_init(void);
extern void __init free_area_init_memoryless_node(int nid);
extern void free_initmem(void);

/*
 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
 * into the buddy system. The freed pages will be poisoned with pattern
 * "poison" if it's within range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 */
extern unsigned long free_reserved_area(void *start, void *end,
                                        int poison, const char *s);

#ifdef        CONFIG_HIGHMEM
/*
 * Free a highmem page into the buddy system, adjusting totalhigh_pages
 * and totalram_pages.
 */
extern void free_highmem_page(struct page *page);
#endif

extern void adjust_managed_page_count(struct page *page, long count);
extern void mem_init_print_info(const char *str);

extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);

/*
 * Free the reserved page into the buddy system, so it gets managed.
 * Steps: clear the Reserved flag, call init_page_count(), then
 * __free_page().
 */
static inline void __free_reserved_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
}

/*
 * __free_reserved_page() followed by adjust_managed_page_count(page, 1)
 * to account for the page that was just freed.
 */
static inline void free_reserved_page(struct page *page)
{
        __free_reserved_page(page);
        adjust_managed_page_count(page, 1);
}

/*
 * Set the Reserved flag on @page and call
 * adjust_managed_page_count(page, -1) to take it out of the managed count.
 */
static inline void mark_page_reserved(struct page *page)
{
        SetPageReserved(page);
        adjust_managed_page_count(page, -1);
}

/*
 * Default way of handing all __init memory to the buddy system: the range
 * from __init_begin to __init_end is passed to free_reserved_area(),
 * labelled "unused kernel".
 *
 * Freed pages are poisoned with pattern @poison if it is within
 * [0, UCHAR_MAX].  Returns the number of pages freed into the buddy system.
 */
static inline unsigned long free_initmem_default(int poison)
{
        extern char __init_begin[], __init_end[];
        void *init_start = &__init_begin;
        void *init_stop = &__init_end;

        return free_reserved_area(init_start, init_stop,
                                  poison, "unused kernel");
}

static inline unsigned long get_num_physpages(void)
{
        int nid;
        unsigned long phys_pages = 0;

        for_each_online_node(nid)
                phys_pages += node_present_pages(nid);

        return phys_pages;
}

/*
 * Using memblock node mappings, an architecture may initialise its
 * zones, allocate the backing mem_map and account for memory holes in an
 * architecture independent manner.
 *
 * An architecture is expected to register range of page frames backed by
 * physical memory with memblock_add[_node]() before calling
 * free_area_init() passing in the PFN each zone ends at. At a basic
 * usage, an architecture is expected to do something like
 *
 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
 *                                                          max_highmem_pfn};
 * for_each_valid_physical_page_range()
 *         memblock_add_node(base, size, nid)
 * free_area_init(max_zone_pfns);
 */
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
                                                unsigned long end_pfn);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
                                                unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
                        unsigned long *start_pfn, unsigned long *end_pfn);
extern unsigned long find_min_pfn_with_active_regions(void);

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* Without CONFIG_NEED_MULTIPLE_NODES every pfn is reported as node 0. */
static inline int early_pfn_to_nid(unsigned long pfn)
{
        return 0;
}
#else
/* please see mm/page_alloc.c */
extern int __meminit early_pfn_to_nid(unsigned long pfn);
/* there is a per-arch backend function. */
extern int __meminit __early_pfn_to_nid(unsigned long pfn,
                                        struct mminit_pfnnid_cache *state);
#endif

extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void memmap_init_zone(unsigned long, int, unsigned long,
                unsigned long, unsigned long, enum meminit_context,
                struct vmem_altmap *, int migratetype);
extern void setup_per_zone_wmarks(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
extern void show_mem(unsigned int flags, nodemask_t *nodemask);
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
extern unsigned long arch_reserved_kernel_pages(void);
#endif

extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);

extern void setup_per_cpu_pageset(void);

/* page_alloc.c */
extern int min_free_kbytes;
extern int watermark_boost_factor;
extern int watermark_scale_factor;
extern bool arch_has_descending_max_zone_pfns(void);

/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);

/* interval_tree.c */
void vma_interval_tree_insert(struct vm_area_struct *node,
                              struct rb_root_cached *root);
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
                              struct rb_root_cached *root);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
                                unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
                                unsigned long start, unsigned long last);

/*
 * Loop header: @vma takes each value produced by
 * vma_interval_tree_iter_first() and then vma_interval_tree_iter_next()
 * for @root with @start and @last, until one of them returns NULL.
 */
#define vma_interval_tree_foreach(vma, root, start, last)                \
        for (vma = vma_interval_tree_iter_first(root, start, last);        \
             vma; vma = vma_interval_tree_iter_next(vma, start, last))

void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
        struct anon_vma_chain *node, unsigned long start, unsigned long last);
#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
#endif

/*
 * Loop header: @avc takes each value produced by
 * anon_vma_interval_tree_iter_first() and then
 * anon_vma_interval_tree_iter_next() for @root with @start and @last,
 * until one of them returns NULL.
 */
#define anon_vma_interval_tree_foreach(avc, root, start, last)                 \
        for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
             avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))

/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
        unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
        struct vm_area_struct *expand);
/*
 * Shorthand for __vma_adjust() with no @expand vma (NULL is passed);
 * returns whatever __vma_adjust() returns.
 */
static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
        unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
        struct vm_area_struct *expand = NULL;

        return __vma_adjust(vma, start, end, pgoff, insert, expand);
}
extern struct vm_area_struct *vma_merge(struct mm_struct *,
        struct vm_area_struct *prev, unsigned long addr, unsigned long end,
        unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
        struct mempolicy *, struct vm_userfaultfd_ctx);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
        unsigned long addr, int new_below);
extern int split_vma(struct mm_struct *, struct vm_area_struct *,
        unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
        struct rb_node **, struct rb_node *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);

/*
 * Check a data-size limit.
 *
 * The size checked is (@new - @start) + (@end_data - @start_data).
 *
 * Returns 0 when @rlim is RLIM_INFINITY (or larger) or when the size does not
 * exceed @rlim, and -ENOSPC when the size is strictly greater than @rlim.
 */
static inline int check_data_rlimit(unsigned long rlim,
                                    unsigned long new,
                                    unsigned long start,
                                    unsigned long end_data,
                                    unsigned long start_data)
{
        unsigned long first_span, second_span;

        /* An unlimited rlimit can never be exceeded. */
        if (rlim >= RLIM_INFINITY)
                return 0;

        first_span = new - start;
        second_span = end_data - start_data;

        return (first_span + second_span > rlim) ? -ENOSPC : 0;
}

extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);

extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern struct file *get_mm_exe_file(struct mm_struct *mm);
extern struct file *get_task_exe_file(struct task_struct *task);

extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);

extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
                                   const struct vm_special_mapping *sm);
extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags,
                                   const struct vm_special_mapping *spec);
/* This is an obsolete alternative to _install_special_mapping. */
extern int install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags, struct page **pages);

unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);

extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);

extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
        struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
        unsigned long pgoff, unsigned long *populate, struct list_head *uf);
extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
                       struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
                     struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);

/*
 * mm_populate(): with CONFIG_MMU it calls __mm_populate(addr, len, 1) and
 * discards the return value; without CONFIG_MMU it is an empty stub.
 */
#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);
static inline void mm_populate(unsigned long addr, unsigned long len)
{
        /* Ignore errors */
        (void) __mm_populate(addr, len, 1);
}
#else
/* No MMU: nothing to do. */
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif

/* These take the mm semaphore themselves */
extern int __must_check vm_brk(unsigned long, unsigned long);
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

/*
 * Argument block for vm_unmapped_area(), which takes a pointer to it.
 * NOTE(review): the per-field meanings below are inferred from the field
 * names only -- confirm against vm_unmapped_area()'s implementation.
 */
struct vm_unmapped_area_info {
#define VM_UNMAPPED_AREA_TOPDOWN 1
        unsigned long flags;            /* VM_UNMAPPED_AREA_* bits */
        unsigned long length;           /* presumably the size wanted */
        unsigned long low_limit;        /* presumably lowest usable address */
        unsigned long high_limit;       /* presumably upper address bound */
        unsigned long align_mask;       /* presumably alignment bit mask */
        unsigned long align_offset;     /* presumably offset within alignment */
};

extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);

/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
                                       loff_t lstart, loff_t lend);
extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);

/* mm/page-writeback.c */
int __must_check write_one_page(struct page *page);
void task_dirty_inc(struct task_struct *tsk);

extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);

/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
extern int expand_downwards(struct vm_area_struct *vma,
                unsigned long address);
/*
 * expand_upwards() is a real function only when VM_GROWSUP is non-zero.
 * Otherwise it is a macro that evaluates to 0 and does not evaluate its
 * arguments.
 */
#if VM_GROWSUP
extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
#else
  #define expand_upwards(vma, address) (0)
#endif

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
                                             struct vm_area_struct **pprev);

/*
 * Return the VMA found by find_vma(mm, start_addr) if it overlaps the
 * interval start_addr..end_addr-1, or NULL when there is no such VMA.
 * Callers are expected to pass start_addr < end_addr.
 */
static inline struct vm_area_struct *
find_vma_intersection(struct mm_struct *mm, unsigned long start_addr,
                      unsigned long end_addr)
{
        struct vm_area_struct *candidate = find_vma(mm, start_addr);

        /* The candidate only counts if it begins before the interval ends. */
        if (candidate && candidate->vm_start < end_addr)
                return candidate;

        return NULL;
}

/*
 * Start address of @vma including any guard gap: vm_start itself, or, for a
 * VM_GROWSDOWN vma, vm_start lowered by stack_guard_gap.  If that
 * subtraction wraps around, 0 is returned instead.
 */
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
        unsigned long lowered;

        if (!(vma->vm_flags & VM_GROWSDOWN))
                return vma->vm_start;

        lowered = vma->vm_start - stack_guard_gap;

        /* A result above vm_start means the unsigned subtraction wrapped. */
        if (lowered > vma->vm_start)
                return 0;

        return lowered;
}

/*
 * End address of @vma including any guard gap: vm_end itself, or, for a
 * VM_GROWSUP vma, vm_end raised by stack_guard_gap.  If that addition wraps
 * around, -PAGE_SIZE is returned instead.
 */
static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
{
        unsigned long raised;

        if (!(vma->vm_flags & VM_GROWSUP))
                return vma->vm_end;

        raised = vma->vm_end + stack_guard_gap;

        /* A result below vm_end means the unsigned addition wrapped. */
        if (raised < vma->vm_end)
                return -PAGE_SIZE;

        return raised;
}

/* Size of @vma in pages: (vm_end - vm_start) shifted right by PAGE_SHIFT. */
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
        unsigned long span_bytes = vma->vm_end - vma->vm_start;

        return span_bytes >> PAGE_SHIFT;
}

/*
 * Return the VMA found by find_vma(mm, vm_start) only if it spans exactly
 * vm_start ... vm_end; otherwise return NULL.
 */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
                                unsigned long vm_start, unsigned long vm_end)
{
        struct vm_area_struct *candidate = find_vma(mm, vm_start);

        if (candidate &&
            candidate->vm_start == vm_start &&
            candidate->vm_end == vm_end)
                return candidate;

        return NULL;
}

/*
 * True when @vma is non-NULL and [start, end] lies within
 * [vma->vm_start, vma->vm_end]; false otherwise (including a NULL @vma).
 */
static inline bool range_in_vma(struct vm_area_struct *vma,
                                unsigned long start, unsigned long end)
{
        if (!vma)
                return false;

        return start >= vma->vm_start && end <= vma->vm_end;
}

/*
 * Page-protection helpers.  With CONFIG_MMU they are defined out of line;
 * without it the inline versions below are used.
 */
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
#else
/* !CONFIG_MMU: always __pgprot(0), whatever @vm_flags holds. */
static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
        return __pgprot(0);
}
/* !CONFIG_MMU: store vm_get_page_prot(vma->vm_flags) in vma->vm_page_prot. */
static inline void vma_set_page_prot(struct vm_area_struct *vma)
{
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
}
#endif

#ifdef CONFIG_NUMA_BALANCING
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
#endif

struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                        unsigned long pfn, unsigned long size, pgprot_t);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);

/*
 * Call vm_insert_page() and translate its errno-style result into a
 * vm_fault_t:
 *   -ENOMEM                  -> VM_FAULT_OOM
 *   -EBUSY or any value >= 0 -> VM_FAULT_NOPAGE
 *   any other negative value -> VM_FAULT_SIGBUS
 */
static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
                                unsigned long addr, struct page *page)
{
        int rc = vm_insert_page(vma, addr, page);

        if (rc >= 0 || rc == -EBUSY)
                return VM_FAULT_NOPAGE;

        if (rc == -ENOMEM)
                return VM_FAULT_OOM;

        return VM_FAULT_SIGBUS;
}

#ifndef io_remap_pfn_range
/*
 * Default io_remap_pfn_range(), used unless something has already defined a
 * macro of that name: remap_pfn_range() with @prot passed through
 * pgprot_decrypted() first.  Returns remap_pfn_range()'s result.
 */
static inline int io_remap_pfn_range(struct vm_area_struct *vma,
                                     unsigned long addr, unsigned long pfn,
                                     unsigned long size, pgprot_t prot)
{
        pgprot_t decrypted_prot = pgprot_decrypted(prot);

        return remap_pfn_range(vma, addr, pfn, size, decrypted_prot);
}
#endif

/*
 * Map an errno-style value to a fault code: -ENOMEM becomes VM_FAULT_OOM,
 * every other value becomes VM_FAULT_SIGBUS.
 */
static inline vm_fault_t vmf_error(int err)
{
        return (err == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                         unsigned int foll_flags);

/*
 * FOLL_* flag bits, tested against a foll_flags argument (see
 * vm_fault_to_errno() below).  Every value is a distinct single bit, so the
 * flags can be OR-ed together.
 */
#define FOLL_WRITE        0x01        /* check pte is writable */
#define FOLL_TOUCH        0x02        /* mark page accessed */
#define FOLL_GET        0x04        /* do get_page on page */
#define FOLL_DUMP        0x08        /* give error on hole if it would be zero */
#define FOLL_FORCE        0x10        /* get_user_pages read/write w/o permission */
#define FOLL_NOWAIT        0x20        /* if a disk transfer is needed, start the IO
                                 * and return without waiting upon it */
#define FOLL_POPULATE        0x40        /* fault in page */
#define FOLL_SPLIT        0x80        /* don't return transhuge pages, split them */
#define FOLL_HWPOISON        0x100        /* check page is hwpoisoned */
#define FOLL_NUMA        0x200        /* force NUMA hinting page fault */
#define FOLL_MIGRATION        0x400        /* wait for page to replace migration entry */
#define FOLL_TRIED        0x800        /* a retry, previous pass started an IO */
#define FOLL_MLOCK        0x1000        /* lock present pages */
#define FOLL_REMOTE        0x2000        /* we are working on non-current tsk/mm */
#define FOLL_COW        0x4000        /* internal GUP flag */
#define FOLL_ANON        0x8000        /* don't do file mappings */
#define FOLL_LONGTERM        0x10000        /* mapping lifetime is indefinite: see below */
#define FOLL_SPLIT_PMD        0x20000        /* split huge pmd before returning */
#define FOLL_PIN        0x40000        /* pages must be released via unpin_user_page */
#define FOLL_FAST_ONLY        0x80000        /* gup_fast: prevent fall-back to slow gup */

/*
 * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
 * other. Here is what they mean, and how to use them:
 *
 * FOLL_LONGTERM indicates that the page will be held for an indefinite time
 * period _often_ under userspace control.  This is in contrast to
 * iov_iter_get_pages(), whose usages are transient.
 *
 * FIXME: For pages which are part of a filesystem, mappings are subject to the
 * lifetime enforced by the filesystem and we need guarantees that longterm
 * users like RDMA and V4L2 only establish mappings which coordinate usage with
 * the filesystem.  Ideas for this coordination include revoking the longterm
 * pin, delaying writeback, bounce buffer page writeback, etc.  As FS DAX was
 * added after the problem with filesystems was found FS DAX VMAs are
 * specifically failed.  Filesystem pages are still subject to bugs and use of
 * FOLL_LONGTERM should be avoided on those pages.
 *
 * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call.
 * Currently only get_user_pages() and get_user_pages_fast() support this flag
 * and calls to get_user_pages_[un]locked are specifically not allowed.  This
 * is due to an incompatibility with the FS DAX check and
 * FAULT_FLAG_ALLOW_RETRY.
 *
 * In the CMA case: long term pins in a CMA region would unnecessarily fragment
 * that region.  And so, CMA attempts to migrate the page before pinning, when
 * FOLL_LONGTERM is specified.
 *
 * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
 * but an additional pin counting system) will be invoked. This is intended for
 * anything that gets a page reference and then touches page data (for example,
 * Direct IO). This lets the filesystem know that some non-file-system entity is
 * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
 * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
 * a call to unpin_user_page().
 *
 * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
 * and separate refcounting mechanisms, however, and that means that each has
 * its own acquire and release mechanisms:
 *
 *     FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
 *
 *     FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
 *
 * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
 * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
 * calls applied to them, and that's perfectly OK. This is a constraint on the
 * callers, not on the pages.)
 *
 * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
 * directly by the caller. That's in order to help avoid mismatches when
 * releasing pages: get_user_pages*() pages must be released via put_page(),
 * while pin_user_pages*() pages must be released via unpin_user_page().
 *
 * Please see Documentation/core-api/pin_user_pages.rst for more information.
 */

/*
 * Convert a vm_fault_t into a negative errno, or 0 if none of the bits
 * checked here are set.  Checks are made in this priority order:
 *   VM_FAULT_OOM                      -> -ENOMEM
 *   VM_FAULT_HWPOISON[_LARGE]         -> -EHWPOISON if @foll_flags has
 *                                        FOLL_HWPOISON, else -EFAULT
 *   VM_FAULT_SIGBUS / VM_FAULT_SIGSEGV -> -EFAULT
 */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
        int err = 0;

        if (vm_fault & VM_FAULT_OOM) {
                err = -ENOMEM;
        } else if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
                if (foll_flags & FOLL_HWPOISON)
                        err = -EHWPOISON;
                else
                        err = -EFAULT;
        } else if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) {
                err = -EFAULT;
        }

        return err;
}

typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
                               unsigned long size, pte_fn_t fn, void *data);
extern int apply_to_existing_page_range(struct mm_struct *mm,
                                   unsigned long address, unsigned long size,
                                   pte_fn_t fn, void *data);

/*
 * Page poisoning hooks.  With CONFIG_PAGE_POISONING they are defined out of
 * line; otherwise page_poisoning_enabled() is constant false and
 * kernel_poison_pages() does nothing.
 */
#ifdef CONFIG_PAGE_POISONING
extern bool page_poisoning_enabled(void);
extern void kernel_poison_pages(struct page *page, int numpages, int enable);
#else
static inline bool page_poisoning_enabled(void) { return false; }
static inline void kernel_poison_pages(struct page *page, int numpages,
                                        int enable) { }
#endif

#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
DECLARE_STATIC_KEY_TRUE(init_on_alloc);
#else
DECLARE_STATIC_KEY_FALSE(init_on_alloc);
#endif
/*
 * Returns true when the init_on_alloc static key is on and page poisoning is
 * not enabled.  In every other case the answer is whether @flags has
 * __GFP_ZERO set.
 */
static inline bool want_init_on_alloc(gfp_t flags)
{
        if (!static_branch_unlikely(&init_on_alloc) ||
            page_poisoning_enabled())
                return flags & __GFP_ZERO;

        return true;
}

#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
DECLARE_STATIC_KEY_TRUE(init_on_free);
#else
DECLARE_STATIC_KEY_FALSE(init_on_free);
#endif
/*
 * Returns true only when the init_on_free static key is on and page
 * poisoning is not enabled.
 */
static inline bool want_init_on_free(void)
{
        if (!static_branch_unlikely(&init_on_free))
                return false;

        return !page_poisoning_enabled();
}

/* init_debug_pagealloc() is an empty stub unless CONFIG_DEBUG_PAGEALLOC is set. */
#ifdef CONFIG_DEBUG_PAGEALLOC
extern void init_debug_pagealloc(void);
#else
static inline void init_debug_pagealloc(void) {}
#endif
extern bool _debug_pagealloc_enabled_early;
DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);

/*
 * Returns the value of _debug_pagealloc_enabled_early when
 * CONFIG_DEBUG_PAGEALLOC is enabled, and false otherwise.
 */
static inline bool debug_pagealloc_enabled(void)
{
        if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
                return false;

        return _debug_pagealloc_enabled_early;
}

/*
 * Static-key variant of the debug_pagealloc check.  Intended for fast paths
 * that run after init_debug_pagealloc(), or for callers where a false
 * negative from calling too early is harmless.
 *
 * Always false when CONFIG_DEBUG_PAGEALLOC is not enabled.
 */
static inline bool debug_pagealloc_enabled_static(void)
{
        return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
                static_branch_unlikely(&_debug_pagealloc_enabled);
}

/*
 * kernel_map_pages() forwards to __kernel_map_pages() when either
 * CONFIG_DEBUG_PAGEALLOC or CONFIG_ARCH_HAS_SET_DIRECT_MAP is defined, and is
 * an empty stub otherwise.  kernel_page_present() exists only under
 * CONFIG_HIBERNATION; in the stub configuration it always returns true.
 */
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
extern void __kernel_map_pages(struct page *page, int numpages, int enable);

/*
 * When called in DEBUG_PAGEALLOC context, the call should most likely be
 * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static()
 */
static inline void
kernel_map_pages(struct page *page, int numpages, int enable)
{
        __kernel_map_pages(page, numpages, enable);
}
#ifdef CONFIG_HIBERNATION
extern bool kernel_page_present(struct page *page);
#endif        /* CONFIG_HIBERNATION */
#else        /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
/* Neither option set: mapping changes are a no-op. */
static inline void
kernel_map_pages(struct page *page, int numpages, int enable) {}
#ifdef CONFIG_HIBERNATION
static inline bool kernel_page_present(struct page *page) { return true; }
#endif        /* CONFIG_HIBERNATION */
#endif        /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */

#ifndef __HAVE_ARCH_GATE_AREA
/*
 * Fallbacks used when __HAVE_ARCH_GATE_AREA is not defined: there is no gate
 * VMA (NULL) and no address is reported as inside a gate area (0).
 */
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
        return NULL;
}

static inline int in_gate_area_no_mm(unsigned long addr)
{
        return 0;
}

static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
        return 0;
}
#else
/* __HAVE_ARCH_GATE_AREA is defined: the real versions live elsewhere. */
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
#endif        /* __HAVE_ARCH_GATE_AREA */

extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);

#ifdef CONFIG_SYSCTL
extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
#endif

void drop_slab(void);
void drop_slab_node(int nid);

/*
 * randomize_va_space is a real int variable only with CONFIG_MMU; without
 * it the name is a macro for the constant 0 (so it cannot be assigned to or
 * have its address taken).
 */
#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
extern int randomize_va_space;
#endif

const char * arch_vma_name(struct vm_area_struct *vma);
/* print_vma_addr() is an empty stub unless CONFIG_MMU is set. */
#ifdef CONFIG_MMU
void print_vma_addr(char *prefix, unsigned long rip);
#else
static inline void print_vma_addr(char *prefix, unsigned long rip)
{
}
#endif

void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                            struct vmem_altmap *altmap);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
                              struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
#endif
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
                                  unsigned long nr_pages);

/*
 * MF_* flag bits; each enumerator is a distinct single bit, so they can be
 * OR-ed together.
 * NOTE(review): presumably these are the values passed as the "flags"
 * argument of memory_failure() and friends -- confirm against callers.
 */
enum mf_flags {
        MF_COUNT_INCREASED = 1 << 0,
        MF_ACTION_REQUIRED = 1 << 1,
        MF_MUST_KILL = 1 << 2,
        MF_SOFT_OFFLINE = 1 << 3,
};
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);


/*
 * Error handlers for various types of pages.
 */
/* Result codes; values are implicit and consecutive, starting at 0. */
enum mf_result {
        MF_IGNORED,        /* Error: cannot be handled */
        MF_FAILED,        /* Error: handling failed */
        MF_DELAYED,        /* Will be handled later */
        MF_RECOVERED,        /* Successfully recovered */
};

/*
 * Page-type identifiers with an MF_MSG_ prefix.  No enumerator has an
 * explicit value, so they are consecutive from 0 (MF_MSG_KERNEL) up to
 * MF_MSG_UNKNOWN; inserting or reordering entries changes the numbering.
 * NOTE(review): the MF_MSG_ prefix suggests these select a message to
 * report -- confirm against the users of this enum.
 */
enum mf_action_page_type {
        MF_MSG_KERNEL,
        MF_MSG_KERNEL_HIGH_ORDER,
        MF_MSG_SLAB,
        MF_MSG_DIFFERENT_COMPOUND,
        MF_MSG_POISONED_HUGE,
        MF_MSG_HUGE,
        MF_MSG_FREE_HUGE,
        MF_MSG_NON_PMD_HUGE,
        MF_MSG_UNMAP_FAILED,
        MF_MSG_DIRTY_SWAPCACHE,
        MF_MSG_CLEAN_SWAPCACHE,
        MF_MSG_DIRTY_MLOCKED_LRU,
        MF_MSG_CLEAN_MLOCKED_LRU,
        MF_MSG_DIRTY_UNEVICTABLE_LRU,
        MF_MSG_CLEAN_UNEVICTABLE_LRU,
        MF_MSG_DIRTY_LRU,
        MF_MSG_CLEAN_LRU,
        MF_MSG_TRUNCATED_LRU,
        MF_MSG_BUDDY,
        MF_MSG_BUDDY_2ND,
        MF_MSG_DAX,
        MF_MSG_UNSPLIT_THP,
        MF_MSG_UNKNOWN,
};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
extern void copy_user_huge_page(struct page *dst, struct page *src,
                                unsigned long addr_hint,
                                struct vm_area_struct *vma,
                                unsigned int pages_per_huge_page);
extern long copy_huge_page_from_user(struct page *dst_page,
                                const void __user *usr_src,
                                unsigned int pages_per_huge_page,
                                bool allow_pagefault);

/**
 * vma_is_special_huge - Are transhuge page-table entries considered special?
 * @vma: Pointer to the struct vm_area_struct to consider
 *
 * Transhuge page-table entries count as "special", in the sense used by
 * vm_normal_page(), when the vma is a DAX vma, or when it is file-backed and
 * has VM_PFNMAP or VM_MIXEDMAP set.
 *
 * Return: true if transhuge page-table entries should be considered special,
 * false otherwise.
 */
static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
{
        if (vma_is_dax(vma))
                return true;

        if (!vma->vm_file)
                return false;

        return (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) != 0;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#ifndef CONFIG_DEBUG_PAGEALLOC
/* Without CONFIG_DEBUG_PAGEALLOC: min order 0, never enabled, no guard pages. */
static inline unsigned int debug_guardpage_minorder(void)
{
        return 0;
}

static inline bool debug_guardpage_enabled(void)
{
        return false;
}

static inline bool page_is_guard(struct page *page)
{
        return false;
}
#else
extern unsigned int _debug_guardpage_minorder;
DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

/* Current value of _debug_guardpage_minorder. */
static inline unsigned int debug_guardpage_minorder(void)
{
        return _debug_guardpage_minorder;
}

/* State of the _debug_guardpage_enabled static key. */
static inline bool debug_guardpage_enabled(void)
{
        return static_branch_unlikely(&_debug_guardpage_enabled);
}

/* PageGuard(@page), but only consulted while guard pages are enabled. */
static inline bool page_is_guard(struct page *page)
{
        return debug_guardpage_enabled() && PageGuard(page);
}
#endif /* CONFIG_DEBUG_PAGEALLOC */

/* setup_nr_node_ids() is an empty stub unless MAX_NUMNODES is greater than 1. */
#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
static inline void setup_nr_node_ids(void) {}
#endif

extern int memcmp_pages(struct page *page1, struct page *page2);

/* Returns 1 when memcmp_pages() reports 0 for the two pages, otherwise 0. */
static inline int pages_identical(struct page *page1, struct page *page2)
{
        int diff = memcmp_pages(page1, page2);

        return diff == 0;
}

#ifdef CONFIG_MAPPING_DIRTY_HELPERS
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
                                                pgoff_t bitmap_pgoff,
                                                unsigned long *bitmap,
                                                pgoff_t *start,
                                                pgoff_t *end);

unsigned long wp_shared_mapping_range(struct address_space *mapping,
                                      pgoff_t first_index, pgoff_t nr);
#endif

extern int sysctl_nr_trim_pages;

/**
 * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
 * @seals: the seals to check
 * @vma: the vma to operate on
 *
 * Does nothing unless F_SEAL_FUTURE_WRITE is set in @seals.  When it is set,
 * a shared and writable @vma is refused, and a shared, non-writable @vma has
 * VM_MAYWRITE cleared.  Private mappings are left untouched.
 *
 * Return: 0 if the check passes, -EPERM for a shared writable mapping.
 */
static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
{
        if (!(seals & F_SEAL_FUTURE_WRITE))
                return 0;

        /*
         * Private mappings need no handling: VM_MAYWRITE is kept so they
         * stay COW-writable.
         */
        if (!(vma->vm_flags & VM_SHARED))
                return 0;

        /*
         * New PROT_WRITE and MAP_SHARED mmaps are not allowed while the
         * "future write" seal is active.
         */
        if (vma->vm_flags & VM_WRITE)
                return -EPERM;

        /*
         * A sealed memfd may still be mapped MAP_SHARED and read-only;
         * drop VM_MAYWRITE so mprotect cannot later make it writable.
         */
        vma->vm_flags &= ~(VM_MAYWRITE);

        return 0;
}

#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */








































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PTRACE_H
#define _ASM_X86_PTRACE_H

#include <asm/segment.h>
#include <asm/page_types.h>
#include <uapi/asm/ptrace.h>

#ifndef __ASSEMBLY__
#ifdef __i386__

/*
 * Register frame layout used when __i386__ is defined.  Segment registers
 * are stored as a 16-bit value followed by a 16-bit __*h companion field,
 * for the reasons given in the comment below.
 */
struct pt_regs {
        /*
         * NB: 32-bit x86 CPUs are inconsistent as what happens in the
         * following cases (where %seg represents a segment register):
         *
         * - pushl %seg: some do a 16-bit write and leave the high
         *   bits alone
         * - movl %seg, [mem]: some do a 16-bit write despite the movl
         * - IDT entry: some (e.g. 486) will leave the high bits of CS
         *   and (if applicable) SS undefined.
         *
         * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
         * so we can just treat all of the segment registers as 16-bit
         * values.
         */
        unsigned long bx;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;
        unsigned long bp;
        unsigned long ax;
        unsigned short ds;
        unsigned short __dsh;
        unsigned short es;
        unsigned short __esh;
        unsigned short fs;
        unsigned short __fsh;
        /*
         * On interrupt, gs and __gsh store the vector number.  They never
         * store gs any more.
         */
        unsigned short gs;
        unsigned short __gsh;
        /* On interrupt, this is the error code. */
        unsigned long orig_ax;
        unsigned long ip;
        unsigned short cs;
        unsigned short __csh;
        unsigned long flags;
        unsigned long sp;
        unsigned short ss;
        unsigned short __ssh;
};

#else /* __i386__ */

/*
 * Register frame layout used when __i386__ is not defined.  Every member is
 * an unsigned long; the groups are described by the comments below.
 */
struct pt_regs {
/*
 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
 * unless syscall needs a complete, fully filled "struct pt_regs".
 */
        unsigned long r15;
        unsigned long r14;
        unsigned long r13;
        unsigned long r12;
        unsigned long bp;
        unsigned long bx;
/* These regs are callee-clobbered. Always saved on kernel entry. */
        unsigned long r11;
        unsigned long r10;
        unsigned long r9;
        unsigned long r8;
        unsigned long ax;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;
/*
 * On syscall entry, this is syscall#. On CPU exception, this is error code.
 * On hw interrupt, it's IRQ number:
 */
        unsigned long orig_ax;
/* Return frame for iretq */
        unsigned long ip;
        unsigned long cs;
        unsigned long flags;
        unsigned long sp;
        unsigned long ss;
/* top of stack page */
};

#endif /* !__i386__ */

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt_types.h>
#endif

#include <asm/proto.h>

struct cpuinfo_x86;
struct task_struct;

extern unsigned long profile_pc(struct pt_regs *regs);

extern unsigned long
convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code);


/* Read the return value out of @regs: it is held in the ax slot. */
static inline unsigned long regs_return_value(struct pt_regs *regs)
{
        unsigned long rc = regs->ax;

        return rc;
}

/* Store @rc as the return value in @regs by writing it to the ax slot. */
static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
{
        unsigned long *ret_slot = &regs->ax;

        *ret_slot = rc;
}

/*
 * user_mode(regs) reports whether a register set was captured from user
 * mode.
 *
 * On x86_32 that is the case if V8086 mode was enabled OR the register set
 * came from protected mode with an RPL-3 CS value.  Both conditions are
 * folded into one comparison: the RPL bits of CS are OR-ed with the VM bit
 * of flags and the result is compared against USER_RPL.
 *
 * On x86_64 there is no vm86 mode, so only the low two bits of CS are
 * examined.
 */
static __always_inline int user_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
        unsigned long rpl_bits = regs->cs & SEGMENT_RPL_MASK;
        unsigned long vm_bit = regs->flags & X86_VM_MASK;

        return (rpl_bits | vm_bit) >= USER_RPL;
#else
        return (regs->cs & 3) != 0;
#endif
}

/*
 * Return the VM bit of the saved flags (non-zero when set) on x86_32;
 * always 0 otherwise.
 */
static inline int v8086_mode(struct pt_regs *regs)
{
#ifndef CONFIG_X86_32
        return 0;        /* No V86 mode support in long mode */
#else
        unsigned long vm = regs->flags & X86_VM_MASK;

        return vm;
#endif
}

/*
 * Tell whether the saved CS selector of @regs is a 64-bit user code
 * selector.  Always false when CONFIG_X86_64 is not set.
 */
static inline bool user_64bit_mode(struct pt_regs *regs)
{
#if !defined(CONFIG_X86_64)
        return false;
#elif defined(CONFIG_PARAVIRT_XXL)
        /* Headers are too twisted for this to go in paravirt.h. */
        if (regs->cs == __USER_CS)
                return true;

        return regs->cs == pv_info.extra_user_64bit_cs;
#else
        /*
         * On non-paravirt systems, this is the only long mode CPL 3
         * selector.  We do not allow long mode selectors in the LDT.
         */
        return regs->cs == __USER_CS;
#endif
}

/*
 * Report whether the register set was captured from a context running in
 * 64-bit mode: either not user mode at all, or user mode with a 64-bit
 * code selector.  Always false when CONFIG_X86_64 is not set.
 */
static inline bool any_64bit_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
        if (!user_mode(regs))
                return true;

        return user_64bit_mode(regs);
#else
        return false;
#endif
}

#ifdef CONFIG_X86_64
/* Both helpers expand to the sp field of current_pt_regs(). */
#define current_user_stack_pointer()        current_pt_regs()->sp
#define compat_user_stack_pointer()        current_pt_regs()->sp

static inline bool ip_within_syscall_gap(struct pt_regs *regs)
{
        bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
                    regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack);

#ifdef CONFIG_IA32_EMULATION
        ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
                      regs->ip <  (unsigned long)entry_SYSCALL_compat_safe_stack);
#endif

        return ret;
}
#endif

/* Return the saved stack pointer (sp) of @regs. */
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
        unsigned long sp = regs->sp;

        return sp;
}

/* Return the saved instruction pointer (ip) of @regs. */
static inline unsigned long instruction_pointer(struct pt_regs *regs)
{
        unsigned long ip = regs->ip;

        return ip;
}

/* Replace the saved instruction pointer (ip) of @regs with @val. */
static inline void instruction_pointer_set(struct pt_regs *regs,
                unsigned long val)
{
        unsigned long *ip_slot = &regs->ip;

        *ip_slot = val;
}

/* Return the saved frame pointer (bp) of @regs. */
static inline unsigned long frame_pointer(struct pt_regs *regs)
{
        unsigned long bp = regs->bp;

        return bp;
}

/* Return the saved stack pointer (sp) of @regs. */
static inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
        unsigned long sp = regs->sp;

        return sp;
}

/* Replace the saved stack pointer (sp) of @regs with @val. */
static inline void user_stack_pointer_set(struct pt_regs *regs,
                unsigned long val)
{
        unsigned long *sp_slot = &regs->sp;

        *sp_slot = val;
}

/* True when X86_EFLAGS_IF is clear in the saved flags of @regs. */
static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
{
        return (regs->flags & X86_EFLAGS_IF) == 0;
}

/* Query offset/name of register from its name/offset */
extern int regs_query_register_offset(const char *name);
extern const char *regs_query_register_name(unsigned int offset);
/* Largest offset accepted by regs_get_register(): the offset of the ss field. */
#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))

/**
 * regs_get_register() - read a register by its offset in struct pt_regs
 * @regs:        pt_regs from which the register value is read.
 * @offset:        byte offset of the register inside struct pt_regs.
 *
 * Returns the value stored @offset bytes from the start of @regs.
 * If @offset is bigger than MAX_REG_OFFSET, this returns 0.  On
 * CONFIG_X86_32 the selector fields (cs, ss, ds, es, fs, gs) are read as
 * 16-bit values; everything else is read as an unsigned long.
 */
static inline unsigned long regs_get_register(struct pt_regs *regs,
                                              unsigned int offset)
{
        unsigned long addr;

        if (unlikely(offset > MAX_REG_OFFSET))
                return 0;

        addr = (unsigned long)regs + offset;
#ifdef CONFIG_X86_32
        /* The selector fields are 16-bit. */
        switch (offset) {
        case offsetof(struct pt_regs, cs):
        case offsetof(struct pt_regs, ss):
        case offsetof(struct pt_regs, ds):
        case offsetof(struct pt_regs, es):
        case offsetof(struct pt_regs, fs):
        case offsetof(struct pt_regs, gs):
                return *(u16 *)addr;
        default:
                break;
        }
#endif
        return *(unsigned long *)addr;
}

/**
 * regs_within_kernel_stack() - check whether an address is on the stack
 * @regs:        pt_regs which contains the kernel stack pointer.
 * @addr:        address to check.
 *
 * Compares @addr and regs->sp after rounding both down to a THREAD_SIZE
 * boundary.  Returns non-zero if they fall in the same THREAD_SIZE-aligned
 * region, zero otherwise.
 */
static inline int regs_within_kernel_stack(struct pt_regs *regs,
                                           unsigned long addr)
{
        unsigned long addr_base = addr & ~(THREAD_SIZE - 1);
        unsigned long sp_base = regs->sp & ~(THREAD_SIZE - 1);

        return addr_base == sp_base;
}

/**
 * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack
 * @regs:        pt_regs which contains the kernel stack pointer.
 * @n:                stack entry number.
 *
 * regs_get_kernel_stack_nth_addr() returns the address of the @n th entry of
 * the kernel stack which is specified by @regs. If the @n th entry is NOT in
 * the kernel stack, this returns NULL.
 */
static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
{
        unsigned long *slot = (unsigned long *)regs->sp + n;

        return regs_within_kernel_stack(regs, (unsigned long)slot) ? slot : NULL;
}

/* To avoid include hell, we can't include uaccess.h */
extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size);

/**
 * regs_get_kernel_stack_nth() - get Nth entry of the stack
 * @regs:        pt_regs which contains the kernel stack pointer.
 * @n:                stack entry number.
 *
 * regs_get_kernel_stack_nth() returns the @n th entry of the kernel stack
 * which is specified by @regs. If the @n th entry is NOT in the kernel
 * stack, or copy_from_kernel_nofault() reports a failure, this returns 0.
 */
static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
                                                      unsigned int n)
{
        unsigned long val;
        unsigned long *addr = regs_get_kernel_stack_nth_addr(regs, n);

        if (!addr)
                return 0;

        if (copy_from_kernel_nofault(&val, addr, sizeof(val)))
                return 0;

        return val;
}

/**
 * regs_get_kernel_argument() - get Nth function argument in kernel
 * @regs:        pt_regs of that context
 * @n:                function argument number (start from 0)
 *
 * regs_get_kernel_argument() returns the @n th argument of the function
 * call.  The first NR_REG_ARGUMENTS arguments are read from registers, the
 * rest from the kernel stack.
 * Note that this chooses the most probable assignment; in some cases
 * it can be incorrect.
 * This is expected to be called from kprobes or ftrace with regs
 * where the top of stack is the return address.
 */
static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
                                                     unsigned int n)
{
        static const unsigned int argument_offs[] = {
#ifdef __i386__
                offsetof(struct pt_regs, ax),
                offsetof(struct pt_regs, dx),
                offsetof(struct pt_regs, cx),
#define NR_REG_ARGUMENTS 3
#else
                offsetof(struct pt_regs, di),
                offsetof(struct pt_regs, si),
                offsetof(struct pt_regs, dx),
                offsetof(struct pt_regs, cx),
                offsetof(struct pt_regs, r8),
                offsetof(struct pt_regs, r9),
#define NR_REG_ARGUMENTS 6
#endif
        };

        if (n < NR_REG_ARGUMENTS)
                return regs_get_register(regs, argument_offs[n]);

        /*
         * Stack entry 0 is the return address, so the first stack-passed
         * argument is entry 1.
         */
        n -= NR_REG_ARGUMENTS - 1;
        return regs_get_kernel_stack_nth(regs, n);
}

/*
 * Single-step is always reported as available.  Block-step is reported as
 * available unconditionally with CONFIG_X86_DEBUGCTLMSR; otherwise only
 * when boot_cpu_data.x86 is at least 6.
 */
#define arch_has_single_step()        (1)
#ifdef CONFIG_X86_DEBUGCTLMSR
#define arch_has_block_step()        (1)
#else
#define arch_has_block_step()        (boot_cpu_data.x86 >= 6)
#endif

#define ARCH_HAS_USER_SINGLE_STEP_REPORT

struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
                              struct user_desc __user *info);
extern int do_set_thread_area(struct task_struct *p, int idx,
                              struct user_desc __user *info, int can_allocate);

/*
 * With CONFIG_X86_64 this forwards to do_arch_prctl_64(); otherwise it
 * expands to the constant 0 and its arguments are not evaluated.
 */
#ifdef CONFIG_X86_64
# define do_set_thread_area_64(p, s, t)        do_arch_prctl_64(p, s, t)
#else
# define do_set_thread_area_64(p, s, t)        (0)
#endif

#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PTRACE_H */












































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
/*
 *  include/linux/ktime.h
 *
 *  ktime_t - nanosecond-resolution time format.
 *
 *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
 *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
 *
 *  data type definitions, declarations, prototypes and macros.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *
 *          Roman Zippel provided the ideas and primary code snippets of
 *          the ktime_t union and further simplifications of the original
 *          code.
 *
 *  For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_KTIME_H
#define _LINUX_KTIME_H

#include <linux/time.h>
#include <linux/jiffies.h>
#include <asm/bug.h>

/* Nanosecond scalar representation for kernel time values */
typedef s64        ktime_t;

/**
 * ktime_set - Build a ktime_t from a seconds/nanoseconds pair
 * @secs:        seconds to set
 * @nsecs:        nanoseconds to set
 *
 * Return: The ktime_t representation of the value, or KTIME_MAX when
 * @secs is at or above KTIME_SEC_MAX.
 */
static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs)
{
        return unlikely(secs >= KTIME_SEC_MAX) ?
                KTIME_MAX : secs * NSEC_PER_SEC + (s64)nsecs;
}

/* Subtract two ktime_t variables. res = lhs - rhs: */
#define ktime_sub(lhs, rhs)        ((lhs) - (rhs))

/* Add two ktime_t variables. res = lhs + rhs: */
#define ktime_add(lhs, rhs)        ((lhs) + (rhs))

/*
 * Same as ktime_add(), but avoids undefined behaviour on overflow by casting
 * lhs to u64 before adding; however, this means that you must check the
 * result for overflow yourself.
 */
#define ktime_add_unsafe(lhs, rhs)        ((u64) (lhs) + (rhs))

/*
 * Add a ktime_t variable and a scalar nanosecond value.
 * res = kt + nsval:
 */
#define ktime_add_ns(kt, nsval)                ((kt) + (nsval))

/*
 * Subtract a scalar nanosecond value from a ktime_t variable.
 * res = kt - nsval:
 */
#define ktime_sub_ns(kt, nsval)                ((kt) - (nsval))

/* convert a timespec64 to ktime_t format: */
static inline ktime_t timespec64_to_ktime(struct timespec64 ts)
{
        return ktime_set(ts.tv_sec, ts.tv_nsec);
}

/* Map the ktime_t to timespec64 conversion to the ns_to_timespec64() function */
#define ktime_to_timespec64(kt)                ns_to_timespec64((kt))

/* Convert ktime_t to nanoseconds; the value is returned unchanged. */
static inline s64 ktime_to_ns(const ktime_t kt)
{
        s64 ns = kt;

        return ns;
}

/**
 * ktime_compare - Three-way comparison of two ktime_t values
 * @cmp1:        comparable1
 * @cmp2:        comparable2
 *
 * Return:
 *   cmp1  < cmp2: -1
 *   cmp1 == cmp2:  0
 *   cmp1  > cmp2:  1
 */
static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2)
{
        /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
        return (cmp1 > cmp2) - (cmp1 < cmp2);
}

/**
 * ktime_after - Check whether one ktime_t value is bigger than another.
 * @cmp1:        comparable1
 * @cmp2:        comparable2
 *
 * Return: true if cmp1 happened after cmp2.
 */
static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2)
{
        return cmp1 > cmp2;
}

/**
 * ktime_before - Check whether one ktime_t value is smaller than another.
 * @cmp1:        comparable1
 * @cmp2:        comparable2
 *
 * Return: true if cmp1 happened before cmp2.
 */
static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2)
{
        return cmp1 < cmp2;
}

#if BITS_PER_LONG < 64
extern s64 __ktime_divns(const ktime_t kt, s64 div);
/*
 * Divide @kt by @div when BITS_PER_LONG < 64.  A negative @div triggers
 * BUG_ON().  A compile-time constant divisor whose upper 32 bits are zero
 * is handled inline with do_div(); every other divisor goes to the
 * out-of-line __ktime_divns().
 */
static inline s64 ktime_divns(const ktime_t kt, s64 div)
{
        /*
         * Negative divisors could cause an inf loop,
         * so bug out here.
         */
        BUG_ON(div < 0);
        if (__builtin_constant_p(div) && !(div >> 32)) {
                s64 ns = kt;
                /* do_div() operates on a u64, so divide the magnitude... */
                u64 tmp = ns < 0 ? -ns : ns;

                do_div(tmp, div);
                /* ...and put the sign of the dividend back on the quotient. */
                return ns < 0 ? -tmp : tmp;
        } else {
                return __ktime_divns(kt, div);
        }
}
#else /* BITS_PER_LONG < 64 */
/*
 * Divide @kt by @div with a plain 64-bit division.  A negative @div
 * triggers WARN_ON() but the division is still performed.
 */
static inline s64 ktime_divns(const ktime_t kt, s64 div)
{
        s64 quotient;

        /*
         * 32-bit implementation cannot handle negative divisors,
         * so catch them on 64bit as well.
         */
        WARN_ON(div < 0);
        quotient = kt / div;

        return quotient;
}
#endif

/* Convert @kt to microseconds by dividing it by NSEC_PER_USEC. */
static inline s64 ktime_to_us(const ktime_t kt)
{
        return ktime_divns(kt, NSEC_PER_USEC);
}

/* Convert @kt to milliseconds by dividing it by NSEC_PER_MSEC. */
static inline s64 ktime_to_ms(const ktime_t kt)
{
        return ktime_divns(kt, NSEC_PER_MSEC);
}

/* Return (@later - @earlier) converted to microseconds. */
static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier)
{
        const ktime_t delta = ktime_sub(later, earlier);

        return ktime_to_us(delta);
}

/* Return (@later - @earlier) converted to milliseconds. */
static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier)
{
        const ktime_t delta = ktime_sub(later, earlier);

        return ktime_to_ms(delta);
}

/* Add @usec microseconds, scaled by NSEC_PER_USEC, to @kt. */
static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec)
{
        const u64 nsec = usec * NSEC_PER_USEC;

        return ktime_add_ns(kt, nsec);
}

/* Add @msec milliseconds, scaled by NSEC_PER_MSEC, to @kt. */
static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec)
{
        const u64 nsec = msec * NSEC_PER_MSEC;

        return ktime_add_ns(kt, nsec);
}

/* Subtract @usec microseconds, scaled by NSEC_PER_USEC, from @kt. */
static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec)
{
        const u64 nsec = usec * NSEC_PER_USEC;

        return ktime_sub_ns(kt, nsec);
}

/* Subtract @msec milliseconds, scaled by NSEC_PER_MSEC, from @kt. */
static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec)
{
        const u64 nsec = msec * NSEC_PER_MSEC;

        return ktime_sub_ns(kt, nsec);
}

extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs);

/**
 * ktime_to_timespec64_cond - convert a ktime_t variable to timespec64
 *                            format only if the variable contains data
 * @kt:                the ktime_t variable to convert
 * @ts:                the timespec64 variable to store the result in
 *
 * @ts is left untouched when @kt is 0.
 *
 * Return: %true if there was a successful conversion, %false if kt was 0.
 */
static inline __must_check bool ktime_to_timespec64_cond(const ktime_t kt,
                                                       struct timespec64 *ts)
{
        if (!kt)
                return false;

        *ts = ktime_to_timespec64(kt);
        return true;
}

#include <vdso/ktime.h>

/* Convert a nanosecond count to ktime_t; the value is passed through as is. */
static inline ktime_t ns_to_ktime(u64 ns)
{
        ktime_t kt = ns;

        return kt;
}

/* Convert a millisecond count to ktime_t by scaling with NSEC_PER_MSEC. */
static inline ktime_t ms_to_ktime(u64 ms)
{
        const u64 ns = ms * NSEC_PER_MSEC;

        return ns;
}

# include <linux/timekeeping.h>
# include <linux/timekeeping32.h>

#endif

















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#ifndef _LINUX_JHASH_H
#define _LINUX_JHASH_H

/* jhash.h: Jenkins hash support.
 *
 * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
 *
 * https://burtleburtle.net/bob/hash/
 *
 * These are the credits from Bob's sources:
 *
 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
 *
 * These are functions for producing 32-bit hashes for hash table lookup.
 * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
 * are externally useful functions.  Routines to test the hash are included
 * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
 * the public domain.  It has no warranty.
 *
 * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org)
 *
 * I've modified Bob's hash to be useful in the Linux kernel, and
 * any bugs present are my fault.
 * Jozsef
 */
#include <linux/bitops.h>
#include <linux/unaligned/packed_struct.h>

/* Best hash sizes are powers of two: jhash_size(n) is 1 << n as a u32. */
#define jhash_size(n)   ((u32)1<<(n))
/* Mask the hash value, i.e. (value & jhash_mask(n)) instead of (value % n) */
#define jhash_mask(n)   (jhash_size(n)-1)

/*
 * __jhash_mix -- mix 3 32-bit values reversibly.
 *
 * Expands to a brace block that modifies a, b and c in place, so the
 * arguments must be lvalues; each one is evaluated several times.
 */
#define __jhash_mix(a, b, c)                        \
{                                                \
        a -= c;  a ^= rol32(c, 4);  c += b;        \
        b -= a;  b ^= rol32(a, 6);  a += c;        \
        c -= b;  c ^= rol32(b, 8);  b += a;        \
        a -= c;  a ^= rol32(c, 16); c += b;        \
        b -= a;  b ^= rol32(a, 19); a += c;        \
        c -= b;  c ^= rol32(b, 4);  b += a;        \
}

/*
 * __jhash_final - final mixing of 3 32-bit values (a,b,c) into c
 *
 * Expands to a brace block that modifies a, b and c in place, so the
 * arguments must be lvalues; each one is evaluated several times.
 */
#define __jhash_final(a, b, c)                        \
{                                                \
        c ^= b; c -= rol32(b, 14);                \
        a ^= c; a -= rol32(c, 11);                \
        b ^= a; b -= rol32(a, 25);                \
        c ^= b; c -= rol32(b, 16);                \
        a ^= c; a -= rol32(c, 4);                \
        b ^= a; b -= rol32(a, 14);                \
        c ^= b; c -= rol32(b, 24);                \
}

/* An arbitrary initial parameter */
#define JHASH_INITVAL                0xdeadbeef

/* jhash - hash an arbitrary key
 * @key: sequence of bytes as key
 * @length: the length of the key in bytes
 * @initval: the previous hash, or an arbitrary value
 *
 * The generic version, hashes an arbitrary sequence of bytes.
 * No alignment or length assumptions are made about the input key.
 *
 * Returns the hash value of the key. The result depends on endianness.
 */
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
        u32 a, b, c;
        const u8 *k = key;

        /* Set up the internal state */
        a = b = c = JHASH_INITVAL + length + initval;

        /* All but the last block: affect some 32 bits of (a,b,c) */
        while (length > 12) {
                a += __get_unaligned_cpu32(k);
                b += __get_unaligned_cpu32(k + 4);
                c += __get_unaligned_cpu32(k + 8);
                __jhash_mix(a, b, c);
                length -= 12;
                k += 12;
        }
        /*
         * Last block: affect all 32 bits of (c).  The remaining 1..12 bytes
         * are folded in one at a time, highest index first, each case
         * falling through to the next.
         */
        switch (length) {
        case 12: c += (u32)k[11]<<24;        fallthrough;
        case 11: c += (u32)k[10]<<16;        fallthrough;
        case 10: c += (u32)k[9]<<8;        fallthrough;
        case 9:  c += k[8];                fallthrough;
        case 8:  b += (u32)k[7]<<24;        fallthrough;
        case 7:  b += (u32)k[6]<<16;        fallthrough;
        case 6:  b += (u32)k[5]<<8;        fallthrough;
        case 5:  b += k[4];                fallthrough;
        case 4:  a += (u32)k[3]<<24;        fallthrough;
        case 3:  a += (u32)k[2]<<16;        fallthrough;
        case 2:  a += (u32)k[1]<<8;        fallthrough;
        case 1:  a += k[0];
                 __jhash_final(a, b, c);
                 break;
        case 0: /* Nothing left to add; the final mix is skipped */
                break;
        }

        return c;
}

/* jhash2 - hash an array of u32's
 * @k: the key which must be an array of u32's
 * @length: the number of u32's in the key
 * @initval: the previous hash, or an arbitrary value
 *
 * Returns the hash value of the key.
 */
static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
{
        u32 a, b, c;

        /* All three lanes start from the same seed */
        a = b = c = JHASH_INITVAL + (length<<2) + initval;

        /* Mix whole groups of three words while more than three remain */
        for (; length > 3; length -= 3, k += 3) {
                a += k[0];
                b += k[1];
                c += k[2];
                __jhash_mix(a, b, c);
        }

        /* Nothing left to add: return without the final mix */
        if (length == 0)
                return c;

        /* Fold in the last 1..3 words, highest index first */
        if (length == 3)
                c += k[2];
        if (length >= 2)
                b += k[1];
        a += k[0];
        __jhash_final(a, b, c);

        return c;
}


/*
 * __jhash_nwords - hash exactly 3, 2 or 1 word(s)
 *
 * Adds @initval to each of the three words, runs the final mix and
 * returns the resulting c.
 */
static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
        c += initval;
        b += initval;
        a += initval;

        __jhash_final(a, b, c);

        return c;
}

/* Hash three words; the seed adds JHASH_INITVAL and the byte length (3 << 2). */
static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (3 << 2);

        return __jhash_nwords(a, b, c, seed);
}

/* Hash two words; the seed adds JHASH_INITVAL and the byte length (2 << 2). */
static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (2 << 2);

        return __jhash_nwords(a, b, 0, seed);
}

/* Hash one word; the seed adds JHASH_INITVAL and the byte length (1 << 2). */
static inline u32 jhash_1word(u32 a, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (1 << 2);

        return __jhash_nwords(a, 0, 0, seed);
}

#endif /* _LINUX_JHASH_H */






























































































































































    1 































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MM_PERCPU_INTERNAL_H
#define _MM_PERCPU_INTERNAL_H

#include <linux/types.h>
#include <linux/percpu.h>

/*
 * There are two chunk types: root and memcg-aware.
 * Chunks of each type have a separate slot list.
 *
 * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
 * used to store memcg membership data of a percpu object.  Obj_cgroups are
 * ref-counted pointers to a memory cgroup with the ability to switch
 * dynamically to the parent memory cgroup.  This allows a deleted memory
 * cgroup to be reclaimed without reclaiming all outstanding objects that
 * hold a reference to it.
 */
enum pcpu_chunk_type {
        PCPU_CHUNK_ROOT,
#ifdef CONFIG_MEMCG_KMEM
        PCPU_CHUNK_MEMCG,        /* only exists with CONFIG_MEMCG_KMEM */
#endif
        PCPU_NR_CHUNK_TYPES,        /* number of chunk types listed above */
        /* shares the value of PCPU_NR_CHUNK_TYPES, so it matches no real type */
        PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
};

/*
 * pcpu_block_md is the metadata block struct.
 * Each chunk's bitmap is split into a number of full blocks.
 * All units are in terms of bits.
 *
 * The scan hint is the largest known contiguous area before the contig hint.
 * It is not necessarily the actual largest contig hint though.  There is an
 * invariant that the scan_hint_start > contig_hint_start iff
 * scan_hint == contig_hint.  This is necessary because when scanning forward,
 * we don't know if a new contig hint would be better than the current one.
 */
struct pcpu_block_md {
        int                        scan_hint;        /* scan hint for block */
        int                        scan_hint_start; /* block relative starting
                                                    position of the scan hint */
        int                     contig_hint;    /* contig hint for block */
        int                     contig_hint_start; /* block relative starting
                                                      position of the contig hint */
        int                     left_free;      /* size of free space along
                                                   the left side of the block */
        int                     right_free;     /* size of free space along
                                                   the right side of the block */
        int                     first_free;     /* block position of the first
                                                   free bit */
        int                        nr_bits;        /* total bits this block is
                                                   responsible for */
};

/*
 * Per-chunk bookkeeping: list linkage, free-space accounting, the
 * allocation and boundary bitmaps with their metadata blocks, and the
 * page population state.  The populated bitmap is a flexible array member
 * and must stay last.
 */
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
        int                        nr_alloc;        /* # of allocations */
        size_t                        max_alloc_size; /* largest allocation size */
#endif

        struct list_head        list;                /* linked to pcpu_slot lists */
        int                        free_bytes;        /* free bytes in the chunk */
        struct pcpu_block_md        chunk_md;
        void                        *base_addr;        /* base address of this chunk */

        unsigned long                *alloc_map;        /* allocation map */
        unsigned long                *bound_map;        /* boundary map */
        struct pcpu_block_md        *md_blocks;        /* metadata blocks */

        void                        *data;                /* chunk data */
        bool                        immutable;        /* no [de]population allowed */
        int                        start_offset;        /* the overlap with the previous
                                                   region to have a page aligned
                                                   base_addr */
        int                        end_offset;        /* additional area required to
                                                   have the region end page
                                                   aligned */
#ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup        **obj_cgroups;        /* vector of object cgroups */
#endif

        int                        nr_pages;        /* # of pages served by this chunk */
        int                        nr_populated;        /* # of populated pages */
        int                     nr_empty_pop_pages; /* # of empty populated pages */
        unsigned long                populated[];        /* populated bitmap */
};

extern spinlock_t pcpu_lock;

extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_nr_empty_pop_pages[];

extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;

/**
 * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
 * @chunk: chunk of interest
 *
 * Returns the number of bitmap metadata blocks covering the chunk: the
 * bytes spanned by its pages divided by PCPU_BITMAP_BLOCK_SIZE.
 */
static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
{
        const int nr_pages = chunk->nr_pages;

        return (nr_pages * PAGE_SIZE) / PCPU_BITMAP_BLOCK_SIZE;
}

/**
 * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
 * @pages: number of physical pages
 *
 * Returns the number of bitmap bits needed for @pages pages: one bit per
 * PCPU_MIN_ALLOC_SIZE bytes.
 */
static inline int pcpu_nr_pages_to_map_bits(int pages)
{
        return (pages * PAGE_SIZE) / PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
 * @chunk: chunk of interest
 *
 * Returns pcpu_nr_pages_to_map_bits() applied to the number of pages the
 * chunk serves.
 */
static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
{
        const int nr_pages = chunk->nr_pages;

        return pcpu_nr_pages_to_map_bits(nr_pages);
}

#ifdef CONFIG_MEMCG_KMEM
/* A chunk with an obj_cgroups vector is memcg-aware; any other is a root chunk. */
static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
{
        return chunk->obj_cgroups ? PCPU_CHUNK_MEMCG : PCPU_CHUNK_ROOT;
}

static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
{
        return chunk_type == PCPU_CHUNK_MEMCG;
}

#else
/* Without CONFIG_MEMCG_KMEM every chunk is a root chunk. */
static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
{
        return PCPU_CHUNK_ROOT;
}

/* Without CONFIG_MEMCG_KMEM no chunk type is memcg-aware. */
static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
{
        return false;
}
#endif

/*
 * Return the start of the slot lists for @chunk_type inside
 * pcpu_chunk_lists: index 0 for non-memcg types, index pcpu_nr_slots for
 * the memcg type.
 */
static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
{
        const int first_slot = pcpu_nr_slots * pcpu_is_memcg_chunk(chunk_type);

        return &pcpu_chunk_lists[first_slot];
}

#ifdef CONFIG_PERCPU_STATS

#include <linux/spinlock.h>

/* Global percpu allocator statistics, kept only with CONFIG_PERCPU_STATS. */
struct percpu_stats {
        u64 nr_alloc;                /* lifetime # of allocations */
        u64 nr_dealloc;                /* lifetime # of deallocations */
        u64 nr_cur_alloc;        /* current # of allocations */
        u64 nr_max_alloc;        /* max # of live allocations */
        u32 nr_chunks;                /* current # of live chunks */
        u32 nr_max_chunks;        /* max # of live chunks */
        size_t min_alloc_size;        /* min allocation size */
        size_t max_alloc_size;        /* max allocation size */
};

extern struct percpu_stats pcpu_stats;
extern struct pcpu_alloc_info pcpu_stats_ai;

/*
 * For debug purposes. We don't care about the flexible array.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
        memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));

        /* initialize min_alloc_size to unit_size */
        pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
}

/*
 * pcpu_stats_area_alloc - increment area allocation stats
 * @chunk: the location of the area being allocated
 * @size: size of area to allocate in bytes
 *
 * Bumps the global and per-chunk allocation counters and refreshes the
 * recorded high-water mark and min/max allocation sizes.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
        lockdep_assert_held(&pcpu_lock);

        pcpu_stats.nr_alloc++;
        pcpu_stats.nr_cur_alloc++;

        if (pcpu_stats.nr_cur_alloc > pcpu_stats.nr_max_alloc)
                pcpu_stats.nr_max_alloc = pcpu_stats.nr_cur_alloc;
        if (size < pcpu_stats.min_alloc_size)
                pcpu_stats.min_alloc_size = size;
        if (size > pcpu_stats.max_alloc_size)
                pcpu_stats.max_alloc_size = size;

        chunk->nr_alloc++;
        if (size > chunk->max_alloc_size)
                chunk->max_alloc_size = size;
}

/*
 * pcpu_stats_area_dealloc - decrement allocation stats
 * @chunk: the location of the area being deallocated
 *
 * Drops the per-chunk and global live-allocation counts and bumps the
 * lifetime deallocation counter.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        chunk->nr_alloc--;

        pcpu_stats.nr_cur_alloc--;
        pcpu_stats.nr_dealloc++;
}

/*
 * pcpu_stats_chunk_alloc - increment chunk stats
 *
 * Takes pcpu_lock with interrupts saved, bumps the live chunk count and
 * raises the recorded maximum if the new count exceeds it.
 */
static inline void pcpu_stats_chunk_alloc(void)
{
        unsigned long flags;

        spin_lock_irqsave(&pcpu_lock, flags);

        pcpu_stats.nr_chunks++;
        if (pcpu_stats.nr_chunks > pcpu_stats.nr_max_chunks)
                pcpu_stats.nr_max_chunks = pcpu_stats.nr_chunks;

        spin_unlock_irqrestore(&pcpu_lock, flags);
}

/*
 * pcpu_stats_chunk_dealloc - decrement chunk stats
 *
 * Takes pcpu_lock with interrupts saved and drops the live chunk count.
 */
static inline void pcpu_stats_chunk_dealloc(void)
{
        unsigned long flags;

        spin_lock_irqsave(&pcpu_lock, flags);
        --pcpu_stats.nr_chunks;
        spin_unlock_irqrestore(&pcpu_lock, flags);
}

#else

/* Stats are compiled out: nothing to save. */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { }

/* Stats are compiled out: area allocations are not counted. */
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { }

/* Stats are compiled out: area deallocations are not counted. */
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { }

/* Stats are compiled out: chunk creation is not counted. */
static inline void pcpu_stats_chunk_alloc(void) { }

/* Stats are compiled out: chunk destruction is not counted. */
static inline void pcpu_stats_chunk_dealloc(void) { }

#endif /* !CONFIG_PERCPU_STATS */

#endif































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * net/dst.h        Protocol independent destination cache definitions.
 *
 * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#ifndef _NET_DST_H
#define _NET_DST_H

#include <net/dst_ops.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/refcount.h>
#include <net/neighbour.h>
#include <asm/processor.h>

struct sk_buff;

/*
 * struct dst_entry - protocol independent destination cache entry
 *
 * Field order matters: dst_hold() contains a BUILD_BUG_ON() that requires
 * __refcnt to sit at an offset that is a multiple of 64 bytes, which is why
 * __refcnt is declared in a different place for 64-bit and 32-bit builds.
 */
struct dst_entry {
        struct net_device       *dev;
        struct  dst_ops                *ops;
        /* metrics pointer with DST_METRICS_FLAGS in the low bits, see DST_METRICS_PTR() */
        unsigned long                _metrics;
        /* in jiffies; dst_set_expires() treats 0 as "not set" */
        unsigned long           expires;
#ifdef CONFIG_XFRM
        struct xfrm_state        *xfrm;
#else
        void                        *__pad1;
#endif
        /* handlers invoked by dst_input() / dst_output() */
        int                        (*input)(struct sk_buff *);
        int                        (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);

        unsigned short                flags;
#define DST_NOXFRM                0x0002
#define DST_NOPOLICY                0x0004
#define DST_NOCOUNT                0x0008
#define DST_FAKE_RTABLE                0x0010
#define DST_XFRM_TUNNEL                0x0020
#define DST_XFRM_QUEUE                0x0040
#define DST_METADATA                0x0080

        /* A non-zero value of dst->obsolete forces by-hand validation
         * of the route entry.  Positive values are set by the generic
         * dst layer to indicate that the entry has been forcefully
         * destroyed.
         *
         * Negative values are used by the implementation layer code to
         * force invocation of the dst_ops->check() method.
         */
        short                        obsolete;
#define DST_OBSOLETE_NONE        0
#define DST_OBSOLETE_DEAD        2
#define DST_OBSOLETE_FORCE_CHK        -1
#define DST_OBSOLETE_KILL        -2
        unsigned short                header_len;        /* more space at head required */
        unsigned short                trailer_len;        /* space to reserve at tail */

        /*
         * __refcnt wants to be on a different cache line from
         * input/output/ops or performance tanks badly
         */
#ifdef CONFIG_64BIT
        atomic_t                __refcnt;        /* 64-bit offset 64 */
#endif
        /* __use and lastuse are updated together by dst_use_noref() */
        int                        __use;
        unsigned long                lastuse;
        struct lwtunnel_state   *lwtstate;
        struct rcu_head                rcu_head;
        short                        error;
        short                        __pad;
        /* reported by dst_tclassid() when CONFIG_IP_ROUTE_CLASSID is set */
        __u32                        tclassid;
#ifndef CONFIG_64BIT
        atomic_t                __refcnt;        /* 32-bit offset 64 */
#endif
};

/*
 * struct dst_metrics - metrics array plus a reference count
 *
 * Aligned to 4 bytes so that the two low bits of a pointer to it are free
 * to carry DST_METRICS_FLAGS.
 */
struct dst_metrics {
        u32                metrics[RTAX_MAX];
        refcount_t        refcnt;
} __aligned(4);                /* Low pointer bits contain DST_METRICS_FLAGS */
extern const struct dst_metrics dst_default_metrics;

u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);

/*
 * Flag bits kept in the low bits of dst_entry::_metrics.
 * DST_METRICS_FLAGS is the mask covering both of them.
 */
#define DST_METRICS_READ_ONLY                0x1UL
#define DST_METRICS_REFCOUNTED                0x2UL
#define DST_METRICS_FLAGS                0x3UL
/* Strip the flag bits from a raw _metrics word and yield the u32 array. */
#define __DST_METRICS_PTR(Y)        \
        ((u32 *)((Y) & ~DST_METRICS_FLAGS))
/* Same as __DST_METRICS_PTR(), starting from a dst_entry pointer. */
#define DST_METRICS_PTR(X)        __DST_METRICS_PTR((X)->_metrics)

/* Tell whether the metrics attached to @dst carry the read-only flag. */
static inline bool dst_metrics_read_only(const struct dst_entry *dst)
{
        const unsigned long word = dst->_metrics;

        return (word & DST_METRICS_READ_ONLY) != 0;
}

void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);

/*
 * Hand the metrics of @dst to __dst_destroy_metrics_generic(), unless they
 * are flagged read-only, in which case nothing is done.
 */
static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
{
        unsigned long metrics = dst->_metrics;

        if (metrics & DST_METRICS_READ_ONLY)
                return;

        __dst_destroy_metrics_generic(dst, metrics);
}

/*
 * Return a metrics array that may be written to.
 *
 * Writable metrics are returned directly (flag bits stripped).  Read-only
 * metrics are passed to the ops->cow_metrics() callback, whose result is
 * returned as is.  A zero _metrics word is a BUG.
 */
static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
{
        unsigned long raw = dst->_metrics;

        BUG_ON(!raw);

        if (!(raw & DST_METRICS_READ_ONLY))
                return __DST_METRICS_PTR(raw);

        return dst->ops->cow_metrics(dst, raw);
}

/* Attach @src_metrics to @dst, optionally flagging them read-only.
 *
 * This may only be invoked before the entry has reached global
 * visibility.
 */
static inline void dst_init_metrics(struct dst_entry *dst,
                                    const u32 *src_metrics,
                                    bool read_only)
{
        unsigned long word = (unsigned long)src_metrics;

        if (read_only)
                word |= DST_METRICS_READ_ONLY;

        dst->_metrics = word;
}

/*
 * Copy all RTAX_MAX metrics from @src into the writable metrics of @dest.
 * Nothing is copied when no writable array could be obtained for @dest.
 */
static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
{
        u32 *to = dst_metrics_write_ptr(dest);

        if (!to)
                return;

        memcpy(to, DST_METRICS_PTR(src), RTAX_MAX * sizeof(u32));
}

static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
{
        return DST_METRICS_PTR(dst);
}

/* Read metric number @metric (1-based) straight from the metrics array. */
static inline u32 dst_metric_raw(const struct dst_entry *dst, const int metric)
{
        const u32 *table = DST_METRICS_PTR(dst);

        return table[metric - 1];
}

/*
 * Read metric @metric of @dst.
 *
 * RTAX_HOPLIMIT, RTAX_ADVMSS and RTAX_MTU are not expected to be read
 * through this helper; doing so triggers a one-time warning, but the raw
 * value is still returned.
 */
static inline u32 dst_metric(const struct dst_entry *dst, const int metric)
{
        WARN_ON_ONCE(metric == RTAX_HOPLIMIT || metric == RTAX_ADVMSS ||
                     metric == RTAX_MTU);

        return dst_metric_raw(dst, metric);
}

/*
 * Return the RTAX_ADVMSS metric of @dst, falling back to the
 * ops->default_advmss() callback when the stored metric is zero.
 */
static inline u32 dst_metric_advmss(const struct dst_entry *dst)
{
        const u32 stored = dst_metric_raw(dst, RTAX_ADVMSS);

        return stored ? stored : dst->ops->default_advmss(dst);
}

/*
 * Store @val as metric number @metric (1-based) of @dst.  The write is
 * skipped when no writable metrics array could be obtained.
 */
static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
{
        u32 *table = dst_metrics_write_ptr(dst);

        if (!table)
                return;

        table[metric - 1] = val;
}

/* Kernel-internal feature bits that are unallocated in user space. */
#define DST_FEATURE_ECN_CA        (1U << 31)

/* All kernel-internal feature bits (currently only DST_FEATURE_ECN_CA). */
#define DST_FEATURE_MASK        (DST_FEATURE_ECN_CA)
/* Kernel-internal ECN bit combined with the RTAX_FEATURE_ECN bit. */
#define DST_FEATURE_ECN_MASK        (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)

/* Return the bits of @feature that are set in the RTAX_FEATURES metric. */
static inline u32 dst_feature(const struct dst_entry *dst, u32 feature)
{
        const u32 enabled = dst_metric(dst, RTAX_FEATURES);

        return enabled & feature;
}

static inline u32 dst_mtu(const struct dst_entry *dst)
{
        return dst->ops->mtu(dst);
}

/* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */
static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric)
{
        return msecs_to_jiffies(dst_metric(dst, metric));
}

/* Return the RTAX_FEATURE_ALLFRAG bit of the features metric (0 if clear). */
static inline u32 dst_allfrag(const struct dst_entry *dst)
{
        return dst_feature(dst, RTAX_FEATURE_ALLFRAG);
}

static inline int
dst_metric_locked(const struct dst_entry *dst, int metric)
{
        return dst_metric(dst, RTAX_LOCK) & (1 << metric);
}

/*
 * Take a reference on @dst.  Warns if the reference count was already
 * zero, in which case no reference is taken.
 */
static inline void dst_hold(struct dst_entry *dst)
{
        /*
         * A build failure on the next line means __refcnt has moved
         * inside struct dst_entry: its offset must be a multiple of 64.
         */
        BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);

        WARN_ON(!atomic_inc_not_zero(&dst->__refcnt));
}

/*
 * Record a use of @dst at @time without touching the reference count.
 * The use counter only advances when @time differs from the last
 * recorded use.
 */
static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
{
        if (unlikely(time != dst->lastuse)) {
                dst->lastuse = time;
                dst->__use++;
        }
}

/* Take a reference on @dst if it is non-NULL and return it unchanged. */
static inline struct dst_entry *dst_clone(struct dst_entry *dst)
{
        if (!dst)
                return dst;

        dst_hold(dst);
        return dst;
}

void dst_release(struct dst_entry *dst);

void dst_release_immediate(struct dst_entry *dst);

/*
 * Release the dst encoded in @refdst, unless the SKB_DST_NOREF bit says
 * that no reference was taken for it.
 */
static inline void refdst_drop(unsigned long refdst)
{
        struct dst_entry *dst;

        if (refdst & SKB_DST_NOREF)
                return;

        dst = (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
        dst_release(dst);
}

/**
 * skb_dst_drop - detach the dst from an skb
 * @skb: buffer
 *
 * Drops the dst reference if one was taken and clears the skb's
 * dst field.  Does nothing when no dst is attached.
 */
static inline void skb_dst_drop(struct sk_buff *skb)
{
        unsigned long refdst = skb->_skb_refdst;

        if (!refdst)
                return;

        refdst_drop(refdst);
        skb->_skb_refdst = 0UL;
}

/*
 * Install @refdst on @nskb.  When @refdst is a counted reference (the
 * SKB_DST_NOREF bit is clear) an additional reference is taken for @nskb.
 */
static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst)
{
        nskb->_skb_refdst = refdst;

        if (refdst & SKB_DST_NOREF)
                return;

        dst_clone(skb_dst(nskb));
}

/* Give @nskb the same dst as @oskb, see __skb_dst_copy(). */
static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
{
        const unsigned long refdst = oskb->_skb_refdst;

        __skb_dst_copy(nskb, refdst);
}

/**
 * dst_hold_safe - Take a reference on a dst if possible
 * @dst: pointer to dst entry
 *
 * The reference is only taken when the reference count is not
 * already zero.  Returns false if no reference could be taken.
 */
static inline bool dst_hold_safe(struct dst_entry *dst)
{
        return atomic_inc_not_zero(&dst->__refcnt) != 0;
}

/**
 * skb_dst_force - makes sure skb dst is refcounted
 * @skb: buffer
 *
 * If the dst is attached without a reference, try to grab one; when
 * that fails the dst is detached from the skb.
 * Returns true if the skb ends up with a (refcounted) dst.
 */
static inline bool skb_dst_force(struct sk_buff *skb)
{
        struct dst_entry *dst;

        if (!skb_dst_is_noref(skb))
                return skb->_skb_refdst != 0UL;

        dst = skb_dst(skb);

        WARN_ON(!rcu_read_lock_held());
        if (!dst_hold_safe(dst))
                dst = NULL;

        /* storing the bare pointer also clears the noref marking */
        skb->_skb_refdst = (unsigned long)dst;

        return skb->_skb_refdst != 0UL;
}


/**
 *        __skb_tunnel_rx - prepare skb for rx reinsert
 *        @skb: buffer
 *        @dev: tunnel device
 *        @net: netns for packet i/o
 *
 *        After decapsulation the packet re-enters our stack (netif_rx()),
 *        so clean it up first.  No accounting is done here.
 */
static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                   struct net *net)
{
        int cross_netns;

        skb->dev = dev;

        /*
         * Clear the hash so that it can be recalculated for the
         * encapsulated packet, unless the hash has already been
         * determined over the L4 4-tuple.
         */
        skb_clear_hash_if_not_l4(skb);
        skb_set_queue_mapping(skb, 0);

        /* scrub as "crossing namespaces" when @net is not the device's netns */
        cross_netns = !net_eq(net, dev_net(dev));
        skb_scrub_packet(skb, cross_netns);
}

/**
 *        skb_tunnel_rx - prepare skb for rx reinsert
 *        @skb: buffer
 *        @dev: tunnel device
 *        @net: netns for packet i/o
 *
 *        After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *        so make some cleanups, and perform accounting.
 *        Note: this accounting is not SMP safe.
 */
static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                 struct net *net)
{
        /* account one received packet of skb->len bytes on @dev */
        DEV_STATS_INC(dev, rx_packets);
        DEV_STATS_ADD(dev, rx_bytes, skb->len);
        /* the cleanup itself is shared with the non-accounting variant */
        __skb_tunnel_rx(skb, dev, net);
}

/*
 * Return the tclassid of the dst attached to @skb.  Evaluates to 0 when
 * no dst is attached or when CONFIG_IP_ROUTE_CLASSID is disabled.
 */
static inline u32 dst_tclassid(const struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
        const struct dst_entry *route = skb_dst(skb);

        return route ? route->tclassid : 0;
#else
        return 0;
#endif
}

int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
/* Input-side wrapper: forward to dst_discard_out() with init_net and skb->sk. */
static inline int dst_discard(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        return dst_discard_out(&init_net, sk, skb);
}
void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref,
                int initial_obsolete, unsigned short flags);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
              struct net_device *dev, int initial_ref, int initial_obsolete,
              unsigned short flags);
struct dst_entry *dst_destroy(struct dst_entry *dst);
void dst_dev_put(struct dst_entry *dst);

/* Empty body: calling this has no effect. */
static inline void dst_confirm(struct dst_entry *dst) { }

/*
 * Look up the neighbour entry for @daddr through the ops->neigh_lookup()
 * callback of @dst.
 *
 * Returns NULL when the dst's ops provide no neigh_lookup callback (same
 * guard as dst_neigh_lookup_skb()) or when the callback reports an error
 * pointer; otherwise returns the callback's result.
 */
static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
        struct neighbour *n = NULL;

        /* do not call through a missing callback */
        if (dst->ops->neigh_lookup)
                n = dst->ops->neigh_lookup(dst, NULL, daddr);

        return IS_ERR(n) ? NULL : n;
}

/*
 * Look up the neighbour entry for @skb through the ops->neigh_lookup()
 * callback of @dst.  Returns NULL when there is no callback or when the
 * callback reports an error pointer.
 */
static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst,
                                                     struct sk_buff *skb)
{
        struct neighbour *found;

        /* Packets from tunnel devices (e.g. bareudp) may carry only
         * metadata in the skb's dst pointer, so the neigh_lookup
         * callback has to be checked before it is used.
         */
        if (!dst->ops->neigh_lookup)
                return NULL;

        found = dst->ops->neigh_lookup(dst, skb, NULL);
        if (IS_ERR(found))
                return NULL;

        return found;
}

/* Invoke the optional ops->confirm_neigh() callback for @daddr. */
static inline void dst_confirm_neigh(const struct dst_entry *dst,
                                     const void *daddr)
{
        if (!dst->ops->confirm_neigh)
                return;

        dst->ops->confirm_neigh(dst, daddr);
}

/*
 * Report a link failure for @skb through the ops->link_failure()
 * callback of its dst.  Silently does nothing when the skb has no dst,
 * the dst has no ops, or the ops have no such callback.
 */
static inline void dst_link_failure(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops || !dst->ops->link_failure)
                return;

        dst->ops->link_failure(skb);
}

/*
 * Make @dst expire @timeout jiffies from now, unless it is already set
 * to expire earlier than that.
 */
static inline void dst_set_expires(struct dst_entry *dst, int timeout)
{
        unsigned long when = jiffies + timeout;

        /* 0 is treated as "not set" below, so never store it */
        if (!when)
                when = 1;

        if (!dst->expires || time_before(when, dst->expires))
                dst->expires = when;
}

/* Output packet to network from transport.  */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        return skb_dst(skb)->output(net, sk, skb);
}

/* Input packet from network to transport.  */
static inline int dst_input(struct sk_buff *skb)
{
        return skb_dst(skb)->input(skb);
}

/*
 * Validate @dst.  An entry whose obsolete field is zero is returned as
 * is; otherwise the result of the ops->check() callback is returned.
 */
static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
{
        if (!dst->obsolete)
                return dst;

        return dst->ops->check(dst, cookie);
}

/* Flags for xfrm_lookup flags argument (one bit each, may be OR-ed). */
enum {
        XFRM_LOOKUP_ICMP         = 0x01,
        XFRM_LOOKUP_QUEUE        = 0x02,
        XFRM_LOOKUP_KEEP_DST_REF = 0x04,
};

struct flowi;
#ifndef CONFIG_XFRM
/* Without CONFIG_XFRM there is nothing to look up: hand back @dst_orig. */
static inline struct dst_entry *
xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
            const struct flowi *fl, const struct sock *sk, int flags)
{
        return dst_orig;
}

/* Without CONFIG_XFRM @if_id is irrelevant: hand back @dst_orig. */
static inline struct dst_entry *
xfrm_lookup_with_ifid(struct net *net,
                      struct dst_entry *dst_orig,
                      const struct flowi *fl,
                      const struct sock *sk,
                      int flags,
                      u32 if_id)
{
        return dst_orig;
}

/* Without CONFIG_XFRM there is nothing to look up: hand back @dst_orig. */
static inline struct dst_entry *
xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
                  const struct flowi *fl, const struct sock *sk, int flags)
{
        return dst_orig;
}

/* Without CONFIG_XFRM a dst never carries a transformation state. */
static inline struct xfrm_state *
dst_xfrm(const struct dst_entry *dst)
{
        return NULL;
}

#else
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, const struct sock *sk,
                              int flags);

struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
                                        struct dst_entry *dst_orig,
                                        const struct flowi *fl,
                                        const struct sock *sk, int flags,
                                        u32 if_id);

struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
                                    const struct flowi *fl, const struct sock *sk,
                                    int flags);

/*
 * Return the transformation state attached to @dst.  An skb using this
 * dst needs transformation if the returned state is valid.
 */
static inline struct xfrm_state *
dst_xfrm(const struct dst_entry *dst)
{
        return dst->xfrm;
}
#endif

/*
 * Update the path MTU of the dst attached to @skb to @mtu, with neighbour
 * confirmation.  No-op when there is no dst or no update_pmtu callback.
 */
static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops->update_pmtu)
                return;

        dst->ops->update_pmtu(dst, NULL, skb, mtu, true);
}

/*
 * Update the path MTU of the dst attached to @skb to @mtu, but without
 * neighbour confirmation.  No-op when there is no dst or no update_pmtu
 * callback.
 */
static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops->update_pmtu)
                return;

        dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}

struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu, bool confirm_neigh);
void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                            struct sk_buff *skb);
u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old);
struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
                                             struct sk_buff *skb,
                                             const void *daddr);
unsigned int dst_blackhole_mtu(const struct dst_entry *dst);

#endif /* _NET_DST_H */


















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kmem

#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KMEM_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * kmem_alloc - event class for allocation events
 *
 * Records the call site, the returned pointer, the requested and the
 * actually allocated number of bytes, and the GFP flags.  The call site is
 * printed symbolically (%pS) and the flags via show_gfp_flags().
 */
DECLARE_EVENT_CLASS(kmem_alloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 size_t bytes_req,
                 size_t bytes_alloc,
                 gfp_t gfp_flags),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        gfp_t,                gfp_flags        )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = bytes_req;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = gfp_flags;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags))
);

/* kmalloc event, defined against the kmem_alloc event class. */
DEFINE_EVENT(kmem_alloc, kmalloc,

        TP_PROTO(unsigned long call_site, const void *ptr,
                 size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
);

/* kmem_cache_alloc event, defined against the kmem_alloc event class. */
DEFINE_EVENT(kmem_alloc, kmem_cache_alloc,

        TP_PROTO(unsigned long call_site, const void *ptr,
                 size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
);

/*
 * kmem_alloc_node - event class for allocation events that carry a node
 *
 * Same record as the kmem_alloc class (call site, pointer, requested and
 * allocated bytes, GFP flags) with an additional integer node field, which
 * is printed last.
 */
DECLARE_EVENT_CLASS(kmem_alloc_node,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 size_t bytes_req,
                 size_t bytes_alloc,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        gfp_t,                gfp_flags        )
                __field(        int,                node                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = bytes_req;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = gfp_flags;
                __entry->node                = node;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node)
);

/* kmalloc_node event, defined against the kmem_alloc_node event class. */
DEFINE_EVENT(kmem_alloc_node, kmalloc_node,

        TP_PROTO(unsigned long call_site, const void *ptr,
                 size_t bytes_req, size_t bytes_alloc,
                 gfp_t gfp_flags, int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
);

/* kmem_cache_alloc_node event, defined against the kmem_alloc_node event class. */
DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node,

        TP_PROTO(unsigned long call_site, const void *ptr,
                 size_t bytes_req, size_t bytes_alloc,
                 gfp_t gfp_flags, int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
);

/*
 * kmem_free - event class for free events
 *
 * Records only the call site (printed symbolically with %pS) and the
 * pointer being freed.
 */
DECLARE_EVENT_CLASS(kmem_free,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
        ),

        TP_printk("call_site=%pS ptr=%p",
                  (void *)__entry->call_site, __entry->ptr)
);

/* kfree event, defined against the kmem_free event class. */
DEFINE_EVENT(kmem_free, kfree,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr)
);

/* kmem_cache_free event, defined against the kmem_free event class. */
DEFINE_EVENT(kmem_free, kmem_cache_free,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr)
);

/*
 * mm_page_free - a page of the given order is being freed
 *
 * Only the pfn of @page and the order are stored; the page pointer shown
 * in the output is recomputed from the pfn with pfn_to_page() at print
 * time.  @page is passed to page_to_pfn() without a NULL check.
 */
TRACE_EVENT(mm_page_free,

        TP_PROTO(struct page *page, unsigned int order),

        TP_ARGS(page, order),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
                __entry->order                = order;
        ),

        TP_printk("page=%p pfn=%lu order=%d",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn,
                        __entry->order)
);

/*
 * mm_page_free_batched - batched-free variant of the page free event
 *
 * Stores only the pfn of @page.  The order is not recorded; the output
 * always reports it as the literal "order=0".
 */
TRACE_EVENT(mm_page_free_batched,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
        ),

        TP_printk("page=%p pfn=%lu order=0",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn)
);

/*
 * mm_page_alloc - result of a page allocation
 *
 * Stores the pfn, order, GFP flags and migratetype.  A NULL @page is
 * recorded with the sentinel pfn -1UL; at print time the sentinel is
 * turned into a NULL page pointer and a pfn of 0.
 */
TRACE_EVENT(mm_page_alloc,

        TP_PROTO(struct page *page, unsigned int order,
                        gfp_t gfp_flags, int migratetype),

        TP_ARGS(page, order, gfp_flags, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        gfp_t,                gfp_flags        )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->gfp_flags        = gfp_flags;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                show_gfp_flags(__entry->gfp_flags))
);

/*
 * mm_page - event class for page events carrying order and migratetype
 *
 * A NULL @page is recorded with the sentinel pfn -1UL and printed as a
 * NULL page pointer with pfn 0.  The percpu_refill value in the output is
 * not stored; it is derived at print time as (order == 0).
 */
DECLARE_EVENT_CLASS(mm_page,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=%lu order=%u migratetype=%d percpu_refill=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                __entry->order == 0)
);

/* mm_page_alloc_zone_locked event, defined against the mm_page event class. */
DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype)
);

/*
 * mm_page_pcpu_drain - page drain event carrying order and migratetype
 *
 * Stores the pfn, order and migratetype.  A NULL @page is recorded with
 * the sentinel pfn -1UL.  At print time the sentinel is turned into a NULL
 * page pointer and a pfn of 0, exactly as the mm_page class and
 * mm_page_alloc do, instead of being fed to pfn_to_page().
 */
TRACE_EVENT(mm_page_pcpu_drain,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=%lu order=%d migratetype=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order, __entry->migratetype)
);

/*
 * mm_page_alloc_extfrag - allocation satisfied from a fallback migratetype
 *
 * Stores the pfn, the requested and fallback orders and the requested and
 * fallback migratetypes.  change_ownership is computed when the event is
 * recorded: it is set when the pageblock migratetype of @page equals
 * @alloc_migratetype.  The pageblock_order and fragmenting values in the
 * output are not stored; they are evaluated at print time, fragmenting
 * being (fallback_order < pageblock_order).
 */
TRACE_EVENT(mm_page_alloc_extfrag,

        TP_PROTO(struct page *page,
                int alloc_order, int fallback_order,
                int alloc_migratetype, int fallback_migratetype),

        TP_ARGS(page,
                alloc_order, fallback_order,
                alloc_migratetype, fallback_migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                        )
                __field(        int,                alloc_order                )
                __field(        int,                fallback_order                )
                __field(        int,                alloc_migratetype        )
                __field(        int,                fallback_migratetype        )
                __field(        int,                change_ownership        )
        ),

        TP_fast_assign(
                __entry->pfn                        = page_to_pfn(page);
                __entry->alloc_order                = alloc_order;
                __entry->fallback_order                = fallback_order;
                __entry->alloc_migratetype        = alloc_migratetype;
                __entry->fallback_migratetype        = fallback_migratetype;
                __entry->change_ownership        = (alloc_migratetype ==
                                        get_pageblock_migratetype(page));
        ),

        TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
                pfn_to_page(__entry->pfn),
                __entry->pfn,
                __entry->alloc_order,
                __entry->fallback_order,
                pageblock_order,
                __entry->alloc_migratetype,
                __entry->fallback_migratetype,
                __entry->fallback_order < pageblock_order,
                __entry->change_ownership)
);

/*
 * Required for uniquely and securely identifying mm in rss_stat tracepoint.
 */
#ifndef __PTR_TO_HASHVAL
/*
 * Map @ptr to a 32-bit identifier via ptr_to_hashval().
 * Returns 0 whenever ptr_to_hashval() reports a non-zero status.
 */
static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
{
        unsigned long hashed;

        if (ptr_to_hashval(ptr, &hashed) != 0)
                return 0;

        /* The hashed value is only 32-bit */
        return (unsigned int)hashed;
}
#define __PTR_TO_HASHVAL
#endif

/*
 * rss_stat - trace event for a change in one of an mm's counters.
 *
 * Stores a hashed identifier for @mm (never the raw pointer), whether @mm is
 * the current task's mm, the counter index @member, and @count shifted left
 * by PAGE_SHIFT (printed with a "B" suffix).
 * NOTE(review): @count is presumably a page count - confirm against callers.
 */
TRACE_EVENT(rss_stat,

        TP_PROTO(struct mm_struct *mm,
                int member,
                long count),

        TP_ARGS(mm, member, count),

        TP_STRUCT__entry(
                __field(unsigned int, mm_id)
                __field(unsigned int, curr)
                __field(int, member)
                __field(long, size)
        ),

        TP_fast_assign(
                __entry->mm_id = mm_ptr_to_hash(mm);
                /* normalised to 0/1 */
                __entry->curr = !!(current->mm == mm);
                __entry->member = member;
                __entry->size = (count << PAGE_SHIFT);
        ),

        TP_printk("mm_id=%u curr=%d member=%d size=%ldB",
                __entry->mm_id,
                __entry->curr,
                __entry->member,
                __entry->size)
        );
#endif /* _TRACE_KMEM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
































































































































































































































































































































    1 

























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit.h -- Auditing support
 *
 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
 * All Rights Reserved.
 *
 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
 */
#ifndef _LINUX_AUDIT_H_
#define _LINUX_AUDIT_H_

#include <linux/sched.h>
#include <linux/ptrace.h>
#include <uapi/linux/audit.h>
#include <uapi/linux/netfilter/nf_tables.h>

#define AUDIT_INO_UNSET ((unsigned long)-1)
#define AUDIT_DEV_UNSET ((dev_t)-1)

/*
 * Fixed uid/pid header followed by a variable-length ctx[] payload
 * (flexible array member, so sizeof() excludes it).
 * NOTE(review): presumably identifies the sender of a signal - confirm in audit.c.
 */
struct audit_sig_info {
        uid_t                uid;
        pid_t                pid;
        char                ctx[];
};

struct audit_buffer;
struct audit_context;
struct inode;
struct netlink_skb_parms;
struct path;
struct linux_binprm;
struct mq_attr;
struct mqstat;
struct audit_watch;
struct audit_tree;
struct sk_buff;

/*
 * Kernel-side audit rule: flag words, a syscall bitmask, an array of
 * field_count audit_field entries, shortcut pointers into that array, and
 * the list linkage used to chain rules.
 */
struct audit_krule {
        u32                        pflags;
        u32                        flags;
        u32                        listnr;
        u32                        action;
        u32                        mask[AUDIT_BITMASK_SIZE];
        u32                        buflen; /* for data alloc on list rules */
        u32                        field_count;
        char                        *filterkey; /* ties events to rules */
        struct audit_field        *fields;
        struct audit_field        *arch_f; /* quick access to arch field */
        struct audit_field        *inode_f; /* quick access to an inode field */
        struct audit_watch        *watch;        /* associated watch */
        struct audit_tree        *tree;        /* associated watched tree */
        struct audit_fsnotify_mark        *exe;
        struct list_head        rlist;        /* entry in audit_{watch,tree}.rules list */
        struct list_head        list;        /* for AUDIT_LIST* purposes only */
        u64                        prio;
};

/* Flag to indicate legacy AUDIT_LOGINUID unset usage */
#define AUDIT_LOGINUID_LEGACY                0x1

/*
 * One comparison within a rule: a field type, an operator, and the value to
 * compare against. The value is an anonymous union, so only one of
 * val / uid / gid / {lsm_str, lsm_rule} is meaningful at a time.
 * NOTE(review): which member is live presumably depends on @type - confirm.
 */
struct audit_field {
        u32                                type;
        union {
                u32                        val;
                kuid_t                        uid;
                kgid_t                        gid;
                struct {
                        char                *lsm_str;
                        void                *lsm_rule;
                };
        };
        u32                                op;
};

/*
 * Index of each NTP parameter tracked by struct audit_ntp_data.
 * AUDIT_NTP_NVALS must stay last: it sizes audit_ntp_data.vals[].
 */
enum audit_ntp_type {
        AUDIT_NTP_OFFSET,
        AUDIT_NTP_FREQ,
        AUDIT_NTP_STATUS,
        AUDIT_NTP_TAI,
        AUDIT_NTP_TICK,
        AUDIT_NTP_ADJUST,

        AUDIT_NTP_NVALS /* count */
};

/*
 * With CONFIG_AUDITSYSCALL, audit_ntp_data carries one old/new value pair per
 * enum audit_ntp_type entry. Without it the struct is empty, so callers can
 * still declare and pass one.
 */
#ifdef CONFIG_AUDITSYSCALL
struct audit_ntp_val {
        long long oldval, newval;
};

struct audit_ntp_data {
        struct audit_ntp_val vals[AUDIT_NTP_NVALS];
};
#else
struct audit_ntp_data {};
#endif

/*
 * Operation codes accepted by audit_log_nfcfg().
 * NOTE(review): the XT_/NFT_ prefixes presumably distinguish x_tables from
 * nf_tables events - confirm against the callers.
 */
enum audit_nfcfgop {
        AUDIT_XT_OP_REGISTER,
        AUDIT_XT_OP_REPLACE,
        AUDIT_XT_OP_UNREGISTER,
        AUDIT_NFT_OP_TABLE_REGISTER,
        AUDIT_NFT_OP_TABLE_UNREGISTER,
        AUDIT_NFT_OP_CHAIN_REGISTER,
        AUDIT_NFT_OP_CHAIN_UNREGISTER,
        AUDIT_NFT_OP_RULE_REGISTER,
        AUDIT_NFT_OP_RULE_UNREGISTER,
        AUDIT_NFT_OP_SET_REGISTER,
        AUDIT_NFT_OP_SET_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_REGISTER,
        AUDIT_NFT_OP_SETELEM_UNREGISTER,
        AUDIT_NFT_OP_GEN_REGISTER,
        AUDIT_NFT_OP_OBJ_REGISTER,
        AUDIT_NFT_OP_OBJ_UNREGISTER,
        AUDIT_NFT_OP_OBJ_RESET,
        AUDIT_NFT_OP_FLOWTABLE_REGISTER,
        AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
        AUDIT_NFT_OP_INVALID,
};

extern int is_audit_feature_set(int which);

extern int __init audit_register_class(int class, unsigned *list);
extern int audit_classify_syscall(int abi, unsigned syscall);
extern int audit_classify_arch(int arch);
/* only for compat system calls */
extern unsigned compat_write_class[];
extern unsigned compat_read_class[];
extern unsigned compat_dir_class[];
extern unsigned compat_chattr_class[];
extern unsigned compat_signal_class[];

extern int audit_classify_compat_syscall(int abi, unsigned syscall);

/* audit_names->type values */
#define        AUDIT_TYPE_UNKNOWN        0        /* we don't know yet */
#define        AUDIT_TYPE_NORMAL        1        /* a "normal" audit record */
#define        AUDIT_TYPE_PARENT        2        /* a parent audit record */
#define        AUDIT_TYPE_CHILD_DELETE 3        /* a child being deleted */
#define        AUDIT_TYPE_CHILD_CREATE 4        /* a child being created */

/* maximized args number that audit_socketcall can process */
#define AUDITSC_ARGS                6

/* bit values for ->signal->audit_tty */
#define AUDIT_TTY_ENABLE        BIT(0)
#define AUDIT_TTY_LOG_PASSWD        BIT(1)

struct filename;

#define AUDIT_OFF        0
#define AUDIT_ON        1
#define AUDIT_LOCKED        2
#ifdef CONFIG_AUDIT
/* These are defined in audit.c */
                                /* Public API */
extern __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...);

extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type);
extern __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...);
extern void                    audit_log_end(struct audit_buffer *ab);
extern bool                    audit_string_contains_control(const char *string,
                                                          size_t len);
extern void                    audit_log_n_hex(struct audit_buffer *ab,
                                          const unsigned char *buf,
                                          size_t len);
extern void                    audit_log_n_string(struct audit_buffer *ab,
                                               const char *buf,
                                               size_t n);
extern void                    audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                        const char *string,
                                                        size_t n);
extern void                    audit_log_untrustedstring(struct audit_buffer *ab,
                                                      const char *string);
extern void                    audit_log_d_path(struct audit_buffer *ab,
                                             const char *prefix,
                                             const struct path *path);
extern void                    audit_log_key(struct audit_buffer *ab,
                                          char *key);
extern void                    audit_log_path_denied(int type,
                                                  const char *operation);
extern void                    audit_log_lost(const char *message);

extern int audit_log_task_context(struct audit_buffer *ab);
extern void audit_log_task_info(struct audit_buffer *ab);

extern int                    audit_update_lsm_rules(void);

                                /* Private API (for audit.c only) */
extern int audit_rule_change(int type, int seq, void *data, size_t datasz);
extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);

extern int audit_set_loginuid(kuid_t loginuid);

/* Read the login UID stored on @tsk. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        kuid_t loginuid = tsk->loginuid;

        return loginuid;
}

/* Read the audit session id stored on @tsk. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        unsigned int sid = tsk->sessionid;

        return sid;
}

extern u32 audit_enabled;

extern int audit_signal_info(int sig, struct task_struct *t);

#else /* CONFIG_AUDIT */
/*
 * !CONFIG_AUDIT: every logging entry point below is an empty inline stub so
 * callers compile unchanged. Functions with a return value yield a fixed
 * "nothing happened" result (NULL / 0 / INVALID_UID / AUDIT_SID_UNSET).
 */
static inline __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...)
{ }
/* No buffer is ever handed out when auditing is compiled out. */
static inline struct audit_buffer *audit_log_start(struct audit_context *ctx,
                                                   gfp_t gfp_mask, int type)
{
        return NULL;
}
static inline __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{ }
static inline void audit_log_end(struct audit_buffer *ab)
{ }
static inline void audit_log_n_hex(struct audit_buffer *ab,
                                   const unsigned char *buf, size_t len)
{ }
static inline void audit_log_n_string(struct audit_buffer *ab,
                                      const char *buf, size_t n)
{ }
static inline void  audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                const char *string, size_t n)
{ }
static inline void audit_log_untrustedstring(struct audit_buffer *ab,
                                             const char *string)
{ }
static inline void audit_log_d_path(struct audit_buffer *ab,
                                    const char *prefix,
                                    const struct path *path)
{ }
static inline void audit_log_key(struct audit_buffer *ab, char *key)
{ }
static inline void audit_log_path_denied(int type, const char *operation)
{ }
static inline int audit_log_task_context(struct audit_buffer *ab)
{
        return 0;
}
static inline void audit_log_task_info(struct audit_buffer *ab)
{ }

/* Without CONFIG_AUDIT no task has a login UID. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        return INVALID_UID;
}

/* Without CONFIG_AUDIT no task has an audit session. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        return AUDIT_SID_UNSET;
}

/* Constant here, so `if (audit_enabled)` tests fold away at compile time. */
#define audit_enabled AUDIT_OFF

static inline int audit_signal_info(int sig, struct task_struct *t)
{
        return 0;
}

#endif /* CONFIG_AUDIT */

#ifdef CONFIG_AUDIT_COMPAT_GENERIC
#define audit_is_compat(arch)  (!((arch) & __AUDIT_ARCH_64BIT))
#else
#define audit_is_compat(arch)  false
#endif

#define AUDIT_INODE_PARENT        1        /* dentry represents the parent */
#define AUDIT_INODE_HIDDEN        2        /* audit record should be hidden */
#define AUDIT_INODE_NOEVAL        4        /* audit record incomplete */

#ifdef CONFIG_AUDITSYSCALL
#include <asm/syscall.h> /* for syscall_get_arch() */

/* These are defined in auditsc.c */
                                /* Public API */
extern int  audit_alloc(struct task_struct *task);
extern void __audit_free(struct task_struct *task);
extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
                                  unsigned long a2, unsigned long a3);
extern void __audit_syscall_exit(int ret_success, long ret_value);
extern struct filename *__audit_reusename(const __user char *uptr);
extern void __audit_getname(struct filename *name);
extern void __audit_getcwd(void);
extern void __audit_inode(struct filename *name, const struct dentry *dentry,
                                unsigned int flags);
extern void __audit_file(const struct file *);
extern void __audit_inode_child(struct inode *parent,
                                const struct dentry *dentry,
                                const unsigned char type);
extern void audit_seccomp(unsigned long syscall, long signr, int code);
extern void audit_seccomp_actions_logged(const char *names,
                                         const char *old_names, int res);
extern void __audit_ptrace(struct task_struct *t);

/* Attach @ctx to @task as its audit context (plain store, no locking here). */
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{
        task->audit_context = ctx;
}

/* Return the audit context of the currently running task (may be NULL). */
static inline struct audit_context *audit_context(void)
{
        struct audit_context *ctx = current->audit_context;

        return ctx;
}

/*
 * Report whether the current task should be treated as having no usable audit
 * context: true when audit_context() is NULL, or when the int stored at the
 * very start of the context is non-zero.
 *
 * NOTE(review): the cast reads the context's first bytes as an int, which
 * assumes struct audit_context begins with an int "dummy" flag. The struct
 * layout is not visible in this header - confirm before reordering members.
 */
static inline bool audit_dummy_context(void)
{
        void *p = audit_context();
        return !p || *(int *)p;
}
/* Release @task's audit state; does nothing when it has no context. */
static inline void audit_free(struct task_struct *task)
{
        struct audit_context *ctx = task->audit_context;

        if (unlikely(ctx))
                __audit_free(task);
}
/*
 * Syscall-entry hook: passes the syscall number and first four arguments to
 * __audit_syscall_entry() when the current task has an audit context.
 */
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{
        struct audit_context *ctx = audit_context();

        if (unlikely(ctx))
                __audit_syscall_entry(major, a0, a1, a2, a3);
}
/*
 * Syscall-exit hook: when the current task has an audit context, extract the
 * success flag and return value from @pt_regs and report them.
 */
static inline void audit_syscall_exit(void *pt_regs)
{
        struct audit_context *ctx = audit_context();
        int ok;
        long rv;

        if (unlikely(ctx)) {
                ok = is_syscall_success(pt_regs);
                rv = regs_return_value(pt_regs);
                __audit_syscall_exit(ok, rv);
        }
}
/*
 * Ask __audit_reusename() for a filename matching user pointer @name.
 * Yields NULL when the current audit context is a dummy.
 */
static inline struct filename *audit_reusename(const __user char *name)
{
        return unlikely(!audit_dummy_context()) ? __audit_reusename(name) : NULL;
}
/* Hand @name to __audit_getname() unless the current context is a dummy. */
static inline void audit_getname(struct filename *name)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_getname(name);
}
/* Call __audit_getcwd() when the current task has an audit context. */
static inline void audit_getcwd(void)
{
        struct audit_context *ctx = audit_context();

        if (unlikely(ctx))
                __audit_getcwd();
}
/*
 * Forward @name, @dentry and the AUDIT_INODE_* bits in @aflags to
 * __audit_inode() unless the current context is a dummy.
 */
static inline void audit_inode(struct filename *name,
                                const struct dentry *dentry,
                                unsigned int aflags)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_inode(name, dentry, aflags);
}
/* Pass @file to __audit_file() unless the current context is a dummy. */
static inline void audit_file(struct file *file)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_file(file);
}
/*
 * Same as audit_inode(), with the flags fixed to
 * AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN.
 */
static inline void audit_inode_parent_hidden(struct filename *name,
                                                const struct dentry *dentry)
{
        const unsigned int aflags = AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN;

        if (unlikely(!audit_dummy_context()))
                __audit_inode(name, dentry, aflags);
}
/*
 * Forward @parent, @dentry and the AUDIT_TYPE_* value in @type to
 * __audit_inode_child() unless the current context is a dummy.
 */
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_inode_child(parent, dentry, type);
}
void audit_core_dumps(long signr);

/* Pass task @t to __audit_ptrace() unless the current context is a dummy. */
static inline void audit_ptrace(struct task_struct *t)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_ptrace(t);
}

                                /* Private API (for audit.c only) */
extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
extern void __audit_bprm(struct linux_binprm *bprm);
extern int __audit_socketcall(int nargs, unsigned long *args);
extern int __audit_sockaddr(int len, void *addr);
extern void __audit_fd_pair(int fd1, int fd2);
extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout);
extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                  const struct cred *new,
                                  const struct cred *old);
extern void __audit_log_capset(const struct cred *new, const struct cred *old);
extern void __audit_mmap_fd(int fd, int flags);
extern void __audit_log_kern_module(char *name);
extern void __audit_fanotify(unsigned int response);
extern void __audit_tk_injoffset(struct timespec64 offset);
extern void __audit_ntp_log(const struct audit_ntp_data *ad);
extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
                              enum audit_nfcfgop op, gfp_t gfp);

/* Pass @ipcp to __audit_ipc_obj() unless the current context is a dummy. */
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_ipc_obj(ipcp);
}
/* Pass the descriptor pair to __audit_fd_pair() unless the context is a dummy. */
static inline void audit_fd_pair(int fd1, int fd2)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_fd_pair(fd1, fd2);
}
/*
 * Pass the IPC permission values to __audit_ipc_set_perm() unless the
 * current context is a dummy.
 */
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_ipc_set_perm(qbytes, uid, gid, mode);
}
/* Pass @bprm to __audit_bprm() unless the current context is a dummy. */
static inline void audit_bprm(struct linux_binprm *bprm)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_bprm(bprm);
}
/*
 * Pass @nargs socketcall arguments to __audit_socketcall() and return its
 * result; returns 0 when the current context is a dummy.
 */
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        return unlikely(!audit_dummy_context()) ?
                __audit_socketcall(nargs, args) : 0;
}

/*
 * audit_socketcall_compat - socketcall hook for callers with 32-bit arguments
 * @nargs: number of entries in @args
 * @args:  the 32-bit argument values
 *
 * Widens each u32 argument to unsigned long in a local array and passes that
 * array to __audit_socketcall(). Returns 0 without touching @args when the
 * current audit context is a dummy, otherwise whatever __audit_socketcall()
 * returns.
 *
 * The local array has AUDITSC_ARGS slots, so the copy is capped at that many
 * entries: an oversized @nargs must not write past the end of the array.
 * @nargs itself is forwarded unchanged so the callee can still see and judge
 * the caller's original value.
 * NOTE(review): __audit_socketcall() is expected to reject an out-of-range
 * @nargs - confirm in auditsc.c.
 */
static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        unsigned long a[AUDITSC_ARGS];
        int i;

        if (audit_dummy_context())
                return 0;

        for (i = 0; i < nargs && i < AUDITSC_ARGS; i++)
                a[i] = (unsigned long)args[i];
        return __audit_socketcall(nargs, a);
}

/*
 * Pass a socket address of @len bytes to __audit_sockaddr() and return its
 * result; returns 0 when the current context is a dummy.
 */
static inline int audit_sockaddr(int len, void *addr)
{
        return unlikely(!audit_dummy_context()) ?
                __audit_sockaddr(len, addr) : 0;
}
/* Pass the mq_open parameters to __audit_mq_open() unless the context is a dummy. */
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_mq_open(oflag, mode, attr);
}
/*
 * Pass the message-queue send/receive parameters to __audit_mq_sendrecv()
 * unless the current context is a dummy.
 */
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout);
}
/* Pass the mq_notify parameters to __audit_mq_notify() unless the context is a dummy. */
static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_mq_notify(mqdes, notification);
}
/* Pass the queue attributes to __audit_mq_getsetattr() unless the context is a dummy. */
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_mq_getsetattr(mqdes, mqstat);
}

/*
 * Pass @bprm with the @new and @old credentials to __audit_log_bprm_fcaps()
 * and return its result; returns 0 when the current context is a dummy.
 */
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        return unlikely(!audit_dummy_context()) ?
                __audit_log_bprm_fcaps(bprm, new, old) : 0;
}

/* Pass the @new and @old credentials to __audit_log_capset() unless the context is a dummy. */
static inline void audit_log_capset(const struct cred *new,
                                   const struct cred *old)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_log_capset(new, old);
}

/* Pass @fd and @flags to __audit_mmap_fd() unless the context is a dummy. */
static inline void audit_mmap_fd(int fd, int flags)
{
        const bool live = !audit_dummy_context();

        if (unlikely(live))
                __audit_mmap_fd(fd, flags);
}

/* Pass module name @name to __audit_log_kern_module() unless the context is a dummy. */
static inline void audit_log_kern_module(char *name)
{
        if (audit_dummy_context())
                return;

        __audit_log_kern_module(name);
}

/* Pass @response to __audit_fanotify() unless the context is a dummy. */
static inline void audit_fanotify(unsigned int response)
{
        if (audit_dummy_context())
                return;

        __audit_fanotify(response);
}

/*
 * Pass an injected time offset to __audit_tk_injoffset(). An all-zero offset
 * is a no-op event and is dropped before the context is even consulted.
 */
static inline void audit_tk_injoffset(struct timespec64 offset)
{
        if ((offset.tv_sec != 0 || offset.tv_nsec != 0) &&
            !audit_dummy_context())
                __audit_tk_injoffset(offset);
}

/* Zero every old/new value pair in @ad. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{
        memset(ad, 0, sizeof(struct audit_ntp_data));
}

static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].oldval = val;
}

static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].newval = val;
}

/* Pass the collected NTP values to __audit_ntp_log() unless the context is a dummy. */
static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{
        if (audit_dummy_context())
                return;

        __audit_ntp_log(ad);
}

/*
 * Pass a netfilter configuration change to __audit_log_nfcfg().
 * Gated on the global audit_enabled flag, not on the per-task context.
 */
static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{
        if (!audit_enabled)
                return;

        __audit_log_nfcfg(name, af, nentries, op, gfp);
}

extern int audit_n_rules;
extern int audit_signals;
#else /* CONFIG_AUDITSYSCALL */
/*
 * !CONFIG_AUDITSYSCALL: syscall-auditing hooks are empty inline stubs so
 * callers compile unchanged. Functions with a return value yield a fixed
 * result (0 / NULL / true).
 */
static inline int audit_alloc(struct task_struct *task)
{
        return 0;
}
static inline void audit_free(struct task_struct *task)
{ }
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{ }
static inline void audit_syscall_exit(void *pt_regs)
{ }
/* Every context counts as a dummy when syscall auditing is compiled out. */
static inline bool audit_dummy_context(void)
{
        return true;
}
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{ }
/* There is never an audit context in this configuration. */
static inline struct audit_context *audit_context(void)
{
        return NULL;
}
static inline struct filename *audit_reusename(const __user char *name)
{
        return NULL;
}
static inline void audit_getname(struct filename *name)
{ }
static inline void audit_getcwd(void)
{ }
static inline void audit_inode(struct filename *name,
                                const struct dentry *dentry,
                                unsigned int aflags)
{ }
static inline void audit_file(struct file *file)
{
}
static inline void audit_inode_parent_hidden(struct filename *name,
                                const struct dentry *dentry)
{ }
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{ }
static inline void audit_core_dumps(long signr)
{ }
static inline void audit_seccomp(unsigned long syscall, long signr, int code)
{ }
static inline void audit_seccomp_actions_logged(const char *names,
                                                const char *old_names, int res)
{ }
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{ }
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
                                        gid_t gid, umode_t mode)
{ }
static inline void audit_bprm(struct linux_binprm *bprm)
{ }
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        return 0;
}

static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        return 0;
}

static inline void audit_fd_pair(int fd1, int fd2)
{ }
static inline int audit_sockaddr(int len, void *addr)
{
        return 0;
}
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{ }
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len,
                                     unsigned int msg_prio,
                                     const struct timespec64 *abs_timeout)
{ }
static inline void audit_mq_notify(mqd_t mqdes,
                                   const struct sigevent *notification)
{ }
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{ }
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        return 0;
}
static inline void audit_log_capset(const struct cred *new,
                                    const struct cred *old)
{ }
static inline void audit_mmap_fd(int fd, int flags)
{ }

static inline void audit_log_kern_module(char *name)
{
}

static inline void audit_fanotify(unsigned int response)
{ }

static inline void audit_tk_injoffset(struct timespec64 offset)
{ }

/* struct audit_ntp_data is empty in this configuration, so the NTP helpers do nothing. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{ }

static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{ }

static inline void audit_ptrace(struct task_struct *t)
{ }

static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{ }

#define audit_n_rules 0
#define audit_signals 0
#endif /* CONFIG_AUDITSYSCALL */

static inline bool audit_loginuid_set(struct task_struct *tsk)
{
        return uid_valid(audit_get_loginuid(tsk));
}

#endif












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



























































































    1 

    1 



    1 


    1 












































    1 




    1 
























































    1 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifndef __ASSEMBLY__
#ifndef __GENERATING_BOUNDS_H

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <asm/page.h>

/*
 * Free memory management - zoned buddy allocator.
 *
 * MAX_ORDER bounds the allocation orders tracked per zone: struct zone
 * holds free_area[MAX_ORDER] and for_each_migratetype_order() walks
 * orders 0 .. MAX_ORDER - 1.  It is 11 unless the kernel configuration
 * overrides it through CONFIG_FORCE_MAX_ZONEORDER.
 */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
/* 2^(MAX_ORDER - 1): the page count of a block of the largest order. */
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service.  That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

/*
 * Migrate types of a pageblock / free list.  The entries before
 * MIGRATE_PCPTYPES are the types that have per-cpu (pcp) lists;
 * MIGRATE_HIGHATOMIC deliberately shares the numeric value of
 * MIGRATE_PCPTYPES.  MIGRATE_CMA and MIGRATE_ISOLATE only exist when the
 * matching config option is enabled.  MIGRATE_TYPES is the number of
 * types and sizes free_area.free_list[].
 */
enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,        /* the number of types on the pcp lists */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        /*
         * MIGRATE_CMA migration type is designed to mimic the way
         * ZONE_MOVABLE works.  Only movable pages can be allocated
         * from MIGRATE_CMA pageblocks and page allocator never
         * implicitly change migration type of MIGRATE_CMA pageblock.
         *
         * The way to use it is to change migratetype of a range of
         * pageblocks to MIGRATE_CMA which can be done by
         * __free_pageblock_cma() function.  What is important though
         * is that a range of pageblocks must be aligned to
         * MAX_ORDER_NR_PAGES should biggest page be bigger than
         * a single pageblock.
         */
        MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES
};

/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];

/*
 * CMA tests.  With CONFIG_CMA, is_migrate_cma() compares a migratetype
 * value against MIGRATE_CMA (hinted as unlikely) and is_migrate_cma_page()
 * compares get_pageblock_migratetype() of the page against MIGRATE_CMA.
 * Without CONFIG_CMA both expand to the constant false and do not evaluate
 * their argument, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_CMA
#  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
#else
#  define is_migrate_cma(migratetype) false
#  define is_migrate_cma_page(_page) false
#endif

/*
 * Tell whether @mt is one of the migrate types counted as movable:
 * MIGRATE_MOVABLE itself, or any type accepted by is_migrate_cma().
 */
static inline bool is_migrate_movable(int mt)
{
        if (mt == MIGRATE_MOVABLE)
                return true;

        return is_migrate_cma(mt);
}

/*
 * Nested iteration over every (order, migratetype) pair: @order runs over
 * 0 .. MAX_ORDER - 1 and, for each order, @type runs over
 * 0 .. MIGRATE_TYPES - 1.  Both arguments must be assignable lvalues; the
 * statement that follows the macro becomes the body of the inner loop.
 */
#define for_each_migratetype_order(order, type) \
        for (order = 0; order < MAX_ORDER; order++) \
                for (type = 0; type < MIGRATE_TYPES; type++)

extern int page_group_by_mobility_disabled;

/* Mask with the low PB_migratetype_bits bits set. */
#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)

/*
 * Read the bits selected by MIGRATETYPE_MASK through
 * get_pfnblock_flags_mask(), passing @page and its pfn.
 * Note: @page is expanded twice, so pass an expression without side
 * effects.
 */
#define get_pageblock_migratetype(page)                                        \
        get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)

/*
 * Free lists for one allocation order; struct zone keeps an array of
 * MAX_ORDER of these (zone->free_area[]).
 */
struct free_area {
        /* One list head per migrate type, indexed by enum migratetype. */
        struct list_head        free_list[MIGRATE_TYPES];
        /* Presumably the number of free entries across all lists - confirm in mm/page_alloc.c. */
        unsigned long                nr_free;
};

/*
 * Return the first page queued on @area's free list for @migratetype
 * (linked through page->lru), or NULL when that list is empty.
 */
static inline struct page *get_page_from_free_area(struct free_area *area,
                                            int migratetype)
{
        struct list_head *queue = &area->free_list[migratetype];

        return list_first_entry_or_null(queue, struct page, lru);
}

/* True when @area has nothing queued on its @migratetype free list. */
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
        struct list_head *queue = &area->free_list[migratetype];

        return list_empty(queue);
}

struct pglist_data;

/*
 * zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
 * So add a wild amount of padding here to ensure that they fall into separate
 * cachelines.  There are very few zone structures in the machine, so space
 * consumption is not a concern here.
 */
#if defined(CONFIG_SMP)
/*
 * Zero-sized member that exists only for its alignment attribute: a
 * member of this type is aligned per
 * ____cacheline_internodealigned_in_smp, so the fields declared after
 * ZONE_PADDING() start on such a boundary.
 */
struct zone_padding {
        char x[0];
} ____cacheline_internodealigned_in_smp;
#define ZONE_PADDING(name)        struct zone_padding name;
#else
/* !CONFIG_SMP: ZONE_PADDING() expands to nothing. */
#define ZONE_PADDING(name)
#endif

#ifdef CONFIG_NUMA
/*
 * NUMA allocation counters.  NR_VM_NUMA_STAT_ITEMS is the number of
 * items; it sizes zone->vm_numa_stat[] and
 * per_cpu_pageset.vm_numa_stat_diff[].
 */
enum numa_stat_item {
        NUMA_HIT,                /* allocated in intended node */
        NUMA_MISS,                /* allocated in non intended node */
        NUMA_FOREIGN,                /* was intended here, hit elsewhere */
        NUMA_INTERLEAVE_HIT,        /* interleaver preferred this zone */
        NUMA_LOCAL,                /* allocation from local node */
        NUMA_OTHER,                /* allocation from other node */
        NR_VM_NUMA_STAT_ITEMS
};
#else
/* !CONFIG_NUMA: no items, so zone->vm_numa_stat[] has zero length. */
#define NR_VM_NUMA_STAT_ITEMS 0
#endif

/*
 * Indices of the per-zone counters.  NR_VM_ZONE_STAT_ITEMS is the number
 * of items; it sizes zone->vm_stat[] and per_cpu_pageset.vm_stat_diff[].
 * NR_ZONE_LRU_BASE aliases NR_ZONE_INACTIVE_ANON, the first of the
 * NR_ZONE_* LRU counters.
 */
enum zone_stat_item {
        /* First 128 byte cacheline (assuming 64 bit words) */
        NR_FREE_PAGES,
        NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
        NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
        NR_ZONE_ACTIVE_ANON,
        NR_ZONE_INACTIVE_FILE,
        NR_ZONE_ACTIVE_FILE,
        NR_ZONE_UNEVICTABLE,
        NR_ZONE_WRITE_PENDING,        /* Count of dirty, writeback and unstable pages */
        NR_MLOCK,                /* mlock()ed pages found and moved off LRU */
        NR_PAGETABLE,                /* used for pagetables */
        /* Second 128 byte cacheline */
        NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
        NR_ZSPAGES,                /* allocated in zsmalloc */
#endif
        NR_FREE_CMA_PAGES,
        NR_VM_ZONE_STAT_ITEMS };

/*
 * Indices of the per-node counters.  NR_VM_NODE_STAT_ITEMS is the number
 * of items and sizes per_cpu_nodestat.vm_node_stat_diff[].  Each *_BASE
 * entry aliases the first member of the group that follows it.
 */
enum node_stat_item {
        NR_LRU_BASE,
        NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
        NR_ACTIVE_ANON,                /*  "     "     "   "       "         */
        NR_INACTIVE_FILE,        /*  "     "     "   "       "         */
        NR_ACTIVE_FILE,                /*  "     "     "   "       "         */
        NR_UNEVICTABLE,                /*  "     "     "   "       "         */
        /* The two *_B slab items are the ones vmstat_item_in_bytes() reports as byte-based. */
        NR_SLAB_RECLAIMABLE_B,
        NR_SLAB_UNRECLAIMABLE_B,
        NR_ISOLATED_ANON,        /* Temporary isolated pages from anon lru */
        NR_ISOLATED_FILE,        /* Temporary isolated pages from file lru */
        WORKINGSET_NODES,
        WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_FILE,
        WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_FILE,
        WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_FILE,
        WORKINGSET_NODERECLAIM,
        NR_ANON_MAPPED,        /* Mapped anonymous pages */
        NR_FILE_MAPPED,        /* pagecache pages mapped into pagetables.
                           only modified from process context */
        NR_FILE_PAGES,
        NR_FILE_DIRTY,
        NR_WRITEBACK,
        NR_WRITEBACK_TEMP,        /* Writeback using temporary buffers */
        NR_SHMEM,                /* shmem pages (included tmpfs/GEM pages) */
        NR_SHMEM_THPS,
        NR_SHMEM_PMDMAPPED,
        NR_FILE_THPS,
        NR_FILE_PMDMAPPED,
        NR_ANON_THPS,
        NR_VMSCAN_WRITE,
        NR_VMSCAN_IMMEDIATE,        /* Prioritise for reclaim when writeback ends */
        NR_DIRTIED,                /* page dirtyings since bootup */
        NR_WRITTEN,                /* page writings since bootup */
        NR_KERNEL_MISC_RECLAIMABLE,        /* reclaimable non-slab kernel pages */
        NR_FOLL_PIN_ACQUIRED,        /* via: pin_user_page(), gup flag: FOLL_PIN */
        NR_FOLL_PIN_RELEASED,        /* pages returned via unpin_user_page() */
        NR_KERNEL_STACK_KB,        /* measured in KiB */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
        NR_KERNEL_SCS_KB,        /* measured in KiB */
#endif
        NR_VM_NODE_STAT_ITEMS
};

/*
 * Tell whether the vmstat item @idx is reported in bytes rather than in
 * pages, which is the unit of most vmstat values.  This describes the API
 * only; the internal representation may differ.
 */
static __always_inline bool vmstat_item_in_bytes(int idx)
{
        switch (idx) {
        /*
         * The global and per-node slab counters track slab pages, are
         * expected to change in multiples of PAGE_SIZE and are stored
         * internally in pages.
         *
         * The per-memcg and per-lruvec counters track the memory used by
         * individual slab objects and are byte-precise.
         */
        case NR_SLAB_RECLAIMABLE_B:
        case NR_SLAB_UNRECLAIMABLE_B:
                return true;
        default:
                return false;
        }
}

/*
 * We do arithmetic on the LRU lists in various places in the code,
 * so it is important to keep the active lists LRU_ACTIVE higher in
 * the array than the corresponding inactive lists, and to keep
 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
 *
 * This has to be kept in sync with the statistics in zone_stat_item
 * above and the descriptions in vmstat_text in mm/vmstat.c
 */
#define LRU_BASE 0
#define LRU_ACTIVE 1
#define LRU_FILE 2

/*
 * LRU list indices, composed from the LRU_BASE / LRU_ACTIVE / LRU_FILE
 * offsets: adding LRU_ACTIVE to an inactive list gives its active
 * counterpart and adding LRU_FILE to an anon list gives its file
 * counterpart.  LRU_UNEVICTABLE follows the four evictable lists and
 * NR_LRU_LISTS is the number of lists (it sizes lruvec.lists[]).
 */
enum lru_list {
        LRU_INACTIVE_ANON = LRU_BASE,
        LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
        LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
        LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
        LRU_UNEVICTABLE,
        NR_LRU_LISTS
};

/* Iterate @lru over every list index, 0 .. NR_LRU_LISTS - 1 (LRU_UNEVICTABLE included). */
#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)

/* Iterate @lru over 0 .. LRU_ACTIVE_FILE only, i.e. skipping LRU_UNEVICTABLE. */
#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)

/* True for the two file LRU lists (inactive and active), false otherwise. */
static inline bool is_file_lru(enum lru_list lru)
{
        switch (lru) {
        case LRU_INACTIVE_FILE:
        case LRU_ACTIVE_FILE:
                return true;
        default:
                return false;
        }
}

/* True for the two active LRU lists (anon and file), false otherwise. */
static inline bool is_active_lru(enum lru_list lru)
{
        if (lru == LRU_ACTIVE_ANON)
                return true;
        if (lru == LRU_ACTIVE_FILE)
                return true;

        return false;
}

#define ANON_AND_FILE 2

/*
 * State flags kept in lruvec->flags (presumably used as bit numbers with
 * the bitops helpers - confirm at the users).
 */
enum lruvec_flags {
        LRUVEC_CONGESTED,                /* lruvec has many dirty pages
                                         * backed by a congested BDI
                                         */
};

/*
 * One set of LRU lists together with the reclaim bookkeeping kept
 * alongside them.
 */
struct lruvec {
        /* One list head per enum lru_list value. */
        struct list_head                lists[NR_LRU_LISTS];
        /*
         * These track the cost of reclaiming one LRU - file or anon -
         * over the other. As the observed cost of reclaiming one LRU
         * increases, the reclaim scan balance tips toward the other.
         */
        unsigned long                        anon_cost;
        unsigned long                        file_cost;
        /* Non-resident age, driven by LRU movement */
        atomic_long_t                        nonresident_age;
        /* Refaults at the time of last reclaim cycle */
        unsigned long                        refaults[ANON_AND_FILE];
        /* Various lruvec state flags (enum lruvec_flags) */
        unsigned long                        flags;
#ifdef CONFIG_MEMCG
        /* Only with CONFIG_MEMCG; presumably the node this lruvec belongs to - confirm. */
        struct pglist_data *pgdat;
#endif
};

/* Isolate unmapped pages */
#define ISOLATE_UNMAPPED        ((__force isolate_mode_t)0x2)
/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE        ((__force isolate_mode_t)0x4)
/* Isolate unevictable pages */
#define ISOLATE_UNEVICTABLE        ((__force isolate_mode_t)0x8)

/* LRU Isolation modes. */
typedef unsigned __bitwise isolate_mode_t;

/*
 * Indices into zone->_watermark[]; NR_WMARK is the number of watermarks
 * and sizes that array.  The *_wmark_pages() macros are the intended
 * accessors (see the comment on zone->_watermark).
 */
enum zone_watermarks {
        WMARK_MIN,
        WMARK_LOW,
        WMARK_HIGH,
        NR_WMARK
};

/*
 * Watermark accessors: each yields the selected entry of z->_watermark[]
 * plus z->watermark_boost.
 *
 * @z: pointer to a struct zone (any pointer expression; it is
 *     parenthesised in the expansion so that arguments such as "z + 1" or
 *     "cond ? a : b" bind correctly).  Note that @z is evaluated twice, so
 *     it must not have side effects.
 * @i: for wmark_pages(), an enum zone_watermarks index.
 */
#define min_wmark_pages(z) ((z)->_watermark[WMARK_MIN] + (z)->watermark_boost)
#define low_wmark_pages(z) ((z)->_watermark[WMARK_LOW] + (z)->watermark_boost)
#define high_wmark_pages(z) ((z)->_watermark[WMARK_HIGH] + (z)->watermark_boost)
#define wmark_pages(z, i) ((z)->_watermark[i] + (z)->watermark_boost)

/*
 * Per-cpu page lists (embedded in struct per_cpu_pageset as ->pcp): one
 * list per pcp migrate type, plus the page count and the high/batch
 * limits that go with them.
 */
struct per_cpu_pages {
        int count;                /* number of pages in the list */
        int high;                /* high watermark, emptying needed */
        int batch;                /* chunk size for buddy add/remove */

        /* Lists of pages, one per migrate type stored on the pcp-lists */
        struct list_head lists[MIGRATE_PCPTYPES];
};

/*
 * Per-cpu state kept for a zone (zone->pageset is a __percpu pointer to
 * this type): the page lists and, depending on the configuration, small
 * per-cpu arrays indexed by the NUMA and zone stat items.
 */
struct per_cpu_pageset {
        struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
        s8 expire;
        /* One slot per enum numa_stat_item. */
        u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
        s8 stat_threshold;
        /* One slot per enum zone_stat_item; presumably pending per-cpu deltas - confirm in mm/vmstat.c. */
        s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};

/*
 * Node-level counterpart of the stat fields in struct per_cpu_pageset: a
 * threshold plus one s8 slot per enum node_stat_item (presumably pending
 * per-cpu deltas - confirm in mm/vmstat.c).
 */
struct per_cpu_nodestat {
        s8 stat_threshold;
        s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Zone types, in enum order; which of ZONE_DMA, ZONE_DMA32, ZONE_HIGHMEM
 * and ZONE_DEVICE exist depends on the configuration.  __MAX_NR_ZONES is
 * the number of zone types in this configuration.  This enum sits outside
 * the "#ifndef __GENERATING_BOUNDS_H" sections, so it is visible even when
 * __GENERATING_BOUNDS_H is defined.
 */
enum zone_type {
        /*
         * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
         * to DMA to all of the addressable memory (ZONE_NORMAL).
         * On architectures where this area covers the whole 32 bit address
         * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
         * DMA addressing constraints. This distinction is important as a 32bit
         * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
         * platforms may need both zones as they support peripherals with
         * different DMA addressing limitations.
         */
#ifdef CONFIG_ZONE_DMA
        ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
        ZONE_DMA32,
#endif
        /*
         * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
         * performed on pages in ZONE_NORMAL if the DMA devices support
         * transfers to all addressable memory.
         */
        ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
        /*
         * A memory area that is only addressable by the kernel through
         * mapping portions into its own address space. This is for example
         * used by i386 to allow the kernel to address the memory beyond
         * 900MB. The kernel will set up special mappings (page
         * table entries on i386) for each page that the kernel needs to
         * access.
         */
        ZONE_HIGHMEM,
#endif
        /*
         * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
         * movable pages with few exceptional cases described below. Main use
         * cases for ZONE_MOVABLE are to make memory offlining/unplug more
         * likely to succeed, and to locally limit unmovable allocations - e.g.,
         * to increase the number of THP/huge pages. Notable special cases are:
         *
         * 1. Pinned pages: (long-term) pinning of movable pages might
         *    essentially turn such pages unmovable. Memory offlining might
         *    retry a long time.
         * 2. memblock allocations: kernelcore/movablecore setups might create
         *    situations where ZONE_MOVABLE contains unmovable allocations
         *    after boot. Memory offlining and allocations fail early.
         * 3. Memory holes: kernelcore/movablecore setups might create very rare
         *    situations where ZONE_MOVABLE contains memory holes after boot,
         *    for example, if we have sections that are only partially
         *    populated. Memory offlining and allocations fail early.
         * 4. PG_hwpoison pages: while poisoned pages can be skipped during
         *    memory offlining, such pages cannot be allocated.
         * 5. Unmovable PG_offline pages: in paravirtualized environments,
         *    hotplugged memory blocks might only partially be managed by the
         *    buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
         *    parts not managed by the buddy are unmovable PG_offline pages. In
         *    some cases (virtio-mem), such pages can be skipped during
         *    memory offlining, however, cannot be moved/allocated. These
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
         * have to expect that migrating pages in ZONE_MOVABLE can fail (even
         * if has_unmovable_pages() states that there are no unmovable pages,
         * there can be false negatives).
         */
        ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
        ZONE_DEVICE,
#endif
        __MAX_NR_ZONES

};

#ifndef __GENERATING_BOUNDS_H

#define ASYNC_AND_SYNC 2

/*
 * Per-zone state.  ZONE_PADDING() splits the layout into four groups:
 * read-mostly fields, the page allocator's write-intensive fields
 * (free_area, flags, lock), the fields written by compaction and vmstats,
 * and finally the statistics arrays.
 */
struct zone {
        /* Read-mostly fields */

        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long _watermark[NR_WMARK];
        unsigned long watermark_boost;

        unsigned long nr_reserved_highatomic;

        /*
         * We don't know if the memory that we're going to allocate will be
         * freeable or/and it will be released eventually, so to avoid totally
         * wasting several GB of ram we must reserve some of the lower zone
         * memory (otherwise we risk to run OOM on the lower zones despite
         * there being tons of freeable ram on the higher zones).  This array is
         * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
         * changes.
         */
        long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NEED_MULTIPLE_NODES
        int node;
#endif
        struct pglist_data        *zone_pgdat;
        /* Per-cpu struct per_cpu_pageset instances for this zone. */
        struct per_cpu_pageset __percpu *pageset;

#ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long                *pageblock_flags;
#endif /* !CONFIG_SPARSEMEM */

        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long                zone_start_pfn;

        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *         spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *        present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         *        managed_pages = present_pages - reserved_pages;
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by page allocator and vm scanner to calculate all kinds of watermarks
         * and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path.  But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock.  It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * mem_hotplug_begin/end(). Any reader who can't tolerate drift of
         * present_pages should get_online_mems() to get a stable value.
         */
        atomic_long_t                managed_pages;
        unsigned long                spanned_pages;
        unsigned long                present_pages;

        const char                *name;

#ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblock. It is used to solve incorrect
         * freepage counting problem due to racy retrieving migratetype
         * of pageblock. Protected by zone->lock.
         */
        unsigned long                nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t                span_seqlock;
#endif

        /* Returned (as a bool) by zone_is_initialized(). */
        int initialized;

        /* Write-intensive fields used from the page allocator */
        ZONE_PADDING(_pad1_)

        /* free areas of different sizes */
        struct free_area        free_area[MAX_ORDER];

        /* zone flags, see below */
        unsigned long                flags;

        /* Primarily protects free_area */
        spinlock_t                lock;

        /* Write-intensive fields used by compaction and vmstats. */
        ZONE_PADDING(_pad2_)

        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* pfn where compaction free scanner should start */
        unsigned long                compact_cached_free_pfn;
        /* pfn where compaction migration scanner should start */
        unsigned long                compact_cached_migrate_pfn[ASYNC_AND_SYNC];
        unsigned long                compact_init_migrate_pfn;
        unsigned long                compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         * compact_order_failed is the minimum compaction failed order.
         */
        unsigned int                compact_considered;
        unsigned int                compact_defer_shift;
        int                        compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool                        compact_blockskip_flush;
#endif

        bool                        contiguous;

        ZONE_PADDING(_pad3_)
        /* Zone statistics, indexed by enum zone_stat_item and enum numa_stat_item */
        atomic_long_t                vm_stat[NR_VM_ZONE_STAT_ITEMS];
        atomic_long_t                vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;

/*
 * Node-level state flags.  NOTE(review): presumably bit numbers in a flags
 * word of struct pglist_data - that field is not visible in this part of
 * the file, confirm there.
 */
enum pgdat_flags {
        PGDAT_DIRTY,                        /* reclaim scanning has recently found
                                         * many dirty file pages at the tail
                                         * of the LRU.
                                         */
        PGDAT_WRITEBACK,                /* reclaim scanning has recently found
                                         * many pages under writeback
                                         */
        PGDAT_RECLAIM_LOCKED,                /* prevents concurrent reclaim */
};

/*
 * State flags kept in zone->flags (presumably used as bit numbers with the
 * bitops helpers - confirm at the users).
 */
enum zone_flags {
        ZONE_BOOSTED_WATERMARK,                /* zone recently boosted watermarks.
                                         * Cleared when kswapd is woken.
                                         */
};

/* Atomically read zone->managed_pages and return it as an unsigned long. */
static inline unsigned long zone_managed_pages(struct zone *zone)
{
        long managed = atomic_long_read(&zone->managed_pages);

        return (unsigned long)managed;
}

/*
 * First pfn past the zone's span: zone_start_pfn plus spanned_pages
 * (holes included).
 */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
        unsigned long end_pfn = zone->zone_start_pfn;

        end_pfn += zone->spanned_pages;
        return end_pfn;
}

/* True when @pfn lies within [zone_start_pfn, zone_end_pfn(zone)). */
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
        if (pfn < zone->zone_start_pfn)
                return false;

        return pfn < zone_end_pfn(zone);
}

/* True when zone->initialized is non-zero. */
static inline bool zone_is_initialized(struct zone *zone)
{
        return zone->initialized != 0;
}

/* True when the zone spans no pages at all (spanned_pages is zero). */
static inline bool zone_is_empty(struct zone *zone)
{
        return !zone->spanned_pages;
}

/*
 * Report whether the pfn range [start_pfn, start_pfn + nr_pages) and the
 * span of the given zone have at least one pfn in common.  An empty zone
 * never intersects anything.
 */
static inline bool zone_intersects(struct zone *zone,
                unsigned long start_pfn, unsigned long nr_pages)
{
        unsigned long range_end = start_pfn + nr_pages;

        if (zone_is_empty(zone))
                return false;

        /* Overlap needs the range to begin before the zone ends and to end after it begins. */
        return start_pfn < zone_end_pfn(zone) &&
               range_end > zone->zone_start_pfn;
}

/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12

/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

/*
 * Indices into pglist_data.node_zonelists[]; MAX_ZONELISTS is the number
 * of zonelists per node (1 without CONFIG_NUMA, 2 with it).
 */
enum {
        ZONELIST_FALLBACK,        /* zonelist with fallback */
#ifdef CONFIG_NUMA
        /*
         * The NUMA zonelists are doubled because we need zonelists that
         * restrict the allocations to a single node for __GFP_THISNODE.
         */
        ZONELIST_NOFALLBACK,        /* zonelist without fallback (__GFP_THISNODE) */
#endif
        MAX_ZONELISTS
};

/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */
struct zoneref {
        struct zone *zone;        /* Pointer to actual zone */
        int zone_idx;                /* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()        - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()        - Return the index of the zone for an entry
 * zonelist_node_idx()        - Return the index of the node for an entry
 */
struct zonelist {
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

#ifndef CONFIG_DISCONTIGMEM
/* The array of struct pages - for discontigmem use pgdat->lmem_map */
extern struct page *mem_map;
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * A queue with its length and a spinlock (presumably guarding both -
 * confirm at the users).  struct pglist_data embeds one as
 * deferred_split_queue.
 */
struct deferred_split {
        spinlock_t split_queue_lock;
        struct list_head split_queue;
        unsigned long split_queue_len;
};
#endif

/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
typedef struct pglist_data {
        /*
         * node_zones contains just the zones for THIS node. Not all of the
         * zones may be populated, but it is the full list. It is referenced by
         * this node's node_zonelists as well as other node's node_zonelists.
         */
        struct zone node_zones[MAX_NR_ZONES];

        /*
         * node_zonelists contains references to all zones in all nodes.
         * Generally the first zones will be references to this node's
         * node_zones.
         */
        struct zonelist node_zonelists[MAX_ZONELISTS];

        int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP        /* means !SPARSEMEM */
        /* Base of this node's struct page array; indexed by pgdat_page_nr() */
        struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
        /* NOTE(review): presumably the page_ext array matching node_mem_map - confirm */
        struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
        /*
         * Must be held any time you expect node_start_pfn,
         * node_present_pages, node_spanned_pages or nr_zones to stay constant.
         * Also synchronizes pgdat->first_deferred_pfn during deferred page
         * init.
         *
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
         * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
         *
         * Nests above zone->lock and zone->span_seqlock
         */
        spinlock_t node_size_lock;
#endif
        /* Start of the node's PFN range; pgdat_end_pfn() adds node_spanned_pages */
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages */
        unsigned long node_spanned_pages; /* total size of physical page
                                             range, including holes */
        /* NOTE(review): presumably this node's id as used with NODE_DATA() - confirm */
        int node_id;
        /* Wait queues; names suggest kswapd and pfmemalloc waiters - confirm users */
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;
        struct task_struct *kswapd;        /* Protected by
                                           mem_hotplug_begin/end() */
        /* NOTE(review): look like the order/zone index handed to kswapd - confirm */
        int kswapd_order;
        enum zone_type kswapd_highest_zoneidx;

        int kswapd_failures;                /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
        /* kcompactd counterparts of the kswapd fields above (by naming; confirm) */
        int kcompactd_max_order;
        enum zone_type kcompactd_highest_zoneidx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
#endif
        /*
         * This is a per-node reserve of pages that are not available
         * to userspace allocations.
         */
        unsigned long                totalreserve_pages;

#ifdef CONFIG_NUMA
        /*
         * node reclaim becomes active if more unmapped pages exist.
         */
        unsigned long                min_unmapped_pages;
        unsigned long                min_slab_pages;
#endif /* CONFIG_NUMA */

        /* Write-intensive fields used by page reclaim */
        ZONE_PADDING(_pad1_)
        spinlock_t                lru_lock;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
         * If memory initialisation on large machines is deferred then this
         * is the first PFN that needs to be initialised.
         */
        unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Per-node instance of struct deferred_split */
        struct deferred_split deferred_split_queue;
#endif

        /* Fields commonly accessed by the page reclaim scanner */

        /*
         * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
         *
         * Use mem_cgroup_lruvec() to look up lruvecs.
         *
         * Without CONFIG_MEMCG, lruvec_pgdat() recovers the pgdat from a
         * pointer to this member via container_of().
         */
        struct lruvec                __lruvec;

        /* NOTE(review): bit meanings are not defined in this part of the file */
        unsigned long                flags;

        ZONE_PADDING(_pad2_)

        /* Per-node vmstats */
        struct per_cpu_nodestat __percpu *per_cpu_nodestats;
        /* One atomic counter per node stat item */
        atomic_long_t                vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;

/* Per-node page counts, read through NODE_DATA(nid). */
#define node_present_pages(nid)        (NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)        (NODE_DATA(nid)->node_spanned_pages)
/*
 * pgdat_page_nr - struct page of the pagenr'th page of a node.
 * With CONFIG_FLAT_NODE_MEM_MAP it indexes node_mem_map directly; otherwise it
 * converts (node_start_pfn + pagenr) with pfn_to_page().
 */
#ifdef CONFIG_FLAT_NODE_MEM_MAP
#define pgdat_page_nr(pgdat, pagenr)        ((pgdat)->node_mem_map + (pagenr))
#else
#define pgdat_page_nr(pgdat, pagenr)        pfn_to_page((pgdat)->node_start_pfn + (pagenr))
#endif
/* Same lookup as pgdat_page_nr(), keyed by node id instead of pgdat pointer. */
#define nid_page_nr(nid, pagenr)         pgdat_page_nr(NODE_DATA(nid),(pagenr))

/* First PFN of node nid, and the PFN just past its spanned range. */
#define node_start_pfn(nid)        (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))

/* Return node_start_pfn + node_spanned_pages, i.e. the PFN just past the node's span. */
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
        unsigned long first_pfn = pgdat->node_start_pfn;
        unsigned long span = pgdat->node_spanned_pages;

        return first_pfn + span;
}

/* A node counts as empty only when both its start PFN and its spanned page count are zero. */
static inline bool pgdat_is_empty(pg_data_t *pgdat)
{
        if (pgdat->node_start_pfn)
                return false;

        return pgdat->node_spanned_pages == 0;
}

#include <linux/memory_hotplug.h>

/*
 * Zonelist construction, kswapd wakeup and watermark checks.  Only the
 * prototypes live here; the definitions are outside this header.
 */
void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
                   enum zone_type highest_zoneidx);
/*
 * Three watermark predicates.  The __ variant additionally takes alloc_flags
 * and an explicit free_pages count; the _safe variant takes neither.
 */
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         int highest_zoneidx, unsigned int alloc_flags,
                         long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx,
                unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx);
/*
 * Memory initialization context, used to differentiate memory added by
 * the platform statically from memory added via the memory hotplug interface.
 */
enum meminit_context {
        MEMINIT_EARLY,        /* presumably the static, platform-provided case - confirm */
        MEMINIT_HOTPLUG,        /* presumably the memory hotplug case - confirm */
};

/* Zone initialiser taking a start PFN and a size; defined outside this header. */
extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
                                     unsigned long size);

/* Initialiser for a struct lruvec; defined outside this header. */
extern void lruvec_init(struct lruvec *lruvec);

/*
 * Map a lruvec back to its node.  With CONFIG_MEMCG the lruvec carries a
 * pgdat pointer; otherwise the lruvec is the __lruvec member embedded in
 * struct pglist_data and the node is recovered with container_of().
 */
static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
        struct pglist_data *node;

#ifdef CONFIG_MEMCG
        node = lruvec->pgdat;
#else
        node = container_of(lruvec, struct pglist_data, __lruvec);
#endif
        return node;
}

/* Size query for one LRU list of a lruvec, qualified by zone_idx; defined outside this header. */
extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);

/*
 * local_memory_node - map a node id to the node id to use for memory.
 * @node_id: node id to translate
 *
 * With CONFIG_HAVE_MEMORYLESS_NODES the translation is implemented outside
 * this header.  Otherwise it is the identity mapping: @node_id is returned
 * unchanged.
 */
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
static inline int local_memory_node(int node_id) { return node_id; }
#endif

/*
 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
 *
 * The index is the pointer difference between the zone and the start of its
 * owning pgdat's node_zones[] array.  Note that the argument is evaluated
 * twice.
 */
#define zone_idx(zone)                ((zone) - (zone)->zone_pgdat->node_zones)

/*
 * managed_zone - does the buddy allocator manage any pages in this zone?
 *
 * Reclaim decisions must be based on this helper instead of
 * populated_zone(): when an entire zone is reserved it is possible to have
 * populated_zone() && !managed_zone().
 */
static inline bool managed_zone(struct zone *zone)
{
        return zone_managed_pages(zone) != 0;
}

/* True when the zone's present_pages count is non-zero, i.e. the zone has memory. */
static inline bool populated_zone(struct zone *zone)
{
        return zone->present_pages != 0;
}

#ifdef CONFIG_NEED_MULTIPLE_NODES
/* Multi-node builds: the node id is kept in, and read from, zone->node. */
static inline int zone_to_nid(struct zone *zone)
{
        int nid = zone->node;

        return nid;
}

static inline void zone_set_nid(struct zone *zone, int nid)
{
        zone->node = nid;
}
#else
/* Single-node builds: every zone reports node 0 and the setter does nothing. */
static inline int zone_to_nid(struct zone *zone)
{
        return 0;
}

static inline void zone_set_nid(struct zone *zone, int nid)
{
}
#endif

/* Compared against ZONE_HIGHMEM by zone_movable_is_highmem(); defined outside this header. */
extern int movable_zone;

#ifdef CONFIG_HIGHMEM
/*
 * Report whether ZONE_MOVABLE is backed by highmem.  Multi-node builds
 * compare the runtime movable_zone value; otherwise the zone just below
 * ZONE_MOVABLE is compared against ZONE_HIGHMEM.
 */
static inline int zone_movable_is_highmem(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
        return ZONE_HIGHMEM == movable_zone;
#else
        return ZONE_HIGHMEM == (ZONE_MOVABLE - 1);
#endif
}
#endif

/*
 * Return 1 when @idx names a highmem zone: either ZONE_HIGHMEM itself, or
 * ZONE_MOVABLE while zone_movable_is_highmem() holds.  Always 0 without
 * CONFIG_HIGHMEM.
 */
static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
        if (idx == ZONE_HIGHMEM)
                return 1;

        return idx == ZONE_MOVABLE && zone_movable_is_highmem();
#else
        return 0;
#endif
}

/*
 * has_managed_dma - implemented outside this header when CONFIG_ZONE_DMA is
 * set; without it the answer is a constant false.
 */
#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void);
#else
static inline bool has_managed_dma(void) { return false; }
#endif

/**
 * is_highmem - quickly check whether a struct zone is a highmem zone
 * @zone: pointer to struct zone variable
 *
 * This is an attempt to keep references to ZONE_{DMA/NORMAL/HIGHMEM/etc} in
 * general code to a minimum.  The result is is_highmem_idx() of the zone's
 * index, and always 0 when CONFIG_HIGHMEM is not set.
 */
static inline int is_highmem(struct zone *zone)
{
        int highmem = 0;

#ifdef CONFIG_HIGHMEM
        highmem = is_highmem_idx(zone_idx(zone));
#endif
        return highmem;
}

/*
 * Handler prototypes taking a struct ctl_table, plus related variables; all
 * are defined outside this header.  The original note here read: "These two
 * functions are used to setup the per zone pages min values", which predates
 * the longer list below.
 */
struct ctl_table;

int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
                size_t *, loff_t *);
/* One ratio per zone type */
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
                size_t *, loff_t *);
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
int numa_zonelist_order_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
extern int percpu_pagelist_fraction;
/* NOTE(review): presumably sized by NUMA_ZONELIST_ORDER_LEN below - confirm at the definition */
extern char numa_zonelist_order[];
#define NUMA_ZONELIST_ORDER_LEN        16

/*
 * Without CONFIG_NEED_MULTIPLE_NODES there is a single node descriptor,
 * contig_page_data: NODE_DATA() ignores its argument and NODE_MEM_MAP() is
 * the global mem_map.  Otherwise <asm/mmzone.h> is included and is expected to
 * supply the definitions.
 */
#ifndef CONFIG_NEED_MULTIPLE_NODES

extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)                (&contig_page_data)
#define NODE_MEM_MAP(nid)        mem_map

#else /* CONFIG_NEED_MULTIPLE_NODES */

#include <asm/mmzone.h>

#endif /* !CONFIG_NEED_MULTIPLE_NODES */

/*
 * Iteration primitives used by for_each_online_pgdat(), for_each_zone() and
 * for_each_populated_zone(); those loops stop when NULL is returned.
 * Defined outside this header.
 */
extern struct pglist_data *first_online_pgdat(void);
extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
extern struct zone *next_zone(struct zone *zone);

/**
 * for_each_online_pgdat - helper macro to iterate over all online nodes
 * @pgdat: pointer to a pg_data_t variable
 *
 * Starts at first_online_pgdat() and advances with next_online_pgdat() until
 * that returns NULL.
 */
#define for_each_online_pgdat(pgdat)                        \
        for (pgdat = first_online_pgdat();                \
             pgdat;                                        \
             pgdat = next_online_pgdat(pgdat))
/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone: pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in.
 *
 * Iteration begins at the first zone of first_online_pgdat() (which is
 * dereferenced without a NULL check) and advances with next_zone() until that
 * returns NULL.
 */
#define for_each_zone(zone)                                \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))

/*
 * for_each_populated_zone - like for_each_zone(), but the loop body is only
 * executed for zones where populated_zone() is true.  The trailing if/else
 * makes the caller's statement bind as the else branch, so unpopulated zones
 * fall into the empty statement and are skipped.
 */
#define for_each_populated_zone(zone)                        \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))                        \
                if (!populated_zone(zone))                \
                        ; /* do nothing */                \
                else

/* Accessor: the zone a zoneref entry points at. */
static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
        struct zone *target = zoneref->zone;

        return target;
}

/* Accessor: the zone index stored in a zoneref entry. */
static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
        int idx = zoneref->zone_idx;

        return idx;
}

/* Node id of the zone referenced by a zoneref entry, obtained via zone_to_nid(). */
static inline int zonelist_node_idx(struct zoneref *zoneref)
{
        struct zone *target = zoneref->zone;

        return zone_to_nid(target);
}

/*
 * Out-of-line fallback for next_zones_zonelist(), called when the inline fast
 * path there does not apply.  Defined outside this header.
 */
struct zoneref *__next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes);

/**
 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
 * @z: The cursor used as a starting point for the search
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * Returns the next zone at or below the given zone index that is within the
 * allowed nodemask, searching from the cursor.  The returned zoneref is itself
 * a cursor for the zone being examined; advance it by one before calling
 * next_zones_zonelist again.
 */
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        /*
         * Anything but the common case (no nodemask filter and the cursor
         * already at or below the limit) goes to the out-of-line search.
         */
        if (!likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
                return __next_zones_zonelist(z, highest_zoneidx, nodes);

        return z;
}

/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist: The zonelist to search for a suitable zone
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * Returns the zoneref of the first zone at or below the given zone index that
 * is within the allowed nodemask.  The zoneref is a cursor: advance it by one
 * and pass it to next_zones_zonelist to continue iterating.
 *
 * The returned zoneref is never NULL; when no eligible zone exists its ->zone
 * is NULL instead.  That can happen genuinely, or because the nodemask was
 * updated concurrently by a cpuset modification.
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        struct zoneref *cursor = zonelist->_zonerefs;

        return next_zones_zonelist(cursor, highest_zoneidx, nodes);
}

/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask.  The loop ends when the cursor's zone is NULL.
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
        for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))

/*
 * for_next_zone_zonelist_nodemask - resume iterating a zonelist from a cursor
 * @zone: The current zone in the iterator
 * @z: An already-positioned zoneref cursor to continue from
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * The first zone visited is the one @z currently refers to; subsequent zones
 * come from next_zones_zonelist().  The zone is read through zonelist_zone()
 * rather than by dereferencing @z in the macro body, so the argument is not
 * subject to operator-precedence surprises and all zoneref iterators share
 * one accessor.
 */
#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
        for (zone = zonelist_zone(z);        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))


/**
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 *
 * This iterator iterates through all zones at or below a given zone index.
 * It is for_each_zone_zonelist_nodemask() with a NULL nodemask.
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
        for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)

#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif

/* With CONFIG_FLATMEM every pfn maps to node 0; the argument is not evaluated. */
#ifdef CONFIG_FLATMEM
#define pfn_to_nid(pfn)                (0)
#endif

#ifdef CONFIG_SPARSEMEM

/*
 * SECTION_SHIFT                    #bits space required to store a section #
 *
 * PA_SECTION_SHIFT                physical address to/from section number
 * PFN_SECTION_SHIFT                pfn to/from section number
 */
#define PA_SECTION_SHIFT        (SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT        (SECTION_SIZE_BITS - PAGE_SHIFT)

/* Total number of sections the section number space can address */
#define NR_MEM_SECTIONS                (1UL << SECTIONS_SHIFT)

/* Pages in one section, and the mask that clears the within-section pfn bits */
#define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK        (~(PAGES_PER_SECTION-1))

/* NR_PAGEBLOCK_BITS for each of the section's 2^(PFN_SECTION_SHIFT - pageblock_order) pageblocks */
#define SECTION_BLOCKFLAGS_BITS \
        ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)

/* Build-time check: a maximum-order allocation must fit inside one section */
#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif

/* Section number containing @pfn: the pfn shifted right by PFN_SECTION_SHIFT. */
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
        unsigned long section_nr = pfn >> PFN_SECTION_SHIFT;

        return section_nr;
}
/* First pfn of section @sec: the section number shifted left by PFN_SECTION_SHIFT. */
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
        unsigned long first_pfn = sec << PFN_SECTION_SHIFT;

        return first_pfn;
}

/* Round a pfn up / down to a section boundary */
#define SECTION_ALIGN_UP(pfn)        (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_DOWN(pfn)        ((pfn) & PAGE_SECTION_MASK)

/* A subsection covers 2^21 bytes (2 MiB) of address space */
#define SUBSECTION_SHIFT 21
#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)

/* Subsection geometry expressed in pages rather than bytes */
#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))

/* Build-time check: a subsection may not be larger than a section */
#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
#error Subsection size exceeds section size
#else
#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
#endif

/* Round a pfn up / down to a subsection boundary */
#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)

struct mem_section_usage {
        struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
        /* See declaration of similar field in struct zone */
        unsigned long pageblock_flags[0];
};

/*
 * Defined outside this header when CONFIG_SPARSEMEM is set; the
 * !CONFIG_SPARSEMEM branch further down replaces it with an empty macro.
 */
void subsection_map_init(unsigned long pfn, unsigned long nr_pages);

/* Forward declarations: struct mem_section only needs these types by pointer or name */
struct page;
struct page_ext;
struct mem_section {
        /*
         * This is, logically, a pointer to an array of struct
         * pages.  However, it is stored with some other magic.
         * (see sparse.c::sparse_init_one_section())
         *
         * Additionally during early boot we encode node id of
         * the location of the section here to guide allocation.
         * (see sparse.c::memory_present())
         *
         * Making it a UL at least makes someone do a cast
         * before using it wrong.
         *
         * The low bits carry the SECTION_* flags; __section_mem_map_addr()
         * masks them off with SECTION_MAP_MASK.
         */
        unsigned long section_mem_map;

        /* Usage metadata: pageblock flags and, with VMEMMAP, the subsection map */
        struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
        /*
         * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
         * section. (see page_ext.h about this.)
         */
        struct page_ext *page_ext;
        /* NOTE(review): presumably keeps the size a power of 2 (see WARNING below) - confirm */
        unsigned long pad;
#endif
        /*
         * WARNING: mem_section must be a power-of-2 in size for the
         * calculation and use of SECTION_ROOT_MASK to make sense.
         */
};

/*
 * The section table is two-level: a root index selects a row, and the low
 * bits of the section number select the entry within it.  With
 * CONFIG_SPARSEMEM_EXTREME a row holds one page worth of struct mem_section;
 * otherwise each row holds exactly one.
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT       (PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT        1
#endif

#define SECTION_NR_TO_ROOT(sec)        ((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS        DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK        (SECTIONS_PER_ROOT - 1)

/*
 * EXTREME: an array of row pointers, both levels NULL-checked by
 * __nr_to_section().  Otherwise: a plain two-dimensional array.
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif

static inline unsigned long *section_to_usemap(struct mem_section *ms)
{
        return ms->usage->pageblock_flags;
}

/*
 * Look up the struct mem_section for section number @nr.
 *
 * Returns NULL when the root index is out of range, or (with
 * CONFIG_SPARSEMEM_EXTREME) when the table or the selected root row is not
 * populated.
 */
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
        const unsigned long root_idx = SECTION_NR_TO_ROOT(nr);
        const unsigned long slot = nr & SECTION_ROOT_MASK;

        if (unlikely(root_idx >= NR_SECTION_ROOTS))
                return NULL;

#ifdef CONFIG_SPARSEMEM_EXTREME
        if (!mem_section)
                return NULL;
        if (!mem_section[root_idx])
                return NULL;
#endif
        return &mem_section[root_idx][slot];
}
/* NOTE(review): presumably the inverse of __nr_to_section() - confirm at the definition */
extern unsigned long __section_nr(struct mem_section *ms);
/* NOTE(review): presumably the allocation size of a struct mem_section_usage including its trailing array - confirm */
extern size_t mem_section_usage_size(void);

/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information.  The pointer is calculated
 * as mem_map - section_nr_to_pfn(pnum).  The result is
 * aligned to the minimum alignment of the two values:
 *   1. All mem_map arrays are page-aligned.
 *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
 *      lowest bits.  PFN_SECTION_SHIFT is arch-specific
 *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
 *      worst combination is powerpc with 256k pages,
 *      which results in PFN_SECTION_SHIFT equal 6.
 * To sum it up, at least 6 bits are available.
 *
 * SECTION_NID_SHIFT positions the node id that is encoded in
 * section_mem_map during early boot.  It must lie above every flag bit,
 * otherwise the low bits of the node id alias the flags (with a shift of 3,
 * bit 0 of the node id landed on SECTION_IS_EARLY).  It is therefore set to
 * 6, the number of low bits guaranteed to be available for flags.
 */
#define SECTION_MARKED_PRESENT        (1UL<<0)
#define SECTION_HAS_MEM_MAP        (1UL<<1)
#define SECTION_IS_ONLINE        (1UL<<2)
#define SECTION_IS_EARLY        (1UL<<3)
#define SECTION_MAP_LAST_BIT        (1UL<<4)
#define SECTION_MAP_MASK        (~(SECTION_MAP_LAST_BIT-1))
#define SECTION_NID_SHIFT        6

/* Strip the SECTION_* flag bits from section_mem_map and return the rest as a struct page pointer. */
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
        return (struct page *)(section->section_mem_map & SECTION_MAP_MASK);
}

/* 1 if @section is non-NULL and has SECTION_MARKED_PRESENT set, otherwise 0. */
static inline int present_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_MARKED_PRESENT) != 0;
}

/* present_section() for the section with number @nr. */
static inline int present_section_nr(unsigned long nr)
{
        struct mem_section *section = __nr_to_section(nr);

        return present_section(section);
}

/* 1 if @section is non-NULL and has SECTION_HAS_MEM_MAP set, otherwise 0. */
static inline int valid_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_HAS_MEM_MAP) != 0;
}

/* 1 if @section is non-NULL and has SECTION_IS_EARLY set, otherwise 0. */
static inline int early_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_IS_EARLY) != 0;
}

/* valid_section() for the section with number @nr. */
static inline int valid_section_nr(unsigned long nr)
{
        struct mem_section *section = __nr_to_section(nr);

        return valid_section(section);
}

/* 1 if @section is non-NULL and has SECTION_IS_ONLINE set, otherwise 0. */
static inline int online_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_IS_ONLINE) != 0;
}

/* online_section() for the section with number @nr. */
static inline int online_section_nr(unsigned long nr)
{
        struct mem_section *section = __nr_to_section(nr);

        return online_section(section);
}

/*
 * Section online/offline operations over a pfn range, declared only for
 * hotplug-capable builds (offlining additionally needs
 * CONFIG_MEMORY_HOTREMOVE).  Defined outside this header.
 */
#ifdef CONFIG_MEMORY_HOTPLUG
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#ifdef CONFIG_MEMORY_HOTREMOVE
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#endif
#endif

/* Section descriptor covering @pfn; may be NULL, as for __nr_to_section(). */
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
        unsigned long section_nr = pfn_to_section_nr(pfn);

        return __nr_to_section(section_nr);
}

/* Upper bound (inclusive) used by next_present_section_nr() when scanning for present sections. */
extern unsigned long __highest_present_section_nr;

/* Index of the subsection holding @pfn, counted from the start of its section. */
static inline int subsection_map_index(unsigned long pfn)
{
        unsigned long pfn_in_section = pfn & ~(PAGE_SECTION_MASK);

        return pfn_in_section / PAGES_PER_SUBSECTION;
}

/*
 * pfn_section_valid - is @pfn's subsection marked in the section's map?
 *
 * With CONFIG_SPARSEMEM_VMEMMAP the subsection bit for @pfn is tested in the
 * usage structure, whose pointer is loaded once with READ_ONCE().  Without
 * it, every pfn of the section is reported valid.
 */
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        int bit = subsection_map_index(pfn);
        struct mem_section_usage *usage = READ_ONCE(ms->usage);

        return test_bit(bit, usage->subsection_map);
}
#else
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        return 1;
}
#endif

#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/*
 * pfn_valid - generic sparsemem check, used unless the architecture provides
 * its own (CONFIG_HAVE_ARCH_PFN_VALID).
 *
 * Returns 0 for a pfn beyond the section number space or in a section without
 * a mem_map.  Otherwise returns non-zero if the section is an early section
 * or the pfn's subsection is marked valid.  The section state is examined
 * inside an rcu_read_lock()/rcu_read_unlock() pair.
 */
static inline int pfn_valid(unsigned long pfn)
{
        struct mem_section *section;
        int valid = 0;

        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;

        section = __pfn_to_section(pfn);

        rcu_read_lock();
        if (valid_section(section)) {
                /*
                 * Traditionally early sections always returned pfn_valid()
                 * for the entire section-sized span.
                 */
                valid = early_section(section) ||
                        pfn_section_valid(section, pfn);
        }
        rcu_read_unlock();

        return valid;
}
#endif

/* Non-zero when @pfn falls inside the section number space and its section is marked present. */
static inline int pfn_in_present_section(unsigned long pfn)
{
        struct mem_section *section;

        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;

        section = __pfn_to_section(pfn);
        return present_section(section);
}

/*
 * Return the number of the first present section strictly after @section_nr,
 * searching up to and including __highest_present_section_nr.  When there is
 * none, -1 is returned (i.e. the largest unsigned long value).
 */
static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
        for (section_nr++; section_nr <= __highest_present_section_nr;
             section_nr++) {
                if (present_section_nr(section_nr))
                        return section_nr;
        }

        return -1;
}

/*
 * These are _only_ used during initialisation, therefore they
 * can use __initdata ...  They could have names to indicate
 * this restriction.
 *
 * With CONFIG_NUMA the node id is read from the pfn's struct page; the
 * statement expression copies the argument into a local so it is evaluated
 * only once.  Without CONFIG_NUMA the result is always node 0.
 */
#ifdef CONFIG_NUMA
#define pfn_to_nid(pfn)                                                        \
({                                                                        \
        unsigned long __pfn_to_nid_pfn = (pfn);                                \
        page_to_nid(pfn_to_page(__pfn_to_nid_pfn));                        \
})
#else
#define pfn_to_nid(pfn)                (0)
#endif

/* SPARSEMEM initialisation entry point; defined outside this header. */
void sparse_init(void);
#else
/*
 * !CONFIG_SPARSEMEM: the sparse hooks compile away to empty statements, and
 * pfn_in_present_section() is simply an alias for pfn_valid().
 */
#define sparse_init()        do {} while (0)
#define sparse_index_init(_sec, _nid)  do {} while (0)
#define pfn_in_present_section pfn_valid
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */

/*
 * During memory init memblocks map pfns to nids. The search is expensive and
 * this caches recent lookups. The implementation of __early_pfn_to_nid
 * may treat start/end as pfns or sections.
 */
struct mminit_pfnnid_cache {
        unsigned long last_start;        /* start of the most recently matched range */
        unsigned long last_end;                /* end of that range - NOTE(review): inclusive or exclusive is not shown here */
        int last_nid;                        /* node id that the cached range resolved to */
};

/*
 * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
 * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
 * pfn_valid_within() should be used in this case; we optimise this away
 * when we have no holes within a MAX_ORDER_NR_PAGES block.
 *
 * Note that without CONFIG_HOLES_IN_ZONE the argument is not evaluated at all.
 */
#ifdef CONFIG_HOLES_IN_ZONE
#define pfn_valid_within(pfn) pfn_valid(pfn)
#else
#define pfn_valid_within(pfn) (1)
#endif

#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */































































































































































































































    1 





































































































































































































































































































































































    1 























    1 


    1 

























































    1 










































































































    1 























































    1 








































    1 

















    1 












    1 









    1 















    1 
























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H

#include <linux/mem_encrypt.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>

/*
 * Macro to mark a page protection value as UC-
 *
 * The UC- cache-mode bits are OR-ed in only when boot_cpu_data.x86 > 3;
 * otherwise prot is returned unchanged.
 */
#define pgprot_noncached(prot)                                                \
        ((boot_cpu_data.x86 > 3)                                        \
         ? (__pgprot(pgprot_val(prot) |                                        \
                     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))        \
         : (prot))

/*
 * Macros to add or remove encryption attribute
 */
#define pgprot_encrypted(prot)        __pgprot(__sme_set(pgprot_val(prot)))
#define pgprot_decrypted(prot)        __pgprot(__sme_clr(pgprot_val(prot)))

#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/api.h>
#include <asm-generic/pgtable_uffd.h>

extern pgd_t early_top_pgt[PTRS_PER_PGD];
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);

void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
                                   bool user);
void ptdump_walk_pgd_level_checkwx(void);
void ptdump_walk_user_pgd_level_checkwx(void);

/*
 * With CONFIG_DEBUG_WX the debug_checkwx*() helpers invoke the ptdump
 * W+X walkers declared above; otherwise they expand to empty statements.
 */
#ifdef CONFIG_DEBUG_WX
#define debug_checkwx()                ptdump_walk_pgd_level_checkwx()
#define debug_checkwx_user()        ptdump_walk_user_pgd_level_checkwx()
#else
#define debug_checkwx()                do { } while (0)
#define debug_checkwx_user()        do { } while (0)
#endif

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc.
 *
 * @vaddr is evaluated and discarded; the macro always yields the page
 * backing empty_zero_page.
 */
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
        __visible;
#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))

extern spinlock_t pgd_lock;
extern struct list_head pgd_list;

extern struct mm_struct *pgd_page_get_mm(struct page *page);

extern pmdval_t early_pmd_flags;

/*
 * With CONFIG_PARAVIRT_XXL the page-table accessors come from
 * <asm/paravirt.h>.  Otherwise each accessor below maps directly onto
 * its native_*() counterpart.
 */
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else  /* !CONFIG_PARAVIRT_XXL */
#define set_pte(ptep, pte)                native_set_pte(ptep, pte)

#define set_pte_atomic(ptep, pte)                                        \
        native_set_pte_atomic(ptep, pte)

#define set_pmd(pmdp, pmd)                native_set_pmd(pmdp, pmd)

#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd)                native_set_pgd(pgdp, pgd)
/* Only clears the entry when pgtable_l5_enabled() is true. */
#define pgd_clear(pgd)                        (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
#endif

/* set_p4d/set_pud may already have been defined before this point. */
#ifndef set_p4d
# define set_p4d(p4dp, p4d)                native_set_p4d(p4dp, p4d)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define p4d_clear(p4d)                        native_p4d_clear(p4d)
#endif

#ifndef set_pud
# define set_pud(pudp, pud)                native_set_pud(pudp, pud)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_clear(pud)                        native_pud_clear(pud)
#endif

#define pte_clear(mm, addr, ptep)        native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd)                        native_pmd_clear(pmd)

/* Value extractors and constructors for each page-table level. */
#define pgd_val(x)        native_pgd_val(x)
#define __pgd(x)        native_make_pgd(x)

#ifndef __PAGETABLE_P4D_FOLDED
#define p4d_val(x)        native_p4d_val(x)
#define __p4d(x)        native_make_p4d(x)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_val(x)        native_pud_val(x)
#define __pud(x)        native_make_pud(x)
#endif

#ifndef __PAGETABLE_PMD_FOLDED
#define pmd_val(x)        native_pmd_val(x)
#define __pmd(x)        native_make_pmd(x)
#endif

#define pte_val(x)        native_pte_val(x)
#define __pte(x)        native_make_pte(x)

/* No work to do at the end of a context switch in the native case. */
#define arch_end_context_switch(prev)        do {} while(0)
#endif        /* CONFIG_PARAVIRT_XXL */

/*
 * The following only work if pte_present() is true.
 * Undefined behaviour if not.
 */
/* Non-zero when _PAGE_DIRTY is set in the flags of @pte. */
static inline int pte_dirty(pte_t pte)
{
        int dirty = pte_flags(pte) & _PAGE_DIRTY;

        return dirty;
}


/*
 * Read the PKRU value via rdpkru().  Returns 0 when the boot CPU does
 * not have X86_FEATURE_OSPKE.
 */
static inline u32 read_pkru(void)
{
        if (!boot_cpu_has(X86_FEATURE_OSPKE))
                return 0;

        return rdpkru();
}

/*
 * Write @pkru to the CPU and mirror it into the current task's xsave
 * area.  Does nothing when the boot CPU lacks X86_FEATURE_OSPKE.
 */
static inline void write_pkru(u32 pkru)
{
        struct pkru_state *pk;

        if (!boot_cpu_has(X86_FEATURE_OSPKE))
                return;

        /* May be NULL, in which case only the CPU register is updated. */
        pk = get_xsave_addr(&current->thread.fpu.state.xsave, XFEATURE_PKRU);

        /*
         * The PKRU value in xstate needs to be in sync with the value that is
         * written to the CPU. The FPU restore on return to userland would
         * otherwise load the previous value again.
         */
        fpregs_lock();
        if (pk)
                pk->pkru = pkru;
        __write_pkru(pkru);
        fpregs_unlock();
}

/* Non-zero when _PAGE_ACCESSED is set in the flags of @pte. */
static inline int pte_young(pte_t pte)
{
        int young = pte_flags(pte) & _PAGE_ACCESSED;

        return young;
}

/* Non-zero when _PAGE_DIRTY is set in the flags of @pmd. */
static inline int pmd_dirty(pmd_t pmd)
{
        int dirty = pmd_flags(pmd) & _PAGE_DIRTY;

        return dirty;
}

/* Non-zero when _PAGE_ACCESSED is set in the flags of @pmd. */
static inline int pmd_young(pmd_t pmd)
{
        int young = pmd_flags(pmd) & _PAGE_ACCESSED;

        return young;
}

/* Non-zero when _PAGE_DIRTY is set in the flags of @pud. */
static inline int pud_dirty(pud_t pud)
{
        int dirty = pud_flags(pud) & _PAGE_DIRTY;

        return dirty;
}

/* Non-zero when _PAGE_ACCESSED is set in the flags of @pud. */
static inline int pud_young(pud_t pud)
{
        int young = pud_flags(pud) & _PAGE_ACCESSED;

        return young;
}

/* Non-zero when _PAGE_RW is set in the flags of @pte. */
static inline int pte_write(pte_t pte)
{
        int writable = pte_flags(pte) & _PAGE_RW;

        return writable;
}

/* Non-zero when _PAGE_PSE is set in the flags of @pte. */
static inline int pte_huge(pte_t pte)
{
        int huge = pte_flags(pte) & _PAGE_PSE;

        return huge;
}

/* Non-zero when _PAGE_GLOBAL is set in the flags of @pte. */
static inline int pte_global(pte_t pte)
{
        int global = pte_flags(pte) & _PAGE_GLOBAL;

        return global;
}

/* Returns 1 when _PAGE_NX is clear in the flags of @pte, 0 otherwise. */
static inline int pte_exec(pte_t pte)
{
        return (pte_flags(pte) & _PAGE_NX) == 0;
}

/* Non-zero when _PAGE_SPECIAL is set in the flags of @pte. */
static inline int pte_special(pte_t pte)
{
        int special = pte_flags(pte) & _PAGE_SPECIAL;

        return special;
}

/* Entries that were set to PROT_NONE are inverted */

static inline u64 protnone_mask(u64 val);

/*
 * Extract the page frame number from @pte.  The raw value is first
 * XOR'ed with protnone_mask() of itself, then masked with PTE_PFN_MASK.
 */
static inline unsigned long pte_pfn(pte_t pte)
{
        phys_addr_t addr = pte_val(pte);

        addr ^= protnone_mask(addr);
        addr &= PTE_PFN_MASK;
        return addr >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from @pmd.  The raw value is first
 * XOR'ed with protnone_mask() of itself, then masked with
 * pmd_pfn_mask().
 */
static inline unsigned long pmd_pfn(pmd_t pmd)
{
        phys_addr_t addr = pmd_val(pmd);

        addr ^= protnone_mask(addr);
        addr &= pmd_pfn_mask(pmd);
        return addr >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from @pud.  The raw value is first
 * XOR'ed with protnone_mask() of itself, then masked with
 * pud_pfn_mask().
 */
static inline unsigned long pud_pfn(pud_t pud)
{
        phys_addr_t addr = pud_val(pud);

        addr ^= protnone_mask(addr);
        addr &= pud_pfn_mask(pud);
        return addr >> PAGE_SHIFT;
}

/* Page frame number of @p4d: value masked by p4d_pfn_mask(), shifted. */
static inline unsigned long p4d_pfn(p4d_t p4d)
{
        unsigned long pfn = (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;

        return pfn;
}

/* Page frame number of @pgd: value masked by PTE_PFN_MASK, shifted. */
static inline unsigned long pgd_pfn(pgd_t pgd)
{
        unsigned long pfn = (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;

        return pfn;
}

/* p4d_leaf() is an alias for p4d_large(), which always reports 0. */
#define p4d_leaf        p4d_large
static inline int p4d_large(p4d_t p4d)
{
        /* No 512 GiB pages yet */
        return 0;
}

#define pte_page(pte)        pfn_to_page(pte_pfn(pte))

/*
 * pmd_leaf() is an alias for pmd_large().
 *
 * Non-zero when _PAGE_PSE is set in the flags of @pmd.
 */
#define pmd_leaf        pmd_large
static inline int pmd_large(pmd_t pmd)
{
        return pmd_flags(pmd) & _PAGE_PSE;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * True when @pmd has _PAGE_PSE set and _PAGE_DEVMAP clear.
 *
 * NOTE: when testing for a huge page, also consider pmd_devmap(), or
 * use pmd_large().
 */
static inline int pmd_trans_huge(pmd_t pmd)
{
        pmdval_t bits = pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP);

        return bits == _PAGE_PSE;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* True when @pud has _PAGE_PSE set and _PAGE_DEVMAP clear. */
static inline int pud_trans_huge(pud_t pud)
{
        int huge = (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;

        return huge;
}
#endif

/* Reports whether the boot CPU has X86_FEATURE_PSE. */
#define has_transparent_hugepage has_transparent_hugepage
static inline int has_transparent_hugepage(void)
{
        int has_pse = boot_cpu_has(X86_FEATURE_PSE);

        return has_pse;
}

#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
/* Returns 1 when _PAGE_DEVMAP is set in @pmd, 0 otherwise. */
static inline int pmd_devmap(pmd_t pmd)
{
        return (pmd_val(pmd) & _PAGE_DEVMAP) != 0;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* Returns 1 when _PAGE_DEVMAP is set in @pud, 0 otherwise. */
static inline int pud_devmap(pud_t pud)
{
        return (pud_val(pud) & _PAGE_DEVMAP) != 0;
}
#else
/* Without PUD-level transparent huge pages this always reports 0. */
static inline int pud_devmap(pud_t pud)
{
        return 0;
}
#endif

/* Always reports 0 for a PGD entry. */
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/* Return a copy of @pte with the bits in @set OR'ed into its value. */
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
        pteval_t bits = native_pte_val(pte);

        bits |= set;
        return native_make_pte(bits);
}

/* Return a copy of @pte with the bits in @clear masked out of its value. */
static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
{
        pteval_t bits = native_pte_val(pte);

        bits &= ~clear;
        return native_make_pte(bits);
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Non-zero when _PAGE_UFFD_WP is set in the flags of @pte. */
static inline int pte_uffd_wp(pte_t pte)
{
        int wp = pte_flags(pte) & _PAGE_UFFD_WP;

        return wp;
}

/* Return @pte with _PAGE_UFFD_WP set. */
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
        pte_t marked = pte_set_flags(pte, _PAGE_UFFD_WP);

        return marked;
}

/* Return @pte with _PAGE_UFFD_WP cleared. */
static inline pte_t pte_clear_uffd_wp(pte_t pte)
{
        pte_t cleared = pte_clear_flags(pte, _PAGE_UFFD_WP);

        return cleared;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pte with _PAGE_DIRTY cleared. */
static inline pte_t pte_mkclean(pte_t pte)
{
        pte_t clean = pte_clear_flags(pte, _PAGE_DIRTY);

        return clean;
}

/* Return @pte with _PAGE_ACCESSED cleared. */
static inline pte_t pte_mkold(pte_t pte)
{
        pte_t old = pte_clear_flags(pte, _PAGE_ACCESSED);

        return old;
}

/* Return @pte with _PAGE_RW cleared. */
static inline pte_t pte_wrprotect(pte_t pte)
{
        pte_t readonly = pte_clear_flags(pte, _PAGE_RW);

        return readonly;
}

/* Return @pte with _PAGE_NX cleared. */
static inline pte_t pte_mkexec(pte_t pte)
{
        pte_t exec = pte_clear_flags(pte, _PAGE_NX);

        return exec;
}

/* Return @pte with both _PAGE_DIRTY and _PAGE_SOFT_DIRTY set. */
static inline pte_t pte_mkdirty(pte_t pte)
{
        pte_t dirty = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);

        return dirty;
}

/* Return @pte with _PAGE_ACCESSED set. */
static inline pte_t pte_mkyoung(pte_t pte)
{
        pte_t young = pte_set_flags(pte, _PAGE_ACCESSED);

        return young;
}

/* Return @pte with _PAGE_RW set. */
static inline pte_t pte_mkwrite(pte_t pte)
{
        pte_t writable = pte_set_flags(pte, _PAGE_RW);

        return writable;
}

/* Return @pte with _PAGE_PSE set. */
static inline pte_t pte_mkhuge(pte_t pte)
{
        pte_t huge = pte_set_flags(pte, _PAGE_PSE);

        return huge;
}

/* Return @pte with _PAGE_PSE cleared. */
static inline pte_t pte_clrhuge(pte_t pte)
{
        pte_t small = pte_clear_flags(pte, _PAGE_PSE);

        return small;
}

/* Return @pte with _PAGE_GLOBAL set. */
static inline pte_t pte_mkglobal(pte_t pte)
{
        pte_t global = pte_set_flags(pte, _PAGE_GLOBAL);

        return global;
}

/* Return @pte with _PAGE_GLOBAL cleared. */
static inline pte_t pte_clrglobal(pte_t pte)
{
        pte_t local = pte_clear_flags(pte, _PAGE_GLOBAL);

        return local;
}

/* Return @pte with _PAGE_SPECIAL set. */
static inline pte_t pte_mkspecial(pte_t pte)
{
        pte_t special = pte_set_flags(pte, _PAGE_SPECIAL);

        return special;
}

/* Return @pte with both _PAGE_SPECIAL and _PAGE_DEVMAP set. */
static inline pte_t pte_mkdevmap(pte_t pte)
{
        pte_t devmap = pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);

        return devmap;
}

/* Return a copy of @pmd with the bits in @set OR'ed into its value. */
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
{
        pmdval_t bits = native_pmd_val(pmd);

        bits |= set;
        return native_make_pmd(bits);
}

/* Return a copy of @pmd with the bits in @clear masked out of its value. */
static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
{
        pmdval_t bits = native_pmd_val(pmd);

        bits &= ~clear;
        return native_make_pmd(bits);
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Non-zero when _PAGE_UFFD_WP is set in the flags of @pmd. */
static inline int pmd_uffd_wp(pmd_t pmd)
{
        int wp = pmd_flags(pmd) & _PAGE_UFFD_WP;

        return wp;
}

/* Return @pmd with _PAGE_UFFD_WP set. */
static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
{
        pmd_t marked = pmd_set_flags(pmd, _PAGE_UFFD_WP);

        return marked;
}

/* Return @pmd with _PAGE_UFFD_WP cleared. */
static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
{
        pmd_t cleared = pmd_clear_flags(pmd, _PAGE_UFFD_WP);

        return cleared;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pmd with _PAGE_ACCESSED cleared. */
static inline pmd_t pmd_mkold(pmd_t pmd)
{
        pmd_t old = pmd_clear_flags(pmd, _PAGE_ACCESSED);

        return old;
}

/* Return @pmd with _PAGE_DIRTY cleared. */
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
        pmd_t clean = pmd_clear_flags(pmd, _PAGE_DIRTY);

        return clean;
}

/* Return @pmd with _PAGE_RW cleared. */
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
        pmd_t readonly = pmd_clear_flags(pmd, _PAGE_RW);

        return readonly;
}

/* Return @pmd with both _PAGE_DIRTY and _PAGE_SOFT_DIRTY set. */
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
        pmd_t dirty = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);

        return dirty;
}

/* Return @pmd with _PAGE_DEVMAP set. */
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
        pmd_t devmap = pmd_set_flags(pmd, _PAGE_DEVMAP);

        return devmap;
}

/* Return @pmd with _PAGE_PSE set. */
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
        pmd_t huge = pmd_set_flags(pmd, _PAGE_PSE);

        return huge;
}

/* Return @pmd with _PAGE_ACCESSED set. */
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
        pmd_t young = pmd_set_flags(pmd, _PAGE_ACCESSED);

        return young;
}

/* Return @pmd with _PAGE_RW set. */
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
        pmd_t writable = pmd_set_flags(pmd, _PAGE_RW);

        return writable;
}

/* Return a copy of @pud with the bits in @set OR'ed into its value. */
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
        pudval_t bits = native_pud_val(pud);

        bits |= set;
        return native_make_pud(bits);
}

/* Return a copy of @pud with the bits in @clear masked out of its value. */
static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
{
        pudval_t bits = native_pud_val(pud);

        bits &= ~clear;
        return native_make_pud(bits);
}

/* Return @pud with _PAGE_ACCESSED cleared. */
static inline pud_t pud_mkold(pud_t pud)
{
        pud_t old = pud_clear_flags(pud, _PAGE_ACCESSED);

        return old;
}

/* Return @pud with _PAGE_DIRTY cleared. */
static inline pud_t pud_mkclean(pud_t pud)
{
        pud_t clean = pud_clear_flags(pud, _PAGE_DIRTY);

        return clean;
}

/* Return @pud with _PAGE_RW cleared. */
static inline pud_t pud_wrprotect(pud_t pud)
{
        pud_t readonly = pud_clear_flags(pud, _PAGE_RW);

        return readonly;
}

/* Return @pud with both _PAGE_DIRTY and _PAGE_SOFT_DIRTY set. */
static inline pud_t pud_mkdirty(pud_t pud)
{
        pud_t dirty = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);

        return dirty;
}

/* Return @pud with _PAGE_DEVMAP set. */
static inline pud_t pud_mkdevmap(pud_t pud)
{
        pud_t devmap = pud_set_flags(pud, _PAGE_DEVMAP);

        return devmap;
}

/* Return @pud with _PAGE_PSE set. */
static inline pud_t pud_mkhuge(pud_t pud)
{
        pud_t huge = pud_set_flags(pud, _PAGE_PSE);

        return huge;
}

/* Return @pud with _PAGE_ACCESSED set. */
static inline pud_t pud_mkyoung(pud_t pud)
{
        pud_t young = pud_set_flags(pud, _PAGE_ACCESSED);

        return young;
}

/* Return @pud with _PAGE_RW set. */
static inline pud_t pud_mkwrite(pud_t pud)
{
        pud_t writable = pud_set_flags(pud, _PAGE_RW);

        return writable;
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Non-zero when _PAGE_SOFT_DIRTY is set in the flags of @pte. */
static inline int pte_soft_dirty(pte_t pte)
{
        int soft_dirty = pte_flags(pte) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Non-zero when _PAGE_SOFT_DIRTY is set in the flags of @pmd. */
static inline int pmd_soft_dirty(pmd_t pmd)
{
        int soft_dirty = pmd_flags(pmd) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Non-zero when _PAGE_SOFT_DIRTY is set in the flags of @pud. */
static inline int pud_soft_dirty(pud_t pud)
{
        int soft_dirty = pud_flags(pud) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Return @pte with _PAGE_SOFT_DIRTY set. */
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        pte_t marked = pte_set_flags(pte, _PAGE_SOFT_DIRTY);

        return marked;
}

/* Return @pmd with _PAGE_SOFT_DIRTY set. */
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        pmd_t marked = pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);

        return marked;
}

/* Return @pud with _PAGE_SOFT_DIRTY set. */
static inline pud_t pud_mksoft_dirty(pud_t pud)
{
        pud_t marked = pud_set_flags(pud, _PAGE_SOFT_DIRTY);

        return marked;
}

/* Return @pte with _PAGE_SOFT_DIRTY cleared. */
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        pte_t cleared = pte_clear_flags(pte, _PAGE_SOFT_DIRTY);

        return cleared;
}

/* Return @pmd with _PAGE_SOFT_DIRTY cleared. */
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        pmd_t cleared = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);

        return cleared;
}

/* Return @pud with _PAGE_SOFT_DIRTY cleared. */
static inline pud_t pud_clear_soft_dirty(pud_t pud)
{
        pud_t cleared = pud_clear_flags(pud, _PAGE_SOFT_DIRTY);

        return cleared;
}

#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */

/*
 * Filter a pgprot against __supported_pte_mask.
 *
 * Only values with _PAGE_PRESENT set are masked.  Non-present pgprots
 * can use the unsupported bits for other purposes, so they are returned
 * untouched.
 */
static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
{
        pgprotval_t bits = pgprot_val(pgprot);

        if (!(bits & _PAGE_PRESENT))
                return bits;

        return bits & __supported_pte_mask;
}

/*
 * Return massage_pgprot(@pgprot).  With CONFIG_DEBUG_VM, warn once if
 * massaging changed the value, i.e. the caller passed bits that are
 * not in __supported_pte_mask.
 */
static inline pgprotval_t check_pgprot(pgprot_t pgprot)
{
        pgprotval_t massaged_val = massage_pgprot(pgprot);

        /* mmdebug.h can not be included here because of dependencies */
#ifdef CONFIG_DEBUG_VM
        /* The "bits" field is the XOR of the requested and massaged values. */
        WARN_ONCE(pgprot_val(pgprot) != massaged_val,
                  "attempted to set unsupported pgprot: %016llx "
                  "bits: %016llx supported: %016llx\n",
                  (u64)pgprot_val(pgprot),
                  (u64)pgprot_val(pgprot) ^ massaged_val,
                  (u64)__supported_pte_mask);
#endif

        return massaged_val;
}

/*
 * Build a PTE from page frame number @page_nr and protection @pgprot.
 * The address is XOR'ed with protnone_mask() of the protection value
 * and limited to PTE_PFN_MASK before the checked protection bits are
 * OR'ed in.
 */
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t addr = (phys_addr_t)page_nr << PAGE_SHIFT;

        addr = (addr ^ protnone_mask(pgprot_val(pgprot))) & PTE_PFN_MASK;
        return __pte(addr | check_pgprot(pgprot));
}

/*
 * Build a PMD from page frame number @page_nr and protection @pgprot.
 * The address is XOR'ed with protnone_mask() of the protection value
 * and limited to PHYSICAL_PMD_PAGE_MASK before the checked protection
 * bits are OR'ed in.
 */
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t addr = (phys_addr_t)page_nr << PAGE_SHIFT;

        addr = (addr ^ protnone_mask(pgprot_val(pgprot))) & PHYSICAL_PMD_PAGE_MASK;
        return __pmd(addr | check_pgprot(pgprot));
}

/*
 * Build a PUD from page frame number @page_nr and protection @pgprot.
 * The address is XOR'ed with protnone_mask() of the protection value
 * and limited to PHYSICAL_PUD_PAGE_MASK before the checked protection
 * bits are OR'ed in.
 */
static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t addr = (phys_addr_t)page_nr << PAGE_SHIFT;

        addr = (addr ^ protnone_mask(pgprot_val(pgprot))) & PHYSICAL_PUD_PAGE_MASK;
        return __pud(addr | check_pgprot(pgprot));
}

/*
 * Rebuild @pmd from its own pfn and flags, with _PAGE_PRESENT and
 * _PAGE_PROTNONE removed from the flags.
 */
static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
        pgprot_t prot = __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE));

        return pfn_pmd(pmd_pfn(pmd), prot);
}

static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);

/*
 * Apply @newprot to @pte: bits inside _PAGE_CHG_MASK are kept from the
 * old entry, everything else comes from the checked new protection.
 * The result is then passed through flip_protnone_guard() together with
 * the old value.
 */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
        const pteval_t oldval = pte_val(pte);
        pteval_t newval;

        /* Keep the preserved bits, take the remaining ones from newprot. */
        newval = oldval & _PAGE_CHG_MASK;
        newval |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
        newval = flip_protnone_guard(oldval, newval, PTE_PFN_MASK);
        return __pte(newval);
}

/*
 * Apply @newprot to @pmd: bits inside _HPAGE_CHG_MASK are kept from the
 * old entry, everything else comes from the checked new protection.
 * The result is then passed through flip_protnone_guard() together with
 * the old value.
 */
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        const pmdval_t oldval = pmd_val(pmd);
        pmdval_t newval;

        newval = oldval & _HPAGE_CHG_MASK;
        newval |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
        newval = flip_protnone_guard(oldval, newval, PHYSICAL_PMD_PAGE_MASK);
        return __pmd(newval);
}

/*
 * mprotect needs to preserve PAT and encryption bits when updating
 * vm_page_prot: bits inside _PAGE_CHG_MASK are taken from @oldprot,
 * all others from @newprot.
 */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        return __pgprot((pgprot_val(oldprot) & _PAGE_CHG_MASK) |
                        (pgprot_val(newprot) & ~_PAGE_CHG_MASK));
}

#define pte_pgprot(x) __pgprot(pte_flags(x))
#define pmd_pgprot(x) __pgprot(pmd_flags(x))
#define pud_pgprot(x) __pgprot(pud_flags(x))
#define p4d_pgprot(x) __pgprot(p4d_flags(x))

#define canon_pgprot(p) __pgprot(massage_pgprot(p))

/* Return @prot run through massage_pgprot() (same as canon_pgprot()). */
static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
{
        return __pgprot(massage_pgprot(prot));
}

/*
 * Decide whether @new_pcm is an acceptable cache mode when @pcm was
 * requested for the range starting at @paddr of length @size.
 * Returns 1 if allowed, 0 if not.
 */
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
                                         enum page_cache_mode pcm,
                                         enum page_cache_mode new_pcm)
{
        /*
         * PAT type is always WB for untracked ranges, so no need to check.
         */
        if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
                return 1;

        /*
         * Certain new memtypes are not allowed with certain
         * requested memtype:
         * - request is uncached, return cannot be write-back
         * - request is write-combine, return cannot be write-back
         * - request is write-through, return cannot be write-back
         */
        if (new_pcm == _PAGE_CACHE_MODE_WB &&
            (pcm == _PAGE_CACHE_MODE_UC_MINUS ||
             pcm == _PAGE_CACHE_MODE_WC ||
             pcm == _PAGE_CACHE_MODE_WT))
                return 0;

        /* - request is write-through, return cannot be write-combine */
        if (pcm == _PAGE_CACHE_MODE_WT && new_pcm == _PAGE_CACHE_MODE_WC)
                return 0;

        return 1;
}

pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);

#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);

/*
 * Take a PGD location (pgdp) and a pgd value that needs to be set there.
 * Populates the user and returns the resulting PGD that must be set in
 * the kernel copy of the page tables.
 *
 * When the CPU does not have X86_FEATURE_PTI, @pgd is returned as is.
 */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        if (static_cpu_has(X86_FEATURE_PTI))
                return __pti_set_user_pgtbl(pgdp, pgd);

        return pgd;
}
#else   /* CONFIG_PAGE_TABLE_ISOLATION */
/* Without page table isolation the value is passed through unchanged. */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        return pgd;
}
#endif  /* CONFIG_PAGE_TABLE_ISOLATION */

#endif        /* __ASSEMBLY__ */


#ifdef CONFIG_X86_32
# include <asm/pgtable_32.h>
#else
# include <asm/pgtable_64.h>
#endif

#ifndef __ASSEMBLY__
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/log2.h>
#include <asm/fixmap.h>

/* True when @pte has no bits set outside _PAGE_KNL_ERRATUM_MASK. */
static inline int pte_none(pte_t pte)
{
        return (pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}

/* True when the raw values of @a and @b are equal. */
#define __HAVE_ARCH_PTE_SAME
static inline int pte_same(pte_t a, pte_t b)
{
        int same = (a.pte == b.pte);

        return same;
}

/* Non-zero when @a has _PAGE_PRESENT or _PAGE_PROTNONE set. */
static inline int pte_present(pte_t a)
{
        int present = pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);

        return present;
}

#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
/* True when _PAGE_DEVMAP is set in the flags of @a. */
static inline int pte_devmap(pte_t a)
{
        int devmap = (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;

        return devmap;
}
#endif

/*
 * True when @a has _PAGE_PRESENT set, or when it has _PAGE_PROTNONE set
 * while mm_tlb_flush_pending(@mm) reports a pending flush.
 */
#define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
        if (pte_flags(a) & _PAGE_PRESENT)
                return true;

        return (pte_flags(a) & _PAGE_PROTNONE) && mm_tlb_flush_pending(mm);
}

/*
 * Non-zero when @pmd has any of _PAGE_PRESENT, _PAGE_PROTNONE or
 * _PAGE_PSE set.
 *
 * Checking for _PAGE_PSE is needed too because split_huge_page will
 * temporarily clear the present bit (but the _PAGE_PSE flag will remain
 * set at all times while the _PAGE_PRESENT bit is clear).
 */
static inline int pmd_present(pmd_t pmd)
{
        int present = pmd_flags(pmd) &
                      (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);

        return present;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * These work without NUMA balancing but the kernel does not care. See the
 * comment in include/linux/pgtable.h
 */

/* True when @pte has _PAGE_PROTNONE set and _PAGE_PRESENT clear. */
static inline int pte_protnone(pte_t pte)
{
        int protnone = (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
                        == _PAGE_PROTNONE;

        return protnone;
}

/* True when @pmd has _PAGE_PROTNONE set and _PAGE_PRESENT clear. */
static inline int pmd_protnone(pmd_t pmd)
{
        int protnone = (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
                        == _PAGE_PROTNONE;

        return protnone;
}
#endif /* CONFIG_NUMA_BALANCING */

/* True when @pmd has no bits set outside _PAGE_KNL_ERRATUM_MASK. */
static inline int pmd_none(pmd_t pmd)
{
        /*
         * Only check low word on 32-bit platforms, since it might be
         * out of sync with upper half.
         */
        unsigned long low = native_pmd_val(pmd);

        return !(low & ~_PAGE_KNL_ERRATUM_MASK);
}

/* __va() of the address bits of @pmd, selected by pmd_pfn_mask(). */
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
        void *vaddr = __va(pmd_val(pmd) & pmd_pfn_mask(pmd));

        return (unsigned long)vaddr;
}

/*
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pmd_page(pmd)        pfn_to_page(pmd_pfn(pmd))

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 *
 * (Currently stuck as a macro because of indirect forward reference
 * to linux/mm.h:page_to_nid())
 */
#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))

/* True unless the flags of @pmd, ignoring _PAGE_USER, equal _KERNPG_TABLE. */
static inline int pmd_bad(pmd_t pmd)
{
        int bad = (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;

        return bad;
}

/* Convert a page count to megabytes (rounding down). */
static inline unsigned long pages_to_mb(unsigned long npg)
{
        const int pages_per_mb_shift = 20 - PAGE_SHIFT;

        return npg >> pages_per_mb_shift;
}

#if CONFIG_PGTABLE_LEVELS > 2
/* True when @pud has no bits set outside _PAGE_KNL_ERRATUM_MASK. */
static inline int pud_none(pud_t pud)
{
        return !(native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Non-zero when _PAGE_PRESENT is set in the flags of @pud. */
static inline int pud_present(pud_t pud)
{
        int present = pud_flags(pud) & _PAGE_PRESENT;

        return present;
}

/* __va() of the address bits of @pud, as a pmd_t pointer. */
static inline pmd_t *pud_pgtable(pud_t pud)
{
        void *table = __va(pud_val(pud) & pud_pfn_mask(pud));

        return (pmd_t *)table;
}

/*
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pud_page(pud)        pfn_to_page(pud_pfn(pud))

/*
 * pud_leaf() is an alias for pud_large().
 *
 * True when @pud has both _PAGE_PSE and _PAGE_PRESENT set.
 */
#define pud_leaf        pud_large
static inline int pud_large(pud_t pud)
{
        int large = (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
                    (_PAGE_PSE | _PAGE_PRESENT);

        return large;
}

/* True when @pud has any flag outside _KERNPG_TABLE | _PAGE_USER set. */
static inline int pud_bad(pud_t pud)
{
        return !!(pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER));
}
#else
/* With at most two page-table levels pud_large() always reports 0. */
#define pud_leaf        pud_large
static inline int pud_large(pud_t pud)
{
        return 0;
}
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3
/* True when @p4d has no bits set outside _PAGE_KNL_ERRATUM_MASK. */
static inline int p4d_none(p4d_t p4d)
{
        return !(native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Non-zero when _PAGE_PRESENT is set in the flags of @p4d. */
static inline int p4d_present(p4d_t p4d)
{
        int present = p4d_flags(p4d) & _PAGE_PRESENT;

        return present;
}

/* __va() of the address bits of @p4d, as a pud_t pointer. */
static inline pud_t *p4d_pgtable(p4d_t p4d)
{
        void *table = __va(p4d_val(p4d) & p4d_pfn_mask(p4d));

        return (pud_t *)table;
}

/*
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define p4d_page(p4d)        pfn_to_page(p4d_pfn(p4d))

/*
 * True when @p4d has any flag set outside _KERNPG_TABLE | _PAGE_USER.
 * With CONFIG_PAGE_TABLE_ISOLATION, _PAGE_NX is tolerated as well.
 */
static inline int p4d_bad(p4d_t p4d)
{
        unsigned long allowed = _KERNPG_TABLE | _PAGE_USER;

        if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
                allowed |= _PAGE_NX;

        return !!(p4d_flags(p4d) & ~allowed);
}
#endif  /* CONFIG_PGTABLE_LEVELS > 3 */

/* Index of @address within a P4D table. */
static inline unsigned long p4d_index(unsigned long address)
{
        unsigned long shifted = address >> P4D_SHIFT;

        return shifted & (PTRS_PER_P4D - 1);
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * Without 5-level paging enabled every PGD is reported as present;
 * otherwise the result is the _PAGE_PRESENT flag of @pgd.
 */
static inline int pgd_present(pgd_t pgd)
{
        return pgtable_l5_enabled() ? (pgd_flags(pgd) & _PAGE_PRESENT) : 1;
}

/* __va() of the address bits of @pgd, selected by PTE_PFN_MASK. */
static inline unsigned long pgd_page_vaddr(pgd_t pgd)
{
        void *vaddr = __va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);

        return (unsigned long)vaddr;
}

/*
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pgd_page(pgd)        pfn_to_page(pgd_pfn(pgd))

/*
 * Find the P4D entry for @address under @pgd.  Without 5-level paging
 * enabled, @pgd itself is returned, cast to a p4d_t pointer.
 */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
        p4d_t *table;

        if (!pgtable_l5_enabled())
                return (p4d_t *)pgd;

        table = (p4d_t *)pgd_page_vaddr(*pgd);
        return table + p4d_index(address);
}

/*
 * Without 5-level paging enabled no PGD is reported as bad.  Otherwise
 * the flags of @pgd, ignoring _PAGE_USER (and _PAGE_NX with
 * CONFIG_PAGE_TABLE_ISOLATION), must equal _KERNPG_TABLE.
 */
static inline int pgd_bad(pgd_t pgd)
{
        unsigned long ignored;

        if (!pgtable_l5_enabled())
                return 0;

        ignored = _PAGE_USER;
        if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
                ignored |= _PAGE_NX;

        return (pgd_flags(pgd) & ~ignored) != _KERNPG_TABLE;
}

/*
 * Without 5-level paging enabled this always reports 0; otherwise it is
 * true when the raw value of @pgd is zero.
 */
static inline int pgd_none(pgd_t pgd)
{
        /*
         * There is no need to do a workaround for the KNL stray
         * A/D bit erratum here.  PGDs only point to page tables
         * except on 32-bit non-PAE which is not supported on
         * KNL.
         */
        return pgtable_l5_enabled() ? !native_pgd_val(pgd) : 0;
}
#endif        /* CONFIG_PGTABLE_LEVELS > 4 */

#endif        /* __ASSEMBLY__ */

#define KERNEL_PGD_BOUNDARY        pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS                (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)

#ifndef __ASSEMBLY__

extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
extern void memblock_find_dma_reserve(void);
void __init poking_init(void);
unsigned long init_memory_mapping(unsigned long start,
                                  unsigned long end, pgprot_t prot);

#ifdef CONFIG_X86_64
extern pgd_t trampoline_pgd_entry;
#endif

/*
 * Read *@ptep, clear the entry and return the value that was read.
 * Local pte updates need not use xchg for locking.
 */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
        pte_t old = *ptep;

        /* Pure native function needs no input for mm, addr */
        native_pte_clear(NULL, 0, ptep);

        return old;
}

/* Read *@pmdp, clear the entry and return the value that was read. */
static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
{
        pmd_t old = *pmdp;

        native_pmd_clear(pmdp);

        return old;
}

/* Read *@pudp, clear the entry and return the value that was read. */
static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
{
        pud_t old = *pudp;

        native_pud_clear(pudp);

        return old;
}

/* Store @pte at @ptep via set_pte(); @mm and @addr are not used here. */
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep, pte_t pte)
{
        set_pte(ptep, pte);
}

/* Store @pmd at @pmdp via set_pmd(); @mm and @addr are not used here. */
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t pmd)
{
        set_pmd(pmdp, pmd);
}

/*
 * Store @pud at @pudp; @mm and @addr are not used here.
 *
 * NOTE(review): unlike set_pte_at()/set_pmd_at(), this calls
 * native_set_pud() directly rather than set_pud() - confirm that
 * bypassing the set_pud() macro is intended.
 */
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
                              pud_t *pudp, pud_t pud)
{
        native_set_pud(pudp, pud);
}

/*
 * We only update the dirty/accessed state if we set
 * the dirty bit by hand in the kernel, since the hardware
 * will do the accessed bit for us, and we don't want to
 * race with other CPUs that might be updating the dirty
 * bit at the same time.
 */
struct vm_area_struct;

#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pte_t *ptep);

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
extern int ptep_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pte_t *ptep);

/*
 * Clear *@ptep via native_ptep_get_and_clear() and return its result.
 * @mm and @addr are not used here.
 */
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pte_t *ptep)
{
        return native_ptep_get_and_clear(ptep);
}

/*
 * Clear *@ptep and return its previous value.  A non-zero @full selects
 * the local (non-xchg) variant; otherwise ptep_get_and_clear() is used.
 */
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long addr, pte_t *ptep,
                                            int full)
{
        /*
         * Full address destruction in progress; paravirt does not
         * care about updates and native needs no locking
         */
        if (full)
                return native_local_ptep_get_and_clear(ptep);

        return ptep_get_and_clear(mm, addr, ptep);
}

/* Clear bit _PAGE_BIT_RW in *@ptep using clear_bit(). */
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pte_t *ptep)
{
        unsigned long *word = (unsigned long *)&ptep->pte;

        clear_bit(_PAGE_BIT_RW, word);
}

#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)

#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))

#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);

#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pmd_t *pmdp);
extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pud_t *pudp);

#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);


/* Non-zero when _PAGE_RW is set in the flags of @pmd. */
#define pmd_write pmd_write
static inline int pmd_write(pmd_t pmd)
{
        int writable = pmd_flags(pmd) & _PAGE_RW;

        return writable;
}

/*
 * Clear *@pmdp via native_pmdp_get_and_clear() and return its result.
 * @mm and @addr are not used here.
 */
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pmd_t *pmdp)
{
        pmd_t old = native_pmdp_get_and_clear(pmdp);

        return old;
}

/*
 * Clear *@pudp via native_pudp_get_and_clear() and return its result.
 * @mm and @addr are not used here.
 */
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                        unsigned long addr, pud_t *pudp)
{
        pud_t old = native_pudp_get_and_clear(pudp);

        return old;
}

/* Clear bit _PAGE_BIT_RW in *@pmdp using clear_bit(). */
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pmd_t *pmdp)
{
        unsigned long *word = (unsigned long *)pmdp;

        clear_bit(_PAGE_BIT_RW, word);
}

/* Non-zero when _PAGE_RW is set in the flags of @pud. */
#define pud_write pud_write
static inline int pud_write(pud_t pud)
{
        int writable = pud_flags(pud) & _PAGE_RW;

        return writable;
}

#ifndef pmdp_establish
#define pmdp_establish pmdp_establish
/*
 * Install @pmd at @pmdp and return the previous entry.  With CONFIG_SMP
 * the swap is done with xchg(); otherwise a plain read followed by
 * WRITE_ONCE() is used.
 */
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        pmd_t old;

        if (IS_ENABLED(CONFIG_SMP))
                return xchg(pmdp, pmd);

        old = *pmdp;
        WRITE_ONCE(*pmdp, pmd);
        return old;
}
#endif
/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 *
 * Returns true for parts of the PGD that map userspace and
 * false for the parts that map the kernel.
 */
static inline bool pgdp_maps_userspace(void *__ptr)
{
        /* Byte offset of the entry within its page. */
        unsigned long offset = (unsigned long)__ptr & ~PAGE_MASK;

        return (offset / sizeof(pgd_t)) < PGD_KERNEL_START;
}

/* pgd_leaf() is an alias for pgd_large(), which always reports 0. */
#define pgd_leaf        pgd_large
static inline int pgd_large(pgd_t pgd) { return 0; }

#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
 * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
 * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
 * the user one is in the last 4k.  To switch between them, you
 * just need to flip the 12th bit in their addresses.
 */
#define PTI_PGTABLE_SWITCH_BIT        PAGE_SHIFT

/*
 * This generates better code than the inline assembly in
 * __set_bit().
 */
/* Return @ptr with bit number @bit of its address value set. */
static inline void *ptr_set_bit(void *ptr, int bit)
{
        return (void *)((unsigned long)ptr | BIT(bit));
}
/* Return @ptr with bit number @bit of its address value cleared. */
static inline void *ptr_clear_bit(void *ptr, int bit)
{
        return (void *)((unsigned long)ptr & ~BIT(bit));
}

/* Set PTI_PGTABLE_SWITCH_BIT in @pgdp to obtain the user-side pointer. */
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
        void *user_side = ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return user_side;
}

/* Clear PTI_PGTABLE_SWITCH_BIT in @pgdp to obtain the kernel-side pointer. */
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
        void *kernel_side = ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return kernel_side;
}

/* Set PTI_PGTABLE_SWITCH_BIT in @p4dp to obtain the user-side pointer. */
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
        void *user_side = ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return user_side;
}

/* Clear PTI_PGTABLE_SWITCH_BIT in @p4dp to obtain the kernel-side pointer. */
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
        void *kernel_side = ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return kernel_side;
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */

/*
 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
 *
 *  dst - pointer to pgd range anywhere on a pgd page
 *  src - ""
 *  count - the number of pgds to copy.
 *
 * dst and src can be on the same page, but the range must not overlap,
 * and must not cross a page boundary.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
        /* Number of bytes occupied by @count pgd entries. */
        size_t nbytes = count * sizeof(pgd_t);

        memcpy(dst, src, nbytes);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
        /* When PTI is active, mirror the copy into the user-side pgd too. */
        if (static_cpu_has(X86_FEATURE_PTI))
                memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
                       nbytes);
#endif
}

#define PTE_SHIFT ilog2(PTRS_PER_PTE)
/*
 * Address shift for a mapping at @level: starts from
 * (PAGE_SHIFT - PTE_SHIFT) and adds PTE_SHIFT once per level.
 */
static inline int page_level_shift(enum pg_level level)
{
        int shift = PAGE_SHIFT - PTE_SHIFT;

        shift += level * PTE_SHIFT;
        return shift;
}
/* Size in bytes of a mapping at @level: 1UL << page_level_shift(level). */
static inline unsigned long page_level_size(enum pg_level level)
{
        int shift = page_level_shift(level);

        return 1UL << shift;
}
/* Mask that clears the offset bits below page_level_size(level). */
static inline unsigned long page_level_mask(enum pg_level level)
{
        unsigned long size = page_level_size(level);

        return ~(size - 1);
}

/*
 * The x86 doesn't have any external MMU info: the kernel page
 * tables contain all the necessary information.
 */
/* PTE-level hook: intentionally a no-op, all three arguments are ignored. */
static inline void update_mmu_cache(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep)
{
}
/* PMD-level hook: intentionally a no-op, all three arguments are ignored. */
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmd)
{
}
/* PUD-level hook: intentionally a no-op, all three arguments are ignored. */
static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
                unsigned long addr, pud_t *pud)
{
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Return a copy of @pte with _PAGE_SWP_SOFT_DIRTY set. */
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        pte_t marked = pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);

        return marked;
}

/* Non-zero when _PAGE_SWP_SOFT_DIRTY is set in the flags of @pte. */
static inline int pte_swp_soft_dirty(pte_t pte)
{
        int soft_dirty = pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;

        return soft_dirty;
}

/* Return a copy of @pte with _PAGE_SWP_SOFT_DIRTY cleared. */
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        pte_t cleared = pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);

        return cleared;
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* Return a copy of @pmd with _PAGE_SWP_SOFT_DIRTY set. */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        pmd_t marked = pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);

        return marked;
}

/* Non-zero when _PAGE_SWP_SOFT_DIRTY is set in the flags of @pmd. */
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        int soft_dirty = pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY;

        return soft_dirty;
}

/* Return a copy of @pmd with _PAGE_SWP_SOFT_DIRTY cleared. */
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        pmd_t cleared = pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);

        return cleared;
}
#endif
#endif

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Return a copy of @pte with _PAGE_SWP_UFFD_WP set. */
static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
{
        pte_t marked = pte_set_flags(pte, _PAGE_SWP_UFFD_WP);

        return marked;
}

/* Non-zero when _PAGE_SWP_UFFD_WP is set in the flags of @pte. */
static inline int pte_swp_uffd_wp(pte_t pte)
{
        int uffd_wp = pte_flags(pte) & _PAGE_SWP_UFFD_WP;

        return uffd_wp;
}

/* Return a copy of @pte with _PAGE_SWP_UFFD_WP cleared. */
static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
{
        pte_t cleared = pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);

        return cleared;
}

/* Return a copy of @pmd with _PAGE_SWP_UFFD_WP set. */
static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
{
        pmd_t marked = pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP);

        return marked;
}

/* Non-zero when _PAGE_SWP_UFFD_WP is set in the flags of @pmd. */
static inline int pmd_swp_uffd_wp(pmd_t pmd)
{
        int uffd_wp = pmd_flags(pmd) & _PAGE_SWP_UFFD_WP;

        return uffd_wp;
}

/* Return a copy of @pmd with _PAGE_SWP_UFFD_WP cleared. */
static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
{
        pmd_t cleared = pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP);

        return cleared;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

#define PKRU_AD_BIT 0x1u
#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
extern u32 init_pkru_value;
#else
#define init_pkru_value        0
#endif

/*
 * True when the access-disable bit for @pkey is clear in @pkru.
 * Each key owns PKRU_BITS_PER_PKEY consecutive bits of the register value.
 */
static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
{
        u32 ad_mask = PKRU_AD_BIT << (pkey * PKRU_BITS_PER_PKEY);

        return (pkru & ad_mask) == 0;
}

/*
 * True when neither the access-disable nor the write-disable bit for
 * @pkey is set in @pkru.  Both are tested because access-disable blocks
 * writes as well.
 */
static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
{
        u32 deny_mask = (PKRU_AD_BIT|PKRU_WD_BIT) << (pkey * PKRU_BITS_PER_PKEY);

        return (pkru & deny_mask) == 0;
}

/*
 * Extract the protection key encoded in @pte_flags.  Without
 * CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS the result is always 0.
 */
static inline u16 pte_flags_pkey(unsigned long pte_flags)
{
        u16 pkey = 0;

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        /* ifdef to avoid doing 59-bit shift on 32-bit values */
        pkey = (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
#endif
        return pkey;
}

/*
 * Check the value returned by read_pkru() against @pkey: reads must be
 * allowed, and when @write is set writes must be allowed as well.
 */
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
        u32 pkru = read_pkru();
        bool permitted = __pkru_allows_read(pkru, pkey);

        /* Only consult the write check when reads passed and a write is requested. */
        if (permitted && write)
                permitted = __pkru_allows_write(pkru, pkey);

        return permitted;
}

/*
 * 'pteval' can come from a PTE, PMD or PUD.  We only check
 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
 * same value on all 3 types.
 */
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
        /* Bits that must all be set in @pteval for the access to proceed. */
        unsigned long required = _PAGE_PRESENT|_PAGE_USER;

        if (write)
                required |= _PAGE_RW;

        /* Page-table bits are sufficient: defer the final say to PKRU. */
        if ((pteval & required) == required)
                return __pkru_allows_pkey(pte_flags_pkey(pteval), write);

        return false;
}

#define pte_access_permitted pte_access_permitted
/* PTE front end: applies __pte_access_permitted() to pte_val(@pte). */
static inline bool pte_access_permitted(pte_t pte, bool write)
{
        unsigned long raw = pte_val(pte);

        return __pte_access_permitted(raw, write);
}

#define pmd_access_permitted pmd_access_permitted
/* PMD front end: applies __pte_access_permitted() to pmd_val(@pmd). */
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
        unsigned long raw = pmd_val(pmd);

        return __pte_access_permitted(raw, write);
}

#define pud_access_permitted pud_access_permitted
/* PUD front end: applies __pte_access_permitted() to pud_val(@pud). */
static inline bool pud_access_permitted(pud_t pud, bool write)
{
        unsigned long raw = pud_val(pud);

        return __pte_access_permitted(raw, write);
}

#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);

/* True when the boot CPU is flagged with X86_BUG_L1TF. */
static inline bool arch_has_pfn_modify_check(void)
{
        bool has_l1tf = boot_cpu_has_bug(X86_BUG_L1TF);

        return has_l1tf;
}

#define arch_faults_on_old_pte arch_faults_on_old_pte
/* Arch override of the generic hook: unconditionally reports false. */
static inline bool arch_faults_on_old_pte(void)
{
        return false;
}

#endif        /* __ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_H */




































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/* SPDX-License-Identifier: GPL-2.0 */
/*  linux/include/linux/clockchips.h
 *
 *  This file contains the structure definitions for clockchips.
 *
 *  If you are not a clockchip, or the time of day code, you should
 *  not be including this file!
 */
#ifndef _LINUX_CLOCKCHIPS_H
#define _LINUX_CLOCKCHIPS_H

#ifdef CONFIG_GENERIC_CLOCKEVENTS

# include <linux/clocksource.h>
# include <linux/cpumask.h>
# include <linux/ktime.h>
# include <linux/notifier.h>

struct clock_event_device;
struct module;

/*
 * Possible states of a clock event device.
 *
 * DETACHED:        Device is not used by clockevents core. Initial state or can be
 *                reached from SHUTDOWN.
 * SHUTDOWN:        Device is powered-off. Can be reached from PERIODIC or ONESHOT.
 * PERIODIC:        Device is programmed to generate events periodically. Can be
 *                reached from DETACHED or SHUTDOWN.
 * ONESHOT:        Device is programmed to generate event only once. Can be reached
 *                from DETACHED or SHUTDOWN.
 * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily
 *                    stopped.
 */
enum clock_event_state {
        CLOCK_EVT_STATE_DETACHED,        /* not used by the clockevents core */
        CLOCK_EVT_STATE_SHUTDOWN,        /* powered off */
        CLOCK_EVT_STATE_PERIODIC,        /* generating events periodically */
        CLOCK_EVT_STATE_ONESHOT,        /* programmed for a single event */
        CLOCK_EVT_STATE_ONESHOT_STOPPED,        /* oneshot mode, temporarily stopped */
};

/*
 * Clock event features
 */
# define CLOCK_EVT_FEAT_PERIODIC        0x000001
# define CLOCK_EVT_FEAT_ONESHOT                0x000002
# define CLOCK_EVT_FEAT_KTIME                0x000004

/*
 * x86(64) specific (mis)features:
 *
 * - Clockevent source stops in C3 State and needs broadcast support.
 * - Local APIC timer is used as a dummy device.
 */
# define CLOCK_EVT_FEAT_C3STOP                0x000008
# define CLOCK_EVT_FEAT_DUMMY                0x000010

/*
 * Core shall set the interrupt affinity dynamically in broadcast mode
 */
# define CLOCK_EVT_FEAT_DYNIRQ                0x000020
# define CLOCK_EVT_FEAT_PERCPU                0x000040

/*
 * Clockevent device is based on a hrtimer for broadcast
 */
# define CLOCK_EVT_FEAT_HRTIMER                0x000080

/**
 * struct clock_event_device - clock event device descriptor
 * @event_handler:        Assigned by the framework to be called by the low
 *                        level handler of the event source
 * @set_next_event:        set next event function using a clocksource delta
 * @set_next_ktime:        set next event function using a direct ktime value
 * @next_event:                local storage for the next event in oneshot mode
 * @max_delta_ns:        maximum delta value in ns
 * @min_delta_ns:        minimum delta value in ns
 * @mult:                nanosecond to cycles multiplier
 * @shift:                nanoseconds to cycles divisor (power of two)
 * @state_use_accessors:current state of the device, assigned by the core code
 * @features:                features
 * @retries:                number of forced programming retries
 * @set_state_periodic:        switch state to periodic
 * @set_state_oneshot:        switch state to oneshot
 * @set_state_oneshot_stopped: switch state to oneshot_stopped
 * @set_state_shutdown:        switch state to shutdown
 * @tick_resume:        resume clkevt device
 * @broadcast:                function to broadcast events
 * @suspend:                device suspend callback (NOTE(review): was missing
 *                        from this kernel-doc; exact call site not visible
 *                        here - confirm)
 * @resume:                device resume callback (NOTE(review): was missing
 *                        from this kernel-doc; exact call site not visible
 *                        here - confirm)
 * @min_delta_ticks:        minimum delta value in ticks stored for reconfiguration
 * @max_delta_ticks:        maximum delta value in ticks stored for reconfiguration
 * @name:                ptr to clock event name
 * @rating:                variable to rate clock event devices
 * @irq:                IRQ number (only for non CPU local devices)
 * @bound_on:                Bound on CPU
 * @cpumask:                cpumask to indicate for which CPUs this device works
 * @list:                list head for the management code
 * @owner:                module reference
 */
struct clock_event_device {
        void                        (*event_handler)(struct clock_event_device *);
        int                        (*set_next_event)(unsigned long evt, struct clock_event_device *);
        int                        (*set_next_ktime)(ktime_t expires, struct clock_event_device *);
        ktime_t                        next_event;
        u64                        max_delta_ns;
        u64                        min_delta_ns;
        u32                        mult;
        u32                        shift;
        enum clock_event_state        state_use_accessors;
        unsigned int                features;
        unsigned long                retries;

        int                        (*set_state_periodic)(struct clock_event_device *);
        int                        (*set_state_oneshot)(struct clock_event_device *);
        int                        (*set_state_oneshot_stopped)(struct clock_event_device *);
        int                        (*set_state_shutdown)(struct clock_event_device *);
        int                        (*tick_resume)(struct clock_event_device *);

        void                        (*broadcast)(const struct cpumask *mask);
        void                        (*suspend)(struct clock_event_device *);
        void                        (*resume)(struct clock_event_device *);
        unsigned long                min_delta_ticks;
        unsigned long                max_delta_ticks;

        const char                *name;
        int                        rating;
        int                        irq;
        int                        bound_on;
        const struct cpumask        *cpumask;
        struct list_head        list;
        struct module                *owner;
} ____cacheline_aligned;

/* Helpers to verify state of a clockevent device */
static inline bool clockevent_state_detached(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED;
}

static inline bool clockevent_state_shutdown(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN;
}

static inline bool clockevent_state_periodic(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC;
}

static inline bool clockevent_state_oneshot(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT;
}

static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev)
{
        return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED;
}

/*
 * Calculate a multiplication factor for scaled math, which is used to convert
 * nanoseconds based values to clock ticks:
 *
 * clock_ticks = (nanoseconds * factor) >> shift.
 *
 * div_sc is the rearranged equation to calculate a factor from a given clock
 * ticks / nanoseconds ratio:
 *
 * factor = (clock_ticks << shift) / nanoseconds
 */
static inline unsigned long
div_sc(unsigned long ticks, unsigned long nsec, int shift)
{
        u64 tmp = ((u64)ticks) << shift;

        do_div(tmp, nsec);

        return (unsigned long) tmp;
}

/* Clock event layer functions */
extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt);
extern void clockevents_register_device(struct clock_event_device *dev);
extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);

extern void clockevents_config_and_register(struct clock_event_device *dev,
                                            u32 freq, unsigned long min_delta,
                                            unsigned long max_delta);

extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq);

/*
 * Fill in @ce->mult and @ce->shift by handing them to
 * clocks_calc_mult_shift() together with NSEC_PER_SEC, @freq and @maxsec.
 *
 * @ce:     clock event device whose mult/shift fields are written
 * @freq:   forwarded unchanged to clocks_calc_mult_shift()
 * @maxsec: forwarded unchanged to clocks_calc_mult_shift()
 */
static inline void
clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec)
{
        /*
         * Plain call, no 'return': this function is void and ISO C does
         * not allow a return statement with an expression in it.
         */
        clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec);
}

extern void clockevents_suspend(void);
extern void clockevents_resume(void);

# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
#  ifdef CONFIG_ARCH_HAS_TICK_BROADCAST
extern void tick_broadcast(const struct cpumask *mask);
#  else
#   define tick_broadcast        NULL
#  endif
extern int tick_receive_broadcast(void);
# endif

# if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern void tick_setup_hrtimer_broadcast(void);
extern int tick_check_broadcast_expired(void);
# else
/*
 * Stubs for builds without both CONFIG_GENERIC_CLOCKEVENTS_BROADCAST and
 * CONFIG_TICK_ONESHOT: the check reports 0 and the setup does nothing.
 */
static inline int tick_check_broadcast_expired(void) { return 0; }
static inline void tick_setup_hrtimer_broadcast(void) { }
# endif

#else /* !CONFIG_GENERIC_CLOCKEVENTS: */

/*
 * Stubs for builds without CONFIG_GENERIC_CLOCKEVENTS: suspend/resume and
 * hrtimer broadcast setup do nothing, the broadcast check reports 0.
 */
static inline void clockevents_suspend(void) { }
static inline void clockevents_resume(void) { }
static inline int tick_check_broadcast_expired(void) { return 0; }
static inline void tick_setup_hrtimer_broadcast(void) { }

#endif /* !CONFIG_GENERIC_CLOCKEVENTS */

#endif /* _LINUX_CLOCKCHIPS_H */











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_COMMON_H
#define _NF_CONNTRACK_COMMON_H

#include <linux/atomic.h>
#include <uapi/linux/netfilter/nf_conntrack_common.h>

/*
 * Plain unsigned counters, one per conntrack event type.
 * NOTE(review): where these are incremented and whether they are kept
 * per CPU is not visible in this header - confirm against the users.
 */
struct ip_conntrack_stat {
        unsigned int found;
        unsigned int invalid;
        unsigned int insert;
        unsigned int insert_failed;
        unsigned int clash_resolve;
        unsigned int drop;
        unsigned int early_drop;
        unsigned int error;
        unsigned int expect_new;
        unsigned int expect_create;
        unsigned int expect_delete;
        unsigned int search_restart;
};

#define NFCT_INFOMASK        7UL
#define NFCT_PTRMASK        ~(NFCT_INFOMASK)

/*
 * Reference-counted base object: nf_conntrack_get() increments @use and
 * nf_conntrack_put() decrements it, destroying the object at zero.
 */
struct nf_conntrack {
        atomic_t use;
};

void nf_conntrack_destroy(struct nf_conntrack *nfct);
/*
 * Drop one reference on @nfct; when the count reaches zero the object is
 * handed to nf_conntrack_destroy().  A NULL @nfct is ignored.
 */
static inline void nf_conntrack_put(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        if (atomic_dec_and_test(&nfct->use))
                nf_conntrack_destroy(nfct);
}
/* Take one reference on @nfct.  A NULL @nfct is ignored. */
static inline void nf_conntrack_get(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        atomic_inc(&nfct->use);
}

#endif /* _NF_CONNTRACK_COMMON_H */


























































































































































































































































































    1 


















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Runtime locking correctness validator
 *
 *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * see Documentation/locking/lockdep-design.rst for more details.
 */
#ifndef __LINUX_LOCKDEP_H
#define __LINUX_LOCKDEP_H

#include <linux/lockdep_types.h>
#include <linux/smp.h>
#include <asm/percpu.h>

struct task_struct;

/* for sysctl */
extern int prove_locking;
extern int lock_stat;

#ifdef CONFIG_LOCKDEP

#include <linux/linkage.h>
#include <linux/list.h>
#include <linux/debug_locks.h>
#include <linux/stacktrace.h>

/*
 * Copy lockdep map @from into @to, then reset every class_cache slot of
 * the copy to NULL.
 */
static inline void lockdep_copy_map(struct lockdep_map *to,
                                    struct lockdep_map *from)
{
        int slot = 0;

        *to = *from;

        /*
         * The class cache may change under us, so the struct copy above can
         * pick up torn pointers (64bit arch using 32bit copy insns).  Wipe
         * the cached entries in the copy and accept the performance cost.
         *
         * XXX this interacts badly with lockdep_set_class_and_subclass(),
         *     which relies on cache abuse.
         */
        while (slot < NR_LOCKDEP_CACHING_CLASSES)
                to->class_cache[slot++] = NULL;
}

/*
 * Every lock has a list of other locks that were taken after it.
 * We only grow the list, never remove from it:
 */
struct lock_list {
        /* NOTE(review): presumably links this record into a per-class list - confirm */
        struct list_head                entry;
        struct lock_class                *class;
        struct lock_class                *links_to;
        const struct lock_trace                *trace;
        u16                                distance;
        /* bitmap of different dependencies from head to this */
        u8                                dep;
        /* used by BFS to record whether "prev -> this" only has -(*R)-> */
        u8                                only_xr;

        /*
         * The parent field is used to implement breadth-first search, and the
         * bit 0 is reused to indicate if the lock has been accessed in BFS.
         */
        struct lock_list                *parent;
};

/**
 * struct lock_chain - lock dependency chain record
 *
 * @irq_context: the same as irq_context in held_lock below (2-bit field)
 * @depth:       the number of held locks in this chain (6-bit field)
 * @base:        the index in chain_hlocks for this chain (24-bit field)
 * @entry:       the collided lock chains in lock_chain hash list
 * @chain_key:   the hash key of this lock_chain
 *
 * The three bit-fields share a single unsigned int.
 */
struct lock_chain {
        /* see BUILD_BUG_ON()s in add_chain_cache() */
        unsigned int                        irq_context :  2,
                                        depth       :  6,
                                        base            : 24;
        /* 4 byte hole */
        struct hlist_node                entry;
        u64                                chain_key;
};

#define MAX_LOCKDEP_KEYS_BITS                13
#define MAX_LOCKDEP_KEYS                (1UL << MAX_LOCKDEP_KEYS_BITS)
#define INITIAL_CHAIN_KEY                -1

struct held_lock {
        /*
         * One-way hash of the dependency chain up to this point. We
         * hash the hashes step by step as the dependency chain grows.
         *
         * We use it for dependency-caching and we skip detection
         * passes and dependency-updates if there is a cache-hit, so
         * it is absolutely critical for 100% coverage of the validator
         * to have a unique key value for every unique dependency path
         * that can occur in the system, to make a unique hash value
         * as likely as possible - hence the 64-bit width.
         *
         * The task struct holds the current hash value (initialized
         * with zero), here we store the previous hash value:
         *
         * NOTE(review): INITIAL_CHAIN_KEY is defined as -1 in this header,
         * so "initialized with zero" may be stale - confirm.
         */
        u64                                prev_chain_key;
        unsigned long                        acquire_ip;
        struct lockdep_map                *instance;
        struct lockdep_map                *nest_lock;
#ifdef CONFIG_LOCK_STAT
        u64                                 waittime_stamp;
        u64                                holdtime_stamp;
#endif
        /*
         * class_idx is zero-indexed; it points to the element in
         * lock_classes this held lock instance belongs to. class_idx is in
         * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
         */
        unsigned int                        class_idx:MAX_LOCKDEP_KEYS_BITS;
        /*
         * The lock-stack is unified in that the lock chains of interrupt
         * contexts nest on top of process context chains, but we 'separate'
         * the hashes by starting with 0 if we cross into an interrupt
         * context, and we also do not add cross-context lock
         * dependencies - the lock usage graph walking covers that area
         * anyway, and we'd just unnecessarily increase the number of
         * dependencies otherwise. [Note: hardirq and softirq contexts
         * are separated from each other too.]
         *
         * The following field is used to detect when we cross into an
         * interrupt context:
         */
        unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
        unsigned int trylock:1;                                                /* 16 bits */

        unsigned int read:2;        /* see lock_acquire() comment */
        unsigned int check:1;       /* see lock_acquire() comment */
        unsigned int hardirqs_off:1;
        unsigned int references:12;                                        /* 32 bits */
        unsigned int pin_count;
};

/*
 * Initialization, self-test and debugging-output methods:
 */
extern void lockdep_init(void);
extern void lockdep_reset(void);
extern void lockdep_reset_lock(struct lockdep_map *lock);
extern void lockdep_free_key_range(void *start, unsigned long size);
extern asmlinkage void lockdep_sys_exit(void);
extern void lockdep_set_selftest_task(struct task_struct *task);

extern void lockdep_init_task(struct task_struct *task);

/*
 * Split the recursion counter in two to readily detect 'off' vs recursion.
 */
#define LOCKDEP_RECURSION_BITS        16
#define LOCKDEP_OFF                (1U << LOCKDEP_RECURSION_BITS)
#define LOCKDEP_RECURSION_MASK        (LOCKDEP_OFF - 1)

/*
 * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due
 * to header dependencies.
 */

#define lockdep_off()                                        \
do {                                                        \
        current->lockdep_recursion += LOCKDEP_OFF;        \
} while (0)

#define lockdep_on()                                        \
do {                                                        \
        current->lockdep_recursion -= LOCKDEP_OFF;        \
} while (0)

extern void lockdep_register_key(struct lock_class_key *key);
extern void lockdep_unregister_key(struct lock_class_key *key);

/*
 * These methods are used by specific locking variants (spinlocks,
 * rwlocks, mutexes and rwsems) to pass init/acquire/release events
 * to lockdep:
 */

extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
        struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type);

/*
 * Convenience form of lockdep_init_map_type() that always passes
 * LD_LOCK_NORMAL as the lock type; everything else is forwarded as given.
 */
static inline void
lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
                       struct lock_class_key *key, int subclass, u8 inner, u8 outer)
{
        const u8 type = LD_LOCK_NORMAL;

        lockdep_init_map_type(lock, name, key, subclass, inner, outer, type);
}

/*
 * Convenience form of lockdep_init_map_waits() that always passes
 * LD_WAIT_INV as the outer wait type.
 */
static inline void
lockdep_init_map_wait(struct lockdep_map *lock, const char *name,
                      struct lock_class_key *key, int subclass, u8 inner)
{
        const u8 outer = LD_WAIT_INV;

        lockdep_init_map_waits(lock, name, key, subclass, inner, outer);
}

static inline void lockdep_init_map(struct lockdep_map *lock, const char *name,
                             struct lock_class_key *key, int subclass)
{
        lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV);
}

/*
 * Reinitialize a lock key - for cases where there is special locking or
 * special initialization of locks so that the validator gets the scope
 * of dependencies wrong: they are either too broad (they need a class-split)
 * or they are too narrow (they suffer from a false class-split):
 */
#define lockdep_set_class(lock, key)                                \
        lockdep_init_map_type(&(lock)->dep_map, #key, key, 0,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

#define lockdep_set_class_and_name(lock, key, name)                \
        lockdep_init_map_type(&(lock)->dep_map, name, key, 0,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

#define lockdep_set_class_and_subclass(lock, key, sub)                \
        lockdep_init_map_type(&(lock)->dep_map, #key, key, sub,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

#define lockdep_set_subclass(lock, sub)                                        \
        lockdep_init_map_type(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\
                              (lock)->dep_map.wait_type_inner,                \
                              (lock)->dep_map.wait_type_outer,                \
                              (lock)->dep_map.lock_type)

#define lockdep_set_novalidate_class(lock) \
        lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock)

/*
 * Compare locking classes
 */
#define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key)

/* Returns 1 when @lock's key pointer is exactly @key, otherwise 0. */
static inline int lockdep_match_key(struct lockdep_map *lock,
                                    struct lock_class_key *key)
{
        int same = (lock->key == key);

        return same;
}

/*
 * Acquire a lock.
 *
 * Values for "read":
 *
 *   0: exclusive (write) acquire
 *   1: read-acquire (no recursion allowed)
 *   2: read-acquire with same-instance recursion allowed
 *
 * Values for check:
 *
 *   0: simple checks (freeing, held-at-exit-time, etc.)
 *   1: full validation
 */
extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
                         int trylock, int read, int check,
                         struct lockdep_map *nest_lock, unsigned long ip);

extern void lock_release(struct lockdep_map *lock, unsigned long ip);

/*
 * Same "read" as for lock_acquire(), except -1 means any.
 */
extern int lock_is_held_type(const struct lockdep_map *lock, int read);

/* Query lock_is_held_type() for @lock with read == -1, i.e. "any". */
static inline int lock_is_held(const struct lockdep_map *lock)
{
        const int any_read = -1;

        return lock_is_held_type(lock, any_read);
}

#define lockdep_is_held(lock)                lock_is_held(&(lock)->dep_map)
#define lockdep_is_held_type(lock, r)        lock_is_held_type(&(lock)->dep_map, (r))

extern void lock_set_class(struct lockdep_map *lock, const char *name,
                           struct lock_class_key *key, unsigned int subclass,
                           unsigned long ip);

/*
 * Change only the subclass of @lock: forwards to lock_set_class() with the
 * name and key already stored in the map, together with @subclass and @ip.
 */
static inline void lock_set_subclass(struct lockdep_map *lock,
                unsigned int subclass, unsigned long ip)
{
        const char *name = lock->name;
        struct lock_class_key *key = lock->key;

        lock_set_class(lock, name, key, subclass, ip);
}

extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);

#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }

extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);

#define lockdep_depth(tsk)        (debug_locks ? (tsk)->lockdep_depth : 0)

/*
 * Held-lock assertions. Each one only fires while debug_locks is non-zero.
 */

/* WARN_ON() if @l is not held in any mode. */
#define lockdep_assert_held(l)        do {                                \
                WARN_ON(debug_locks && !lockdep_is_held(l));        \
        } while (0)

/* WARN_ON() if @l is not held with read == 0 (exclusive/write). */
#define lockdep_assert_held_write(l)        do {                        \
                WARN_ON(debug_locks && !lockdep_is_held_type(l, 0));        \
        } while (0)

/* WARN_ON() if @l is not held with read == 1 (read-acquire). */
#define lockdep_assert_held_read(l)        do {                                \
                WARN_ON(debug_locks && !lockdep_is_held_type(l, 1));        \
        } while (0)

/* Same check as lockdep_assert_held(), but reported via WARN_ON_ONCE(). */
#define lockdep_assert_held_once(l)        do {                                \
                WARN_ON_ONCE(debug_locks && !lockdep_is_held(l));        \
        } while (0)

/* WARN_ON_ONCE() if current->lockdep_depth is non-zero. */
#define lockdep_assert_none_held_once()        do {                                \
                WARN_ON_ONCE(debug_locks && current->lockdep_depth);        \
        } while (0)

#define lockdep_recursing(tsk)        ((tsk)->lockdep_recursion)

#define lockdep_pin_lock(l)        lock_pin_lock(&(l)->dep_map)
#define lockdep_repin_lock(l,c)        lock_repin_lock(&(l)->dep_map, (c))
#define lockdep_unpin_lock(l,c)        lock_unpin_lock(&(l)->dep_map, (c))

#else /* !CONFIG_LOCKDEP */

/*
 * Empty stand-ins for the lockdep task/enable hooks; this is the
 * !CONFIG_LOCKDEP branch, so none of them does anything.
 */
static inline void lockdep_init_task(struct task_struct *task)
{
}

static inline void lockdep_off(void)
{
}

static inline void lockdep_on(void)
{
}

static inline void lockdep_set_selftest_task(struct task_struct *task)
{
}

/*
 * No-op macro versions of the lockdep API for the !CONFIG_LOCKDEP branch.
 *
 * The init/set-class variants still evaluate their name and/or key
 * arguments through (void) casts; presumably this keeps variables that are
 * only passed to lockdep from being reported as unused -- confirm.
 * All other arguments are dropped without being evaluated.
 */
# define lock_acquire(l, s, t, r, c, n, i)        do { } while (0)
# define lock_release(l, i)                        do { } while (0)
# define lock_downgrade(l, i)                        do { } while (0)
# define lock_set_class(l, n, k, s, i)                do { } while (0)
# define lock_set_subclass(l, s, i)                do { } while (0)
# define lockdep_init()                                do { } while (0)
# define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_wait(lock, name, key, sub, inner) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map(lock, name, key, sub) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_set_class(lock, key)                do { (void)(key); } while (0)
# define lockdep_set_class_and_name(lock, key, name) \
                do { (void)(key); (void)(name); } while (0)
#define lockdep_set_class_and_subclass(lock, key, sub) \
                do { (void)(key); } while (0)
#define lockdep_set_subclass(lock, sub)                do { } while (0)

#define lockdep_set_novalidate_class(lock) do { } while (0)

/*
 * We don't define lockdep_match_class() and lockdep_match_key() for the
 * !LOCKDEP case, since the result is not well defined; callers should
 * instead #ifdef the call themselves.
 */

# define lockdep_reset()                do { debug_locks = 1; } while (0)
# define lockdep_free_key_range(start, size)        do { } while (0)
# define lockdep_sys_exit()                         do { } while (0)

static inline void lockdep_register_key(struct lock_class_key *key)
{
}

static inline void lockdep_unregister_key(struct lock_class_key *key)
{
}

/* !CONFIG_LOCKDEP: no lock stack is tracked, so the depth is always 0. */
#define lockdep_depth(tsk)        (0)

/* !CONFIG_LOCKDEP: always evaluates to 1, i.e. every lock is reported as held. */
#define lockdep_is_held_type(l, r)                (1)

/*
 * !CONFIG_LOCKDEP: the assertions check nothing; the lock argument is only
 * evaluated via a (void) cast.
 */
#define lockdep_assert_held(l)                        do { (void)(l); } while (0)
#define lockdep_assert_held_write(l)        do { (void)(l); } while (0)
#define lockdep_assert_held_read(l)                do { (void)(l); } while (0)
#define lockdep_assert_held_once(l)                do { (void)(l); } while (0)
#define lockdep_assert_none_held_once()        do { } while (0)

#define lockdep_recursing(tsk)                        (0)

/* Empty pin cookie. */
#define NIL_COOKIE (struct pin_cookie){ }

/*
 * !CONFIG_LOCKDEP pinning: lockdep_pin_lock() yields an empty-initialized
 * cookie without evaluating @l; repin/unpin only evaluate their arguments.
 */
#define lockdep_pin_lock(l)                        ({ struct pin_cookie cookie = { }; cookie; })
#define lockdep_repin_lock(l, c)                do { (void)(l); (void)(c); } while (0)
#define lockdep_unpin_lock(l, c)                do { (void)(l); (void)(c); } while (0)

#endif /* !LOCKDEP */

enum xhlock_context_t {
        XHLOCK_HARD,
        XHLOCK_SOFT,
        XHLOCK_CTX_NR,
};

#define lockdep_init_map_crosslock(m, n, k, s) do {} while (0)
/*
 * To initialize a lockdep_map statically use this macro.
 * Note that _name must not be NULL.
 */
#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
        { .name = (_name), .key = (void *)(_key), }

static inline void lockdep_invariant_state(bool force) {}
static inline void lockdep_free_task(struct task_struct *task) {}

#ifdef CONFIG_LOCK_STAT

extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);

/*
 * LOCK_CONTENDED - acquire @_lock while reporting lock statistics.
 *
 * First attempts try(_lock). If that fails, lock_contended() is reported
 * and the lock is then taken with lock(_lock). In both cases
 * lock_acquired() is reported once the lock has been obtained.
 */
#define LOCK_CONTENDED(_lock, try, lock)                        \
do {                                                                \
        if (!try(_lock)) {                                        \
                lock_contended(&(_lock)->dep_map, _RET_IP_);        \
                lock(_lock);                                        \
        }                                                        \
        lock_acquired(&(_lock)->dep_map, _RET_IP_);                        \
} while (0)

/*
 * LOCK_CONTENDED_RETURN - like LOCK_CONTENDED(), for a lock() that can fail.
 *
 * Evaluates to 0 when try(_lock) succeeds, otherwise to the value returned
 * by lock(_lock). lock_acquired() is reported only when that value is 0.
 */
#define LOCK_CONTENDED_RETURN(_lock, try, lock)                        \
({                                                                \
        int ____err = 0;                                        \
        if (!try(_lock)) {                                        \
                lock_contended(&(_lock)->dep_map, _RET_IP_);        \
                ____err = lock(_lock);                                \
        }                                                        \
        if (!____err)                                                \
                lock_acquired(&(_lock)->dep_map, _RET_IP_);        \
        ____err;                                                \
})

#else /* CONFIG_LOCK_STAT */

#define lock_contended(lockdep_map, ip) do {} while (0)
#define lock_acquired(lockdep_map, ip) do {} while (0)

#define LOCK_CONTENDED(_lock, try, lock) \
        lock(_lock)

#define LOCK_CONTENDED_RETURN(_lock, try, lock) \
        lock(_lock)

#endif /* CONFIG_LOCK_STAT */

#ifdef CONFIG_LOCKDEP

/*
 * With lockdep we don't want the hand-coded irq-enable of the
 * _raw_*_lock_flags() code, because lockdep assumes
 * that interrupts are not re-enabled during lock-acquire:
 */
#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
        LOCK_CONTENDED((_lock), (try), (lock))

#else /* CONFIG_LOCKDEP */

#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
        lockfl((_lock), (flags))

#endif /* CONFIG_LOCKDEP */

#ifdef CONFIG_PROVE_LOCKING
extern void print_irqtrace_events(struct task_struct *curr);
#else
static inline void print_irqtrace_events(struct task_struct *curr)
{
}
#endif

/* Variable used to make lockdep treat read_lock() as recursive in selftests */
#ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS
extern unsigned int force_read_lock_recursive;
#else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */
#define force_read_lock_recursive 0
#endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */

#ifdef CONFIG_LOCKDEP
extern bool read_lock_is_recursive(void);
#else /* CONFIG_LOCKDEP */
/* If !LOCKDEP, the value is meaningless */
#define read_lock_is_recursive() 0
#endif

/*
 * For trivial one-depth nesting of a lock-class, the following
 * global define can be used. (Subsystems with multiple levels
 * of nesting should define their own lock-nesting subclasses.)
 */
#define SINGLE_DEPTH_NESTING                        1

/*
 * Map the dependency ops to NOP or to real lockdep ops, depending
 * on the per lock-class debug mode:
 */

/*
 * Wrappers around lock_acquire() that fix "check" to 1 (full validation)
 * and "read" to 0 (exclusive), 1 (read, no recursion) or 2 (read with
 * same-instance recursion) respectively.
 */
#define lock_acquire_exclusive(l, s, t, n, i)                lock_acquire(l, s, t, 0, 1, n, i)
#define lock_acquire_shared(l, s, t, n, i)                lock_acquire(l, s, t, 1, 1, n, i)
#define lock_acquire_shared_recursive(l, s, t, n, i)        lock_acquire(l, s, t, 2, 1, n, i)

#define spin_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define spin_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define spin_release(l, i)                        lock_release(l, i)

#define rwlock_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
/*
 * Annotate a read acquisition of an rwlock. The acquisition is recorded as
 * a recursive read when read_lock_is_recursive() is true, and as a plain
 * (non-recursive) read otherwise. No nest lock is passed.
 */
#define rwlock_acquire_read(l, s, t, i)                                        \
do {                                                                        \
        if (read_lock_is_recursive())                                        \
                lock_acquire_shared_recursive(l, s, t, NULL, i);        \
        else                                                                \
                lock_acquire_shared(l, s, t, NULL, i);                        \
} while (0)

#define rwlock_release(l, i)                        lock_release(l, i)

#define seqcount_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define seqcount_acquire_read(l, s, t, i)        lock_acquire_shared_recursive(l, s, t, NULL, i)
#define seqcount_release(l, i)                        lock_release(l, i)

#define mutex_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define mutex_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define mutex_release(l, i)                        lock_release(l, i)

#define rwsem_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define rwsem_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define rwsem_acquire_read(l, s, t, i)                lock_acquire_shared(l, s, t, NULL, i)
#define rwsem_release(l, i)                        lock_release(l, i)

#define lock_map_acquire(l)                        lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_read(l)                lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_tryread(l)                lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_release(l)                        lock_release(l, _THIS_IP_)

#ifdef CONFIG_PROVE_LOCKING
/*
 * might_lock*() - annotate that the caller may take @lock.
 *
 * Each variant type-checks that (lock)->dep_map is a struct lockdep_map and
 * then issues a lock_acquire() immediately followed by a lock_release() on
 * that map, with check == 1 and trylock == 0:
 *
 *   might_lock():        subclass 0, read == 0
 *   might_lock_read():   subclass 0, read == 1
 *   might_lock_nested(): caller-supplied subclass, read == 1
 *
 * NOTE(review): might_lock_nested() passes read == 1 like the _read variant
 * rather than read == 0 like might_lock() -- confirm this is intended.
 */
# define might_lock(lock)                                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_);        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)
# define might_lock_read(lock)                                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_);        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)
# define might_lock_nested(lock, subclass)                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL,                \
                     _THIS_IP_);                                        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)

DECLARE_PER_CPU(int, hardirqs_enabled);
DECLARE_PER_CPU(int, hardirq_context);
DECLARE_PER_CPU(unsigned int, lockdep_recursion);

/*
 * True when debug_locks is set and this CPU's lockdep_recursion counter
 * reads zero. All assertions below are gated on it.
 */
#define __lockdep_enabled        (debug_locks && !this_cpu_read(lockdep_recursion))

/* WARN_ON_ONCE() if this CPU's hardirqs_enabled reads zero. */
#define lockdep_assert_irqs_enabled()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \
} while (0)

/* WARN_ON_ONCE() if this CPU's hardirqs_enabled reads non-zero. */
#define lockdep_assert_irqs_disabled()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \
} while (0)

/* WARN_ON_ONCE() if this CPU's hardirq_context reads zero. */
#define lockdep_assert_in_irq()                                                \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \
} while (0)

/*
 * Only active with CONFIG_PREEMPT_COUNT. WARN_ON_ONCE() if preempt_count()
 * is non-zero or hardirqs_enabled reads zero.
 */
#define lockdep_assert_preemption_enabled()                                \
do {                                                                        \
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)        &&                \
                     __lockdep_enabled                        &&                \
                     (preempt_count() != 0                ||                \
                      !this_cpu_read(hardirqs_enabled)));                \
} while (0)

/*
 * Only active with CONFIG_PREEMPT_COUNT. WARN_ON_ONCE() if preempt_count()
 * is zero while hardirqs_enabled reads non-zero.
 */
#define lockdep_assert_preemption_disabled()                                \
do {                                                                        \
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)        &&                \
                     __lockdep_enabled                        &&                \
                     (preempt_count() == 0                &&                \
                      this_cpu_read(hardirqs_enabled)));                \
} while (0)

#else
# define might_lock(lock) do { } while (0)
# define might_lock_read(lock) do { } while (0)
# define might_lock_nested(lock, subclass) do { } while (0)

# define lockdep_assert_irqs_enabled() do { } while (0)
# define lockdep_assert_irqs_disabled() do { } while (0)
# define lockdep_assert_in_irq() do { } while (0)

# define lockdep_assert_preemption_enabled() do { } while (0)
# define lockdep_assert_preemption_disabled() do { } while (0)
#endif

#ifdef CONFIG_PROVE_RAW_LOCK_NESTING

/*
 * WARN_ONCE() when all of the following hold: debug_locks is set,
 * current->lockdep_recursion is zero, lockdep_hardirq_context() is true,
 * and neither current->hardirq_threaded nor current->irq_config is set.
 */
# define lockdep_assert_RT_in_threaded_ctx() do {                        \
                WARN_ONCE(debug_locks && !current->lockdep_recursion &&        \
                          lockdep_hardirq_context() &&                        \
                          !(current->hardirq_threaded || current->irq_config),        \
                          "Not in threaded context on PREEMPT_RT as expected\n");        \
} while (0)

#else

# define lockdep_assert_RT_in_threaded_ctx() do { } while (0)

#endif

#ifdef CONFIG_LOCKDEP
void lockdep_rcu_suspicious(const char *file, const int line, const char *s);
#else
static inline void
lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
}
#endif

#endif /* __LINUX_LOCKDEP_H */

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BSEARCH_H
#define _LINUX_BSEARCH_H

#include <linux/types.h>

/*
 * __inline_bsearch - binary search over an array of @num elements
 * @key:  object handed to @cmp as its first argument
 * @base: start of the array
 * @num:  number of elements in the array
 * @size: size of one element in bytes
 * @cmp:  comparison callback, called as cmp(key, element)
 *
 * Returns a pointer to an element for which @cmp returns 0, or NULL if the
 * search window shrinks to nothing first. The halving logic only makes
 * sense if the array is ordered consistently with @cmp.
 */
static __always_inline
void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        for (; num > 0; num >>= 1) {
                const char *mid = base + (num >> 1) * size;
                int order = cmp(key, mid);

                if (order == 0)
                        return (void *)mid;

                /* key sorts below mid: keep base, just halve the window */
                if (order < 0)
                        continue;

                /* key sorts above mid: restart right after mid, dropping it */
                base = mid + size;
                num--;
        }

        return NULL;
}

extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);

#endif /* _LINUX_BSEARCH_H */




























    1 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM pagemap

#if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGEMAP_H

#include <linux/tracepoint.h>
#include <linux/mm.h>

#define        PAGEMAP_MAPPED                0x0001u
#define PAGEMAP_ANONYMOUS        0x0002u
#define PAGEMAP_FILE                0x0004u
#define PAGEMAP_SWAPCACHE        0x0008u
#define PAGEMAP_SWAPBACKED        0x0010u
#define PAGEMAP_MAPPEDDISK        0x0020u
#define PAGEMAP_BUFFERS                0x0040u

/*
 * Build the PAGEMAP_* bitmask recorded for @page. Exactly one of
 * PAGEMAP_ANONYMOUS / PAGEMAP_FILE is set depending on PageAnon(); every
 * other bit is set only when the corresponding page predicate is true.
 * @page is evaluated several times.
 */
#define trace_pagemap_flags(page) ( \
        (PageAnon(page)                ? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
        (page_mapped(page)        ? PAGEMAP_MAPPED     : 0) | \
        (PageSwapCache(page)        ? PAGEMAP_SWAPCACHE  : 0) | \
        (PageSwapBacked(page)        ? PAGEMAP_SWAPBACKED : 0) | \
        (PageMappedToDisk(page)        ? PAGEMAP_MAPPEDDISK : 0) | \
        (page_has_private(page) ? PAGEMAP_BUFFERS    : 0) \
        )

/*
 * mm_lru_insertion - trace event taking a page and an lru index.
 *
 * Records the page pointer, its pfn (page_to_pfn()), the lru value and the
 * trace_pagemap_flags() bitmask. The flags are printed as six
 * one-character fields, one per PAGEMAP_* bit (ANONYMOUS/FILE share one).
 */
TRACE_EVENT(mm_lru_insertion,

        TP_PROTO(
                struct page *page,
                int lru
        ),

        TP_ARGS(page, lru),

        TP_STRUCT__entry(
                __field(struct page *,        page        )
                __field(unsigned long,        pfn        )
                __field(int,                lru        )
                __field(unsigned long,        flags        )
        ),

        TP_fast_assign(
                __entry->page        = page;
                __entry->pfn        = page_to_pfn(page);
                __entry->lru        = lru;
                __entry->flags        = trace_pagemap_flags(page);
        ),

        /* Flag format is based on page-types.c formatting for pagemap */
        TP_printk("page=%p pfn=%lu lru=%d flags=%s%s%s%s%s%s",
                        __entry->page,
                        __entry->pfn,
                        __entry->lru,
                        __entry->flags & PAGEMAP_MAPPED                ? "M" : " ",
                        __entry->flags & PAGEMAP_ANONYMOUS        ? "a" : "f",
                        __entry->flags & PAGEMAP_SWAPCACHE        ? "s" : " ",
                        __entry->flags & PAGEMAP_SWAPBACKED        ? "b" : " ",
                        __entry->flags & PAGEMAP_MAPPEDDISK        ? "d" : " ",
                        __entry->flags & PAGEMAP_BUFFERS        ? "B" : " ")
);

/*
 * mm_lru_activate - trace event taking a single page.
 *
 * Records and prints only the page pointer and its pfn (page_to_pfn()).
 */
TRACE_EVENT(mm_lru_activate,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(struct page *,        page        )
                __field(unsigned long,        pfn        )
        ),

        TP_fast_assign(
                __entry->page        = page;
                __entry->pfn        = page_to_pfn(page);
        ),

        /* No flags are recorded for this event; only page and pfn are printed */
        TP_printk("page=%p pfn=%lu", __entry->page, __entry->pfn)

);

#endif /* _TRACE_PAGEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>














































    1 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_64_H
#define _ASM_X86_UACCESS_64_H

/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/lockdep.h>
#include <linux/kasan-checks.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>

/*
 * Copy To/From Userspace
 */

/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_unrolled(void *to, const void *from, unsigned len);

/*
 * copy_user_generic - copy @len bytes from @from to @to through one of the
 * three out-of-line copy routines, chosen by alternative_call_2() based on
 * CPU features.
 *
 * Returns the value the selected routine leaves in the "a" output register,
 * widened from unsigned to unsigned long (presumably the number of bytes
 * that could not be copied -- confirm against the routines themselves).
 * As noted at the prototypes, those routines handle exceptions on both
 * pointers but perform no access_ok() check.
 */
static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned len)
{
        unsigned ret;

        /*
         * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
         * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
         * Otherwise, use copy_user_generic_unrolled.
         */
        /*
         * @to, @from and @len are both inputs and outputs (tied via the
         * "1"/"2"/"3" constraints), so the compiler treats them as modified
         * by the call; memory, rcx and r8-r11 are declared clobbered.
         */
        alternative_call_2(copy_user_generic_unrolled,
                         copy_user_generic_string,
                         X86_FEATURE_REP_GOOD,
                         copy_user_enhanced_fast_string,
                         X86_FEATURE_ERMS,
                         ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
                                     "=d" (len)),
                         "1" (to), "2" (from), "3" (len)
                         : "memory", "rcx", "r8", "r9", "r10", "r11");
        return ret;
}

/*
 * raw_copy_from_user - copy @size bytes from user pointer @src to @dst.
 *
 * Strips the __user annotation from @src and hands the copy to
 * copy_user_generic(), whose result is returned unchanged. No checks are
 * performed here.
 */
static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
        const void *from = (__force void *)src;

        return copy_user_generic(dst, from, size);
}

/*
 * raw_copy_to_user - copy @size bytes from @src to user pointer @dst.
 *
 * Strips the __user annotation from @dst and hands the copy to
 * copy_user_generic(), whose result is returned unchanged. No checks are
 * performed here.
 */
static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
        void *to = (__force void *)dst;

        return copy_user_generic(to, src, size);
}

/*
 * raw_copy_in_user - copy @size bytes between two user pointers.
 *
 * Both @dst and @src have their __user annotation stripped before being
 * passed to copy_user_generic(); its result is returned unchanged.
 */
static __always_inline __must_check
unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
{
        void *to = (__force void *)dst;
        const void *from = (__force void *)src;

        return copy_user_generic(to, from, size);
}

extern long __copy_user_nocache(void *dst, const void __user *src,
                                unsigned size, int zerorest);

extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
                           size_t len);

/*
 * Copy @size bytes from user pointer @src to @dst via __copy_user_nocache()
 * with its zerorest argument set to 0. The destination range is first
 * reported to KASAN as a write. The helper's long result is returned as int.
 */
static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
                                  unsigned size)
{
        long rc;

        kasan_check_write(dst, size);
        rc = __copy_user_nocache(dst, src, size, 0);

        return rc;
}

/*
 * Copy @size bytes from user pointer @src to @dst via
 * __copy_user_flushcache(). The destination range is first reported to
 * KASAN as a write. The helper's long result is returned as int.
 */
static inline int
__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
{
        long rc;

        kasan_check_write(dst, size);
        rc = __copy_user_flushcache(dst, src, size);

        return rc;
}
#endif /* _ASM_X86_UACCESS_64_H */















































































































































































































    1 



























    1 





































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BITOPS_H
#define _ASM_X86_BITOPS_H

/*
 * Copyright 1992, Linus Torvalds.
 *
 * Note: inlines with more than a single statement should be marked
 * __always_inline to avoid problems with older gcc's inlining heuristics.
 */

#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
#endif

#include <linux/compiler.h>
#include <asm/alternative.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

#if BITS_PER_LONG == 32
# define _BITOPS_LONG_SHIFT 5
#elif BITS_PER_LONG == 64
# define _BITOPS_LONG_SHIFT 6
#else
# error "Unexpected BITS_PER_LONG"
#endif

#define BIT_64(n)                        (U64_C(1) << (n))

/*
 * These have to be done with inline assembly: that way the bit-setting
 * is guaranteed to be atomic. All bit operations return 0 if the bit
 * was cleared before the operation and != 0 if it was not.
 *
 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
 */

#define RLONG_ADDR(x)                         "m" (*(volatile long *) (x))
#define WBYTE_ADDR(x)                        "+m" (*(volatile char *) (x))

#define ADDR                                RLONG_ADDR(addr)

/*
 * We do the locked ops that don't return the old value as
 * a mask operation on a byte.
 */
#define CONST_MASK_ADDR(nr, addr)        WBYTE_ADDR((void *)(addr) + ((nr)>>3))
#define CONST_MASK(nr)                        (1 << ((nr) & 7))

/**
 * arch_set_bit - set a bit using a LOCK_PREFIX-ed instruction
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * For a compile-time-constant @nr a byte-wide "orb" is issued on the byte
 * holding the bit (addr + (nr >> 3)) with mask 1 << (nr & 7). Otherwise
 * "bts" is used with @nr as an immediate or register operand.
 */
static __always_inline void
arch_set_bit(long nr, volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr)) {
                asm volatile(LOCK_PREFIX "orb %b1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" (CONST_MASK(nr))
                        : "memory");
        } else {
                asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
                        : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
        }
}

/**
 * arch___set_bit - set a bit with a plain "bts"
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Same operation as arch_set_bit() but issued without LOCK_PREFIX.
 */
static __always_inline void
arch___set_bit(long nr, volatile unsigned long *addr)
{
        asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}

/**
 * arch_clear_bit - clear a bit using a LOCK_PREFIX-ed instruction
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * For a compile-time-constant @nr a byte-wide "andb" is issued on the byte
 * holding the bit with the inverted mask ~(1 << (nr & 7)). Otherwise "btr"
 * is used with @nr as an immediate or register operand.
 *
 * NOTE(review): unlike arch_set_bit(), the constant-@nr path carries no
 * "memory" clobber -- confirm this is intentional.
 */
static __always_inline void
arch_clear_bit(long nr, volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr)) {
                asm volatile(LOCK_PREFIX "andb %b1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" (~CONST_MASK(nr)));
        } else {
                asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
                        : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
        }
}

/**
 * arch_clear_bit_unlock - clear a bit, preceded by a barrier()
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Issues barrier() and then arch_clear_bit(); the order of the two
 * statements matters.
 */
static __always_inline void
arch_clear_bit_unlock(long nr, volatile unsigned long *addr)
{
        barrier();
        arch_clear_bit(nr, addr);
}

/**
 * arch___clear_bit - clear a bit with a plain "btr"
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Same operation as the variable-@nr path of arch_clear_bit() but issued
 * without LOCK_PREFIX.
 */
static __always_inline void
arch___clear_bit(long nr, volatile unsigned long *addr)
{
        asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}

/**
 * arch_clear_bit_unlock_is_negative_byte - clear a bit in the first byte
 * and report the resulting sign condition
 * @nr: bit number; the mask ~(1 << nr) is truncated to a char
 * @addr: address whose first byte is operated on
 *
 * Issues a LOCK_PREFIX-ed "andb" on the byte at @addr and returns the
 * "s" condition produced by it.
 */
static __always_inline bool
arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
{
        bool negative;
        asm volatile(LOCK_PREFIX "andb %2,%1"
                CC_SET(s)
                : CC_OUT(s) (negative), WBYTE_ADDR(addr)
                : "ir" ((char) ~(1 << nr)) : "memory");
        return negative;
}
/* Self-referential define so that #ifdef can detect this implementation. */
#define arch_clear_bit_unlock_is_negative_byte                                 \
        arch_clear_bit_unlock_is_negative_byte

/**
 * arch___clear_bit_unlock - unlock-flavoured name for arch___clear_bit()
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Simply calls arch___clear_bit(); no barrier() is added here.
 */
static __always_inline void
arch___clear_bit_unlock(long nr, volatile unsigned long *addr)
{
        arch___clear_bit(nr, addr);
}

/**
 * arch___change_bit - toggle a bit with a plain "btc"
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Issued without LOCK_PREFIX.
 */
static __always_inline void
arch___change_bit(long nr, volatile unsigned long *addr)
{
        asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}

/**
 * arch_change_bit - toggle a bit using a LOCK_PREFIX-ed instruction
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * For a compile-time-constant @nr a byte-wide "xorb" is issued on the byte
 * holding the bit with mask 1 << (nr & 7). Otherwise "btc" is used with
 * @nr as an immediate or register operand.
 *
 * NOTE(review): as in arch_clear_bit(), the constant-@nr path carries no
 * "memory" clobber -- confirm this is intentional.
 */
static __always_inline void
arch_change_bit(long nr, volatile unsigned long *addr)
{
        if (__builtin_constant_p(nr)) {
                asm volatile(LOCK_PREFIX "xorb %b1,%0"
                        : CONST_MASK_ADDR(nr, addr)
                        : "iq" (CONST_MASK(nr)));
        } else {
                asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
                        : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
        }
}

/**
 * arch_test_and_set_bit - set a bit and return its previous state
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Expands GEN_BINARY_RMWcc() with a LOCK_PREFIX-ed "bts" on *@addr and
 * condition code "c". Per the comment at the top of this header, the
 * result is non-zero if the bit was already set.
 */
static __always_inline bool
arch_test_and_set_bit(long nr, volatile unsigned long *addr)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr);
}

/**
 * arch_test_and_set_bit_lock - lock-flavoured name for arch_test_and_set_bit()
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Forwards to arch_test_and_set_bit() and returns its result unchanged.
 */
static __always_inline bool
arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr)
{
        const bool was_set = arch_test_and_set_bit(nr, addr);

        return was_set;
}

/**
 * arch___test_and_set_bit - set a bit and return its previous state
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Uses "bts" without LOCK_PREFIX and returns the "c" condition it leaves.
 * Note the asm statement is not marked volatile, unlike its clear/change
 * counterparts.
 */
static __always_inline bool
arch___test_and_set_bit(long nr, volatile unsigned long *addr)
{
        bool oldbit;

        asm(__ASM_SIZE(bts) " %2,%1"
            CC_SET(c)
            : CC_OUT(c) (oldbit)
            : ADDR, "Ir" (nr) : "memory");
        return oldbit;
}

/**
 * arch_test_and_clear_bit - clear a bit and return its previous state
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Expands GEN_BINARY_RMWcc() with a LOCK_PREFIX-ed "btr" on *@addr and
 * condition code "c". Per the comment at the top of this header, the
 * result is non-zero if the bit was set before the operation.
 */
static __always_inline bool
arch_test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr);
}

/*
 * Note: the operation is performed atomically with respect to
 * the local CPU, but not other CPUs. Portable code should not
 * rely on this behaviour.
 * KVM relies on this behaviour on x86 for modifying memory that is also
 * accessed from a hypervisor on the same CPU if running in a VM: don't change
 * this without also updating arch/x86/kernel/kvm.c
 */
/*
 * arch___test_and_clear_bit: clear bit @nr at @addr with a plain "btr"
 * (no LOCK_PREFIX) and return the "c" condition it leaves, i.e. the
 * previous state of the bit.
 */
static __always_inline bool
arch___test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        bool oldbit;

        asm volatile(__ASM_SIZE(btr) " %2,%1"
                     CC_SET(c)
                     : CC_OUT(c) (oldbit)
                     : ADDR, "Ir" (nr) : "memory");
        return oldbit;
}

/**
 * arch___test_and_change_bit - toggle a bit and return its previous state
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Uses "btc" without LOCK_PREFIX and returns the "c" condition it leaves.
 */
static __always_inline bool
arch___test_and_change_bit(long nr, volatile unsigned long *addr)
{
        bool oldbit;

        asm volatile(__ASM_SIZE(btc) " %2,%1"
                     CC_SET(c)
                     : CC_OUT(c) (oldbit)
                     : ADDR, "Ir" (nr) : "memory");

        return oldbit;
}

/**
 * arch_test_and_change_bit - toggle a bit and return its previous state
 * @nr: bit number
 * @addr: base address of the bitmap
 *
 * Expands GEN_BINARY_RMWcc() with a LOCK_PREFIX-ed "btc" on *@addr and
 * condition code "c". Per the comment at the top of this header, the
 * result is non-zero if the bit was set before the operation.
 */
static __always_inline bool
arch_test_and_change_bit(long nr, volatile unsigned long *addr)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr);
}

/*
 * constant_test_bit: test bit @nr of the bitmap at @addr in plain C.
 *
 * Picks the long that holds the bit (index nr >> _BITOPS_LONG_SHIFT), masks
 * it with the bit's position inside that long, and returns true if the bit
 * is set. This is the variant arch_test_bit() selects for constant @nr.
 */
static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
{
        const unsigned long mask = 1UL << (nr & (BITS_PER_LONG-1));
        const unsigned long word = addr[nr >> _BITOPS_LONG_SHIFT];

        return (mask & word) != 0;
}

/*
 * variable_test_bit: test bit @nr of the bitmap at @addr with the "bt"
 * instruction and return the "c" condition it leaves. This is the variant
 * arch_test_bit() selects when @nr is not a compile-time constant.
 *
 * The memory operand is formed by casting @addr to a plain
 * unsigned long pointer, dropping its const/volatile qualifiers.
 */
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
        bool oldbit;

        asm volatile(__ASM_SIZE(bt) " %2,%1"
                     CC_SET(c)
                     : CC_OUT(c) (oldbit)
                     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");

        return oldbit;
}

/*
 * arch_test_bit: test bit @nr at @addr, dispatching on whether @nr is a
 * compile-time constant: constant_test_bit() if so, variable_test_bit()
 * otherwise.
 */
#define arch_test_bit(nr, addr)                        \
        (__builtin_constant_p((nr))                \
         ? constant_test_bit((nr), (addr))        \
         : variable_test_bit((nr), (addr)))

/**
 * __ffs - find first set bit in word
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __always_inline unsigned long __ffs(unsigned long word)
{
        /*
         * @word is both the input operand and the output register, so the
         * bit index overwrites the value that was searched.
         * NOTE(review): the "rep" prefix presumably makes this decode as
         * TZCNT on CPUs that implement it - confirm against the ISA manual.
         */
        asm("rep; bsf %1,%0"
                : "=r" (word)
                : "rm" (word));
        return word;
}

/**
 * ffz - find first zero bit in word
 * @word: The word to search
 *
 * Undefined if no zero exists, so code should check against ~0UL first.
 */
static __always_inline unsigned long ffz(unsigned long word)
{
        /*
         * Same instruction as __ffs(), but fed the complement of @word, so
         * the first set bit of ~word is the first zero bit of @word.
         */
        asm("rep; bsf %1,%0"
                : "=r" (word)
                : "r" (~word));
        return word;
}

/**
 * __fls - find last set bit in word
 * @word: The word to search
 *
 * Undefined if no set bit exists, so code should check against 0 first.
 */
static __always_inline unsigned long __fls(unsigned long word)
{
        /*
         * BSR scans for the most significant set bit; @word is reused as
         * the output register, so the bit index replaces the input value.
         */
        asm("bsr %1,%0"
            : "=r" (word)
            : "rm" (word));
        return word;
}

#undef ADDR

#ifdef __KERNEL__
/**
 * ffs - find first set bit in word
 * @x: the word to search
 *
 * This is defined the same way as the libc and compiler builtin ffs
 * routines, therefore differs in spirit from the other bitops.
 *
 * ffs(value) returns 0 if value is 0 or the position of the first
 * set bit if value is nonzero. The first (least significant) bit
 * is at position 1.
 */
static __always_inline int ffs(int x)
{
        int r;

#ifdef CONFIG_X86_64
        /*
         * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before, except that the
         * top 32 bits will be cleared.
         *
         * We cannot do this on 32 bits because at the very least some
         * 486 CPUs did not behave this way.
         */
        /* The "0" (-1) input preloads r with -1, so x == 0 leaves r == -1. */
        asm("bsfl %1,%0"
            : "=r" (r)
            : "rm" (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
        /* cmovz substitutes -1 (operand %2) for the result when x == 0. */
        asm("bsfl %1,%0\n\t"
            "cmovzl %2,%0"
            : "=&r" (r) : "rm" (x), "r" (-1));
#else
        /* No CMOV: branch over the "movl $-1" unless the source was zero. */
        asm("bsfl %1,%0\n\t"
            "jnz 1f\n\t"
            "movl $-1,%0\n"
            "1:" : "=r" (r) : "rm" (x));
#endif
        /* Convert the 0-based bit index (or -1 for x == 0) to 1-based / 0. */
        return r + 1;
}

/**
 * fls - find last set bit in word
 * @x: the word to search
 *
 * This is defined in a similar way as the libc and compiler builtin
 * ffs, but returns the position of the most significant set bit.
 *
 * fls(value) returns 0 if value is 0 or the position of the last
 * set bit if value is nonzero. The last (most significant) bit is
 * at position 32.
 */
static __always_inline int fls(unsigned int x)
{
        int r;

#ifdef CONFIG_X86_64
        /*
         * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before, except that the
         * top 32 bits will be cleared.
         *
         * We cannot do this on 32 bits because at the very least some
         * 486 CPUs did not behave this way.
         */
        /* The "0" (-1) input preloads r with -1, so x == 0 leaves r == -1. */
        asm("bsrl %1,%0"
            : "=r" (r)
            : "rm" (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
        /* cmovz substitutes -1 (operand %2) for the result when x == 0. */
        asm("bsrl %1,%0\n\t"
            "cmovzl %2,%0"
            : "=&r" (r) : "rm" (x), "rm" (-1));
#else
        /* No CMOV: branch over the "movl $-1" unless the source was zero. */
        asm("bsrl %1,%0\n\t"
            "jnz 1f\n\t"
            "movl $-1,%0\n"
            "1:" : "=r" (r) : "rm" (x));
#endif
        /* Convert the 0-based bit index (or -1 for x == 0) to 1-based / 0. */
        return r + 1;
}

/**
 * fls64 - find last set bit in a 64-bit word
 * @x: the word to search
 *
 * This is defined in a similar way as the libc and compiler builtin
 * ffsll, but returns the position of the most significant set bit.
 *
 * fls64(value) returns 0 if value is 0 or the position of the last
 * set bit if value is nonzero. The last (most significant) bit is
 * at position 64.
 */
#ifdef CONFIG_X86_64
static __always_inline int fls64(__u64 x)
{
        /* Preset to -1 so that an untouched destination (x == 0) returns 0. */
        int bitpos = -1;
        /*
         * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before.
         */
        /* "+r" makes bitpos read-write; %q0 names its 64-bit register form. */
        asm("bsrq %1,%q0"
            : "+r" (bitpos)
            : "rm" (x));
        return bitpos + 1;
}
#else
#include <asm-generic/bitops/fls64.h>
#endif

#include <asm-generic/bitops/find.h>

#include <asm-generic/bitops/sched.h>

#include <asm/arch_hweight.h>

#include <asm-generic/bitops/const_hweight.h>

#include <asm-generic/bitops/instrumented-atomic.h>
#include <asm-generic/bitops/instrumented-non-atomic.h>
#include <asm-generic/bitops/instrumented-lock.h>

#include <asm-generic/bitops/le.h>

#include <asm-generic/bitops/ext2-atomic-setbit.h>

#endif /* __KERNEL__ */
#endif /* _ASM_X86_BITOPS_H */








































































    1 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  

  linux/include/linux/rbtree.h

  To use rbtrees you'll have to implement your own insert and search cores.
  This avoids the need for callbacks, which would dramatically reduce
  performance. It is not the cleanest approach, but in C (unlike C++) it is
  the way to get both performance and genericity.

  See Documentation/core-api/rbtree.rst for documentation and samples.
*/

#ifndef        _LINUX_RBTREE_H
#define        _LINUX_RBTREE_H

#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/rcupdate.h>

/*
 * Node embedded in the user's own structure (see rb_entry()).
 * __rb_parent_color packs the parent pointer together with two low flag
 * bits; rb_parent() masks those bits off to recover the pointer.
 */
struct rb_node {
        unsigned long  __rb_parent_color;
        struct rb_node *rb_right;
        struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
    /* The alignment might seem pointless, but allegedly CRIS needs it */

/* Tree handle: a single pointer to the top node, NULL when empty (RB_ROOT). */
struct rb_root {
        struct rb_node *rb_node;
};

#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))

#define RB_ROOT        (struct rb_root) { NULL, }
#define        rb_entry(ptr, type, member) container_of(ptr, type, member)

#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)

/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
#define RB_EMPTY_NODE(node)  \
        ((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node)  \
        ((node)->__rb_parent_color = (unsigned long)(node))


/*
 * Core insert/erase entry points, implemented outside this header.
 * NOTE(review): rb_insert_color() is presumably called after rb_link_node()
 * to recolour/rebalance - confirm against the implementation. rb_erase()
 * may rebalance the tree.
 */
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);


/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);

/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
                            struct rb_root *root);
/* NOTE(review): presumably the RCU-safe publishing variant of the above - confirm. */
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
                                struct rb_root *root);

/*
 * Attach @node as a leaf below @parent by storing it into the child slot
 * @rb_link. Only the linkage is set up here; no colour bits are OR-ed into
 * __rb_parent_color.
 */
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
                                struct rb_node **rb_link)
{
        /* A freshly linked node has no children. */
        node->rb_right = NULL;
        node->rb_left = NULL;
        node->__rb_parent_color = (unsigned long)parent;

        /* Hook the node into the slot chosen by the caller. */
        *rb_link = node;
}

/*
 * Same as rb_link_node(), except that the final store into @rb_link goes
 * through rcu_assign_pointer(); the node is fully initialised before that
 * store.
 */
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
                                    struct rb_node **rb_link)
{
        /* Initialise every field first ... */
        node->rb_right = NULL;
        node->rb_left = NULL;
        node->__rb_parent_color = (unsigned long)parent;

        /* ... and only then publish the node. */
        rcu_assign_pointer(*rb_link, node);
}

/*
 * NULL-tolerant rb_entry(): evaluates @ptr once into a temporary and yields
 * NULL when it is NULL, otherwise the enclosing structure.
 */
#define rb_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? rb_entry(____ptr, type, member) : NULL; \
        })

/**
 * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
 * given type allowing the backing memory of @pos to be invalidated
 *
 * @pos:        the 'type *' to use as a loop cursor.
 * @n:          another 'type *' to use as temporary storage
 * @root:       'rb_root *' of the rbtree.
 * @field:      the name of the rb_node field within 'type'.
 *
 * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
 * list_for_each_entry_safe() and allows the iteration to continue independent
 * of changes to @pos by the body of the loop.
 *
 * The successor @n is computed in the loop condition, i.e. before the body
 * runs for @pos, and the loop ends once @pos becomes NULL.
 *
 * Note, however, that it cannot handle other modifications that re-order the
 * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
 * rb_erase() may rebalance the tree, causing us to miss some nodes.
 */
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
        for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
             pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
                        typeof(*pos), field); 1; }); \
             pos = n)

/*
 * Leftmost-cached rbtrees.
 *
 * We do not cache the rightmost node based on footprint
 * size vs number of potential users that could benefit
 * from O(1) rb_last(). Just not worth it, users that want
 * this feature can always implement the logic explicitly.
 * Furthermore, users that want to cache both pointers may
 * find it a bit asymmetric, but that's ok.
 */
/*
 * An rb_root plus a cached pointer to the leftmost node; the *_cached
 * helpers below keep rb_leftmost up to date.
 */
struct rb_root_cached {
        struct rb_root rb_root;
        struct rb_node *rb_leftmost;
};

/* Compound literal for an empty cached tree: no top node, no leftmost node. */
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }

/* Same as rb_first(), but O(1) */
/* Expands to the rb_leftmost member itself; @root is a struct rb_root_cached *. */
#define rb_first_cached(root) (root)->rb_leftmost

/*
 * Cached-tree counterpart of rb_insert_color(). When the caller reports via
 * @leftmost that @node is the new leftmost node, the cache is updated before
 * the insertion is handed to rb_insert_color().
 */
static inline void rb_insert_color_cached(struct rb_node *node,
                                          struct rb_root_cached *root,
                                          bool leftmost)
{
        struct rb_root *plain_root = &root->rb_root;

        if (leftmost) {
                /* The caller, not this helper, decides who is leftmost. */
                root->rb_leftmost = node;
        }

        rb_insert_color(node, plain_root);
}

/*
 * Cached-tree counterpart of rb_erase(). If @node is the cached leftmost
 * node, the cache moves on to rb_next(@node); this happens before the node
 * is passed to rb_erase().
 */
static inline void rb_erase_cached(struct rb_node *node,
                                   struct rb_root_cached *root)
{
        struct rb_root *plain_root = &root->rb_root;

        if (node == root->rb_leftmost) {
                /* Successor is looked up before rb_erase() runs. */
                root->rb_leftmost = rb_next(node);
        }

        rb_erase(node, plain_root);
}

/*
 * Cached-tree counterpart of rb_replace_node(). When @victim is the cached
 * leftmost node, @new takes over the cache slot before the replacement is
 * carried out.
 */
static inline void rb_replace_node_cached(struct rb_node *victim,
                                          struct rb_node *new,
                                          struct rb_root_cached *root)
{
        struct rb_root *plain_root = &root->rb_root;

        if (victim == root->rb_leftmost) {
                /* @new takes over @victim's place in the cache as well. */
                root->rb_leftmost = new;
        }

        rb_replace_node(victim, new, plain_root);
}

#endif        /* _LINUX_RBTREE_H */


























    1 


































    1 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

#include <linux/mmdebug.h>

/*
 * Designated-initializer fragment (note the trailing comma) that statically
 * initializes the .mmap_lock member of the object called @name.
 */
#define MMAP_LOCK_INITIALIZER(name) \
        .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

/* Run-time initialization of @mm's mmap_lock rwsem. */
static inline void mmap_init_lock(struct mm_struct *mm)
{
        init_rwsem(&mm->mmap_lock);
}

/* Take @mm's mmap_lock for writing via down_write(). */
static inline void mmap_write_lock(struct mm_struct *mm)
{
        down_write(&mm->mmap_lock);
}

/*
 * Take @mm's mmap_lock for writing via down_write_nested(), forwarding
 * @subclass unchanged.
 */
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
        down_write_nested(&mm->mmap_lock, subclass);
}

/*
 * Write-lock @mm's mmap_lock with down_write_killable() and hand its return
 * value back to the caller.
 * NOTE(review): presumably 0 on success and a negative errno when
 * interrupted - confirm against down_write_killable().
 */
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
        return down_write_killable(&mm->mmap_lock);
}

/*
 * Try to write-lock @mm's mmap_lock. Returns true when
 * down_write_trylock() reports a non-zero result, false otherwise.
 */
static inline bool mmap_write_trylock(struct mm_struct *mm)
{
        if (down_write_trylock(&mm->mmap_lock))
                return true;

        return false;
}

/* Drop a write lock on @mm's mmap_lock via up_write(). */
static inline void mmap_write_unlock(struct mm_struct *mm)
{
        up_write(&mm->mmap_lock);
}

/* Convert a held write lock on @mm's mmap_lock via downgrade_write(). */
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
        downgrade_write(&mm->mmap_lock);
}

/* Take @mm's mmap_lock for reading via down_read(). */
static inline void mmap_read_lock(struct mm_struct *mm)
{
        down_read(&mm->mmap_lock);
}

/*
 * Read-lock @mm's mmap_lock with down_read_killable() and hand its return
 * value back to the caller.
 * NOTE(review): presumably 0 on success and a negative errno when
 * interrupted - confirm against down_read_killable().
 */
static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
        return down_read_killable(&mm->mmap_lock);
}

/*
 * Try to read-lock @mm's mmap_lock. Returns true when down_read_trylock()
 * reports a non-zero result, false otherwise.
 */
static inline bool mmap_read_trylock(struct mm_struct *mm)
{
        if (down_read_trylock(&mm->mmap_lock))
                return true;

        return false;
}

/* Drop a read lock on @mm's mmap_lock via up_read(). */
static inline void mmap_read_unlock(struct mm_struct *mm)
{
        up_read(&mm->mmap_lock);
}

/*
 * Try to read-lock @mm's mmap_lock. On success the lock's lockdep map is
 * released straight away with rwsem_release(); no up_read() is issued here,
 * so the rwsem itself remains held. Returns true on success, false when the
 * trylock fails.
 */
static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
{
        if (!down_read_trylock(&mm->mmap_lock))
                return false;

        /* Only the lockdep bookkeeping is released at this point. */
        rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
        return true;
}

/*
 * Drop a read lock on @mm's mmap_lock via up_read_non_owner().
 * NOTE(review): presumably the counterpart of
 * mmap_read_trylock_non_owner() - confirm against callers.
 */
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
        up_read_non_owner(&mm->mmap_lock);
}

/*
 * Debug check that @mm's mmap_lock is held: a lockdep assertion followed by
 * a VM_BUG_ON_MM() on rwsem_is_locked().
 */
static inline void mmap_assert_locked(struct mm_struct *mm)
{
        lockdep_assert_held(&mm->mmap_lock);
        VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
}

/*
 * Debug check that @mm's mmap_lock is held for writing. Only the lockdep
 * assertion is write-specific; the VM_BUG_ON_MM() check uses
 * rwsem_is_locked(), the same test as in mmap_assert_locked().
 */
static inline void mmap_assert_write_locked(struct mm_struct *mm)
{
        lockdep_assert_held_write(&mm->mmap_lock);
        VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
}

/* Return rwsem_is_contended() for @mm's mmap_lock, unchanged. */
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
        return rwsem_is_contended(&mm->mmap_lock);
}

#endif /* _LINUX_MMAP_LOCK_H */










































































































































































































































































































































    1 


    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 


























    1 
    1 





































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
/* CPU control.
 * (C) 2001, 2002, 2003, 2004 Rusty Russell
 *
 * This code is licenced under the GPL.
 */
#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
#include <linux/random.h>

#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cpuhp.h>

#include "smpboot.h"

/**
 * struct cpuhp_cpu_state - Per cpu hotplug state storage
 * @state:      The current cpu state
 * @target:     The target state
 * @fail:       State at which cpuhp_invoke_callback() returns -EAGAIN
 *              instead of invoking an installed callback; reset to
 *              CPUHP_INVALID once it has matched
 * @thread:     Pointer to the hotplug thread
 * @should_run: Thread should execute
 * @rollback:   Perform a rollback
 * @single:     Single callback invocation
 * @bringup:    Single callback bringup or teardown selector
 * @node:       NOTE(review): presumably the multi-instance node for a single
 *              entry callback (install/remove) - confirm against users
 * @last:       NOTE(review): presumably records how far a multi-instance
 *              operation got, for rollback - confirm against users
 * @cb_state:   The state for a single callback (install/uninstall)
 * @result:     Result of the operation
 * @done_up:    Signal completion to the issuer of the task for cpu-up
 * @done_down:  Signal completion to the issuer of the task for cpu-down
 */
struct cpuhp_cpu_state {
        enum cpuhp_state        state;
        enum cpuhp_state        target;
        enum cpuhp_state        fail;
#ifdef CONFIG_SMP
        struct task_struct        *thread;
        bool                        should_run;
        bool                        rollback;
        bool                        single;
        bool                        bringup;
        struct hlist_node        *node;
        struct hlist_node        *last;
        enum cpuhp_state        cb_state;
        int                        result;
        struct completion        done_up;
        struct completion        done_down;
#endif
};

/*
 * Per-CPU hotplug state. .fail starts out as CPUHP_INVALID, the value
 * cpuhp_invoke_callback() writes back after a match.
 */
static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
        .fail = CPUHP_INVALID,
};

#ifdef CONFIG_SMP
/*
 * NOTE(review): presumably the set of CPUs that have been brought up at
 * least once - confirm against the users of this mask.
 */
cpumask_t cpus_booted_once_mask;
#endif

#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
/*
 * Two static lockdep maps, one per hotplug direction; cpuhp_lock_acquire()
 * and cpuhp_lock_release() pick between them based on their bringup flag.
 */
static struct lockdep_map cpuhp_state_up_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
static struct lockdep_map cpuhp_state_down_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);


/*
 * Acquire the lockdep map for the requested direction: the "up" map when
 * @bringup is true, the "down" map otherwise.
 */
static inline void cpuhp_lock_acquire(bool bringup)
{
        if (bringup)
                lock_map_acquire(&cpuhp_state_up_map);
        else
                lock_map_acquire(&cpuhp_state_down_map);
}

/*
 * Release the lockdep map for the requested direction: the "up" map when
 * @bringup is true, the "down" map otherwise.
 */
static inline void cpuhp_lock_release(bool bringup)
{
        if (bringup)
                lock_map_release(&cpuhp_state_up_map);
        else
                lock_map_release(&cpuhp_state_down_map);
}
#else

/* No-op stubs used unless both CONFIG_LOCKDEP and CONFIG_SMP are defined. */
static inline void cpuhp_lock_acquire(bool bringup) { }
static inline void cpuhp_lock_release(bool bringup) { }

#endif

/**
 * struct cpuhp_step - Hotplug state machine step
 * @name:       Name of the step
 * @startup:    Startup function of the step; a union of a single-callback
 *              form and a multi form that also takes a struct hlist_node
 * @teardown:   Teardown function of the step; same union layout as @startup
 * @list:       NOTE(review): presumably the list of instances for
 *              multi-instance steps - confirm against users
 * @cant_stop:  Bringup/teardown can't be stopped at this step
 * @multi_instance: NOTE(review): presumably selects the .multi members of
 *              @startup/@teardown - confirm against users
 */
struct cpuhp_step {
        const char                *name;
        union {
                int                (*single)(unsigned int cpu);
                int                (*multi)(unsigned int cpu,
                                         struct hlist_node *node);
        } startup;
        union {
                int                (*single)(unsigned int cpu);
                int                (*multi)(unsigned int cpu,
                                         struct hlist_node *node);
        } teardown;
        struct hlist_head        list;
        bool                        cant_stop;
        bool                        multi_instance;
};

/*
 * NOTE(review): not taken anywhere in this part of the file; presumably
 * serializes changes to the state table below - confirm at the users.
 */
static DEFINE_MUTEX(cpuhp_state_mutex);
/*
 * Tentative definition of the state table, indexed by enum cpuhp_state
 * (see cpuhp_get_step()). The array has to be completed elsewhere in this
 * file.
 */
static struct cpuhp_step cpuhp_hp_states[];

/* Look up the step descriptor for @state in cpuhp_hp_states[]. */
static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
{
        return &cpuhp_hp_states[state];
}

/**
 * cpuhp_invoke_callback - Invoke the callbacks for a given state
 * @cpu:        The cpu for which the callback should be invoked
 * @state:        The state to do callbacks for
 * @bringup:        True if the bringup callback should be invoked
 * @node:        For multi-instance, do a single entry callback for install/remove
 * @lastp:        For multi-instance rollback, remember how far we got
 *
 * Called from cpu hotplug and from the state register machinery.
 *
 * Return: 0 on success or when the step has no callback for the requested
 * direction, -EAGAIN when a failure was injected via the per-cpu ->fail
 * state, otherwise the error returned by the failing callback. When a
 * multi-instance walk fails and @lastp is NULL, the instances which already
 * ran are rolled back here and the error of the failing callback is
 * returned.
 */
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
                                 bool bringup, struct hlist_node *node,
                                 struct hlist_node **lastp)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct cpuhp_step *step = cpuhp_get_step(state);
        int (*cbm)(unsigned int cpu, struct hlist_node *node);
        int (*cb)(unsigned int cpu);
        int ret, cnt;

        /* Failure injection: one-shot, disarmed as soon as it matches */
        if (st->fail == state) {
                st->fail = CPUHP_INVALID;

                /* Nothing would have been called, so nothing can fail */
                if (!(bringup ? step->startup.single : step->teardown.single))
                        return 0;

                return -EAGAIN;
        }

        if (!step->multi_instance) {
                WARN_ON_ONCE(lastp && *lastp);
                cb = bringup ? step->startup.single : step->teardown.single;
                if (!cb)
                        return 0;
                trace_cpuhp_enter(cpu, st->target, state, cb);
                ret = cb(cpu);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                return ret;
        }
        cbm = bringup ? step->startup.multi : step->teardown.multi;
        if (!cbm)
                return 0;

        /* Single invocation for instance add/remove */
        if (node) {
                WARN_ON_ONCE(lastp && *lastp);
                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                return ret;
        }

        /* State transition. Invoke on all instances */
        cnt = 0;
        hlist_for_each(node, &step->list) {
                /* On rollback, stop at the instance which failed before */
                if (lastp && node == *lastp)
                        break;

                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                if (ret) {
                        if (!lastp)
                                goto err;

                        /* Let the caller drive the rollback from here */
                        *lastp = node;
                        return ret;
                }
                cnt++;
        }
        if (lastp)
                *lastp = NULL;
        return 0;
err:
        /* Rollback the instances if one failed */
        cbm = !bringup ? step->startup.multi : step->teardown.multi;
        if (!cbm)
                return ret;

        hlist_for_each(node, &step->list) {
                int rb_ret;

                /* Only undo the @cnt instances which completed */
                if (!cnt--)
                        break;

                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                rb_ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, rb_ret);
                /*
                 * Rollback must not fail,
                 */
                WARN_ON_ONCE(rb_ret);
        }
        /*
         * Report the error which triggered the rollback. The result of the
         * rollback callbacks must not replace it, otherwise a successful
         * rollback would make the failed transition look like a success.
         */
        return ret;
}

#ifdef CONFIG_SMP
static bool cpuhp_is_ap_state(enum cpuhp_state state)
{
        /*
         * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
         * purposes as that state is handled explicitly in cpu_down.
         */
        return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
}

/* Block until the completion for the given direction has been signalled. */
static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
        if (bringup)
                wait_for_completion(&st->done_up);
        else
                wait_for_completion(&st->done_down);
}

/* Signal the completion which wait_for_ap_thread() waits on. */
static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
        if (bringup)
                complete(&st->done_up);
        else
                complete(&st->done_down);
}

/*
 * States in the range [CPUHP_AP_IDLE_DEAD, CPUHP_AP_ONLINE) are the former
 * STARTING/DYING states. They are run with IRQs disabled and must not fail.
 */
static bool cpuhp_is_atomic_state(enum cpuhp_state state)
{
        return state >= CPUHP_AP_IDLE_DEAD && state < CPUHP_AP_ONLINE;
}

/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
/*
 * Copy of the tasks_frozen argument of the hotplug operation in progress;
 * written by _cpu_down() with the hotplug lock held for writing.
 */
bool cpuhp_tasks_frozen;
EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);

/*
 * The following two APIs (cpu_maps_update_begin/done) must be used when
 * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
 */
void cpu_maps_update_begin(void)
{
        /* Sleeping lock; released again by cpu_maps_update_done() */
        mutex_lock(&cpu_add_remove_lock);
}

void cpu_maps_update_done(void)
{
        /* Drops the lock taken by cpu_maps_update_begin() */
        mutex_unlock(&cpu_add_remove_lock);
}

/*
 * If set (non-zero), cpu_up and cpu_down will return -EBUSY and do nothing.
 * This is a counter: cpu_hotplug_disable() increments it and
 * __cpu_hotplug_enable() decrements it.
 * Should always be manipulated under cpu_add_remove_lock
 */
static int cpu_hotplug_disabled;

#ifdef CONFIG_HOTPLUG_CPU

/*
 * The hotplug lock: taken for reading via cpus_read_lock() and for writing
 * via cpus_write_lock().
 */
DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);

/* Take cpu_hotplug_lock for reading; pair with cpus_read_unlock(). */
void cpus_read_lock(void)
{
        percpu_down_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_lock);

/*
 * Try to take cpu_hotplug_lock for reading. Returns whatever
 * percpu_down_read_trylock() returns - presumably non-zero on success,
 * confirm against the percpu-rwsem API.
 */
int cpus_read_trylock(void)
{
        return percpu_down_read_trylock(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_trylock);

/* Release the read side of cpu_hotplug_lock. */
void cpus_read_unlock(void)
{
        percpu_up_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_unlock);

/* Take cpu_hotplug_lock for writing; used by _cpu_down(). Not exported. */
void cpus_write_lock(void)
{
        percpu_down_write(&cpu_hotplug_lock);
}

/* Release the write side of cpu_hotplug_lock. */
void cpus_write_unlock(void)
{
        percpu_up_write(&cpu_hotplug_lock);
}

/* Assert that cpu_hotplug_lock is held, once the system is running. */
void lockdep_assert_cpus_held(void)
{
        /*
         * Before userspace runs there are no hotplug operations, and some
         * init code paths deliberately skip taking the hotplug lock. That
         * is valid, so only start checking when a report about an unheld
         * lock is meaningful.
         */
        if (system_state >= SYSTEM_RUNNING) {
                percpu_rwsem_assert_held(&cpu_hotplug_lock);
        }
}

/*
 * Lockdep-only annotation on cpu_hotplug_lock's dep_map; no lock is taken
 * here. Used by cpuhp_thread_fun() so that code asserting the hotplug lock
 * finds it held on the AP as well.
 */
static void lockdep_acquire_cpus_lock(void)
{
        rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
}

/* Undo the lockdep annotation made by lockdep_acquire_cpus_lock(). */
static void lockdep_release_cpus_lock(void)
{
        rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}

/*
 * Wait for currently running CPU hotplug operations to complete (if any) and
 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
 * hotplug path before performing hotplug operations. So acquiring that lock
 * guarantees mutual exclusion from any currently running hotplug operations.
 */
void cpu_hotplug_disable(void)
{
        cpu_maps_update_begin();
        /* Counter, not a flag: each disable needs a matching enable */
        cpu_hotplug_disabled++;
        cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_disable);

/*
 * Drop one disable reference. The count is left untouched, with a one-time
 * warning, when there is no disable to balance. The caller is expected to
 * hold cpu_add_remove_lock, as cpu_hotplug_enable() does.
 */
static void __cpu_hotplug_enable(void)
{
        if (!WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
                cpu_hotplug_disabled--;
}

/*
 * Counterpart of cpu_hotplug_disable(): drops one disable reference under
 * cpu_add_remove_lock.
 */
void cpu_hotplug_enable(void)
{
        cpu_maps_update_begin();
        __cpu_hotplug_enable();
        cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);

#else

/* !CONFIG_HOTPLUG_CPU: cpu_hotplug_lock does not exist, nothing to annotate. */
static void lockdep_acquire_cpus_lock(void)
{
}

/* !CONFIG_HOTPLUG_CPU: cpu_hotplug_lock does not exist, nothing to annotate. */
static void lockdep_release_cpus_lock(void)
{
}

#endif        /* CONFIG_HOTPLUG_CPU */

/*
 * Architectures that need SMT-specific errata handling during SMT hotplug
 * should override this.
 *
 * The default is an empty weak function. _cpu_down() calls it after the
 * hotplug write lock has been dropped.
 */
void __weak arch_smt_update(void) { }

#ifdef CONFIG_HOTPLUG_SMT
/*
 * Current SMT policy. Starts out enabled; changed by cpu_smt_disable() and
 * cpu_smt_check_topology().
 */
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;

/*
 * Switch SMT off at boot. With @force the control is moved to
 * CPU_SMT_FORCE_DISABLED, otherwise to CPU_SMT_DISABLED. Nothing happens
 * when cpu_smt_possible() reports that SMT cannot be used anyway.
 */
void __init cpu_smt_disable(bool force)
{
        if (!cpu_smt_possible())
                return;

        if (!force) {
                pr_info("SMT: disabled\n");
                cpu_smt_control = CPU_SMT_DISABLED;
                return;
        }

        pr_info("SMT: Force disabled\n");
        cpu_smt_control = CPU_SMT_FORCE_DISABLED;
}

/*
 * Whether SMT is supported is only known once CPU identification has
 * completed, so architecture code calls this afterwards to record the
 * "not supported" case.
 */
void __init cpu_smt_check_topology(void)
{
        if (topology_smt_supported())
                return;

        cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
}

/* "nosmt" disables SMT; "nosmt=force" requests the forced variant. */
static int __init smt_cmdline_disable(char *str)
{
        bool force = str && strcmp(str, "force") == 0;

        cpu_smt_disable(force);
        return 0;
}
early_param("nosmt", smt_cmdline_disable);

/* Decide whether @cpu may be brought up under the current SMT policy. */
static inline bool cpu_smt_allowed(unsigned int cpu)
{
        /* SMT enabled or a primary thread: always fine */
        if (cpu_smt_control == CPU_SMT_ENABLED ||
            topology_is_primary_thread(cpu))
                return true;

        /*
         * x86 needs every logical CPU to be booted at least once so the
         * init code gets to set CR4.MCE on it. A broadcasted MCE which
         * finds CR4.MCE=0b on any core shuts the machine down. So a
         * sibling is allowed as long as it has never booted before.
         */
        return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}

/*
 * Returns true unless SMT is not supported or has been forcefully
 * (irreversibly) disabled.
 */
bool cpu_smt_possible(void)
{
        return !(cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
                 cpu_smt_control == CPU_SMT_NOT_SUPPORTED);
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);
#else
/* !CONFIG_HOTPLUG_SMT: no SMT policy, every CPU may be brought up. */
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif

static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;

        st->rollback = false;
        st->last = NULL;

        st->target = target;
        st->single = false;
        st->bringup = st->state < target;

        return prev_state;
}

static inline void
cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
{
        st->rollback = true;

        /*
         * If we have st->last we need to undo partial multi_instance of this
         * state first. Otherwise start undo at the previous state.
         */
        if (!st->last) {
                if (st->bringup)
                        st->state--;
                else
                        st->state++;
        }

        st->target = prev_state;
        st->bringup = !st->bringup;
}

/*
 * Regular hotplug invocation of the AP hotplug thread.
 *
 * Wakes st->thread and blocks until it signals the completion matching
 * st->bringup. Returns immediately when there is nothing to do, i.e. not in
 * single mode and the state already equals the target. The outcome is left
 * in st->result.
 */
static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
{
        if (!st->single && st->state == st->target)
                return;

        st->result = 0;
        /*
         * Make sure the above stores are visible before should_run becomes
         * true. Paired with the smp_mb() at the start of cpuhp_thread_fun()
         */
        smp_mb();
        st->should_run = true;
        wake_up_process(st->thread);
        wait_for_ap_thread(st, st->bringup);
}

static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
        enum cpuhp_state prev_state;
        int ret;

        prev_state = cpuhp_set_state(st, target);
        __cpuhp_kick_ap(st);
        if ((ret = st->result)) {
                cpuhp_reset_state(st, prev_state);
                __cpuhp_kick_ap(st);
        }

        return ret;
}

/*
 * Second half of bringup_cpu(): wait for the freshly started CPU, unpark
 * its hotplug thread and, if the target lies beyond CPUHP_AP_ONLINE_IDLE,
 * let that thread run the remaining states.
 */
static int bringup_wait_for_ap(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);

        /* The AP signals done_up once it reached CPUHP_AP_ONLINE_IDLE */
        wait_for_ap_thread(st, true);
        if (WARN_ON_ONCE(!cpu_online(cpu)))
                return -ECANCELED;

        /* The hotplug thread of the target cpu can run from here on */
        kthread_unpark(st->thread);

        /*
         * For SMT soft disabling X86 has to get the CPU out of the BIOS
         * 'wait for SIPI' state so that CR4.MCE can be set. By now the CPU
         * has marked itself as booted_once in notify_cpu_starting(), which
         * makes cpu_smt_allowed() return false for anything but the
         * primary sibling.
         */
        if (!cpu_smt_allowed(cpu))
                return -ECANCELED;

        if (st->target > CPUHP_AP_ONLINE_IDLE)
                return cpuhp_kick_ap(st, st->target);

        return 0;
}

/*
 * Start @cpu through the architecture code and wait for it to come up.
 * Returns the __cpu_up() error, or the result of bringup_wait_for_ap().
 */
static int bringup_cpu(unsigned int cpu)
{
        struct task_struct *idle = idle_thread_get(cpu);
        int err;

        /* The idle task may carry stack state from a previous online run */
        scs_task_reset(idle);
        kasan_unpoison_task_stack(idle);

        /*
         * Some architectures walk the irq descriptors to set up the vector
         * space of the CPU coming online, so irq alloc/free must not
         * happen across the arch specific bringup.
         */
        irq_lock_sparse();
        err = __cpu_up(cpu, idle);
        irq_unlock_sparse();

        return err ? err : bringup_wait_for_ap(cpu);
}

/*
 * Reset the idle task's active_mm of @cpu to init_mm and drop the mm it
 * pointed to. Always returns 0.
 */
static int finish_cpu(unsigned int cpu)
{
        struct task_struct *idle = idle_thread_get(cpu);
        struct mm_struct *mm = idle->active_mm;

        /*
         * idle_task_exit() will have switched to &init_mm, now
         * clean up any remaining active_mm state.
         */
        if (mm != &init_mm)
                idle->active_mm = &init_mm;
        /*
         * NOTE(review): mmdrop() is called unconditionally, i.e. also when
         * mm == &init_mm - presumably balanced by a reference taken
         * elsewhere; confirm.
         */
        mmdrop(mm);
        return 0;
}

/*
 * Hotplug state machine related functions
 */

/*
 * Undo a failed bringup: step back from the failing state and run the
 * teardown callbacks of every state above st->target. Errors from the
 * teardown callbacks are ignored.
 */
static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
{
        /* The failing state itself did not complete, skip it */
        st->state--;

        while (st->state > st->target) {
                cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
                st->state--;
        }
}

/*
 * With CONFIG_HOTPLUG_CPU a rollback is always possible. Without it a CPU
 * cannot be taken down again, as takedown_cpu() and the architecture and
 * subsystem specific mechanisms are missing. A CPU which got past
 * CPUHP_BRINGUP_CPU therefore has to stay in whatever state it reached.
 */
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
{
        return IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
               st->state <= CPUHP_BRINGUP_CPU;
}

static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
                              enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        int ret = 0;

        while (st->state < target) {
                st->state++;
                ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
                if (ret) {
                        if (can_rollback_cpu(st)) {
                                st->target = prev_state;
                                undo_cpu_up(cpu, st);
                        }
                        break;
                }
        }
        return ret;
}

/*
 * The cpu hotplug threads manage the bringup and teardown of the cpus
 */
/*
 * .create hook of cpuhp_threads: initialise the two completions which the
 * hotplug thread of @cpu signals.
 */
static void cpuhp_create(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);

        init_completion(&st->done_up);
        init_completion(&st->done_down);
}

static int cpuhp_should_run(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        return st->should_run;
}

/*
 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
 * callbacks when a state gets [un]installed at runtime.
 *
 * Each invocation of this function by the smpboot thread does a single AP
 * state callback.
 *
 * It has 3 modes of operation:
 *  - single: runs st->cb_state
 *  - up:     runs ++st->state, while st->state < st->target
 *  - down:   runs st->state--, while st->state > st->target
 *
 * When complete or on error, should_run is cleared and the completion is fired.
 */
static void cpuhp_thread_fun(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        bool bringup = st->bringup;
        enum cpuhp_state state;

        if (WARN_ON_ONCE(!st->should_run))
                return;

        /*
         * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
         * that if we see ->should_run we also see the rest of the state.
         */
        smp_mb();

        /*
         * The BP holds the hotplug lock, but we're now running on the AP,
         * ensure that anybody asserting the lock is held, will actually find
         * it so.
         */
        lockdep_acquire_cpus_lock();
        cpuhp_lock_acquire(bringup);

        /* Pick the state to run and decide whether another pass is needed */
        if (st->single) {
                state = st->cb_state;
                st->should_run = false;
        } else {
                if (bringup) {
                        /* up: advance first, then run the new state */
                        st->state++;
                        state = st->state;
                        st->should_run = (st->state < st->target);
                        WARN_ON_ONCE(st->state > st->target);
                } else {
                        /* down: run the current state, then step below it */
                        state = st->state;
                        st->state--;
                        st->should_run = (st->state > st->target);
                        WARN_ON_ONCE(st->state < st->target);
                }
        }

        WARN_ON_ONCE(!cpuhp_is_ap_state(state));

        if (cpuhp_is_atomic_state(state)) {
                local_irq_disable();
                st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
                local_irq_enable();

                /*
                 * STARTING/DYING must not fail!
                 */
                WARN_ON_ONCE(st->result);
        } else {
                st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
        }

        if (st->result) {
                /*
                 * If we fail on a rollback, we're up a creek without a
                 * paddle, no way forward, no way back. We lose, thanks for
                 * playing.
                 */
                WARN_ON_ONCE(st->rollback);
                /* Stop here; the issuer inspects st->result */
                st->should_run = false;
        }

        cpuhp_lock_release(bringup);
        lockdep_release_cpus_lock();

        /* Done or failed: wake the issuer waiting in wait_for_ap_thread() */
        if (!st->should_run)
                complete_ap_thread(st, bringup);
}

/*
 * Invoke a single callback on a remote cpu.
 *
 * Offline CPUs are skipped with 0. If the hotplug thread of @cpu does not
 * exist yet, the callback is invoked directly; otherwise the thread is
 * kicked in single mode. Returns 0 or the error of the callback.
 */
static int
cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
                         struct hlist_node *node)
{
        struct cpuhp_cpu_state *st;
        int err;

        if (!cpu_online(cpu))
                return 0;

        st = per_cpu_ptr(&cpuhp_state, cpu);

        /* Lockdep annotations for both directions, down first */
        cpuhp_lock_acquire(false);
        cpuhp_lock_release(false);

        cpuhp_lock_acquire(true);
        cpuhp_lock_release(true);

        /*
         * Early calls happen before the hotplug thread is up and running;
         * run the callback directly in that case.
         */
        if (!st->thread)
                return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);

        /* Set up a single-callback request for the hotplug thread */
        st->rollback = false;
        st->last = NULL;

        st->node = node;
        st->bringup = bringup;
        st->cb_state = state;
        st->single = true;

        __cpuhp_kick_ap(st);

        /*
         * A failure with st->last set means some instances were processed
         * already; run the thread once more in the opposite direction.
         */
        err = st->result;
        if (err && st->last) {
                st->rollback = true;
                st->bringup = !bringup;

                __cpuhp_kick_ap(st);
        }

        /* Leave no stale data behind for the next hotplug operation */
        st->last = NULL;
        st->node = NULL;

        return err;
}

/*
 * Let the hotplug thread of @cpu move the CPU to st->target, with
 * tracepoints around the whole operation. Returns the result of
 * cpuhp_kick_ap().
 */
static int cpuhp_kick_ap_work(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        enum cpuhp_state prev_state = st->state;
        int ret;

        /*
         * Acquire and immediately release both direction maps.
         * NOTE(review): presumably this teaches lockdep that the caller
         * waits on AP work of either direction - confirm.
         */
        cpuhp_lock_acquire(false);
        cpuhp_lock_release(false);

        cpuhp_lock_acquire(true);
        cpuhp_lock_release(true);

        trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
        ret = cpuhp_kick_ap(st, st->target);
        trace_cpuhp_exit(cpu, st->state, prev_state, ret);

        return ret;
}

/*
 * Descriptor of the per-cpu "cpuhp/%u" threads, registered in
 * cpuhp_threads_init(). The task pointer of each thread is stored in
 * cpuhp_state.thread.
 */
static struct smp_hotplug_thread cpuhp_threads = {
        .store                        = &cpuhp_state.thread,
        .create                        = &cpuhp_create,
        .thread_should_run        = cpuhp_should_run,
        .thread_fn                = cpuhp_thread_fun,
        .thread_comm                = "cpuhp/%u",
        .selfparking                = true,
};

/*
 * Register the per-cpu hotplug threads and unpark the one of the CPU this
 * runs on. A registration failure is fatal.
 */
void __init cpuhp_threads_init(void)
{
        int err = smpboot_register_percpu_thread(&cpuhp_threads);

        BUG_ON(err);
        kthread_unpark(this_cpu_read(cpuhp_state.thread));
}

/*
 *
 * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
 * protected region.
 *
 * The operation is still serialized against concurrent CPU hotplug via
 * cpu_add_remove_lock, i.e. CPU map protection.  But it is _not_
 * serialized against other hotplug related activity like adding or
 * removing of state callbacks and state instances, which invoke either the
 * startup or the teardown callback of the affected state.
 *
 * This is required for subsystems which are unfixable vs. CPU hotplug and
 * evade lock inversion problems by scheduling work which has to be
 * completed _before_ cpu_up()/_cpu_down() returns.
 *
 * Don't even think about adding anything to this for any new code or even
 * drivers. It's only purpose is to keep existing lock order trainwrecks
 * working.
 *
 * For cpu_down() there might be valid reasons to finish cleanups which are
 * not required to be done under cpu_hotplug_lock, but that's a different
 * story and would be not invoked via this.
 */
static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
{
        /*
         * With tasks frozen (suspend, hibernate) waiting for the cpuset
         * worker would wait forever, so skip it.
         */
        if (tasks_frozen)
                return;

        /*
         * cpusets hand their hotplug handling to a worker in order to
         * "solve" lock order problems. Wait for that worker here.
         *
         * Without the wait the hotplug operation would return with
         * inconsistent state, and user space can observe that when a new
         * CPU comes up: the CPU plug uevent gets delivered, but moving
         * tasks to the new CPU fails until the work has finished, because
         * up to that point the CPU is not assignable in cpusets/cgroups.
         * On unplug this is not necessarily visible, but the state is
         * inconsistent all the same, and that is the real problem which
         * needs to be "fixed". The transient state between scheduling the
         * work and returning from the wait cannot be prevented by this.
         */
        cpuset_wait_for_hotplug();
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Default for clearing @cpu in the cpumask of @mm; only used when no
 * arch_clear_mm_cpumask_cpu has been defined before this point.
 */
#ifndef arch_clear_mm_cpumask_cpu
#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
#endif

/**
 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
 * @cpu: a CPU id
 *
 * This function walks all processes, finds a valid mm struct for each one and
 * then clears a corresponding bit in mm's cpumask.  While this all sounds
 * trivial, there are various non-obvious corner cases, which this function
 * tries to solve in a safe manner.
 *
 * Also note that the function uses a somewhat relaxed locking scheme, so it may
 * be called only for an already offlined CPU.
 */
void clear_tasks_mm_cpumask(int cpu)
{
        struct task_struct *proc;

        /*
         * By the time this runs the cpu has been taken down and marked
         * offline, so no task will get this cpu set in its mm mask
         * anymore. -- Peter Zijlstra
         * That makes rcu_read_lock() sufficient here; the full-fledged
         * tasklist_lock is not needed.
         */
        WARN_ON(cpu_online(cpu));

        rcu_read_lock();
        for_each_process(proc) {
                /*
                 * The main thread might have exited while other threads
                 * still have a valid mm. Look for one of those; it comes
                 * back locked.
                 */
                struct task_struct *owner = find_lock_task_mm(proc);

                if (owner) {
                        arch_clear_mm_cpumask_cpu(cpu, owner->mm);
                        task_unlock(owner);
                }
        }
        rcu_read_unlock();
}

/* Take this CPU down. */
static int take_cpu_down(void *_param)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
        int err, cpu = smp_processor_id();
        int ret;

        /* Ensure this CPU doesn't handle any more interrupts. */
        err = __cpu_disable();
        if (err < 0)
                return err;

        /*
         * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not
         * do this step again.
         */
        WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
        st->state--;
        /* Invoke the former CPU_DYING callbacks */
        for (; st->state > target; st->state--) {
                ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
                /*
                 * DYING must not fail!
                 */
                WARN_ON_ONCE(ret);
        }

        /* Give up timekeeping duties */
        tick_handover_do_timer();
        /* Remove CPU from timer broadcasting */
        tick_offline_cpu(cpu);
        /* Park the stopper thread */
        stop_machine_park(cpu);
        return 0;
}

/*
 * Teardown step run on the control CPU: stop @cpu via stop_machine, wait
 * until it reports CPUHP_AP_IDLE_DEAD and clean up after it. Returns 0, or
 * the error if the CPU refused to die.
 */
static int takedown_cpu(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int err;

        /* Park the smpboot threads */
        kthread_park(st->thread);

        /*
         * The dying cpu reorganizes the interrupt affinities; keep irq
         * alloc/free out of the way while it does so.
         */
        irq_lock_sparse();

        /* From now on all preempt/rcu users must observe !cpu_active() */
        err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
        if (err) {
                /* CPU refused to die */
                irq_unlock_sparse();
                /* The rollback runs on the hotplug thread, so unpark it */
                kthread_unpark(st->thread);
                return err;
        }
        BUG_ON(cpu_online(cpu));

        /*
         * The CPUHP_AP_SCHED_STARTING teardown callback has removed all
         * runnable tasks from the CPU. With the migration thread done
         * doing the stop_machine thing only the idle task is left.
         *
         * Wait for the stop thread to go away.
         */
        wait_for_ap_thread(st, false);
        BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);

        /* The interrupts are off the dying cpu, alloc/free may resume */
        irq_unlock_sparse();

        hotplug_cpu__broadcast_tick_pull(cpu);
        /* This is what actually kills the CPU */
        __cpu_die(cpu);

        tick_cleanup_dead_cpu(cpu);
        rcutree_migrate_callbacks(cpu);

        return 0;
}

/*
 * Cross-call target used by cpuhp_report_idle_dead(): @arg is the
 * struct cpuhp_cpu_state of the dead CPU whose done_down gets completed.
 */
static void cpuhp_complete_idle_dead(void *arg)
{
        complete_ap_thread(arg, false);
}

/*
 * Called on the dying CPU, which must be in CPUHP_AP_OFFLINE state. Moves
 * it to CPUHP_AP_IDLE_DEAD and has done_down completed, which is what
 * takedown_cpu() waits for.
 */
void cpuhp_report_idle_dead(void)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        BUG_ON(st->state != CPUHP_AP_OFFLINE);
        rcu_report_dead(smp_processor_id());
        st->state = CPUHP_AP_IDLE_DEAD;
        /*
         * We cannot call complete after rcu_report_dead() so we delegate it
         * to an online cpu.
         */
        smp_call_function_single(cpumask_first(cpu_online_mask),
                                 cpuhp_complete_idle_dead, st, 0);
}

/*
 * Undo a failed teardown: step up from the failing state and run the
 * startup callbacks of every state below st->target. Errors from the
 * startup callbacks are ignored.
 */
static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
{
        /* The failing state itself was not torn down, skip it */
        st->state++;

        while (st->state < st->target) {
                cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
                st->state++;
        }
}

static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
                                enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        int ret = 0;

        for (; st->state > target; st->state--) {
                ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
                if (ret) {
                        st->target = prev_state;
                        if (st->state < prev_state)
                                undo_cpu_down(cpu, st);
                        break;
                }
        }
        return ret;
}

/*
 * Requires cpu_add_remove_lock to be held
 *
 * Take @cpu down to @target. States above CPUHP_TEARDOWN_CPU are handled
 * by the hotplug thread of the CPU, the remaining ones are run from here.
 *
 * Returns -EBUSY if only one CPU is online, -EINVAL if @cpu is not
 * present, otherwise 0 or the error of the failing teardown callback.
 */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
                           enum cpuhp_state target)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int prev_state, ret = 0;

        /* Never take down the last online CPU */
        if (num_online_cpus() == 1)
                return -EBUSY;

        if (!cpu_present(cpu))
                return -EINVAL;

        cpus_write_lock();

        cpuhp_tasks_frozen = tasks_frozen;

        prev_state = cpuhp_set_state(st, target);
        /*
         * If the current CPU state is in the range of the AP hotplug thread,
         * then we need to kick the thread.
         */
        if (st->state > CPUHP_TEARDOWN_CPU) {
                /* The AP thread must not go below CPUHP_TEARDOWN_CPU */
                st->target = max((int)target, CPUHP_TEARDOWN_CPU);
                ret = cpuhp_kick_ap_work(cpu);
                /*
                 * The AP side has done the error rollback already. Just
                 * return the error code..
                 */
                if (ret)
                        goto out;

                /*
                 * We might have stopped still in the range of the AP hotplug
                 * thread. Nothing to do anymore.
                 */
                if (st->state > CPUHP_TEARDOWN_CPU)
                        goto out;

                /* Restore the real target for the remaining states */
                st->target = target;
        }
        /*
         * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
         * to do the further cleanups.
         */
        ret = cpuhp_down_callbacks(cpu, st, target);
        /*
         * Failed while back in CPUHP_TEARDOWN_CPU state and the operation
         * started above it: have the AP thread roll back to prev_state.
         */
        if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
                cpuhp_reset_state(st, prev_state);
                __cpuhp_kick_ap(st);
        }

out:
        cpus_write_unlock();
        /*
         * Do post unplug cleanup. This is still protected against
         * concurrent CPU hotplug via cpu_add_remove_lock.
         */
        lockup_detector_cleanup();
        arch_smt_update();
        cpu_up_down_serialize_trainwrecks(tasks_frozen);
        return ret;
}

/*
 * Take @cpu down to @target unless hotplug is currently disabled, in which
 * case -EBUSY is returned. The callers visible here hold
 * cpu_add_remove_lock.
 */
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
        return cpu_hotplug_disabled ? -EBUSY : _cpu_down(cpu, 0, target);
}

/*
 * Take @cpu down to @target with cpu_add_remove_lock held around the
 * operation. Returns the result of cpu_down_maps_locked().
 */
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
{
        int err;

        cpu_maps_update_begin();
        err = cpu_down_maps_locked(cpu, target);
        cpu_maps_update_done();
        return err;
}

/**
 * cpu_device_down - Bring down a cpu device
 * @dev: Pointer to the cpu device to offline
 *
 * This function is meant to be used by device core cpu subsystem only.
 *
 * Other subsystems should use remove_cpu() instead.
 *
 * Return: the result of taking CPU @dev->id down to CPUHP_OFFLINE; -EBUSY
 * when CPU hotplug is disabled.
 */
int cpu_device_down(struct device *dev)
{
        return cpu_down(dev->id, CPUHP_OFFLINE);
}

/*
 * Offline @cpu through the device core: device_offline() is called on the
 * CPU's device with the device hotplug lock held. Returns whatever
 * device_offline() returns.
 */
int remove_cpu(unsigned int cpu)
{
        int ret;

        lock_device_hotplug();
        ret = device_offline(get_cpu_device(cpu));
        unlock_device_hotplug();

        return ret;
}
EXPORT_SYMBOL_GPL(remove_cpu);

/*
 * Offline every online CPU except @primary_cpu and then disable further CPU
 * hotplug. If @primary_cpu is not online, the first online CPU is used
 * instead. The whole sequence runs with CPU map updates serialized.
 *
 * Failing to offline a CPU is fatal: the loop stops at the first error and
 * the BUG_ON() below fires if more than one CPU is still online.
 */
void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
{
        unsigned int cpu;
        int error;

        cpu_maps_update_begin();

        /*
         * Make certain the cpu I'm about to reboot on is online.
         *
         * This is in line with what migrate_to_reboot_cpu() already does.
         */
        if (!cpu_online(primary_cpu))
                primary_cpu = cpumask_first(cpu_online_mask);

        for_each_online_cpu(cpu) {
                if (cpu == primary_cpu)
                        continue;

                error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
                if (error) {
                        /*
                         * Terminate the message with a newline and print the
                         * unsigned CPU number with a matching specifier.
                         */
                        pr_err("Failed to offline CPU%u - error=%d\n",
                                cpu, error);
                        break;
                }
        }

        /*
         * Ensure all but the reboot CPU are offline.
         */
        BUG_ON(num_online_cpus() > 1);

        /*
         * Make sure the CPUs won't be enabled by someone else after this
         * point. Kexec will reboot to a new kernel shortly resetting
         * everything along the way.
         */
        cpu_hotplug_disabled++;

        cpu_maps_update_done();
}

#else
#define takedown_cpu                NULL
#endif /*CONFIG_HOTPLUG_CPU*/

/**
 * notify_cpu_starting - Invoke the callbacks on the starting CPU
 * @cpu: cpu that just started
 *
 * It must be called by the arch code on the new cpu, before the new cpu
 * enables interrupts and before the "boot" cpu returns from __cpu_up().
 *
 * Steps the per-cpu hotplug state of @cpu upwards one state at a time,
 * invoking the bringup callback of each state, until the requested target
 * (capped at CPUHP_AP_ONLINE) is reached.
 */
void notify_cpu_starting(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        /* Never go beyond CPUHP_AP_ONLINE from here, whatever the target is. */
        enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
        int ret;

        rcu_cpu_starting(cpu);        /* Enables RCU usage on this CPU. */
        /* Record that this CPU has been started at least once. */
        cpumask_set_cpu(cpu, &cpus_booted_once_mask);
        while (st->state < target) {
                /* Advance first: the callback is invoked for the new state. */
                st->state++;
                ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
                /*
                 * STARTING must not fail!
                 */
                WARN_ON_ONCE(ret);
        }
}

/*
 * Called from the idle task. Wake up the controlling task which brings the
 * hotplug thread of the upcoming CPU up and then delegates the rest of the
 * online bringup to the hotplug thread.
 *
 * Does nothing unless @state is CPUHP_AP_ONLINE_IDLE.
 */
void cpuhp_online_idle(enum cpuhp_state state)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        /* Happens for the boot cpu */
        if (state != CPUHP_AP_ONLINE_IDLE)
                return;

        /*
         * Unpark the stopper thread before we start the idle loop (and start
         * scheduling); this ensures the stopper task is always available.
         */
        stop_machine_unpark(smp_processor_id());

        /* Publish the new state, then signal completion to the waiter. */
        st->state = CPUHP_AP_ONLINE_IDLE;
        complete_ap_thread(st, true);
}

/*
 * Bring @cpu up to @target.
 *
 * Requires cpu_add_remove_lock to be held.
 *
 * @tasks_frozen is stored in cpuhp_tasks_frozen for the duration of the
 * operation and passed on to cpu_up_down_serialize_trainwrecks().
 *
 * Returns 0 if the CPU is already at or beyond @target or the bringup
 * succeeded, -EINVAL if @cpu is not present, or the error reported by the
 * idle thread lookup, the AP hotplug thread or the bringup callbacks.
 */
static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct task_struct *idle;
        int ret = 0;

        /* Held until the "out" label below. */
        cpus_write_lock();

        if (!cpu_present(cpu)) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * The caller of cpu_up() might have raced with another
         * caller. Nothing to do.
         */
        if (st->state >= target)
                goto out;

        if (st->state == CPUHP_OFFLINE) {
                /* Let it fail before we try to bring the cpu up */
                idle = idle_thread_get(cpu);
                if (IS_ERR(idle)) {
                        ret = PTR_ERR(idle);
                        goto out;
                }
        }

        cpuhp_tasks_frozen = tasks_frozen;

        cpuhp_set_state(st, target);
        /*
         * If the current CPU state is in the range of the AP hotplug thread,
         * then we need to kick the thread once more.
         */
        if (st->state > CPUHP_BRINGUP_CPU) {
                ret = cpuhp_kick_ap_work(cpu);
                /*
                 * The AP side has done the error rollback already. Just
                 * return the error code..
                 */
                if (ret)
                        goto out;
        }

        /*
         * Try to reach the target state. We max out on the BP at
         * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
         * responsible for bringing it up to the target state.
         */
        target = min((int)target, CPUHP_BRINGUP_CPU);
        ret = cpuhp_up_callbacks(cpu, st, target);
out:
        /* Common exit: runs on success and on every error path above. */
        cpus_write_unlock();
        arch_smt_update();
        cpu_up_down_serialize_trainwrecks(tasks_frozen);
        return ret;
}

/*
 * Validate the request and bring @cpu up to @target.
 *
 * Returns -EINVAL if @cpu is not a possible CPU, the error from
 * try_online_node() if the CPU's node cannot be onlined, -EBUSY while CPU
 * hotplug is disabled, -EPERM if SMT policy forbids this CPU, and otherwise
 * the result of _cpu_up().
 */
static int cpu_up(unsigned int cpu, enum cpuhp_state target)
{
        int ret;

        if (!cpu_possible(cpu)) {
                pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
                       cpu);
#if defined(CONFIG_IA64)
                pr_err("please check additional_cpus= boot parameter\n");
#endif
                return -EINVAL;
        }

        ret = try_online_node(cpu_to_node(cpu));
        if (ret)
                return ret;

        /* The checks below and the bringup itself run under the maps lock. */
        cpu_maps_update_begin();

        if (cpu_hotplug_disabled)
                ret = -EBUSY;
        else if (!cpu_smt_allowed(cpu))
                ret = -EPERM;
        else
                ret = _cpu_up(cpu, 0, target);

        cpu_maps_update_done();

        return ret;
}

/**
 * cpu_device_up - Bring the CPU behind a cpu device online
 * @dev: cpu device whose id names the CPU to online
 *
 * Intended for the device core cpu subsystem only; every other subsystem
 * should call add_cpu() instead.
 *
 * Return: result of bringing the CPU up to CPUHP_ONLINE.
 */
int cpu_device_up(struct device *dev)
{
        unsigned int cpu = dev->id;

        return cpu_up(cpu, CPUHP_ONLINE);
}

/*
 * Online @cpu through its cpu device, holding the device hotplug lock
 * around the request. Returns the result of device_online().
 */
int add_cpu(unsigned int cpu)
{
        struct device *cpu_dev;
        int err;

        lock_device_hotplug();
        cpu_dev = get_cpu_device(cpu);
        err = device_online(cpu_dev);
        unlock_device_hotplug();

        return err;
}
EXPORT_SYMBOL_GPL(add_cpu);

/**
 * bringup_hibernate_cpu - Make sure the CPU we hibernated on is online
 * @sleep_cpu: The cpu we hibernated on and should be brought up.
 *
 * On some architectures like arm64, we can hibernate on any CPU, but on
 * wake up the CPU we hibernated on might be offline as a side effect of
 * using maxcpus= for example.
 *
 * Return: 0 if @sleep_cpu is online already or was brought up, otherwise
 * the error returned by the bringup attempt.
 */
int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
        int err;

        /* Nothing to do when the CPU is already running. */
        if (cpu_online(sleep_cpu))
                return 0;

        pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
        err = cpu_up(sleep_cpu, CPUHP_ONLINE);
        if (err)
                pr_err("Failed to bring hibernate-CPU up!\n");

        return err;
}

/*
 * Walk the present CPUs and try to online each one that is still offline,
 * stopping once @setup_max_cpus CPUs are online. Bringup failures are not
 * reported; the walk simply moves on to the next CPU.
 */
void bringup_nonboot_cpus(unsigned int setup_max_cpus)
{
        unsigned int cpu;

        for_each_present_cpu(cpu) {
                if (num_online_cpus() >= setup_max_cpus)
                        break;
                if (cpu_online(cpu))
                        continue;
                cpu_up(cpu, CPUHP_ONLINE);
        }
}

#ifdef CONFIG_PM_SLEEP_SMP
/*
 * CPUs taken down by freeze_secondary_cpus(); thaw_secondary_cpus() brings
 * them back up and clears the mask. Allocated by alloc_frozen_cpus().
 */
static cpumask_var_t frozen_cpus;

/*
 * Take every online CPU except @primary offline and record each one that
 * went down in frozen_cpus.
 *
 * @primary == -1 selects the first online CPU, or a timer housekeeping CPU
 * if that first CPU is not one. Any other @primary that is not online is
 * replaced by the first online CPU.
 *
 * Returns 0 on success, -EBUSY if a wakeup became pending, or the error of
 * the first CPU that could not be taken down. CPU hotplug is left disabled
 * in every case.
 */
int freeze_secondary_cpus(int primary)
{
        int cpu, error = 0;

        cpu_maps_update_begin();
        if (primary == -1) {
                primary = cpumask_first(cpu_online_mask);
                if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
                        primary = housekeeping_any_cpu(HK_FLAG_TIMER);
        } else if (!cpu_online(primary)) {
                primary = cpumask_first(cpu_online_mask);
        }

        /*
         * All non-boot CPUs are taken down in one go so that userspace
         * cannot race with us by using CPU hotplug at the same time.
         */
        cpumask_clear(frozen_cpus);

        pr_info("Disabling non-boot CPUs ...\n");
        for_each_online_cpu(cpu) {
                if (cpu == primary)
                        continue;

                if (pm_wakeup_pending()) {
                        pr_info("Wakeup pending. Abort CPU freeze\n");
                        error = -EBUSY;
                        break;
                }

                trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
                error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
                trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
                if (error) {
                        pr_err("Error taking CPU%d down: %d\n", cpu, error);
                        break;
                }
                cpumask_set_cpu(cpu, frozen_cpus);
        }

        if (error)
                pr_err("Non-boot CPUs are not disabled\n");
        else
                BUG_ON(num_online_cpus() > 1);

        /*
         * Keep anybody else from enabling the CPUs. This is done on failure
         * too, because all freeze_secondary_cpus() users are supposed to
         * call thaw_secondary_cpus() on their failure path.
         */
        cpu_hotplug_disabled++;

        cpu_maps_update_done();
        return error;
}

/*
 * Default no-op hook which thaw_secondary_cpus() calls before it starts
 * bringing the frozen CPUs back up. It is __weak so that a non-weak
 * definition elsewhere (presumably architecture code) replaces it.
 */
void __weak arch_thaw_secondary_cpus_begin(void)
{
}

/*
 * Default no-op hook which thaw_secondary_cpus() calls after it has tried
 * to bring all frozen CPUs back up. It is __weak so that a non-weak
 * definition elsewhere (presumably architecture code) replaces it.
 */
void __weak arch_thaw_secondary_cpus_end(void)
{
}

/*
 * Re-enable CPU hotplug and try to bring every CPU recorded in frozen_cpus
 * back online. A CPU that fails to come up is logged and skipped; the mask
 * is cleared afterwards regardless of individual failures.
 */
void thaw_secondary_cpus(void)
{
        int cpu, error;

        /* Allow everyone to use the CPU hotplug again */
        cpu_maps_update_begin();
        __cpu_hotplug_enable();

        if (!cpumask_empty(frozen_cpus)) {
                pr_info("Enabling non-boot CPUs ...\n");

                arch_thaw_secondary_cpus_begin();

                for_each_cpu(cpu, frozen_cpus) {
                        trace_suspend_resume(TPS("CPU_ON"), cpu, true);
                        error = _cpu_up(cpu, 1, CPUHP_ONLINE);
                        trace_suspend_resume(TPS("CPU_ON"), cpu, false);
                        if (error)
                                pr_warn("Error taking CPU%d up: %d\n", cpu, error);
                        else
                                pr_info("CPU%d is up\n", cpu);
                }

                arch_thaw_secondary_cpus_end();

                cpumask_clear(frozen_cpus);
        }

        cpu_maps_update_done();
}

/*
 * Allocate the zero-initialized frozen_cpus mask at core_initcall time.
 * Returns 0 on success and -ENOMEM if the allocation fails.
 */
static int __init alloc_frozen_cpus(void)
{
        bool allocated = alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO);

        return allocated ? 0 : -ENOMEM;
}
core_initcall(alloc_frozen_cpus);

/*
 * While CPU hotplug callbacks run, the "tasks frozen or not" state they are
 * told about must stay the same for the *whole duration* of the callbacks.
 * The freezer therefore must not race with regular CPU hotplug.
 *
 * This is achieved by hooking the Suspend/Hibernate notifications: regular
 * CPU hotplug is disabled when a suspend or hibernation is being prepared
 * and enabled again once it is over, which makes the two paths mutually
 * exclusive.
 *
 * Returns NOTIFY_OK for the handled actions and NOTIFY_DONE for all others.
 */
static int
cpu_hotplug_pm_callback(struct notifier_block *nb,
                        unsigned long action, void *ptr)
{
        switch (action) {
        case PM_SUSPEND_PREPARE:
        case PM_HIBERNATION_PREPARE:
                cpu_hotplug_disable();
                return NOTIFY_OK;

        case PM_POST_SUSPEND:
        case PM_POST_HIBERNATION:
                cpu_hotplug_enable();
                return NOTIFY_OK;

        default:
                return NOTIFY_DONE;
        }
}


/*
 * Register cpu_hotplug_pm_callback() as a PM notifier at core_initcall
 * time. Always returns 0.
 */
static int __init cpu_hotplug_pm_sync_init(void)
{
        /*
         * cpu_hotplug_pm_callback has higher priority than x86
         * bsp_pm_callback which depends on cpu_hotplug_pm_callback
         * to disable cpu hotplug to avoid cpu hotplug race.
         *
         * NOTE(review): the priority passed below is 0; confirm that this
         * still orders the callback ahead of bsp_pm_callback.
         */
        pm_notifier(cpu_hotplug_pm_callback, 0);
        return 0;
}
core_initcall(cpu_hotplug_pm_sync_init);

#endif /* CONFIG_PM_SLEEP_SMP */

int __boot_cpu_id;

#endif /* CONFIG_SMP */

/*
 * Hotplug state steps, indexed by enum cpuhp_state.
 *
 * Each populated entry names the state and gives its startup and teardown
 * callbacks (NULL or omitted when there is nothing to do in that direction).
 * Entries without a name are free; cpuhp_reserve_state() hands out such
 * slots from the dynamic ranges.
 */
static struct cpuhp_step cpuhp_hp_states[] = {
        [CPUHP_OFFLINE] = {
                .name                        = "offline",
                .startup.single                = NULL,
                .teardown.single        = NULL,
        },
#ifdef CONFIG_SMP
        [CPUHP_CREATE_THREADS]= {
                .name                        = "threads:prepare",
                .startup.single                = smpboot_create_threads,
                .teardown.single        = NULL,
                .cant_stop                = true,
        },
        [CPUHP_PERF_PREPARE] = {
                .name                        = "perf:prepare",
                .startup.single                = perf_event_init_cpu,
                .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_RANDOM_PREPARE] = {
                .name                        = "random:prepare",
                .startup.single                = random_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_WORKQUEUE_PREP] = {
                .name                        = "workqueue:prepare",
                .startup.single                = workqueue_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_HRTIMERS_PREPARE] = {
                .name                        = "hrtimers:prepare",
                .startup.single                = hrtimers_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_SMPCFD_PREPARE] = {
                .name                        = "smpcfd:prepare",
                .startup.single                = smpcfd_prepare_cpu,
                .teardown.single        = smpcfd_dead_cpu,
        },
        [CPUHP_RELAY_PREPARE] = {
                .name                        = "relay:prepare",
                .startup.single                = relay_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_SLAB_PREPARE] = {
                .name                        = "slab:prepare",
                .startup.single                = slab_prepare_cpu,
                .teardown.single        = slab_dead_cpu,
        },
        [CPUHP_RCUTREE_PREP] = {
                .name                        = "RCU/tree:prepare",
                .startup.single                = rcutree_prepare_cpu,
                .teardown.single        = rcutree_dead_cpu,
        },
        /*
         * On the tear-down path, timers_dead_cpu() must be invoked
         * before blk_mq_queue_reinit_notify() from notify_dead(),
         * otherwise a RCU stall occurs.
         */
        [CPUHP_TIMERS_PREPARE] = {
                .name                        = "timers:prepare",
                .startup.single                = timers_prepare_cpu,
                .teardown.single        = timers_dead_cpu,
        },
        /* Kicks the plugged cpu into life */
        [CPUHP_BRINGUP_CPU] = {
                .name                        = "cpu:bringup",
                .startup.single                = bringup_cpu,
                .teardown.single        = finish_cpu,
                .cant_stop                = true,
        },
        /* Final state before CPU kills itself */
        [CPUHP_AP_IDLE_DEAD] = {
                .name                        = "idle:dead",
        },
        /*
         * Last state before CPU enters the idle loop to die. Transient state
         * for synchronization.
         */
        [CPUHP_AP_OFFLINE] = {
                .name                        = "ap:offline",
                .cant_stop                = true,
        },
        /* First state is scheduler control. Interrupts are disabled */
        [CPUHP_AP_SCHED_STARTING] = {
                .name                        = "sched:starting",
                .startup.single                = sched_cpu_starting,
                .teardown.single        = sched_cpu_dying,
        },
        [CPUHP_AP_RCUTREE_DYING] = {
                .name                        = "RCU/tree:dying",
                .startup.single                = NULL,
                .teardown.single        = rcutree_dying_cpu,
        },
        [CPUHP_AP_SMPCFD_DYING] = {
                .name                        = "smpcfd:dying",
                .startup.single                = NULL,
                .teardown.single        = smpcfd_dying_cpu,
        },
        [CPUHP_AP_HRTIMERS_DYING] = {
                .name                        = "hrtimers:dying",
                .startup.single                = NULL,
                .teardown.single        = hrtimers_cpu_dying,
        },

        /*
         * Entry state on starting. Interrupts enabled from here on. Transient
         * state for synchronization.
         */
        [CPUHP_AP_ONLINE] = {
                .name                        = "ap:online",
        },
        /*
         * Handled on control processor until the plugged processor manages
         * this itself.
         */
        [CPUHP_TEARDOWN_CPU] = {
                .name                        = "cpu:teardown",
                .startup.single                = NULL,
                .teardown.single        = takedown_cpu,
                .cant_stop                = true,
        },
        /* Handle smpboot threads park/unpark */
        [CPUHP_AP_SMPBOOT_THREADS] = {
                .name                        = "smpboot/threads:online",
                .startup.single                = smpboot_unpark_threads,
                .teardown.single        = smpboot_park_threads,
        },
        [CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
                .name                        = "irq/affinity:online",
                .startup.single                = irq_affinity_online_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_AP_PERF_ONLINE] = {
                .name                        = "perf:online",
                .startup.single                = perf_event_init_cpu,
                .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_AP_WATCHDOG_ONLINE] = {
                .name                        = "lockup_detector:online",
                .startup.single                = lockup_detector_online_cpu,
                .teardown.single        = lockup_detector_offline_cpu,
        },
        [CPUHP_AP_WORKQUEUE_ONLINE] = {
                .name                        = "workqueue:online",
                .startup.single                = workqueue_online_cpu,
                .teardown.single        = workqueue_offline_cpu,
        },
        [CPUHP_AP_RANDOM_ONLINE] = {
                .name                        = "random:online",
                .startup.single                = random_online_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_AP_RCUTREE_ONLINE] = {
                .name                        = "RCU/tree:online",
                .startup.single                = rcutree_online_cpu,
                .teardown.single        = rcutree_offline_cpu,
        },
#endif
        /*
         * The dynamically registered state space is here
         */

#ifdef CONFIG_SMP
        /* Last state is scheduler control setting the cpu active */
        [CPUHP_AP_ACTIVE] = {
                .name                        = "sched:active",
                .startup.single                = sched_cpu_activate,
                .teardown.single        = sched_cpu_deactivate,
        },
#endif

        /* CPU is fully up and running. */
        [CPUHP_ONLINE] = {
                .name                        = "online",
                .startup.single                = NULL,
                .teardown.single        = NULL,
        },
};

/* Sanity check for callbacks */
static int cpuhp_cb_check(enum cpuhp_state state)
{
        if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
                return -EINVAL;
        return 0;
}

/*
 * Returns a free for dynamic slot assignment of the Online state. The states
 * are protected by the cpuhp_slot_states mutex and an empty slot is identified
 * by having no name assigned.
 */
static int cpuhp_reserve_state(enum cpuhp_state state)
{
        enum cpuhp_state i, end;
        struct cpuhp_step *step;

        switch (state) {
        case CPUHP_AP_ONLINE_DYN:
                step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
                end = CPUHP_AP_ONLINE_DYN_END;
                break;
        case CPUHP_BP_PREPARE_DYN:
                step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
                end = CPUHP_BP_PREPARE_DYN_END;
                break;
        default:
                return -EINVAL;
        }

        for (i = state; i <= end; i++, step++) {
                if (!step->name)
                        return i;
        }
        WARN(1, "No more dynamic states available for CPU hotplug\n");
        return -ENOSPC;
}

/*
 * (Un)Install the callbacks of a state for further cpu hotplug operations.
 *
 * A non-NULL @name installs the callbacks, a NULL @name removes the state.
 * On install, CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are requests for
 * a slot from the respective dynamic range.
 *
 * Returns the allocated state number for a dynamic install, 0 otherwise,
 * -EBUSY if the state to install already has a name, or the error from
 * cpuhp_reserve_state().
 */
static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
                                 int (*startup)(unsigned int cpu),
                                 int (*teardown)(unsigned int cpu),
                                 bool multi_instance)
{
        bool dyn_request = state == CPUHP_AP_ONLINE_DYN ||
                           state == CPUHP_BP_PREPARE_DYN;
        struct cpuhp_step *step;
        int slot = 0;

        /*
         * Only allocate on install. The first allocation from a dynamic
         * range hands out CPUHP_AP_ONLINE_DYN / CPUHP_BP_PREPARE_DYN
         * itself, so allocating on removal as well would pick a new (empty)
         * slot and clear that one, leaving the callbacks of the state that
         * was meant to be removed dangling. That wrecks the next hotplug
         * operation.
         */
        if (name && dyn_request) {
                slot = cpuhp_reserve_state(state);
                if (slot < 0)
                        return slot;
                state = slot;
        }

        step = cpuhp_get_step(state);
        if (name && step->name)
                return -EBUSY;

        step->startup.single = startup;
        step->teardown.single = teardown;
        step->name = name;
        step->multi_instance = multi_instance;
        INIT_HLIST_HEAD(&step->list);

        return slot;
}

static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
{
        return cpuhp_get_step(state)->teardown.single;
}

/*
 * Invoke the startup (@bringup true) or teardown (@bringup false) function
 * of @state for @cpu, either on the AP or on the current CPU.
 *
 * Returns 0 when the step has no callback for the requested direction,
 * otherwise the callback result. A failing teardown is a BUG.
 */
static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
                            struct hlist_node *node)
{
        struct cpuhp_step *sp = cpuhp_get_step(state);
        bool have_cb;
        int ret;

        /*
         * Nothing to do without a callback for this direction.
         * Relies on the union for multi_instance.
         */
        have_cb = bringup ? sp->startup.single != NULL
                          : sp->teardown.single != NULL;
        if (!have_cb)
                return 0;

        /*
         * The non AP bound callbacks can fail on bringup. On teardown
         * e.g. module removal we crash for now.
         */
#ifdef CONFIG_SMP
        if (cpuhp_is_ap_state(state))
                ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
        else
                ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#else
        ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#endif
        BUG_ON(!bringup && ret);

        return ret;
}

/*
 * Undo a partially completed install: issue the teardown call of @state on
 * every present CPU that precedes @failedcpu in the iteration and whose
 * hotplug state is at or beyond @state.
 *
 * Called from __cpuhp_setup_state on a recoverable failure.
 *
 * Note: The teardown callbacks for rollback are not allowed to fail!
 */
static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
                                   struct hlist_node *node)
{
        int cpu;

        for_each_present_cpu(cpu) {
                int reached;

                /* CPUs from the failed one onwards were never set up. */
                if (cpu >= failedcpu)
                        break;

                /* Skip CPUs on which the startup call was not invoked. */
                reached = per_cpu_ptr(&cpuhp_state, cpu)->state;
                if (reached < state)
                        continue;

                cpuhp_issue_call(cpu, state, false, node);
        }
}

/*
 * Add the instance @node to the multi-instance state @state.
 *
 * With @invoke set and a startup callback installed, the callback is first
 * issued for @node on every present CPU whose hotplug state is at or beyond
 * @state. If one of these calls fails, the CPUs handled so far are rolled
 * back (when a teardown callback exists) and @node is not added.
 *
 * The caller must hold the cpus lock. Returns 0 on success, -EINVAL if
 * @state is not a multi-instance state, or the startup callback's error.
 */
int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
                                          struct hlist_node *node,
                                          bool invoke)
{
        struct cpuhp_step *sp;
        int cpu;
        int ret;

        lockdep_assert_cpus_held();

        sp = cpuhp_get_step(state);
        if (!sp->multi_instance)
                return -EINVAL;

        mutex_lock(&cpuhp_state_mutex);

        if (invoke && sp->startup.multi) {
                for_each_present_cpu(cpu) {
                        int cpustate = per_cpu_ptr(&cpuhp_state, cpu)->state;

                        /* This CPU has not reached @state yet. */
                        if (cpustate < state)
                                continue;

                        ret = cpuhp_issue_call(cpu, state, true, node);
                        if (!ret)
                                continue;

                        if (sp->teardown.multi)
                                cpuhp_rollback_install(cpu, state, node);
                        goto unlock;
                }
        }

        ret = 0;
        hlist_add_head(node, &sp->list);
unlock:
        mutex_unlock(&cpuhp_state_mutex);
        return ret;
}

/*
 * Variant of __cpuhp_state_add_instance_cpuslocked() for callers that do
 * not hold the cpus lock: takes it for reading around the call and returns
 * the result unchanged.
 */
int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
                               bool invoke)
{
        int err;

        cpus_read_lock();
        err = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
        cpus_read_unlock();

        return err;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);

/**
 * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
 * @state:                The state to setup
 * @name:                Name of the state; must not be NULL
 * @invoke:                If true, the startup function is invoked for cpus where
 *                        cpu state >= @state
 * @startup:                startup callback function
 * @teardown:                teardown callback function
 * @multi_instance:        State is set up for multiple instances which get
 *                        added afterwards.
 *
 * The caller needs to hold cpus read locked while calling this function.
 *
 * If a startup call fails, the CPUs handled so far are rolled back (when
 * @teardown is set) and the state is removed again.
 *
 * Returns:
 *   On success:
 *      Positive state number if @state is CPUHP_AP_ONLINE_DYN or
 *      CPUHP_BP_PREPARE_DYN
 *      0 for all other states
 *   On failure: proper (negative) error code
 */
int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
                                   const char *name, bool invoke,
                                   int (*startup)(unsigned int cpu),
                                   int (*teardown)(unsigned int cpu),
                                   bool multi_instance)
{
        int cpu, ret = 0;
        bool dynstate;

        lockdep_assert_cpus_held();

        if (cpuhp_cb_check(state) || !name)
                return -EINVAL;

        mutex_lock(&cpuhp_state_mutex);

        ret = cpuhp_store_callbacks(state, name, startup, teardown,
                                    multi_instance);

        /*
         * Both dynamic ranges return the allocated state as a positive
         * value. Treat them alike: without this a CPUHP_BP_PREPARE_DYN
         * request would leave the positive value in ret, jump to "out"
         * below and never invoke the startup callbacks.
         */
        dynstate = state == CPUHP_AP_ONLINE_DYN ||
                   state == CPUHP_BP_PREPARE_DYN;
        if (ret > 0 && dynstate) {
                state = ret;
                ret = 0;
        }

        if (ret || !invoke || !startup)
                goto out;

        /*
         * Try to call the startup callback for each present cpu
         * depending on the hotplug state of the cpu.
         */
        for_each_present_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int cpustate = st->state;

                if (cpustate < state)
                        continue;

                ret = cpuhp_issue_call(cpu, state, true, NULL);
                if (ret) {
                        if (teardown)
                                cpuhp_rollback_install(cpu, state, NULL);
                        /* Uninstall again; @state is the allocated slot. */
                        cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
                        goto out;
                }
        }
out:
        mutex_unlock(&cpuhp_state_mutex);
        /*
         * If the requested state is CPUHP_AP_ONLINE_DYN or
         * CPUHP_BP_PREPARE_DYN, return the dynamically allocated state in
         * case of success.
         */
        if (!ret && dynstate)
                return state;
        return ret;
}
EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);

/*
 * Variant of __cpuhp_setup_state_cpuslocked() for callers that do not hold
 * the cpus lock: takes it for reading around the call and returns the
 * result unchanged.
 */
int __cpuhp_setup_state(enum cpuhp_state state,
                        const char *name, bool invoke,
                        int (*startup)(unsigned int cpu),
                        int (*teardown)(unsigned int cpu),
                        bool multi_instance)
{
        int err;

        cpus_read_lock();
        err = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
                                             teardown, multi_instance);
        cpus_read_unlock();

        return err;
}
EXPORT_SYMBOL(__cpuhp_setup_state);

/*
 * Remove the instance @node from the multi-instance state @state.
 *
 * With @invoke set and a teardown callback installed, the callback is
 * issued for @node on every present CPU whose hotplug state is at or beyond
 * @state before the node is unlinked.
 *
 * Returns 0 on success or -EINVAL if @state is not a multi-instance state.
 * An invalid @state is a BUG.
 */
int __cpuhp_state_remove_instance(enum cpuhp_state state,
                                  struct hlist_node *node, bool invoke)
{
        struct cpuhp_step *sp = cpuhp_get_step(state);
        int cpu;

        BUG_ON(cpuhp_cb_check(state));

        if (!sp->multi_instance)
                return -EINVAL;

        cpus_read_lock();
        mutex_lock(&cpuhp_state_mutex);

        if (invoke && cpuhp_get_teardown_cb(state)) {
                /*
                 * Tear the instance down on each present cpu depending on
                 * the hotplug state of the cpu. This is currently not
                 * allowed to fail!
                 */
                for_each_present_cpu(cpu) {
                        int cpustate = per_cpu_ptr(&cpuhp_state, cpu)->state;

                        if (cpustate < state)
                                continue;
                        cpuhp_issue_call(cpu, state, false, node);
                }
        }

        hlist_del(node);
        mutex_unlock(&cpuhp_state_mutex);
        cpus_read_unlock();

        return 0;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);

/**
 * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
 * @state:        The state to remove
 * @invoke:        If true, the teardown function is invoked for cpus where
 *                cpu state >= @state
 *
 * The caller needs to hold cpus read locked while calling this function.
 * The teardown callback is currently not allowed to fail. Think
 * about module removal!
 *
 * For a multi-instance state no teardown call is issued here; a warning is
 * printed if instances are still attached.
 */
void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
{
        struct cpuhp_step *sp = cpuhp_get_step(state);
        int cpu;

        BUG_ON(cpuhp_cb_check(state));

        lockdep_assert_cpus_held();

        mutex_lock(&cpuhp_state_mutex);

        if (sp->multi_instance) {
                WARN(!hlist_empty(&sp->list),
                     "Error: Removing state %d which has instances left.\n",
                     state);
        } else if (invoke && cpuhp_get_teardown_cb(state)) {
                /*
                 * Tear the state down on each present cpu depending on the
                 * hotplug state of the cpu. This is currently not allowed
                 * to fail!
                 */
                for_each_present_cpu(cpu) {
                        int cpustate = per_cpu_ptr(&cpuhp_state, cpu)->state;

                        if (cpustate < state)
                                continue;
                        cpuhp_issue_call(cpu, state, false, NULL);
                }
        }

        /* A NULL name uninstalls the state. */
        cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
        mutex_unlock(&cpuhp_state_mutex);
}
EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);

/*
 * Variant of __cpuhp_remove_state_cpuslocked() for callers that do not hold
 * the cpus lock: takes it for reading around the call.
 */
void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
{
        cpus_read_lock();
        __cpuhp_remove_state_cpuslocked(state, invoke);
        cpus_read_unlock();
}
EXPORT_SYMBOL(__cpuhp_remove_state);

#ifdef CONFIG_HOTPLUG_SMT
/* Mark the device of @cpu offline and emit a KOBJ_OFFLINE uevent for it. */
static void cpuhp_offline_cpu_device(unsigned int cpu)
{
        struct device *cpu_dev = get_cpu_device(cpu);

        cpu_dev->offline = true;

        /* Let user space know that the state changed */
        kobject_uevent(&cpu_dev->kobj, KOBJ_OFFLINE);
}

/* Mark the device of @cpu online and emit a KOBJ_ONLINE uevent for it. */
static void cpuhp_online_cpu_device(unsigned int cpu)
{
        struct device *cpu_dev = get_cpu_device(cpu);

        cpu_dev->offline = false;

        /* Let user space know that the state changed */
        kobject_uevent(&cpu_dev->kobj, KOBJ_ONLINE);
}

/*
 * Take every online cpu that is not a primary thread down to CPUHP_OFFLINE
 * and, if all of them went down, record @ctrlval in cpu_smt_control.
 * Returns 0 on success or the first error from cpu_down_maps_locked().
 */
int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
        int cpu, err = 0;

        cpu_maps_update_begin();
        for_each_online_cpu(cpu) {
                if (!topology_is_primary_thread(cpu)) {
                        err = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
                        if (err)
                                break;
                        /*
                         * The cpu maps lock must be held here, which makes
                         * it impossible to call device_offline(): that ends
                         * up in cpu_down(), which takes the cpu maps lock
                         * itself. The lock is needed because this might
                         * race against in kernel abusers of the hotplug
                         * machinery (thermal management).
                         *
                         * So nothing would update device:offline state,
                         * leaving the sysfs entry stale and preventing
                         * onlining after smt control has been changed to
                         * 'off' again. This is called under the sysfs
                         * hotplug lock, so it is properly serialized
                         * against the regular offline usage.
                         */
                        cpuhp_offline_cpu_device(cpu);
                }
        }

        /* Only commit the new control value when no cpu failed to go down. */
        if (err == 0)
                cpu_smt_control = ctrlval;

        cpu_maps_update_done();
        return err;
}

/*
 * Set cpu_smt_control to CPU_SMT_ENABLED and bring up every present cpu
 * that is offline and sits on an online node. Returns 0 on success or the
 * first error from _cpu_up(); the control value is not rolled back.
 */
int cpuhp_smt_enable(void)
{
        int cpu, err = 0;

        cpu_maps_update_begin();
        cpu_smt_control = CPU_SMT_ENABLED;
        for_each_present_cpu(cpu) {
                /* Only offline CPUs on online nodes are brought up */
                if (!cpu_online(cpu) && node_online(cpu_to_node(cpu))) {
                        err = _cpu_up(cpu, 0, CPUHP_ONLINE);
                        if (err)
                                break;
                        /* See comment in cpuhp_smt_disable() */
                        cpuhp_online_cpu_device(cpu);
                }
        }
        cpu_maps_update_done();
        return err;
}
#endif

#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
static ssize_t show_cpuhp_state(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sprintf(buf, "%d\n", st->state);
}
static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);

/*
 * sysfs write handler for the "target" attribute: parse a decimal hotplug
 * state and move the cpu up or down to it. Returns @count on success or a
 * negative error code.
 */
static ssize_t write_cpuhp_target(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
        struct cpuhp_step *step;
        int target, err;

        err = kstrtoint(buf, 10, &target);
        if (err)
                return err;

        /*
         * With state control enabled any state in the range is a candidate,
         * otherwise only the two end points are accepted.
         */
#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
        if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
                return -EINVAL;
#else
        if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
                return -EINVAL;
#endif

        err = lock_device_hotplug_sysfs();
        if (err)
                return err;

        /* Reject states that have no name or are flagged cant_stop. */
        mutex_lock(&cpuhp_state_mutex);
        step = cpuhp_get_step(target);
        if (!step->name || step->cant_stop)
                err = -EINVAL;
        mutex_unlock(&cpuhp_state_mutex);

        if (!err) {
                if (st->state < target)
                        err = cpu_up(dev->id, target);
                else if (st->state > target)
                        err = cpu_down(dev->id, target);
                else if (WARN_ON(st->target != target))
                        st->target = target;
        }

        unlock_device_hotplug();
        return err ? err : count;
}

static ssize_t show_cpuhp_target(struct device *dev,
                                 struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sprintf(buf, "%d\n", st->target);
}
static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);


/*
 * sysfs write handler for the "fail" attribute: parse a decimal hotplug
 * state and store it in the per-cpu ->fail field. Returns @count on
 * success or a negative error code.
 */
static ssize_t write_cpuhp_fail(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t count)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
        struct cpuhp_step *step;
        bool has_callback;
        int fail, ret;

        ret = kstrtoint(buf, 10, &fail);
        if (ret)
                return ret;

        /*
         * The state must be inside the valid range, and STARTING/DYING
         * callbacks cannot be failed.
         */
        if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE ||
            cpuhp_is_atomic_state(fail))
                return -EINVAL;

        /*
         * Cannot fail anything that doesn't have callbacks.
         */
        mutex_lock(&cpuhp_state_mutex);
        step = cpuhp_get_step(fail);
        has_callback = step->startup.single || step->teardown.single;
        mutex_unlock(&cpuhp_state_mutex);
        if (!has_callback)
                return -EINVAL;

        st->fail = fail;

        return count;
}

static ssize_t show_cpuhp_fail(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sprintf(buf, "%d\n", st->fail);
}

static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);

/* NULL-terminated list of the state/target/fail device attributes. */
static struct attribute *cpuhp_cpu_attrs[] = {
        &dev_attr_state.attr,
        &dev_attr_target.attr,
        &dev_attr_fail.attr,
        NULL
};

/*
 * Attribute group named "hotplug" carrying cpuhp_cpu_attrs. All members
 * that are not named here are zero-initialized, so no trailing positional
 * initializer is needed.
 */
static const struct attribute_group cpuhp_cpu_attr_group = {
        .attrs = cpuhp_cpu_attrs,
        .name = "hotplug",
};

static ssize_t show_cpuhp_states(struct device *dev,
                                 struct device_attribute *attr, char *buf)
{
        ssize_t cur, res = 0;
        int i;

        mutex_lock(&cpuhp_state_mutex);
        for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
                struct cpuhp_step *sp = cpuhp_get_step(i);

                if (sp->name) {
                        cur = sprintf(buf, "%3d: %s\n", i, sp->name);
                        buf += cur;
                        res += cur;
                }
        }
        mutex_unlock(&cpuhp_state_mutex);
        return res;
}
static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);

/* NULL-terminated list holding only the "states" device attribute. */
static struct attribute *cpuhp_cpu_root_attrs[] = {
        &dev_attr_states.attr,
        NULL
};

/*
 * Attribute group named "hotplug" carrying cpuhp_cpu_root_attrs. All
 * members that are not named here are zero-initialized, so no trailing
 * positional initializer is needed.
 */
static const struct attribute_group cpuhp_cpu_root_attr_group = {
        .attrs = cpuhp_cpu_root_attrs,
        .name = "hotplug",
};

#ifdef CONFIG_HOTPLUG_SMT

/*
 * Parse "on" / "off" / "forceoff" from @buf and switch SMT accordingly.
 * Returns @count on success, -EINVAL for any other string, -EPERM when SMT
 * is already force disabled, -ENODEV when it is not supported, or the
 * error from taking the hotplug lock or from the enable/disable operation.
 */
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        int ctrlval, ret;

        if (sysfs_streq(buf, "on")) {
                ctrlval = CPU_SMT_ENABLED;
        } else if (sysfs_streq(buf, "off")) {
                ctrlval = CPU_SMT_DISABLED;
        } else if (sysfs_streq(buf, "forceoff")) {
                ctrlval = CPU_SMT_FORCE_DISABLED;
        } else {
                return -EINVAL;
        }

        /* These two control states cannot be left through this interface. */
        if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
                return -EPERM;

        if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
                return -ENODEV;

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        /* Nothing to do when the requested value is already in effect. */
        if (ctrlval != cpu_smt_control) {
                if (ctrlval == CPU_SMT_ENABLED)
                        ret = cpuhp_smt_enable();
                else if (ctrlval == CPU_SMT_DISABLED ||
                         ctrlval == CPU_SMT_FORCE_DISABLED)
                        ret = cpuhp_smt_disable(ctrlval);
        }

        unlock_device_hotplug();
        return ret ? ret : count;
}

#else /* !CONFIG_HOTPLUG_SMT */
/* Without CONFIG_HOTPLUG_SMT every write to the control file is refused. */
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        return -ENODEV;
}
#endif /* CONFIG_HOTPLUG_SMT */

/* Strings reported for each SMT control value, indexed by that value. */
static const char *smt_states[] = {
        [CPU_SMT_ENABLED]                = "on",
        [CPU_SMT_DISABLED]                = "off",
        [CPU_SMT_FORCE_DISABLED]        = "forceoff",
        [CPU_SMT_NOT_SUPPORTED]                = "notsupported",
        [CPU_SMT_NOT_IMPLEMENTED]        = "notimplemented",
};

/* sysfs read handler: print the string for the current cpu_smt_control. */
static ssize_t
show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
{
        return snprintf(buf, PAGE_SIZE - 2, "%s\n",
                        smt_states[cpu_smt_control]);
}

/*
 * sysfs write handler for the SMT "control" attribute; forwards to
 * __store_smt_control(), whose implementation depends on CONFIG_HOTPLUG_SMT.
 */
static ssize_t
store_smt_control(struct device *dev, struct device_attribute *attr,
                  const char *buf, size_t count)
{
        return __store_smt_control(dev, attr, buf, count);
}
static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);

/* sysfs read handler: print the value of sched_smt_active() as a decimal. */
static ssize_t
show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
{
        return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
}
static DEVICE_ATTR(active, 0444, show_smt_active, NULL);

/* NULL-terminated list of the SMT "control" and "active" attributes. */
static struct attribute *cpuhp_smt_attrs[] = {
        &dev_attr_control.attr,
        &dev_attr_active.attr,
        NULL
};

/*
 * Attribute group named "smt" carrying cpuhp_smt_attrs. All members that
 * are not named here are zero-initialized, so no trailing positional
 * initializer is needed.
 */
static const struct attribute_group cpuhp_smt_attr_group = {
        .attrs = cpuhp_smt_attrs,
        .name = "smt",
};

/* Create the "smt" attribute group on the cpu subsystem root device. */
static int __init cpu_smt_sysfs_init(void)
{
        struct kobject *root = &cpu_subsys.dev_root->kobj;

        return sysfs_create_group(root, &cpuhp_smt_attr_group);
}

/*
 * Register the hotplug sysfs interface: the "smt" group and the root
 * "hotplug" group on the cpu subsystem root device, then the per-cpu
 * "hotplug" group on every possible cpu that has a device. Returns 0 or
 * the first error; groups created before a failure are left in place.
 */
static int __init cpuhp_sysfs_init(void)
{
        int cpu, err;

        err = cpu_smt_sysfs_init();
        if (err)
                return err;

        err = sysfs_create_group(&cpu_subsys.dev_root->kobj,
                                 &cpuhp_cpu_root_attr_group);
        if (err)
                return err;

        for_each_possible_cpu(cpu) {
                struct device *cpu_dev = get_cpu_device(cpu);

                /* Possible cpus without a device are skipped. */
                if (cpu_dev) {
                        err = sysfs_create_group(&cpu_dev->kobj,
                                                 &cpuhp_cpu_attr_group);
                        if (err)
                                return err;
                }
        }
        return 0;
}
device_initcall(cpuhp_sysfs_init);
#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */

/*
 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 * represents, for every bit number nr below NR_CPUS, the value 1<<nr.
 *
 * It is used by cpumask_of() to get a constant address to a CPU
 * mask value that has a single bit set only.
 */

/* cpu_bit_bitmap[0] is empty - so we can back into it */
/*
 * MASK_DECLARE_1(x) initializes word 0 of row x+1 to a single set bit x;
 * the _2/_4/_8 variants expand to 2, 4 and 8 consecutive such rows.
 */
#define MASK_DECLARE_1(x)        [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x)        MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x)        MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x)        MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)

/*
 * BITS_PER_LONG+1 rows: row 0 is left zeroed, rows 1..BITS_PER_LONG each
 * have exactly one bit set in their first word. Only the first 32 rows are
 * declared unless BITS_PER_LONG is larger than 32.
 */
const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {

        MASK_DECLARE_8(0),        MASK_DECLARE_8(8),
        MASK_DECLARE_8(16),        MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
        MASK_DECLARE_8(32),        MASK_DECLARE_8(40),
        MASK_DECLARE_8(48),        MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);

const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);

#ifdef CONFIG_INIT_ALL_POSSIBLE
struct cpumask __cpu_possible_mask __read_mostly
        = {CPU_BITS_ALL};
#else
struct cpumask __cpu_possible_mask __read_mostly;
#endif
EXPORT_SYMBOL(__cpu_possible_mask);

struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);

struct cpumask __cpu_present_mask __read_mostly;
EXPORT_SYMBOL(__cpu_present_mask);

struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);

atomic_t __num_online_cpus __read_mostly;
EXPORT_SYMBOL(__num_online_cpus);

/* Overwrite the global present mask with a copy of @src. */
void init_cpu_present(const struct cpumask *src)
{
        cpumask_copy(&__cpu_present_mask, src);
}

/* Overwrite the global possible mask with a copy of @src. */
void init_cpu_possible(const struct cpumask *src)
{
        cpumask_copy(&__cpu_possible_mask, src);
}

/*
 * Overwrite the global online mask with a copy of @src. Note that
 * __num_online_cpus is not touched here.
 */
void init_cpu_online(const struct cpumask *src)
{
        cpumask_copy(&__cpu_online_mask, src);
}

/*
 * Set or clear @cpu in the online mask and keep __num_online_cpus in step:
 * the counter only changes when the mask bit actually flipped.
 */
void set_cpu_online(unsigned int cpu, bool online)
{
        /*
         * atomic_inc/dec() is required to handle the horrid abuse of this
         * function by the reboot and kexec code which invoke it from
         * IPI/NMI broadcasts when shutting down CPUs. Invocation from
         * regular CPU hotplug is properly serialized.
         *
         * Note, that the fact that __num_online_cpus is of type atomic_t
         * does not protect readers which are not serialized against
         * concurrent hotplug operations.
         */
        if (online) {
                if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
                        atomic_inc(&__num_online_cpus);
                return;
        }

        if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
                atomic_dec(&__num_online_cpus);
}

/*
 * Activate the first processor.
 */
void __init boot_cpu_init(void)
{
        int id = smp_processor_id();

        /*
         * The boot cpu is flagged in all four masks (online, active,
         * present, possible), for both the SMP and the UP case.
         */
        set_cpu_online(id, true);
        set_cpu_active(id, true);
        set_cpu_present(id, true);
        set_cpu_possible(id, true);

        /* Remember which cpu booted the system; only tracked on SMP. */
#ifdef CONFIG_SMP
        __boot_cpu_id = id;
#endif
}

/*
 * Must be called _AFTER_ setting up the per_cpu areas
 */
void __init boot_cpu_hotplug_init(void)
{
        /* On SMP, record the running cpu in cpus_booted_once_mask. */
#ifdef CONFIG_SMP
        cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
#endif
        /* The per-cpu write below is why the per_cpu areas must exist. */
        this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}

/*
 * These are used for a global "mitigations=" cmdline option for toggling
 * optional CPU mitigations.
 */
/* One value per accepted "mitigations=" argument: off, auto, auto,nosmt. */
enum cpu_mitigations {
        CPU_MITIGATIONS_OFF,
        CPU_MITIGATIONS_AUTO,
        CPU_MITIGATIONS_AUTO_NOSMT,
};

/* Selected mode; stays at AUTO unless the command line overrides it. */
static enum cpu_mitigations cpu_mitigations __ro_after_init =
        CPU_MITIGATIONS_AUTO;

/*
 * Early parameter handler for "mitigations=". Recognized values update
 * cpu_mitigations; anything else is reported via pr_crit() and leaves the
 * current setting untouched. Always returns 0.
 */
static int __init mitigations_parse_cmdline(char *arg)
{
        if (strcmp(arg, "off") == 0) {
                cpu_mitigations = CPU_MITIGATIONS_OFF;
        } else if (strcmp(arg, "auto") == 0) {
                cpu_mitigations = CPU_MITIGATIONS_AUTO;
        } else if (strcmp(arg, "auto,nosmt") == 0) {
                cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
        } else {
                pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
                        arg);
        }

        return 0;
}
early_param("mitigations", mitigations_parse_cmdline);

/* mitigations=off: true when cpu_mitigations is CPU_MITIGATIONS_OFF */
bool cpu_mitigations_off(void)
{
        return cpu_mitigations == CPU_MITIGATIONS_OFF;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_off);

/* mitigations=auto,nosmt: true when cpu_mitigations is CPU_MITIGATIONS_AUTO_NOSMT */
bool cpu_mitigations_auto_nosmt(void)
{
        return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_STACK_H
#define _LINUX_SCHED_TASK_STACK_H

/*
 * task->stack (kernel stack) handling interfaces:
 */

#include <linux/sched.h>
#include <linux/magic.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK

/*
 * When accessing the stack of a non-current task that might exit, use
 * try_get_task_stack() instead.  task_stack_page will return a pointer
 * that could get freed out from under you.
 */
static __always_inline void *task_stack_page(const struct task_struct *task)
{
        /* Plain read of the pointer; no reference on the stack is taken. */
        return task->stack;
}

#define setup_thread_stack(new,old)        do { } while(0)

/*
 * Return the last usable long of @task's stack: the highest long inside
 * the THREAD_SIZE area when the stack grows up, the base address otherwise.
 */
static __always_inline unsigned long *end_of_stack(const struct task_struct *task)
{
#ifdef CONFIG_STACK_GROWSUP
        unsigned long top = (unsigned long)task->stack + THREAD_SIZE;

        return (unsigned long *)top - 1;
#else
        return task->stack;
#endif
}

#elif !defined(__HAVE_THREAD_FUNCTIONS)

#define task_stack_page(task)        ((void *)(task)->stack)

static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
{
        *task_thread_info(p) = *task_thread_info(org);
        task_thread_info(p)->task = p;
}

/*
 * Return the address of the last usable long on the stack.
 *
 * When the stack grows down, this is just above the thread
 * info struct. Going any lower will corrupt the threadinfo.
 *
 * When the stack grows up, this is the highest address.
 * Beyond that position, we corrupt data on the next page.
 */
static inline unsigned long *end_of_stack(struct task_struct *p)
{
#ifdef CONFIG_STACK_GROWSUP
        /* Highest long inside the THREAD_SIZE area that starts at thread_info. */
        unsigned long top = (unsigned long)task_thread_info(p) + THREAD_SIZE;

        return (unsigned long *)top - 1;
#else
        /* First long right behind the thread_info struct. */
        return (unsigned long *)(task_thread_info(p) + 1);
#endif
}

#endif

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Take a reference on @tsk's stack. Returns the stack page on success or
 * NULL when the stack refcount has already dropped to zero.
 */
static inline void *try_get_task_stack(struct task_struct *tsk)
{
        if (!refcount_inc_not_zero(&tsk->stack_refcount))
                return NULL;

        return task_stack_page(tsk);
}

extern void put_task_stack(struct task_struct *tsk);
#else
/*
 * Without CONFIG_THREAD_INFO_IN_TASK no stack refcount is used here:
 * getting the stack just returns the stack page.
 */
static inline void *try_get_task_stack(struct task_struct *tsk)
{
        return task_stack_page(tsk);
}

/* Counterpart of try_get_task_stack(); a no-op in this configuration. */
static inline void put_task_stack(struct task_struct *tsk) {}
#endif

/* True when the word at end_of_stack(task) no longer holds STACK_END_MAGIC. */
#define task_stack_end_corrupted(task) \
                (*(end_of_stack(task)) != STACK_END_MAGIC)

/*
 * Return 1 when @obj points into the THREAD_SIZE bytes starting at the
 * current task's stack page, 0 otherwise.
 */
static inline int object_is_on_stack(const void *obj)
{
        void *base = task_stack_page(current);

        if (obj < base)
                return 0;

        return obj < (base + THREAD_SIZE);
}

extern void thread_stack_cache_init(void);

#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Return the distance in bytes between end_of_stack(@p) and the first
 * non-zero word found when scanning from there into the stack.
 */
static inline unsigned long stack_not_used(struct task_struct *p)
{
        unsigned long *limit = end_of_stack(p);
        unsigned long *probe = limit;

        /* The first step skips over the canary; then walk while words are zero */
        do {
# ifdef CONFIG_STACK_GROWSUP
                probe--;
# else
                probe++;
# endif
        } while (!*probe);

# ifdef CONFIG_STACK_GROWSUP
        return (unsigned long)limit - (unsigned long)probe;
# else
        return (unsigned long)probe - (unsigned long)limit;
# endif
}
#endif
extern void set_task_stack_end_magic(struct task_struct *tsk);

#ifndef __HAVE_ARCH_KSTACK_END
static inline int kstack_end(void *addr)
{
        /*
         * Reliable end of stack detection:
         * Some APM bios versions misalign the stack
         */
        unsigned long last_byte = (unsigned long)addr + sizeof(void *) - 1;
        unsigned long mask = THREAD_SIZE - sizeof(void *);

        return !(last_byte & mask);
}
#endif

#endif /* _LINUX_SCHED_TASK_STACK_H */
























    1 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_BL_H
#define _LINUX_RCULIST_BL_H

/*
 * RCU-protected bl list version. See include/linux/list_bl.h.
 */
#include <linux/list_bl.h>
#include <linux/rcupdate.h>

/*
 * Publish @n as the first node of @h with rcu_assign_pointer(), keeping
 * the lock bit set in the stored head pointer. @n must not carry lock-mask
 * bits and the head must currently have the lock bit set.
 */
static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        unsigned long tagged = (unsigned long)n | LIST_BL_LOCKMASK;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                                                        LIST_BL_LOCKMASK);
        rcu_assign_pointer(h->first, (struct hlist_bl_node *)tagged);
}

/*
 * Read the first node of @h with rcu_dereference_check() (holding the
 * list lock also satisfies the check) and strip the lock-mask bits.
 */
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
        struct hlist_bl_node *raw;

        raw = rcu_dereference_check(h->first, hlist_bl_is_locked(h));

        return (struct hlist_bl_node *)((unsigned long)raw & ~LIST_BL_LOCKMASK);
}

/**
 * hlist_bl_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_bl_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry().
 */
static inline void hlist_bl_del_rcu(struct hlist_bl_node *n)
{
        __hlist_bl_del(n);
        /* Only the back pointer is poisoned; ->next is not written here. */
        n->pprev = LIST_POISON2;
}

/**
 * hlist_bl_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_bl,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        /* we're under lock, so the plain accessor is enough here */
        struct hlist_bl_node *old_first = hlist_bl_first(h);

        /* Fully link @n in front of the old head before publishing it. */
        n->next = old_first;
        if (old_first)
                old_first->pprev = &n->next;
        n->pprev = &h->first;

        /* concurrent lock free readers may exist, hence the _rcu store */
        hlist_bl_set_first_rcu(h, n);
}
/**
 * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * Starts at hlist_bl_first_rcu(@head) and advances through ->next with
 * rcu_dereference_raw(); the loop ends when @pos becomes NULL. @tpos is
 * reassigned from @pos on every iteration.
 */
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member)                \
        for (pos = hlist_bl_first_rcu(head);                                \
                pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
                pos = rcu_dereference_raw(pos->next))

#endif






























































































































































































































    1 


    1 




















    1 















    1 


    1 




















































































































































































































































    1 





    1 
















    1 

    1 










    1 


    1 







    1 


    1 
    1 


    1 



    1 




















    1 





















































    1 









    1 
    1 
    1 
    1 




















































































































































































































    1 





    1 




    1 













    1 










    1 
    1 

























































































    1 




    1 










































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>

#include "internal.h"

/* Initial value of sysctl_nr_open: 1024*1024. */
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
/* Lower bound: BITS_PER_LONG. */
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
/*
 * Upper bound: the smaller of INT_MAX and the number of pointers that fit
 * in a size_t, masked with -BITS_PER_LONG to clear the low bits.
 */
unsigned int sysctl_nr_open_max =
        __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

/*
 * Release an fdtable: the fd array, the open_fds bitmap and then the
 * struct itself.
 */
static void __free_fdtable(struct fdtable *fdt)
{
        kvfree(fdt->fd);
        /*
         * NOTE(review): close_on_exec and full_fds_bits are not freed
         * separately; presumably they share the open_fds allocation -
         * confirm against the allocation site.
         */
        kvfree(fdt->open_fds);
        kfree(fdt);
}

/* RCU callback: free the fdtable that embeds @rcu. */
static void free_fdtable_rcu(struct rcu_head *rcu)
{
        struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);

        __free_fdtable(fdt);
}

#define BITBIT_NR(nr)        BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)        (BITBIT_NR(nr) * sizeof(long))

/*
 * Transfer the first 'count' bits of every fd bitmap (open_fds,
 * close_on_exec and the full_fds_bits summary) from ofdt to nfdt, then
 * zero whatever remains of nfdt's bitmaps.  The file pointers themselves
 * are not copied.  Called with the files spinlock held for write.
 */
static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
                            unsigned int count)
{
        /* byte counts for the per-fd bitmaps */
        unsigned int copied = count / BITS_PER_BYTE;
        unsigned int zeroed = (nfdt->max_fds - count) / BITS_PER_BYTE;

        memcpy(nfdt->open_fds, ofdt->open_fds, copied);
        memset((char *)nfdt->open_fds + copied, 0, zeroed);

        memcpy(nfdt->close_on_exec, ofdt->close_on_exec, copied);
        memset((char *)nfdt->close_on_exec + copied, 0, zeroed);

        /* byte counts for the one-bit-per-word summary bitmap */
        copied = BITBIT_SIZE(count);
        zeroed = BITBIT_SIZE(nfdt->max_fds) - copied;

        memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, copied);
        memset((char *)nfdt->full_fds_bits + copied, 0, zeroed);
}

/*
 * Populate the new, expanded table from the old one: copy every file
 * pointer slot of ofdt, zero the additional slots of nfdt, then do the
 * same for the bitmaps.  nfdt must not be smaller than ofdt.  Called with
 * the files spinlock held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
        size_t old_bytes, extra_bytes;

        BUG_ON(nfdt->max_fds < ofdt->max_fds);

        old_bytes = ofdt->max_fds * sizeof(struct file *);
        extra_bytes = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);

        memcpy(nfdt->fd, ofdt->fd, old_bytes);
        memset((char *)nfdt->fd + old_bytes, 0, extra_bytes);

        copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}

/*
 * Allocate an fdtable sized to hold descriptor number @nr.
 *
 * Returns the new table, or NULL if any allocation fails.  The fd array
 * and the bitmaps are returned uninitialised by this function; the callers
 * in this file fill them in (copy_fdtable(), or copy_fd_bitmaps() plus an
 * explicit memset in dup_fd()).  The resulting ->max_fds is normally
 * greater than @nr, but may be smaller when sysctl_nr_open caps it - the
 * callers check for that.
 *
 * Note how the fdtable bitmap allocations very much have to be a multiple of
 * BITS_PER_LONG. This is not only because we walk those things in chunks of
 * 'unsigned long' in some places, but simply because that is how the Linux
 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
 * they are very much "bits in an array of unsigned long".
 *
 * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
 * by that "1024/sizeof(ptr)" before, we already know there are sufficient
 * clear low bits. Clang seems to realize that, gcc ends up being confused.
 *
 * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
 * let's consider it documentation (and maybe a test-case for gcc to improve
 * its code generation ;)
 */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
        struct fdtable *fdt;
        void *data;

        /*
         * Figure out how many fds we actually want to support in this fdtable.
         * Allocation steps are keyed to the size of the fdarray, since it
         * grows far faster than any of the other dynamic data. We try to fit
         * the fdarray into comfortable page-tuned chunks: starting at 1024B
         * and growing in powers of two from there on.
         */
        /* nr -> number of whole 1024-byte chunks, rounded up to 2^k, -> slots */
        nr /= (1024 / sizeof(struct file *));
        nr = roundup_pow_of_two(nr + 1);
        nr *= (1024 / sizeof(struct file *));
        nr = ALIGN(nr, BITS_PER_LONG);
        /*
         * Note that this can drive nr *below* what we had passed if sysctl_nr_open
         * had been set lower between the check in expand_files() and here.  Deal
         * with that in caller, it's cheaper that way.
         *
         * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
         * bitmaps handling below becomes unpleasant, to put it mildly...
         */
        if (unlikely(nr > sysctl_nr_open))
                /* sysctl_nr_open rounded up to a multiple of BITS_PER_LONG */
                nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

        fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
        if (!fdt)
                goto out;
        fdt->max_fds = nr;
        data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_fdt;
        fdt->fd = data;

        /*
         * One buffer backs all three bitmaps: open_fds and close_on_exec
         * take nr / BITS_PER_BYTE bytes each, full_fds_bits takes
         * BITBIT_SIZE(nr).  At least L1_CACHE_BYTES are requested.
         */
        data = kvmalloc(max_t(size_t,
                                 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
                                 GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_arr;
        /* carve the buffer up; __free_fdtable() frees it via ->open_fds */
        fdt->open_fds = data;
        data += nr / BITS_PER_BYTE;
        fdt->close_on_exec = data;
        data += nr / BITS_PER_BYTE;
        fdt->full_fds_bits = data;

        return fdt;

        /* unwind in reverse order of allocation */
out_arr:
        kvfree(fdt->fd);
out_fdt:
        kfree(fdt);
out:
        return NULL;
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 1 on successful completion.
 * Errors are -ENOMEM when the allocation fails and -EMFILE when the table
 * that was allocated still cannot hold descriptor @nr.
 * The files->file_lock should be held on entry, and will be held on exit;
 * it is dropped around the allocation and the RCU grace-period wait.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *new_fdt, *cur_fdt;

        spin_unlock(&files->file_lock);
        new_fdt = alloc_fdtable(nr);

        /* make sure all __fd_install() have seen resize_in_progress
         * or have finished their rcu_read_lock_sched() section.
         */
        /* only needed when somebody else holds a reference to 'files' */
        if (atomic_read(&files->count) > 1)
                synchronize_rcu();

        /* the lock is retaken before the NULL check so every exit holds it */
        spin_lock(&files->file_lock);
        if (!new_fdt)
                return -ENOMEM;
        /*
         * extremely unlikely race - sysctl_nr_open decreased between the check in
         * caller and alloc_fdtable().  Cheaper to catch it here...
         */
        if (unlikely(new_fdt->max_fds <= nr)) {
                __free_fdtable(new_fdt);
                return -EMFILE;
        }
        cur_fdt = files_fdtable(files);
        BUG_ON(nr < cur_fdt->max_fds);
        copy_fdtable(new_fdt, cur_fdt);
        /* publish the new table */
        rcu_assign_pointer(files->fdt, new_fdt);
        /* the table embedded in files_struct is never freed on its own */
        if (cur_fdt != &files->fdtab)
                call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
        /* coupled with smp_rmb() in __fd_install() */
        smp_wmb();
        return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * -EMFILE is returned when @nr is not below sysctl_nr_open; other errors
 * come from expand_fdtable().
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *fdt;
        int expanded = 0;

repeat:
        fdt = files_fdtable(files);

        /* Do we need to expand? */
        if (nr < fdt->max_fds)
                return expanded;

        /* Can we expand? */
        if (nr >= sysctl_nr_open)
                return -EMFILE;

        if (unlikely(files->resize_in_progress)) {
                /*
                 * Another resize is under way: drop the lock, wait for it
                 * to finish and re-evaluate from the top.  'expanded' is
                 * set so the caller learns the lock was dropped even if
                 * no further expansion turns out to be needed.
                 */
                spin_unlock(&files->file_lock);
                expanded = 1;
                wait_event(files->resize_wait, !files->resize_in_progress);
                spin_lock(&files->file_lock);
                goto repeat;
        }

        /* All good, so we try */
        files->resize_in_progress = true;
        expanded = expand_fdtable(files, nr);
        files->resize_in_progress = false;

        /* release anybody parked in the wait_event() above */
        wake_up_all(&files->resize_wait);
        return expanded;
}

/* Set the close-on-exec bit of @fd in @fdt (non-atomic bit operation). */
static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
        unsigned long *cloexec = fdt->close_on_exec;

        __set_bit(fd, cloexec);
}

/*
 * Clear the close-on-exec bit of @fd in @fdt.  The bitmap word is written
 * only if the bit is currently set.
 */
static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
        unsigned long *cloexec = fdt->close_on_exec;

        if (!test_bit(fd, cloexec))
                return;
        __clear_bit(fd, cloexec);
}

/*
 * Mark @fd as in use.  If that fills the open_fds word containing it,
 * also set the matching bit in the full_fds_bits summary.
 */
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
        unsigned int word = fd / BITS_PER_LONG;

        __set_bit(fd, fdt->open_fds);
        if (fdt->open_fds[word] == ~0UL)
                __set_bit(word, fdt->full_fds_bits);
}

/*
 * Mark @fd as free.  The open_fds word containing it can no longer be
 * full, so its bit in the full_fds_bits summary is cleared as well.
 */
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
        unsigned int word = fd / BITS_PER_LONG;

        __clear_bit(fd, fdt->open_fds);
        __clear_bit(word, fdt->full_fds_bits);
}

/*
 * Return the number of descriptor slots, in whole open_fds words, needed
 * to cover the highest open descriptor in @fdt.  The result is never less
 * than BITS_PER_LONG, even when no descriptor is open.
 */
static unsigned int count_open_files(struct fdtable *fdt)
{
        unsigned int word = fdt->max_fds / BITS_PER_LONG;

        /* Walk down to the last non-empty bitmap word, stopping at word 0 */
        while (word > 0) {
                word--;
                if (fdt->open_fds[word])
                        break;
        }
        return (word + 1) * BITS_PER_LONG;
}

/*
 * Note that a sane fdtable size always has to be a multiple of
 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 *
 * 'max_fds' will normally already be properly aligned, but it
 * turns out that in the close_range() -> __close_range() ->
 * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
 * up having a 'max_fds' value that isn't already aligned.
 *
 * Rather than make close_range() have to worry about this,
 * just make that BITS_PER_LONG alignment be part of a sane
 * fdtable size. Because that's really what it is.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
        /* slots needed to cover the highest open descriptor */
        unsigned int used = count_open_files(fdt);
        unsigned int limit = max_fds;

        /* the caller's cap is never applied below NR_OPEN_DEFAULT */
        if (limit < NR_OPEN_DEFAULT)
                limit = NR_OPEN_DEFAULT;

        return ALIGN(min(used, limit), BITS_PER_LONG);
}

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 * errorp will be valid only when the returned files_struct is NULL.
 *
 * @oldf:    table to copy from; its file_lock is taken and released here
 * @max_fds: cap on how many descriptors to copy, as interpreted by
 *           sane_fdtable_size()
 * @errorp:  receives -ENOMEM or -EMFILE on failure
 *
 * Every file copied gets an extra reference via get_file().
 */
struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
        struct files_struct *newf;
        struct file **old_fds, **new_fds;
        unsigned int open_files, i;
        struct fdtable *old_fdt, *new_fdt;

        *errorp = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        spin_lock_init(&newf->file_lock);
        newf->resize_in_progress = false;
        init_waitqueue_head(&newf->resize_wait);
        newf->next_fd = 0;
        /* start out with the fdtable and arrays embedded in newf */
        new_fdt = &newf->fdtab;
        new_fdt->max_fds = NR_OPEN_DEFAULT;
        new_fdt->close_on_exec = newf->close_on_exec_init;
        new_fdt->open_fds = newf->open_fds_init;
        new_fdt->full_fds_bits = newf->full_fds_bits_init;
        new_fdt->fd = &newf->fd_array[0];

        spin_lock(&oldf->file_lock);
        old_fdt = files_fdtable(oldf);
        open_files = sane_fdtable_size(old_fdt, max_fds);

        /*
         * Check whether we need to allocate a larger fd array and fd set.
         */
        while (unlikely(open_files > new_fdt->max_fds)) {
                /* the allocation below is done without the spinlock held */
                spin_unlock(&oldf->file_lock);

                /* discard a table from an earlier pass that is now too small */
                if (new_fdt != &newf->fdtab)
                        __free_fdtable(new_fdt);

                /* alloc_fdtable() takes the highest index, not a count */
                new_fdt = alloc_fdtable(open_files - 1);
                if (!new_fdt) {
                        *errorp = -ENOMEM;
                        goto out_release;
                }

                /* beyond sysctl_nr_open; nothing to do */
                if (unlikely(new_fdt->max_fds < open_files)) {
                        __free_fdtable(new_fdt);
                        *errorp = -EMFILE;
                        goto out_release;
                }

                /*
                 * Reacquire the oldf lock and a pointer to its fd table
                 * who knows it may have a new bigger fd table. We need
                 * the latest pointer.
                 */
                spin_lock(&oldf->file_lock);
                old_fdt = files_fdtable(oldf);
                open_files = sane_fdtable_size(old_fdt, max_fds);
        }

        copy_fd_bitmaps(new_fdt, old_fdt, open_files);

        old_fds = old_fdt->fd;
        new_fds = new_fdt->fd;

        /* i counts down, so the slot being copied is open_files - i */
        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f) {
                        get_file(f);
                } else {
                        /*
                         * The fd may be claimed in the fd bitmap but not yet
                         * instantiated in the files array if a sibling thread
                         * is partway through open().  So make sure that this
                         * fd is available to the new process.
                         */
                        __clear_open_fd(open_files - i, new_fdt);
                }
                rcu_assign_pointer(*new_fds++, f);
        }
        spin_unlock(&oldf->file_lock);

        /* clear the remainder */
        memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

        rcu_assign_pointer(newf->fdt, new_fdt);

        return newf;

out_release:
        kmem_cache_free(files_cachep, newf);
out:
        return NULL;
}

/*
 * Close every file still installed in @files and return the fdtable that
 * was walked, so the caller can free it if it is not the embedded one.
 */
static struct fdtable *close_files(struct files_struct * files)
{
        /*
         * It is safe to dereference the fd table without RCU or
         * ->file_lock because this is the last reference to the
         * files structure.
         */
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);
        unsigned int word;

        /* one open_fds word at a time, until we run past max_fds */
        for (word = 0; word * BITS_PER_LONG < fdt->max_fds; word++) {
                unsigned long pending = fdt->open_fds[word];
                unsigned int fd = word * BITS_PER_LONG;

                /* shift the word right until no set bits are left */
                for (; pending; pending >>= 1, fd++) {
                        struct file *file;

                        if (!(pending & 1))
                                continue;
                        /* the bit may be set with no file installed yet */
                        file = xchg(&fdt->fd[fd], NULL);
                        if (!file)
                                continue;
                        filp_close(file, files);
                        cond_resched();
                }
        }

        return fdt;
}

struct files_struct *get_files_struct(struct task_struct *task)
{
        struct files_struct *files;

        task_lock(task);
        files = task->files;
        if (files)
                atomic_inc(&files->count);
        task_unlock(task);

        return files;
}

/*
 * Drop a reference on @files.  When the count reaches zero, close all
 * remaining files and free the structure.
 */
void put_files_struct(struct files_struct *files)
{
        struct fdtable *fdt;

        if (!atomic_dec_and_test(&files->count))
                return;

        fdt = close_files(files);

        /* free the arrays if they are not embedded */
        if (fdt != &files->fdtab)
                __free_fdtable(fdt);
        kmem_cache_free(files_cachep, files);
}

/*
 * Make @files the current task's files_struct and drop the reference the
 * task held on the previous one.  The pointer is switched under
 * task_lock().
 */
void reset_files_struct(struct files_struct *files)
{
        struct task_struct *me = current;
        struct files_struct *prev = me->files;

        task_lock(me);
        me->files = files;
        task_unlock(me);

        put_files_struct(prev);
}

/*
 * Detach @tsk from its files_struct, if it has one, and drop the task's
 * reference on it.  The pointer is cleared under task_lock().
 */
void exit_files(struct task_struct *tsk)
{
        struct files_struct *doomed = tsk->files;

        if (!doomed)
                return;

        task_lock(tsk);
        tsk->files = NULL;
        task_unlock(tsk);

        put_files_struct(doomed);
}

/*
 * Statically initialised files_struct holding one reference.  Its fdtable
 * is the embedded ->fdtab, which in turn points at the embedded fd array
 * and bitmaps, giving NR_OPEN_DEFAULT slots without any allocation.
 * NOTE(review): presumably the table of the initial task - its users are
 * not visible here, confirm.
 */
struct files_struct init_files = {
        .count                = ATOMIC_INIT(1),
        .fdt                = &init_files.fdtab,
        .fdtab                = {
                .max_fds        = NR_OPEN_DEFAULT,
                .fd                = &init_files.fd_array[0],
                .close_on_exec        = init_files.close_on_exec_init,
                .open_fds        = init_files.open_fds_init,
                .full_fds_bits        = init_files.full_fds_bits_init,
        },
        .file_lock        = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
        .resize_wait        = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};

/*
 * Find the lowest free descriptor in @fdt that is >= @start.  The
 * full_fds_bits summary is consulted first so that completely full
 * open_fds words are skipped.  Returns fdt->max_fds when nothing is free.
 */
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
        unsigned int limit = fdt->max_fds;
        unsigned int nwords = limit / BITS_PER_LONG;
        unsigned int word = start / BITS_PER_LONG;
        unsigned int first;

        /* first open_fds word at or after 'start' that is not full */
        word = find_next_zero_bit(fdt->full_fds_bits, nwords, word);
        first = word * BITS_PER_LONG;
        if (first > limit)
                return limit;

        /* skip ahead only; never search below the requested start */
        if (first > start)
                start = first;

        return find_next_zero_bit(fdt->open_fds, limit, start);
}

/*
 * allocate a file descriptor, mark it busy.
 *
 * @files: descriptor table to allocate from
 * @start: lowest acceptable descriptor number
 * @end:   exclusive upper bound; a candidate >= @end fails with -EMFILE
 * @flags: O_CLOEXEC sets the close-on-exec bit, otherwise it is cleared
 *
 * Returns the new descriptor, or a negative errno (-EMFILE, or whatever
 * expand_files() reports).  Only the bitmaps are updated; no file is
 * installed in the fd array slot by this function.
 */
int __alloc_fd(struct files_struct *files,
               unsigned start, unsigned end, unsigned flags)
{
        unsigned int fd;
        int error;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
repeat:
        fdt = files_fdtable(files);
        /* begin the search at the larger of start and the next_fd hint */
        fd = start;
        if (fd < files->next_fd)
                fd = files->next_fd;

        if (fd < fdt->max_fds)
                fd = find_next_fd(fdt, fd);

        /*
         * N.B. For clone tasks sharing a files structure, this test
         * will limit the total number of files that can be opened.
         */
        error = -EMFILE;
        if (fd >= end)
                goto out;

        error = expand_files(files, fd);
        if (error < 0)
                goto out;

        /*
         * If we needed to expand the fs array we
         * might have blocked - try again.
         */
        if (error)
                goto repeat;

        /* advance the hint only if the search was not started beyond it */
        if (start <= files->next_fd)
                files->next_fd = fd + 1;

        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        error = fd;
#if 1
        /* Sanity check */
        if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
                /* fd is unsigned int, so print it with %u */
                printk(KERN_WARNING "alloc_fd: slot %u not NULL!\n", fd);
                rcu_assign_pointer(fdt->fd[fd], NULL);
        }
#endif

out:
        spin_unlock(&files->file_lock);
        return error;
}

/*
 * Allocate a descriptor >= @start in the current task's table, bounded by
 * the task's RLIMIT_NOFILE.
 */
static int alloc_fd(unsigned start, unsigned flags)
{
        unsigned long nofile = rlimit(RLIMIT_NOFILE);

        return __alloc_fd(current->files, start, nofile, flags);
}

/*
 * Allocate the lowest available descriptor in the current task's table,
 * using the caller-supplied limit @nofile as the exclusive upper bound.
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
        struct files_struct *files = current->files;

        return __alloc_fd(files, 0, nofile, flags);
}

/*
 * Allocate the lowest available descriptor in the current task's table,
 * bounded by the task's RLIMIT_NOFILE.
 */
int get_unused_fd_flags(unsigned flags)
{
        unsigned long nofile = rlimit(RLIMIT_NOFILE);

        return __get_unused_fd_flags(flags, nofile);
}
EXPORT_SYMBOL(get_unused_fd_flags);

/*
 * Mark @fd free in @files and lower the next_fd search hint if @fd is
 * below it.  The callers in this file hold files->file_lock.
 */
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
        __clear_open_fd(fd, files_fdtable(files));

        if (files->next_fd > fd)
                files->next_fd = fd;
}

/* Release descriptor @fd in the current task's table, under file_lock. */
void put_unused_fd(unsigned int fd)
{
        struct files_struct *cur = current->files;

        spin_lock(&cur->file_lock);
        __put_unused_fd(cur, fd);
        spin_unlock(&cur->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * As written here, finding the slot already occupied trips a BUG_ON()
 * rather than an fput().
 *
 * NOTE: __fd_install() variant is really, really low-level; don't
 * use it unless you are forced to by truly lousy API shoved down
 * your throat.  'files' *MUST* be either current->files or obtained
 * by get_files_struct(current) done by whoever had given it to you,
 * or really bad things will happen.  Normally you want to use
 * fd_install() instead.
 */

void __fd_install(struct files_struct *files, unsigned int fd,
                struct file *file)
{
        struct fdtable *fdt;

        rcu_read_lock_sched();

        if (unlikely(files->resize_in_progress)) {
                /*
                 * A resize is running: leave the RCU-sched section and
                 * install under file_lock instead.
                 */
                rcu_read_unlock_sched();
                spin_lock(&files->file_lock);
                fdt = files_fdtable(files);
                BUG_ON(fdt->fd[fd] != NULL);
                rcu_assign_pointer(fdt->fd[fd], file);
                spin_unlock(&files->file_lock);
                return;
        }
        /* coupled with smp_wmb() in expand_fdtable() */
        smp_rmb();
        /* lockless path: no resize was seen, install without file_lock */
        fdt = rcu_dereference_sched(files->fdt);
        BUG_ON(fdt->fd[fd] != NULL);
        rcu_assign_pointer(fdt->fd[fd], file);
        rcu_read_unlock_sched();
}

/*
 * Install @file at descriptor @fd of the current task's table.
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 */
void fd_install(unsigned int fd, struct file *file)
{
        struct files_struct *cur = current->files;

        __fd_install(cur, fd, file);
}

EXPORT_SYMBOL(fd_install);

/*
 * Detach and return the file installed at @fd in @files, freeing the
 * descriptor.  Returns NULL if @fd is outside the table or has no file
 * installed.  No reference is dropped here; the caller owns the result.
 */
static struct file *pick_file(struct files_struct *files, unsigned fd)
{
        struct file *victim = NULL;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (fd < fdt->max_fds) {
                /* clamp the index so it stays in range under speculation */
                fd = array_index_nospec(fd, fdt->max_fds);
                victim = fdt->fd[fd];
                if (victim) {
                        rcu_assign_pointer(fdt->fd[fd], NULL);
                        __put_unused_fd(files, fd);
                }
        }
        spin_unlock(&files->file_lock);

        return victim;
}

/*
 * Close descriptor @fd of @files.  Returns -EBADF if no file is installed
 * there, otherwise the result of filp_close().
 *
 * The same warnings as for __alloc_fd()/__fd_install() apply here...
 */
int __close_fd(struct files_struct *files, unsigned fd)
{
        struct file *victim = pick_file(files, fd);

        return victim ? filp_close(victim, files) : -EBADF;
}
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */

/**
 * __close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  0 or CLOSE_RANGE_UNSHARE; with the latter the descriptor table
 *          is unshared via unshare_fd() before anything is closed
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 *
 * Return: 0 on success, -EINVAL if @flags has any other bit set or
 * @fd > @max_fd, or the error returned by unshare_fd().
 */
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
{
        unsigned int cur_max;
        struct task_struct *me = current;
        struct files_struct *cur_fds = me->files, *fds = NULL;

        if (flags & ~CLOSE_RANGE_UNSHARE)
                return -EINVAL;

        if (fd > max_fd)
                return -EINVAL;

        rcu_read_lock();
        cur_max = files_fdtable(cur_fds)->max_fds;
        rcu_read_unlock();

        /* cap to last valid index into fdtable */
        cur_max--;

        if (flags & CLOSE_RANGE_UNSHARE) {
                int ret;
                unsigned int max_unshare_fds = NR_OPEN_MAX;

                /*
                 * If the requested range is greater than the current maximum,
                 * we're closing everything so only copy all file descriptors
                 * beneath the lowest file descriptor.
                 */
                if (max_fd >= cur_max)
                        max_unshare_fds = fd;

                ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
                if (ret)
                        return ret;

                /*
                 * We used to share our file descriptor table, and have now
                 * created a private one, make sure we're using it below.
                 */
                /* after the swap: cur_fds is the new table, fds the old one */
                if (fds)
                        swap(cur_fds, fds);
        }

        /* nothing can be installed beyond the table's last index */
        max_fd = min(max_fd, cur_max);
        while (fd <= max_fd) {
                struct file *file;

                file = pick_file(cur_fds, fd++);
                if (!file)
                        continue;

                filp_close(file, cur_fds);
                cond_resched();
        }

        if (fds) {
                /*
                 * We're done closing the files we were supposed to. Time to install
                 * the new file descriptor table and drop the old one.
                 */
                task_lock(me);
                me->files = cur_fds;
                task_unlock(me);
                put_files_struct(fds);
        }

        return 0;
}

/*
 * See close_fd_get_file() below, this variant assumes current->files->file_lock
 * is held.
 *
 * @fd:  descriptor to detach from the current task's table
 * @res: receives the detached file with an extra reference taken via
 *       get_file(), or NULL on failure
 *
 * Returns 0 on success, -ENOENT if @fd is outside the table or has no
 * file installed.
 */
int __close_fd_get_file(unsigned int fd, struct file **res)
{
        struct files_struct *files = current->files;
        struct file *file;
        struct fdtable *fdt;

        fdt = files_fdtable(files);
        if (fd >= fdt->max_fds)
                goto out_err;
        /*
         * Clamp the index so it stays in range under speculation, the
         * same way pick_file() and __fget_files_rcu() do.
         */
        fd = array_index_nospec(fd, fdt->max_fds);
        file = fdt->fd[fd];
        if (!file)
                goto out_err;
        rcu_assign_pointer(fdt->fd[fd], NULL);
        __put_unused_fd(files, fd);
        get_file(file);
        *res = file;
        return 0;
out_err:
        *res = NULL;
        return -ENOENT;
}

/*
 * Like __close_fd_get_file(), but takes current->files->file_lock itself.
 *
 * variant of close_fd that gets a ref on the file for later fput.
 * The caller must ensure that filp_close() called on the file, and then
 * an fput().
 */
int close_fd_get_file(unsigned int fd, struct file **res)
{
        struct files_struct *cur = current->files;
        int err;

        spin_lock(&cur->file_lock);
        err = __close_fd_get_file(fd, res);
        spin_unlock(&cur->file_lock);

        return err;
}

/*
 * Close every descriptor in @files whose close-on-exec bit is set.
 *
 * The close_on_exec bitmap is walked one word at a time under
 * files->file_lock; each word is zeroed before its descriptors are
 * processed.  The lock is dropped around every filp_close().
 */
void do_close_on_exec(struct files_struct *files)
{
        unsigned i;
        struct fdtable *fdt;

        /* exec unshares first */
        spin_lock(&files->file_lock);
        for (i = 0; ; i++) {
                unsigned long set;
                unsigned fd = i * BITS_PER_LONG;
                /* the table pointer is looked up afresh for every word */
                fdt = files_fdtable(files);
                if (fd >= fdt->max_fds)
                        break;
                set = fdt->close_on_exec[i];
                if (!set)
                        continue;
                fdt->close_on_exec[i] = 0;
                /* walk the saved copy of the word, lowest bit first */
                for ( ; set ; fd++, set >>= 1) {
                        struct file *file;
                        if (!(set & 1))
                                continue;
                        file = fdt->fd[fd];
                        if (!file)
                                continue;
                        rcu_assign_pointer(fdt->fd[fd], NULL);
                        __put_unused_fd(files, fd);
                        /* filp_close() is called without the spinlock */
                        spin_unlock(&files->file_lock);
                        filp_close(file, files);
                        cond_resched();
                        spin_lock(&files->file_lock);
                }

        }
        spin_unlock(&files->file_lock);
}

/*
 * Lockless lookup of descriptor @fd in @files, taking @refs references on
 * the file found.
 *
 * @mask: f_mode bits that disqualify a file; if any is set the lookup
 *        returns NULL
 *
 * Returns the file with @refs references held, or NULL if @fd is outside
 * the table, has no file installed, or the file matches @mask.  The
 * caller in this file, __fget_files(), wraps the call in
 * rcu_read_lock()/rcu_read_unlock().
 */
static inline struct file *__fget_files_rcu(struct files_struct *files,
        unsigned int fd, fmode_t mask, unsigned int refs)
{
        for (;;) {
                struct file *file;
                struct fdtable *fdt = rcu_dereference_raw(files->fdt);
                struct file __rcu **fdentry;

                if (unlikely(fd >= fdt->max_fds))
                        return NULL;

                /* index clamped so it stays in range under speculation */
                fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
                file = rcu_dereference_raw(*fdentry);
                if (unlikely(!file))
                        return NULL;

                /* checked before any reference is taken */
                if (unlikely(file->f_mode & mask))
                        return NULL;

                /*
                 * Ok, we have a file pointer. However, because we do
                 * this all locklessly under RCU, we may be racing with
                 * that file being closed.
                 *
                 * Such a race can take two forms:
                 *
                 *  (a) the file ref already went down to zero,
                 *      and get_file_rcu_many() fails. Just try
                 *      again:
                 */
                if (unlikely(!get_file_rcu_many(file, refs)))
                        continue;

                /*
                 *  (b) the file table entry has changed under us.
                 *       Note that we don't need to re-check the 'fdt->fd'
                 *       pointer having changed, because it always goes
                 *       hand-in-hand with 'fdt'.
                 *
                 * If so, we need to put our refs and try again.
                 */
                if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
                    unlikely(rcu_dereference_raw(*fdentry) != file)) {
                        fput_many(file, refs);
                        continue;
                }

                /*
                 * Ok, we have a ref to the file, and checked that it
                 * still exists.
                 */
                return file;
        }
}

/*
 * Look up @fd in @files inside an RCU read-side critical section and
 * return the file with @refs references held, or NULL.  See
 * __fget_files_rcu() for the meaning of @mask.
 */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
                                 fmode_t mask, unsigned int refs)
{
        struct file *found;

        rcu_read_lock();
        found = __fget_files_rcu(files, fd, mask, refs);
        rcu_read_unlock();

        return found;
}

/* __fget_files() applied to the current task's descriptor table. */
static inline struct file *__fget(unsigned int fd, fmode_t mask,
                                  unsigned int refs)
{
        struct files_struct *cur = current->files;

        return __fget_files(cur, fd, mask, refs);
}

struct file *fget_many(unsigned int fd, unsigned int refs)
{
        return __fget(fd, FMODE_PATH, refs);
}

struct file *fget(unsigned int fd)
{
        return __fget(fd, FMODE_PATH, 1);
}
EXPORT_SYMBOL(fget);

struct file *fget_raw(unsigned int fd)
{
        return __fget(fd, 0, 1);
}
EXPORT_SYMBOL(fget_raw);

/*
 * Look up @fd in @task's descriptor table, with no f_mode filtering, and
 * take one reference on the file.  Returns NULL if the task has no table
 * or the lookup fails.  The whole lookup is done under task_lock().
 */
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
        struct files_struct *files;
        struct file *found = NULL;

        task_lock(task);
        files = task->files;
        if (files)
                found = __fget_files(files, fd, 0, 1);
        task_unlock(task);

        return found;
}

/*
 * Lightweight file lookup - the refcount is left alone when the fd table
 * is not shared.
 *
 * This may stand in for fget only if all of the following hold:
 * 1) fput_light is called before the syscall exits and control returns to
 *    userspace (i.e. the returned struct file * must not be remembered
 *    past the return to userspace).
 * 2) filp_close is not called on the returned struct file * between the
 *    calls to fget_light and fput_light.
 * 3) The current task is not cloned between the calls to fget_light and
 *    fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 *
 * The return value is the file pointer as an unsigned long, with
 * FDPUT_FPUT or'ed in when a reference was taken; 0 means no file.
 */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
        struct files_struct *files = current->files;
        struct file *file;

        if (atomic_read(&files->count) != 1) {
                /* table is shared: take a real reference */
                file = __fget(fd, mask, 1);
                return file ? (FDPUT_FPUT | (unsigned long)file) : 0;
        }

        /* sole user of the table: borrow the pointer without a reference */
        file = __fcheck_files(files, fd);
        if (!file || unlikely(file->f_mode & mask))
                return 0;
        return (unsigned long)file;
}
unsigned long __fdget(unsigned int fd)
{
        return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(__fdget);

unsigned long __fdget_raw(unsigned int fd)
{
        return __fget_light(fd, 0);
}

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
        return (file->f_mode & FMODE_ATOMIC_POS) &&
                (file_count(file) > 1 || S_ISDIR(file_inode(file)->i_mode));
}

/*
 * __fdget() plus f_pos locking: when the file needs it, f_pos_lock is
 * taken and FDPUT_POS_UNLOCK is or'ed into the returned value.
 */
unsigned long __fdget_pos(unsigned int fd)
{
        unsigned long v = __fdget(fd);
        /* the two low bits of v carry flags, not address bits */
        struct file *file = (struct file *)(v & ~3);

        if (!file || !file_needs_f_pos_lock(file))
                return v;

        mutex_lock(&file->f_pos_lock);
        return v | FDPUT_POS_UNLOCK;
}

/* Release the f_pos_lock that __fdget_pos() took on @f. */
void __f_unlock_pos(struct file *f)
{
        struct mutex *pos_lock = &f->f_pos_lock;

        mutex_unlock(pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

/*
 * Set (@flag non-zero) or clear (@flag zero) the close-on-exec bit of
 * @fd in the current task's table, under file_lock.
 */
void set_close_on_exec(unsigned int fd, int flag)
{
        struct files_struct *cur = current->files;
        struct fdtable *fdt;

        spin_lock(&cur->file_lock);
        fdt = files_fdtable(cur);
        if (!flag)
                __clear_close_on_exec(fd, fdt);
        else
                __set_close_on_exec(fd, fdt);
        spin_unlock(&cur->file_lock);
}

/*
 * Report whether the close-on-exec bit of @fd is set in the current
 * task's table.  The table is read inside an RCU read-side section.
 */
bool get_close_on_exec(unsigned int fd)
{
        struct files_struct *cur = current->files;
        bool cloexec;

        rcu_read_lock();
        cloexec = close_on_exec(fd, files_fdtable(cur));
        rcu_read_unlock();

        return cloexec;
}

/*
 * Install @file at descriptor slot @fd of @files, replacing whatever was
 * there before.
 *
 * Must be entered with files->file_lock held; the lock is released on
 * every return path (see the __releases annotation). A reference on
 * @file is taken for the table entry. If the slot previously held a file,
 * that file is closed with filp_close() after the lock has been dropped.
 *
 * The close-on-exec bit of the slot is set when @flags contains O_CLOEXEC
 * and cleared otherwise.
 *
 * Returns @fd on success, or -EBUSY when the slot is marked open but has
 * no file installed yet (a descriptor that is allocated but not finished).
 */
static int do_dup2(struct files_struct *files,
        struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
        struct file *tofree;
        struct fdtable *fdt;

        /*
         * We need to detect attempts to do dup2() over allocated but still
         * not finished descriptor.  NB: OpenBSD avoids that at the price of
         * extra work in their equivalent of fget() - they insert struct
         * file immediately after grabbing descriptor, mark it larval if
         * more work (e.g. actual opening) is needed and make sure that
         * fget() treats larval files as absent.  Potentially interesting,
         * but while extra work in fget() is trivial, locking implications
         * and amount of surgery on open()-related paths in VFS are not.
         * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
         * deadlocks in rather amusing ways, AFAICS.  All of that is out of
         * scope of POSIX or SUS, since neither considers shared descriptor
         * tables and this condition does not arise without those.
         */
        fdt = files_fdtable(files);
        tofree = fdt->fd[fd];
        /* Slot reserved (open bit set) but no file pointer yet: refuse. */
        if (!tofree && fd_is_open(fd, fdt))
                goto Ebusy;
        /* The table entry owns its own reference on the file. */
        get_file(file);
        rcu_assign_pointer(fdt->fd[fd], file);
        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        spin_unlock(&files->file_lock);

        /* Close the displaced file only once the spinlock is released. */
        if (tofree)
                filp_close(tofree, files);

        return fd;

Ebusy:
        spin_unlock(&files->file_lock);
        return -EBUSY;
}

/*
 * Make descriptor @fd of the current task refer to @file.
 *
 * A NULL @file means "close @fd" and is forwarded to __close_fd().
 * Otherwise @fd must be below RLIMIT_NOFILE (else -EBADF), the fd table
 * is expanded to cover @fd, and the actual installation is delegated to
 * do_dup2() with @flags.
 *
 * Returns the result of __close_fd() or do_dup2() (the latter is @fd on
 * success), or a negative errno from the checks above.
 */
int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
        struct files_struct *files = current->files;
        int err;

        if (!file)
                return __close_fd(files, fd);

        if (fd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        err = expand_files(files, fd);
        if (unlikely(err < 0)) {
                spin_unlock(&files->file_lock);
                return err;
        }

        /* do_dup2() drops files->file_lock on all of its return paths. */
        return do_dup2(files, file, fd, flags);
}

/**
 * __receive_fd() - Install received file into file descriptor table
 *
 * @fd: fd to install into (if negative, a new fd will be allocated)
 * @file: struct file that was received from another process
 * @ufd: __user pointer to write new fd number to
 * @o_flags: the O_* flags to apply to the new fd entry
 *
 * Installs a received file into the file descriptor table, with appropriate
 * checks and count updates. Optionally writes the fd number to userspace, if
 * @ufd is non-NULL.
 *
 * This helper handles its own reference counting of the incoming
 * struct file.
 *
 * Returns newly installed fd or -ve on error.
 */
int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags)
{
        int new_fd;
        int error;

        error = security_file_receive(file);
        if (error)
                return error;

        if (fd < 0) {
                new_fd = get_unused_fd_flags(o_flags);
                if (new_fd < 0)
                        return new_fd;
        } else {
                new_fd = fd;
        }

        if (ufd) {
                error = put_user(new_fd, ufd);
                if (error) {
                        /* Only give back a descriptor we allocated here. */
                        if (fd < 0)
                                put_unused_fd(new_fd);
                        return error;
                }
        }

        if (fd < 0) {
                fd_install(new_fd, get_file(file));
        } else {
                /*
                 * replace_fd() hands back the result of do_dup2(), which is
                 * the (non-negative) fd number on success, so only negative
                 * values are errors. Testing for any non-zero value here
                 * would return early for every fd other than 0 and skip
                 * __receive_sock() below.
                 */
                error = replace_fd(new_fd, file, o_flags);
                if (error < 0)
                        return error;
        }

        /* Bump the sock usage counts, if any. */
        __receive_sock(file);
        return new_fd;
}

/*
 * Common implementation behind the dup3() and dup2() system calls: make
 * @newfd refer to the file currently open at @oldfd.
 *
 * @flags may only contain O_CLOEXEC; anything else yields -EINVAL, as
 * does @oldfd == @newfd. A @newfd at or above RLIMIT_NOFILE yields -EBADF,
 * and so does an @oldfd that has no file installed.
 *
 * Returns the result of do_dup2() (@newfd on success) or a negative errno.
 */
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
        int err = -EBADF;
        struct file *file;
        struct files_struct *files = current->files;

        if ((flags & ~O_CLOEXEC) != 0)
                return -EINVAL;

        if (unlikely(oldfd == newfd))
                return -EINVAL;

        if (newfd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        err = expand_files(files, newfd);
        /*
         * @oldfd is looked up only after expand_files() has run.
         * NOTE(review): presumably because expand_files() may replace the
         * fd table - confirm against its definition.
         */
        file = fcheck(oldfd);
        if (unlikely(!file))
                goto Ebadf;
        if (unlikely(err < 0)) {
                /* -EMFILE from expand_files() is reported as -EBADF. */
                if (err == -EMFILE)
                        goto Ebadf;
                goto out_unlock;
        }
        /* do_dup2() releases files->file_lock itself. */
        return do_dup2(files, file, newfd, flags);

Ebadf:
        err = -EBADF;
out_unlock:
        spin_unlock(&files->file_lock);
        return err;
}

/* dup3() system call entry point: forwards all arguments to ksys_dup3(). */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
        return ksys_dup3(oldfd, newfd, flags);
}

/*
 * dup2() system call entry point.
 *
 * Unlike dup3(), @newfd == @oldfd is not an error here: the call only
 * checks (under RCU) that @oldfd refers to an open file and returns it,
 * or -EBADF if it does not. Every other case is handled by ksys_dup3()
 * with no flags.
 */
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
        if (unlikely(newfd == oldfd)) { /* corner case */
                int ret = -EBADF;

                rcu_read_lock();
                if (fcheck_files(current->files, oldfd))
                        ret = oldfd;
                rcu_read_unlock();

                return ret;
        }

        return ksys_dup3(oldfd, newfd, 0);
}

/*
 * dup() system call entry point.
 *
 * Takes a reference on the file behind @fildes with fget_raw() and
 * installs it at a freshly allocated descriptor. The reference is handed
 * over to the new descriptor on success and dropped with fput() when no
 * descriptor could be allocated.
 *
 * Returns the new descriptor, -EBADF if @fildes has no file, or the
 * negative value returned by get_unused_fd_flags().
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
        struct file *file = fget_raw(fildes);
        int newfd;

        if (!file)
                return -EBADF;

        newfd = get_unused_fd_flags(0);
        if (newfd < 0) {
                fput(file);
                return newfd;
        }

        fd_install(newfd, file);
        return newfd;
}

/*
 * Duplicate @file onto a descriptor obtained from alloc_fd(@from, @flags).
 *
 * @from must be below RLIMIT_NOFILE, otherwise -EINVAL is returned. On
 * success an extra reference is taken on @file for the new descriptor.
 *
 * Returns the new descriptor or a negative errno.
 */
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
        int newfd;

        if (from >= rlimit(RLIMIT_NOFILE))
                return -EINVAL;

        newfd = alloc_fd(from, flags);
        if (newfd < 0)
                return newfd;

        get_file(file);
        fd_install(newfd, file);
        return newfd;
}

/*
 * Walk the descriptors of @files starting at index @n and call
 * @f(@p, file, index) for every slot that has a file installed. The walk
 * runs with files->file_lock held and stops at the first callback that
 * returns non-zero.
 *
 * Returns that non-zero value, or 0 when @files is NULL or every callback
 * returned 0.
 */
int iterate_fd(struct files_struct *files, unsigned n,
                int (*f)(const void *, struct file *, unsigned),
                const void *p)
{
        struct fdtable *fdt;
        int res = 0;

        if (!files)
                return 0;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        for (; n < fdt->max_fds; n++) {
                struct file *filp =
                        rcu_dereference_check_fdtable(files, fdt->fd[n]);

                if (filp) {
                        res = f(p, filp, n);
                        if (res)
                                break;
                }
        }
        spin_unlock(&files->file_lock);

        return res;
}
EXPORT_SYMBOL(iterate_fd);


























































































    1 




    1 

    1 
















































    1 









    1 




    1 
































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
// SPDX-License-Identifier: GPL-2.0
/*
 * Fast batching percpu counters.
 */

#include <linux/percpu_counter.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>

#ifdef CONFIG_HOTPLUG_CPU
static LIST_HEAD(percpu_counters);
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif

#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER

static const struct debug_obj_descr percpu_counter_debug_descr;

/*
 * debugobjects ->fixup_free callback for percpu counters.
 *
 * When the object at @addr is still in the active state, it is destroyed
 * and its debug object is freed; true is returned to report that a fixup
 * was done. Any other state is left alone and false is returned.
 */
static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
        struct percpu_counter *fbc = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        percpu_counter_destroy(fbc);
        debug_object_free(fbc, &percpu_counter_debug_descr);
        return true;
}

/*
 * debugobjects descriptor shared by all percpu counters. Only a
 * ->fixup_free handler is provided.
 */
static const struct debug_obj_descr percpu_counter_debug_descr = {
        .name                = "percpu_counter",
        .fixup_free        = percpu_counter_fixup_free,
};

/*
 * Register @fbc with debugobjects: initialise its debug object and mark
 * it active in one go. Called from __percpu_counter_init().
 */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
        debug_object_init(fbc, &percpu_counter_debug_descr);
        debug_object_activate(fbc, &percpu_counter_debug_descr);
}

/*
 * Counterpart of debug_percpu_counter_activate(): deactivate @fbc's debug
 * object and free it. Called from percpu_counter_destroy().
 */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
        debug_object_deactivate(fbc, &percpu_counter_debug_descr);
        debug_object_free(fbc, &percpu_counter_debug_descr);
}

#else        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
/* No-op stub used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is not set. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
/* No-op stub used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is not set. */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */

/*
 * Force the counter to @amount: every possible CPU's local delta is
 * zeroed and the central count is overwritten, all under fbc->lock with
 * interrupts disabled.
 */
void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        unsigned long flags;
        int cpu;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(fbc->counters, cpu) = 0;
        fbc->count = amount;
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_set);

/*
 * Add @amount to @fbc, folding this CPU's local delta into the central
 * count once its magnitude would reach @batch.
 *
 * This function is both preempt and irq safe. The former is due to explicit
 * preemption disable. The latter is guaranteed by the fact that the slow path
 * is explicitly protected by an irq-safe spinlock whereas the fast path uses
 * this_cpu_add which is irq-safe by definition. Hence there is no need to
 * muck with irq state before calling this one.
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        s64 count;

        preempt_disable();
        /* Value this CPU's delta would have after adding @amount. */
        count = __this_cpu_read(*fbc->counters) + amount;
        if (abs(count) >= batch) {
                unsigned long flags;
                raw_spin_lock_irqsave(&fbc->lock, flags);
                fbc->count += count;
                /*
                 * count - amount is the local value read above; removing it
                 * takes out exactly what was just folded into fbc->count.
                 */
                __this_cpu_sub(*fbc->counters, count - amount);
                raw_spin_unlock_irqrestore(&fbc->lock, flags);
        } else {
                this_cpu_add(*fbc->counters, amount);
        }
        preempt_enable();
}
EXPORT_SYMBOL(percpu_counter_add_batch);

/*
 * Fold the calling CPU's local delta into the central count.
 *
 * For a percpu_counter with a big batch, the deviation of its count can
 * be large. When that deviation has to be reduced, for example because
 * the batch is lowered at runtime to get better accuracy, this can be
 * achieved by running this sync function on each CPU.
 */
void percpu_counter_sync(struct percpu_counter *fbc)
{
        unsigned long flags;
        s64 local;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        local = __this_cpu_read(*fbc->counters);
        fbc->count += local;
        __this_cpu_sub(*fbc->counters, local);
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_sync);

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive()
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 ret;
        int cpu;
        unsigned long flags;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        ret = fbc->count;
        for_each_online_cpu(cpu) {
                s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
                ret += *pcount;
        }
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
        return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);

/*
 * Initialise @fbc with a starting value of @amount.
 *
 * @gfp is used for the per-CPU allocation and @key is the lockdep class
 * assigned to fbc->lock. With CONFIG_HOTPLUG_CPU the counter is also
 * added to the global percpu_counters list so that
 * percpu_counter_cpu_dead() can find it.
 *
 * Returns 0 on success or -ENOMEM if the per-CPU storage could not be
 * allocated.
 */
int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
                          struct lock_class_key *key)
{
        /* Only used inside the CONFIG_HOTPLUG_CPU section below. */
        unsigned long flags __maybe_unused;

        raw_spin_lock_init(&fbc->lock);
        lockdep_set_class(&fbc->lock, key);
        fbc->count = amount;
        fbc->counters = alloc_percpu_gfp(s32, gfp);
        if (!fbc->counters)
                return -ENOMEM;

        debug_percpu_counter_activate(fbc);

#ifdef CONFIG_HOTPLUG_CPU
        INIT_LIST_HEAD(&fbc->list);
        spin_lock_irqsave(&percpu_counters_lock, flags);
        list_add(&fbc->list, &percpu_counters);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
        return 0;
}
EXPORT_SYMBOL(__percpu_counter_init);

/*
 * Tear down @fbc: drop its debug object, unlink it from the global
 * percpu_counters list (CONFIG_HOTPLUG_CPU only) and free the per-CPU
 * storage.
 *
 * Does nothing when fbc->counters is NULL. The pointer is reset to NULL
 * after freeing, so calling this again on the same counter is a no-op.
 */
void percpu_counter_destroy(struct percpu_counter *fbc)
{
        /* Only used inside the CONFIG_HOTPLUG_CPU section below. */
        unsigned long flags __maybe_unused;

        if (!fbc->counters)
                return;

        debug_percpu_counter_deactivate(fbc);

#ifdef CONFIG_HOTPLUG_CPU
        spin_lock_irqsave(&percpu_counters_lock, flags);
        list_del(&fbc->list);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
        free_percpu(fbc->counters);
        fbc->counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy);

/*
 * Global batch size, initially 32. compute_batch_value() rewrites it to
 * max(32, 2 * number of online CPUs).
 */
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);

/*
 * Recompute percpu_counter_batch as twice the number of online CPUs,
 * but never less than 32. @cpu is not used; the signature matches what
 * the CPU hotplug callbacks expect. Always returns 0.
 */
static int compute_batch_value(unsigned int cpu)
{
        int online = num_online_cpus();

        percpu_counter_batch = max(32, online * 2);

        return 0;
}

/*
 * CPU hotplug callback registered for CPUHP_PERCPU_CNT_DEAD in
 * percpu_counter_startup().
 *
 * With CONFIG_HOTPLUG_CPU, the batch size is recomputed and then, for
 * every counter on the global percpu_counters list, the local delta that
 * belongs to @cpu is folded into the central count and reset to zero.
 * Without CONFIG_HOTPLUG_CPU this is a no-op. Always returns 0.
 */
static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
        struct percpu_counter *fbc;

        compute_batch_value(cpu);

        /* Interrupts stay off for the whole list walk. */
        spin_lock_irq(&percpu_counters_lock);
        list_for_each_entry(fbc, &percpu_counters, list) {
                s32 *pcount;

                /* Nested inside percpu_counters_lock, irqs already off. */
                raw_spin_lock(&fbc->lock);
                pcount = per_cpu_ptr(fbc->counters, cpu);
                fbc->count += *pcount;
                *pcount = 0;
                raw_spin_unlock(&fbc->lock);
        }
        spin_unlock_irq(&percpu_counters_lock);
#endif
        return 0;
}

/*
 * Compare counter against given value.
 * Return 1 if greater, 0 if equal and -1 if less
 */
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        s64 count = percpu_counter_read(fbc);

        /*
         * Cheap path: when the approximate count is further from @rhs than
         * batch * online CPUs, it is used for the comparison as is.
         */
        if (abs(count - rhs) > (batch * num_online_cpus()))
                return count > rhs ? 1 : -1;

        /* Too close to call: fall back to the precise sum. */
        count = percpu_counter_sum(fbc);
        if (count == rhs)
                return 0;

        return count > rhs ? 1 : -1;
}
EXPORT_SYMBOL(__percpu_counter_compare);

/*
 * Init-time setup: register the two CPU hotplug callbacks of this file.
 *
 *  - compute_batch_value() under the dynamically allocated
 *    CPUHP_AP_ONLINE_DYN state ("lib/percpu_cnt:online");
 *  - percpu_counter_cpu_dead() under CPUHP_PERCPU_CNT_DEAD
 *    ("lib/percpu_cnt:dead"), registered with the _nocalls variant.
 *
 * A failed registration only triggers a WARN; the function always
 * returns 0.
 */
static int __init percpu_counter_startup(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
                                compute_batch_value, NULL);
        WARN_ON(ret < 0);
        ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
                                        "lib/percpu_cnt:dead", NULL,
                                        percpu_counter_cpu_dead);
        WARN_ON(ret < 0);
        return 0;
}
module_init(percpu_counter_startup);
























































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel internal timers
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 *  2002-05-31        Move sys_sysinfo here and make its locking sane, Robert Love
 *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
 *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
 *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */

#include <linux/kernel_stat.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>

#include "tick-internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

/*
 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
 * level has a different granularity.
 *
 * The level granularity is:                LVL_CLK_DIV ^ lvl
 * The level clock frequency is:        HZ / (LVL_CLK_DIV ^ level)
 *
 * The array level of a newly armed timer depends on the relative expiry
 * time. The farther away the expiry time is, the higher the array level and
 * therefore the coarser the granularity becomes.
 *
 * Contrary to the original timer wheel implementation, which aims for 'exact'
 * expiry of the timers, this implementation removes the need for recascading
 * the timers into the lower array levels. The previous 'classic' timer wheel
 * implementation of the kernel already violated the 'exact' expiry by adding
 * slack to the expiry time to provide batched expiration. The granularity
 * levels provide implicit batching.
 *
 * This is an optimization of the original timer wheel implementation for the
 * majority of the timer wheel use cases: timeouts. The vast majority of
 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
 * the timeout expires it indicates that normal operation is disturbed, so it
 * does not matter much whether the timeout comes with a slight delay.
 *
 * The only exception to this are networking timers with a small expiry
 * time. They rely on the granularity. Those fit into the first wheel level,
 * which has HZ granularity.
 *
 * We don't have cascading anymore. Timers with an expiry time above the
 * capacity of the last wheel level are force expired at the maximum timeout
 * value of the last wheel level. From data sampling we know that the maximum
 * value observed is 5 days (network connection tracking), so this should not
 * be an issue.
 *
 * The currently chosen array constants values are a good compromise between
 * array size and granularity.
 *
 * This results in the following granularity and range levels:
 *
 * HZ 1000 steps
 * Level Offset  Granularity            Range
 *  0      0         1 ms                0 ms -         63 ms
 *  1     64         8 ms               64 ms -        511 ms
 *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
 *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
 *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
 *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
 *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
 *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
 *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
 *
 * HZ  300
 * Level Offset  Granularity            Range
 *  0           0         3 ms                0 ms -        210 ms
 *  1          64        26 ms              213 ms -       1703 ms (213ms - ~1s)
 *  2         128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
 *  3         192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
 *  4         256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
 *  5         320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
 *  6         384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
 *  7         448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
 *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
 *
 * HZ  250
 * Level Offset  Granularity            Range
 *  0           0         4 ms                0 ms -        255 ms
 *  1          64        32 ms              256 ms -       2047 ms (256ms - ~2s)
 *  2         128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
 *  3         192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
 *  4         256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
 *  5         320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
 *  6         384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
 *  7         448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
 *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
 *
 * HZ  100
 * Level Offset  Granularity            Range
 *  0           0         10 ms               0 ms -        630 ms
 *  1          64         80 ms             640 ms -       5110 ms (640ms - ~5s)
 *  2         128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
 *  3         192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
 *  4         256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
 *  5         320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
 *  6         384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
 *  7         448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
 */

/*
 * Clock divisor for the next level.
 *
 * With LVL_CLK_SHIFT == 3 each level's clock runs LVL_CLK_DIV == 8 times
 * slower than the level below it. LVL_SHIFT(n) is the shift that converts
 * base clock ticks into ticks of level n, and LVL_GRAN(n) == 8^n is the
 * granularity of level n expressed in base clock ticks.
 */
#define LVL_CLK_SHIFT        3
#define LVL_CLK_DIV        (1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK        (LVL_CLK_DIV - 1)
#define LVL_SHIFT(n)        ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)        (1UL << LVL_SHIFT(n))

/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
 *
 * Expands to (LVL_SIZE - 1) scaled by the granularity of level n - 1,
 * i.e. 63 * 8^(n - 1) with the constants in this file.
 */
#define LVL_START(n)        ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/*
 * Size of each clock level.
 *
 * LVL_BITS == 6 gives LVL_SIZE == 64 buckets per level. LVL_MASK masks a
 * bucket index within one level and LVL_OFFS(n) is the index of the first
 * bucket of level n in the flat wheel array.
 */
#define LVL_BITS        6
#define LVL_SIZE        (1UL << LVL_BITS)
#define LVL_MASK        (LVL_SIZE - 1)
#define LVL_OFFS(n)        ((n) * LVL_SIZE)

/* Level depth: 9 levels when HZ > 100, 8 levels otherwise. */
#if HZ > 100
# define LVL_DEPTH        9
# else
# define LVL_DEPTH        8
#endif

/*
 * The cutoff (max. capacity of the wheel).
 *
 * WHEEL_TIMEOUT_CUTOFF is the start value of the first level past the
 * wheel. WHEEL_TIMEOUT_MAX is one last-level granule below that cutoff.
 */
#define WHEEL_TIMEOUT_CUTOFF        (LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX        (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

/*
 * The resulting wheel size. If NOHZ is configured we allocate two
 * wheels so we have a separate storage for the deferrable timers.
 */
#define WHEEL_SIZE        (LVL_SIZE * LVL_DEPTH)

/*
 * Number of timer bases and their indices. Without CONFIG_NO_HZ_COMMON
 * there is a single base, so BASE_STD and BASE_DEF are both index 0.
 */
#ifdef CONFIG_NO_HZ_COMMON
# define NR_BASES        2
# define BASE_STD        0
# define BASE_DEF        1
#else
# define NR_BASES        1
# define BASE_STD        0
# define BASE_DEF        0
#endif

/*
 * Per-CPU timer wheel.
 *
 * @lock:               protects the base and all timers queued on it
 *                      (see lock_timer_base())
 * @running_timer:      compared against a timer in __mod_timer() to avoid
 *                      migrating a timer whose callback is running
 * @expiry_lock:        PREEMPT_RT only; not used in this part of the file
 *                      NOTE(review): presumably lets waiters block on a
 *                      running callback - confirm against its users
 * @timer_waiters:      PREEMPT_RT only; see the note on @expiry_lock
 * @clk:                wheel clock in jiffies, advanced by
 *                      forward_timer_base()
 * @next_expiry:        earliest known bucket expiry, updated by
 *                      enqueue_timer()
 * @cpu:                CPU this base belongs to; stored in timer->flags
 *                      and used as the wake-up target
 * @next_expiry_recalc: set when a bucket becomes empty in
 *                      detach_if_pending(), cleared when enqueue_timer()
 *                      installs a new earliest expiry
 * @is_idle:            checked by trigger_dyntick_cpu() to decide whether
 *                      the CPU needs a wake-up
 * @timers_pending:     set by enqueue_timer() when it installs a new
 *                      earliest expiry
 * @pending_map:        one bit per bucket, set while the bucket is
 *                      non-empty
 * @vectors:            the WHEEL_SIZE hash buckets holding queued timers
 */
struct timer_base {
        raw_spinlock_t                lock;
        struct timer_list        *running_timer;
#ifdef CONFIG_PREEMPT_RT
        spinlock_t                expiry_lock;
        atomic_t                timer_waiters;
#endif
        unsigned long                clk;
        unsigned long                next_expiry;
        unsigned int                cpu;
        bool                        next_expiry_recalc;
        bool                        is_idle;
        bool                        timers_pending;
        DECLARE_BITMAP(pending_map, WHEEL_SIZE);
        struct hlist_head        vectors[WHEEL_SIZE];
} ____cacheline_aligned;

/* One array of NR_BASES wheels per CPU, indexed by BASE_STD / BASE_DEF */
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

#ifdef CONFIG_NO_HZ_COMMON

/* Tested by is_timers_nohz_active(); enabled from timer_update_keys() */
static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
/* Serializes timer_update_keys() against timer_migration_handler() */
static DEFINE_MUTEX(timer_keys_mutex);

/* Work item which timers_update_nohz() schedules to run timer_update_keys() */
static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);

#ifdef CONFIG_SMP
/* Timer migration sysctl value; non-zero requests migration */
unsigned int sysctl_timer_migration = 1;

DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

/*
 * Bring the timers_migration_enabled static key in line with the current
 * state: migration is on only when the sysctl requests it and
 * tick_nohz_active is set.
 */
static void timers_update_migration(void)
{
        bool want_migration = sysctl_timer_migration && tick_nohz_active;

        if (!want_migration) {
                static_branch_disable(&timers_migration_enabled);
                return;
        }

        static_branch_enable(&timers_migration_enabled);
}
#else
/* !CONFIG_SMP: there is no migration key to update */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */

/*
 * Work callback of timer_update_work: under timer_keys_mutex, re-evaluate
 * the migration static key and enable the timers_nohz_active static key.
 * @work is not used.
 */
static void timer_update_keys(struct work_struct *work)
{
        mutex_lock(&timer_keys_mutex);
        timers_update_migration();
        static_branch_enable(&timers_nohz_active);
        mutex_unlock(&timer_keys_mutex);
}

/*
 * Request an update of the timer static keys. The update itself is
 * deferred to timer_update_keys() by scheduling timer_update_work.
 */
void timers_update_nohz(void)
{
        schedule_work(&timer_update_work);
}

/*
 * sysctl handler for the timer migration knob.
 *
 * Lets proc_dointvec_minmax() process the request and, after a successful
 * write, re-evaluates the migration static key. Everything runs under
 * timer_keys_mutex so it cannot interleave with timer_update_keys().
 *
 * Returns the result of proc_dointvec_minmax().
 */
int timer_migration_handler(struct ctl_table *table, int write,
                            void *buffer, size_t *lenp, loff_t *ppos)
{
        int err;

        mutex_lock(&timer_keys_mutex);

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (write && err == 0)
                timers_update_migration();

        mutex_unlock(&timer_keys_mutex);

        return err;
}

/* True once the timers_nohz_active static key has been enabled */
static inline bool is_timers_nohz_active(void)
{
        return static_branch_unlikely(&timers_nohz_active);
}
#else
/* !CONFIG_NO_HZ_COMMON: the static key does not exist, always false */
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */

/*
 * Common worker for the round_jiffies*() family.
 *
 * @j:        absolute time in jiffies to be rounded
 * @cpu:      CPU number used to skew the rounding
 * @force_up: never round down when set
 *
 * Returns @j rounded to a (per-CPU skewed) full second, or the unmodified
 * @j if the rounded value would not be in the future.
 */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
                bool force_up)
{
        const unsigned long unrounded = j;
        int frac;

        /*
         * Spread the CPUs out so they do not all fire at once and fight
         * over the same lock or cachelines: shift by 3 jiffies per CPU
         * (a value inherited from the mm/ code), round, then undo the
         * shift further down.
         */
        j += cpu * 3;

        /* Strip the sub-second part; this is the rounded-down value. */
        frac = j % HZ;
        j -= frac;

        /*
         * A target just past a whole second (late timer irq, long irq-off
         * sections, ...) is rounded down; a quarter second is the generous
         * upper bound for "just past". Everything else, and every request
         * with @force_up set, goes up to the next full second.
         */
        if (force_up || frac >= HZ/4)
                j += HZ;

        /* Take the per-CPU skew out again. */
        j -= cpu * 3;

        /* The result is only usable if it still lies in the future. */
        if (time_is_after_jiffies(j))
                return j;

        return unrounded;
}

/**
 * __round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * Return: the rounded version of the @j parameter, or @j unmodified if
 * the rounded value would not be in the future.
 */
unsigned long __round_jiffies(unsigned long j, int cpu)
{
        return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(__round_jiffies);

/**
 * __round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies_relative() rounds a time delta  in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
        /*
         * Sample jiffies exactly once: the same base has to be used to
         * make the delta absolute and to make the result relative again,
         * and jiffies may advance while we run.
         */
        const unsigned long now = jiffies;
        unsigned long rounded_abs;

        rounded_abs = round_jiffies_common(now + j, cpu, false);

        return rounded_abs - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);

/**
 * round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The rounding is skewed using the number of the CPU the caller is
 * currently running on (raw_smp_processor_id()).
 *
 * Return: the rounded version of the @j parameter, or @j unmodified if
 * the rounded value would not be in the future.
 */
unsigned long round_jiffies(unsigned long j)
{
        return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);

/**
 * round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * This is __round_jiffies_relative() for the CPU the caller is currently
 * running on (raw_smp_processor_id()).
 *
 * Return: the rounded version of the @j parameter.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
        return __round_jiffies_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);

/**
 * __round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded-up version of the @j parameter, or @j unmodified if
 * the rounded value would not be in the future.
 */
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
        return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);

/**
 * __round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies_relative() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
        /*
         * Sample jiffies exactly once: the same base has to be used to
         * make the delta absolute and to make the result relative again,
         * and jiffies may advance while we run.
         */
        const unsigned long now = jiffies;
        unsigned long rounded_abs;

        rounded_abs = round_jiffies_common(now + j, cpu, true);

        return rounded_abs - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);

/**
 * round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * This is the same as round_jiffies() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded-up version of the @j parameter, or @j unmodified if
 * the rounded value would not be in the future.
 */
unsigned long round_jiffies_up(unsigned long j)
{
        return round_jiffies_common(j, raw_smp_processor_id(), true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);

/**
 * round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * This is the same as round_jiffies_relative() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 *
 * Return: the rounded-up version of the @j parameter.
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
        return __round_jiffies_up_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);


/* Extract the wheel bucket index stored in the timer's flags word */
static inline unsigned int timer_get_idx(struct timer_list *timer)
{
        unsigned int bucket = (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;

        return bucket;
}

/*
 * Store the wheel bucket index @idx in the timer's flags word, leaving
 * all other flag bits untouched. The flags are updated with one store.
 */
static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
        unsigned int idx_bits = idx << TIMER_ARRAYSHIFT;

        timer->flags = idx_bits | (timer->flags & ~TIMER_ARRAYMASK);
}

/*
 * Compute the wheel array index for an expiry time on a given level.
 *
 * @expires:       absolute expiry time in jiffies
 * @lvl:           wheel level the timer is queued on
 * @bucket_expiry: out: the time at which the selected bucket expires
 *
 * Returns the index of the bucket within the wheel.
 */
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
                                  unsigned long *bucket_expiry)
{
        const unsigned int shift = LVL_SHIFT(lvl);
        unsigned long slot;

        /*
         * A timer must never fire early. That could happen when
         * - the timer is armed at the edge of a tick, or
         * - the expiry time gets truncated in the outer wheel levels.
         *
         * Adding one level granularity before truncating rounds up and
         * prevents both.
         */
        slot = (expires + LVL_GRAN(lvl)) >> shift;

        *bucket_expiry = slot << shift;

        return LVL_OFFS(lvl) + (slot & LVL_MASK);
}

/*
 * Select the wheel level for a timer from its distance to the wheel clock
 * and compute the bucket index on that level.
 *
 * @expires:       absolute expiry time in jiffies
 * @clk:           current wheel clock (base->clk)
 * @bucket_expiry: out: the time at which the selected bucket expires
 *
 * Returns the index into the wheel's bucket array.
 */
static int calc_wheel_index(unsigned long expires, unsigned long clk,
                            unsigned long *bucket_expiry)
{
        /* Unsigned on purpose: an expiry in the past becomes a huge delta */
        unsigned long delta = expires - clk;
        unsigned int idx;

        if (delta < LVL_START(1)) {
                idx = calc_index(expires, 0, bucket_expiry);
        } else if (delta < LVL_START(2)) {
                idx = calc_index(expires, 1, bucket_expiry);
        } else if (delta < LVL_START(3)) {
                idx = calc_index(expires, 2, bucket_expiry);
        } else if (delta < LVL_START(4)) {
                idx = calc_index(expires, 3, bucket_expiry);
        } else if (delta < LVL_START(5)) {
                idx = calc_index(expires, 4, bucket_expiry);
        } else if (delta < LVL_START(6)) {
                idx = calc_index(expires, 5, bucket_expiry);
        } else if (delta < LVL_START(7)) {
                idx = calc_index(expires, 6, bucket_expiry);
        } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
                /* Level 7 is an inner level only when a ninth level exists */
                idx = calc_index(expires, 7, bucket_expiry);
        } else if ((long) delta < 0) {
                /*
                 * The expiry is already in the past: queue the timer in
                 * the level 0 bucket of the current clock value.
                 */
                idx = clk & LVL_MASK;
                *bucket_expiry = clk;
        } else {
                /*
                 * Force expire obscene large timeouts to expire at the
                 * capacity limit of the wheel.
                 */
                if (delta >= WHEEL_TIMEOUT_CUTOFF)
                        expires = clk + WHEEL_TIMEOUT_MAX;

                idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
        }
        return idx;
}

static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
        if (!is_timers_nohz_active())
                return;

        /*
         * TODO: This wants some optimizing similar to the code below, but we
         * will do that when we switch from push to pull for deferrable timers.
         */
        if (timer->flags & TIMER_DEFERRABLE) {
                if (tick_nohz_full_cpu(base->cpu))
                        wake_up_nohz_cpu(base->cpu);
                return;
        }

        /*
         * We might have to IPI the remote CPU if the base is idle and the
         * timer is not deferrable. If the other CPU is on the way to idle
         * then it can't set base->is_idle as we hold the base lock:
         */
        if (base->is_idle)
                wake_up_nohz_cpu(base->cpu);
}

/*
 * Queue @timer in bucket @idx of @base: link it into the hash bucket,
 * mark the bucket pending in the bitmap and record the index in the
 * timer flags. If the bucket expires before anything queued so far,
 * update the base's next expiry and wake up the target CPU if needed.
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
                          unsigned int idx, unsigned long bucket_expiry)
{
        struct hlist_head *bucket = base->vectors + idx;

        hlist_add_head(&timer->entry, bucket);
        __set_bit(idx, base->pending_map);
        timer_set_idx(timer, idx);

        trace_timer_start(timer, timer->expires, timer->flags);

        /*
         * Only a new first expiring timer needs further work. What counts
         * is the effective expiry of the timer (bucket_expiry), not
         * timer->expires.
         */
        if (!time_before(bucket_expiry, base->next_expiry))
                return;

        /*
         * Record the new next expiry time and kick the CPU so it can
         * reevaluate the wheel.
         */
        base->next_expiry = bucket_expiry;
        base->timers_pending = true;
        base->next_expiry_recalc = false;
        trigger_dyntick_cpu(base, timer);
}

/*
 * Queue @timer on @base: compute the bucket for timer->expires relative
 * to the base clock and hand the timer to enqueue_timer().
 */
static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
        unsigned long bucket_expiry;
        unsigned int idx;

        idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
        enqueue_timer(base, timer, idx, bucket_expiry);
}

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

/* Forward declaration; the descriptor is defined after its callbacks */
static const struct debug_obj_descr timer_debug_descr;

/* debugobjects hint callback: returns the callback function of the timer at @addr */
static void *timer_debug_hint(void *addr)
{
        return ((struct timer_list *) addr)->function;
}

static bool timer_is_static_object(void *addr)
{
        struct timer_list *timer = addr;

        return (timer->entry.pprev == NULL &&
                timer->entry.next == TIMER_ENTRY_STATIC);
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 *
 * An active timer is deleted synchronously and re-initialized in the
 * debugobjects core. Returns true if a fixup was applied.
 */
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        del_timer_sync(timer);
        debug_object_init(timer, &timer_debug_descr);
        return true;
}

/*
 * Stub timer callback for improperly used timers. Installed by the
 * debugobjects fixups; warns if it is ever invoked.
 */
static void stub_timer(struct timer_list *unused)
{
        WARN_ON(1);
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 *
 * An unknown timer is set up with the stub callback, which counts as a
 * fixup. Activating an already active timer only triggers a warning.
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state == ODEBUG_STATE_NOTAVAILABLE) {
                timer_setup(timer, stub_timer, 0);
                return true;
        }

        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        return false;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 *
 * An active timer is deleted synchronously and released in the
 * debugobjects core. Returns true if a fixup was applied.
 */
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        del_timer_sync(timer);
        debug_object_free(timer, &timer_debug_descr);
        return true;
}

/*
 * fixup_assert_init is called when:
 * - an untracked/uninit-ed object is found
 *
 * Such a timer is set up with the stub callback. Returns true if a fixup
 * was applied.
 */
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_NOTAVAILABLE)
                return false;

        timer_setup(timer, stub_timer, 0);
        return true;
}

/* debugobjects descriptor wiring up the timer_list hint and fixup callbacks */
static const struct debug_obj_descr timer_debug_descr = {
        .name                        = "timer_list",
        .debug_hint                = timer_debug_hint,
        .is_static_object        = timer_is_static_object,
        .fixup_init                = timer_fixup_init,
        .fixup_activate                = timer_fixup_activate,
        .fixup_free                = timer_fixup_free,
        .fixup_assert_init        = timer_fixup_assert_init,
};

/* Report initialization of @timer to debugobjects */
static inline void debug_timer_init(struct timer_list *timer)
{
        debug_object_init(timer, &timer_debug_descr);
}

/* Report activation (queueing) of @timer to debugobjects */
static inline void debug_timer_activate(struct timer_list *timer)
{
        debug_object_activate(timer, &timer_debug_descr);
}

/* Report deactivation (dequeueing) of @timer to debugobjects */
static inline void debug_timer_deactivate(struct timer_list *timer)
{
        debug_object_deactivate(timer, &timer_debug_descr);
}

/* Let debugobjects check that @timer has been initialized */
static inline void debug_timer_assert_init(struct timer_list *timer)
{
        debug_object_assert_init(timer, &timer_debug_descr);
}

/* Forward declaration; do_init_timer() is defined further down */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key);

/*
 * Initialize an on-stack timer: registers @timer with debugobjects as an
 * on-stack object and then performs the common initialization. The
 * parameters have the same meaning as for init_timer_key().
 */
void init_timer_on_stack_key(struct timer_list *timer,
                             void (*func)(struct timer_list *),
                             unsigned int flags,
                             const char *name, struct lock_class_key *key)
{
        debug_object_init_on_stack(timer, &timer_debug_descr);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);

/* Drop the debugobjects tracking of an on-stack @timer */
void destroy_timer_on_stack(struct timer_list *timer)
{
        debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_timer_on_stack);

#else
/* !CONFIG_DEBUG_OBJECTS_TIMERS: the debugobjects hooks compile to nothing */
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif

/* Timer initialization hook: debugobjects tracking plus the init tracepoint */
static inline void debug_init(struct timer_list *timer)
{
        debug_timer_init(timer);
        trace_timer_init(timer);
}

/* Timer dequeue hook: debugobjects tracking plus the cancel tracepoint */
static inline void debug_deactivate(struct timer_list *timer)
{
        debug_timer_deactivate(timer);
        trace_timer_cancel(timer);
}

/* Let debugobjects check that @timer has been initialized (no tracepoint) */
static inline void debug_assert_init(struct timer_list *timer)
{
        debug_timer_assert_init(timer);
}

/*
 * Common timer initialization.
 *
 * Marks the timer as not pending, installs the callback, stores the
 * initial flags together with the number of the current CPU and sets up
 * the lockdep map. Flag bits outside of TIMER_INIT_FLAGS trigger a
 * one-time warning and are dropped.
 */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key)
{
        const unsigned int init_flags = flags & TIMER_INIT_FLAGS;

        timer->entry.pprev = NULL;
        timer->function = func;

        /* Complain once about callers passing unsupported flag bits. */
        WARN_ON_ONCE(init_flags != flags);

        timer->flags = init_flags | raw_smp_processor_id();
        lockdep_init_map(&timer->lockdep_map, name, key, 0);
}

/**
 * init_timer_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * init_timer_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 */
void init_timer_key(struct timer_list *timer,
                    void (*func)(struct timer_list *), unsigned int flags,
                    const char *name, struct lock_class_key *key)
{
        debug_init(timer);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);

/*
 * Unlink @timer from its bucket list.
 *
 * With @clear_pending set, entry.pprev is reset to NULL as well. The
 * next pointer is always poisoned.
 */
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
        debug_deactivate(timer);

        __hlist_del(&timer->entry);

        if (clear_pending)
                timer->entry.pprev = NULL;

        timer->entry.next = LIST_POISON2;
}

/*
 * Dequeue @timer from @base if it is pending.
 *
 * If the timer is the only entry of its bucket, the bucket's bit in the
 * pending bitmap is cleared and the base is flagged for a recalculation
 * of its next expiry.
 *
 * Returns 1 if the timer was pending and has been detached, 0 otherwise.
 */
static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
                             bool clear_pending)
{
        unsigned int bucket = timer_get_idx(timer);
        struct hlist_head *head;

        if (!timer_pending(timer))
                return 0;

        head = &base->vectors[bucket];
        if (hlist_is_singular_node(&timer->entry, head)) {
                __clear_bit(bucket, base->pending_map);
                base->next_expiry_recalc = true;
        }

        detach_timer(timer, clear_pending);

        return 1;
}

/*
 * Return the timer base of @cpu which matches the timer flags @tflags:
 * the deferrable base for deferrable timers when NO_HZ_COMMON is
 * configured, the standard base in all other cases.
 */
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                return per_cpu_ptr(&timer_bases[BASE_DEF], cpu);

        return per_cpu_ptr(&timer_bases[BASE_STD], cpu);
}

/*
 * Return the timer base of the current CPU which matches the timer
 * flags @tflags.
 */
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

        /*
         * If the timer is deferrable and NO_HZ_COMMON is set then we need
         * to use the deferrable base.
         */
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                base = this_cpu_ptr(&timer_bases[BASE_DEF]);
        return base;
}

/*
 * Return the base a timer belongs to, using the CPU number encoded in
 * the TIMER_CPUMASK bits of its flags @tflags.
 */
static inline struct timer_base *get_timer_base(u32 tflags)
{
        return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}

/*
 * Pick the base a timer should be queued on.
 *
 * On SMP with NO_HZ_COMMON, a timer that is not pinned goes to the base
 * of the CPU returned by get_nohz_timer_target() while the
 * timers_migration_enabled static key is on. In every other case the
 * base of the current CPU is used. @base is not used by this function.
 */
static inline struct timer_base *
get_target_base(struct timer_base *base, unsigned tflags)
{
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
        if (static_branch_likely(&timers_migration_enabled) &&
            !(tflags & TIMER_PINNED))
                return get_timer_cpu_base(tflags, get_nohz_timer_target());
#endif
        return get_timer_this_cpu_base(tflags);
}

/*
 * Advance the wheel clock of @base towards the current jiffies value,
 * but never past the next expiry of the base.
 */
static inline void forward_timer_base(struct timer_base *base)
{
        const unsigned long now = READ_ONCE(jiffies);

        /*
         * Nothing to do if the clock is not behind jiffies. While timers
         * are executed, base->clk is even one ahead of jiffies to avoid
         * endless requeuing to the current jiffies value.
         */
        if ((long)(now - base->clk) < 1)
                return;

        /* Next expiry lies beyond jiffies: fast forward to jiffies. */
        if (time_after(base->next_expiry, now)) {
                base->clk = now;
                return;
        }

        /*
         * Otherwise forward to the next expiry value, which must not lie
         * before the current clock.
         */
        if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
                return;

        base->clk = base->next_expiry;
}


/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 *
 * Returns the locked base of @timer. Interrupts are disabled and the
 * previous interrupt state is stored in @flags.
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
                                          unsigned long *flags)
        __acquires(timer->base->lock)
{
        for (;;) {
                struct timer_base *base;
                u32 tf;

                /*
                 * We need to use READ_ONCE() here, otherwise the compiler
                 * might re-read @tf between the check for TIMER_MIGRATING
                 * and spin_lock().
                 */
                tf = READ_ONCE(timer->flags);

                if (!(tf & TIMER_MIGRATING)) {
                        base = get_timer_base(tf);
                        raw_spin_lock_irqsave(&base->lock, *flags);
                        /*
                         * Only return if the flags are unchanged now that
                         * the lock is held. Otherwise drop the lock and
                         * retry with the new flags.
                         */
                        if (timer->flags == tf)
                                return base;
                        raw_spin_unlock_irqrestore(&base->lock, *flags);
                }
                cpu_relax();
        }
}

/* Option bits for __mod_timer() */
/* Modify only a pending timer; never activate an inactive one */
#define MOD_TIMER_PENDING_ONLY                0x01
/* Modify a pending timer only if that makes it expire earlier */
#define MOD_TIMER_REDUCE                0x02
/* Skip the lockless fast path that checks for an already pending timer */
#define MOD_TIMER_NOTPENDING                0x04

/*
 * __mod_timer - common code to (re)arm a timer
 * @timer:   the timer to modify; timer->function must be set
 * @expires: new absolute expiry time in jiffies
 * @options: bitmask of MOD_TIMER_* flags
 *
 * Returns 1 if the timer was pending when it was modified (including the
 * cases where no requeue was necessary), 0 if it was not pending. With
 * MOD_TIMER_PENDING_ONLY a timer that is not pending is left inactive.
 */
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
        unsigned long clk = 0, flags, bucket_expiry;
        struct timer_base *base, *new_base;
        /* UINT_MAX means that no bucket index has been precomputed */
        unsigned int idx = UINT_MAX;
        int ret = 0;

        BUG_ON(!timer->function);

        /*
         * This is a common optimization triggered by the networking code - if
         * the timer is re-modified to have the same timeout or ends up in the
         * same array bucket then just return:
         */
        if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
                /*
                 * The downside of this optimization is that it can result in
                 * larger granularity than you would get from adding a new
                 * timer with this expiry.
                 */
                long diff = timer->expires - expires;

                if (!diff)
                        return 1;
                if (options & MOD_TIMER_REDUCE && diff <= 0)
                        return 1;

                /*
                 * We lock timer base and calculate the bucket index right
                 * here. If the timer ends up in the same bucket, then we
                 * just update the expiry time and avoid the whole
                 * dequeue/enqueue dance.
                 */
                base = lock_timer_base(timer, &flags);
                forward_timer_base(base);

                /* Repeat the lockless reduce check now that the base is locked */
                if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
                    time_before_eq(timer->expires, expires)) {
                        ret = 1;
                        goto out_unlock;
                }

                clk = base->clk;
                idx = calc_wheel_index(expires, clk, &bucket_expiry);

                /*
                 * Retrieve and compare the array index of the pending
                 * timer. If it matches set the expiry to the new value so a
                 * subsequent call will exit in the expires check above.
                 */
                if (idx == timer_get_idx(timer)) {
                        if (!(options & MOD_TIMER_REDUCE))
                                timer->expires = expires;
                        else if (time_after(timer->expires, expires))
                                timer->expires = expires;
                        ret = 1;
                        goto out_unlock;
                }
        } else {
                base = lock_timer_base(timer, &flags);
                forward_timer_base(base);
        }

        ret = detach_if_pending(timer, base, false);
        if (!ret && (options & MOD_TIMER_PENDING_ONLY))
                goto out_unlock;

        new_base = get_target_base(base, timer->flags);

        if (base != new_base) {
                /*
                 * We are trying to schedule the timer on the new base.
                 * However we can't change timer's base while it is running,
                 * otherwise timer_delete_sync() can't detect that the timer's
                 * handler has not yet finished. This also guarantees that the
                 * timer is serialized wrt itself.
                 */
                if (likely(base->running_timer != timer)) {
                        /* See the comment in lock_timer_base() */
                        timer->flags |= TIMER_MIGRATING;

                        /*
                         * Switch the base locks. Interrupts stay disabled
                         * from lock_timer_base() until out_unlock.
                         */
                        raw_spin_unlock(&base->lock);
                        base = new_base;
                        raw_spin_lock(&base->lock);
                        /*
                         * Publish the new CPU; this also clears
                         * TIMER_MIGRATING as both live in TIMER_BASEMASK
                         * NOTE(review): TIMER_BASEMASK is defined outside
                         * this part of the file - confirm.
                         */
                        WRITE_ONCE(timer->flags,
                                   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
                        forward_timer_base(base);
                }
        }

        debug_timer_activate(timer);

        timer->expires = expires;
        /*
         * If 'idx' was calculated above and the base time did not advance
         * between calculating 'idx' and possibly switching the base, only
         * enqueue_timer() is required. Otherwise we need to (re)calculate
         * the wheel index via internal_add_timer().
         */
        if (idx != UINT_MAX && clk == base->clk)
                enqueue_timer(base, timer, idx, bucket_expiry);
        else
                internal_add_timer(base, timer);

out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}

/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer:        The pending timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer_pending() is the same for pending timers as mod_timer(), but
 * will not activate inactive timers. It calls __mod_timer() with
 * MOD_TIMER_PENDING_ONLY.
 *
 * Return:
 * * %0 - The timer was inactive and not modified
 * * %1 - The timer was active and requeued to expire at @expires
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
        return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);

/**
 * mod_timer - Arm a timer or change the expiry of an armed timer
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) has the same effect as the open coded
 * sequence:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * but is more efficient. For an inactive timer the del_timer() step is a
 * NOP. In every case the timer ends up activated with the expiry time
 * @expires.
 *
 * When several unserialized concurrent users operate on the same timer,
 * mod_timer() is the only safe way to change the timeout, because
 * add_timer() cannot modify an already running timer.
 *
 * Return:
 * * %0 - The timer was inactive and started
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires did
 *          not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
        /* No MOD_TIMER_* option is passed for the plain modify operation */
        const unsigned int options = 0;

        return __mod_timer(timer, expires, options);
}
EXPORT_SYMBOL(mod_timer);

/**
 * timer_reduce - Shorten the timeout of a timer, never extend it
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * Closely related to mod_timer(). The difference is that an enqueued
 * timer is only modified when @expires would make it expire earlier.
 * A timer which is not enqueued is started.
 *
 * Return:
 * * %0 - The timer was inactive and started
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires
 *          did not change the effective expiry time such that the
 *          timer would expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
        const unsigned int options = MOD_TIMER_REDUCE;

        return __mod_timer(timer, expires, options);
}
EXPORT_SYMBOL(timer_reduce);

/**
 * add_timer - Start a timer
 * @timer:        The timer to be started
 *
 * Start @timer to expire at @timer->expires in the future. @timer->expires
 * is the absolute expiry time measured in 'jiffies'. When the timer expires
 * timer->function(timer) will be invoked from soft interrupt context.
 *
 * The @timer->expires and @timer->function fields must be set prior
 * to calling this function.
 *
 * If @timer->expires is already in the past @timer will be queued to
 * expire at the next timer tick.
 *
 * This can only operate on an inactive timer. Attempts to invoke this on
 * an active timer are rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
        /*
         * Arming an already pending timer is a caller bug, but it is a
         * recoverable one: warn once and leave the queued timer alone
         * instead of taking the whole machine down.
         */
        if (WARN_ON_ONCE(timer_pending(timer)))
                return;

        __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);

/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer:        The timer to be started
 * @cpu:        The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU.
 *
 * This can only operate on an inactive timer which has a callback
 * function set. Attempts to invoke this on a pending timer or on a timer
 * without @timer->function are rejected with a warning.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
        struct timer_base *new_base, *base;
        unsigned long flags;

        /*
         * Both conditions are caller bugs, but refusing to queue the timer
         * is all it takes to stay consistent. Warn once and bail out before
         * any lock is taken instead of crashing the machine.
         */
        if (WARN_ON_ONCE(timer_pending(timer) || !timer->function))
                return;

        new_base = get_timer_cpu_base(timer->flags, cpu);

        /*
         * If @timer was on a different CPU, it should be migrated with the
         * old base locked to prevent other operations proceeding with the
         * wrong base locked.  See lock_timer_base().
         */
        base = lock_timer_base(timer, &flags);
        if (base != new_base) {
                timer->flags |= TIMER_MIGRATING;

                raw_spin_unlock(&base->lock);
                base = new_base;
                raw_spin_lock(&base->lock);
                WRITE_ONCE(timer->flags,
                           (timer->flags & ~TIMER_BASEMASK) | cpu);
        }
        forward_timer_base(base);

        debug_timer_activate(timer);
        internal_add_timer(base, timer);
        raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);

/**
 * del_timer - Deactivate a timer.
 * @timer:        The timer to be deactivated
 *
 * Only a pending timer is deactivated. In contrast to timer_delete_sync()
 * no attention is paid to whether the timer's callback function is
 * executing concurrently on a different CPU. Rearming of the timer is not
 * prevented either. If @timer can be rearmed concurrently then the return
 * value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int del_timer(struct timer_list *timer)
{
        struct timer_base *base;
        unsigned long flags;
        int detached;

        debug_assert_init(timer);

        /* Unlocked check: a timer which is not pending needs no base lock */
        if (!timer_pending(timer))
                return 0;

        base = lock_timer_base(timer, &flags);
        detached = detach_if_pending(timer, base, true);
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return detached;
}
EXPORT_SYMBOL(del_timer);

/**
 * try_to_del_timer_sync - Try to deactivate a timer
 * @timer:        Timer to deactivate
 *
 * Attempt to deactivate @timer. If that succeeds the timer is not queued
 * and its callback function is not running on any CPU.
 *
 * Nothing here prevents the timer from being rearmed right after the base
 * lock has been dropped. If that matters, the calling code has to prevent
 * it.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
int try_to_del_timer_sync(struct timer_list *timer)
{
        struct timer_base *base;
        unsigned long flags;
        int ret;

        debug_assert_init(timer);

        base = lock_timer_base(timer, &flags);

        /* A timer whose callback is in progress cannot be removed here */
        if (base->running_timer == timer)
                ret = -1;
        else
                ret = detach_if_pending(timer, base, true);

        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}
EXPORT_SYMBOL(try_to_del_timer_sync);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT only: initialize the expiry lock of @base. */
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
        spin_lock_init(&base->expiry_lock);
}

/*
 * PREEMPT_RT only: take the expiry lock of @base. __run_timers() holds it
 * while the timers of @base are expired.
 */
static inline void timer_base_lock_expiry(struct timer_base *base)
{
        spin_lock(&base->expiry_lock);
}

/* PREEMPT_RT only: release the expiry lock of @base. */
static inline void timer_base_unlock_expiry(struct timer_base *base)
{
        spin_unlock(&base->expiry_lock);
}

/*
 * The counterpart to del_timer_wait_running().
 *
 * A waiter on base->expiry_lock is waiting for the timer callback to
 * finish. If there is one, drop expiry_lock and reacquire it, which lets
 * the waiter acquire the lock and make progress.
 *
 * Called with base->lock held and interrupts disabled; both are dropped
 * and retaken around the expiry_lock handover.
 */
static void timer_sync_wait_running(struct timer_base *base)
{
        /* Nobody is waiting: keep the locks and carry on */
        if (!atomic_read(&base->timer_waiters))
                return;

        raw_spin_unlock_irq(&base->lock);
        spin_unlock(&base->expiry_lock);
        spin_lock(&base->expiry_lock);
        raw_spin_lock_irq(&base->lock);
}

/*
 * Called on PREEMPT_RT kernels when the fast path deletion of a timer
 * failed because the timer callback function was running.
 *
 * This prevents priority inversion if the softirq thread on a remote CPU
 * got preempted, and it prevents a live lock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
        struct timer_base *base;
        u32 tf = READ_ONCE(timer->flags);

        /* Nothing is done here for a timer which is being migrated */
        if (tf & TIMER_MIGRATING)
                return;

        base = get_timer_base(tf);

        /*
         * Mark the base as contended and grab the expiry lock, which is
         * held by the softirq across the timer callback. Drop the lock
         * immediately so the softirq can expire the next timer. In theory
         * the timer could already be running again, but that's more than
         * unlikely and just causes another wait loop.
         */
        atomic_inc(&base->timer_waiters);
        spin_lock_bh(&base->expiry_lock);
        atomic_dec(&base->timer_waiters);
        spin_unlock_bh(&base->expiry_lock);
}
#else
/* !PREEMPT_RT: the expiry lock helpers are empty stubs */
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif

/**
 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
 * @timer:        The timer to be deactivated
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's callback
 * function. The timer's handler must not call add_timer_on(). Upon exit
 * the timer is not queued and the handler is not running on any CPU.
 *
 * For !irqsafe timers, the caller must not hold locks that are held in
 * interrupt context. That applies even to locks which are unrelated to
 * the timer in question.  Here's why::
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                       call_timer_fn();
 *                                       base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    timer_delete_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * In this scenario timer_delete_sync() never returns and never releases
 * somelock. The interrupt on the other CPU waits for somelock, but it has
 * interrupted the very softirq that CPU0 is waiting for.
 *
 * This function cannot guarantee that the timer is not rearmed again by
 * some concurrent or preempting code, right after it dropped the base
 * lock. If there is the possibility of a concurrent rearm then the return
 * value of the function is meaningless.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
int timer_delete_sync(struct timer_list *timer)
{
        int ret;

#ifdef CONFIG_LOCKDEP
        unsigned long flags;

        /*
         * A lockdep backtrace triggered from here points at a violation
         * of the synchronization rules documented above.
         */
        local_irq_save(flags);
        lock_map_acquire(&timer->lockdep_map);
        lock_map_release(&timer->lockdep_map);
        local_irq_restore(flags);
#endif
        /*
         * Using this in hardirq context could lead to a deadlock unless
         * the timer is an irqsafe one, so complain about it.
         */
        WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));

        for (;;) {
                ret = try_to_del_timer_sync(timer);
                if (likely(ret >= 0))
                        break;

                /* The callback is running: wait for it and try again */
                del_timer_wait_running(timer);
                cpu_relax();
        }

        return ret;
}
EXPORT_SYMBOL(timer_delete_sync);

/*
 * Invoke the timer callback @fn for @timer with lockdep and tracing
 * wrapped around it. @baseclk is only handed to the expire entry
 * tracepoint. A preempt count which was changed by the callback is
 * reported and restored.
 */
static void call_timer_fn(struct timer_list *timer,
                          void (*fn)(struct timer_list *),
                          unsigned long baseclk)
{
        int saved_count = preempt_count();

#ifdef CONFIG_LOCKDEP
        /*
         * The callback is allowed to free the timer it was invoked for.
         * Lockdep has to cope with that as well: work on a copy of
         * timer->lockdep_map so that neither bogus "held lock freed"
         * warnings nor accesses to a freed map can happen.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
        /*
         * Acquiring the lock_map around the fn() call here and in
         * timer_delete_sync() couples the lock chains of both places.
         */
        lock_map_acquire(&lockdep_map);

        trace_timer_expire_entry(timer, baseclk);
        fn(timer);
        trace_timer_expire_exit(timer);

        lock_map_release(&lockdep_map);

        if (saved_count == preempt_count())
                return;

        WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
                  fn, saved_count, preempt_count());
        /*
         * Put the preempt count back. That leaves a decent chance to
         * survive and extract information. If the callback kept a lock
         * held, bad luck, but that is not worse than the BUG() which
         * used to be here.
         */
        preempt_count_set(saved_count);
}

/*
 * Run the callbacks of all timers queued on @head. Called with base->lock
 * held; the lock is dropped around each callback invocation and retaken
 * afterwards.
 */
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
        /*
         * Needed for tracing only. base->clk was incremented right before
         * this function was called, but the expiry relates to the old
         * base->clk value.
         */
        const unsigned long baseclk = base->clk - 1;

        while (!hlist_empty(head)) {
                struct timer_list *timer = hlist_entry(head->first,
                                                       struct timer_list,
                                                       entry);
                void (*fn)(struct timer_list *);

                base->running_timer = timer;
                detach_timer(timer, true);

                /* Fetch the callback before base->lock is dropped */
                fn = timer->function;

                if (timer->flags & TIMER_IRQSAFE) {
                        /* irqsafe: interrupts are not enabled for the callback */
                        raw_spin_unlock(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock(&base->lock);
                        base->running_timer = NULL;
                        continue;
                }

                raw_spin_unlock_irq(&base->lock);
                call_timer_fn(timer, fn, baseclk);
                raw_spin_lock_irq(&base->lock);
                base->running_timer = NULL;
                timer_sync_wait_running(base);
        }
}

static int collect_expired_timers(struct timer_base *base,
                                  struct hlist_head *heads)
{
        unsigned long clk = base->clk = base->next_expiry;
        struct hlist_head *vec;
        int i, levels = 0;
        unsigned int idx;

        for (i = 0; i < LVL_DEPTH; i++) {
                idx = (clk & LVL_MASK) + i * LVL_SIZE;

                if (__test_and_clear_bit(idx, base->pending_map)) {
                        vec = base->vectors + idx;
                        hlist_move_list(vec, heads++);
                        levels++;
                }
                /* Is it time to look at the next level? */
                if (clk & LVL_CLK_MASK)
                        break;
                /* Shift clock for the next level granularity */
                clk >>= LVL_CLK_SHIFT;
        }
        return levels;
}

/*
 * Find the next pending bucket of a level. The search first covers
 * [@offset + @clk, @offset + LVL_SIZE) and, if nothing is pending there,
 * wraps around to [@offset, @offset + @clk).
 *
 * Returns the distance in buckets from @offset + @clk to the pending
 * bucket, or -1 if no bucket of the level is pending.
 */
static int next_pending_bucket(struct timer_base *base, unsigned offset,
                               unsigned clk)
{
        unsigned start = offset + clk;
        unsigned end = offset + LVL_SIZE;
        unsigned pos;

        /* Upper part of the level, starting at the current position */
        pos = find_next_bit(base->pending_map, end, start);
        if (pos < end)
                return pos - start;

        /* Wrapped part of the level, below the current position */
        pos = find_next_bit(base->pending_map, start, offset);
        if (pos < start)
                return pos + LVL_SIZE - start;

        return -1;
}

/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 *
 * Returns the expiry of the first pending bucket, or base->clk +
 * NEXT_TIMER_MAX_DELTA when nothing is pending. base->timers_pending is
 * updated accordingly and base->next_expiry_recalc is cleared.
 */
static unsigned long __next_timer_interrupt(struct timer_base *base)
{
        const unsigned long no_timer = base->clk + NEXT_TIMER_MAX_DELTA;
        unsigned long next = no_timer;
        unsigned long clk = base->clk;
        unsigned lvl, offset = 0;

        for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
                unsigned long lvl_clk = clk & LVL_CLK_MASK;
                int pos = next_pending_bucket(base, offset, clk & LVL_MASK);

                if (pos >= 0) {
                        unsigned long tmp;

                        tmp = (clk + (unsigned long) pos) << LVL_SHIFT(lvl);
                        if (time_before(tmp, next))
                                next = tmp;

                        /*
                         * An expiry which happens before the next level
                         * is reached makes the upper levels irrelevant.
                         */
                        if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
                                break;
                }

                /*
                 * Compute the clock for the next level. When the lower
                 * bits of the current level clock are zero, the next
                 * level is looked at as is. Otherwise it has to be
                 * advanced by one, because that is the next expiring
                 * bucket in that level. base->clk is the next expiring
                 * jiffy. So with:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    0
                 *
                 * all levels have to be checked @index 0. With
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    2
                 *
                 * the next expiring bucket of LVL0 is @index 2, while
                 * the upper levels have their next expiring bucket
                 * @index 1.
                 *
                 * The same rules apply when the propagation wraps the
                 * next level:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    F    2
                 *
                 * After LVL0 has been looked at this becomes:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1
                 *  0    0    0    1    0
                 *
                 * There is no propagation from LVL1 to LVL2 because the
                 * add took care of that already, but further propagation
                 * from LVL2 to LVL3 is required.
                 *
                 * Checking whether the lower bits of the current level
                 * are zero or not therefore covers all cases.
                 */
                clk = (clk >> LVL_CLK_SHIFT) + (lvl_clk ? 1 : 0);
        }

        base->next_expiry_recalc = false;
        base->timers_pending = next != no_timer;

        return next;
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Pick the earlier of the next timer wheel event (@expires) and the next
 * hrtimer event. @basem is the tick base time; it is returned when the
 * hrtimer event is already due.
 */
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
        const u64 hrt_next = hrtimer_get_next_event();

        /*
         * The timer wheel event comes first. With high resolution timers
         * enabled hrtimer_get_next_event() returns KTIME_MAX, so this is
         * always the case then.
         */
        if (hrt_next >= expires)
                return expires;

        /*
         * The next hrtimer is already expired. Hand back the tick base
         * time so that the tick is fired immediately.
         */
        if (hrt_next <= basem)
                return basem;

        /*
         * Round up to the next jiffy. High resolution timers are off, so
         * the hrtimers are expired in the tick, and this tick really has
         * to expire the timer to avoid a ping pong of the nohz stop code.
         *
         * DIV_ROUND_UP_ULL is used to prevent gcc calling __divdi3
         */
        return DIV_ROUND_UP_ULL(hrt_next, TICK_NSEC) * TICK_NSEC;
}

/**
 * get_next_timer_interrupt - return the time (clock mono) of the next timer
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 *
 * Operates on the BASE_STD timer base of the current CPU. As a side
 * effect the base clock may be forwarded and the idle state of the base
 * is updated.
 *
 * Returns the tick aligned clock monotonic time of the next pending
 * timer or KTIME_MAX if no timer is pending.
 */
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
        u64 expires = KTIME_MAX;
        unsigned long nextevt;

        /*
         * An offline CPU pretends to have no timer pending. Timers which
         * are possibly pending are migrated to an active CPU later.
         */
        if (cpu_is_offline(smp_processor_id()))
                return expires;

        raw_spin_lock(&base->lock);
        if (base->next_expiry_recalc)
                base->next_expiry = __next_timer_interrupt(base);
        nextevt = base->next_expiry;

        /*
         * The next event is fresh now. See whether the base can be
         * forwarded. That is only allowed when @basej is past base->clk,
         * otherwise base->clk might be rewound.
         */
        if (time_after(basej, base->clk)) {
                if (time_after(nextevt, basej))
                        base->clk = basej;
                else if (time_after(nextevt, base->clk))
                        base->clk = nextevt;
        }

        if (time_after(nextevt, basej)) {
                if (base->timers_pending)
                        expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
                /*
                 * Mark the base idle when the expected sleep is longer
                 * than a tick. The tick is stopped as well, so any timer
                 * which is added has to forward the base clk itself to
                 * keep the granularity small. This idle logic is only
                 * maintained for the BASE_STD base, deferrable timers
                 * may still see large granularity skew (by design).
                 */
                if ((expires - basem) > TICK_NSEC)
                        base->is_idle = true;
        } else {
                /* The next event is due already */
                expires = basem;
                base->is_idle = false;
        }
        raw_spin_unlock(&base->lock);

        return cmp_next_hrtimer_event(basem, expires);
}

/**
 * timer_clear_idle - Clear the idle state of the timer base
 *
 * Called with interrupts disabled
 */
void timer_clear_idle(void)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

        /*
         * We do this unlocked. The worst outcome is a remote enqueue sending
         * a pointless IPI, but taking the lock would just make the window for
         * sending the IPI a few instructions smaller for the cost of taking
         * the lock in the exit from idle path.
         */
        base->is_idle = false;
}
#endif

/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
        struct task_struct *p = current;

        PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);

        /* Note: this timer irq context must be accounted for as well. */
        account_process_tick(p, user_tick);
        run_local_timers();
        rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
        if (in_irq())
                irq_work_tick();
#endif
        scheduler_tick();
        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                run_posix_cpu_timers();
}

/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 *
 * Returns right away when the next expiry of @base is still in the
 * future. Otherwise the expired buckets are collected and their timers
 * are run while the expiry lock and base->lock are held.
 */
static inline void __run_timers(struct timer_base *base)
{
        struct hlist_head heads[LVL_DEPTH];
        int levels, lvl;

        if (time_before(jiffies, base->next_expiry))
                return;

        timer_base_lock_expiry(base);
        raw_spin_lock_irq(&base->lock);

        while (time_after_eq(jiffies, base->clk) &&
               time_after_eq(jiffies, base->next_expiry)) {
                levels = collect_expired_timers(base, heads);
                /*
                 * Finding no expired timer at this clk has two possible
                 * reasons: either all matching timers have been dequeued
                 * or no timer has been queued since base::next_expiry
                 * was set to base::clk + NEXT_TIMER_MAX_DELTA.
                 */
                WARN_ON_ONCE(!levels && !base->next_expiry_recalc
                             && base->timers_pending);
                base->clk++;
                base->next_expiry = __next_timer_interrupt(base);

                /* Expire the collected lists, last filled entry first */
                for (lvl = levels - 1; lvl >= 0; lvl--)
                        expire_timers(base, &heads[lvl]);
        }
        raw_spin_unlock_irq(&base->lock);
        timer_base_unlock_expiry(base);
}

/*
 * This function runs timers and the timer-tq in bottom half context.
 */
static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

        __run_timers(base);
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
                __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}

/*
 * Called by the local, per-CPU timer interrupt on SMP.
 *
 * Runs the hrtimer queues and raises TIMER_SOFTIRQ when one of this
 * CPU's timer bases has an expiry which is due.
 */
void run_local_timers(void)
{
        struct timer_base *std_base = this_cpu_ptr(&timer_bases[BASE_STD]);

        hrtimer_run_queues();

        /* Raise the softirq only if required. */
        if (time_before(jiffies, std_base->next_expiry)) {
                struct timer_base *next_base;

                if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
                        return;

                /* CPU is awake, so check the deferrable base. */
                next_base = std_base + 1;
                if (time_before(jiffies, next_base->next_expiry))
                        return;
        }
        raise_softirq(TIMER_SOFTIRQ);
}

/*
 * Since schedule_timeout()'s timer is defined on the stack, it must store
 * the target task on the stack as well.
 *
 * @timer: the timer armed by schedule_timeout()
 * @task:  the task which process_timeout() wakes up when @timer expires
 */
struct process_timer {
        struct timer_list timer;
        struct task_struct *task;
};

static void process_timeout(struct timer_list *t)
{
        struct process_timer *timeout = from_timer(timeout, t, timer);

        wake_up_process(timeout->task);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed.
 * The function behavior depends on the current task state
 * (see also set_current_state() description):
 *
 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
 * at all. That happens because sched_submit_work() does nothing for
 * tasks in %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * A negative @timeout is a caller bug. It is reported with an error
 * message and a stack dump, the task is put back into %TASK_RUNNING
 * state and 0 is returned without sleeping.
 *
 * Returns 0 when the timer has expired otherwise the remaining time in
 * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
        struct process_timer timer;
        unsigned long expire;

        switch (timeout) {
        case MAX_SCHEDULE_TIMEOUT:
                /*
                 * Special case for the convenience of the caller: sleep
                 * without a timer. MAX_SCHEDULE_TIMEOUT is a valid
                 * offset (>= 0), not a negative marker value, so the
                 * caller can use the return value without special
                 * casing it.
                 */
                schedule();
                goto out;
        default:
                /*
                 * Paranoia: nothing in the kernel is supposed to check
                 * for a negative return value of schedule_timeout(), so
                 * 0 is returned for a negative timeout. The message and
                 * the stack dump tell that something went wrong and
                 * where.
                 */
                if (timeout < 0) {
                        printk(KERN_ERR "schedule_timeout: wrong timeout value %lx\n",
                               timeout);
                        dump_stack();
                        /*
                         * Use the state helper like the rest of this
                         * file instead of writing current->state
                         * directly.
                         */
                        __set_current_state(TASK_RUNNING);
                        goto out;
                }
        }

        expire = timeout + jiffies;

        timer.task = current;
        timer_setup_on_stack(&timer.timer, process_timeout, 0);
        __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
        schedule();
        del_singleshot_timer_sync(&timer.timer);

        /* Remove the timer from the object tracker */
        destroy_timer_on_stack(&timer.timer);

        timeout = expire - jiffies;

 out:
        return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);

/*
 * Sleep in TASK_INTERRUPTIBLE state for up to @timeout jiffies.
 *
 * Using __set_current_state() is fine here because schedule_timeout()
 * calls schedule() unconditionally.
 */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_INTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_interruptible);

/* Sleep in TASK_KILLABLE state for up to @timeout jiffies. */
signed long __sched schedule_timeout_killable(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_KILLABLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_killable);

/* Sleep in TASK_UNINTERRUPTIBLE state for up to @timeout jiffies. */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_UNINTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);

/*
 * Sleep in TASK_IDLE state for up to @timeout jiffies. This is like
 * schedule_timeout_uninterruptible(), except that the task will not
 * contribute to load average.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_IDLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_idle);

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Move all timers queued on @head over to @new_base. The CPU encoded in
 * the flags of each timer is replaced with the CPU of @new_base before
 * the timer is enqueued again.
 */
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
        const int target_cpu = new_base->cpu;

        while (!hlist_empty(head)) {
                struct timer_list *timer;

                timer = hlist_entry(head->first, struct timer_list, entry);
                detach_timer(timer, false);
                timer->flags = (timer->flags & ~TIMER_BASEMASK) | target_cpu;
                internal_add_timer(new_base, timer);
        }
}

/*
 * Reset all timer bases of @cpu: the base clock is set to the current
 * jiffies, the next expiry is set to the "nothing pending" value and the
 * pending and idle markers are cleared. Always returns 0.
 */
int timers_prepare_cpu(unsigned int cpu)
{
        int idx;

        for (idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *base = per_cpu_ptr(&timer_bases[idx], cpu);

                base->clk = jiffies;
                base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
                base->timers_pending = false;
                base->is_idle = false;
        }

        return 0;
}

/*
 * Move all timers which are still queued on the bases of the offline
 * @cpu over to the matching bases of the current CPU. Always returns 0.
 */
int timers_dead_cpu(unsigned int cpu)
{
        int idx;

        BUG_ON(cpu_online(cpu));

        for (idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *old_base = per_cpu_ptr(&timer_bases[idx], cpu);
                struct timer_base *new_base = get_cpu_ptr(&timer_bases[idx]);
                int bucket;

                /*
                 * Deadlock is not possible: the caller is globally
                 * serialized and nobody else takes two locks at once.
                 */
                raw_spin_lock_irq(&new_base->lock);
                raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

                /*
                 * The base clock of the current CPU might be stale.
                 * Bring it up to date before the timers are moved over.
                 */
                forward_timer_base(new_base);

                BUG_ON(old_base->running_timer);

                for (bucket = 0; bucket < WHEEL_SIZE; bucket++)
                        migrate_timer_list(new_base, &old_base->vectors[bucket]);

                raw_spin_unlock(&old_base->lock);
                raw_spin_unlock_irq(&new_base->lock);
                put_cpu_ptr(&timer_bases);
        }

        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Boot-time setup of all NR_BASES timer bases belonging to @cpu: record
 * the owning CPU, initialise the base lock, seed the clock from jiffies,
 * place the next expiry NEXT_TIMER_MAX_DELTA after it and initialise the
 * expiry lock.
 */
static void __init init_timer_cpu(int cpu)
{
        int idx;

        for (idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *tb = per_cpu_ptr(&timer_bases[idx], cpu);

                tb->cpu = cpu;
                raw_spin_lock_init(&tb->lock);
                tb->clk = jiffies;
                tb->next_expiry = tb->clk + NEXT_TIMER_MAX_DELTA;
                timer_base_init_expiry_lock(tb);
        }
}

/* Run init_timer_cpu() once for every possible CPU. */
static void __init init_timer_cpus(void)
{
        int id;

        for_each_possible_cpu(id) {
                init_timer_cpu(id);
        }
}

/*
 * Boot-time entry point of the timer code: initialise the per-CPU timer
 * bases, set up the posix CPU timer work and register run_timer_softirq()
 * as the TIMER_SOFTIRQ handler.
 */
void __init init_timers(void)
{
        init_timer_cpus();
        posix_cputimers_init_work();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}

/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs: Time in milliseconds to sleep for
 *
 * The request is converted to jiffies plus one extra jiffy, and the task
 * keeps calling schedule_timeout_uninterruptible() until no time remains.
 */
void msleep(unsigned int msecs)
{
        unsigned long remaining;

        for (remaining = msecs_to_jiffies(msecs) + 1; remaining; )
                remaining = schedule_timeout_uninterruptible(remaining);
}

EXPORT_SYMBOL(msleep);

/**
 * msleep_interruptible - sleep waiting for signals
 * @msecs: Time in milliseconds to sleep for
 *
 * Sleeps until the converted timeout (plus one jiffy) is used up or a
 * signal is pending for the current task.
 *
 * Return: the time that was left, converted back to milliseconds.
 */
unsigned long msleep_interruptible(unsigned int msecs)
{
        unsigned long remaining = msecs_to_jiffies(msecs) + 1;

        for (;;) {
                if (!remaining || signal_pending(current))
                        break;
                remaining = schedule_timeout_interruptible(remaining);
        }

        return jiffies_to_msecs(remaining);
}

EXPORT_SYMBOL(msleep_interruptible);

/**
 * usleep_range - Sleep for an approximate time
 * @min: Minimum time in usecs to sleep
 * @max: Maximum time in usecs to sleep
 *
 * In non-atomic context where the exact wakeup time is flexible, use
 * usleep_range() instead of udelay().  Sleeping keeps the CPU available
 * for other work instead of busy-waiting as udelay() does, and giving a
 * range lets hrtimers piggyback on an interrupt that is already scheduled
 * rather than programming a new one just for this sleep, which saves power.
 */
void __sched usleep_range(unsigned long min, unsigned long max)
{
        /* Absolute expiry time: now + @min microseconds. */
        ktime_t deadline = ktime_add_us(ktime_get(), min);
        /* Width of the allowed window, in nanoseconds. */
        u64 slack_ns = (u64)(max - min) * NSEC_PER_USEC;
        int rc;

        /* Do not return before the requested sleep time has elapsed */
        do {
                __set_current_state(TASK_UNINTERRUPTIBLE);
                rc = schedule_hrtimeout_range(&deadline, slack_ns, HRTIMER_MODE_ABS);
        } while (rc);
}
EXPORT_SYMBOL(usleep_range);





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CLEANCACHE_H
#define _LINUX_CLEANCACHE_H

#include <linux/fs.h>
#include <linux/exportfs.h>
#include <linux/mm.h>

/*
 * Negative sentinel values.  cleancache_fs_enabled_mapping() treats any
 * sb->cleancache_poolid below zero as "cleancache not enabled for this
 * filesystem".
 * NOTE(review): presumably these are the values stored in that field in
 * the respective situations -- confirm against the implementation.
 */
#define CLEANCACHE_NO_POOL                -1
#define CLEANCACHE_NO_BACKEND                -2
#define CLEANCACHE_NO_BACKEND_SHARED        -3

/* Number of 32-bit words in the fh[] and key[] arrays of struct cleancache_filekey. */
#define CLEANCACHE_KEY_MAX 6

/*
 * cleancache requires every file with a page in cleancache to have a
 * unique key unless/until the file is removed/truncated.  For some
 * filesystems, the inode number is unique, but for "modern" filesystems
 * an exportable filehandle is required (see exportfs.h)
 *
 * All three members of the union overlay the same storage.
 */
struct cleancache_filekey {
        union {
                /* key given as an inode number */
                ino_t ino;
                /* key given as CLEANCACHE_KEY_MAX 32-bit words (__u32 view) */
                __u32 fh[CLEANCACHE_KEY_MAX];
                /* the same words viewed as u32 */
                u32 key[CLEANCACHE_KEY_MAX];
        } u;
};

/*
 * Table of backend callbacks; a const pointer to it is what
 * cleancache_register_ops() takes.
 *
 * NOTE(review): the parameters are unnamed here.  The leading int of the
 * page/inode/fs callbacks is presumably a pool id and the size_t of the
 * init callbacks presumably a page size -- confirm against the
 * implementation before relying on it.
 */
struct cleancache_ops {
        /* per-filesystem setup; returns an int */
        int (*init_fs)(size_t);
        /* setup variant that additionally receives a uuid */
        int (*init_shared_fs)(uuid_t *uuid, size_t);
        /* page lookup; returns an int status */
        int (*get_page)(int, struct cleancache_filekey,
                        pgoff_t, struct page *);
        /* page store; no return value */
        void (*put_page)(int, struct cleancache_filekey,
                        pgoff_t, struct page *);
        /* invalidation at page, inode (file key) and whole-fs granularity */
        void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t);
        void (*invalidate_inode)(int, struct cleancache_filekey);
        void (*invalidate_fs)(int);
};

/* Registers a backend's callback table. */
extern int cleancache_register_ops(const struct cleancache_ops *ops);
/*
 * Out-of-line implementations.  They are only reached through the inline
 * cleancache_*() wrappers in this header, which first test
 * cleancache_enabled (and, for the page/inode operations, whether the
 * filesystem has cleancache enabled).
 */
extern void __cleancache_init_fs(struct super_block *);
extern void __cleancache_init_shared_fs(struct super_block *);
extern int  __cleancache_get_page(struct page *);
extern void __cleancache_put_page(struct page *);
extern void __cleancache_invalidate_page(struct address_space *, struct page *);
extern void __cleancache_invalidate_inode(struct address_space *);
extern void __cleancache_invalidate_fs(struct super_block *);

#ifdef CONFIG_CLEANCACHE
#define cleancache_enabled (1)
/* True when the superblock behind @mapping's host inode has a pool id >= 0. */
static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping)
{
        struct super_block *sb = mapping->host->i_sb;

        return sb->cleancache_poolid >= 0;
}
/* Same test, starting from a page: uses the page's own ->mapping. */
static inline bool cleancache_fs_enabled(struct page *page)
{
        struct address_space *mapping = page->mapping;

        return cleancache_fs_enabled_mapping(mapping);
}
#else
/* Without CONFIG_CLEANCACHE every check is the constant 0. */
#define cleancache_enabled (0)
#define cleancache_fs_enabled(_page) (0)
#define cleancache_fs_enabled_mapping(_page) (0)
#endif

/*
 * The shim layer provided by these inline functions allows the compiler
 * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
 * is disabled, to a single global variable check if CONFIG_CLEANCACHE
 * is enabled but no cleancache "backend" has dynamically enabled it,
 * and, for the most frequent cleancache ops, to a single global variable
 * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
 * and a cleancache backend has dynamically enabled cleancache, but the
 * filesystem referenced by that cleancache op has not enabled cleancache.
 * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
 * no measurable performance impact.
 */

/* Forward to __cleancache_init_fs() unless cleancache is compiled out. */
static inline void cleancache_init_fs(struct super_block *sb)
{
        if (!cleancache_enabled)
                return;

        __cleancache_init_fs(sb);
}

/* Forward to __cleancache_init_shared_fs() unless cleancache is compiled out. */
static inline void cleancache_init_shared_fs(struct super_block *sb)
{
        if (!cleancache_enabled)
                return;

        __cleancache_init_shared_fs(sb);
}

/*
 * Returns -1 without calling into cleancache when it is compiled out or
 * not enabled for the page's filesystem; otherwise returns whatever
 * __cleancache_get_page() returns.
 */
static inline int cleancache_get_page(struct page *page)
{
        if (!cleancache_enabled || !cleancache_fs_enabled(page))
                return -1;

        return __cleancache_get_page(page);
}

/*
 * Hand @page to __cleancache_put_page(), but only when cleancache is
 * compiled in and enabled for the page's filesystem.
 */
static inline void cleancache_put_page(struct page *page)
{
        if (!cleancache_enabled || !cleancache_fs_enabled(page))
                return;

        __cleancache_put_page(page);
}

/*
 * Invalidate @page of @mapping in cleancache.  The enable check is made on
 * the @mapping argument rather than on page->mapping.
 */
static inline void cleancache_invalidate_page(struct address_space *mapping,
                                        struct page *page)
{
        /* careful... page->mapping is NULL sometimes when this is called */
        if (!cleancache_enabled || !cleancache_fs_enabled_mapping(mapping))
                return;

        __cleancache_invalidate_page(mapping, page);
}

/*
 * Forward to __cleancache_invalidate_inode() when cleancache is compiled
 * in and enabled for the filesystem that @mapping belongs to.
 */
static inline void cleancache_invalidate_inode(struct address_space *mapping)
{
        if (!cleancache_enabled || !cleancache_fs_enabled_mapping(mapping))
                return;

        __cleancache_invalidate_inode(mapping);
}

/* Forward to __cleancache_invalidate_fs() unless cleancache is compiled out. */
static inline void cleancache_invalidate_fs(struct super_block *sb)
{
        if (!cleancache_enabled)
                return;

        __cleancache_invalidate_fs(sb);
}

#endif /* _LINUX_CLEANCACHE_H */

















































    1 













    1 






    1 



































    1 














    1 
































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_RWSEM_H
#define _LINUX_PERCPU_RWSEM_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcuwait.h>
#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>

/*
 * Reader/writer semaphore whose reader count is kept in a per-CPU counter.
 */
struct percpu_rw_semaphore {
        /* tested with rcu_sync_is_idle() to select the reader fast path */
        struct rcu_sync                rss;
        /* per-CPU reader counter, changed with this_cpu_inc()/this_cpu_dec() */
        unsigned int __percpu        *read_count;
        /* woken with rcuwait_wake_up() from the percpu_up_read() slow path */
        struct rcuwait                writer;
        /* NOTE(review): not used in this header; presumably queues blocked tasks */
        wait_queue_head_t        waiters;
        /* NOTE(review): not used in this header; statically initialised to 0 */
        atomic_t                block;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* lockdep state; only exists with CONFIG_DEBUG_LOCK_ALLOC */
        struct lockdep_map        dep_map;
#endif
};

/*
 * Designated initializer fragment for the dep_map member (names the lock
 * after @lockname, trailing comma included); expands to nothing when
 * CONFIG_DEBUG_LOCK_ALLOC is off and the member does not exist.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)        .dep_map = { .name = #lockname },
#else
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
#endif

/*
 * Statically define a percpu_rw_semaphore called @name together with its
 * backing per-CPU reader counter (__percpu_rwsem_rc_<name>, always
 * static).  @is_static is pasted in front of the semaphore definition
 * itself, so it is either empty or the keyword "static".
 */
#define __DEFINE_PERCPU_RWSEM(name, is_static)                                \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);                \
is_static struct percpu_rw_semaphore name = {                                \
        .rss = __RCU_SYNC_INITIALIZER(name.rss),                        \
        .read_count = &__percpu_rwsem_rc_##name,                        \
        .writer = __RCUWAIT_INITIALIZER(name.writer),                        \
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),                \
        .block = ATOMIC_INIT(0),                                        \
        __PERCPU_RWSEM_DEP_MAP_INIT(name)                                \
}

/* Semaphore with external linkage. */
#define DEFINE_PERCPU_RWSEM(name)                \
        __DEFINE_PERCPU_RWSEM(name, /* not static */)
/* Semaphore with internal (static) linkage. */
#define DEFINE_STATIC_PERCPU_RWSEM(name)        \
        __DEFINE_PERCPU_RWSEM(name, static)

/*
 * Out-of-line reader slow path.  percpu_down_read() passes false as the
 * second argument and ignores the result; percpu_down_read_trylock()
 * passes true and reports the returned bool to its caller.
 */
extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);

/*
 * Take @sem for reading.  May sleep (see might_sleep()).
 *
 * Fast path: while rcu_sync_is_idle() holds, only this CPU's read_count
 * is incremented.  Otherwise the out-of-line __percpu_down_read() is used.
 * Both run with preemption disabled.
 */
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
        might_sleep();

        rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

        preempt_disable();
        /*
         * We are in an RCU-sched read-side critical section, so the writer
         * cannot both change sem->state from readers_fast and start checking
         * counters while we are here. So if we see !sem->state, we know that
         * the writer won't be checking until we're past the preempt_enable()
         * and that once the synchronize_rcu() is done, the writer will see
         * anything we did within this RCU-sched read-side critical section.
         *
         * NOTE(review): "sem->state" / "readers_fast" do not match any
         * member of struct percpu_rw_semaphore; the test actually made
         * below is rcu_sync_is_idle(&sem->rss).
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                __percpu_down_read(sem, false); /* Unconditional memory barrier */
        /*
         * The preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */
        preempt_enable();
}

/*
 * Try to take @sem for reading.
 *
 * Returns true when the read side was acquired (always the case on the
 * fast path) and otherwise whatever __percpu_down_read(sem, true) reports.
 * Lockdep is only told about the acquisition when it succeeded.
 */
static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
        bool ret = true;

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
        preempt_enable();
        /*
         * The barrier() from preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */

        if (ret)
                rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);

        return ret;
}

/*
 * Drop a read-side hold on @sem.
 *
 * Fast path: only this CPU's read_count is decremented.  Slow path (when
 * rcu_sync_is_idle() is false): a full barrier is issued before the
 * decrement and the task waiting on sem->writer is woken afterwards.
 */
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
        rwsem_release(&sem->dep_map, _RET_IP_);

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss))) {
                this_cpu_dec(*sem->read_count);
        } else {
                /*
                 * slowpath; reader will only ever wake a single blocked
                 * writer.
                 */
                smp_mb(); /* B matches C */
                /*
                 * In other words, if they see our decrement (presumably to
                 * aggregate zero, as that is the only time it matters) they
                 * will also see our critical section.
                 */
                this_cpu_dec(*sem->read_count);
                rcuwait_wake_up(&sem->writer);
        }
        preempt_enable();
}

/* Write-side acquire and release; implemented out of line. */
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);

/*
 * Runtime initialisation; normally reached through percpu_init_rwsem(),
 * which supplies the name string and a static lock class key.
 */
extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
                                const char *, struct lock_class_key *);

/* NOTE(review): presumably the teardown counterpart of __percpu_init_rwsem() -- confirm. */
extern void percpu_free_rwsem(struct percpu_rw_semaphore *);

/*
 * Initialise @sem at run time.  Each expansion site gets its own static
 * lock_class_key, and the stringified @sem expression is passed as the
 * name.  Evaluates to the int returned by __percpu_init_rwsem().
 */
#define percpu_init_rwsem(sem)                                        \
({                                                                \
        static struct lock_class_key rwsem_key;                        \
        __percpu_init_rwsem(sem, #sem, &rwsem_key);                \
})

#define percpu_rwsem_is_held(sem)        lockdep_is_held(sem)
#define percpu_rwsem_assert_held(sem)        lockdep_assert_held(sem)

/*
 * Tell lockdep that @sem is released, attributing the release to @ip.
 * Only the lockdep state is touched; @read is not used.
 */
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        lock_release(&sem->dep_map, ip);
}

/*
 * Tell lockdep that @sem is acquired, attributing the acquisition to @ip.
 * @read is forwarded as lock_acquire()'s read argument.  Only the lockdep
 * state is touched.
 */
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
}

#endif


















































































































































































































































































































































































    1 




    1 






































    1 










































































    1 














    1 

    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/capability.c
 *
 * Copyright (C) 1997  Andrew Main <zefram@fysh.org>
 *
 * Integrated into 2.1.97+,  Andrew G. Morgan <morgan@kernel.org>
 * 30 May 2002:        Cleanup, Robert M. Love <rml@tech9.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>

/*
 * Leveraged for setting/resetting capabilities
 *
 * Constant capability set initialised from CAP_EMPTY_SET and exported.
 */

const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
EXPORT_SYMBOL(__cap_empty_set);

/* 1 by default; set to 0 by the "no_file_caps" boot parameter handler. */
int file_caps_enabled = 1;

/*
 * __setup() handler for "no_file_caps": clears file_caps_enabled.
 * @str is not used.  Always returns 1.
 */
static int __init file_caps_disable(char *str)
{
        file_caps_enabled = 0;
        return 1;
}
__setup("no_file_caps", file_caps_disable);

#ifdef CONFIG_MULTIUSER
/*
 * More recent versions of libcap are available from:
 *
 *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
 */

/* One-time notice naming the current task as a user of 32-bit capabilities. */
static void warn_legacy_capability_use(void)
{
        char comm[sizeof(current->comm)];

        /* The task name is fetched as a printk argument, as before. */
        pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
                     get_task_comm(comm, current));
}

/*
 * Version 2 capabilities worked fine, but the linux/capability.h file
 * that accompanied their introduction encouraged their use without
 * the necessary user-space source code changes. As such, we have
 * created a version 3 with equivalent functionality to version 2, but
 * with a header change to protect legacy source code from using
 * version 2 when it wanted to use version 1. If your system has code
 * that trips the following warning, it is using version 2 specific
 * capabilities and may be doing so insecurely.
 *
 * The remedy is to either upgrade your version of libcap (to 2.10+,
 * if the application is linked against it), or recompile your
 * application with modern kernel headers and this warning will go
 * away.
 */

static void warn_deprecated_v2(void)
{
        char comm[sizeof(current->comm)];

        /* One-time notice; the task name is fetched as a printk argument. */
        pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
                     get_task_comm(comm, current));
}

/*
 * Version check.  On success returns 0 and stores the number of u32s in
 * each capability flag array in *@tocopy.  For an unknown version the
 * kernel's preferred version is written back to @header and -EINVAL is
 * returned; -EFAULT is returned when @header cannot be read or written.
 */
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
        __u32 version;

        if (get_user(version, &header->version))
                return -EFAULT;

        if (version == _LINUX_CAPABILITY_VERSION_1) {
                warn_legacy_capability_use();
                *tocopy = _LINUX_CAPABILITY_U32S_1;
                return 0;
        }

        if (version == _LINUX_CAPABILITY_VERSION_2 ||
            version == _LINUX_CAPABILITY_VERSION_3) {
                /* v3 is otherwise equivalent to v2 */
                if (version == _LINUX_CAPABILITY_VERSION_2)
                        warn_deprecated_v2();
                *tocopy = _LINUX_CAPABILITY_U32S_3;
                return 0;
        }

        /* Unknown version: report the version the kernel expects. */
        if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
                return -EFAULT;

        return -EINVAL;
}

/*
 * The only thing that can change the capabilities of the current
 * process is the current process. As such, we can't be in this code
 * at the same time as we are in the process of setting capabilities
 * in this process. The net result is that we can limit our use of
 * locks to when we are reading the caps of another process.
 */
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
{
        struct task_struct *target;
        int ret;

        /* pid 0 or our own pid: query the current task, no locking. */
        if (!pid || pid == task_pid_vnr(current))
                return security_capget(current, pEp, pIp, pPp);

        /* Another task: look it up and query it under rcu_read_lock(). */
        rcu_read_lock();
        target = find_task_by_vpid(pid);
        ret = target ? security_capget(target, pEp, pIp, pPp) : -ESRCH;
        rcu_read_unlock();

        return ret;
}

/**
 * sys_capget - get the capabilities of a given process.
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @dataptr: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities that are returned
 *
 * With a NULL @dataptr only the version in @header is checked; an
 * unsupported version then yields 0 rather than -EINVAL.
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
        struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
        kernel_cap_t pE, pI, pP;
        unsigned tocopy, i;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);
        if (dataptr == NULL)
                return (ret == -EINVAL) ? 0 : ret;
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        if (pid < 0)
                return -EINVAL;

        ret = cap_get_target_pid(pid, &pE, &pI, &pP);
        if (ret)
                return ret;

        for (i = 0; i < tocopy; i++) {
                kdata[i].effective = pE.cap[i];
                kdata[i].permitted = pP.cap[i];
                kdata[i].inheritable = pI.cap[i];
        }

        /*
         * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
         * we silently drop the upper capabilities here. This
         * has the effect of making older libcap
         * implementations implicitly drop upper capability
         * bits when they perform a: capget/modify/capset
         * sequence.
         *
         * This behavior is considered fail-safe
         * behavior. Upgrading the application to a newer
         * version of libcap will enable access to the newer
         * capabilities.
         *
         * An alternative would be to return an error here
         * (-ERANGE), but that causes legacy applications to
         * unexpectedly fail; the capget/modify/capset aborts
         * before modification is attempted and the application
         * fails.
         */
        if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
                return -EFAULT;

        return 0;
}

/**
 * sys_capset - set capabilities for a process or (*) a group of processes
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @data: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities
 *
 * Set capabilities for the current process only.  The ability to set them
 * for any other process(es) has been deprecated and removed.
 *
 * The restrictions on setting capabilities are specified as:
 *
 * I: any raised capabilities must be a subset of the old permitted
 * P: any raised capabilities must be a subset of the old permitted
 * E: must be set to a subset of new permitted
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
        struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
        kernel_cap_t inheritable, permitted, effective;
        unsigned i, tocopy, copybytes;
        struct cred *new;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        /* may only affect current now */
        if (pid != 0 && pid != task_pid_vnr(current))
                return -EPERM;

        /* The size check also guarantees tocopy <= _KERNEL_CAPABILITY_U32S. */
        copybytes = tocopy * sizeof(struct __user_cap_data_struct);
        if (copybytes > sizeof(kdata))
                return -EFAULT;

        if (copy_from_user(&kdata, data, copybytes))
                return -EFAULT;

        /* Words the caller did not supply are treated as all-zero. */
        for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
                bool supplied = i < tocopy;

                effective.cap[i] = supplied ? kdata[i].effective : 0;
                permitted.cap[i] = supplied ? kdata[i].permitted : 0;
                inheritable.cap[i] = supplied ? kdata[i].inheritable : 0;
        }

        /* Mask the last word with CAP_LAST_U32_VALID_MASK. */
        effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
        permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
        inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        ret = security_capset(new, current_cred(),
                              &effective, &inheritable, &permitted);
        if (ret < 0) {
                /* Rejected: drop the prepared credentials again. */
                abort_creds(new);
                return ret;
        }

        audit_log_capset(new, current_cred());

        return commit_creds(new);
}

/**
 * has_ns_capability - Does a task have a capability in a specific user ns
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * The task's credentials are checked under rcu_read_lock().
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to the specified user namespace, false if not.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability(struct task_struct *t,
                       struct user_namespace *ns, int cap)
{
        bool granted;

        rcu_read_lock();
        granted = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE) == 0;
        rcu_read_unlock();

        return granted;
}

/**
 * has_capability - Does a task have a capability in init_user_ns
 * @t: The task in question
 * @cap: The capability to be tested for
 *
 * Shorthand for has_ns_capability() with &init_user_ns as the namespace.
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to the initial user namespace, false if not.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_capability(struct task_struct *t, int cap)
{
        return has_ns_capability(t, &init_user_ns, cap);
}
EXPORT_SYMBOL(has_capability);

/**
 * has_ns_capability_noaudit - Does a task have a capability (unaudited)
 * in a specific user ns.
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * The task's credentials are checked under rcu_read_lock(), passing
 * CAP_OPT_NOAUDIT so that no audit message is written for the check.
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to the specified user namespace, false if not.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability_noaudit(struct task_struct *t,
                               struct user_namespace *ns, int cap)
{
        bool granted;

        rcu_read_lock();
        granted = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT) == 0;
        rcu_read_unlock();

        return granted;
}

/**
 * has_capability_noaudit - Does a task have a capability (unaudited) in the
 * initial user ns
 * @t: The task in question
 * @cap: The capability to be tested for
 *
 * Shorthand for has_ns_capability_noaudit() with &init_user_ns as the
 * namespace.
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to init_user_ns, false if not.  Don't write an
 * audit message for the check.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_capability_noaudit(struct task_struct *t, int cap)
{
        return has_ns_capability_noaudit(t, &init_user_ns, cap);
}

/*
 * ns_capable_common - shared implementation of the ns_capable*() checks
 * @ns:   user namespace the capability is wanted in
 * @cap:  capability to test
 * @opts: CAP_OPT_* flags passed through to security_capable()
 *
 * An invalid @cap is treated as a fatal caller bug: it is logged at
 * critical level and BUG() is hit.
 *
 * Returns true, after setting PF_SUPERPRIV on the current task, when
 * security_capable() returns 0 for the current credentials; false otherwise.
 */
static bool ns_capable_common(struct user_namespace *ns,
                              int cap,
                              unsigned int opts)
{
        int capable;

        if (unlikely(!cap_valid(cap))) {
                /* @cap is a signed int, so print it as one. */
                pr_crit("capable() called with invalid cap=%d\n", cap);
                BUG();
        }

        capable = security_capable(current_cred(), ns, cap, opts);
        if (capable == 0) {
                current->flags |= PF_SUPERPRIV;
                return true;
        }
        return false;
}

/**
 * ns_capable - Determine if the current task has a superior capability in effect
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Calls ns_capable_common() with CAP_OPT_NONE.
 *
 * Return true if the current task has the given superior capability currently
 * available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable(struct user_namespace *ns, int cap)
{
        return ns_capable_common(ns, cap, CAP_OPT_NONE);
}
EXPORT_SYMBOL(ns_capable);

/**
 * ns_capable_noaudit - Determine if the current task has a superior capability
 * (unaudited) in effect
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Calls ns_capable_common() with CAP_OPT_NOAUDIT.
 *
 * Return true if the current task has the given superior capability currently
 * available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
}
EXPORT_SYMBOL(ns_capable_noaudit);

/**
 * ns_capable_setid - Determine if the current task has a superior capability
 * in effect, while signalling that this check is being done from within a
 * setid or setgroups syscall.
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Calls ns_capable_common() with CAP_OPT_INSETID.
 *
 * Return true if the current task has the given superior capability currently
 * available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        return ns_capable_common(ns, cap, CAP_OPT_INSETID);
}
EXPORT_SYMBOL(ns_capable_setid);

/**
 * capable - Determine if the current task has a superior capability in effect
 * @cap: The capability to be tested for
 *
 * Shorthand for ns_capable() with &init_user_ns as the namespace.
 *
 * Return true if the current task has the given superior capability currently
 * available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool capable(int cap)
{
        return ns_capable(&init_user_ns, cap);
}
EXPORT_SYMBOL(capable);
#endif /* CONFIG_MULTIUSER */

/**
 * file_ns_capable - Determine if the file's opener had a capability in effect
 * @file:  The file we want to check
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * The check is made against the credentials stored in @file (f_cred).
 * An invalid @cap triggers a one-time warning and yields false.
 *
 * Return true if task that opened the file had a capability in effect
 * when the file was opened.
 *
 * This does not set PF_SUPERPRIV because the caller may not
 * actually be privileged.
 */
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
                     int cap)
{
        if (WARN_ON_ONCE(!cap_valid(cap)))
                return false;

        return security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0;
}
EXPORT_SYMBOL(file_ns_capable);

/**
 * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
 * @ns: The user namespace in question
 * @inode: The inode in question
 *
 * Return true if the inode uid and gid are within the namespace.
 */
bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
{
        return kuid_has_mapping(ns, inode->i_uid) &&
                kgid_has_mapping(ns, inode->i_gid);
}

/**
 * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
 * @inode: The inode in question
 * @cap: The capability in question
 *
 * Return true if the current task has the given capability targeted at
 * its own user namespace and that the given inode's uid and gid are
 * mapped into the current user namespace.
 */
bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
        struct user_namespace *ns = current_user_ns();

        return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);

/**
 * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
 * @tsk: The task that may be ptraced
 * @ns: The user namespace to search for CAP_SYS_PTRACE in
 *
 * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
 * in the specified user namespace.
 */
bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
        int ret = 0;  /* An absent tracer adds no restrictions */
        const struct cred *cred;

        rcu_read_lock();
        cred = rcu_dereference(tsk->ptracer_cred);
        if (cred)
                ret = security_capable(cred, ns, CAP_SYS_PTRACE,
                                       CAP_OPT_NOAUDIT);
        rcu_read_unlock();
        return (ret == 0);
}





































    1 




    1 



































































    1 









    1 














    1 





    1 













    1 








































































    1 











    1 




























    1 




















    1 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
 * which are designed to protect kernel memory from needless exposure
 * and overwrite under many unintended conditions. This code is based
 * on PAX_USERCOPY, which is:
 *
 * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
 * Security Inc.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>

/*
 * Classifies the range [obj, obj + len) against the current task's stack
 * and, where the architecture supports it, against its stack frames.
 *
 * Returns:
 *        NOT_STACK: the range does not touch the stack at all
 *        GOOD_STACK: the range is fully on the stack and
 *                    arch_within_stack_frames() had no verdict (returned 0)
 *        BAD_STACK: the range only partially overlaps the stack
 *        otherwise: whatever non-zero verdict arch_within_stack_frames()
 *                   produced (e.g. GOOD_FRAME or BAD_STACK)
 */
static noinline int check_stack_object(const void *obj, unsigned long len)
{
        const void * const stack_base = task_stack_page(current);
        const void * const stack_top = stack_base + THREAD_SIZE;
        const void * const obj_end = obj + len;
        int frame_verdict;

        /* Entirely below or entirely above the stack: not a stack object. */
        if (obj_end <= stack_base || obj >= stack_top)
                return NOT_STACK;

        /*
         * At least one end is inside the stack now, so if either end
         * sticks out, the object straddles a stack boundary.
         */
        if (obj < stack_base || obj_end > stack_top)
                return BAD_STACK;

        /* Let the architecture refine the answer down to frame level. */
        frame_verdict = arch_within_stack_frames(stack_base, stack_top,
                                                 obj, len);
        if (frame_verdict != 0)
                return frame_verdict;

        return GOOD_STACK;
}

/*
 * If these functions are reached, then CONFIG_HARDENED_USERCOPY has found
 * an unexpected state during a copy_from_user() or copy_to_user() call.
 * The __check_object_size() function performs several checks on the
 * buffer. Normal stack buffer usage should never trip the checks, and
 * kernel text addressing will always trip the check. For cache objects,
 * it is checking that only the whitelisted range of bytes for a given
 * cache is being accessed (via the cache's usersize and useroffset
 * fields). To adjust a cache whitelist, use the usercopy-aware
 * kmem_cache_create_usercopy() function to create the cache (and
 * carefully audit the whitelist range).
 *
 * usercopy_warn() only emits a one-time warning describing the copy:
 * @name is the kind of memory involved ("unknown?!" when NULL), @detail an
 * optional quoted qualifier, @to_user the copy direction, and @offset/@len
 * the reported position and size.
 */
void usercopy_warn(const char *name, const char *detail, bool to_user,
                   unsigned long offset, unsigned long len)
{
        const char *action = to_user ? "exposure" : "overwrite";
        const char *direction = to_user ? "from" : "to";
        const char *what = name ? name : "unknown?!";
        /* The detail, when present, is printed as " '<detail>'". */
        const char *quote_open = detail ? " '" : "";
        const char *quoted = detail ? detail : "";
        const char *quote_close = detail ? "'" : "";

        WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
                 action, direction, what,
                 quote_open, quoted, quote_close,
                 offset, len);
}

/*
 * Report a rejected usercopy at emergency log level and then BUG().
 * Never returns. @name is the kind of memory involved ("unknown?!" when
 * NULL), @detail an optional quoted qualifier, @to_user the copy direction,
 * and @offset/@len the reported position and size.
 */
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len)
{
        const char *action = to_user ? "exposure" : "overwrite";
        const char *direction = to_user ? "from" : "to";
        const char *what = name ? name : "unknown?!";
        /* The detail, when present, is printed as " '<detail>'". */
        const char *quote_open = detail ? " '" : "";
        const char *quoted = detail ? detail : "";
        const char *quote_close = detail ? "'" : "";

        pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
                 action, direction, what,
                 quote_open, quoted, quote_close,
                 offset, len);

        /*
         * do_group_exit() would arguably have greater effect, but BUG()
         * is what hooks all the lock-breaking and per-arch Oops code,
         * so BUG() is used here.
         */
        BUG();
}

/*
 * Returns true if any portion of [ptr,ptr+n) overlaps with [low,high).
 * Both ranges are half-open, so ranges that merely touch do not overlap.
 */
static bool overlaps(const unsigned long ptr, unsigned long n,
                     unsigned long low, unsigned long high)
{
        const unsigned long range_start = ptr;
        const unsigned long range_end = range_start + n;

        /* Overlap means: starts before the window ends and ends after it starts. */
        return range_start < high && range_end > low;
}

/*
 * Abort the copy if [ptr, ptr + n) overlaps the kernel text, either at its
 * normal addresses [_stext, _etext) or at its lm_alias() addresses. Returns
 * normally when the range touches neither.
 */
static inline void check_kernel_text_object(const unsigned long ptr,
                                            unsigned long n, bool to_user)
{
        unsigned long text_start = (unsigned long)_stext;
        unsigned long text_end = (unsigned long)_etext;
        unsigned long alias_start, alias_end;

        if (overlaps(ptr, n, text_start, text_end))
                usercopy_abort("kernel text", NULL, to_user,
                               ptr - text_start, n);

        /*
         * Some architectures map the kernel text a second time, so more
         * than one virtual kernel address points at the kernel image.
         * This usually happens when there is a separate linear physical
         * memory mapping, in which __pa() is not simply the reverse of
         * __va(). lm_alias() exposes that second mapping.
         */
        alias_start = (unsigned long)lm_alias(text_start);

        /* Alias equals the primary address: there is no second mapping. */
        if (alias_start == text_start)
                return;

        /* Otherwise the secondary mapping has to be checked as well. */
        alias_end = (unsigned long)lm_alias(text_end);
        if (!overlaps(ptr, n, alias_start, alias_end))
                return;

        usercopy_abort("linear kernel text", NULL, to_user,
                       ptr - alias_start, n);
}

/*
 * Abort the copy for addresses that cannot be valid: a range whose last
 * byte wraps around the address space, or a NULL/zero-size-allocation
 * pointer. The only caller in this file returns early for n == 0, so
 * "n - 1" is the offset of the last byte.
 */
static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
                                       bool to_user)
{
        const unsigned long last_byte = ptr + (n - 1);

        /* The last byte landing below the first means the range wrapped. */
        if (last_byte < ptr)
                usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);

        /* NULL and the zero-size allocation marker are never copyable. */
        if (ZERO_OR_NULL_PTR(ptr))
                usercopy_abort("null address", NULL, to_user, ptr, n);
}

/*
 * Checks for allocs that are marked in some way as spanning multiple pages.
 *
 * @ptr and @n describe the object being copied, @page is the head page the
 * caller resolved for @ptr, and @to_user is the copy direction. Every
 * rejection ends in usercopy_abort(), which does not return. The whole body
 * is compiled out unless CONFIG_HARDENED_USERCOPY_PAGESPAN is set.
 */
static inline void check_page_span(const void *ptr, unsigned long n,
                                   struct page *page, bool to_user)
{
#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
        /* Address of the last byte of the object (inclusive end). */
        const void *end = ptr + n - 1;
        struct page *endpage;
        bool is_reserved, is_cma;

        /*
         * Sometimes the kernel data regions are not marked Reserved (see
         * check below). And sometimes [_sdata,_edata) does not cover
         * rodata and/or bss, so check each range explicitly.
         */

        /*
         * Allow reads of kernel rodata region (if not marked as Reserved).
         * Copies in the other direction (!to_user) are rejected.
         */
        if (ptr >= (const void *)__start_rodata &&
            end <= (const void *)__end_rodata) {
                if (!to_user)
                        usercopy_abort("rodata", NULL, to_user, 0, n);
                return;
        }

        /* Allow kernel data region (if not marked as Reserved). */
        if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
                return;

        /* Allow kernel bss region (if not marked as Reserved). */
        if (ptr >= (const void *)__bss_start &&
            end <= (const void *)__bss_stop)
                return;

        /* Is the object wholly within one base page? */
        if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
                   ((unsigned long)end & (unsigned long)PAGE_MASK)))
                return;

        /* Allow if fully inside the same compound (__GFP_COMP) page. */
        endpage = virt_to_head_page(end);
        if (likely(endpage == page))
                return;

        /*
         * Allow the range only if it is entirely Reserved (i.e. special or
         * device memory) or entirely CMA. If the first page is neither,
         * reject, since the object spans several independently allocated
         * pages.
         */
        is_reserved = PageReserved(page);
        is_cma = is_migrate_cma_page(page);
        if (!is_reserved && !is_cma)
                usercopy_abort("spans multiple pages", NULL, to_user, 0, n);

        /*
         * Walk the remaining pages of the object and reject as soon as one
         * of them does not share the Reserved/CMA property of the first.
         */
        for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
                page = virt_to_head_page(ptr);
                if (is_reserved && !PageReserved(page))
                        usercopy_abort("spans Reserved and non-Reserved pages",
                                       NULL, to_user, 0, n);
                if (is_cma && !is_migrate_cma_page(page))
                        usercopy_abort("spans CMA and non-CMA pages", NULL,
                                       to_user, 0, n);
        }
#endif
}

/*
 * Validate an object that is not on the stack: slab pages are handed to
 * the slab allocator's __check_heap_object(), every other page goes through
 * check_page_span(). Addresses that fail virt_addr_valid() are not checked
 * here at all.
 */
static inline void check_heap_object(const void *ptr, unsigned long n,
                                     bool to_user)
{
        struct page *head;

        if (!virt_addr_valid(ptr))
                return;

        /*
         * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
         * highmem page or fallback to virt_to_page(). Together with
         * compound_head() this is effectively a highmem-aware
         * virt_to_head_page().
         */
        head = compound_head(kmap_to_page((void *)ptr));

        if (!PageSlab(head)) {
                /* Verify object does not incorrectly span multiple pages. */
                check_page_span(ptr, n, head, to_user);
                return;
        }

        /* Check slab allocator for flags and size. */
        __check_heap_object(ptr, n, head, to_user);
}

/* While this key is enabled, __check_object_size() returns without checking. */
static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);

/*
 * Validates that the given object is:
 * - not bogus address
 * - fully contained by stack (or stack frame, when available)
 * - fully within SLAB object (or object whitelist area, when available)
 * - not in kernel text
 *
 * A failed check ends in usercopy_abort() and does not return. A
 * zero-length copy is accepted without any checking.
 */
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
        int stack_verdict;

        if (static_branch_unlikely(&bypass_usercopy_checks))
                return;

        /* Skip all tests if size is zero. */
        if (n == 0)
                return;

        /* Check for invalid addresses. */
        check_bogus_address((const unsigned long)ptr, n, to_user);

        /* Check for bad stack object. */
        stack_verdict = check_stack_object(ptr, n);

        /*
         * Object is either in the correct frame (when it is possible to
         * check) or just generally on the process stack (when frame
         * checking is not available): nothing further to verify.
         */
        if (stack_verdict == GOOD_FRAME || stack_verdict == GOOD_STACK)
                return;

        /* Anything other than "not touching the stack" is a violation. */
        if (stack_verdict != NOT_STACK)
                usercopy_abort("process stack", NULL, to_user, 0, n);

        /* Check for bad heap object. */
        check_heap_object(ptr, n, to_user);

        /* Check for object in kernel to avoid text exposure. */
        check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
EXPORT_SYMBOL(__check_object_size);

/* Boot-time choice made via "hardened_usercopy="; checks stay on by default. */
static bool enable_checks __initdata = true;

/*
 * Parse the value of the "hardened_usercopy=" boot option into
 * enable_checks. A value strtobool() cannot parse only produces a warning.
 * Always returns 1 (presumably to mark the option as handled -- confirm
 * against the __setup() contract).
 */
static int __init parse_hardened_usercopy(char *str)
{
        const int parse_err = strtobool(str, &enable_checks);

        if (parse_err != 0)
                pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
                        str);

        return 1;
}

__setup("hardened_usercopy=", parse_hardened_usercopy);

/*
 * Late initcall that applies the boot-time choice: when the checks were
 * disabled on the command line, enable the bypass static key so that
 * __check_object_size() returns immediately.
 */
static int __init set_hardened_usercopy(void)
{
        if (!enable_checks)
                static_branch_enable(&bypass_usercopy_checks);

        /* NOTE(review): initcalls conventionally return 0 on success -- confirm 1 is intended. */
        return 1;
}

late_initcall(set_hardened_usercopy);


























































































































































































































































































































































































































    1 























    1 
    1 






































    1 






























    1 

    1 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 1994 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 * x86-64 work by Andi Kleen 2002
 */

#ifndef _ASM_X86_FPU_INTERNAL_H
#define _ASM_X86_FPU_INTERNAL_H

#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>

#include <asm/user.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>
#include <asm/cpufeature.h>
#include <asm/trace/fpu.h>

/*
 * High level FPU state handling functions:
 */
extern void fpu__prepare_read(struct fpu *fpu);
extern void fpu__prepare_write(struct fpu *fpu);
extern void fpu__save(struct fpu *fpu);
extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
extern void fpu__drop(struct fpu *fpu);
extern int  fpu__copy(struct task_struct *dst, struct task_struct *src);
extern void fpu__clear_user_states(struct fpu *fpu);
extern void fpu__clear_all(struct fpu *fpu);
extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);

/*
 * Boot time FPU initialization functions:
 */
extern void fpu__init_cpu(void);
extern void fpu__init_system_xstate(void);
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(void);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
extern u64 fpu__get_supported_xfeatures_mask(void);

/*
 * Debugging facility:
 *
 * With CONFIG_X86_DEBUG_FPU set, WARN_ON_FPU(x) is WARN_ON_ONCE(x).
 * Without it, the argument is still evaluated (so its side effects are
 * kept), but the macro never warns and always evaluates to 0.
 */
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
# define WARN_ON_FPU(x) ({ (void)(x); 0; })
#endif

/*
 * FPU related CPU feature flag helper routines:
 */

/* Returns true if static_cpu_has() reports X86_FEATURE_XSAVEOPT. */
static __always_inline __pure bool use_xsaveopt(void)
{
        return static_cpu_has(X86_FEATURE_XSAVEOPT);
}

/* Returns true if static_cpu_has() reports X86_FEATURE_XSAVE. */
static __always_inline __pure bool use_xsave(void)
{
        return static_cpu_has(X86_FEATURE_XSAVE);
}

/* Returns true if static_cpu_has() reports X86_FEATURE_FXSR. */
static __always_inline __pure bool use_fxsr(void)
{
        return static_cpu_has(X86_FEATURE_FXSR);
}

/*
 * fpstate handling functions:
 */

extern union fpregs_state init_fpstate;

extern void fpstate_init(union fpregs_state *state);
/*
 * fpstate_init_soft() is only implemented (elsewhere) when
 * CONFIG_MATH_EMULATION is set; otherwise it is an empty inline stub so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_MATH_EMULATION
extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif

static inline void fpstate_init_xstate(struct xregs_state *xsave)
{
        /*
         * XRSTORS requires these bits set in xcomp_bv, or it will
         * trigger #GP:
         */
        xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
}

/*
 * Set the two control fields of an FXSAVE-format area to their initial
 * values. No other field of @fx is touched.
 */
static inline void fpstate_init_fxstate(struct fxregs_state *fx)
{
        /* SSE control/status register default. */
        fx->mxcsr = MXCSR_DEFAULT;
        /* x87 control word; 0x37f is presumably the FNINIT default -- confirm. */
        fx->cwd = 0x37f;
}
extern void fpstate_sanitize_xstate(struct fpu *fpu);

/*
 * Execute @insn on a user-space operand, bracketed by ASM_STAC/ASM_CLAC.
 *
 * Returns 0 or the negated trap number, which results in -EFAULT for #PF.
 *
 * err lives in %eax and is preloaded with 0 through the "0"(0) input. If
 * the instruction at label 1 raises an exception, the exception table
 * entry redirects to the fixup at label 3, which negates %eax and jumps
 * back to label 2 so that ASM_CLAC still runs.
 *
 * @output and @input are the asm operand lists for @insn; the template
 * refers to them by name (e.g. %[fx]).
 */
#define user_insn(insn, output, input...)                                \
({                                                                        \
        int err;                                                        \
                                                                        \
        might_fault();                                                        \
                                                                        \
        asm volatile(ASM_STAC "\n"                                        \
                     "1: " #insn "\n"                                        \
                     "2: " ASM_CLAC "\n"                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "3:  negl %%eax\n"                                        \
                     "    jmp  2b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE_FAULT(1b, 3b)                                \
                     : [err] "=a" (err), output                                \
                     : "0"(0), input);                                        \
        err;                                                                \
})

/*
 * Execute @insn and report whether it raised an exception.
 *
 * Evaluates to 0 on success (err is preloaded with 0 through the "0"(0)
 * input) or to -1 when the instruction at label 1 raised an exception: the
 * exception table entry redirects to the fixup at label 3, which stores -1
 * in err and resumes at label 2.
 *
 * @output and @input are the asm operand lists for @insn.
 */
#define kernel_insn_err(insn, output, input...)                                \
({                                                                        \
        int err;                                                        \
        asm volatile("1:" #insn "\n\t"                                        \
                     "2:\n"                                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "3:  movl $-1,%[err]\n"                                \
                     "    jmp  2b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE(1b, 3b)                                \
                     : [err] "=r" (err), output                                \
                     : "0"(0), input);                                        \
        err;                                                                \
})

/*
 * Execute @insn without producing an error value. An exception raised by
 * the instruction at label 1 is routed through the exception table to
 * ex_handler_fprestore, with label 2 as the resume address.
 *
 * @output and @input are the asm operand lists for @insn.
 */
#define kernel_insn(insn, output, input...)                                \
        asm volatile("1:" #insn "\n\t"                                        \
                     "2:\n"                                                \
                     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore)        \
                     : output : input)

/*
 * Store the FPU state into the user buffer @fx with FNSAVE followed by
 * FWAIT. Returns the result of user_insn(): 0 on success, otherwise the
 * negated trap number.
 */
static inline int copy_fregs_to_user(struct fregs_state __user *fx)
{
        return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
}

/*
 * Store the FPU state into the user buffer @fx with FXSAVE on
 * CONFIG_X86_32 builds and FXSAVEQ otherwise. Returns the result of
 * user_insn(): 0 on success, otherwise the negated trap number.
 */
static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));

        return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
}

/*
 * Load the FPU state from the kernel buffer @fx with FXRSTOR on
 * CONFIG_X86_32 builds and FXRSTORQ otherwise. No error is returned; an
 * exception is left to the handler installed by kernel_insn().
 */
static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32)) {
                kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
                return;
        }

        kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the FPU state from the kernel buffer @fx with FXRSTOR on
 * CONFIG_X86_32 builds and FXRSTORQ otherwise. Returns the result of
 * kernel_insn_err(): 0 on success, -1 if the instruction raised an
 * exception.
 */
static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));

        return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the FPU state from the user buffer @fx with FXRSTOR on
 * CONFIG_X86_32 builds and FXRSTORQ otherwise. Returns the result of
 * user_insn(): 0 on success, otherwise the negated trap number.
 */
static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));

        return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the FPU state from the kernel buffer @fx with FRSTOR. No error is
 * returned; an exception is left to the handler installed by kernel_insn().
 */
static inline void copy_kernel_to_fregs(struct fregs_state *fx)
{
        kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the FPU state from the kernel buffer @fx with FRSTOR. Returns the
 * result of kernel_insn_err(): 0 on success, -1 if the instruction raised
 * an exception.
 */
static inline int copy_kernel_to_fregs_err(struct fregs_state *fx)
{
        return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the FPU state from the user buffer @fx with FRSTOR. Returns the
 * result of user_insn(): 0 on success, otherwise the negated trap number.
 */
static inline int copy_user_to_fregs(struct fregs_state __user *fx)
{
        return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Store the current FPU registers into @fpu->state.fxsave, using FXSAVE on
 * CONFIG_X86_32 builds and FXSAVEQ otherwise. There is no exception fixup
 * here.
 */
static inline void copy_fxregs_to_kernel(struct fpu *fpu)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
        else
                asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
}

/*
 * Store the current FPU registers into the buffer @fx, using FXSAVE on
 * CONFIG_X86_32 builds and FXSAVEQ otherwise. There is no exception fixup
 * here.
 */
static inline void fxsave(struct fxregs_state *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
        else
                asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
}

/*
 * Each macro below expands to a string that emits one instruction as raw
 * bytes via ".byte", prefixed with REX_PREFIX.
 *
 * These macros all use (%edi)/(%rdi) as the single memory argument, which
 * is why the asm statements using them in this file pass the buffer with
 * the "D" constraint.
 */
#define XSAVE                ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT        ".byte " REX_PREFIX "0x0f,0xae,0x37"
#define XSAVES                ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
#define XRSTOR                ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS                ".byte " REX_PREFIX "0x0f,0xc7,0x1f"

/*
 * Run one of the XSAVE-family byte-sequence macros (@op) on the buffer @st
 * with the feature mask split into @lmask (passed in %eax) and @hmask
 * (passed in %edx).
 *
 * After this @err contains 0 on success or the negated trap number when
 * the operation raises an exception. For faults this results in -EFAULT.
 *
 * On success the "xor" after the instruction clears @err. On an exception
 * the fixup at label 3 negates %eax and resumes at label 2, skipping the
 * "xor".
 *
 * Note that the macro parameter @err also names the asm operand [err], so
 * the variable passed in must literally be called "err" to match the
 * %[err] references inside the template string.
 */
#define XSTATE_OP(op, st, lmask, hmask, err)                                \
        asm volatile("1:" op "\n\t"                                        \
                     "xor %[err], %[err]\n"                                \
                     "2:\n\t"                                                \
                     ".pushsection .fixup,\"ax\"\n\t"                        \
                     "3: negl %%eax\n\t"                                \
                     "jmp 2b\n\t"                                        \
                     ".popsection\n\t"                                        \
                     _ASM_EXTABLE_FAULT(1b, 3b)                                \
                     : [err] "=a" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
 * format and supervisor states in addition to modified optimization in
 * XSAVEOPT.
 *
 * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
 * supports modified optimization which is not supported by XSAVE.
 *
 * We use XSAVE as a fallback.
 *
 * The 661 label is defined in the ALTERNATIVE* macros as the address of the
 * original instruction which gets replaced. We need to use it here as the
 * address of the instruction where we might get an exception at.
 *
 * @err is cleared by the "xor" on success; if the save instruction raises
 * an exception, the fixup at label 4 sets it to -2 and resumes at label 3.
 * The macro parameter @err also names the asm operand [err], so the
 * variable passed in must literally be called "err".
 */
#define XSTATE_XSAVE(st, lmask, hmask, err)                                \
        asm volatile(ALTERNATIVE_2(XSAVE,                                \
                                   XSAVEOPT, X86_FEATURE_XSAVEOPT,        \
                                   XSAVES,   X86_FEATURE_XSAVES)        \
                     "\n"                                                \
                     "xor %[err], %[err]\n"                                \
                     "3:\n"                                                \
                     ".pushsection .fixup,\"ax\"\n"                        \
                     "4: movl $-2, %[err]\n"                                \
                     "jmp 3b\n"                                                \
                     ".popsection\n"                                        \
                     _ASM_EXTABLE(661b, 4b)                                \
                     : [err] "=r" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
 * XSAVE area format.
 *
 * No error value is produced: an exception at the restore instruction
 * (label 661 from ALTERNATIVE) is routed through the exception table to
 * ex_handler_fprestore, with label 3 as the resume address.
 */
#define XSTATE_XRESTORE(st, lmask, hmask)                                \
        asm volatile(ALTERNATIVE(XRSTOR,                                \
                                 XRSTORS, X86_FEATURE_XSAVES)                \
                     "\n"                                                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
                     :                                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Boot-time-only restore of the xstate in @xstate with every feature bit
 * requested. x86 caps are not set up this early and alternatives cannot be
 * used yet, so the instruction is chosen with boot_cpu_has() instead.
 * Warns if called when system_state is not SYSTEM_BOOTING.
 */
static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
{
        const u64 all_features = -1;
        u32 mask_lo = all_features;
        u32 mask_hi = all_features >> 32;
        int err;        /* must be named "err": XSTATE_OP uses it as operand name */

        WARN_ON(system_state != SYSTEM_BOOTING);

        if (!boot_cpu_has(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTOR, xstate, mask_lo, mask_hi, err);
        else
                XSTATE_OP(XRSTORS, xstate, mask_lo, mask_hi, err);

        /*
         * Copying from a kernel buffer should never fault, and the FPU
         * state set up at boot time should be valid.
         */
        WARN_ON_FPU(err);
}

/*
 * Save processor xstate to the kernel xsave area @xstate, requesting every
 * feature in xfeatures_mask_all. The save instruction is selected through
 * alternatives, so this warns (in FPU debug builds) if they are not
 * patched yet.
 */
static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
{
        const u64 features = xfeatures_mask_all;
        u32 mask_lo = features;
        u32 mask_hi = features >> 32;
        int err;        /* must be named "err": XSTATE_XSAVE uses it as operand name */

        WARN_ON_FPU(!alternatives_patched);

        XSTATE_XSAVE(xstate, mask_lo, mask_hi, err);

        /* Saving into a kernel buffer should never fault. */
        WARN_ON_FPU(err);
}

/*
 * Restore processor xstate from the kernel xsave area @xstate, limited to
 * the features selected by @mask. No error is returned; exceptions are
 * left to the handler installed by XSTATE_XRESTORE.
 */
static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
{
        /* The instruction takes the 64-bit mask as two 32-bit halves. */
        const u32 mask_lo = mask;
        const u32 mask_hi = mask >> 32;

        XSTATE_XRESTORE(xstate, mask_lo, mask_hi);
}

/*
 * Save xstate to the user space xsave area @buf, requesting the features
 * in xfeatures_mask_user().
 *
 * Plain XSAVE is used on purpose:
 *  - no modified optimization, because xrstor/xrstors might track a
 *    different application;
 *  - no compacted format, for backward compatibility with old
 *    applications which don't understand the compacted xsave layout.
 *
 * Returns 0 on success, -EFAULT if the header could not be cleared, or
 * otherwise the error value produced by XSTATE_OP.
 */
static inline int copy_xregs_to_user(struct xregs_state __user *buf)
{
        const u64 user_features = xfeatures_mask_user();
        u32 mask_lo = user_features;
        u32 mask_hi = user_features >> 32;
        int err;        /* must be named "err": XSTATE_OP uses it as operand name */

        /*
         * Zero the xsave header up front so that its reserved fields
         * start out as zero.
         */
        err = __clear_user(&buf->header, sizeof(buf->header));
        if (unlikely(err))
                return -EFAULT;

        /* User access is opened only around the save instruction itself. */
        stac();
        XSTATE_OP(XSAVE, buf, mask_lo, mask_hi, err);
        clac();

        return err;
}

/*
 * Restore xstate from the user space xsave area @buf with XRSTOR, limited
 * to the features selected by @mask. Returns the error value produced by
 * XSTATE_OP: 0 on success, otherwise the negated trap number.
 */
static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
{
        /* Drop the __user annotation so the pointer can be an asm operand. */
        struct xregs_state *user_area = ((__force struct xregs_state *)buf);
        u32 mask_lo = mask;
        u32 mask_hi = mask >> 32;
        int err;        /* must be named "err": XSTATE_OP uses it as operand name */

        /* User access is opened only around the restore instruction itself. */
        stac();
        XSTATE_OP(XRSTOR, user_area, mask_lo, mask_hi, err);
        clac();

        return err;
}

/*
 * Restore xstate from the kernel space xsave area @xstate, limited to the
 * features selected by @mask, and return an error code instead of taking
 * an exception. XRSTORS is used when the CPU has X86_FEATURE_XSAVES,
 * XRSTOR otherwise.
 */
static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
{
        u32 mask_lo = mask;
        u32 mask_hi = mask >> 32;
        int err;        /* must be named "err": XSTATE_OP uses it as operand name */

        if (!static_cpu_has(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTOR, xstate, mask_lo, mask_hi, err);
        else
                XSTATE_OP(XRSTORS, xstate, mask_lo, mask_hi, err);

        return err;
}

extern int copy_fpregs_to_fpstate(struct fpu *fpu);

/*
 * Restore the FPU registers from @fpstate using the first available
 * format, tried in the order xsave, fxsave, fsave. @mask is only passed
 * on to the xsave restore.
 */
static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
{
        if (use_xsave()) {
                copy_kernel_to_xregs(&fpstate->xsave, mask);
                return;
        }

        if (use_fxsr())
                copy_kernel_to_fxregs(&fpstate->fxsave);
        else
                copy_kernel_to_fregs(&fpstate->fsave);
}

/*
 * Restore the FPU registers from @fpstate.
 *
 * On CPUs flagged with X86_BUG_FXSAVE_LEAK the x87 state is first set to
 * fixed values; the restore is then done by __copy_kernel_to_fpregs() with
 * an all-ones feature mask.
 */
static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
{
        /*
         * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
         * pending. Clear the x87 state here by setting it to fixed values.
         * The "m" operand is the local pointer variable 'fpstate' itself (not
         * the state it points to): any variable that should be in L1 will do.
         */
        if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
                asm volatile(
                        "fnclex\n\t"
                        "emms\n\t"
                        "fildl %P[addr]"        /* set F?P to defined value */
                        : : [addr] "m" (fpstate));
        }

        /* -1 converts to a u64 mask with every bit set. */
        __copy_kernel_to_fpregs(fpstate, -1);
}

/*
 * Defined outside this header; @buf and @fp are both user space pointers.
 * NOTE(review): presumably writes the task's FPU state into a signal frame
 * of @size bytes - confirm against the definition.
 */
extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);

/*
 * FPU context switch related helper methods:
 */

/*
 * Per-CPU pointer to a struct fpu. fpregs_activate() stores an fpu here,
 * fpregs_deactivate() and __cpu_invalidate_fpregs_state() store NULL, and
 * fpregs_state_valid() compares it against the fpu being checked.
 */
DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * The in-register FPU state for an FPU context on a CPU is assumed to be
 * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
 * matches the FPU.
 *
 * If the FPU register state is valid, the kernel can skip restoring the
 * FPU state from memory.
 *
 * Any code that clobbers the FPU registers or updates the in-memory
 * FPU state for a task MUST let the rest of the kernel know that the
 * FPU registers are no longer valid for this task.
 *
 * Either one of these invalidation functions is enough. Invalidate
 * a resource you control: CPU if using the CPU for something else
 * (with preemption disabled), FPU for the current task, or a task that
 * is prevented from running by the current task.
 */
static inline void __cpu_invalidate_fpregs_state(void)
{
        /*
         * CPU-side invalidation: with the owner pointer set to NULL,
         * fpregs_state_valid() cannot match any non-NULL fpu on this CPU.
         */
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}

static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
        /*
         * FPU-side invalidation: fpregs_state_valid() compares last_cpu with
         * a CPU number. NOTE(review): relies on -1 never being a valid CPU
         * number - confirm against the type of last_cpu.
         */
        fpu->last_cpu = -1;
}

/*
 * Returns non-zero when both conditions hold: this CPU's
 * fpu_fpregs_owner_ctx matches @fpu, and @fpu->last_cpu matches @cpu.
 */
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
        /* last_cpu is only looked at once the owner pointer has matched. */
        if (this_cpu_read(fpu_fpregs_owner_ctx) != fpu)
                return 0;

        return fpu->last_cpu == cpu;
}

/*
 * These generally need preemption protection to work,
 * do try to avoid using these on their own:
 */
/*
 * Clear this CPU's fpu_fpregs_owner_ctx and emit the "regs deactivated"
 * trace event. @fpu is only used for the trace event; the owner pointer is
 * set to NULL regardless of its previous value.
 */
static inline void fpregs_deactivate(struct fpu *fpu)
{
        this_cpu_write(fpu_fpregs_owner_ctx, NULL);
        trace_x86_fpu_regs_deactivated(fpu);
}

/*
 * Record @fpu in this CPU's fpu_fpregs_owner_ctx and emit the
 * "regs activated" trace event. fpu->last_cpu is not touched here.
 */
static inline void fpregs_activate(struct fpu *fpu)
{
        this_cpu_write(fpu_fpregs_owner_ctx, fpu);
        trace_x86_fpu_regs_activated(fpu);
}

/*
 * Internal helper, do not use directly. Use switch_fpu_return() instead.
 *
 * Reloads the current task's FPU registers from memory unless they are
 * still valid on this CPU, then clears TIF_NEED_FPU_LOAD. Warns once and
 * does nothing when called for a PF_KTHREAD task.
 */
static inline void __fpregs_load_activate(void)
{
        struct fpu *cur_fpu = &current->thread.fpu;
        int cpu_id = smp_processor_id();

        if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
                return;

        if (fpregs_state_valid(cur_fpu, cpu_id)) {
                /* Registers are still valid: no reload needed. */
                clear_thread_flag(TIF_NEED_FPU_LOAD);
                return;
        }

        /* Reload from memory, then record the new owner and CPU. */
        copy_kernel_to_fpregs(&cur_fpu->state);
        fpregs_activate(cur_fpu);
        cur_fpu->last_cpu = cpu_id;

        clear_thread_flag(TIF_NEED_FPU_LOAD);
}

/*
 * FPU state switching for scheduling.
 *
 * Switching is done in two stages:
 *
 *  - switch_fpu_prepare() saves the old state, running in the context of
 *    the old process.
 *
 *  - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
 *    is then loaded on return to userspace, or when the kernel needs it.
 *
 * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
 * are saved in the current thread's FPU register state.
 *
 * If TIF_NEED_FPU_LOAD is set then the CPU's FPU registers may not
 * hold current()'s FPU registers. The registers have to be loaded
 * before returning to userland or before using their content
 * otherwise.
 *
 * The FPU context is only stored/restored for a user task;
 * PF_KTHREAD is used to tell kernel threads from user threads.
 */
static inline void switch_fpu_prepare(struct task_struct *prev, int cpu)
{
        struct fpu *prev_fpu = &prev->thread.fpu;

        /* Nothing is saved without an FPU, or for a kernel thread. */
        if (!static_cpu_has(X86_FEATURE_FPU) || (prev->flags & PF_KTHREAD))
                return;

        /*
         * A zero return from the save invalidates last_cpu; any other
         * return records the CPU passed in by the caller.
         */
        prev_fpu->last_cpu = copy_fpregs_to_fpstate(prev_fpu) ? cpu : -1;

        /* But leave fpu_fpregs_owner_ctx! */
        trace_x86_fpu_regs_deactivated(prev_fpu);
}

/*
 * Misc helper functions:
 */

/*
 * Flag the FPU state for a deferred reload and, where protection keys are
 * enabled, load PKRU right away from the FPU context if it is available.
 * Loading of the complete FPU state is delayed until the return to
 * userland.
 */
static inline void switch_fpu_finish(struct task_struct *next)
{
        u32 pkru = init_pkru_value;

        if (!static_cpu_has(X86_FEATURE_FPU))
                return;

        set_thread_flag(TIF_NEED_FPU_LOAD);

        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return;

        /*
         * PKRU state is switched eagerly because it needs to be valid before we
         * return to userland e.g. for a copy_to_user() operation.
         *
         * Kernel threads (PF_KTHREAD) keep init_pkru_value.
         */
        if (!(next->flags & PF_KTHREAD)) {
                /*
                 * If the PKRU bit in xsave.header.xfeatures is not set,
                 * the PKRU component was in init state, which means
                 * XRSTOR will set PKRU to 0. In that case
                 * get_xsave_addr() returns NULL because the PKRU value
                 * in memory is not valid, so 0 has to be written and
                 * not init_pkru_value.
                 */
                struct pkru_state *saved;

                saved = get_xsave_addr(&next->thread.fpu.state.xsave,
                                       XFEATURE_PKRU);
                if (saved)
                        pkru = saved->pkru;
                else
                        pkru = 0;
        }

        __write_pkru(pkru);
}

#endif /* _ASM_X86_FPU_INTERNAL_H */












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the AF_INET socket handler.
 *
 * Version:        @(#)sock.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche <flla@stud.uni-sb.de>
 *
 * Fixes:
 *                Alan Cox        :        Volatiles in skbuff pointers. See
 *                                        skbuff comments. May be overdone,
 *                                        better to prove they can be removed
 *                                        than the reverse.
 *                Alan Cox        :        Added a zapped field for tcp to note
 *                                        a socket is reset and must stay shut up
 *                Alan Cox        :        New fields for options
 *        Pauline Middelink        :        identd support
 *                Alan Cox        :        Eliminate low level recv/recvfrom
 *                David S. Miller        :        New socket lookup architecture.
 *              Steve Whitehouse:       Default routines for sock_ops
 *              Arnaldo C. Melo :        removed net_pinfo, tp_pinfo and made
 *                                      protinfo be just a void pointer, as the
 *                                      protocol specific parts were moved to
 *                                      respective headers and ipv4/v6, etc now
 *                                      use private slabcaches for its socks
 *              Pedro Hortas        :        New flags field for socket options
 */
#ifndef _SOCK_H
#define _SOCK_H

#include <linux/hardirq.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/timer.h>
#include <linux/cache.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>        /* struct sk_buff */
#include <linux/mm.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/static_key.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cgroup-defs.h>
#include <linux/rbtree.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
#include <linux/sockptr.h>

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <net/dst.h>
#include <net/checksum.h>
#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
#include <net/l3mdev.h>

/*
 * This structure really needs to be cleaned up.
 * Most of it is for TCP, and not used by any of
 * the other protocols.
 */

/* Define this to get the SOCK_DBG debugging facility. */
#define SOCK_DEBUGGING
#ifdef SOCK_DEBUGGING
/*
 * Print @msg through printk(KERN_DEBUG ...) only when @sk is non-NULL and
 * sock_flag() reports SOCK_DBG for it. The @sk argument is expanded twice,
 * so it should not have side effects.
 */
#define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \
                                        printk(KERN_DEBUG msg); } while (0)
#else
/*
 * Validate arguments and do nothing.
 *
 * This branch is not compiled as long as SOCK_DEBUGGING is defined
 * unconditionally above; __printf(2, 3) keeps the format string checked
 * against its arguments.
 */
static inline __printf(2, 3)
void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
{
}
#endif

/* This is the per-socket lock.  The spinlock provides a synchronization
 * between user contexts and software interrupt processing, whereas the
 * mini-semaphore synchronizes multiple users amongst themselves.
 */
typedef struct {
        /* The spinlock described above. */
        spinlock_t                slock;
        /*
         * NOTE(review): presumably the "mini-semaphore" state, non-zero
         * while a user context owns the socket - confirm against the
         * lock_sock()/release_sock() implementation.
         */
        int                        owned;
        /* NOTE(review): presumably where waiters for 'owned' sleep - confirm. */
        wait_queue_head_t        wq;
        /*
         * We express the mutex-alike socket_lock semantics
         * to the lock validator by explicitly managing
         * the slock as a lock variant (in addition to
         * the slock itself):
         */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Only present in CONFIG_DEBUG_LOCK_ALLOC builds. */
        struct lockdep_map dep_map;
#endif
} socket_lock_t;

struct sock;
struct proto;
struct net;

typedef __u32 __bitwise __portpair;
typedef __u64 __bitwise __addrpair;

/**
 *        struct sock_common - minimal network layer representation of sockets
 *        @skc_daddr: Foreign IPv4 addr
 *        @skc_rcv_saddr: Bound local IPv4 addr
 *        @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
 *        @skc_hash: hash value used with various protocol lookup tables
 *        @skc_u16hashes: two u16 hash values used by UDP lookup tables
 *        @skc_dport: placeholder for inet_dport/tw_dport
 *        @skc_num: placeholder for inet_num/tw_num
 *        @skc_portpair: __u32 union of @skc_dport & @skc_num
 *        @skc_family: network address family
 *        @skc_state: Connection state
 *        @skc_reuse: %SO_REUSEADDR setting
 *        @skc_reuseport: %SO_REUSEPORT setting
 *        @skc_ipv6only: socket is IPV6 only
 *        @skc_net_refcnt: socket is using net ref counting
 *        @skc_bound_dev_if: bound device index if != 0
 *        @skc_bind_node: bind hash linkage for various protocol lookup tables
 *        @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
 *        @skc_prot: protocol handlers inside a network family
 *        @skc_net: reference to the network namespace of this socket
 *        @skc_v6_daddr: IPV6 destination address
 *        @skc_v6_rcv_saddr: IPV6 source address
 *        @skc_cookie: socket's cookie value
 *        @skc_node: main hash linkage for various protocol lookup tables
 *        @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
 *        @skc_tx_queue_mapping: tx queue number for this connection
 *        @skc_rx_queue_mapping: rx queue number for this connection
 *        @skc_flags: place holder for sk_flags
 *                %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 *                %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 *        @skc_listener: connection request listener socket (aka rsk_listener)
 *                [union with @skc_flags]
 *        @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
 *                [union with @skc_flags]
 *        @skc_incoming_cpu: record/match cpu processing incoming packets
 *        @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
 *                [union with @skc_incoming_cpu]
 *        @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
 *                [union with @skc_incoming_cpu]
 *        @skc_refcnt: reference count
 *
 *        This is the minimal network layer representation of sockets, the header
 *        for struct sock and struct inet_timewait_sock.
 */
struct sock_common {
        /* skc_addrpair overlays the two IPv4 addresses as a single value */
        union {
                __addrpair        skc_addrpair;
                struct {
                        __be32        skc_daddr;
                        __be32        skc_rcv_saddr;
                };
        };
        /* one 32-bit hash, or two 16-bit hashes sharing the same storage */
        union  {
                unsigned int        skc_hash;
                __u16                skc_u16hashes[2];
        };
        /* skc_dport && skc_num must be grouped as well */
        union {
                __portpair        skc_portpair;
                struct {
                        __be16        skc_dport;
                        __u16        skc_num;
                };
        };

        unsigned short                skc_family;
        volatile unsigned char        skc_state;
        /* the four bitfields below add up to 7 bits */
        unsigned char                skc_reuse:4;
        unsigned char                skc_reuseport:1;
        unsigned char                skc_ipv6only:1;
        unsigned char                skc_net_refcnt:1;
        int                        skc_bound_dev_if;
        union {
                struct hlist_node        skc_bind_node;
                struct hlist_node        skc_portaddr_node;
        };
        struct proto                *skc_prot;
        possible_net_t                skc_net;

#if IS_ENABLED(CONFIG_IPV6)
        struct in6_addr                skc_v6_daddr;
        struct in6_addr                skc_v6_rcv_saddr;
#endif

        atomic64_t                skc_cookie;

        /* following fields are padding to force
         * offset(struct sock, sk_refcnt) == 128 on 64bit arches
         * assuming IPV6 is enabled. We use this padding differently
         * for different kind of 'sockets'
         */
        union {
                unsigned long        skc_flags;
                struct sock        *skc_listener; /* request_sock */
                struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
        };
        /*
         * fields between dontcopy_begin/dontcopy_end
         * are not copied in sock_copy()
         */
        /* private: */
        /* zero-length array: a position marker only, it holds no data */
        int                        skc_dontcopy_begin[0];
        /* public: */
        union {
                struct hlist_node        skc_node;
                struct hlist_nulls_node skc_nulls_node;
        };
        unsigned short                skc_tx_queue_mapping;
#ifdef CONFIG_XPS
        unsigned short                skc_rx_queue_mapping;
#endif
        union {
                int                skc_incoming_cpu;
                u32                skc_rcv_wnd;
                u32                skc_tw_rcv_nxt; /* struct tcp_timewait_sock  */
        };

        refcount_t                skc_refcnt;
        /* private: */
        /* zero-length array: end marker of the dontcopy region */
        int                     skc_dontcopy_end[0];
        /*
         * The three members below share storage. They sit under the
         * "private:" marker and have no entry in the kernel-doc block
         * above this struct.
         */
        union {
                u32                skc_rxhash;
                u32                skc_window_clamp;
                u32                skc_tw_snd_nxt; /* struct tcp_timewait_sock */
        };
        /* public: */
};

struct bpf_local_storage;

/**
  *        struct sock - network layer representation of sockets
  *        @__sk_common: shared layout with inet_timewait_sock
  *        @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *        @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *        @sk_lock:        synchronizer
  *        @sk_kern_sock: True if sock is using kernel lock classes
  *        @sk_rcvbuf: size of receive buffer in bytes
  *        @sk_wq: sock wait queue and async head
  *        @sk_rx_dst: receive input route used by early demux
  *        @sk_dst_cache: destination cache
  *        @sk_dst_pending_confirm: need to confirm neighbour
  *        @sk_policy: flow policy
  *        @sk_rx_skb_cache: cache copy of recently accessed RX skb
  *        @sk_receive_queue: incoming packets
  *        @sk_wmem_alloc: transmit queue bytes committed
  *        @sk_tsq_flags: TCP Small Queues flags
  *        @sk_write_queue: Packet sending queue
  *        @sk_omem_alloc: "o" is "option" or "other"
  *        @sk_wmem_queued: persistent queue size
  *        @sk_forward_alloc: space allocated forward
  *        @sk_napi_id: id of the last napi context to receive data for sk
  *        @sk_ll_usec: usecs to busypoll when there is no data
  *        @sk_allocation: allocation mode
  *        @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *        @sk_pacing_status: Pacing status (requested, handled by sch_fq)
  *        @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
  *        @sk_sndbuf: size of send buffer in bytes
  *        @sk_padding: unused element for alignment
  *        @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
  *        @sk_no_check_rx: allow zero checksum in RX packets
  *        @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *        @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
  *        @sk_route_forced_caps: static, forced route capabilities
  *                (set in tcp_init_sock())
  *        @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  *        @sk_gso_max_size: Maximum GSO segment size to build
  *        @sk_gso_max_segs: Maximum number of GSO segments
  *        @sk_pacing_shift: scaling factor for TCP Small Queues
  *        @sk_lingertime: %SO_LINGER l_linger setting
  *        @sk_backlog: always used with the per-socket spinlock held
  *        @sk_callback_lock: used with the callbacks in the end of this struct
  *        @sk_error_queue: rarely used
  *        @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
  *                          IPV6_ADDRFORM for instance)
  *        @sk_err: last error
  *        @sk_err_soft: errors that don't cause failure but are the cause of a
  *                      persistent failure not just 'timed out'
  *        @sk_drops: raw/udp drops counter
  *        @sk_ack_backlog: current listen backlog
  *        @sk_max_ack_backlog: listen backlog set in listen()
  *        @sk_uid: user id of owner
  *        @sk_priority: %SO_PRIORITY setting
  *        @sk_type: socket type (%SOCK_STREAM, etc)
  *        @sk_protocol: which protocol this socket belongs in this network family
  *        @sk_peer_lock: spinlock declared next to @sk_peer_pid and @sk_peer_cred
  *                       (presumably protects them -- confirm against its users)
  *        @sk_peer_pid: &struct pid for this socket's peer
  *        @sk_peer_cred: %SO_PEERCRED setting
  *        @sk_rcvlowat: %SO_RCVLOWAT setting
  *        @sk_rcvtimeo: %SO_RCVTIMEO setting
  *        @sk_sndtimeo: %SO_SNDTIMEO setting
  *        @sk_txhash: computed flow hash for use on transmit
  *        @sk_filter: socket filtering instructions
  *        @sk_timer: sock cleanup timer
  *        @sk_stamp: time stamp of last packet received
  *        @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
  *        @sk_tsflags: SO_TIMESTAMPING socket options
  *        @sk_tskey: counter to disambiguate concurrent tstamp requests
  *        @sk_zckey: counter to order MSG_ZEROCOPY notifications
  *        @sk_socket: Identd and reporting IO signals
  *        @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
  *        @sk_frag: cached page frag
  *        @sk_peek_off: current peek_offset value
  *        @sk_send_head: front of stuff to transmit
  *        @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
  *        @sk_tx_skb_cache: cache copy of recently accessed TX skb
  *        @sk_security: used by security modules
  *        @sk_mark: generic packet mark
  *        @sk_cgrp_data: cgroup data for this cgroup
  *        @sk_memcg: this socket's memory cgroup association
  *        @sk_write_pending: a write to stream socket waits to start
  *        @sk_wait_pending: number of threads blocked on this socket
  *        @sk_state_change: callback to indicate change in the state of the sock
  *        @sk_data_ready: callback to indicate there is data to be processed
  *        @sk_write_space: callback to indicate there is bf sending space available
  *        @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
  *        @sk_backlog_rcv: callback to process the backlog
  *        @sk_validate_xmit_skb: ptr to an optional validate function
  *        @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
  *        @sk_reuseport_cb: reuseport group container
  *        @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
  *        @sk_rcu: used during RCU grace period
  *        @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
  *        @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
  *        @sk_txtime_report_errors: set report errors mode for SO_TXTIME
  *        @sk_txtime_unused: unused txtime flags
  */
struct sock {
        /*
         * Now struct inet_timewait_sock also uses sock_common, so please just
         * don't add anything before this first member (__sk_common) --acme
         */
        struct sock_common        __sk_common;
        /* Short sk_* aliases for the members of the embedded __sk_common. */
#define sk_node                        __sk_common.skc_node
#define sk_nulls_node                __sk_common.skc_nulls_node
#define sk_refcnt                __sk_common.skc_refcnt
#define sk_tx_queue_mapping        __sk_common.skc_tx_queue_mapping
#ifdef CONFIG_XPS
#define sk_rx_queue_mapping        __sk_common.skc_rx_queue_mapping
#endif

#define sk_dontcopy_begin        __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end                __sk_common.skc_dontcopy_end
#define sk_hash                        __sk_common.skc_hash
#define sk_portpair                __sk_common.skc_portpair
#define sk_num                        __sk_common.skc_num
#define sk_dport                __sk_common.skc_dport
#define sk_addrpair                __sk_common.skc_addrpair
#define sk_daddr                __sk_common.skc_daddr
#define sk_rcv_saddr                __sk_common.skc_rcv_saddr
#define sk_family                __sk_common.skc_family
#define sk_state                __sk_common.skc_state
#define sk_reuse                __sk_common.skc_reuse
#define sk_reuseport                __sk_common.skc_reuseport
#define sk_ipv6only                __sk_common.skc_ipv6only
#define sk_net_refcnt                __sk_common.skc_net_refcnt
#define sk_bound_dev_if                __sk_common.skc_bound_dev_if
#define sk_bind_node                __sk_common.skc_bind_node
#define sk_prot                        __sk_common.skc_prot
#define sk_net                        __sk_common.skc_net
#define sk_v6_daddr                __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr        __sk_common.skc_v6_rcv_saddr
#define sk_cookie                __sk_common.skc_cookie
#define sk_incoming_cpu                __sk_common.skc_incoming_cpu
#define sk_flags                __sk_common.skc_flags
#define sk_rxhash                __sk_common.skc_rxhash

        socket_lock_t                sk_lock;
        atomic_t                sk_drops;
        int                        sk_rcvlowat;
        struct sk_buff_head        sk_error_queue;
        struct sk_buff                *sk_rx_skb_cache;
        struct sk_buff_head        sk_receive_queue;
        /*
         * The backlog queue is special, it is always used with
         * the per-socket spinlock held and requires low latency
         * access. Therefore we special case its implementation.
         * Note : rmem_alloc is in this structure to fill a hole
         * on 64bit arches, not because it is logically part of
         * backlog.
         */
        struct {
                atomic_t        rmem_alloc;
                int                len;
                struct sk_buff        *head;
                struct sk_buff        *tail;
        } sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc

        int                        sk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int                sk_ll_usec;
        /* ===== mostly read cache line ===== */
        unsigned int                sk_napi_id;
#endif
        int                        sk_rcvbuf;
        int                        sk_wait_pending;

        struct sk_filter __rcu        *sk_filter;
        union {
                struct socket_wq __rcu        *sk_wq;
                /* private: */
                struct socket_wq        *sk_wq_raw;
                /* public: */
        };
#ifdef CONFIG_XFRM
        struct xfrm_policy __rcu *sk_policy[2];
#endif
        struct dst_entry __rcu        *sk_rx_dst;
        struct dst_entry __rcu        *sk_dst_cache;
        atomic_t                sk_omem_alloc;
        int                        sk_sndbuf;

        /* ===== cache line for TX ===== */
        int                        sk_wmem_queued;
        refcount_t                sk_wmem_alloc;
        unsigned long                sk_tsq_flags;
        union {
                struct sk_buff        *sk_send_head;
                struct rb_root        tcp_rtx_queue;
        };
        struct sk_buff                *sk_tx_skb_cache;
        struct sk_buff_head        sk_write_queue;
        __s32                        sk_peek_off;
        int                        sk_write_pending;
        __u32                        sk_dst_pending_confirm;
        u32                        sk_pacing_status; /* see enum sk_pacing */
        long                        sk_sndtimeo;
        struct timer_list        sk_timer;
        __u32                        sk_priority;
        __u32                        sk_mark;
        unsigned long                sk_pacing_rate; /* bytes per second */
        unsigned long                sk_max_pacing_rate;
        struct page_frag        sk_frag;
        netdev_features_t        sk_route_caps;
        netdev_features_t        sk_route_nocaps;
        netdev_features_t        sk_route_forced_caps;
        int                        sk_gso_type;
        unsigned int                sk_gso_max_size;
        gfp_t                        sk_allocation;
        __u32                        sk_txhash;

        /*
         * Because of non atomicity rules, all
         * changes are protected by socket lock.
         */
        u8                        sk_padding : 1,
                                sk_kern_sock : 1,
                                sk_no_check_tx : 1,
                                sk_no_check_rx : 1,
                                sk_userlocks : 4;
        u8                        sk_pacing_shift;
        u16                        sk_type;
        u16                        sk_protocol;
        u16                        sk_gso_max_segs;
        unsigned long                sk_lingertime;
        struct proto                *sk_prot_creator;
        rwlock_t                sk_callback_lock;
        int                        sk_err,
                                sk_err_soft;
        u32                        sk_ack_backlog;
        u32                        sk_max_ack_backlog;
        kuid_t                        sk_uid;
        spinlock_t                sk_peer_lock;
        struct pid                *sk_peer_pid;
        const struct cred        *sk_peer_cred;

        long                        sk_rcvtimeo;
        ktime_t                        sk_stamp;
#if BITS_PER_LONG==32
        seqlock_t                sk_stamp_seq;
#endif
        u16                        sk_tsflags;
        u8                        sk_shutdown;
        u32                        sk_tskey;
        atomic_t                sk_zckey;

        u8                        sk_clockid;
        u8                        sk_txtime_deadline_mode : 1,
                                sk_txtime_report_errors : 1,
                                sk_txtime_unused : 6;

        struct socket                *sk_socket;
        /* Low bits carry SK_USER_DATA_* flags; see SK_USER_DATA_PTRMASK. */
        void                        *sk_user_data;
#ifdef CONFIG_SECURITY
        void                        *sk_security;
#endif
        struct sock_cgroup_data        sk_cgrp_data;
        struct mem_cgroup        *sk_memcg;
        void                        (*sk_state_change)(struct sock *sk);
        void                        (*sk_data_ready)(struct sock *sk);
        void                        (*sk_write_space)(struct sock *sk);
        void                        (*sk_error_report)(struct sock *sk);
        int                        (*sk_backlog_rcv)(struct sock *sk,
                                                  struct sk_buff *skb);
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sk_buff*                (*sk_validate_xmit_skb)(struct sock *sk,
                                                        struct net_device *dev,
                                                        struct sk_buff *skb);
#endif
        void                    (*sk_destruct)(struct sock *sk);
        struct sock_reuseport __rcu        *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
        struct bpf_local_storage __rcu        *sk_bpf_storage;
#endif
        struct rcu_head                sk_rcu;
};

/*
 * Values stored in sock::sk_pacing_status (that field's comment points here).
 * NOTE(review): per the struct sock kernel-doc ("requested, handled by
 * sch_fq"), NEEDED presumably means pacing was requested and FQ that the
 * sch_fq qdisc handles it -- confirm against the users of these values.
 */
enum sk_pacing {
        SK_PACING_NONE                = 0,
        SK_PACING_NEEDED        = 1,
        SK_PACING_FQ                = 2,
};

/*
 * Flag bits carried in the low bits of sk_user_data
 *
 * - SK_USER_DATA_NOCOPY (bit 0): the stored pointer might not be
 *   suitable for copying when the socket is cloned, e.g. because it
 *   points to a reference counted object. Set when the pointer must
 *   not be copied.
 *
 * - SK_USER_DATA_BPF (bit 1): sk_user_data is managed/owned by a BPF
 *   reuseport array. Should be set when the socket is added to the
 *   bpf reuseport_array.
 *
 * - SK_USER_DATA_PSOCK (bit 2): the stored pointer refers to a psock
 *   object. Should be set when sk_user_data is assigned a psock.
 *
 * SK_USER_DATA_PTRMASK clears all of the flag bits, leaving only the
 * pointer part of the value.
 */
#define SK_USER_DATA_NOCOPY        (1UL << 0)
#define SK_USER_DATA_BPF        (1UL << 1)
#define SK_USER_DATA_PSOCK        (1UL << 2)
#define SK_USER_DATA_PTRMASK                                                \
        (~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF | SK_USER_DATA_PSOCK))

/**
 * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
 * @sk: socket
 *
 * Return: true when the %SK_USER_DATA_NOCOPY bit is set in the raw
 * sk_user_data value, false otherwise.
 */
static inline bool sk_user_data_is_nocopy(const struct sock *sk)
{
        uintptr_t raw = (uintptr_t)sk->sk_user_data;

        return (raw & SK_USER_DATA_NOCOPY) != 0;
}

/*
 * Yield sk->sk_user_data as an lvalue of type "void __rcu *", so that it can
 * be handed to rcu_dereference() and rcu_assign_pointer() by the helpers
 * in this header.
 */
#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))

/**
 * __rcu_dereference_sk_user_data_with_flags - fetch sk_user_data if
 * every requested flag bit is set in it
 *
 * @sk: socket
 * @flags: flag bits that must all be present (SK_USER_DATA_* bits only;
 *         a warning is emitted once if pointer bits are passed)
 *
 * Return: the stored pointer with the flag bits masked off when all of
 * @flags are set in sk_user_data, NULL otherwise.
 */
static inline void *
__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
                                          uintptr_t flags)
{
        uintptr_t raw = (uintptr_t)rcu_dereference(__sk_user_data(sk));

        WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

        /* Some requested flag is missing: do not expose the pointer. */
        if ((raw & flags) != flags)
                return NULL;

        return (void *)(raw & SK_USER_DATA_PTRMASK);
}

/*
 * Fetch the pointer part of sk_user_data without requiring any flag bit
 * (flags == 0 always matches, so the masked pointer is returned).
 */
#define rcu_dereference_sk_user_data(sk)                                \
        __rcu_dereference_sk_user_data_with_flags(sk, 0)
/*
 * Store (ptr | flags) into sk_user_data through rcu_assign_pointer().
 * Warns once if @ptr has any of the low flag bits set, or if @flags
 * contains bits outside the SK_USER_DATA_* flag bits.
 * @ptr and @flags are each evaluated exactly once.
 */
#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags)                \
({                                                                        \
        uintptr_t __tmp1 = (uintptr_t)(ptr),                                \
                  __tmp2 = (uintptr_t)(flags);                                \
        WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK);                        \
        WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK);                        \
        rcu_assign_pointer(__sk_user_data((sk)),                        \
                           __tmp1 | __tmp2);                                \
})
/* Store @ptr into sk_user_data with no flag bits set. */
#define rcu_assign_sk_user_data(sk, ptr)                                \
        __rcu_assign_sk_user_data_with_flags(sk, ptr, 0)

/*
 * SK_CAN_REUSE and SK_NO_REUSE on a socket state whether or not it is
 * acceptable for the socket's port to be reused by someone else.
 * SK_FORCE_REUSE on a socket means that the socket will reuse everybody
 * else's port without looking at the other's sk_reuse value.
 */

#define SK_NO_REUSE        0
#define SK_CAN_REUSE        1
#define SK_FORCE_REUSE        2

int sk_set_peek_off(struct sock *sk, int val);

/*
 * Return the socket's current peek offset when @flags contains MSG_PEEK,
 * and 0 for any non-peeking call.
 */
static inline int sk_peek_offset(struct sock *sk, int flags)
{
        if (likely(!(flags & MSG_PEEK)))
                return 0;

        return READ_ONCE(sk->sk_peek_off);
}

/*
 * Move the peek offset back by @val, clamping the result at 0.
 * Nothing is written when the current offset is negative.
 */
static inline void sk_peek_offset_bwd(struct sock *sk, int val)
{
        s32 cur = READ_ONCE(sk->sk_peek_off);

        if (likely(cur < 0))
                return;

        cur = max_t(s32, cur - val, 0);
        WRITE_ONCE(sk->sk_peek_off, cur);
}

/* Advance the peek offset by @val: a backward move by the negated amount. */
static inline void sk_peek_offset_fwd(struct sock *sk, int val)
{
        int delta = -val;

        sk_peek_offset_bwd(sk, delta);
}

/*
 * Hashed lists helper routines
 */
/* Map an hlist node embedded as sk_node back to its containing socket. */
static inline struct sock *sk_entry(const struct hlist_node *node)
{
        struct sock *sk = hlist_entry(node, struct sock, sk_node);

        return sk;
}

/*
 * Socket containing the first node of @head. No emptiness check is done
 * here; sk_head() is the checked variant.
 */
static inline struct sock *__sk_head(const struct hlist_head *head)
{
        struct hlist_node *first = head->first;

        return hlist_entry(first, struct sock, sk_node);
}

/* First socket on @head, or NULL when the list is empty. */
static inline struct sock *sk_head(const struct hlist_head *head)
{
        if (hlist_empty(head))
                return NULL;

        return __sk_head(head);
}

/*
 * Socket containing the first node of a nulls list. No emptiness check is
 * done here; sk_nulls_head() is the checked variant.
 */
static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
{
        struct hlist_nulls_node *first = head->first;

        return hlist_nulls_entry(first, struct sock, sk_nulls_node);
}

/* First socket on a nulls list, or NULL when the list is empty. */
static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
{
        if (hlist_nulls_empty(head))
                return NULL;

        return __sk_nulls_head(head);
}

/*
 * Socket that follows @sk on its sk_node list, as resolved by
 * hlist_entry_safe() from sk_node.next.
 */
static inline struct sock *sk_next(const struct sock *sk)
{
        struct hlist_node *following = sk->sk_node.next;

        return hlist_entry_safe(following, struct sock, sk_node);
}

/*
 * Socket that follows @sk on its nulls list, or NULL when the next
 * pointer is a nulls marker.
 */
static inline struct sock *sk_nulls_next(const struct sock *sk)
{
        if (is_a_nulls(sk->sk_nulls_node.next))
                return NULL;

        return hlist_nulls_entry(sk->sk_nulls_node.next,
                                 struct sock, sk_nulls_node);
}

/* True when the socket's sk_node is reported unhashed by hlist_unhashed(). */
static inline bool sk_unhashed(const struct sock *sk)
{
        const struct hlist_node *node = &sk->sk_node;

        return hlist_unhashed(node);
}

static inline bool sk_hashed(const struct sock *sk)
{
        return !sk_unhashed(sk);
}

/*
 * Mark @node as not being on any list by clearing its pprev pointer.
 * Only pprev is written; ->next is left untouched.
 */
static inline void sk_node_init(struct hlist_node *node)
{
        node->pprev = NULL;
}

/*
 * Nulls-list counterpart of sk_node_init(): clear @node's pprev pointer.
 * Only pprev is written; ->next is left untouched.
 */
static inline void sk_nulls_node_init(struct hlist_nulls_node *node)
{
        node->pprev = NULL;
}

/*
 * Unlink the socket's sk_node with __hlist_del(). The node is not
 * reinitialised and no reference is dropped here.
 */
static inline void __sk_del_node(struct sock *sk)
{
        struct hlist_node *node = &sk->sk_node;

        __hlist_del(node);
}

/*
 * Unlink @sk from its hash list and reinitialise its node, if it is hashed.
 * NB: equivalent to hlist_del_init_rcu
 *
 * Returns true when the socket was hashed and has been removed,
 * false when there was nothing to do.
 */
static inline bool __sk_del_node_init(struct sock *sk)
{
        if (!sk_hashed(sk))
                return false;

        __sk_del_node(sk);
        sk_node_init(&sk->sk_node);
        return true;
}

/* Grab socket reference count. This operation is valid only
   when sk is ALREADY grabbed f.e. it is found in hash table
   or a list and the lookup is made under lock preventing hash table
   modifications.
 */

static __always_inline void sock_hold(struct sock *sk)
{
        refcount_inc(&sk->sk_refcnt);
}

/* Ungrab socket in the context, which assumes that socket refcnt
   cannot hit zero, f.e. it is true in context of any socketcall.
 */
static __always_inline void __sock_put(struct sock *sk)
{
        refcount_dec(&sk->sk_refcnt);
}

static inline bool sk_del_node_init(struct sock *sk)
{
        bool rc = __sk_del_node_init(sk);

        if (rc) {
                /* paranoid for a while -acme */
                WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
                __sock_put(sk);
        }
        return rc;
}
#define sk_del_node_init_rcu(sk)        sk_del_node_init(sk)

/*
 * Unlink @sk from its nulls list with hlist_nulls_del_init_rcu(), if it
 * is hashed. Returns true when a removal took place.
 */
static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk)
{
        if (!sk_hashed(sk))
                return false;

        hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
        return true;
}

static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
{
        bool rc = __sk_nulls_del_node_init_rcu(sk);

        if (rc) {
                /* paranoid for a while -acme */
                WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
                __sock_put(sk);
        }
        return rc;
}

/* Insert the socket's sk_node at the head of @list; takes no reference. */
static inline void __sk_add_node(struct sock *sk, struct hlist_head *list)
{
        struct hlist_node *node = &sk->sk_node;

        hlist_add_head(node, list);
}

static inline void sk_add_node(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        __sk_add_node(sk, list);
}

static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
            sk->sk_family == AF_INET6)
                hlist_add_tail_rcu(&sk->sk_node, list);
        else
                hlist_add_head_rcu(&sk->sk_node, list);
}

/* Take a reference on @sk, then append it to the tail of @list (RCU). */
static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list)
{
        struct hlist_node *node = &sk->sk_node;

        sock_hold(sk);
        hlist_add_tail_rcu(node, list);
}

/* Insert the socket at the head of a nulls list (RCU); takes no reference. */
static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        struct hlist_nulls_node *node = &sk->sk_nulls_node;

        hlist_nulls_add_head_rcu(node, list);
}

/* Append the socket to the tail of a nulls list (RCU); takes no reference. */
static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        struct hlist_nulls_node *node = &sk->sk_nulls_node;

        hlist_nulls_add_tail_rcu(node, list);
}

static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        sock_hold(sk);
        __sk_nulls_add_node_rcu(sk, list);
}

/* Unlink the socket's sk_bind_node with __hlist_del(). */
static inline void __sk_del_bind_node(struct sock *sk)
{
        struct hlist_node *bind_node = &sk->sk_bind_node;

        __hlist_del(bind_node);
}

/* Insert the socket's sk_bind_node at the head of @list. */
static inline void sk_add_bind_node(struct sock *sk, struct hlist_head *list)
{
        struct hlist_node *bind_node = &sk->sk_bind_node;

        hlist_add_head(bind_node, list);
}

/*
 * Iteration helpers over socket lists. Each one is a thin wrapper around
 * the matching hlist / hlist_nulls iterator, with the list member fixed:
 * sk_node for the plain lists, sk_nulls_node for the nulls lists and
 * sk_bind_node for sk_for_each_bound().
 */
#define sk_for_each(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
        hlist_for_each_entry_rcu(__sk, list, sk_node)
#define sk_nulls_for_each(__sk, node, list) \
        hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
#define sk_nulls_for_each_rcu(__sk, node, list) \
        hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
#define sk_for_each_from(__sk) \
        hlist_for_each_entry_from(__sk, sk_node)
/*
 * Continue a nulls-list walk from __sk. The loop is skipped entirely when
 * __sk is NULL; otherwise node is first pointed at __sk's sk_nulls_node.
 * NOTE(review): this expands to an unbraced "if (...) for (...)", so a
 * following "else" at the call site would bind to this if.
 */
#define sk_nulls_for_each_from(__sk, node) \
        if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
                hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
#define sk_for_each_safe(__sk, tmp, list) \
        hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_bind_node)

/**
 * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the head for your list.
 * @offset:        offset of hlist_node within the struct.
 *
 * On every iteration @tpos is set to the address of @pos minus @offset
 * bytes, i.e. the start of the structure embedding the node. Both cursors
 * must be plain lvalues, as they are assigned to by the loop.
 *
 * @offset, @pos and @tpos are parenthesized wherever they are read, so that
 * an expression argument (e.g. an offset written as "a + b") is applied as
 * a whole instead of being split by operator precedence.
 */
#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)                       \
        for (pos = rcu_dereference(hlist_first_rcu(head));                       \
             (pos) != NULL &&                                                       \
                ({ tpos = (typeof(*(tpos)) *)((void *)(pos) - (offset)); 1;});  \
             pos = rcu_dereference(hlist_next_rcu(pos)))

static inline struct user_namespace *sk_user_ns(struct sock *sk)
{
        /* Careful only use this in a context where these parameters
         * can not change and must all be valid, such as recvmsg from
         * userspace.
         */
        return sk->sk_socket->file->f_cred->user_ns;
}

/*
 * Sock flags
 *
 * Each enumerator is a bit number within sk->sk_flags; the bits are
 * manipulated with sock_set_flag(), sock_reset_flag() and tested with
 * sock_flag().
 */
enum sock_flags {
        SOCK_DEAD,
        SOCK_DONE,
        SOCK_URGINLINE,
        SOCK_KEEPOPEN,
        SOCK_LINGER,
        SOCK_DESTROY,
        SOCK_BROADCAST,
        SOCK_TIMESTAMP,
        SOCK_ZAPPED,
        SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
        SOCK_DBG, /* %SO_DEBUG setting */
        SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
        SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
        SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
        SOCK_MEMALLOC, /* VM depends on this socket for swapping */
        SOCK_TIMESTAMPING_RX_SOFTWARE,  /* %SOF_TIMESTAMPING_RX_SOFTWARE */
        SOCK_FASYNC, /* fasync() active */
        SOCK_RXQ_OVFL,
        SOCK_ZEROCOPY, /* buffers from userspace */
        SOCK_WIFI_STATUS, /* push wifi status to userspace */
        SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS.
                     * Will use last 4 bytes of packet sent from
                     * user-space instead.
                     */
        SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */
        SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
        SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
        SOCK_TXTIME,
        SOCK_XDP, /* XDP is attached */
        SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
};

/* sk_flags mask covering SOCK_TIMESTAMP and SOCK_TIMESTAMPING_RX_SOFTWARE. */
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

/* Overwrite @nsk's whole sk_flags word with the one from @osk. */
static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
{
        unsigned long flags = osk->sk_flags;

        nsk->sk_flags = flags;
}

/* Set bit @flag in sk->sk_flags using __set_bit(). */
static inline void sock_set_flag(struct sock *sk, enum sock_flags flag)
{
        unsigned long *flags = &sk->sk_flags;

        __set_bit(flag, flags);
}

/* Clear bit @flag in sk->sk_flags using __clear_bit(). */
static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
{
        unsigned long *flags = &sk->sk_flags;

        __clear_bit(flag, flags);
}

/* Set flag @bit when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
                                     int valbool)
{
        if (!valbool) {
                sock_reset_flag(sk, bit);
                return;
        }

        sock_set_flag(sk, bit);
}

/* Test whether bit @flag is set in sk->sk_flags. */
static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
{
        const unsigned long *flags = &sk->sk_flags;

        return test_bit(flag, flags);
}

#ifdef CONFIG_NET
DECLARE_STATIC_KEY_FALSE(memalloc_socks_key);
/* Non-zero when the memalloc_socks_key static branch is enabled. */
static inline int sk_memalloc_socks(void)
{
        return static_branch_unlikely(&memalloc_socks_key);
}

void __receive_sock(struct file *file);
#else

/* Without CONFIG_NET this helper is a stub that always reports 0. */
static inline int sk_memalloc_socks(void)
{
        return 0;
}

/* Without CONFIG_NET there is nothing to do for a received file. */
static inline void __receive_sock(struct file *file)
{ }
#endif

/*
 * Return @gfp_mask with __GFP_MEMALLOC added when that bit is set in the
 * socket's sk_allocation; no other sk_allocation bit is propagated.
 */
static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
{
        gfp_t memalloc = sk->sk_allocation & __GFP_MEMALLOC;

        return gfp_mask | memalloc;
}

static inline void sk_acceptq_removed(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
}

static inline void sk_acceptq_added(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
}

static inline bool sk_acceptq_is_full(const struct sock *sk)
{
        return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}

/*
 * Compute minimal free write space needed to queue new packets:
 * half of the currently queued write memory (sk_wmem_queued).
 */
static inline int sk_stream_min_wspace(const struct sock *sk)
{
        int queued = READ_ONCE(sk->sk_wmem_queued);

        return queued >> 1;
}

/*
 * Remaining write space: sk_sndbuf minus sk_wmem_queued.
 * The result is negative when more is queued than sk_sndbuf allows.
 */
static inline int sk_stream_wspace(const struct sock *sk)
{
        int sndbuf = READ_ONCE(sk->sk_sndbuf);
        int queued = READ_ONCE(sk->sk_wmem_queued);

        return sndbuf - queued;
}

/*
 * Add @val (which may be negative) to sk_wmem_queued. The new value is
 * stored with WRITE_ONCE().
 */
static inline void sk_wmem_queued_add(struct sock *sk, int val)
{
        int queued = sk->sk_wmem_queued + val;

        WRITE_ONCE(sk->sk_wmem_queued, queued);
}

void sk_stream_write_space(struct sock *sk);

/*
 * OOB backlog add: append @skb at the tail of sk->sk_backlog.
 * sk_backlog.len is not updated here.
 */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
        /* dont let skb dst not refcounted, we are going to leave rcu lock */
        skb_dst_force(skb);

        if (sk->sk_backlog.tail)
                sk->sk_backlog.tail->next = skb;
        else
                WRITE_ONCE(sk->sk_backlog.head, skb);   /* queue was empty */

        WRITE_ONCE(sk->sk_backlog.tail, skb);
        skb->next = NULL;
}

/*
 * Check the combined size of the receive queue and the backlog queue
 * against @limit. The truesize of the skb about to be queued is
 * deliberately not taken into account, to allow even a single big
 * packet to come.
 */
static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit)
{
        int backlog_len = sk->sk_backlog.len;
        int rmem = atomic_read(&sk->sk_rmem_alloc);
        unsigned int total = backlog_len + rmem;

        return total > limit;
}

/*
 * Queue @skb on the socket backlog and account its truesize in
 * sk_backlog.len. The per-socket spinlock must be held here.
 *
 * Returns 0 on success, -ENOBUFS when the receive queues already exceed
 * @limit, or -ENOMEM when @skb comes from the pfmemalloc reserves and
 * the socket is not a SOCK_MEMALLOC one.
 */
static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
                                              unsigned int limit)
{
        if (sk_rcvqueues_full(sk, limit))
                return -ENOBUFS;

        /*
         * If the skb was allocated from pfmemalloc reserves, only
         * allow SOCK_MEMALLOC sockets to use it as this socket is
         * helping free memory
         */
        if (skb_pfmemalloc(skb)) {
                if (!sock_flag(sk, SOCK_MEMALLOC))
                        return -ENOMEM;
        }

        __sk_add_backlog(sk, skb);
        sk->sk_backlog.len += skb->truesize;

        return 0;
}

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);

/*
 * Hand @skb to the socket's backlog receive callback. pfmemalloc skbs
 * are routed through __sk_backlog_rcv() instead, but only while
 * sk_memalloc_socks() reports true.
 */
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
                return sk->sk_backlog_rcv(sk, skb);

        return __sk_backlog_rcv(sk, skb);
}

/*
 * Record the current CPU in sk_incoming_cpu. The store is skipped when
 * the field already holds that CPU.
 */
static inline void sk_incoming_cpu_update(struct sock *sk)
{
        int this_cpu = raw_smp_processor_id();

        if (likely(READ_ONCE(sk->sk_incoming_cpu) == this_cpu))
                return;

        WRITE_ONCE(sk->sk_incoming_cpu, this_cpu);
}

/*
 * Record @hash in the global rps_sock_flow_table, which is looked up and
 * used under rcu_read_lock(). Compiles to nothing without CONFIG_RPS.
 */
static inline void sock_rps_record_flow_hash(__u32 hash)
{
#ifdef CONFIG_RPS
        struct rps_sock_flow_table *table;

        rcu_read_lock();
        table = rcu_dereference(rps_sock_flow_table);
        rps_record_sock_flow(table, hash);
        rcu_read_unlock();
#endif
}

/*
 * Record the socket's rx hash for RFS, but only when the rfs_needed
 * static branch is on and the socket is in TCP_ESTABLISHED state.
 * Compiles to nothing without CONFIG_RPS.
 */
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
        if (!static_branch_unlikely(&rfs_needed))
                return;

        /* Reading sk->sk_rxhash might incur an expensive cache line
         * miss.
         *
         * TCP_ESTABLISHED does cover almost all states where RFS
         * might be useful, and is cheaper [1] than testing :
         *        IPv4: inet_sk(sk)->inet_daddr
         *        IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
         * OR        an additional socket flag
         * [1] : sk_state and sk_prot are in the same cache line.
         */
        if (sk->sk_state != TCP_ESTABLISHED)
                return;

        /* This READ_ONCE() is paired with the WRITE_ONCE()
         * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
         */
        sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
#endif
}

/*
 * Copy skb->hash into sk_rxhash when the two differ; the store is skipped
 * when they already match. Compiles to nothing without CONFIG_RPS.
 */
static inline void sock_rps_save_rxhash(struct sock *sk,
                                        const struct sk_buff *skb)
{
#ifdef CONFIG_RPS
        /* The WRITE_ONCE() below is paired with the READ_ONCE()
         * here, and another one in sock_rps_record_flow().
         */
        if (likely(READ_ONCE(sk->sk_rxhash) == skb->hash))
                return;

        WRITE_ONCE(sk->sk_rxhash, skb->hash);
#endif
}

/* Reset sk_rxhash to 0. Compiles to nothing without CONFIG_RPS. */
static inline void sock_rps_reset_rxhash(struct sock *sk)
{
#ifdef CONFIG_RPS
        const u32 no_hash = 0;

        /* Paired with READ_ONCE() in sock_rps_record_flow() */
        WRITE_ONCE(sk->sk_rxhash, no_hash);
#endif
}

/*
 * sk_wait_event - drop the socket lock and wait for a condition
 * @__sk:        socket; the caller holds its lock (it is released and re-taken)
 * @__timeo:     pointer to the remaining timeout, updated by wait_woken()
 * @__condition: expression to wait for
 * @__wait:      wait queue entry passed to wait_woken()
 *
 * Bumps sk_wait_pending, releases the socket lock and evaluates
 * @__condition. If it is false, sleeps in TASK_INTERRUPTIBLE through
 * wait_woken() and stores the returned timeout back into *@__timeo.
 * The socket lock is then re-acquired, sk_wait_pending is dropped again
 * and @__condition is evaluated a second time; that second result is
 * the value of the whole expression.
 *
 * @__sk and @__condition are expanded more than once, so they must not
 * have side effects. All arguments are parenthesized so that any
 * expression can be passed safely.
 */
#define sk_wait_event(__sk, __timeo, __condition, __wait)                \
        ({        int __rc;                                                \
                (__sk)->sk_wait_pending++;                                \
                release_sock(__sk);                                        \
                __rc = (__condition);                                        \
                if (!__rc) {                                                \
                        *(__timeo) = wait_woken((__wait),                \
                                                TASK_INTERRUPTIBLE,        \
                                                *(__timeo));                \
                }                                                        \
                sched_annotate_sleep();                                        \
                lock_sock(__sk);                                        \
                (__sk)->sk_wait_pending--;                                \
                __rc = (__condition);                                        \
                __rc;                                                        \
        })

/* Stream-socket and memalloc helpers. Only the prototypes live here; the
 * functions have no body in this part of the header.
 * The wait helpers take the timeout by pointer (long *timeo_p), except
 * sk_stream_wait_close() which takes it by value.
 */
int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
void sk_stream_wait_close(struct sock *sk, long timeo_p);
int sk_stream_error(struct sock *sk, int flags, int err);
void sk_stream_kill_queues(struct sock *sk);
void sk_set_memalloc(struct sock *sk);
void sk_clear_memalloc(struct sock *sk);

/* Slow path of sk_flush_backlog(). */
void __sk_flush_backlog(struct sock *sk);

/* Flush the socket backlog if it is non-empty.
 * Returns true when __sk_flush_backlog() was called, false when the
 * backlog tail pointer was NULL and nothing was done.
 */
static inline bool sk_flush_backlog(struct sock *sk)
{
        if (likely(!READ_ONCE(sk->sk_backlog.tail)))
                return false;

        __sk_flush_backlog(sk);
        return true;
}

/* Prototype only; the timeout is passed by pointer. */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);

/* Forward declarations: struct proto below only stores pointers to
 * these types, so their full definitions are not needed here.
 */
struct request_sock_ops;
struct timewait_sock_ops;
struct inet_hashinfo;
struct raw_hashinfo;
struct smc_hashinfo;
struct module;

/*
 * Zero a socket object of @size bytes while leaving sk_node.next untouched.
 *
 * Caches using SLAB_TYPESAFE_BY_RCU should leave the .next pointer of
 * nulls nodes unmodified, so the object is cleared in two pieces:
 * everything before sk_node.next (if anything), then everything from
 * sk_node.pprev up to the end of the object.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
        const size_t next_off = offsetof(struct sock, sk_node.next);
        const size_t pprev_off = offsetof(struct sock, sk_node.pprev);

        if (next_off != 0)
                memset(sk, 0, next_off);

        memset(&sk->sk_node.pprev, 0, size - pprev_off);
}

/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 *
 * One struct proto holds, for one protocol: the table of operations
 * (function pointers taking a struct sock), pointers to its memory
 * accounting counters and sysctl limits, its slab cache fields
 * (slab, obj_size, slab_flags, useroffset, usersize), and a pointer to
 * its hash table through the 'h' union.
 *
 * The structure is marked __randomize_layout, so code must not rely on
 * the order of its members.
 */
struct proto {
        /* Connection and lifetime operations. */
        void                        (*close)(struct sock *sk,
                                        long timeout);
        int                        (*pre_connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*disconnect)(struct sock *sk, int flags);

        struct sock *                (*accept)(struct sock *sk, int flags, int *err,
                                          bool kern);

        int                        (*ioctl)(struct sock *sk, int cmd,
                                         unsigned long arg);
        int                        (*init)(struct sock *sk);
        void                        (*destroy)(struct sock *sk);
        void                        (*shutdown)(struct sock *sk, int how);
        int                        (*setsockopt)(struct sock *sk, int level,
                                        int optname, sockptr_t optval,
                                        unsigned int optlen);
        int                        (*getsockopt)(struct sock *sk, int level,
                                        int optname, char __user *optval,
                                        int __user *option);
        void                        (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
        int                        (*compat_ioctl)(struct sock *sk,
                                        unsigned int cmd, unsigned long arg);
#endif
        /* Data transfer and binding operations. */
        int                        (*sendmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len);
        int                        (*recvmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len, int noblock, int flags,
                                           int *addr_len);
        int                        (*sendpage)(struct sock *sk, struct page *page,
                                        int offset, size_t size, int flags);
        int                        (*bind)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);
        int                        (*bind_add)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);

        int                        (*backlog_rcv) (struct sock *sk,
                                                struct sk_buff *skb);
        bool                        (*bpf_bypass_getsockopt)(int level,
                                                         int optname);

        void                (*release_cb)(struct sock *sk);

        /* Keeping track of sk's, looking them up, and port selection methods. */
        int                        (*hash)(struct sock *sk);
        void                        (*unhash)(struct sock *sk);
        void                        (*rehash)(struct sock *sk);
        int                        (*get_port)(struct sock *sk, unsigned short snum);

        /* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
        unsigned int                inuse_idx;
#endif

        /* Optional hook: __sk_stream_memory_free() treats a NULL
         * stream_memory_free as "memory is free".
         */
        bool                        (*stream_memory_free)(const struct sock *sk, int wake);
        bool                        (*stream_memory_read)(const struct sock *sk);
        /* Memory pressure */
        void                        (*enter_memory_pressure)(struct sock *sk);
        void                        (*leave_memory_pressure)(struct sock *sk);
        /* NULL memory_allocated means the protocol does no memory
         * accounting (see sk_has_account()).
         */
        atomic_long_t                *memory_allocated;        /* Current allocated memory. */
        struct percpu_counter        *sockets_allocated;        /* Current number of sockets. */
        /*
         * Pressure flag: try to collapse.
         * Technical note: it is used by multiple contexts non atomically.
         * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
         * All the __sk_mem_schedule() is of this nature: accounting
         * is strict, actions are advisory and have some latency.
         * May be NULL; the helpers in this header test for that before
         * dereferencing.
         */
        unsigned long                *memory_pressure;
        /* Indexed by sk_prot_mem_limits(); values are in pages. */
        long                        *sysctl_mem;

        int                        *sysctl_wmem;
        int                        *sysctl_rmem;
        u32                        sysctl_wmem_offset;
        u32                        sysctl_rmem_offset;

        int                        max_header;
        bool                        no_autobind;

        struct kmem_cache        *slab;
        unsigned int                obj_size;
        slab_flags_t                slab_flags;
        unsigned int                useroffset;        /* Usercopy region offset */
        unsigned int                usersize;        /* Usercopy region size */

        unsigned int __percpu        *orphan_count;

        struct request_sock_ops        *rsk_prot;
        struct timewait_sock_ops *twsk_prot;

        /* Protocol hash table; which member is valid depends on the
         * protocol family that owns this struct proto.
         */
        union {
                struct inet_hashinfo        *hashinfo;
                struct udp_table        *udp_table;
                struct raw_hashinfo        *raw_hash;
                struct smc_hashinfo        *smc_hash;
        } h;

        struct module                *owner;

        char                        name[32];

        struct list_head        node;
#ifdef SOCK_REFCNT_DEBUG
        /* Live socket count, maintained by sk_refcnt_debug_inc()/_dec(). */
        atomic_t                socks;
#endif
        int                        (*diag_destroy)(struct sock *sk, int err);
} __randomize_layout;

/* Registration of a struct proto and diag module loading; prototypes
 * only, the functions have no body in this part of the header.
 */
int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
int sock_load_diag_module(int family, int protocol);

#ifdef SOCK_REFCNT_DEBUG
/* Count one more live socket against the socket's protocol. */
static inline void sk_refcnt_debug_inc(struct sock *sk)
{
        struct proto *prot = sk->sk_prot;

        atomic_inc(&prot->socks);
}

/* Count one socket less against its protocol and log, at KERN_DEBUG
 * level, how many sockets of that protocol remain.
 */
static inline void sk_refcnt_debug_dec(struct sock *sk)
{
        struct proto *prot = sk->sk_prot;

        atomic_dec(&prot->socks);
        printk(KERN_DEBUG "%s socket %p released, %d are still alive\n",
               prot->name, sk, atomic_read(&prot->socks));
}

/* Log a KERN_DEBUG message when the socket's refcount is not exactly 1. */
static inline void sk_refcnt_debug_release(const struct sock *sk)
{
        if (refcount_read(&sk->sk_refcnt) == 1)
                return;

        printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n",
               sk->sk_prot->name, sk, refcount_read(&sk->sk_refcnt));
}
#else /* SOCK_REFCNT_DEBUG */
/* Without SOCK_REFCNT_DEBUG the three hooks expand to empty statements. */
#define sk_refcnt_debug_inc(sk) do { } while (0)
#define sk_refcnt_debug_dec(sk) do { } while (0)
#define sk_refcnt_debug_release(sk) do { } while (0)
#endif /* SOCK_REFCNT_DEBUG */

/* Tell whether the stream socket may queue more data for sending.
 * Returns false once sk_wmem_queued has reached sk_sndbuf. Otherwise the
 * protocol's stream_memory_free() hook decides; a protocol without that
 * hook is always considered free. @wake is passed through to the hook.
 */
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
        if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
                return false;

        if (!sk->sk_prot->stream_memory_free)
                return true;

        return sk->sk_prot->stream_memory_free(sk, wake);
}

/* __sk_stream_memory_free() with @wake fixed to 0. */
static inline bool sk_stream_memory_free(const struct sock *sk)
{
        return __sk_stream_memory_free(sk, 0);
}

static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
{
        return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
               __sk_stream_memory_free(sk, wake);
}

/* __sk_stream_is_writeable() with @wake fixed to 0. */
static inline bool sk_stream_is_writeable(const struct sock *sk)
{
        return __sk_stream_is_writeable(sk, 0);
}

/* Check whether the socket's cgroup is a descendant of @ancestor.
 * Returns the result of cgroup_is_descendant(), or -ENOTSUPP when the
 * kernel is built without CONFIG_SOCK_CGROUP_DATA.
 */
static inline int sk_under_cgroup_hierarchy(struct sock *sk,
                                            struct cgroup *ancestor)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
        struct cgroup *sk_cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

        return cgroup_is_descendant(sk_cgrp, ancestor);
#else
        return -ENOTSUPP;
#endif
}

/* True when the socket's protocol provides a memory_pressure flag at all
 * (the pointer is non-NULL); says nothing about the flag's current value.
 */
static inline bool sk_has_memory_pressure(const struct sock *sk)
{
        return sk->sk_prot->memory_pressure != NULL;
}

static inline bool sk_under_global_memory_pressure(const struct sock *sk)
{
        return sk->sk_prot->memory_pressure &&
                !!READ_ONCE(*sk->sk_prot->memory_pressure);
}

/* True when the socket is under memory pressure, from either its memory
 * cgroup or the protocol-wide pressure flag.
 * Protocols without a memory_pressure flag always report false, even
 * when a memory cgroup is attached.
 */
static inline bool sk_under_memory_pressure(const struct sock *sk)
{
        if (!sk->sk_prot->memory_pressure)
                return false;

        if (mem_cgroup_sockets_enabled && sk->sk_memcg) {
                if (mem_cgroup_under_socket_pressure(sk->sk_memcg))
                        return true;
        }

        return READ_ONCE(*sk->sk_prot->memory_pressure) != 0;
}

static inline long
sk_memory_allocated(const struct sock *sk)
{
        return atomic_long_read(sk->sk_prot->memory_allocated);
}

static inline long
sk_memory_allocated_add(struct sock *sk, int amt)
{
        return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
}

static inline void
sk_memory_allocated_sub(struct sock *sk, int amt)
{
        atomic_long_sub(amt, sk->sk_prot->memory_allocated);
}

/* Decrement the per-cpu sockets_allocated counter of @sk's protocol. */
static inline void sk_sockets_allocated_dec(struct sock *sk)
{
        percpu_counter_dec(sk->sk_prot->sockets_allocated);
}

/* Increment the per-cpu sockets_allocated counter of @sk's protocol. */
static inline void sk_sockets_allocated_inc(struct sock *sk)
{
        percpu_counter_inc(sk->sk_prot->sockets_allocated);
}

/* Return percpu_counter_read_positive() of the sockets_allocated counter
 * belonging to @sk's protocol.
 */
static inline u64
sk_sockets_allocated_read_positive(struct sock *sk)
{
        struct percpu_counter *counter = sk->sk_prot->sockets_allocated;

        return percpu_counter_read_positive(counter);
}

/* Return percpu_counter_sum_positive() of @prot's sockets_allocated
 * counter, converted to int.
 */
static inline int
proto_sockets_allocated_sum_positive(struct proto *prot)
{
        struct percpu_counter *counter = prot->sockets_allocated;

        return percpu_counter_sum_positive(counter);
}

static inline long
proto_memory_allocated(struct proto *prot)
{
        return atomic_long_read(prot->memory_allocated);
}

/* True when @prot has a memory_pressure flag and it is currently
 * non-zero; false for protocols without such a flag.
 */
static inline bool
proto_memory_pressure(struct proto *prot)
{
        if (prot->memory_pressure)
                return READ_ONCE(*prot->memory_pressure) != 0;

        return false;
}


#ifdef CONFIG_PROC_FS
/* Per-protocol "sockets in use" accounting; prototypes only. */
/* Called with local bh disabled */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
int sock_prot_inuse_get(struct net *net, struct proto *proto);
int sock_inuse_get(struct net *net);
#else
/* Without CONFIG_PROC_FS only sock_prot_inuse_add() is provided, as an
 * empty stub; the two getters are not declared in this branch.
 */
static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
                int inc)
{
}
#endif


/* Rehash a socket by calling the protocol's unhash() and then hash().
 *
 * With per-bucket locks a rehash is not atomic anyway, so this
 * two-step version is no worse. Returns the result of hash().
 */
static inline int __sk_prot_rehash(struct sock *sk)
{
        int err;

        sk->sk_prot->unhash(sk);
        err = sk->sk_prot->hash(sk);

        return err;
}

/* About 10 seconds, expressed in jiffies (10 * HZ) */
#define SOCK_DESTROY_TIME (10*HZ)

/* Port numbers below PROT_SOCK (0-1023) can't be bound to unless you
 * are superuser
 */
#define PROT_SOCK        1024

/* Shutdown bits; SHUTDOWN_MASK is RCV_SHUTDOWN | SEND_SHUTDOWN. */
#define SHUTDOWN_MASK        3
#define RCV_SHUTDOWN        1
#define SEND_SHUTDOWN        2

/* SOCK_*_LOCK flags: distinct bits, so they can be OR-ed together. */
#define SOCK_SNDBUF_LOCK        1
#define SOCK_RCVBUF_LOCK        2
#define SOCK_BINDADDR_LOCK        4
#define SOCK_BINDPORT_LOCK        8

/* A struct socket and its VFS inode in one object.
 * SOCKET_I() and SOCK_INODE() convert between the two members with
 * container_of().
 */
struct socket_alloc {
        struct socket socket;
        struct inode vfs_inode;
};

static inline struct socket *SOCKET_I(struct inode *inode)
{
        return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

static inline struct inode *SOCK_INODE(struct socket *socket)
{
        return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}

/*
 * Functions for memory accounting
 *
 * Prototypes only; the inline helpers further down in this header call
 * __sk_mem_schedule() and __sk_mem_reclaim(). The 'kind' argument
 * is SK_MEM_SEND or SK_MEM_RECV.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);

/* Accounting unit for socket memory, in bytes.
 *
 * We used to have PAGE_SIZE here, but systems with 64KB pages
 * do not necessarily have 16x time more memory than 4KB ones.
 */
#define SK_MEM_QUANTUM 4096
/* log2 of the quantum, used to convert byte counts into quanta. */
#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
/* Values for the 'kind' argument of __sk_mem_schedule(). */
#define SK_MEM_SEND        0
#define SK_MEM_RECV        1

/* Return entry @index of the protocol's sysctl_mem[] limits.
 * sysctl_mem values are in pages; the result is converted to
 * SK_MEM_QUANTUM units, by a shift chosen at compile time from the
 * relation between PAGE_SIZE and SK_MEM_QUANTUM.
 */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
        long pages = READ_ONCE(sk->sk_prot->sysctl_mem[index]);

#if PAGE_SIZE > SK_MEM_QUANTUM
        return pages << (PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT);
#elif PAGE_SIZE < SK_MEM_QUANTUM
        return pages >> (SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT);
#else
        return pages;
#endif
}

/* Convert @amt bytes into a count of SK_MEM_QUANTUM units, rounding up. */
static inline int sk_mem_pages(int amt)
{
        int rounded_up = amt + SK_MEM_QUANTUM - 1;

        return rounded_up >> SK_MEM_QUANTUM_SHIFT;
}

static inline bool sk_has_account(struct sock *sk)
{
        /* return true if protocol supports memory accounting */
        return !!sk->sk_prot->memory_allocated;
}

/* Check that @size bytes of send memory can be charged to the socket.
 * Always true for protocols without accounting or when sk_forward_alloc
 * already covers @size; otherwise the shortfall is requested through
 * __sk_mem_schedule(..., SK_MEM_SEND) and its verdict is returned.
 */
static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
        int shortfall;

        if (!sk_has_account(sk))
                return true;

        shortfall = size - sk->sk_forward_alloc;
        if (shortfall <= 0)
                return true;

        return __sk_mem_schedule(sk, shortfall, SK_MEM_SEND) != 0;
}

/* Receive-side counterpart of sk_wmem_schedule().
 * Checks that @size bytes of receive memory can be charged to the
 * socket. If __sk_mem_schedule(..., SK_MEM_RECV) refuses the shortfall,
 * the request is still accepted when skb_pfmemalloc(@skb) is true.
 */
static inline bool
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
{
        int shortfall;

        if (!sk_has_account(sk))
                return true;

        shortfall = size - sk->sk_forward_alloc;
        if (shortfall <= 0)
                return true;

        if (__sk_mem_schedule(sk, shortfall, SK_MEM_RECV))
                return true;

        return skb_pfmemalloc(skb);
}

/* Hand the socket's forward allocation to __sk_mem_reclaim() once it
 * amounts to at least one SK_MEM_QUANTUM. No-op for protocols without
 * memory accounting.
 */
static inline void sk_mem_reclaim(struct sock *sk)
{
        if (sk_has_account(sk) && sk->sk_forward_alloc >= SK_MEM_QUANTUM)
                __sk_mem_reclaim(sk, sk->sk_forward_alloc);
}

/* Like sk_mem_reclaim(), but only acts when the forward allocation is
 * strictly larger than one SK_MEM_QUANTUM, and passes
 * sk_forward_alloc - 1 to __sk_mem_reclaim(). No-op for protocols
 * without memory accounting.
 */
static inline void sk_mem_reclaim_partial(struct sock *sk)
{
        if (sk_has_account(sk) && sk->sk_forward_alloc > SK_MEM_QUANTUM)
                __sk_mem_reclaim(sk, sk->sk_forward_alloc - 1);
}

/* Consume @size bytes of the socket's forward allocation.
 * No-op for protocols without memory accounting.
 */
static inline void sk_mem_charge(struct sock *sk, int size)
{
        if (sk_has_account(sk))
                sk->sk_forward_alloc -= size;
}

/* Give @size bytes back to the socket's forward allocation.
 * No-op for protocols without memory accounting.
 */
static inline void sk_mem_uncharge(struct sock *sk, int size)
{
        if (!sk_has_account(sk))
                return;

        sk->sk_forward_alloc += size;

        /* Avoid a possible overflow.
         * TCP send queues can make this happen, if sk_mem_reclaim()
         * is not called and more than 2 GBytes are released at once.
         *
         * Once 2 MBytes (1 << 21) are reached, reclaim 1 MByte (1 << 20)
         * right away; there is no need to hold that much forward
         * allocation anyway.
         */
        if (likely(sk->sk_forward_alloc < (1 << 21)))
                return;

        __sk_mem_reclaim(sk, 1 << 20);
}

DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
/* Remove @skb from the socket's write-memory accounting and dispose of it.
 *
 * skb->truesize is subtracted from the queued write memory and
 * uncharged. The skb is then parked in sk->sk_tx_skb_cache when the
 * tcp_tx_skb_cache_key static key is enabled, the cache slot is empty
 * and the skb is not cloned; otherwise it is freed with __kfree_skb().
 */
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
        sk_wmem_queued_add(sk, -skb->truesize);
        sk_mem_uncharge(sk, skb->truesize);

        if (!static_branch_unlikely(&tcp_tx_skb_cache_key) ||
            sk->sk_tx_skb_cache || skb_cloned(skb)) {
                __kfree_skb(skb);
                return;
        }

        /* Reset extensions and zerocopy state before caching the skb. */
        skb_ext_reset(skb);
        skb_zcopy_clear(skb, true);
        sk->sk_tx_skb_cache = skb;
}

/* Clear the "owned" flag of the socket lock if it is set, and tell
 * lockdep that the lock was released. Does nothing when the socket is
 * not owned.
 */
static inline void sock_release_ownership(struct sock *sk)
{
        if (!sk->sk_lock.owned)
                return;

        sk->sk_lock.owned = 0;

        /* The sk_lock has mutex_unlock() semantics: */
        mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}

/*
 * Initialise the socket lock of @sk and set its lockdep classes.
 *
 * Macro so as to not evaluate some arguments when
 * lockdep is not enabled.
 *
 * Mark both the sk_lock and the sk_lock.slock as a
 * per-address-family lock class: @skey/@sname are applied to the
 * spinlock (sk_lock.slock), @key/@name to the sk_lock dep_map.
 *
 * The lock starts out not owned, with an empty wait queue.
 * @sk is expanded several times and is parenthesized at every use, so
 * any pointer expression without side effects may be passed.
 */
#define sock_lock_init_class_and_name(sk, sname, skey, name, key)        \
do {                                                                        \
        (sk)->sk_lock.owned = 0;                                        \
        init_waitqueue_head(&(sk)->sk_lock.wq);                                \
        spin_lock_init(&(sk)->sk_lock.slock);                                \
        debug_check_no_locks_freed((void *)&(sk)->sk_lock,                \
                        sizeof((sk)->sk_lock));                                \
        lockdep_set_class_and_name(&(sk)->sk_lock.slock,                \
                                (skey), (sname));                                \
        lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);        \
} while (0)

#ifdef CONFIG_LOCKDEP
/* True when lockdep sees either the socket lock itself or its
 * underlying spinlock (sk_lock.slock) as held. Only available with
 * CONFIG_LOCKDEP.
 */
static inline bool lockdep_sock_is_held(const struct sock *sk)
{
        if (lockdep_is_held(&sk->sk_lock))
                return true;

        return lockdep_is_held(&sk->sk_lock.slock);
}
#endif

/* Prototype only; @subclass is the lockdep subclass to use. */
void lock_sock_nested(struct sock *sk, int subclass);

/* Take the socket lock with lockdep subclass 0. */
static inline void lock_sock(struct sock *sk)
{
        lock_sock_nested(sk, 0);
}

/* Release side of the socket lock; prototypes only. */
void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);

/* BH context may only use the following locking interface.
 * All three macros operate directly on the spinlock inside the socket
 * lock (sk_lock.slock); the _nested variant takes it with
 * SINGLE_DEPTH_NESTING as lockdep subclass.
 */
#define bh_lock_sock(__sk)        spin_lock(&((__sk)->sk_lock.slock))
#define bh_lock_sock_nested(__sk) \
                                spin_lock_nested(&((__sk)->sk_lock.slock), \
                                SINGLE_DEPTH_NESTING)
#define bh_unlock_sock(__sk)        spin_unlock(&((__sk)->sk_lock.slock))

bool lock_sock_fast(struct sock *sk);
/**
 * unlock_sock_fast - complement of lock_sock_fast
 * @sk: socket
 * @slow: slow mode
 *
 * Fast unlock of a socket for user context.
 * In slow mode the regular release_sock() is called; otherwise only
 * the socket spinlock is dropped with spin_unlock_bh().
 */
static inline void unlock_sock_fast(struct sock *sk, bool slow)
{
        if (!slow) {
                spin_unlock_bh(&sk->sk_lock.slock);
                return;
        }

        release_sock(sk);
}

/* Used by processes to "lock" a socket state, so that
 * interrupts and bottom half handlers won't change it
 * from under us. It essentially blocks any incoming
 * packets, so that we won't get any new data or any
 * packets that change the state of the socket.
 *
 * While locked, BH processing will add new packets to
 * the backlog queue.  This queue is processed by the
 * owner of the socket lock right before it is released.
 *
 * Since ~2.3.5 it is also exclusive sleep lock serializing
 * accesses from user process context.
 */

/* Debug assertion that the socket lock is held.
 * With CONFIG_LOCKDEP, warns once if lockdep does not see the socket
 * lock as held while debug_locks is set. Empty without CONFIG_LOCKDEP.
 */
static inline void sock_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/* Debug assertion that the socket lock is NOT held: the inverse of
 * sock_owned_by_me(). With CONFIG_LOCKDEP, warns once if lockdep sees
 * the socket lock as held while debug_locks is set. Empty without
 * CONFIG_LOCKDEP.
 */
static inline void sock_not_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/* Report whether the socket lock's "owned" flag is set.
 * Runs the sock_owned_by_me() debug check first.
 */
static inline bool sock_owned_by_user(const struct sock *sk)
{
        sock_owned_by_me(sk);

        return sk->sk_lock.owned != 0;
}

/* Same as sock_owned_by_user(), without the sock_owned_by_me() check. */
static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
{
        return sk->sk_lock.owned != 0;
}

/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
        struct sock *sk = (struct sock *)csk;

        return !sk->sk_lock.owned && !spin_is_locked(&sk->sk_lock.slock);
}

/* Socket allocation, cloning and destruction; prototypes only. */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern);
void sk_free(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
void sk_free_unlock_clone(struct sock *sk);

/* skb allocation helpers and skb free callbacks that take a struct
 * sk_buff; prototypes only.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority);
void __sock_wfree(struct sk_buff *skb);
void sock_wfree(struct sk_buff *skb);
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
                             gfp_t priority);
void skb_orphan_partial(struct sk_buff *skb);
void sock_rfree(struct sk_buff *skb);
void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);
#else
/* Without CONFIG_INET, sock_edemux is an alias for sock_efree and
 * sock_pfree is not declared.
 */
#define sock_edemux sock_efree
#endif

/* Generic socket option handlers; prototypes only. */
int sock_setsockopt(struct socket *sock, int level, int op,
                    sockptr_t optval, unsigned int optlen);

int sock_getsockopt(struct socket *sock, int level, int op,
                    char __user *optval, int __user *optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32);
/* Send-side skb allocation; failures are reported through *errcode. */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode);
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order);
/* Allocation helpers that take the socket; the free variants take the
 * size again.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);

/* Per-message values filled in by sockcm_init() and by the
 * sock_cmsg_send() family, which take a pointer to this structure.
 */
struct sockcm_cookie {
        u64 transmit_time;
        u32 mark;
        u16 tsflags;        /* initialised from sk->sk_tsflags by sockcm_init() */
};

static inline void sockcm_init(struct sockcm_cookie *sockc,
                               const struct sock *sk)
{
        *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags };
}

/* Control-message handling on send; prototypes only. Both functions take
 * a struct sockcm_cookie pointer; __sock_cmsg_send() additionally takes
 * a single cmsghdr.
 */
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc);
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
                   struct sockcm_cookie *sockc);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * does not implement a particular function.
 *
 * Most take a struct socket; the *_locked variants take a struct sock
 * instead.
 */
int sock_no_bind(struct socket *, struct sockaddr *, int);
int sock_no_connect(struct socket *, struct sockaddr *, int, int);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *, int, bool);
int sock_no_getname(struct socket *, struct sockaddr *, int);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma);
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset,
                         size_t size, int flags);
ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
                                int offset, size_t size, int flags);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * uses the inet style.
 *
 * Prototypes only.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen);
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags);
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen);

/* Prototype only. */
void sk_common_release(struct sock *sk);

/*
 *        Default socket callbacks and setup code
 *
 *        Prototypes only.
 */

/* Initialise core socket variables using an explicit uid. */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid);

/* Initialise core socket variables.
 * Assumes struct socket *sock is embedded in a struct socket_alloc.
 */
void sock_init_data(struct socket *sock, struct sock *sk);

/*
 * Socket reference counting postulates.
 *
 * * Each user of socket SHOULD hold a reference count.
 * * Each access point to socket (a hash table bucket, reference from a list,
 *   running timer, skb in flight) MUST hold a reference count.
 * * When reference count hits 0, it means it will never increase back.
 * * When reference count hits 0, it means that no references from
 *   outside exist to this socket and current process on current CPU
 *   is last user and may/should destroy this socket.
 * * sk_free is called from any context: process, BH, IRQ. When
 *   it is called, socket has no references from outside -> sk_free
 *   may release descendant resources allocated by the socket, but
 *   to the time when it is called, socket is NOT referenced by any
 *   hash tables, lists etc.
 * * Packets, delivered from outside (from network or from another process)
 *   and enqueued on receive/error queues SHOULD NOT grab reference count,
 *   when they sit in queue. Otherwise, packets will leak to hole, when
 *   socket is looked up by one cpu and unhashing is made by another CPU.
 *   It is true for udp/raw, netlink (leak to receive and error queues), tcp
 *   (leak to backlog). Packet socket does all the processing inside
 *   BR_NETPROTO_LOCK, so it does not have this race condition. UNIX sockets
 *   use separate SMP lock, so that they are prone too.
 */

/* Drop one reference on the socket; whoever drops the last one frees
 * the socket through sk_free().
 */
static inline void sock_put(struct sock *sk)
{
        if (!refcount_dec_and_test(&sk->sk_refcnt))
                return;

        sk_free(sk);
}
/* Generic version of sock_put(), dealing with all sockets
 * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...)
 * Prototype only.
 */
void sock_gen_put(struct sock *sk);

/* Prototype only. */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested,
                     unsigned int trim_cap, bool refcounted);
/* __sk_receive_skb() with trim_cap fixed to 1 and refcounted to true. */
static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                                 const int nested)
{
        return __sk_receive_skb(sk, skb, nested, 1, true);
}

/* Remember @tx_queue as the socket's transmit queue mapping.
 *
 * sk_tx_queue_mapping only accepts a 16-bit value, and USHRT_MAX is
 * reserved as the "no mapping" marker (NO_QUEUE_MAPPING). Values outside
 * [0, USHRT_MAX) trigger a one-time warning and leave the mapping
 * untouched.
 */
static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
{
        /* Range-check the original int. Casting to unsigned short first
         * would truncate out-of-range values (negative or above
         * USHRT_MAX) into seemingly valid queue indexes that then get
         * stored silently.
         */
        if (WARN_ON_ONCE(tx_queue < 0 || tx_queue >= USHRT_MAX))
                return;
        /* Paired with READ_ONCE() in sk_tx_queue_get() and
         * other WRITE_ONCE() because socket lock might be not held.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
}

/* Marker stored in sk_tx_queue_mapping / sk_rx_queue_mapping when no
 * queue is recorded.
 */
#define NO_QUEUE_MAPPING        USHRT_MAX

/* Forget the socket's transmit queue mapping. */
static inline void sk_tx_queue_clear(struct sock *sk)
{
        /* Paired with READ_ONCE() in sk_tx_queue_get() and
         * other WRITE_ONCE() because socket lock might be not held.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}

/* Return the socket's recorded transmit queue, or -1 when @sk is NULL
 * or no queue is recorded (NO_QUEUE_MAPPING).
 */
static inline int sk_tx_queue_get(const struct sock *sk)
{
        int mapping;

        if (!sk)
                return -1;

        /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
         * and sk_tx_queue_set().
         */
        mapping = READ_ONCE(sk->sk_tx_queue_mapping);

        return mapping == NO_QUEUE_MAPPING ? -1 : mapping;
}

/* Record the receive queue of @skb in the socket, if the skb carries
 * one. A recorded queue equal to NO_QUEUE_MAPPING triggers a one-time
 * warning and is ignored. Empty when CONFIG_XPS is not set.
 */
static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        u16 rx_queue;

        if (!skb_rx_queue_recorded(skb))
                return;

        rx_queue = skb_get_rx_queue(skb);
        if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
                return;

        sk->sk_rx_queue_mapping = rx_queue;
#endif
}

/* Forget the socket's receive queue mapping by storing NO_QUEUE_MAPPING.
 * Empty when CONFIG_XPS is not set.
 */
static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_XPS
        sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
#endif
}

#ifdef CONFIG_XPS
/* Return the socket's recorded receive queue, or -1 when @sk is NULL or
 * no queue is recorded. Only available with CONFIG_XPS.
 */
static inline int sk_rx_queue_get(const struct sock *sk)
{
        if (!sk || sk->sk_rx_queue_mapping == NO_QUEUE_MAPPING)
                return -1;

        return sk->sk_rx_queue_mapping;
}
#endif

/* Point sk->sk_socket at @sock (which may be NULL, see sock_orphan()). */
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
        sk->sk_socket = sock;
}

/* Return the wait queue head of the socket's sk_wq.
 * 'wait' must be the first member of struct socket_wq, which is
 * enforced at build time.
 */
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
        struct socket_wq *wq;

        BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);

        wq = rcu_dereference_raw(sk->sk_wq);
        return &wq->wait;
}
/* Detach socket from process context.
 * Announce socket dead, detach it from wait queue and inode.
 * Note that parent inode held reference count on this struct sock,
 * we do not release it in this function, because protocol
 * probably wants some additional cleanups or even continuing
 * to work with this socket (TCP).
 *
 * All three updates (SOCK_DEAD flag, sk_socket, sk_wq) are made while
 * holding sk_callback_lock for writing, with bottom halves disabled.
 */
static inline void sock_orphan(struct sock *sk)
{
        write_lock_bh(&sk->sk_callback_lock);
        sock_set_flag(sk, SOCK_DEAD);
        sk_set_socket(sk, NULL);
        sk->sk_wq  = NULL;
        write_unlock_bh(&sk->sk_callback_lock);
}

/* Attach @sk to the struct socket @parent.
 * Warns if @parent already has a sock attached. Then, under
 * sk_callback_lock (write side, bottom halves disabled): publishes
 * @parent's wait queue as sk->sk_wq, links the two objects in both
 * directions, copies the owner uid from @parent's inode into sk_uid and
 * calls the security_sock_graft() hook.
 */
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
        WARN_ON(parent->sk);
        write_lock_bh(&sk->sk_callback_lock);
        rcu_assign_pointer(sk->sk_wq, &parent->wq);
        parent->sk = sk;
        sk_set_socket(sk, parent);
        sk->sk_uid = SOCK_INODE(parent)->i_uid;
        security_sock_graft(sk, parent);
        write_unlock_bh(&sk->sk_callback_lock);
}

/* Accessors for the socket's uid and inode number; prototypes only. */
kuid_t sock_i_uid(struct sock *sk);
unsigned long __sock_i_ino(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);

/* Return the uid of @sk, or, when @sk is NULL, uid 0 mapped through
 * the user namespace of @net.
 */
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
        if (sk)
                return sk->sk_uid;

        return make_kuid(net->user_ns, 0);
}

/* Return a pseudo-random 32-bit hash that is never 0: a zero draw from
 * prandom_u32() is replaced by 1.
 */
static inline u32 net_tx_rndhash(void)
{
        u32 hash = prandom_u32();

        if (!hash)
                hash = 1;

        return hash;
}

/* Give the socket a fresh, non-zero transmit hash from net_tx_rndhash(). */
static inline void sk_set_txhash(struct sock *sk)
{
        /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
        WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}

/* Pick a new transmit hash for a socket that already has one.
 * Returns true if the hash was replaced, false if sk_txhash was 0 and
 * was left alone.
 */
static inline bool sk_rethink_txhash(struct sock *sk)
{
        if (!sk->sk_txhash)
                return false;

        sk_set_txhash(sk);
        return true;
}

/* Return the socket's cached dst entry without taking a reference.
 * The RCU dereference is checked against lockdep_sock_is_held(), i.e.
 * it is accepted when the socket lock is held.
 */
static inline struct dst_entry *
__sk_dst_get(struct sock *sk)
{
        return rcu_dereference_check(sk->sk_dst_cache,
                                     lockdep_sock_is_held(sk));
}

/* Return the socket's cached dst entry with a reference taken on it,
 * or NULL when there is none or when its refcount had already dropped
 * to zero. The lookup runs under rcu_read_lock().
 */
static inline struct dst_entry *
sk_dst_get(struct sock *sk)
{
        struct dst_entry *cached;

        rcu_read_lock();
        cached = rcu_dereference(sk->sk_dst_cache);
        if (cached) {
                /* Entry is on its way out: behave as if nothing is cached. */
                if (!atomic_inc_not_zero(&cached->__refcnt))
                        cached = NULL;
        }
        rcu_read_unlock();

        return cached;
}

/* Give the cached dst entry's negative_advice() hook, if any, the
 * chance to return a different entry. When it does, the new entry is
 * published in sk_dst_cache, the tx queue mapping is cleared and the
 * pending-confirm flag is reset.
 */
static inline void __dst_negative_advice(struct sock *sk)
{
        struct dst_entry *cur = __sk_dst_get(sk);
        struct dst_entry *advised;

        if (!cur || !cur->ops->negative_advice)
                return;

        advised = cur->ops->negative_advice(cur);
        if (advised == cur)
                return;

        rcu_assign_pointer(sk->sk_dst_cache, advised);
        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
}

/* Re-roll the tx hash via sk_rethink_txhash() (its result is ignored) and
 * then run __dst_negative_advice() on the socket's cached route.
 */
static inline void dst_negative_advice(struct sock *sk)
{
        sk_rethink_txhash(sk);
        __dst_negative_advice(sk);
}

/* Replace the socket's cached route with @dst and release the previous one.
 *
 * The old pointer is read with rcu_dereference_protected() under the
 * lockdep_sock_is_held(sk) condition.  @dst may be NULL (see
 * __sk_dst_reset()).
 */
static inline void
__sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        /* Reset the tx queue mapping and pending-confirm flag first */
        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        old_dst = rcu_dereference_protected(sk->sk_dst_cache,
                                            lockdep_sock_is_held(sk));
        rcu_assign_pointer(sk->sk_dst_cache, dst);
        /* Drop the reference the cache held on the previous entry */
        dst_release(old_dst);
}

/* Replace the socket's cached route with @dst and release the previous one.
 *
 * Unlike __sk_dst_set(), the pointer is swapped with xchg() rather than
 * being read under the lockdep_sock_is_held() condition.  @dst may be NULL
 * (see sk_dst_reset()).
 */
static inline void
sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        /* Reset the tx queue mapping and pending-confirm flag first */
        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        /* __force cast strips the __rcu annotation so xchg() can be used */
        old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
        dst_release(old_dst);
}

/* Clear the socket's cached route by installing NULL via __sk_dst_set(). */
static inline void
__sk_dst_reset(struct sock *sk)
{
        __sk_dst_set(sk, NULL);
}

/* Clear the socket's cached route by installing NULL via sk_dst_set(). */
static inline void
sk_dst_reset(struct sock *sk)
{
        sk_dst_set(sk, NULL);
}

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);

/* Raise sk->sk_dst_pending_confirm.  The flag is tested first so that an
 * already-set flag is not written again.
 */
static inline void sk_dst_confirm(struct sock *sk)
{
        if (READ_ONCE(sk->sk_dst_pending_confirm))
                return;

        WRITE_ONCE(sk->sk_dst_pending_confirm, 1);
}

/* When @skb carries a pending confirmation, stamp @n->confirmed with the
 * current jiffies and clear the pending-confirm flag on the skb's socket,
 * if it has one.
 */
static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
{
        unsigned long now;
        struct sock *sk;

        if (!skb_get_dst_pending_confirm(skb))
                return;

        sk = skb->sk;
        now = jiffies;

        /* avoid dirtying neighbour */
        if (READ_ONCE(n->confirmed) != now)
                WRITE_ONCE(n->confirmed, now);

        if (!sk)
                return;

        /* Same idea for the socket: write only when the flag is set */
        if (READ_ONCE(sk->sk_dst_pending_confirm))
                WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
}

bool sk_mc_loop(struct sock *sk);

/* Return the result of net_gso_ok() for the socket's route capabilities
 * (sk_route_caps) and its GSO type (sk_gso_type).
 */
static inline bool sk_can_gso(const struct sock *sk)
{
        return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
}

void sk_setup_caps(struct sock *sk, struct dst_entry *dst);

/* Add @flags to sk->sk_route_nocaps and clear the same bits from
 * sk->sk_route_caps.
 */
static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags)
{
        sk->sk_route_nocaps |= flags;
        sk->sk_route_caps &= ~flags;
}

static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
                                           struct iov_iter *from, char *to,
                                           int copy, int offset)
{
        if (skb->ip_summed == CHECKSUM_NONE) {
                __wsum csum = 0;
                if (!csum_and_copy_from_iter_full(to, copy, &csum, from))
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, offset);
        } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
                if (!copy_from_iter_full_nocache(to, copy, from))
                        return -EFAULT;
        } else if (!copy_from_iter_full(to, copy, from))
                return -EFAULT;

        return 0;
}

/* Append @copy bytes from @from to the linear area of @skb.  If the copy
 * fails, the skb is trimmed back to the length it had on entry.
 *
 * Returns 0 on success or the error from skb_do_copy_data_nocache().
 */
static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
                                       struct iov_iter *from, int copy)
{
        int start = skb->len;
        char *dst;
        int rc;

        /* Reserve the room first; @start still names the old length */
        dst = skb_put(skb, copy);
        rc = skb_do_copy_data_nocache(sk, skb, from, dst, copy, start);
        if (rc)
                __skb_trim(skb, start);

        return rc;
}

/* Copy @copy bytes from @from into @page at byte offset @off, then grow
 * the skb's len, data_len and truesize by @copy and account the same
 * amount to the socket (sk_wmem_queued_add() and sk_mem_charge()).
 *
 * Returns 0 on success or the error from skb_do_copy_data_nocache(), in
 * which case no accounting is touched.
 */
static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from,
                                           struct sk_buff *skb,
                                           struct page *page,
                                           int off, int copy)
{
        char *dst = page_address(page) + off;
        int rc;

        rc = skb_do_copy_data_nocache(sk, skb, from, dst, copy, skb->len);
        if (rc)
                return rc;

        skb->len += copy;
        skb->data_len += copy;
        skb->truesize += copy;
        sk_wmem_queued_add(sk, copy);
        sk_mem_charge(sk, copy);
        return 0;
}

/**
 * sk_wmem_alloc_get - report the socket's write allocations
 * @sk: socket
 *
 * Return: the current value of sk_wmem_alloc with its initial offset of
 * one subtracted
 */
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
        unsigned int wmem = refcount_read(&sk->sk_wmem_alloc);

        return wmem - 1;
}

/**
 * sk_rmem_alloc_get - report the socket's read allocations
 * @sk: socket
 *
 * Return: the current value of sk_rmem_alloc
 */
static inline int sk_rmem_alloc_get(const struct sock *sk)
{
        int rmem = atomic_read(&sk->sk_rmem_alloc);

        return rmem;
}

/**
 * sk_has_allocations - check if allocations are outstanding
 * @sk: socket
 *
 * Return: true if socket has write or read allocations
 */
static inline bool sk_has_allocations(const struct sock *sk)
{
        return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
}

/**
 * skwq_has_sleeper - check if there are any waiting processes
 * @wq: struct socket_wq
 *
 * Return: true if socket_wq has waiting processes
 *
 * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
 * barrier call. They were added due to the race found within the tcp code.
 *
 * Consider following tcp code paths::
 *
 *   CPU1                CPU2
 *   sys_select          receive packet
 *   ...                 ...
 *   __add_wait_queue    update tp->rcv_nxt
 *   ...                 ...
 *   tp->rcv_nxt check   sock_def_readable
 *   ...                 {
 *   schedule               rcu_read_lock();
 *                          wq = rcu_dereference(sk->sk_wq);
 *                          if (wq && waitqueue_active(&wq->wait))
 *                              wake_up_interruptible(&wq->wait)
 *                          ...
 *                       }
 *
 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
 * in its cache, and so does the tp->rcv_nxt update on CPU2 side.  The CPU1
 * could then endup calling schedule and sleep forever if there are no more
 * data on the socket.
 *
 */
static inline bool skwq_has_sleeper(struct socket_wq *wq)
{
        return wq && wq_has_sleeper(&wq->wait);
}

/**
 * sock_poll_wait - poll_wait() on a socket, followed by a memory barrier
 * @filp:           file
 * @sock:           socket to wait on
 * @p:              poll_table
 *
 * Nothing is done when poll_does_not_wait() reports that @p will not wait.
 *
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
                                  poll_table *p)
{
        if (poll_does_not_wait(p))
                return;

        poll_wait(filp, &sock->wq.wait, p);

        /* We need to be sure we are in sync with the
         * socket flags modification.
         *
         * This memory barrier is paired in the wq_has_sleeper.
         */
        smp_mb();
}

/* Copy the socket's tx hash into @skb and mark it as an L4 hash.  A socket
 * with no hash (sk_txhash == 0) leaves the skb untouched.
 */
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
        /* This pairs with WRITE_ONCE() in sk_set_txhash() */
        u32 hash = READ_ONCE(sk->sk_txhash);

        if (!hash)
                return;

        skb->l4_hash = 1;
        skb->hash = hash;
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk);

/*
 *        Make @sk the receive-side owner of @skb: skb_orphan() it, point
 *        skb->sk at @sk, install sock_rfree as the destructor and account
 *        skb->truesize to sk_rmem_alloc and to the socket's memory charge.
 *
 *        NOTE(review): this comment used to read "Queue a received datagram
 *        if it will fit", but the helper neither queues the skb nor checks
 *        that it fits; callers are presumably expected to do both.
 *
 *        Inlined as it's very short and called for pretty much every
 *        packet ever received.
 */
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_rfree;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, skb->truesize);
}

/* Attach @skb to @sk, holding a socket reference for the skb's lifetime.
 *
 * The reference is only taken when @sk is non-NULL and its refcount is not
 * already zero; sock_efree is installed as the destructor.
 *
 * Returns true when the skb now owns a reference on @sk, false when
 * nothing was changed.
 */
static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
{
        if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
                return false;

        skb_orphan(skb);
        skb->destructor = sock_efree;
        skb->sk = sk;
        return true;
}

/* Clone @skb and charge the clone to @sk's receive memory.
 *
 * Returns the clone, owned by @sk through skb_set_owner_r(), or NULL when
 * the clone could not be allocated or sk_rmem_schedule() refused it (the
 * clone is freed in that case).  @skb itself is not modified here.
 */
static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk)
{
        struct sk_buff *clone = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));

        if (!clone)
                return NULL;

        if (!sk_rmem_schedule(sk, clone, clone->truesize)) {
                __kfree_skb(clone);
                return NULL;
        }

        skb_set_owner_r(clone, sk);
        return clone;
}

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires);

void sk_stop_timer(struct sock *sk, struct timer_list *timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);

int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
                        struct sk_buff *skb, unsigned int flags,
                        void (*destructor)(struct sock *sk,
                                           struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);

/*
 *        Fetch the pending socket error and clear it atomically.
 *
 *        Returns 0 when no error is pending, otherwise the negated
 *        value that was stored in sk->sk_err.
 */

static inline int sock_error(struct sock *sk)
{
        /* Avoid an atomic operation for the common case.
         * This is racy since another cpu/thread can change sk_err under us.
         */
        if (likely(data_race(!sk->sk_err)))
                return 0;

        return -xchg(&sk->sk_err, 0);
}

/* Return sk_sndbuf minus sk_wmem_alloc, clamped at zero.  A socket with
 * SEND_SHUTDOWN set always reports zero.
 */
static inline unsigned long sock_wspace(struct sock *sk)
{
        int room;

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                return 0;

        room = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc);
        return room < 0 ? 0 : room;
}

/* Note:
 *  We use sk->sk_wq_raw, from contexts knowing this
 *  pointer is not NULL and cannot disappear/change.
 */
static inline void sk_set_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        set_bit(nr, &sk->sk_wq_raw->flags);
}

static inline void sk_clear_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        clear_bit(nr, &sk->sk_wq_raw->flags);
}

/* Forward an async wakeup (@how, @band) to sock_wake_async() for the
 * socket's wait queue.  Does nothing unless SOCK_FASYNC is set.
 */
static inline void sk_wake_async(const struct sock *sk, int how, int band)
{
        if (!sock_flag(sk, SOCK_FASYNC))
                return;

        rcu_read_lock();
        sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
        rcu_read_unlock();
}

/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
 * need sizeof(sk_buff) + MTU + padding, unless the net driver performs
 * copybreak.
 * Note: for send buffers, TCP works better if we can build two skbs at
 * minimum.
 */
#define TCP_SKB_MIN_TRUESIZE        (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff)))

/* Send-buffer floor: room for two minimum-truesize skbs (see note above) */
#define SOCK_MIN_SNDBUF                (TCP_SKB_MIN_TRUESIZE * 2)
/* Receive-buffer floor: room for one minimum-truesize skb */
#define SOCK_MIN_RCVBUF                 TCP_SKB_MIN_TRUESIZE

/* Shrink sk_sndbuf towards half of sk_wmem_queued, never going below
 * SOCK_MIN_SNDBUF and never raising it above its current value unless the
 * floor requires it.  Sockets with SOCK_SNDBUF_LOCK set are left alone.
 */
static inline void sk_stream_moderate_sndbuf(struct sock *sk)
{
        u32 target;

        if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) {
                target = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);

                WRITE_ONCE(sk->sk_sndbuf, max_t(u32, target, SOCK_MIN_SNDBUF));
        }
}

struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
                                    bool force_schedule);

/**
 * sk_page_frag - return an appropriate page_frag
 * @sk: socket
 *
 * Use the per task page_frag instead of the per socket one for
 * optimization when we know that we're in process context and own
 * everything that's associated with %current.
 *
 * Both direct reclaim and page faults can nest inside other
 * socket operations and end up recursing into sk_page_frag()
 * while it's already in use: explicitly avoid task page_frag
 * usage if the caller is potentially doing any of them.
 * This assumes that page fault handlers use the GFP_NOFS flags.
 *
 * Return: a per task page_frag if context allows that,
 * otherwise a per socket one.
 */
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
        if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
            (__GFP_DIRECT_RECLAIM | __GFP_FS))
                return &current->task_frag;

        return &sk->sk_frag;
}

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);

/*
 *        Default write policy as shown to user space via poll/select/SIGIO:
 *        the socket is writeable while sk_wmem_alloc is below half of
 *        sk_sndbuf.
 */
static inline bool sock_writeable(const struct sock *sk)
{
        return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
}

/* Pick allocation flags for the current context: GFP_ATOMIC when
 * in_softirq() is true, GFP_KERNEL otherwise.
 */
static inline gfp_t gfp_any(void)
{
        if (in_softirq())
                return GFP_ATOMIC;

        return GFP_KERNEL;
}

/* Receive timeout to use: 0 for a non-blocking call, otherwise the
 * socket's sk_rcvtimeo.
 */
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return sk->sk_rcvtimeo;
}

/* Send timeout to use: 0 for a non-blocking call, otherwise the
 * socket's sk_sndtimeo.
 */
static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return sk->sk_sndtimeo;
}

/* Number of bytes a receive should wait for: @len when @waitall is set,
 * otherwise the smaller of sk_rcvlowat and @len.  A result of zero is
 * replaced by 1.
 */
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
{
        int target = len;

        if (!waitall)
                target = min_t(int, READ_ONCE(sk->sk_rcvlowat), len);

        return target ? target : 1;
}

/* Alas, socket operations with a timeout are not restartable: only a wait
 * with @timeo == MAX_SCHEDULE_TIMEOUT yields -ERESTARTSYS, every other
 * value yields -EINTR.  Compare this to poll().
 */
static inline int sock_intr_errno(long timeo)
{
        if (timeo == MAX_SCHEDULE_TIMEOUT)
                return -ERESTARTSYS;

        return -EINTR;
}

/* Socket-layer state kept inside skb->cb[]; accessed via SOCK_SKB_CB(). */
struct sock_skb_cb {
        u32 dropcount;        /* filled in by sock_skb_set_dropcount() */
};

/* Store sock_skb_cb at the end of skb->cb[] so protocol families
 * using skb->cb[] would keep using it directly and utilize its
 * alignment guarantee.
 */
#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
                            sizeof(struct sock_skb_cb)))

/* Pointer to the sock_skb_cb located SOCK_SKB_CB_OFFSET bytes into
 * @__skb's cb[] array.
 */
#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
                            SOCK_SKB_CB_OFFSET))

/* Build-time check that @size bytes of cb[] usage stay within
 * SOCK_SKB_CB_OFFSET, i.e. do not reach the trailing sock_skb_cb.
 */
#define sock_skb_cb_check_size(size) \
        BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)

/* Record the socket's drop counter in the skb's sock_skb_cb.  The value
 * is the current sk_drops when SOCK_RXQ_OVFL is set, and 0 otherwise.
 */
static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
        int drops = 0;

        if (sock_flag(sk, SOCK_RXQ_OVFL))
                drops = atomic_read(&sk->sk_drops);

        SOCK_SKB_CB(skb)->dropcount = drops;
}

/* Account @skb as dropped in sk->sk_drops.  The amount added is the skb's
 * gso_segs, with a minimum of one.
 */
static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
{
        int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);

        atomic_add(segs, &sk->sk_drops);
}

/* Read sk->sk_stamp.
 *
 * With BITS_PER_LONG == 32 the read is done inside a sk_stamp_seq seqlock
 * read section and retried until it is consistent; otherwise a single
 * READ_ONCE() is used.  Pairs with sock_write_timestamp().
 */
static inline ktime_t sock_read_timestamp(struct sock *sk)
{
#if BITS_PER_LONG==32
        unsigned int seq;
        ktime_t kt;

        do {
                seq = read_seqbegin(&sk->sk_stamp_seq);
                kt = sk->sk_stamp;
        } while (read_seqretry(&sk->sk_stamp_seq, seq));

        return kt;
#else
        return READ_ONCE(sk->sk_stamp);
#endif
}

/* Store @kt in sk->sk_stamp.
 *
 * With BITS_PER_LONG == 32 the store is done under the sk_stamp_seq
 * seqlock; otherwise a single WRITE_ONCE() is used.  Pairs with
 * sock_read_timestamp().
 */
static inline void sock_write_timestamp(struct sock *sk, ktime_t kt)
{
#if BITS_PER_LONG==32
        write_seqlock(&sk->sk_stamp_seq);
        sk->sk_stamp = kt;
        write_sequnlock(&sk->sk_stamp_seq);
#else
        WRITE_ONCE(sk->sk_stamp, kt);
#endif
}

void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
                           struct sk_buff *skb);
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
                             struct sk_buff *skb);

/* Handle receive timestamping for @skb on @sk.
 *
 * Depending on the socket's flags, either hand the skb to
 * __sock_recv_timestamp() to generate control messages in @msg, or just
 * record skb->tstamp in the socket via sock_write_timestamp().  After
 * that, __sock_recv_wifi_status() is called when SOCK_WIFI_STATUS is set
 * and the skb has a valid wifi ack status.
 */
static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
        ktime_t kt = skb->tstamp;
        struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);

        /*
         * generate control messages if
         * - receive time stamping in software requested
         * - software time stamp available and wanted
         * - hardware time stamps available and wanted
         */
        if (sock_flag(sk, SOCK_RCVTSTAMP) ||
            (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
            (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
            (hwtstamps->hwtstamp &&
             (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
                __sock_recv_timestamp(msg, sk, skb);
        else
                /* no cmsg wanted: only remember the skb's timestamp */
                sock_write_timestamp(sk, kt);

        if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid)
                __sock_recv_wifi_status(msg, sk, skb);
}

void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
                              struct sk_buff *skb);

/* Sentinel sk_stamp value; sock_recv_ts_and_drops() replaces it with 0 */
#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)
/* Receive-path hook for timestamps and drop counts.
 *
 * - If the socket has SOCK_RXQ_OVFL or SOCK_RCVTSTAMP set, or any of the
 *   TSFLAGS_ANY timestamping flags, defer to __sock_recv_ts_and_drops().
 * - Else, if SOCK_TIMESTAMP is set, record skb->tstamp in the socket.
 * - Else, if the socket's stamp still holds SK_DEFAULT_STAMP, reset it to 0.
 */
static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
                                          struct sk_buff *skb)
{
/* sk_flags bits that require the out-of-line helper */
#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL)                        | \
                           (1UL << SOCK_RCVTSTAMP))
/* sk_tsflags bits that require the out-of-line helper */
#define TSFLAGS_ANY          (SOF_TIMESTAMPING_SOFTWARE                        | \
                           SOF_TIMESTAMPING_RAW_HARDWARE)

        if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY)
                __sock_recv_ts_and_drops(msg, sk, skb);
        else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
                sock_write_timestamp(sk, skb->tstamp);
        else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP))
                sock_write_timestamp(sk, 0);
}

void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);

/**
 * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
 * @sk:                socket sending this packet
 * @tsflags:        timestamping flags to use
 * @tx_flags:        completed with instructions for time stamping
 * @tskey:      filled in with next sk_tskey (not for TCP, which uses seqno);
 *              may be NULL, in which case no key is assigned
 *
 * Note: callers should take care of initial ``*tx_flags`` value (usually 0)
 */
static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags,
                                      __u8 *tx_flags, __u32 *tskey)
{
        if (unlikely(tsflags)) {
                __sock_tx_timestamp(tsflags, tx_flags);
                /* Hand out a key only when OPT_ID is requested, the caller
                 * provided storage, and some TX record flag is set; this
                 * post-increments sk->sk_tskey.
                 */
                if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey &&
                    tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
                        *tskey = sk->sk_tskey++;
        }
        /* Independent of @tsflags: propagate the wifi status request */
        if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
                *tx_flags |= SKBTX_WIFI_STATUS;
}

/* _sock_tx_timestamp() without a tskey destination (NULL is passed). */
static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags,
                                     __u8 *tx_flags)
{
        _sock_tx_timestamp(sk, tsflags, tx_flags, NULL);
}

/* Run _sock_tx_timestamp() for @skb's own socket, writing the result
 * straight into the skb's shared info (tx_flags and tskey).
 */
static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
{
        _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags,
                           &skb_shinfo(skb)->tskey);
}

DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
/**
 * sk_eat_skb - Release a skb if it is no longer needed
 * @sk: socket to eat this skb from
 * @skb: socket buffer to eat
 *
 * The skb is unlinked from sk->sk_receive_queue.  It is then either parked
 * in sk->sk_rx_skb_cache (when tcp_rx_skb_cache_key is enabled and that
 * slot is empty) or freed.
 *
 * This routine must be called with interrupts disabled or with the socket
 * locked so that the sk_buff queue operation is ok.
 */
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
        __skb_unlink(skb, &sk->sk_receive_queue);
        if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
            !sk->sk_rx_skb_cache) {
                /* Keep the skb around in the one-entry cache instead of freeing */
                sk->sk_rx_skb_cache = skb;
                skb_orphan(skb);
                return;
        }
        __kfree_skb(skb);
}

/* Return the network namespace stored in sk->sk_net (via read_pnet()). */
static inline
struct net *sock_net(const struct sock *sk)
{
        return read_pnet(&sk->sk_net);
}

/* Store @net in sk->sk_net (via write_pnet()). */
static inline
void sock_net_set(struct sock *sk, struct net *net)
{
        write_pnet(&sk->sk_net, net);
}

/* Tell whether @skb's destructor is sock_pfree.  Always false when
 * CONFIG_INET is not set.
 */
static inline bool
skb_sk_is_prefetched(struct sk_buff *skb)
{
#ifdef CONFIG_INET
        return skb->destructor == sock_pfree;
#else
        return false;
#endif /* CONFIG_INET */
}

/* This helper checks if a socket is a full socket,
 * i.e. _not_ a timewait or request socket.
 *
 * The test is on sk_state: any state other than TIME_WAIT and
 * NEW_SYN_RECV counts as a full socket.
 */
static inline bool sk_fullsock(const struct sock *sk)
{
        return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
}

static inline bool
sk_is_refcounted(struct sock *sk)
{
        /* Only full sockets have sk->sk_flags. */
        return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
}

/**
 * skb_steal_sock - steal a socket from an sk_buff
 * @skb: sk_buff to steal the socket from
 * @refcounted: is set to true if the socket is reference-counted
 *
 * On success the skb's destructor and sk pointer are both cleared.
 *
 * Return: the socket that was attached to @skb, or NULL (with
 * ``*refcounted`` set to false) when the skb had none
 */
static inline struct sock *
skb_steal_sock(struct sk_buff *skb, bool *refcounted)
{
        struct sock *sk = skb->sk;

        if (!sk) {
                *refcounted = false;
                return NULL;
        }

        /* A prefetched socket may turn out not to be refcounted */
        *refcounted = true;
        if (skb_sk_is_prefetched(skb))
                *refcounted = sk_is_refcounted(sk);

        skb->destructor = NULL;
        skb->sk = NULL;
        return sk;
}

/* Checks if this SKB belongs to an HW offloaded socket
 * and whether any SW fallbacks are required based on dev.
 * Check decrypted mark in case skb_orphan() cleared socket.
 *
 * Returns the skb to continue with: the one produced by the socket's
 * sk_validate_xmit_skb callback when it has one, NULL when the skb was
 * dropped here, or @skb unchanged otherwise.  Without
 * CONFIG_SOCK_VALIDATE_XMIT this is a no-op returning @skb.
 */
static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
                                                   struct net_device *dev)
{
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sock *sk = skb->sk;

        if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
                skb = sk->sk_validate_xmit_skb(sk, dev, skb);
#ifdef CONFIG_TLS_DEVICE
        } else if (unlikely(skb->decrypted)) {
                /* decrypted mark set but no usable socket callback: drop */
                pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
                kfree_skb(skb);
                skb = NULL;
#endif
        }
#endif

        return skb;
}

/* This helper checks if a socket is in the LISTEN or NEW_SYN_RECV state.
 * SYNACK messages can be attached to either one (depending on SYNCOOKIE).
 */
static inline bool sk_listener(const struct sock *sk)
{
        return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
}

void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
                       int type);

bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap);
bool sk_capable(const struct sock *sk, int cap);
bool sk_net_capable(const struct sock *sk, int cap);

void sk_get_meminfo(const struct sock *sk, u32 *meminfo);

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both maxima below work out to _SK_MEM_PACKETS buffers of
 * SKB_TRUESIZE(256) each.
 */
#define _SK_MEM_PACKETS                256
#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
#define SK_WMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;

extern int sysctl_tstamp_allow_data;
extern int sysctl_optmem_max;

extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;

#define SKB_FRAG_PAGE_ORDER        get_order(32768)
DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/* Return the first sysctl_wmem value that applies to @sk for @proto.
 *
 * When @proto has a non-zero sysctl_wmem_offset, the value is read from
 * that byte offset inside the socket's struct net; otherwise it comes
 * from the proto-wide sysctl_wmem pointer.
 */
static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
        int *pernet;

        /* Does this proto have per netns sysctl_wmem ? */
        if (!proto->sysctl_wmem_offset)
                return READ_ONCE(*proto->sysctl_wmem);

        pernet = (int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
        return READ_ONCE(*pernet);
}

/* Return the first sysctl_rmem value that applies to @sk for @proto.
 *
 * When @proto has a non-zero sysctl_rmem_offset, the value is read from
 * that byte offset inside the socket's struct net; otherwise it comes
 * from the proto-wide sysctl_rmem pointer.
 */
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
        int *pernet;

        /* Does this proto have per netns sysctl_rmem ? */
        if (!proto->sysctl_rmem_offset)
                return READ_ONCE(*proto->sysctl_rmem);

        pernet = (int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
        return READ_ONCE(*pernet);
}

/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
 * Some wifi drivers need to tweak it to get more chunks.
 * They can use this helper from their ndo_start_xmit()
 *
 * @sk may be NULL or a non-full socket, in which case nothing happens.
 * The store is also skipped when sk_pacing_shift already equals @val.
 */
static inline void sk_pacing_shift_update(struct sock *sk, int val)
{
        if (!sk || !sk_fullsock(sk))
                return;

        if (READ_ONCE(sk->sk_pacing_shift) == val)
                return;

        WRITE_ONCE(sk->sk_pacing_shift, val);
}

/* If a socket is bound to a device, check that the given device
 * index is either the same or that the socket is bound to an L3
 * master device and the given device index is also enslaved to
 * that L3 master.
 *
 * An unbound socket (sk_bound_dev_if == 0) matches every @dif.
 */
static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
{
        int master;

        if (!sk->sk_bound_dev_if)
                return true;
        if (sk->sk_bound_dev_if == dif)
                return true;

        /* Not a direct match: compare against @dif's L3 master, if any */
        master = l3mdev_master_ifindex_by_index(sock_net(sk), dif);
        return master && master == sk->sk_bound_dev_if;
}

void sock_def_readable(struct sock *sk);

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
void sock_enable_timestamps(struct sock *sk);
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
void sock_set_rcvbuf(struct sock *sk, int val);
void sock_set_mark(struct sock *sk, u32 val);
void sock_set_reuseaddr(struct sock *sk);
void sock_set_reuseport(struct sock *sk);
void sock_set_sndtimeo(struct sock *sk, s64 secs);

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);

#endif        /* _SOCK_H */
























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>

/*
 * Routines for handling mm_structs
 */
extern struct mm_struct *mm_alloc(void);

/**
 * mmgrab() - Pin a &struct mm_struct.
 * @mm: The &struct mm_struct to pin.
 *
 * Make sure that @mm will not get freed even after the owning task
 * exits. This doesn't guarantee that the associated address space
 * will still exist later on and mmget_not_zero() has to be used before
 * accessing it.
 *
 * This is a preferred way to pin @mm for a longer/unbounded amount
 * of time.
 *
 * Use mmdrop() to release the reference acquired by mmgrab().
 *
 * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmgrab(struct mm_struct *mm)
{
        /* Unconditional increment of mm_count; dropped again by mmdrop() */
        atomic_inc(&mm->mm_count);
}

extern void __mmdrop(struct mm_struct *mm);

/**
 * mmdrop() - Release a reference on &mm_struct.mm_count.
 * @mm: The &struct mm_struct to unpin.
 *
 * Decrements mm_count and calls __mmdrop() when the count reaches zero.
 */
static inline void mmdrop(struct mm_struct *mm)
{
        /*
         * The implicit full barrier implied by atomic_dec_and_test() is
         * required by the membarrier system call before returning to
         * user-space, after storing to rq->curr.
         */
        if (unlikely(atomic_dec_and_test(&mm->mm_count)))
                __mmdrop(mm);
}

/**
 * mmget() - Pin the address space associated with a &struct mm_struct.
 * @mm: The address space to pin.
 *
 * Make sure that the address space of the given &struct mm_struct doesn't
 * go away. This does not protect against parts of the address space being
 * modified or freed, however.
 *
 * Never use this function to pin this address space for an
 * unbounded/indefinite amount of time.
 *
 * Use mmput() to release the reference acquired by mmget().
 *
 * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmget(struct mm_struct *mm)
{
        /* Unconditional increment; mmget_not_zero() is the checked variant */
        atomic_inc(&mm->mm_users);
}

/**
 * mmget_not_zero() - Take an &mm_struct.mm_users reference unless it is zero.
 * @mm: The address space to pin.
 *
 * Return: the result of atomic_inc_not_zero() on mm_users, i.e. true when
 * the reference was taken and false when the count was already zero.
 */
static inline bool mmget_not_zero(struct mm_struct *mm)
{
        return atomic_inc_not_zero(&mm->mm_users);
}

/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
#ifdef CONFIG_MMU
/* same as above but performs the slow path from the async context. Can
 * be called from the atomic context as well
 */
void mmput_async(struct mm_struct *);
#endif

/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
 * Grab a reference to a task's mm, if it is not already going away
 * and ptrace_may_access with the mode parameter passed to it
 * succeeds.
 */
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current tasks stale references to the old mm_struct on exit() */
extern void exit_mm_release(struct task_struct *, struct mm_struct *);
/* Remove the current tasks stale references to the old mm_struct on exec() */
extern void exec_mm_release(struct task_struct *, struct mm_struct *);

/* mm_update_next_owner() is only implemented with CONFIG_MEMCG; without
 * it the call compiles to an empty inline.
 */
#ifdef CONFIG_MEMCG
extern void mm_update_next_owner(struct mm_struct *mm);
#else
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}
#endif /* CONFIG_MEMCG */

#ifdef CONFIG_MMU
/* Default used when the architecture does not define its own: TASK_SIZE,
 * regardless of @addr.
 */
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr)        (TASK_SIZE)
#endif

/* Default used when the architecture does not define its own: @base is
 * returned unchanged, regardless of @addr.
 */
#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif

extern void arch_pick_mmap_layout(struct mm_struct *mm,
                                  struct rlimit *rlim_stack);
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
                       unsigned long, unsigned long);
extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags);
#else
/* Without CONFIG_MMU, picking an mmap layout is a no-op */
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
                                         struct rlimit *rlim_stack) {}
#endif

static inline bool in_vfork(struct task_struct *tsk)
{
        bool ret;

        /*
         * need RCU to access ->real_parent if CLONE_VM was used along with
         * CLONE_PARENT.
         *
         * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
         * imply CLONE_VM
         *
         * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
         * ->real_parent is not necessarily the task doing vfork(), so in
         * theory we can't rely on task_lock() if we want to dereference it.
         *
         * And in this case we can't trust the real_parent->mm == tsk->mm
         * check, it can be false negative. But we do not care, if init or
         * another oom-unkillable task does this it should blame itself.
         */
        rcu_read_lock();
        ret = tsk->vfork_done &&
                        rcu_dereference(tsk->real_parent)->mm == tsk->mm;
        rcu_read_unlock();

        return ret;
}

/*
 * Applies per-task gfp context to the given allocation flags.
 * PF_MEMALLOC_NOIO implies GFP_NOIO
 * PF_MEMALLOC_NOFS implies GFP_NOFS
 *
 * Returns @flags unchanged when neither task flag is set.
 */
static inline gfp_t current_gfp_context(gfp_t flags)
{
        unsigned int task_flags = READ_ONCE(current->flags);

        if (likely(!(task_flags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))))
                return flags;

        /*
         * NOIO implies both NOIO and NOFS and it is a weaker context
         * so always make sure it takes precedence
         */
        if (task_flags & PF_MEMALLOC_NOIO)
                return flags & ~(__GFP_IO | __GFP_FS);

        /* Only PF_MEMALLOC_NOFS can be set at this point */
        return flags & ~__GFP_FS;
}

/*
 * fs_reclaim hooks: with CONFIG_LOCKDEP they are real functions defined
 * elsewhere; without it they collapse to empty inline stubs, so callers do
 * not need their own #ifdefs. The stubs ignore @gfp_mask.
 */
#ifdef CONFIG_LOCKDEP
extern void __fs_reclaim_acquire(void);
extern void __fs_reclaim_release(void);
extern void fs_reclaim_acquire(gfp_t gfp_mask);
extern void fs_reclaim_release(gfp_t gfp_mask);
#else
static inline void __fs_reclaim_acquire(void) { }
static inline void __fs_reclaim_release(void) { }
static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif

/**
 * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
 *
 * This function marks the beginning of the GFP_NOIO allocation scope by
 * setting PF_MEMALLOC_NOIO in current->flags. All further allocations will
 * implicitly drop the __GFP_IO flag, so they are safe for the IO critical
 * section from the allocation recursion point of view. End the scope with
 * memalloc_noio_restore(), passing the value returned here.
 *
 * This function is safe to be used from any context.
 *
 * Return: the previous state of PF_MEMALLOC_NOIO (that bit or 0).
 */
static inline unsigned int memalloc_noio_save(void)
{
        unsigned int task_flags = current->flags;

        current->flags = task_flags | PF_MEMALLOC_NOIO;
        return task_flags & PF_MEMALLOC_NOIO;
}

/**
 * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOIO scope started by memalloc_noio_save(): the
 * PF_MEMALLOC_NOIO bit of current->flags is cleared and @flags is OR-ed
 * back in. Always pass the return value of the pairing
 * memalloc_noio_save() call as @flags.
 */
static inline void memalloc_noio_restore(unsigned int flags)
{
        unsigned int without_noio = current->flags & ~PF_MEMALLOC_NOIO;

        current->flags = without_noio | flags;
}

/**
 * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
 *
 * This function marks the beginning of the GFP_NOFS allocation scope by
 * setting PF_MEMALLOC_NOFS in current->flags. All further allocations will
 * implicitly drop the __GFP_FS flag, so they are safe for the FS critical
 * section from the allocation recursion point of view. End the scope with
 * memalloc_nofs_restore(), passing the value returned here.
 *
 * This function is safe to be used from any context.
 *
 * Return: the previous state of PF_MEMALLOC_NOFS (that bit or 0).
 */
static inline unsigned int memalloc_nofs_save(void)
{
        unsigned int task_flags = current->flags;

        current->flags = task_flags | PF_MEMALLOC_NOFS;
        return task_flags & PF_MEMALLOC_NOFS;
}

/**
 * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save(): the
 * PF_MEMALLOC_NOFS bit of current->flags is cleared and @flags is OR-ed
 * back in. Always pass the return value of the pairing
 * memalloc_nofs_save() call as @flags.
 */
static inline void memalloc_nofs_restore(unsigned int flags)
{
        unsigned int without_nofs = current->flags & ~PF_MEMALLOC_NOFS;

        current->flags = without_nofs | flags;
}

/*
 * Sets PF_MEMALLOC in current->flags and returns the previous state of that
 * bit (PF_MEMALLOC or 0), to be handed to memalloc_noreclaim_restore().
 */
static inline unsigned int memalloc_noreclaim_save(void)
{
        unsigned int task_flags = current->flags;

        current->flags = task_flags | PF_MEMALLOC;
        return task_flags & PF_MEMALLOC;
}

/*
 * Clears PF_MEMALLOC in current->flags and then ORs @flags back in. @flags
 * should be the value returned by the matching memalloc_noreclaim_save().
 */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
        unsigned int without_memalloc = current->flags & ~PF_MEMALLOC;

        current->flags = without_memalloc | flags;
}

#ifdef CONFIG_CMA
/*
 * Sets PF_MEMALLOC_NOCMA in current->flags and returns the previous state of
 * that bit (PF_MEMALLOC_NOCMA or 0), to be handed to
 * memalloc_nocma_restore().
 */
static inline unsigned int memalloc_nocma_save(void)
{
        unsigned int task_flags = current->flags;

        current->flags = task_flags | PF_MEMALLOC_NOCMA;
        return task_flags & PF_MEMALLOC_NOCMA;
}

/*
 * Clears PF_MEMALLOC_NOCMA in current->flags and then ORs @flags back in.
 * @flags should be the value returned by the matching memalloc_nocma_save().
 */
static inline void memalloc_nocma_restore(unsigned int flags)
{
        unsigned int without_nocma = current->flags & ~PF_MEMALLOC_NOCMA;

        current->flags = without_nocma | flags;
}
#else
/* !CONFIG_CMA stub: no task flag is touched; always returns 0. */
static inline unsigned int memalloc_nocma_save(void) { return 0; }

/* !CONFIG_CMA stub: @flags is ignored and nothing is restored. */
static inline void memalloc_nocma_restore(unsigned int flags) { }
#endif

#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
 * set_active_memcg - Starts the remote memcg charging scope.
 * @memcg: memcg to charge.
 *
 * This function marks the beginning of the remote memcg charging scope. All
 * the __GFP_ACCOUNT allocations till the end of the scope will be charged to
 * the given memcg.
 *
 * When in_interrupt() is true the value is kept in the per-CPU
 * int_active_memcg variable; otherwise it is kept in current->active_memcg.
 *
 * NOTE: This function can nest. Users must save the return value and
 * reset the previous value after their own charging scope is over.
 *
 * Return: the memcg that was active in the selected slot before this call.
 */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        struct mem_cgroup *prev;

        /* Task context: the scope is tracked in the task itself. */
        if (!in_interrupt()) {
                prev = current->active_memcg;
                current->active_memcg = memcg;
                return prev;
        }

        /* Interrupt context: the scope is tracked per CPU. */
        prev = this_cpu_read(int_active_memcg);
        this_cpu_write(int_active_memcg, memcg);
        return prev;
}
#else
/* !CONFIG_MEMCG stub: @memcg is ignored and the previous value is always NULL. */
static inline struct mem_cgroup *set_active_memcg(struct mem_cgroup *memcg)
{
        return NULL;
}
#endif

#ifdef CONFIG_MEMBARRIER
/*
 * Single-bit flags, one bit per value (bits 0-7).
 * MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE is tested against
 * mm->membarrier_state in membarrier_mm_sync_core_before_usermode() below.
 * NOTE(review): the remaining values are presumably bits of that same field,
 * with each *_READY bit paired with the flag of the same name - confirm
 * against the membarrier implementation.
 */
enum {
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY                = (1U << 0),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED                        = (1U << 1),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY                        = (1U << 2),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                        = (1U << 3),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY        = (1U << 4),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE                = (1U << 5),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY                = (1U << 6),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ                        = (1U << 7),
};

/*
 * Single-bit flags (bits 0 and 1). No user is visible in this part of the
 * header.
 * NOTE(review): presumably selects the sync-core / rseq variants inside the
 * membarrier implementation - confirm against its callers.
 */
enum {
        MEMBARRIER_FLAG_SYNC_CORE        = (1U << 0),
        MEMBARRIER_FLAG_RSEQ                = (1U << 1),
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif

/*
 * Calls sync_core_before_usermode() only when @mm is the current task's mm
 * and MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE is set in
 * @mm->membarrier_state; otherwise does nothing. The state is not read at
 * all when @mm is not current->mm.
 */
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
        if (current->mm == mm &&
            unlikely(atomic_read(&mm->membarrier_state) &
                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))
                sync_core_before_usermode();
}

/*
 * CONFIG_MEMBARRIER builds: declared here, defined elsewhere. The
 * !CONFIG_MEMBARRIER branch below provides an empty inline version instead.
 */
extern void membarrier_exec_mmap(struct mm_struct *mm);

#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
/*
 * Empty version for builds that have CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 * but not CONFIG_MEMBARRIER. All three arguments are ignored.
 */
static inline void
membarrier_arch_switch_mm(struct mm_struct *prev, struct mm_struct *next,
                          struct task_struct *tsk) { }
#endif
/* !CONFIG_MEMBARRIER stub: @mm is ignored. */
static inline void membarrier_exec_mmap(struct mm_struct *mm) { }
/* !CONFIG_MEMBARRIER stub: @mm is ignored and nothing is synchronized. */
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { }
#endif

#endif /* _LINUX_SCHED_MM_H */




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

/*
 * Static keys for cpusets, both defined with DEFINE_STATIC_KEY_FALSE.
 * NOTE(review): presumably switched on elsewhere once cpusets are in use -
 * confirm against the code that increments these keys.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See "Frequency meter" comments, below. */

/*
 * State of one frequency meter: a count of events not yet processed, the
 * most recently computed output value, and the time (in seconds) at which
 * that value was computed. @lock must be held to read or write the other
 * three fields.
 */
struct fmeter {
        int cnt;                /* unprocessed events count */
        int val;                /* most recent output value */
        time64_t time;                /* clock (secs) when val computed */
        spinlock_t lock;        /* guards read or write of above */
};

/*
 * Per-cgroup cpuset state: the configured and effective CPU and memory
 * node masks plus the flags and counters that go with them.
 */
struct cpuset {
        /* embedded cgroup state; css_cs() maps it back to this cpuset */
        struct cgroup_subsys_state css;

        /* bit indices are cpuset_flagbits_t values */
        unsigned long flags;                /* "unsigned long" so bitops work */

        /*
         * On default hierarchy:
         *
         * The user-configured masks can only be changed by writing to
         * cpuset.cpus and cpuset.mems, and won't be limited by the
         * parent masks.
         *
         * The effective masks are the real masks that apply to the tasks
         * in the cpuset. They may be changed if the configured masks are
         * changed or hotplug happens.
         *
         * effective_mask == configured_mask & parent's effective_mask,
         * and if it ends up empty, it will inherit the parent's mask.
         *
         *
         * On legacy hierarchy:
         *
         * The user-configured masks are always the same as the effective
         * masks.
         */

        /* user-configured CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t cpus_allowed;
        nodemask_t mems_allowed;

        /* effective CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t effective_cpus;
        nodemask_t effective_mems;

        /*
         * CPUs allocated to child sub-partitions (default hierarchy only)
         * - CPUs granted by the parent = effective_cpus U subparts_cpus
         * - effective_cpus and subparts_cpus are mutually exclusive.
         *
         * effective_cpus contains only onlined CPUs, but subparts_cpus
         * may have offlined ones.
         */
        cpumask_var_t subparts_cpus;

        /*
         * These are the old Memory Nodes that tasks took on.
         *
         * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
         * - A new cpuset's old_mems_allowed is initialized when some
         *   task is moved into it.
         * - old_mems_allowed is used in cpuset_migrate_mm() when we change
         *   cpuset.mems_allowed and have tasks' nodemask updated, and
         *   then old_mems_allowed is updated to mems_allowed.
         */
        nodemask_t old_mems_allowed;

        struct fmeter fmeter;                /* memory_pressure filter */

        /*
         * Tasks are being attached to this cpuset.  Used to prevent
         * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
         */
        int attach_in_progress;

        /* partition number for rebuild_sched_domains() */
        int pn;

        /* for custom sched domain */
        int relax_domain_level;

        /* number of CPUs in subparts_cpus */
        int nr_subparts_cpus;

        /* partition root state (one of the PRS_* values) */
        int partition_root_state;

        /*
         * Default hierarchy only:
         * use_parent_ecpus - set if using parent's effective_cpus
         * child_ecpus_count - # of children with use_parent_ecpus set
         */
        int use_parent_ecpus;
        int child_ecpus_count;

        /*
         * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
         * know when to rebuild associated root domain bandwidth information.
         *
         * NOTE(review): nr_migrate_dl_tasks and sum_migrate_dl_bw look like
         * bookkeeping for SCHED_DEADLINE tasks that are in the middle of
         * being migrated; their users are not visible here - confirm.
         */
        int nr_deadline_tasks;
        int nr_migrate_dl_tasks;
        u64 sum_migrate_dl_bw;
};

/*
 * Values stored in cpuset->partition_root_state:
 *
 *   PRS_DISABLED ( 0) - not a partition root
 *
 *   PRS_ENABLED  ( 1) - partition root
 *
 *   PRS_ERROR    (-1) - invalid partition root
 *       None of the cpus in cpus_allowed can be put into the parent's
 *       subparts_cpus.  The cpuset is then no longer a real partition
 *       root.  The CPU_EXCLUSIVE bit nevertheless stays set, and the
 *       cpuset can become a partition root again once the parent cpuset
 *       is able to give more CPUs back to this child cpuset.
 */
#define PRS_DISABLED                (0)
#define PRS_ENABLED                (1)
#define PRS_ERROR                (-1)

/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid memory allocation in inner functions.
 *
 * All three masks are allocated together by alloc_cpumasks(NULL, tmp) and
 * released together by free_cpumasks(NULL, tmp).
 */
struct tmpmasks {
        cpumask_var_t addmask, delmask;        /* For partition root */
        cpumask_var_t new_cpus;                /* For update_cpumasks_hier() */
};

/* Map a css back to the cpuset that embeds it; a NULL @css yields NULL. */
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct cpuset, css);
}

/* Look up the cpuset that @task currently belongs to. */
static inline struct cpuset *task_cs(struct task_struct *task)
{
        struct cgroup_subsys_state *css = task_css(task, cpuset_cgrp_id);

        return css_cs(css);
}

/* Return the parent cpuset of @cs, or NULL if @cs->css has no parent. */
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
        struct cgroup_subsys_state *parent_css = cs->css.parent;

        return css_cs(parent_css);
}

void inc_dl_tasks_cs(struct task_struct *p)
{
        struct cpuset *cs = task_cs(p);

        cs->nr_deadline_tasks++;
}

void dec_dl_tasks_cs(struct task_struct *p)
{
        struct cpuset *cs = task_cs(p);

        cs->nr_deadline_tasks--;
}

/*
 * Bit numbers used in the 'flags' field of struct cpuset.  They are bit
 * indices for the bitops helpers (test_bit() and friends), not masks.
 */
typedef enum {
        CS_ONLINE                = 0,
        CS_CPU_EXCLUSIVE        = 1,
        CS_MEM_EXCLUSIVE        = 2,
        CS_MEM_HARDWALL                = 3,
        CS_MEMORY_MIGRATE        = 4,
        CS_SCHED_LOAD_BALANCE        = 5,
        CS_SPREAD_PAGE                = 6,
        CS_SPREAD_SLAB                = 7,
} cpuset_flagbits_t;

/* convenient tests for these bits */
/* True when @cs has CS_ONLINE set and its css is not dying. */
static inline bool is_cpuset_online(struct cpuset *cs)
{
        if (!test_bit(CS_ONLINE, &cs->flags))
                return false;

        return !css_is_dying(&cs->css);
}

/* Nonzero if the CS_CPU_EXCLUSIVE bit is set in @cs->flags. */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
        const unsigned long *flagsp = &cs->flags;

        return test_bit(CS_CPU_EXCLUSIVE, flagsp);
}

/* Nonzero if the CS_MEM_EXCLUSIVE bit is set in @cs->flags. */
static inline int is_mem_exclusive(const struct cpuset *cs)
{
        const unsigned long *flagsp = &cs->flags;

        return test_bit(CS_MEM_EXCLUSIVE, flagsp);
}

/* Nonzero if the CS_MEM_HARDWALL bit is set in @cs->flags. */
static inline int is_mem_hardwall(const struct cpuset *cs)
{
        const unsigned long *flagsp = &cs->flags;

        return test_bit(CS_MEM_HARDWALL, flagsp);
}

/* Nonzero if the CS_SCHED_LOAD_BALANCE bit is set in @cs->flags. */
static inline int is_sched_load_balance(const struct cpuset *cs)
{
        const unsigned long *flagsp = &cs->flags;

        return test_bit(CS_SCHED_LOAD_BALANCE, flagsp);
}

/* Nonzero if the CS_MEMORY_MIGRATE bit is set in @cs->flags. */
static inline int is_memory_migrate(const struct cpuset *cs)
{
        const unsigned long *flagsp = &cs->flags;

        return test_bit(CS_MEMORY_MIGRATE, flagsp);
}

/* Nonzero if the CS_SPREAD_PAGE bit is set in @cs->flags. */
static inline int is_spread_page(const struct cpuset *cs)
{
        const unsigned long *flagsp = &cs->flags;

        return test_bit(CS_SPREAD_PAGE, flagsp);
}

/* Nonzero if the CS_SPREAD_SLAB bit is set in @cs->flags. */
static inline int is_spread_slab(const struct cpuset *cs)
{
        const unsigned long *flagsp = &cs->flags;

        return test_bit(CS_SPREAD_SLAB, flagsp);
}

/*
 * Nonzero if @cs is a valid partition root, i.e. its partition_root_state
 * is positive (neither disabled nor in the error state).
 */
static inline int is_partition_root(const struct cpuset *cs)
{
        const int prs = cs->partition_root_state;

        return prs > 0;
}

/*
 * The root cpuset.  It is statically allocated and starts out online,
 * CPU-exclusive, memory-exclusive and as an enabled partition root.
 */
static struct cpuset top_cpuset = {
        .partition_root_state = PRS_ENABLED,
        .flags = (1 << CS_MEM_EXCLUSIVE) |
                 (1 << CS_CPU_EXCLUSIVE) |
                 (1 << CS_ONLINE),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 *
 * The expansion is a css_for_each_child() loop header followed by an
 * unbraced "if (is_cpuset_online(...))", so the statement written after
 * the macro becomes the body of that "if" and is skipped for children
 * that are not online.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)                \
        css_for_each_child((pos_css), &(parent_cs)->css)                \
                if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset whose descendants are walked
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 *
 * The expansion is a css_for_each_descendant_pre() loop header followed by
 * an unbraced "if (is_cpuset_online(...))", so the statement written after
 * the macro becomes the body of that "if" and is skipped for cpusets that
 * are not online.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)        \
        css_for_each_descendant_pre((pos_css), &(root_cs)->css)                \
                if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mempolicy may be changed by
 * another task, so they are protected by the alloc_lock in that
 * task_struct.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

/* Mutex half of the cpuset locking scheme; see cpuset_lock()/cpuset_unlock(). */
static DEFINE_MUTEX(cpuset_mutex);

/* Acquire cpuset_mutex on behalf of code outside this file. */
void cpuset_lock(void)
{
        mutex_lock(&cpuset_mutex);
}

/* Release cpuset_mutex previously taken with cpuset_lock(). */
void cpuset_unlock(void)
{
        mutex_unlock(&cpuset_mutex);
}

/* Spinlock half of the cpuset locking scheme (taken briefly by callbacks). */
static DEFINE_SPINLOCK(callback_lock);

/*
 * NOTE(review): presumably the workqueue on which mm migration is run;
 * its users are not in this part of the file - confirm.
 */
static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

/*
 * NOTE(review): wait queue presumably used to wait for in-flight attach
 * operations (see attach_in_progress) to finish - confirm against users.
 */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * The "cpus" and "mems" control files follow cgroup v2 behavior either
 * when cpuset is on the default hierarchy or when the v1 cpuset cgroup
 * filesystem was mounted with the "cpuset_v2_mode" mount option.
 * In v2 behavior "cpus" and "mems" always reflect what the user asked for
 * and are left alone by hotplug events; only the effective cpus or mems
 * are affected.
 *
 * Return: true if v2 behavior applies.
 */
static inline bool is_in_v2_mode(void)
{
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                return true;

        return (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE) != 0;
}

/*
 * Store in @pmask the online part of @cs's effective_cpus.  When that is
 * empty, climb the cpuset hierarchy until an ancestor with at least one
 * online effective CPU is found and use that ancestor's instead.
 *
 * The result is always a non-empty subset of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
        struct cpuset *cur = cs;

        for (;;) {
                if (cpumask_intersects(cur->effective_cpus, cpu_online_mask)) {
                        cpumask_and(pmask, cur->effective_cpus,
                                    cpu_online_mask);
                        return;
                }

                cur = parent_cs(cur);
                if (unlikely(!cur)) {
                        /*
                         * Even the top cpuset has no online cpu.  That
                         * is the result of a race between
                         * cpuset_hotplug_work and the cpu hotplug
                         * notifier; the top cpuset's effective_cpus is
                         * about to become identical to cpu_online_mask,
                         * so hand that out.
                         */
                        cpumask_copy(pmask, cpu_online_mask);
                        return;
                }
        }
}

/*
 * Store in *@pmask the part of @cs's effective_mems that is online and has
 * memory.  When that is empty, climb the cpuset hierarchy until an
 * ancestor with such nodes is found.  No NULL check is made while
 * climbing: the top cpuset always has some mems online.
 *
 * The result is always a non-empty subset of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
        struct cpuset *cur;

        for (cur = cs;
             !nodes_intersects(cur->effective_mems, node_states[N_MEMORY]);
             cur = parent_cs(cur))
                ;

        nodes_and(*pmask, cur->effective_mems, node_states[N_MEMORY]);
}

/*
 * Make @tsk's page and slab spread flags mirror the CS_SPREAD_PAGE and
 * CS_SPREAD_SLAB flags of @cs: each task flag is set when the cpuset flag
 * is set and cleared otherwise.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
                                        struct task_struct *tsk)
{
        if (!is_spread_page(cs))
                task_clear_spread_page(tsk);
        else
                task_set_spread_page(tsk);

        if (!is_spread_slab(cs))
                task_clear_spread_slab(tsk);
        else
                task_set_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * p is a subset of q when p's allowed CPUs and Memory Nodes are subsets
 * of q's, and p has an exclusive flag set only where q has it set too.
 * Call holding cpuset_mutex.
 *
 * Return: nonzero if p is a subset of q, 0 otherwise.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
        if (!cpumask_subset(p->cpus_allowed, q->cpus_allowed))
                return 0;

        if (!nodes_subset(p->mems_allowed, q->mems_allowed))
                return 0;

        if (is_cpu_exclusive(p) > is_cpu_exclusive(q))
                return 0;

        return is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate three zeroed cpumasks
 * @cs:  the cpuset whose cpus_allowed, effective_cpus and subparts_cpus
 *       are to be allocated, or NULL
 * @tmp: the tmpmasks structure whose new_cpus, addmask and delmask are to
 *       be allocated; only used when @cs is NULL
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.  On failure
 * every mask that was already allocated by this call is freed again.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        cpumask_var_t *masks[3];
        const int nr_masks = sizeof(masks) / sizeof(masks[0]);
        int i;

        if (cs) {
                masks[0] = &cs->cpus_allowed;
                masks[1] = &cs->effective_cpus;
                masks[2] = &cs->subparts_cpus;
        } else {
                masks[0] = &tmp->new_cpus;
                masks[1] = &tmp->addmask;
                masks[2] = &tmp->delmask;
        }

        for (i = 0; i < nr_masks; i++) {
                if (!zalloc_cpumask_var(masks[i], GFP_KERNEL))
                        goto unwind;
        }

        return 0;

unwind:
        /* masks[i] failed; release the earlier ones in reverse order */
        while (i-- > 0)
                free_cpumask_var(*masks[i]);
        return -ENOMEM;
}

/**
 * free_cpumasks - free the cpumasks of a cpuset and/or a tmpmasks structure
 * @cs:  the cpuset whose cpumasks are to be freed, or NULL
 * @tmp: the tmpmasks structure whose cpumasks are to be freed, or NULL
 *
 * Either argument may be NULL, in which case it is skipped.
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        if (cs) {
                free_cpumask_var(cs->cpus_allowed);
                free_cpumask_var(cs->effective_cpus);
                free_cpumask_var(cs->subparts_cpus);
        }

        if (!tmp)
                return;

        free_cpumask_var(tmp->new_cpus);
        free_cpumask_var(tmp->addmask);
        free_cpumask_var(tmp->delmask);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 *
 * The trial cpuset is a byte copy of @cs that gets its own freshly
 * allocated cpumasks; cpus_allowed and effective_cpus are then copied
 * from @cs, while subparts_cpus is left zeroed.
 *
 * Return: the trial cpuset, or NULL if memory allocation failed.
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
        struct cpuset *copy = kmemdup(cs, sizeof(*cs), GFP_KERNEL);

        if (!copy)
                return NULL;

        if (alloc_cpumasks(copy, NULL))
                goto free_copy;

        cpumask_copy(copy->cpus_allowed, cs->cpus_allowed);
        cpumask_copy(copy->effective_cpus, cs->effective_cpus);
        return copy;

free_copy:
        kfree(copy);
        return NULL;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 *
 * Frees the three cpumasks owned by @cs via free_cpumasks() and then the
 * cpuset structure itself.
 */
static inline void free_cpuset(struct cpuset *cs)
{
        free_cpumasks(cs, NULL);
        kfree(cs);
}

/*
 * validate_change() - Check that a proposed cpuset change follows the
 *                       structural rules for cpusets.
 *
 * Would our subset and exclusive rules still hold if the flag and mask
 * values of the current cpuset (cur) were replaced by those in the trial
 * cpuset (trial)?  Presumes cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not:
 *   -EBUSY  - a child would no longer be a subset of trial, or a
 *             CPU-exclusive cur cannot shrink to trial's CPUs
 *   -EACCES - legacy hierarchy only: trial is not a subset of the parent
 *   -EINVAL - trial overlaps a sibling while either one is exclusive
 *   -ENOSPC - cur has (or is gaining) tasks and trial would empty its
 *             cpus_allowed or mems_allowed
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *cs, *parent;
        int ret;

        rcu_read_lock();

        /* Each of our child cpusets must be a subset of us */
        cpuset_for_each_child(cs, pos, cur) {
                if (!is_cpuset_subset(cs, trial)) {
                        ret = -EBUSY;
                        goto out;
                }
        }

        /* Remaining checks don't apply to root cpuset */
        if (cur == &top_cpuset) {
                ret = 0;
                goto out;
        }

        parent = parent_cs(cur);

        /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        if (!is_in_v2_mode() && !is_cpuset_subset(trial, parent)) {
                ret = -EACCES;
                goto out;
        }

        /*
         * If either I or some sibling (!= me) is exclusive, we can't
         * overlap
         */
        cpuset_for_each_child(cs, pos, parent) {
                if (cs == cur)
                        continue;

                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(cs)) &&
                    cpumask_intersects(trial->cpus_allowed,
                                       cs->cpus_allowed)) {
                        ret = -EINVAL;
                        goto out;
                }

                if ((is_mem_exclusive(trial) || is_mem_exclusive(cs)) &&
                    nodes_intersects(trial->mems_allowed,
                                     cs->mems_allowed)) {
                        ret = -EINVAL;
                        goto out;
                }
        }

        /*
         * Cpusets with tasks - existing or newly being attached - can't
         * be changed to have empty cpus_allowed or mems_allowed.
         */
        if (cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress) {
                if (!cpumask_empty(cur->cpus_allowed) &&
                    cpumask_empty(trial->cpus_allowed)) {
                        ret = -ENOSPC;
                        goto out;
                }

                if (!nodes_empty(cur->mems_allowed) &&
                    nodes_empty(trial->mems_allowed)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        /*
         * We can't shrink if we won't have enough room for SCHED_DEADLINE
         * tasks.
         */
        if (is_cpu_exclusive(cur) &&
            !cpuset_cpumask_can_shrink(cur->cpus_allowed,
                                       trial->cpus_allowed)) {
                ret = -EBUSY;
                goto out;
        }

        ret = 0;
out:
        rcu_read_unlock();
        return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 *
 * Return: nonzero if the effective_cpus masks of @a and @b intersect.
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
        return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

/*
 * Raise @dattr->relax_domain_level to that of cpuset @c if @c's level is
 * higher; otherwise leave @dattr untouched.
 */
static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
        if (c->relax_domain_level > dattr->relax_domain_level)
                dattr->relax_domain_level = c->relax_domain_level;
}

/*
 * Fold the relax_domain_level of every load-balanced online cpuset in the
 * subtree rooted at @root_cs (including @root_cs itself) into @dattr.
 * Subtrees below a cpuset with empty cpus_allowed are not visited.
 */
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
                                    struct cpuset *root_cs)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *cs;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cs, pos, root_cs) {
                /* a cpuset without CPUs: skip it and everything below it */
                if (cpumask_empty(cs->cpus_allowed)) {
                        pos = css_rightmost_descendant(pos);
                        continue;
                }

                if (!is_sched_load_balance(cs))
                        continue;

                update_domain_attr(dattr, cs);
        }
        rcu_read_unlock();
}

/*
 * Number of cpusets, including the top-level one.
 * Must be called with cpuset_mutex held.
 */
static inline int nr_cpusets(void)
{
        /* the jump label reference count covers every cpuset but the top one */
        int nr_below_top = static_key_count(&cpusets_enabled_key.key);

        return nr_below_top + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * @domains:    out: array of cpumasks, one per sched domain, or NULL if
 *              it could not be allocated
 * @attributes: out: per-domain attributes, or NULL if they could not be
 *              allocated
 *
 * Return: the number of sched domains described by *@domains; 1 when
 * *@domains is NULL (fallback to the default domain).
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *           top-down scan of all cpusets. For our purposes, rebuilding
 *           the schedulers sched domains, we can ignore !is_sched_load_
 *           balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *           that need to be load balanced, for convenient iterative
 *           access by the subsequent code that finds the best partition,
 *           i.e the set of domains (subsets) of CPUs such that the
 *           cpus_allowed of every cpuset marked is_sched_load_balance
 *           is a subset of one of these domains, while there are as
 *           many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *           the kernel/sched/core.c routine partition_sched_domains() in a
 *           convenient format, that can be easily compared to the prior
 *           value to determine what partition elements (sched domains)
 *           were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *        The triple nested loops below over i, j, k scan over the
 *        load balanced cpusets (using the array of cpuset pointers in
 *        csa[]) looking for pairs of cpusets that have overlapping
 *        cpus_allowed, but which don't have the same 'pn' partition
 *        number, and gives them the same partition number.  It keeps
 *        looping on the 'restart' label until it can no longer find
 *        any such pairs.
 *
 *        The union of the cpus_allowed masks from the set of
 *        all cpusets having the same 'pn' value then form the one
 *        element of the partition (one sched domain) to be passed to
 *        partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
                        struct sched_domain_attr **attributes)
{
        struct cpuset *cp;        /* top-down scan of cpusets */
        struct cpuset **csa;        /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
        int i, j, k;                /* indices for partition finding loops */
        cpumask_var_t *doms;        /* resulting partition; i.e. sched domains */
        struct sched_domain_attr *dattr;  /* attributes for custom domains */
        int ndoms = 0;                /* number of sched domains in result */
        int nslot;                /* next empty doms[] struct cpumask slot */
        struct cgroup_subsys_state *pos_css;
        bool root_load_balance = is_sched_load_balance(&top_cpuset);

        doms = NULL;
        dattr = NULL;
        csa = NULL;

        /* Special case for the 99% of systems with one, full, sched domain */
        if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
                ndoms = 1;
                doms = alloc_sched_domains(ndoms);
                if (!doms)
                        goto done;

                dattr = kmalloc(sizeof(*dattr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
                cpumask_and(doms[0], top_cpuset.effective_cpus,
                            housekeeping_cpumask(HK_FLAG_DOMAIN));

                goto done;
        }

        csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
        if (!csa)
                goto done;
        csn = 0;

        rcu_read_lock();
        if (root_load_balance)
                csa[csn++] = &top_cpuset;
        cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
                if (cp == &top_cpuset)
                        continue;
                /*
                 * Continue traversing beyond @cp iff @cp has some CPUs and
                 * isn't load balancing.  The former is obvious.  The
                 * latter: All child cpusets contain a subset of the
                 * parent's cpus, so just skip them, and then we call
                 * update_domain_attr_tree() to calc relax_domain_level of
                 * the corresponding sched domain.
                 *
                 * If root is load-balancing, we can skip @cp if it
                 * is a subset of the root's effective_cpus.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    !(is_sched_load_balance(cp) &&
                      cpumask_intersects(cp->cpus_allowed,
                                         housekeeping_cpumask(HK_FLAG_DOMAIN))))
                        continue;

                if (root_load_balance &&
                    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
                        continue;

                if (is_sched_load_balance(cp) &&
                    !cpumask_empty(cp->effective_cpus))
                        csa[csn++] = cp;

                /* skip @cp's subtree if not a partition root */
                if (!is_partition_root(cp))
                        pos_css = css_rightmost_descendant(pos_css);
        }
        rcu_read_unlock();

        /* Start with every collected cpuset in a partition of its own. */
        for (i = 0; i < csn; i++)
                csa[i]->pn = i;
        ndoms = csn;

restart:
        /* Find the best partition (set of sched domains) */
        for (i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                int apn = a->pn;

                for (j = 0; j < csn; j++) {
                        struct cpuset *b = csa[j];
                        int bpn = b->pn;

                        if (apn != bpn && cpusets_overlap(a, b)) {
                                /* merge b's whole partition into a's */
                                for (k = 0; k < csn; k++) {
                                        struct cpuset *c = csa[k];

                                        if (c->pn == bpn)
                                                c->pn = apn;
                                }
                                ndoms--;        /* one less element */
                                goto restart;
                        }
                }
        }

        /*
         * Now we know how many domains to create.
         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
         */
        doms = alloc_sched_domains(ndoms);
        if (!doms)
                goto done;

        /*
         * The rest of the code, including the scheduler, can deal with
         * dattr==NULL case. No need to abort if alloc fails.
         */
        dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
                              GFP_KERNEL);

        for (nslot = 0, i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                struct cpumask *dp;
                int apn = a->pn;

                if (apn < 0) {
                        /* Skip completed partitions */
                        continue;
                }

                if (nslot == ndoms) {
                        static int warnings = 10;
                        if (warnings) {
                                pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
                                        nslot, ndoms, csn, i, apn);
                                warnings--;
                        }
                        continue;
                }

                /*
                 * Only index doms[] once nslot is known to be in range;
                 * doms[ndoms] lies beyond the end of the array.
                 */
                dp = doms[nslot];

                cpumask_clear(dp);
                if (dattr)
                        dattr[nslot] = SD_ATTR_INIT;
                for (j = i; j < csn; j++) {
                        struct cpuset *b = csa[j];

                        if (apn == b->pn) {
                                cpumask_or(dp, dp, b->effective_cpus);
                                cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
                                if (dattr)
                                        update_domain_attr_tree(dattr + nslot, b);

                                /* Done with this partition */
                                b->pn = -1;
                        }
                }
                nslot++;
        }
        BUG_ON(nslot != ndoms);

done:
        kfree(csa);

        /*
         * Fallback to the default domain if kmalloc() failed.
         * See comments in partition_sched_domains().
         */
        if (doms == NULL)
                ndoms = 1;

        *domains    = doms;
        *attributes = dattr;
        return ndoms;
}

static void dl_update_tasks_root_domain(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        if (cs->nr_deadline_tasks == 0)
                return;

        css_task_iter_start(&cs->css, 0, &it);

        while ((task = css_task_iter_next(&it)))
                dl_add_task_root_domain(task);

        css_task_iter_end(&it);
}

/*
 * Recompute root domain SCHED_DEADLINE accounting: clear the default root
 * domain, then walk every online cpuset that has effective CPUs and hand
 * it to dl_update_tasks_root_domain().
 *
 * The caller must hold cpuset_mutex, the CPU hotplug lock and
 * sched_domains_mutex (all asserted below).
 */
static void dl_rebuild_rd_accounting(void)
{
        struct cpuset *cs = NULL;
        struct cgroup_subsys_state *pos_css;

        lockdep_assert_held(&cpuset_mutex);
        lockdep_assert_cpus_held();
        lockdep_assert_held(&sched_domains_mutex);

        rcu_read_lock();

        /*
         * Clear default root domain DL accounting, it will be computed again
         * if a task belongs to it.
         */
        dl_clear_root_domain(&def_root_domain);

        cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

                /* no effective CPUs: skip this cpuset and its whole subtree */
                if (cpumask_empty(cs->effective_cpus)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /* pin @cs so it stays valid while the RCU read lock is dropped */
                css_get(&cs->css);

                /*
                 * NOTE(review): the task walk runs outside the RCU read-side
                 * section; presumably it must not run under rcu_read_lock()
                 * - confirm.
                 */
                rcu_read_unlock();

                dl_update_tasks_root_domain(cs);

                rcu_read_lock();
                css_put(&cs->css);
        }
        rcu_read_unlock();
}

/*
 * Repartition the sched domains and then rebuild the root domain
 * SCHED_DEADLINE accounting, both under sched_domains_mutex.
 *
 * @ndoms_new: number of entries in @doms_new
 * @doms_new:  the new domain cpumasks
 * @dattr_new: attributes for the new domains
 *
 * The arguments are passed through unchanged to
 * partition_sched_domains_locked().
 */
static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                                    struct sched_domain_attr *dattr_new)
{
        mutex_lock(&sched_domains_mutex);
        partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
        dl_rebuild_rd_accounting();
        mutex_unlock(&sched_domains_mutex);
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
static void rebuild_sched_domains_locked(void)
{
        struct sched_domain_attr *attr;
        struct cgroup_subsys_state *pos_css;
        struct cpuset *cs;
        cpumask_var_t *doms;
        int ndoms;

        lockdep_assert_cpus_held();
        lockdep_assert_held(&cpuset_mutex);

        /*
         * Bail out early if we raced with CPU hotplug, so that doms
         * containing an offlined CPU are never handed to
         * partition_sched_domains().  cpuset_hotplug_workfn() will rebuild
         * the sched domains anyway.
         */
        if (!top_cpuset.nr_subparts_cpus) {
                /*
                 * Without CPUs in any subpartition, top_cpuset's effective
                 * CPUs should equal the active CPUs, so looking at
                 * top_cpuset alone detects a racing offline.
                 */
                if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
                        return;
        } else {
                bool raced = false;

                /*
                 * With subpartition CPUs, the effective CPUs of a partition
                 * root should only be a subset of the active CPUs.  A CPU
                 * in any partition root could have been offlined, so every
                 * partition root has to be checked.
                 */
                rcu_read_lock();
                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                        if (!is_partition_root(cs)) {
                                pos_css = css_rightmost_descendant(pos_css);
                                continue;
                        }
                        if (!cpumask_subset(cs->effective_cpus,
                                            cpu_active_mask)) {
                                raced = true;
                                break;
                        }
                }
                rcu_read_unlock();

                if (raced)
                        return;
        }

        /* Generate the domain masks and attributes ... */
        ndoms = generate_sched_domains(&doms, &attr);

        /* ... and have the scheduler rebuild the domains from them. */
        partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
/* !CONFIG_SMP build: rebuilding sched domains is a no-op. */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

/*
 * Rebuild the scheduler domains from a context that holds neither lock.
 *
 * Wraps rebuild_sched_domains_locked() with get_online_cpus() and
 * cpuset_mutex, which that function asserts are held.  The locks are
 * released in the reverse order of acquisition.
 */
void rebuild_sched_domains(void)
{
        get_online_cpus();
        mutex_lock(&cpuset_mutex);
        rebuild_sched_domains_locked();
        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;
        bool top_cs = cs == &top_cpuset;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                /*
                 * Percpu kthreads in top_cpuset are ignored
                 */
                if (top_cs && (task->flags & PF_KTHREAD) &&
                    kthread_is_per_cpu(task))
                        continue;
                set_cpus_allowed_ptr(task, cs->effective_cpus);
        }
        css_task_iter_end(&it);
}

/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: the temp variable for the new effective_cpus mask
 * @cs: the cpuset whose effective_cpus mask needs to be recomputed
 * @parent: the parent cpuset
 *
 * If the parent has subpartition CPUs, include them in the list of
 * allowable CPUs in computing the new effective_cpus mask. Since offlined
 * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
 * to mask those out.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
                                      struct cpuset *cs, struct cpuset *parent)
{
        /* Common case: parent has handed no CPUs to subpartitions. */
        if (!parent->nr_subparts_cpus) {
                cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
                return;
        }

        /*
         * The parent's subpartition CPUs are allowable as well.  Offlined
         * CPUs are not removed from subparts_cpus, so filter the result
         * through cpu_active_mask.
         */
        cpumask_or(new_cpus, parent->effective_cpus, parent->subparts_cpus);
        cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
        cpumask_and(new_cpus, new_cpus, cpu_active_mask);
}

/*
 * Commands for update_parent_subparts_cpumask
 */
enum subparts_cmd {
        /* Turn a non-partition root cpuset into a partition root */
        partcmd_enable,
        /* Turn a partition root back into a non-partition root */
        partcmd_disable,
        /* Update parent's subparts_cpus, optionally for a new cpu list */
        partcmd_update,
};

/**
 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
 * @cpuset:  The cpuset that requests change in partition root state
 * @cmd:     Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0, 1 or an error code
 *
 * For partcmd_enable, the cpuset is being transformed from a non-partition
 * root to a partition root. The cpus_allowed mask of the given cpuset will
 * be put into parent's subparts_cpus and taken away from parent's
 * effective_cpus. The function will return 0 if all the CPUs listed in
 * cpus_allowed can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root. Any CPUs in cpus_allowed that are in
 * parent's subparts_cpus will be taken away from that cpumask and put back
 * into parent's effective_cpus. 0 should always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu
 * list is to be changed from cpus_allowed to newmask. Otherwise,
 * cpus_allowed is assumed to remain the same. The cpuset should either
 * be a partition root or an invalid partition root. The partition root
 * state may change if newmask is NULL and none of the requested CPUs can
 * be granted by the parent. The function will return 1 if changes to
 * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
 * Error code should only be returned when newmask is non-NULL.
 *
 * The partcmd_enable and partcmd_disable commands are used by
 * update_prstate(). The partcmd_update command is used by
 * update_cpumasks_hier() with newmask NULL and update_cpumask() with
 * newmask set.
 *
 * The checking is more strict when enabling partition root than the
 * other two commands.
 *
 * Because of the implicit cpu exclusive nature of a partition root,
 * cpumask changes that violates the cpu exclusivity rule will not be
 * permitted when checked by validate_change(). The validate_change()
 * function will also prevent any changes to the cpu list if it is not
 * a superset of children's cpu lists.
 */
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
                                          struct cpumask *newmask,
                                          struct tmpmasks *tmp)
{
        struct cpuset *parent = parent_cs(cpuset);
        int adding;        /* Moving cpus from effective_cpus to subparts_cpus */
        int deleting;        /* Moving cpus from subparts_cpus to effective_cpus */
        int new_prs;
        bool part_error = false;        /* Partition error? */

        lockdep_assert_held(&cpuset_mutex);

        /*
         * The parent must be a partition root.
         * The new cpumask, if present, or the current cpus_allowed must
         * not be empty.
         */
        if (!is_partition_root(parent) ||
           (newmask && cpumask_empty(newmask)) ||
           (!newmask && cpumask_empty(cpuset->cpus_allowed)))
                return -EINVAL;

        /*
         * Enabling/disabling partition root is not allowed if there are
         * online children.
         */
        if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
                return -EBUSY;

        /*
         * Enabling partition root is not allowed if not all the CPUs
         * can be granted from parent's effective_cpus or at least one
         * CPU will be left after that.
         */
        if ((cmd == partcmd_enable) &&
           (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
             cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
                return -EINVAL;

        /*
         * Work out, per command, which CPUs move into (tmp->addmask) and
         * out of (tmp->delmask) the parent's subparts_cpus.  The return
         * values of cpumask_and()/cpumask_andnot() are used below as the
         * "there is something to add/delete" flags.
         *
         * A cpumask update cannot make parent's effective_cpus become empty.
         */
        adding = deleting = false;
        new_prs = cpuset->partition_root_state;
        if (cmd == partcmd_enable) {
                /* All of cpus_allowed was validated above; claim all of it. */
                cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
                adding = true;
        } else if (cmd == partcmd_disable) {
                /* Give back whatever this cpuset holds in subparts_cpus. */
                deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
                                       parent->subparts_cpus);
        } else if (newmask) {
                /*
                 * partcmd_update with newmask:
                 *
                 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
                 * addmask = newmask & parent->effective_cpus
                 *                     & ~parent->subparts_cpus
                 */
                cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
                deleting = cpumask_and(tmp->delmask, tmp->delmask,
                                       parent->subparts_cpus);

                cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
                adding = cpumask_andnot(tmp->addmask, tmp->addmask,
                                        parent->subparts_cpus);
                /*
                 * Return error if the new effective_cpus could become empty.
                 */
                if (adding &&
                    cpumask_equal(parent->effective_cpus, tmp->addmask)) {
                        /*
                         * addmask would take every effective CPU of the
                         * parent; that is only acceptable if the deletion
                         * gives some CPU back.
                         */
                        if (!deleting)
                                return -EINVAL;
                        /*
                         * As some of the CPUs in subparts_cpus might have
                         * been offlined, we need to compute the real delmask
                         * to confirm that.
                         */
                        /* addmask is used as scratch space for this test ... */
                        if (!cpumask_and(tmp->addmask, tmp->delmask,
                                         cpu_active_mask))
                                return -EINVAL;
                        /* ... and restored: it equalled effective_cpus. */
                        cpumask_copy(tmp->addmask, parent->effective_cpus);
                }
        } else {
                /*
                 * partcmd_update w/o newmask:
                 *
                 * addmask = cpus_allowed & parent->effective_cpus
                 *
                 * Note that parent's subparts_cpus may have been
                 * pre-shrunk in case there is a change in the cpu list.
                 * So no deletion is needed.
                 */
                adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
                                     parent->effective_cpus);
                /* Taking all of the parent's effective CPUs is an error. */
                part_error = cpumask_equal(tmp->addmask,
                                           parent->effective_cpus);
        }

        if (cmd == partcmd_update) {
                int prev_prs = cpuset->partition_root_state;

                /*
                 * Check for possible transition between PRS_ENABLED
                 * and PRS_ERROR.
                 */
                switch (cpuset->partition_root_state) {
                case PRS_ENABLED:
                        if (part_error)
                                new_prs = PRS_ERROR;
                        break;
                case PRS_ERROR:
                        if (!part_error)
                                new_prs = PRS_ENABLED;
                        break;
                }
                /*
                 * Set part_error if previously in invalid state.
                 */
                /*
                 * From here on part_error no longer means "this request is
                 * in error" but "the cpuset was already PRS_ERROR before".
                 */
                part_error = (prev_prs == PRS_ERROR);
        }

        /* Previous state was not PRS_ERROR, yet the new state is. */
        if (!part_error && (new_prs == PRS_ERROR))
                return 0;        /* Nothing need to be done */

        if (new_prs == PRS_ERROR) {
                /*
                 * Remove all its cpus from parent's subparts_cpus.
                 */
                adding = false;
                deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
                                       parent->subparts_cpus);
        }

        /* No CPUs to move and no state change: leave the parent alone. */
        if (!adding && !deleting && (new_prs == cpuset->partition_root_state))
                return 0;

        /*
         * Change the parent's subparts_cpus.
         * Newly added CPUs will be removed from effective_cpus and
         * newly deleted ones will be added back to effective_cpus.
         */
        spin_lock_irq(&callback_lock);
        if (adding) {
                cpumask_or(parent->subparts_cpus,
                           parent->subparts_cpus, tmp->addmask);
                cpumask_andnot(parent->effective_cpus,
                               parent->effective_cpus, tmp->addmask);
        }
        if (deleting) {
                cpumask_andnot(parent->subparts_cpus,
                               parent->subparts_cpus, tmp->delmask);
                /*
                 * Some of the CPUs in subparts_cpus might have been offlined.
                 */
                cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
                cpumask_or(parent->effective_cpus,
                           parent->effective_cpus, tmp->delmask);
        }

        /* Keep the cached count in sync with the updated mask. */
        parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);

        if (cpuset->partition_root_state != new_prs)
                cpuset->partition_root_state = new_prs;
        spin_unlock_irq(&callback_lock);

        /*
         * Only partcmd_update reports a change as 1; a successful
         * enable/disable returns 0.
         */
        return cmd == partcmd_update;
}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
        bool need_rebuild_sched_domains = false;
        int new_prs;

        /*
         * Pre-order walk of cs and its descendants under RCU.  RCU is
         * dropped for the per-cpuset update further down, with a css
         * reference held to keep the current position valid.
         */
        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
                struct cpuset *parent = parent_cs(cp);

                /* tmp->new_cpus holds the candidate effective mask for cp. */
                compute_effective_cpumask(tmp->new_cpus, cp, parent);

                /*
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some CPUs.
                 */
                if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
                        cpumask_copy(tmp->new_cpus, parent->effective_cpus);
                        /* Account cp once as a user of the parent's mask. */
                        if (!cp->use_parent_ecpus) {
                                cp->use_parent_ecpus = true;
                                parent->child_ecpus_count++;
                        }
                } else if (cp->use_parent_ecpus) {
                        /* cp has its own CPUs again: undo the accounting. */
                        cp->use_parent_ecpus = false;
                        WARN_ON_ONCE(!parent->child_ecpus_count);
                        parent->child_ecpus_count--;
                }

                /*
                 * Skip the whole subtree if the cpumask remains the same
                 * and has no partition root state.
                 */
                if (!cp->partition_root_state &&
                    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /*
                 * update_parent_subparts_cpumask() should have been called
                 * for cs already in update_cpumask(). We should also call
                 * update_tasks_cpumask() again for tasks in the parent
                 * cpuset if the parent's subparts_cpus changes.
                 */
                new_prs = cp->partition_root_state;
                if ((cp != cs) && new_prs) {
                        switch (parent->partition_root_state) {
                        case PRS_DISABLED:
                                /*
                                 * If parent is not a partition root or an
                                 * invalid partition root, clear its state
                                 * and its CS_CPU_EXCLUSIVE flag.
                                 */
                                WARN_ON_ONCE(cp->partition_root_state
                                             != PRS_ERROR);
                                new_prs = PRS_DISABLED;

                                /*
                                 * clear_bit() is an atomic operation and
                                 * readers aren't interested in the state
                                 * of CS_CPU_EXCLUSIVE anyway. So we can
                                 * just update the flag without holding
                                 * the callback_lock.
                                 */
                                clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
                                break;

                        case PRS_ENABLED:
                                /*
                                 * A non-zero return means the parent's
                                 * subparts_cpus/effective_cpus changed, so
                                 * the parent's tasks need the new mask.
                                 */
                                if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
                                        update_tasks_cpumask(parent);
                                break;

                        case PRS_ERROR:
                                /*
                                 * When parent is invalid, it has to be too.
                                 */
                                new_prs = PRS_ERROR;
                                break;
                        }
                }

                /* Pin cp so that RCU can be dropped; skip it if it is gone. */
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();

                spin_lock_irq(&callback_lock);

                cpumask_copy(cp->effective_cpus, tmp->new_cpus);
                if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
                        /* No longer an enabled partition root: drop its subparts. */
                        cp->nr_subparts_cpus = 0;
                        cpumask_clear(cp->subparts_cpus);
                } else if (cp->nr_subparts_cpus) {
                        /*
                         * Make sure that effective_cpus & subparts_cpus
                         * are mutually exclusive.
                         *
                         * In the unlikely event that effective_cpus
                         * becomes empty. we clear cp->nr_subparts_cpus and
                         * let its child partition roots to compete for
                         * CPUs again.
                         */
                        cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
                                       cp->subparts_cpus);
                        if (cpumask_empty(cp->effective_cpus)) {
                                cpumask_copy(cp->effective_cpus, tmp->new_cpus);
                                cpumask_clear(cp->subparts_cpus);
                                cp->nr_subparts_cpus = 0;
                        } else if (!cpumask_subset(cp->subparts_cpus,
                                                   tmp->new_cpus)) {
                                /*
                                 * NOTE(review): this keeps only the
                                 * subparts_cpus bits that are NOT in
                                 * new_cpus.  If the intent is to trim
                                 * subparts_cpus down to new_cpus, a
                                 * cpumask_and() would be expected here
                                 * - confirm before relying on it.
                                 */
                                cpumask_andnot(cp->subparts_cpus,
                                        cp->subparts_cpus, tmp->new_cpus);
                                cp->nr_subparts_cpus
                                        = cpumask_weight(cp->subparts_cpus);
                        }
                }

                if (new_prs != cp->partition_root_state)
                        cp->partition_root_state = new_prs;

                spin_unlock_irq(&callback_lock);

                /* Outside v2 mode the two masks are expected to be equal. */
                WARN_ON(!is_in_v2_mode() &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

                update_tasks_cpumask(cp);

                /*
                 * On legacy hierarchy, if the effective cpumask of any non-
                 * empty cpuset is changed, we need to rebuild sched domains.
                 * On default hierarchy, the cpuset needs to be a partition
                 * root as well.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    is_sched_load_balance(cp) &&
                   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
                    is_partition_root(cp)))
                        need_rebuild_sched_domains = true;

                /* Re-enter RCU before dropping the pin and moving on. */
                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();

        /* Rebuild once, after the whole subtree has been updated. */
        if (need_rebuild_sched_domains)
                rebuild_sched_domains_locked();
}

/**
 * update_sibling_cpumasks - Update siblings cpumasks
 * @parent:  Parent cpuset
 * @cs:      Current cpuset
 * @tmp:     Temp variables
 */
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
                                    struct tmpmasks *tmp)
{
        struct cgroup_subsys_state *pos_css;
        struct cpuset *sib;

        lockdep_assert_held(&cpuset_mutex);

        /*
         * Every sibling of @cs that borrows the parent's effective_cpus
         * (use_parent_ecpus set) is run through update_cpumasks_hier() so
         * that it picks up the right effective_cpus value.
         *
         * update_cpumasks_hier() may sleep, so each sibling is pinned with
         * a css reference and the RCU read lock is released around the call.
         */
        rcu_read_lock();
        cpuset_for_each_child(sib, pos_css, parent) {
                if (sib == cs || !sib->use_parent_ecpus ||
                    !css_tryget_online(&sib->css))
                        continue;

                rcu_read_unlock();
                update_cpumasks_hier(sib, tmp);
                rcu_read_lock();

                css_put(&sib->css);
        }
        rcu_read_unlock();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                          const char *buf)
{
        struct tmpmasks tmp;
        int retval;

        /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
        if (cs == &top_cpuset)
                return -EACCES;

        /*
         * cpulist_parse() fails on an empty mask, so an empty string is
         * special-cased as "clear the mask".  An empty cpus_allowed is ok
         * only if the cpuset has no tasks; validate_change() ensures that
         * cpusets with tasks have cpus.
         */
        if (*buf) {
                retval = cpulist_parse(buf, trialcs->cpus_allowed);
                if (retval < 0)
                        return retval;

                if (!cpumask_subset(trialcs->cpus_allowed,
                                    top_cpuset.cpus_allowed))
                        return -EINVAL;
        } else {
                cpumask_clear(trialcs->cpus_allowed);
        }

        /* An unchanged cpu list needs no work at all. */
        if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
                return 0;

        retval = validate_change(cs, trialcs);
        if (retval < 0)
                return retval;

#ifdef CONFIG_CPUMASK_OFFSTACK
        /*
         * When cpumasks are pointers to allocated storage, borrow the
         * ones in trialcs as the temporary masks.
         */
        tmp.addmask  = trialcs->subparts_cpus;
        tmp.delmask  = trialcs->effective_cpus;
        tmp.new_cpus = trialcs->cpus_allowed;
#endif

        if (cs->partition_root_state) {
                /* Cpumask of a partition root cannot be empty */
                if (cpumask_empty(trialcs->cpus_allowed))
                        return -EINVAL;

                /* Any failure to update the parent is reported as -EINVAL. */
                if (update_parent_subparts_cpumask(cs, partcmd_update,
                                                   trialcs->cpus_allowed,
                                                   &tmp) < 0)
                        return -EINVAL;
        }

        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);

        /* Keep subparts_cpus a subset of the new cpus_allowed. */
        if (cs->nr_subparts_cpus) {
                cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
                            cs->cpus_allowed);
                cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
        }
        spin_unlock_irq(&callback_lock);

        update_cpumasks_hier(cs, &tmp);

        /*
         * For a partition root, siblings that use the parent's
         * effective_cpus must have their cpumasks updated as well.
         */
        if (cs->partition_root_state) {
                struct cpuset *parent = parent_cs(cs);

                if (parent->child_ecpus_count)
                        update_sibling_cpumasks(parent, cs, &tmp);
        }

        return 0;
}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */

/* One queued request to migrate the pages of @mm from @from to @to. */
struct cpuset_migrate_mm_work {
        /* work item queued on cpuset_migrate_mm_wq */
        struct work_struct        work;
        /* mm to migrate; the work function drops it with mmput() */
        struct mm_struct        *mm;
        /* source nodes, copied by value when the request is queued */
        nodemask_t                from;
        /* destination nodes, copied by value when the request is queued */
        nodemask_t                to;
};

static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
        struct cpuset_migrate_mm_work *req;

        req = container_of(work, struct cpuset_migrate_mm_work, work);

        /* on a wq worker, no need to worry about %current's mems_allowed */
        do_migrate_pages(req->mm, &req->from, &req->to, MPOL_MF_MOVE_ALL);

        /* Drop the mm reference carried by the request, then the request. */
        mmput(req->mm);
        kfree(req);
}

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                                                        const nodemask_t *to)
{
        struct cpuset_migrate_mm_work *req;

        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (!req) {
                /*
                 * No memory for the request: skip the migration but still
                 * drop the mm reference the caller handed over.
                 */
                mmput(mm);
                return;
        }

        /* The work function does the mmput() once the migration has run. */
        req->mm = mm;
        req->from = *from;
        req->to = *to;
        INIT_WORK(&req->work, cpuset_migrate_mm_workfn);
        queue_work(cpuset_migrate_mm_wq, &req->work);
}

/*
 * Flush cpuset_migrate_mm_wq, i.e. wait for the mm migrations queued by
 * cpuset_migrate_mm() to complete.
 */
static void cpuset_post_attach(void)
{
        flush_workqueue(cpuset_migrate_mm_wq);
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind an eventual tasks' mempolicy. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
                                        nodemask_t *newmems)
{
        task_lock(tsk);

        /* The seqcount write section runs with local interrupts disabled. */
        local_irq_disable();
        write_seqcount_begin(&tsk->mems_allowed_seq);

        /*
         * First widen mems_allowed to the union of the old and new masks,
         * rebind the task's mempolicy to the new nodes, and only then
         * narrow mems_allowed down to exactly *newmems.
         */
        nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
        mpol_rebind_task(tsk, newmems);
        tsk->mems_allowed = *newmems;

        write_seqcount_end(&tsk->mems_allowed_seq);
        local_irq_enable();

        task_unlock(tsk);
}

/*
 * The cpuset whose tasks are currently being rebound by
 * update_tasks_nodemask(), or NULL when no rebind is in progress.
 * Compared against the current task's cpuset in
 * current_cpuset_is_being_rebound().
 */
static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
        static nodemask_t newmems;        /* protected by cpuset_mutex */
        struct css_task_iter it;
        struct task_struct *task;

        cpuset_being_rebound = cs;                /* causes mpol_dup() rebind */

        guarantee_online_mems(cs, &newmems);

        /*
         * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
         * take while holding tasklist_lock.  Forks can happen - the
         * mpol_dup() cpuset_being_rebound check will catch such forks,
         * and rebind their vma mempolicies too.  Because we still hold
         * the global cpuset_mutex, we know that no other rebind effort
         * will be contending for the global variable cpuset_being_rebound.
         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
         * is idempotent.  Also migrate pages in each mm to new nodes.
         */
        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                struct mm_struct *mm;
                bool migrate;

                cpuset_change_task_nodemask(task, &newmems);

                mm = get_task_mm(task);
                if (!mm)
                        continue;

                migrate = is_memory_migrate(cs);

                mpol_rebind_mm(mm, &cs->mems_allowed);
                if (migrate)
                        cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
                else
                        mmput(mm);
        }
        css_task_iter_end(&it);

        /*
         * All the tasks' nodemasks have been updated, update
         * cs->old_mems_allowed.
         */
        cs->old_mems_allowed = newmems;

        /* We're done rebinding vmas to this cpuset's new mems_allowed. */
        cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When configured nodemask is changed, the effective nodemasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
        struct cgroup_subsys_state *pos_css;
        struct cpuset *cp;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
                struct cpuset *up = parent_cs(cp);

                /*
                 * Candidate mask: own mems_allowed limited to what the
                 * parent effectively has.  If that comes out empty in v2
                 * mode, inherit the parent's effective mask, which is
                 * guaranteed to have some MEMs.
                 */
                nodes_and(*new_mems, cp->mems_allowed, up->effective_mems);
                if (is_in_v2_mode() && nodes_empty(*new_mems))
                        *new_mems = up->effective_mems;

                /* Nothing changes here: skip the whole subtree. */
                if (nodes_equal(*new_mems, cp->effective_mems)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /* Pin cp so that the RCU read lock can be dropped. */
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();

                spin_lock_irq(&callback_lock);
                cp->effective_mems = *new_mems;
                spin_unlock_irq(&callback_lock);

                WARN_ON(!is_in_v2_mode() &&
                        !nodes_equal(cp->mems_allowed, cp->effective_mems));

                update_tasks_nodemask(cp);

                /* Back under RCU before unpinning and walking on. */
                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();
}

/*
 * update_nodemask - handle a user request to change a cpuset's 'mems'
 * cs:      the cpuset whose mems_allowed is being changed
 * trialcs: scratch copy of the cpuset used to stage and validate the mask
 * buf:     node list text from the user; an empty string asks for an
 *          empty mems_allowed
 *
 * Validates the request, installs the new mems_allowed in the cpuset and
 * then lets update_nodemasks_hier() propagate it, which is where each
 * task's mems_allowed and mempolicies are updated and, if the cpuset is
 * marked 'memory_migrate', the tasks' pages are migrated.
 *
 * Returns 0 on success or when nothing changes, -EACCES for the top
 * cpuset, -EINVAL when the request is not a subset of
 * top_cpuset.mems_allowed, or the negative value returned by
 * nodelist_parse() or validate_change().
 *
 * Call with cpuset_mutex held. May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_lock, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                           const char *buf)
{
        int retval;

        /* top_cpuset.mems_allowed tracks node_states[N_MEMORY]: read-only */
        if (cs == &top_cpuset)
                return -EACCES;

        /*
         * nodelist_parse() fails on an empty mask, so an empty string is
         * special-cased.  An empty mems_allowed is fine iff the cpuset has
         * no tasks; validate_change() ensures that cpusets with tasks
         * have memory.
         */
        if (*buf) {
                retval = nodelist_parse(buf, trialcs->mems_allowed);
                if (retval < 0)
                        return retval;

                if (!nodes_subset(trialcs->mems_allowed,
                                  top_cpuset.mems_allowed))
                        return -EINVAL;
        } else {
                nodes_clear(trialcs->mems_allowed);
        }

        /* Too easy - nothing to do */
        if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
                return 0;

        retval = validate_change(cs, trialcs);
        if (retval < 0)
                return retval;

        spin_lock_irq(&callback_lock);
        cs->mems_allowed = trialcs->mems_allowed;
        spin_unlock_irq(&callback_lock);

        /* trialcs->mems_allowed doubles as scratch space from here on */
        update_nodemasks_hier(cs, &trialcs->mems_allowed);

        return retval;
}

/*
 * Report whether the calling task's cpuset is the one currently recorded
 * in cpuset_being_rebound.  The task's cpuset is looked up under
 * rcu_read_lock().
 */
bool current_cpuset_is_being_rebound(void)
{
        bool rebinding;

        rcu_read_lock();
        rebinding = (cpuset_being_rebound == task_cs(current));
        rcu_read_unlock();

        return rebinding;
}

/*
 * update_relax_domain_level - set a cpuset's sched relax domain level
 * cs:  the cpuset to update
 * val: requested level
 *
 * On CONFIG_SMP builds, values outside [-1, sched_domain_level_max) are
 * rejected with -EINVAL.  When the level actually changes and the cpuset
 * has CPUs and load balancing enabled, the sched domains are rebuilt.
 * Returns 0 otherwise.
 */
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
        if (val < -1 || val >= sched_domain_level_max)
                return -EINVAL;
#endif

        /* Unchanged level: nothing to store, nothing to rebuild. */
        if (val == cs->relax_domain_level)
                return 0;

        cs->relax_domain_level = val;
        if (is_sched_load_balance(cs) && !cpumask_empty(cs->cpus_allowed))
                rebuild_sched_domains_locked();

        return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                cpuset_update_task_spread_flag(cs, task);
        css_task_iter_end(&it);
}

/*
 * update_flag - set or clear one flag bit of a cpuset
 * bit:                the bit to update (see cpuset_flagbits_t)
 * cs:                the cpuset to update
 * turning_on:         nonzero to set the flag, zero to clear it
 *
 * The change is staged in a trial cpuset and checked with
 * validate_change() before it is committed to @cs under callback_lock.
 * Sched domains are rebuilt if the load-balance flag changed and the
 * cpuset has CPUs; tasks' spread flags are refreshed if either spread
 * flag changed.
 *
 * Returns 0 on success, -ENOMEM if the trial cpuset cannot be allocated,
 * or the negative value returned by validate_change().
 *
 * Call with cpuset_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
                       int turning_on)
{
        struct cpuset *trial;
        int sched_lb_changed;
        int spread_changed;
        int err;

        trial = alloc_trial_cpuset(cs);
        if (!trial)
                return -ENOMEM;

        if (turning_on)
                set_bit(bit, &trial->flags);
        else
                clear_bit(bit, &trial->flags);

        err = validate_change(cs, trial);
        if (err < 0)
                goto out_free;

        /* Work out what follow-up is needed before the flags are committed. */
        sched_lb_changed = is_sched_load_balance(cs) !=
                           is_sched_load_balance(trial);
        spread_changed = is_spread_slab(cs) != is_spread_slab(trial) ||
                         is_spread_page(cs) != is_spread_page(trial);

        spin_lock_irq(&callback_lock);
        cs->flags = trial->flags;
        spin_unlock_irq(&callback_lock);

        if (sched_lb_changed && !cpumask_empty(trial->cpus_allowed))
                rebuild_sched_domains_locked();

        if (spread_changed)
                update_tasks_flags(cs);

out_free:
        free_cpuset(trial);
        return err;
}

/*
 * update_prstate - update partition_root_state
 * cs: the cpuset to update
 * new_prs: new partition root state
 *
 * Returns 0 when the state is unchanged or was updated, -EINVAL when the
 * transition is refused (leaving PRS_ERROR for a non-zero state, enabling
 * with an empty cpus_allowed), -ENOMEM when the temporary cpumasks cannot
 * be allocated, or the error returned by update_flag() or
 * update_parent_subparts_cpumask().
 *
 * cs->partition_root_state is only written, under callback_lock, when the
 * function is about to return 0.
 *
 * Call with cpuset_mutex held.
 */
static int update_prstate(struct cpuset *cs, int new_prs)
{
        int err, old_prs = cs->partition_root_state;
        struct cpuset *parent = parent_cs(cs);
        struct tmpmasks tmpmask;

        if (old_prs == new_prs)
                return 0;

        /*
         * Cannot force a partial or invalid partition root to a full
         * partition root.
         */
        if (new_prs && (old_prs == PRS_ERROR))
                return -EINVAL;

        if (alloc_cpumasks(NULL, &tmpmask))
                return -ENOMEM;

        /* Default result for the early "goto out" refusals below. */
        err = -EINVAL;
        if (!old_prs) {
                /*
                 * Turning on partition root requires setting the
                 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
                 * cannot be NULL.
                 */
                if (cpumask_empty(cs->cpus_allowed))
                        goto out;

                err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
                if (err)
                        goto out;

                err = update_parent_subparts_cpumask(cs, partcmd_enable,
                                                     NULL, &tmpmask);
                if (err) {
                        /* Roll back the exclusive bit set just above. */
                        update_flag(CS_CPU_EXCLUSIVE, cs, 0);
                        goto out;
                }
        } else {
                /*
                 * Turning off partition root will clear the
                 * CS_CPU_EXCLUSIVE bit.
                 */
                if (old_prs == PRS_ERROR) {
                        /*
                         * An invalid partition root only needs its
                         * exclusive bit dropped; the parent/sibling
                         * updates and domain rebuild below are skipped.
                         */
                        update_flag(CS_CPU_EXCLUSIVE, cs, 0);
                        err = 0;
                        goto out;
                }

                err = update_parent_subparts_cpumask(cs, partcmd_disable,
                                                     NULL, &tmpmask);
                if (err)
                        goto out;

                /* Turning off CS_CPU_EXCLUSIVE will not return error */
                update_flag(CS_CPU_EXCLUSIVE, cs, 0);
        }

        /* The parent's tasks get their cpumasks refreshed after the change. */
        update_tasks_cpumask(parent);

        if (parent->child_ecpus_count)
                update_sibling_cpumasks(parent, cs, &tmpmask);

        rebuild_sched_domains_locked();
out:
        /* Commit the new state only if every step above succeeded. */
        if (!err) {
                spin_lock_irq(&callback_lock);
                cs->partition_root_state = new_prs;
                spin_unlock_irq(&callback_lock);
        }

        free_cpumasks(NULL, &tmpmask);
        return err;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933                /* per-tick decay, scaled by FM_SCALE: half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* cap on decay steps; useless computing more ticks than this */
#define FM_MAXCNT 1000000        /* upper bound on pending cnt, to avoid overflow */
#define FM_SCALE 1000                /* faux fixed point scale (3 decimal digits) */

/* Put a frequency meter into its idle state: no events, zero rate. */
static void fmeter_init(struct fmeter *fmp)
{
        spin_lock_init(&fmp->lock);
        fmp->time = 0;
        fmp->val = 0;
        fmp->cnt = 0;
}

/*
 * Internal meter update: age the filtered value by the number of whole
 * seconds since the last update, then fold in the events counted so far.
 * Does nothing if no second boundary has passed.  Callers in this file
 * hold fmp->lock.
 */
static void fmeter_update(struct fmeter *fmp)
{
        time64_t now = ktime_get_seconds();
        u32 elapsed = now - fmp->time;
        u32 step;

        if (!elapsed)
                return;

        /* One decay step per elapsed second, capped at FM_MAXTICKS. */
        elapsed = min(FM_MAXTICKS, elapsed);
        for (step = 0; step < elapsed; step++)
                fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
        fmp->time = now;

        /* Blend the pending event count into the value and restart it. */
        fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
        fmp->cnt = 0;
}

/*
 * Record one event: bring the meter up to date, then add FM_SCALE to the
 * pending count, saturating at FM_MAXCNT.
 */
static void fmeter_markevent(struct fmeter *fmp)
{
        int pending;

        spin_lock(&fmp->lock);
        fmeter_update(fmp);

        pending = fmp->cnt + FM_SCALE;
        if (pending > FM_MAXCNT)
                pending = FM_MAXCNT;
        fmp->cnt = pending;

        spin_unlock(&fmp->lock);
}

/* Bring the meter up to date and return its current filtered value. */
static int fmeter_getrate(struct fmeter *fmp)
{
        int rate;

        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        rate = fmp->val;
        spin_unlock(&fmp->lock);

        return rate;
}

/*
 * Cpuset of the first task in the taskset, recorded by cpuset_can_attach()
 * and read back by cpuset_attach() as the cpuset being migrated from.
 */
static struct cpuset *cpuset_attach_old_cs;

/* Clear the deadline-task migration bookkeeping kept in a cpuset. */
static void reset_migrate_dl_data(struct cpuset *cs)
{
        cs->sum_migrate_dl_bw = 0;
        cs->nr_migrate_dl_tasks = 0;
}

/*
 * cpuset_can_attach - check whether a set of tasks may join a cpuset
 * tset: the tasks being migrated; its first entry identifies both the
 *       source cpuset (saved in cpuset_attach_old_cs) and the destination
 *
 * Called by cgroups to determine if a cpuset is usable.  Takes and
 * releases cpuset_mutex itself.
 *
 * Every task must pass task_can_attach() and
 * security_task_setscheduler().  Deadline tasks are counted in the
 * destination's nr_migrate_dl_tasks / sum_migrate_dl_bw and, when the old
 * and new effective CPUs do not intersect, their bandwidth is reserved
 * with dl_bw_alloc().
 *
 * Returns 0 and bumps cs->attach_in_progress on success.  On failure
 * returns -ENOSPC (empty destination outside v2 mode), -EINVAL (no active
 * CPU in the destination) or the error from one of the checks above; in
 * that case any deadline accounting gathered here is reset before
 * returning, because the failure may occur after some deadline tasks
 * were already counted and stale counts must not leak into a later
 * attach.
 */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *css;
        struct cpuset *cs, *oldcs;
        struct task_struct *task;
        int ret;

        /* used later by cpuset_attach() */
        cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
        oldcs = cpuset_attach_old_cs;
        cs = css_cs(css);

        mutex_lock(&cpuset_mutex);

        /* allow moving tasks into an empty cpuset if on default hierarchy */
        ret = -ENOSPC;
        if (!is_in_v2_mode() &&
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;

        cgroup_taskset_for_each(task, css, tset) {
                ret = task_can_attach(task);
                if (ret)
                        goto out_reset_dl;
                ret = security_task_setscheduler(task);
                if (ret)
                        goto out_reset_dl;

                if (dl_task(task)) {
                        cs->nr_migrate_dl_tasks++;
                        cs->sum_migrate_dl_bw += task->dl.dl_bw;
                }
        }

        if (!cs->nr_migrate_dl_tasks)
                goto out_success;

        if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
                int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);

                if (unlikely(cpu >= nr_cpu_ids)) {
                        ret = -EINVAL;
                        goto out_reset_dl;
                }

                ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
                if (ret)
                        goto out_reset_dl;
        }

out_success:
        /*
         * Mark attach is in progress.  This makes validate_change() fail
         * changes which zero cpus/mems_allowed.
         */
        cs->attach_in_progress++;
        ret = 0;
        goto out_unlock;

out_reset_dl:
        /* Failure after the task walk started: drop partial DL accounting. */
        reset_migrate_dl_data(cs);
out_unlock:
        mutex_unlock(&cpuset_mutex);
        return ret;
}

/*
 * cpuset_cancel_attach - undo the bookkeeping done by cpuset_can_attach()
 * tset: the tasks whose migration is being cancelled
 *
 * Drops the destination's attach_in_progress count, waking waiters on
 * cpuset_attach_wq when it reaches zero, and releases any deadline
 * bandwidth accounted for the migration.  Takes cpuset_mutex itself.
 *
 * NOTE(review): cpuset_can_attach() calls dl_bw_alloc() only when the old
 * and new effective CPUs do not intersect, whereas dl_bw_free() is called
 * here whenever deadline tasks were counted - confirm this asymmetry is
 * intended.
 */
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *dst_css;
        struct cpuset *cs;

        cgroup_taskset_first(tset, &dst_css);
        cs = css_cs(dst_css);

        mutex_lock(&cpuset_mutex);

        if (--cs->attach_in_progress == 0)
                wake_up(&cpuset_attach_wq);

        if (cs->nr_migrate_dl_tasks) {
                dl_bw_free(cpumask_any(cs->effective_cpus),
                           cs->sum_migrate_dl_bw);
                reset_migrate_dl_data(cs);
        }

        mutex_unlock(&cpuset_mutex);
}

/*
 * Scratch cpumask that cpuset_attach() fills in and then applies to each
 * task being attached.  Protected by cpuset_mutex.  It is used only by
 * cpuset_attach(), but we can't allocate it dynamically there, so it is
 * defined globally and allocated from cpuset_init().
 */
static cpumask_var_t cpus_attach;

/*
 * cpuset_attach - move the tasks in a taskset into their new cpuset
 * tset: the tasks being attached; its first entry identifies the
 *       destination cpuset
 *
 * Applies the destination's CPUs, memory nodes and spread flags to every
 * task, rebinds (and optionally migrates) the memory of each thread-group
 * leader's mm, transfers the deadline-task accounting from the old cpuset
 * (cpuset_attach_old_cs, saved by cpuset_can_attach()) and finally drops
 * the attach_in_progress count, waking waiters when it reaches zero.
 *
 * Takes cpuset_mutex itself; expects the CPU hotplug lock to be held by
 * the caller (see the lockdep assertion below).
 */
static void cpuset_attach(struct cgroup_taskset *tset)
{
        /* static buf protected by cpuset_mutex */
        static nodemask_t cpuset_attach_nodemask_to;
        struct task_struct *task;
        struct task_struct *leader;
        struct cgroup_subsys_state *css;
        struct cpuset *cs;
        struct cpuset *oldcs = cpuset_attach_old_cs;

        cgroup_taskset_first(tset, &css);
        cs = css_cs(css);

        lockdep_assert_cpus_held();        /* see cgroup_attach_lock() */
        mutex_lock(&cpuset_mutex);

        /* prepare for attach */
        if (cs == &top_cpuset)
                cpumask_copy(cpus_attach, cpu_possible_mask);
        else
                guarantee_online_cpus(cs, cpus_attach);

        guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

        cgroup_taskset_for_each(task, css, tset) {
                /*
                 * can_attach beforehand should guarantee that this doesn't
                 * fail.  TODO: have a better way to handle failure here
                 */
                WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

                cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
                cpuset_update_task_spread_flag(cs, task);
        }

        /*
         * Change mm for all threadgroup leaders. This is expensive and may
         * sleep and should be moved outside migration path proper.
         */
        /* From here on the scratch nodemask holds cs->effective_mems. */
        cpuset_attach_nodemask_to = cs->effective_mems;
        cgroup_taskset_for_each_leader(leader, css, tset) {
                struct mm_struct *mm = get_task_mm(leader);

                if (mm) {
                        mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

                        /*
                         * old_mems_allowed is the same with mems_allowed
                         * here, except if this task is being moved
                         * automatically due to hotplug.  In that case
                         * @mems_allowed has been updated and is empty, so
                         * @old_mems_allowed is the right nodesets that we
                         * migrate mm from.
                         */
                        /*
                         * Only the non-migrating branch drops the mm
                         * reference here; presumably cpuset_migrate_mm()
                         * takes over the reference - verify against its
                         * definition.
                         */
                        if (is_memory_migrate(cs))
                                cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                                  &cpuset_attach_nodemask_to);
                        else
                                mmput(mm);
                }
        }

        cs->old_mems_allowed = cpuset_attach_nodemask_to;

        /* Move the deadline tasks counted by cpuset_can_attach() across. */
        if (cs->nr_migrate_dl_tasks) {
                cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
                oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
                reset_migrate_dl_data(cs);
        }

        cs->attach_in_progress--;
        if (!cs->attach_in_progress)
                wake_up(&cpuset_attach_wq);

        mutex_unlock(&cpuset_mutex);
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
        FILE_EFFECTIVE_CPULIST,
        FILE_EFFECTIVE_MEMLIST,
        FILE_SUBPARTS_CPULIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
        FILE_SCHED_LOAD_BALANCE,
        FILE_PARTITION_ROOT,
        FILE_SCHED_RELAX_DOMAIN_LEVEL,
        FILE_MEMORY_PRESSURE_ENABLED,
        FILE_MEMORY_PRESSURE,
        FILE_SPREAD_PAGE,
        FILE_SPREAD_SLAB,
} cpuset_filetype_t;

/*
 * cpuset_write_u64 - handle a write to one of the boolean cpuset files
 * css: css of the cpuset being written
 * cft: the file; cft->private selects the flag (cpuset_filetype_t)
 * val: value written by the user; any nonzero value means "on"
 *
 * Runs under get_online_cpus() and cpuset_mutex.  Returns -ENODEV if the
 * cpuset is no longer online, -EINVAL for a file type this handler does
 * not serve, otherwise the result of update_flag() (0 for the global
 * memory_pressure_enabled switch).
 */
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
                            u64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        /*
         * Collapse to 0/1 up front: update_flag() takes an int, so handing
         * it the raw u64 would truncate a value such as 1ULL << 32 to 0
         * and clear the flag instead of setting it.
         */
        int turning_on = !!val;
        int retval = 0;

        get_online_cpus();
        mutex_lock(&cpuset_mutex);
        if (!is_cpuset_online(cs)) {
                retval = -ENODEV;
                goto out_unlock;
        }

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                retval = update_flag(CS_CPU_EXCLUSIVE, cs, turning_on);
                break;
        case FILE_MEM_EXCLUSIVE:
                retval = update_flag(CS_MEM_EXCLUSIVE, cs, turning_on);
                break;
        case FILE_MEM_HARDWALL:
                retval = update_flag(CS_MEM_HARDWALL, cs, turning_on);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, turning_on);
                break;
        case FILE_MEMORY_MIGRATE:
                retval = update_flag(CS_MEMORY_MIGRATE, cs, turning_on);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                cpuset_memory_pressure_enabled = turning_on;
                break;
        case FILE_SPREAD_PAGE:
                retval = update_flag(CS_SPREAD_PAGE, cs, turning_on);
                break;
        case FILE_SPREAD_SLAB:
                retval = update_flag(CS_SPREAD_SLAB, cs, turning_on);
                break;
        default:
                retval = -EINVAL;
                break;
        }
out_unlock:
        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        return retval;
}

/*
 * cpuset_write_s64 - handle a write to a signed-integer cpuset file
 * css: css of the cpuset being written
 * cft: the file; cft->private selects the setting (cpuset_filetype_t)
 * val: value written by the user
 *
 * Only the sched relax domain level is served here.  Runs under
 * get_online_cpus() and cpuset_mutex.  Returns -ENODEV if the cpuset is
 * not online, -EINVAL for any other file type, otherwise the result of
 * update_relax_domain_level().
 */
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
                            s64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        int retval;

        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        if (!is_cpuset_online(cs))
                retval = -ENODEV;
        else if (type == FILE_SCHED_RELAX_DOMAIN_LEVEL)
                retval = update_relax_domain_level(cs, val);
        else
                retval = -EINVAL;

        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 *
 * The text in @buf is stripped of surrounding whitespace and handed to
 * update_cpumask() or update_nodemask() together with a freshly allocated
 * trial cpuset, depending on the file's 'private' type.
 *
 * Returns @nbytes on success, -ENODEV if the cpuset is no longer online,
 * -ENOMEM if the trial cpuset cannot be allocated, -EINVAL for any other
 * file type, or the error returned by the update function.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        struct cpuset *cs = css_cs(of_css(of));
        struct cpuset *trialcs;
        int retval = -ENODEV;

        buf = strstrip(buf);

        /*
         * CPU or memory hotunplug may leave @cs w/o any execution
         * resources, in which case the hotplug code asynchronously updates
         * configuration and transfers all tasks to the nearest ancestor
         * which can execute.
         *
         * As writes to "cpus" or "mems" may restore @cs's execution
         * resources, wait for the previously scheduled operations before
         * proceeding, so that we don't end up keep removing tasks added
         * after execution capability is restored.
         *
         * cpuset_hotplug_work calls back into cgroup core via
         * cgroup_transfer_tasks() and waiting for it from a cgroupfs
         * operation like this one can lead to a deadlock through kernfs
         * active_ref protection.  Let's break the protection.  Losing the
         * protection is okay as we check whether @cs is online after
         * grabbing cpuset_mutex anyway.  This only happens on the legacy
         * hierarchies.
         */
        /* Hold a css reference while active protection is broken. */
        css_get(&cs->css);
        kernfs_break_active_protection(of->kn);
        flush_work(&cpuset_hotplug_work);

        get_online_cpus();
        mutex_lock(&cpuset_mutex);
        /* retval is still -ENODEV here, which is what an offline cpuset gets. */
        if (!is_cpuset_online(cs))
                goto out_unlock;

        trialcs = alloc_trial_cpuset(cs);
        if (!trialcs) {
                retval = -ENOMEM;
                goto out_unlock;
        }

        switch (of_cft(of)->private) {
        case FILE_CPULIST:
                retval = update_cpumask(cs, trialcs, buf);
                break;
        case FILE_MEMLIST:
                retval = update_nodemask(cs, trialcs, buf);
                break;
        default:
                retval = -EINVAL;
                break;
        }

        free_cpuset(trialcs);
out_unlock:
        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        kernfs_unbreak_active_protection(of->kn);
        css_put(&cs->css);
        /*
         * Wait for work queued on cpuset_migrate_mm_wq (presumably the mm
         * migrations started by cpuset_migrate_mm()) before returning.
         */
        flush_workqueue(cpuset_migrate_mm_wq);
        /* A zero retval means success: report the whole write as consumed. */
        return retval ?: nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
        struct cpuset *cs = css_cs(seq_css(sf));
        cpuset_filetype_t type = seq_cft(sf)->private;
        int ret = 0;

        spin_lock_irq(&callback_lock);

        switch (type) {
        case FILE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
                break;
        case FILE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
                break;
        case FILE_EFFECTIVE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
                break;
        case FILE_EFFECTIVE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
                break;
        case FILE_SUBPARTS_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
                break;
        default:
                ret = -EINVAL;
        }

        spin_unlock_irq(&callback_lock);
        return ret;
}

/*
 * cpuset_read_u64 - read one of the unsigned-integer cpuset files
 * css: css of the cpuset being read
 * cft: the file; cft->private selects the value (cpuset_filetype_t)
 *
 * Returns the state of the selected flag, the global
 * memory_pressure_enabled switch, or the current memory pressure rate.
 * Any other file type is a BUG().
 */
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        u64 val = 0;

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                val = is_cpu_exclusive(cs);
                break;
        case FILE_MEM_EXCLUSIVE:
                val = is_mem_exclusive(cs);
                break;
        case FILE_MEM_HARDWALL:
                val = is_mem_hardwall(cs);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                val = is_sched_load_balance(cs);
                break;
        case FILE_MEMORY_MIGRATE:
                val = is_memory_migrate(cs);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                val = cpuset_memory_pressure_enabled;
                break;
        case FILE_MEMORY_PRESSURE:
                val = fmeter_getrate(&cs->fmeter);
                break;
        case FILE_SPREAD_PAGE:
                val = is_spread_page(cs);
                break;
        case FILE_SPREAD_SLAB:
                val = is_spread_slab(cs);
                break;
        default:
                BUG();
        }

        return val;
}

/*
 * cpuset_read_s64 - read a signed-integer cpuset file
 * css: css of the cpuset being read
 * cft: the file; cft->private selects the value (cpuset_filetype_t)
 *
 * Only the sched relax domain level is served here; any other file type
 * is a BUG().
 */
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;

        if (type != FILE_SCHED_RELAX_DOMAIN_LEVEL)
                BUG();

        return cs->relax_domain_level;
}

static int sched_partition_show(struct seq_file *seq, void *v)
{
        struct cpuset *cs = css_cs(seq_css(seq));

        switch (cs->partition_root_state) {
        case PRS_ENABLED:
                seq_puts(seq, "root\n");
                break;
        case PRS_DISABLED:
                seq_puts(seq, "member\n");
                break;
        case PRS_ERROR:
                seq_puts(seq, "root invalid\n");
                break;
        }
        return 0;
}

/*
 * sched_partition_write - handle a write to the partition state file
 *
 * Accepts "root" (PRS_ENABLED) or "member" (PRS_DISABLED), surrounding
 * whitespace ignored, and applies it with update_prstate() under
 * get_online_cpus() and cpuset_mutex while holding a css reference.
 *
 * Returns @nbytes on success, -EINVAL for any other text, -ENODEV if the
 * cpuset is no longer online, or the error from update_prstate().
 */
static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
                                     size_t nbytes, loff_t off)
{
        struct cpuset *cs = css_cs(of_css(of));
        int retval = -ENODEV;
        int val;

        buf = strstrip(buf);

        /* Map the keyword onto the partition root state it requests. */
        if (strcmp(buf, "root") == 0)
                val = PRS_ENABLED;
        else if (strcmp(buf, "member") == 0)
                val = PRS_DISABLED;
        else
                return -EINVAL;

        css_get(&cs->css);
        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        if (is_cpuset_online(cs))
                retval = update_prstate(cs, val);

        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        css_put(&cs->css);

        if (retval)
                return retval;
        return nbytes;
}

/*
 * Control files of the legacy hierarchy.
 *
 * For the common functions, 'private' gives the type of file (a
 * cpuset_filetype_t value) so that one handler can serve several files.
 * Entries without a write handler are read-only.
 */

static struct cftype legacy_files[] = {
        /* Resource lists: text in, list of ranges out. */
        {
                .name = "cpus",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },

        {
                .name = "mems",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
        },

        /* Read-only views of the effective masks. */
        {
                .name = "effective_cpus",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_CPULIST,
        },

        {
                .name = "effective_mems",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_MEMLIST,
        },

        /* Flag files, served by cpuset_read_u64()/cpuset_write_u64(). */
        {
                .name = "cpu_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_CPU_EXCLUSIVE,
        },

        {
                .name = "mem_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_EXCLUSIVE,
        },

        {
                .name = "mem_hardwall",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_HARDWALL,
        },

        {
                .name = "sched_load_balance",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SCHED_LOAD_BALANCE,
        },

        /* The only signed file: its value may be -1. */
        {
                .name = "sched_relax_domain_level",
                .read_s64 = cpuset_read_s64,
                .write_s64 = cpuset_write_s64,
                .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
        },

        {
                .name = "memory_migrate",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_MIGRATE,
        },

        /* Read-only: reports the frequency meter's current rate. */
        {
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
                .private = FILE_MEMORY_PRESSURE,
        },

        {
                .name = "memory_spread_page",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_PAGE,
        },

        {
                .name = "memory_spread_slab",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_SLAB,
        },

        /* Global switch, flagged to exist on the root cpuset only. */
        {
                .name = "memory_pressure_enabled",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_PRESSURE_ENABLED,
        },

        { }        /* terminate */
};

/*
 * Control files of the default hierarchy.
 *
 * This is currently a minimal set for the default hierarchy. It can be
 * expanded later on by migrating more features and control files from v1.
 * As in the legacy table, 'private' carries the cpuset_filetype_t that the
 * shared handlers dispatch on.
 */
static struct cftype dfl_files[] = {
        /* Writable resource lists, flagged as absent on the root. */
        {
                .name = "cpus",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
                .flags = CFTYPE_NOT_ON_ROOT,
        },

        {
                .name = "mems",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
                .flags = CFTYPE_NOT_ON_ROOT,
        },

        /* Read-only views of the effective masks. */
        {
                .name = "cpus.effective",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_CPULIST,
        },

        {
                .name = "mems.effective",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_MEMLIST,
        },

        /* Partition root state: accepts "root" or "member". */
        {
                .name = "cpus.partition",
                .seq_show = sched_partition_show,
                .write = sched_partition_write,
                .private = FILE_PARTITION_ROOT,
                .flags = CFTYPE_NOT_ON_ROOT,
        },

        /* Read-only list of subparts_cpus, flagged as a debug file. */
        {
                .name = "cpus.subpartitions",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_SUBPARTS_CPULIST,
                .flags = CFTYPE_DEBUG,
        },

        { }        /* terminate */
};


/*
 *        cpuset_css_alloc - allocate a cpuset css
 *        parent_css:        css of the parent cpuset, or NULL for the root
 *
 *        With no parent the statically defined top_cpuset is returned.
 *        Otherwise a zeroed cpuset is allocated with its cpumasks, load
 *        balancing enabled, empty node masks, a fresh frequency meter and
 *        a relax domain level of -1.  Returns ERR_PTR(-ENOMEM) if either
 *        allocation fails.
 */

static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct cpuset *cs;

        if (!parent_css)
                return &top_cpuset.css;

        cs = kzalloc(sizeof(*cs), GFP_KERNEL);
        if (!cs)
                goto fail;

        if (alloc_cpumasks(cs, NULL))
                goto fail_free;

        cs->relax_domain_level = -1;
        fmeter_init(&cs->fmeter);
        nodes_clear(cs->effective_mems);
        nodes_clear(cs->mems_allowed);
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);

        return &cs->css;

fail_free:
        kfree(cs);
fail:
        return ERR_PTR(-ENOMEM);
}

/*
 * cpuset_css_online - bring a newly allocated cpuset online
 * @css: the css embedded in the cpuset being onlined
 *
 * Marks the cpuset online, inherits the parent's spread flags and, in v2
 * mode, the parent's effective cpus/mems.  If the cgroup carries
 * CGRP_CPUSET_CLONE_CHILDREN and no sibling is cpu- or mem-exclusive, the
 * parent's configured cpus/mems are copied as well.
 *
 * Always returns 0; the root cpuset (no parent) needs no work.
 */
static int cpuset_css_online(struct cgroup_subsys_state *css)
{
        struct cpuset *cs = css_cs(css);
        struct cpuset *parent = parent_cs(cs);
        struct cpuset *tmp_cs;
        struct cgroup_subsys_state *pos_css;

        /* Nothing to inherit for the root cpuset. */
        if (!parent)
                return 0;

        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        set_bit(CS_ONLINE, &cs->flags);
        /* Memory-spread settings are inherited from the parent. */
        if (is_spread_page(parent))
                set_bit(CS_SPREAD_PAGE, &cs->flags);
        if (is_spread_slab(parent))
                set_bit(CS_SPREAD_SLAB, &cs->flags);

        cpuset_inc();

        spin_lock_irq(&callback_lock);
        if (is_in_v2_mode()) {
                /*
                 * Start out mirroring the parent's effective masks and
                 * record that fact on both sides (use_parent_ecpus here,
                 * child_ecpus_count on the parent).
                 */
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                cs->effective_mems = parent->effective_mems;
                cs->use_parent_ecpus = true;
                parent->child_ecpus_count++;
        }
        spin_unlock_irq(&callback_lock);

        /* Without the clone flag the configured masks stay empty. */
        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                goto out_unlock;

        /*
         * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
         * set.  This flag handling is implemented in cgroup core for
         * historical reasons - the flag may be specified during mount.
         *
         * Currently, if any sibling cpusets have exclusive cpus or mem, we
         * refuse to clone the configuration - thereby refusing the task to
         * be entered, and as a result refusing the sys_unshare() or
         * clone() which initiated it.  If this becomes a problem for some
         * users who wish to allow that scenario, then this could be
         * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
         * (and likewise for mems) to the new cgroup.
         */
        rcu_read_lock();
        cpuset_for_each_child(tmp_cs, pos_css, parent) {
                if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
                        /* Leave the RCU section before bailing out. */
                        rcu_read_unlock();
                        goto out_unlock;
                }
        }
        rcu_read_unlock();

        /* No exclusive sibling: copy the parent's configured masks. */
        spin_lock_irq(&callback_lock);
        cs->mems_allowed = parent->mems_allowed;
        cs->effective_mems = parent->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
        cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
        spin_unlock_irq(&callback_lock);
out_unlock:
        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
        return 0;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked(). That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset has the 'sched.partition' flag enabled, simulate
 * turning 'sched.partition' off.
 */

static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
        struct cpuset *cs = css_cs(css);

        get_online_cpus();
        mutex_lock(&cpuset_mutex);

        if (is_partition_root(cs))
                update_prstate(cs, 0);

        if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            is_sched_load_balance(cs))
                update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

        if (cs->use_parent_ecpus) {
                struct cpuset *parent = parent_cs(cs);

                cs->use_parent_ecpus = false;
                parent->child_ecpus_count--;
        }

        cpuset_dec();
        clear_bit(CS_ONLINE, &cs->flags);

        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
}

/*
 * cpuset_css_free - release the cpuset that embeds @css
 *
 * Converts the css back to its containing cpuset and hands it to
 * free_cpuset().
 */
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
        struct cpuset *cs = css_cs(css);

        free_cpuset(cs);
}

/*
 * cpuset_bind - reset top_cpuset's configured masks for the current mode
 *
 * In v2 mode the root cpuset is configured with every possible CPU and
 * node; otherwise its configured masks are set to its effective masks.
 * @root_css is not used.
 */
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
        mutex_lock(&cpuset_mutex);
        spin_lock_irq(&callback_lock);

        if (!is_in_v2_mode()) {
                /* Configured masks follow the effective ones. */
                cpumask_copy(top_cpuset.cpus_allowed,
                             top_cpuset.effective_cpus);
                top_cpuset.mems_allowed = top_cpuset.effective_mems;
        } else {
                /* Configured masks cover everything possible. */
                cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                top_cpuset.mems_allowed = node_possible_map;
        }

        spin_unlock_irq(&callback_lock);
        mutex_unlock(&cpuset_mutex);
}

/*
 * Make sure the new task conform to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */
static void cpuset_fork(struct task_struct *task)
{
        /* Only tasks outside the root cpuset are re-synchronised. */
        if (!task_css_is_root(task, cpuset_cgrp_id)) {
                /* Copy the forking task's current CPU and memory masks. */
                set_cpus_allowed_ptr(task, current->cpus_ptr);
                task->mems_allowed = current->mems_allowed;
        }
}

/*
 * The cpuset controller's registration with cgroup core: css lifecycle
 * callbacks, task attach/fork hooks and the two control-file tables.
 */
struct cgroup_subsys cpuset_cgrp_subsys = {
        /* css lifecycle */
        .css_alloc        = cpuset_css_alloc,
        .css_online        = cpuset_css_online,
        .css_offline        = cpuset_css_offline,
        .css_free        = cpuset_css_free,
        /* task attach hooks */
        .can_attach        = cpuset_can_attach,
        .cancel_attach        = cpuset_cancel_attach,
        .attach                = cpuset_attach,
        .post_attach        = cpuset_post_attach,
        .bind                = cpuset_bind,
        .fork                = cpuset_fork,
        /* control files for the legacy and default hierarchies */
        .legacy_cftypes        = legacy_files,
        .dfl_cftypes        = dfl_files,
        .early_init        = true,
        .threaded        = true,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset
 **/

int __init cpuset_init(void)
{
        /* Failing to allocate any of the root cpuset's masks is fatal. */
        BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));

        /* The root starts with every CPU ... */
        cpumask_setall(top_cpuset.cpus_allowed);
        cpumask_setall(top_cpuset.effective_cpus);

        /* ... and every memory node set, configured and effective. */
        nodes_setall(top_cpuset.mems_allowed);
        nodes_setall(top_cpuset.effective_mems);

        /* Same defaults a freshly allocated child cpuset receives. */
        top_cpuset.relax_domain_level = -1;
        set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
        fmeter_init(&top_cpuset.fmeter);

        BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));

        return 0;
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
        struct cpuset *dest = parent_cs(cs);

        /*
         * Climb until an ancestor with both CPUs and memory nodes is
         * found (the top cpuset has online cpus, so it can't be empty).
         */
        for (;;) {
                if (!cpumask_empty(dest->cpus_allowed) &&
                    !nodes_empty(dest->mems_allowed))
                        break;
                dest = parent_cs(dest);
        }

        if (cgroup_transfer_tasks(dest->css.cgroup, cs->css.cgroup) == 0)
                return;

        /* The transfer failed: log which cpuset still holds the tasks. */
        pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
        pr_cont_cgroup_name(cs->css.cgroup);
        pr_cont("\n");
}

/*
 * hotplug_update_tasks_legacy - apply hotplug results outside v2 mode
 * @cs: cpuset being updated
 * @new_cpus: CPUs @cs is left with
 * @new_mems: memory nodes @cs is left with
 * @cpus_updated: true if the CPU mask changed
 * @mems_updated: true if the node mask changed
 *
 * Both the configured and the effective masks are overwritten.  If @cs
 * ends up without CPUs or without memory nodes, its tasks are moved to
 * an ancestor.  Entered and left with cpuset_mutex held; the mutex is
 * dropped around the task transfer.
 */
static void
hotplug_update_tasks_legacy(struct cpuset *cs,
                            struct cpumask *new_cpus, nodemask_t *new_mems,
                            bool cpus_updated, bool mems_updated)
{
        bool drained = false;

        spin_lock_irq(&callback_lock);
        cs->mems_allowed = *new_mems;
        cs->effective_mems = *new_mems;
        cpumask_copy(cs->cpus_allowed, new_cpus);
        cpumask_copy(cs->effective_cpus, new_cpus);
        spin_unlock_irq(&callback_lock);

        /*
         * Skip the per-task updates when the cpuset became empty, as
         * the tasks will be migrated to an ancestor anyway.
         */
        if (cpus_updated) {
                if (!cpumask_empty(cs->cpus_allowed))
                        update_tasks_cpumask(cs);
        }
        if (mems_updated) {
                if (!nodes_empty(cs->mems_allowed))
                        update_tasks_nodemask(cs);
        }

        if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
                drained = true;

        mutex_unlock(&cpuset_mutex);

        /*
         * Moving tasks to the nearest ancestor with execution resources
         * is a full cgroup operation which calls back into cpuset, so
         * it is done with no lock held.
         */
        if (drained)
                remove_tasks_in_empty_cpuset(cs);

        mutex_lock(&cpuset_mutex);
}

/*
 * hotplug_update_tasks - apply hotplug results in v2 mode
 * @cs: cpuset being updated
 * @new_cpus: computed effective CPUs; replaced by the parent's if empty
 * @new_mems: computed effective nodes; replaced by the parent's if empty
 * @cpus_updated: true if the CPU mask changed
 * @mems_updated: true if the node mask changed
 *
 * Only the effective masks of @cs are written; the configured masks are
 * left alone.
 */
static void
hotplug_update_tasks(struct cpuset *cs,
                     struct cpumask *new_cpus, nodemask_t *new_mems,
                     bool cpus_updated, bool mems_updated)
{
        struct cpuset *parent = parent_cs(cs);

        /* An empty mask falls back to what the parent effectively has. */
        if (cpumask_empty(new_cpus)) {
                cpumask_copy(new_cpus, parent->effective_cpus);
        }
        if (nodes_empty(*new_mems)) {
                *new_mems = parent->effective_mems;
        }

        spin_lock_irq(&callback_lock);
        cs->effective_mems = *new_mems;
        cpumask_copy(cs->effective_cpus, new_cpus);
        spin_unlock_irq(&callback_lock);

        /* Push the change down to the tasks for whichever mask moved. */
        if (cpus_updated) {
                update_tasks_cpumask(cs);
        }
        if (mems_updated) {
                update_tasks_nodemask(cs);
        }
}

/*
 * Set to request a sched-domain rebuild from the hotplug work function
 * even when the CPU mask itself did not change; that function clears it
 * again before rebuilding.
 */
static bool force_rebuild;

/* Request that the next hotplug work run rebuilds the sched domains. */
void cpuset_force_rebuild(void)
{
        force_rebuild = true;
}

/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        /*
         * NOTE(review): these are static rather than on-stack; they are
         * only touched below with cpuset_mutex held - presumably that is
         * what serialises their use, confirm against all callers.
         */
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated;
        bool mems_updated;
        struct cpuset *parent;
retry:
        /* Sleep until no attach into @cs is in flight. */
        wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

        mutex_lock(&cpuset_mutex);

        /*
         * We have raced with task attaching. We wait until attaching
         * is finished, so we won't attach a task to an empty cpuset.
         */
        if (cs->attach_in_progress) {
                mutex_unlock(&cpuset_mutex);
                goto retry;
        }

        /* Recompute what @cs should effectively have given its parent. */
        parent = parent_cs(cs);
        compute_effective_cpumask(&new_cpus, cs, parent);
        nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

        if (cs->nr_subparts_cpus)
                /*
                 * Make sure that CPUs allocated to child partitions
                 * do not show up in effective_cpus.
                 */
                cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);

        /*
         * Partition handling needs the scratch masks in @tmp and only
         * applies when @cs has a non-zero partition_root_state.
         */
        if (!tmp || !cs->partition_root_state)
                goto update_tasks;

        /*
         * In the unlikely event that a partition root has empty
         * effective_cpus or its parent becomes erroneous, we have to
         * transition it to the erroneous state.
         */
        if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
           (parent->partition_root_state == PRS_ERROR))) {
                if (cs->nr_subparts_cpus) {
                        /*
                         * Take back the CPUs handed to child partitions
                         * and recompute with them included.
                         */
                        spin_lock_irq(&callback_lock);
                        cs->nr_subparts_cpus = 0;
                        cpumask_clear(cs->subparts_cpus);
                        spin_unlock_irq(&callback_lock);
                        compute_effective_cpumask(&new_cpus, cs, parent);
                }

                /*
                 * If the effective_cpus is empty because the child
                 * partitions take away all the CPUs, we can keep
                 * the current partition and let the child partitions
                 * fight for available CPUs.
                 */
                if ((parent->partition_root_state == PRS_ERROR) ||
                     cpumask_empty(&new_cpus)) {
                        update_parent_subparts_cpumask(cs, partcmd_disable,
                                                       NULL, tmp);
                        spin_lock_irq(&callback_lock);
                        cs->partition_root_state = PRS_ERROR;
                        spin_unlock_irq(&callback_lock);
                }
                cpuset_force_rebuild();
        }

        /*
         * On the other hand, an erroneous partition root may be transitioned
         * back to a regular one or a partition root with no CPU allocated
         * from the parent may change to erroneous.
         */
        if (is_partition_root(parent) &&
           ((cs->partition_root_state == PRS_ERROR) ||
            !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
             update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
                cpuset_force_rebuild();

update_tasks:
        /* Compare against the current effective masks to see what moved. */
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);

        /*
         * The legacy variant drops and re-takes cpuset_mutex internally;
         * either way the mutex is held again when the call returns.
         */
        if (is_in_v2_mode())
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
        else
                hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
                                            cpus_updated, mems_updated);

        mutex_unlock(&cpuset_mutex);
}

/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no effect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
{
        /*
         * NOTE(review): static scratch masks, written only with
         * cpuset_mutex held below - presumably kept off the stack because
         * of their size; confirm.
         */
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
        bool on_dfl = is_in_v2_mode();
        struct tmpmasks tmp, *ptmp = NULL;

        /*
         * Scratch masks are only set up in v2 mode.  If the allocation
         * fails, ptmp stays NULL and is passed on as such.
         */
        if (on_dfl && !alloc_cpumasks(NULL, &tmp))
                ptmp = &tmp;

        mutex_lock(&cpuset_mutex);

        /* fetch the available cpus/mems and find out which changed how */
        cpumask_copy(&new_cpus, cpu_active_mask);
        new_mems = node_states[N_MEMORY];

        /*
         * If subparts_cpus is populated, it is likely that the check below
         * will produce a false positive on cpus_updated when the cpu list
         * isn't changed. It is extra work, but it is better to be safe.
         */
        cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
        mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

        /*
         * In the rare case that hotplug removes all the cpus in subparts_cpus,
         * we assumed that cpus are updated.
         */
        if (!cpus_updated && top_cpuset.nr_subparts_cpus)
                cpus_updated = true;

        /* synchronize cpus_allowed to cpu_active_mask */
        if (cpus_updated) {
                spin_lock_irq(&callback_lock);
                /* In v2 mode the configured mask is left untouched. */
                if (!on_dfl)
                        cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
                /*
                 * Make sure that CPUs allocated to child partitions
                 * do not show up in effective_cpus. If no CPU is left,
                 * we clear the subparts_cpus & let the child partitions
                 * fight for the CPUs again.
                 */
                if (top_cpuset.nr_subparts_cpus) {
                        if (cpumask_subset(&new_cpus,
                                           top_cpuset.subparts_cpus)) {
                                top_cpuset.nr_subparts_cpus = 0;
                                cpumask_clear(top_cpuset.subparts_cpus);
                        } else {
                                cpumask_andnot(&new_cpus, &new_cpus,
                                               top_cpuset.subparts_cpus);
                        }
                }
                cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                spin_unlock_irq(&callback_lock);
                /* we don't mess with cpumasks of tasks in top_cpuset */
        }

        /* synchronize mems_allowed to N_MEMORY */
        if (mems_updated) {
                spin_lock_irq(&callback_lock);
                if (!on_dfl)
                        top_cpuset.mems_allowed = new_mems;
                top_cpuset.effective_mems = new_mems;
                spin_unlock_irq(&callback_lock);
                update_tasks_nodemask(&top_cpuset);
        }

        mutex_unlock(&cpuset_mutex);

        /* if cpus or mems changed, we need to propagate to descendants */
        if (cpus_updated || mems_updated) {
                struct cpuset *cs;
                struct cgroup_subsys_state *pos_css;

                rcu_read_lock();
                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                        /*
                         * Skip the root itself and any css we cannot pin;
                         * the reference keeps @cs valid while RCU is
                         * dropped for the update below.
                         */
                        if (cs == &top_cpuset || !css_tryget_online(&cs->css))
                                continue;
                        rcu_read_unlock();

                        cpuset_hotplug_update_tasks(cs, ptmp);

                        /* Re-enter RCU before dropping the reference. */
                        rcu_read_lock();
                        css_put(&cs->css);
                }
                rcu_read_unlock();
        }

        /* rebuild sched domains if cpus_allowed has changed */
        if (cpus_updated || force_rebuild) {
                force_rebuild = false;
                rebuild_sched_domains();
        }

        /* ptmp may still be NULL here (non-v2 mode or failed allocation). */
        free_cpumasks(NULL, ptmp);
}

/*
 * cpuset_update_active_cpus - queue the cpuset hotplug work item
 *
 * Does not process the change itself; it only schedules
 * cpuset_hotplug_work.
 */
void cpuset_update_active_cpus(void)
{
        /*
         * We're inside cpu hotplug critical region which usually nests
         * inside cgroup synchronization.  Bounce actual hotplug processing
         * to a work item to avoid reverse locking order.
         */
        schedule_work(&cpuset_hotplug_work);
}

/* Flush cpuset_hotplug_work, the work item queued on hotplug events. */
void cpuset_wait_for_hotplug(void)
{
        flush_work(&cpuset_hotplug_work);
}

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
{
        /*
         * None of the notifier arguments are inspected: every event just
         * queues the shared hotplug work item.
         */
        schedule_work(&cpuset_hotplug_work);
        return NOTIFY_OK;
}

/* Notifier that routes events to cpuset_track_online_nodes(). */
static struct notifier_block cpuset_track_online_nodes_nb = {
        .notifier_call = cpuset_track_online_nodes,
        /* NOTE(review): the reason for priority 10 is not documented here. */
        .priority = 10,                /* ??! */
};

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
void __init cpuset_init_smp(void)
{
        /*
         * cpus_allowed/mems_allowed set to v2 values in the initial
         * cpuset_bind() call will be reset to v1 values in another
         * cpuset_bind() call when v1 cpuset is mounted.
         */
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

        /* Effective masks now track the active CPUs and N_MEMORY nodes. */
        cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
        top_cpuset.effective_mems = node_states[N_MEMORY];

        /* Registered with the hot-memory notifier chain. */
        register_hotmemory_notifier(&cpuset_track_online_nodes_nb);

        /* Failing to create the ordered migration workqueue is fatal. */
        cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
        BUG_ON(!cpuset_migrate_mm_wq);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
        unsigned long irq_state;

        /*
         * callback_lock is taken with interrupts saved/disabled, and the
         * task's cpuset is looked up inside an RCU read-side section.
         */
        spin_lock_irqsave(&callback_lock, irq_state);
        rcu_read_lock();

        guarantee_online_cpus(task_cs(tsk), pmask);

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irq_state);
}

/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 **/

void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
        const struct cpumask *fallback;

        rcu_read_lock();
        /* v2 mode uses the cpuset's configured mask, else all possible CPUs. */
        if (is_in_v2_mode())
                fallback = task_cs(tsk)->cpus_allowed;
        else
                fallback = cpu_possible_mask;
        do_set_cpus_allowed(tsk, fallback);
        rcu_read_unlock();

        /*
         * We own tsk->cpus_allowed, so nobody can change it under us.
         *
         * The cpuset and its cpus_allowed were read locklessly, though,
         * so we can race with cgroup_attach_task() or update_cpumask()
         * and pick up the wrong mask. Both of those imply a later
         * cpuset_change_cpumask()->set_cpus_allowed_ptr(), which takes
         * task_rq_lock().
         *
         * If we run after that lock was dropped, we must see every
         * change to task_cs()->cpus_allowed. Otherwise we may temporarily
         * set a mask that is not right from task_cs()'s point of view,
         * and the pending set_cpus_allowed_ptr() will fix it.
         *
         * select_fallback_rq() will fix things up and set
         * cpu_possible_mask if required.
         */
}

/* Init-time helper: set every node in current's mems_allowed. */
void __init cpuset_init_current_mems_allowed(void)
{
        nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/

nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
        unsigned long irq_state;
        nodemask_t online_mems;

        /*
         * callback_lock is taken with interrupts saved/disabled, and the
         * task's cpuset is looked up inside an RCU read-side section.
         */
        spin_lock_irqsave(&callback_lock, irq_state);
        rcu_read_lock();

        guarantee_online_mems(task_cs(tsk), &online_mems);

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irq_state);

        /* Returned by value: the caller gets its own copy of the mask. */
        return online_mems;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
        /* Result of intersecting @nodemask with current->mems_allowed. */
        return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
        for (;;) {
                struct cpuset *up;

                /* Stop at the first mem_exclusive or mem_hardwall cpuset. */
                if (is_mem_exclusive(cs) || is_mem_hardwall(cs))
                        break;

                /* No parent left: @cs is the root, so return it. */
                up = parent_cs(cs);
                if (!up)
                        break;

                cs = up;
        }

        return cs;
}

/**
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes.  If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *        in_interrupt - any node ok (current task context irrelevant)
 *        GFP_ATOMIC   - any node ok
 *        tsk_is_oom_victim   - any node ok
 *        GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *        GFP_USER     - only nodes in current tasks mems allowed ok.
 */
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
        struct cpuset *wall;                /* nearest hardwalled ancestor */
        unsigned long irq_state;
        bool ok;                        /* is @node allowed by @wall? */

        /* Interrupt context, or a node current may already use. */
        if (in_interrupt() || node_isset(node, current->mems_allowed))
                return true;

        /*
         * Tasks that have access to memory reserves because they have
         * been OOM killed may get memory anywhere.
         */
        if (unlikely(tsk_is_oom_victim(current)))
                return true;

        /* A hardwall request never looks beyond current's own nodes. */
        if (gfp_mask & __GFP_HARDWALL)
                return false;

        /* Let a dying task have memory. */
        if (current->flags & PF_EXITING)
                return true;

        /* Not hardwall and node outside mems_allowed: scan up cpusets */
        spin_lock_irqsave(&callback_lock, irq_state);
        rcu_read_lock();

        wall = nearest_hardwall_ancestor(task_cs(current));
        ok = node_isset(node, wall->mems_allowed);

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irq_state);

        return ok;
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.  The zonelist passed to
 * __alloc_pages() will include all nodes.  If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */

static int cpuset_spread_node(int *rotor)
{
        /* Advance the rotor within current's mems_allowed and store it. */
        int next = next_node_in(*rotor, current->mems_allowed);

        *rotor = next;
        return next;
}

int cpuset_mem_spread_node(void)
{
        int *rotor = &current->cpuset_mem_spread_rotor;

        /* First use: seed the rotor with a random node current may use. */
        if (*rotor == NUMA_NO_NODE)
                *rotor = node_random(&current->mems_allowed);

        return cpuset_spread_node(rotor);
}

int cpuset_slab_spread_node(void)
{
        int *rotor = &current->cpuset_slab_spread_rotor;

        /* First use: seed the rotor with a random node current may use. */
        if (*rotor == NUMA_NO_NODE)
                *rotor = node_random(&current->mems_allowed);

        return cpuset_spread_node(rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                   const struct task_struct *tsk2)
{
        /* Compares the two tasks' own mems_allowed copies; no lock is taken. */
        return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_current_mems_allowed(void)
{
        /* Everything is emitted with pr_cont(), extending the current line. */
        rcu_read_lock();

        pr_cont(",cpuset=");
        pr_cont_cgroup_name(task_cs(current)->css.cgroup);

        pr_cont(",mems_allowed=%*pbl",
                nodemask_pr_args(&current->mems_allowed));

        rcu_read_unlock();
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/

void __cpuset_memory_pressure_bump(void)
{
        /* current's cpuset is looked up under RCU and its fmeter is marked. */
        rcu_read_lock();
        fmeter_markevent(&task_cs(current)->fmeter);
        rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it;
 *    task_get_css() holds a reference on the css while its path is
 *    being formatted.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        char *buf;
        struct cgroup_subsys_state *css;
        int retval;

        retval = -ENOMEM;
        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!buf)
                goto out;

        css = task_get_css(tsk, cpuset_cgrp_id);
        retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
                                current->nsproxy->cgroup_ns);
        css_put(css);
        if (retval >= PATH_MAX)
                retval = -ENAMETOOLONG;
        if (retval < 0)
                goto out_free;
        seq_puts(m, buf);
        seq_putc(m, '\n');
        retval = 0;
out_free:
        kfree(buf);
out:
        return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */

/*
 * Display task mems_allowed in /proc/<pid>/status file.
 *
 * Writes two lines to @m, both derived from task->mems_allowed:
 * "Mems_allowed:" printed with the %*pb format and "Mems_allowed_list:"
 * printed with the %*pbl format.
 */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
        seq_printf(m, "Mems_allowed:\t%*pb\n",
                   nodemask_pr_args(&task->mems_allowed));
        seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
                   nodemask_pr_args(&task->mems_allowed));
}






























    1 

    1 
    1 








    1 

    1 
    1 



    1 













    1 
    1 


    1 








    1 


    1 


    1 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// SPDX-License-Identifier: GPL-2.0
/*
 * preemptoff and irqoff tracepoints
 *
 * Copyright (C) Joel Fernandes (Google) <joel@joelfernandes.org>
 */

#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/kprobes.h>
#include "trace.h"

#define CREATE_TRACE_POINTS
#include <trace/events/preemptirq.h>

#ifdef CONFIG_TRACE_IRQFLAGS
/*
 * Per-cpu variable to prevent redundant calls when IRQs already off.
 * The trace_hardirqs_off*() helpers below set it to 1 the first time they
 * run and skip their tracing work while it is set; the trace_hardirqs_on*()
 * helpers only do their tracing work when it is set and then reset it to 0.
 */
static DEFINE_PER_CPU(int, tracing_irq_cpu);

/*
 * Variant of trace_hardirqs_on() that leaves lockdep alone.  The low level
 * entry code uses it because ordering against RCU matters there and lockdep
 * splits its hardirq tracking into an RCU-on stage and an RCU-off stage.
 */
void trace_hardirqs_on_prepare(void)
{
        /* Nothing was recorded as "irqs off" on this CPU: nothing to undo. */
        if (!this_cpu_read(tracing_irq_cpu))
                return;

        /* The tracepoint is skipped in NMI context, the tracer hook is not. */
        if (!in_nmi())
                trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
        tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);

        this_cpu_write(tracing_irq_cpu, 0);
}
EXPORT_SYMBOL(trace_hardirqs_on_prepare);
NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);

/*
 * trace_hardirqs_on - tracing and lockdep hook for "hardirqs get enabled"
 *
 * When this CPU's tracing_irq_cpu flag is set: emits the irq_enable
 * tracepoint in its _rcuidle form (skipped in NMI context), calls
 * tracer_hardirqs_on() and clears the flag.  The two lockdep calls are
 * made unconditionally afterwards, with CALLER_ADDR0 as the address.
 */
void trace_hardirqs_on(void)
{
        if (this_cpu_read(tracing_irq_cpu)) {
                if (!in_nmi())
                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
                this_cpu_write(tracing_irq_cpu, 0);
        }

        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
NOKPROBE_SYMBOL(trace_hardirqs_on);

/*
 * Variant of trace_hardirqs_off() that leaves lockdep alone.  The low level
 * entry code uses it because ordering against RCU matters there and lockdep
 * splits its hardirq tracking into an RCU-on stage and an RCU-off stage.
 */
void trace_hardirqs_off_finish(void)
{
        /* Already recorded as "irqs off" on this CPU: do not trace again. */
        if (this_cpu_read(tracing_irq_cpu))
                return;

        this_cpu_write(tracing_irq_cpu, 1);

        /* The tracer hook always runs, the tracepoint is skipped in NMI. */
        tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
        if (!in_nmi())
                trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
}
EXPORT_SYMBOL(trace_hardirqs_off_finish);
NOKPROBE_SYMBOL(trace_hardirqs_off_finish);

/*
 * trace_hardirqs_off - tracing and lockdep hook for "hardirqs get disabled"
 *
 * Calls lockdep_hardirqs_off() first, unconditionally.  Then, only if this
 * CPU's tracing_irq_cpu flag is still clear: sets the flag, calls
 * tracer_hardirqs_off() and emits the irq_disable tracepoint in its
 * _rcuidle form (skipped in NMI context).
 */
void trace_hardirqs_off(void)
{
        lockdep_hardirqs_off(CALLER_ADDR0);

        if (!this_cpu_read(tracing_irq_cpu)) {
                this_cpu_write(tracing_irq_cpu, 1);
                tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
                if (!in_nmi())
                        trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
        }
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);

/*
 * trace_hardirqs_on_caller - trace_hardirqs_on() with an explicit caller
 * @caller_addr: address reported as the parent location to the tracepoint
 *               and tracer hook, and passed to both lockdep calls
 *
 * Same sequence as trace_hardirqs_on(), except that @caller_addr is used
 * where that function uses CALLER_ADDR1, and lockdep is given @caller_addr
 * instead of CALLER_ADDR0.
 */
__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
        if (this_cpu_read(tracing_irq_cpu)) {
                if (!in_nmi())
                        trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
                tracer_hardirqs_on(CALLER_ADDR0, caller_addr);
                this_cpu_write(tracing_irq_cpu, 0);
        }

        lockdep_hardirqs_on_prepare(caller_addr);
        lockdep_hardirqs_on(caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
NOKPROBE_SYMBOL(trace_hardirqs_on_caller);

/*
 * trace_hardirqs_off_caller - trace_hardirqs_off() with an explicit caller
 * @caller_addr: address passed to lockdep_hardirqs_off() and reported as the
 *               parent location to the tracer hook and the tracepoint
 *
 * Same sequence as trace_hardirqs_off(), except that @caller_addr is used
 * where that function uses CALLER_ADDR1, and lockdep is given @caller_addr
 * instead of CALLER_ADDR0.
 */
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
        lockdep_hardirqs_off(caller_addr);

        if (!this_cpu_read(tracing_irq_cpu)) {
                this_cpu_write(tracing_irq_cpu, 1);
                tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
                if (!in_nmi())
                        trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
        }
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */

#ifdef CONFIG_TRACE_PREEMPT_TOGGLE

/*
 * trace_preempt_on - tracing hook for preemption being enabled
 * @a0: first address, passed through unchanged
 * @a1: second address, passed through unchanged
 *
 * Emits the preempt_enable tracepoint in its _rcuidle form unless running
 * in NMI context, then always calls tracer_preempt_on() with the same
 * two addresses.
 */
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
        if (!in_nmi())
                trace_preempt_enable_rcuidle(a0, a1);
        tracer_preempt_on(a0, a1);
}

/*
 * trace_preempt_off - tracing hook for preemption being disabled
 * @a0: first address, passed through unchanged
 * @a1: second address, passed through unchanged
 *
 * Emits the preempt_disable tracepoint in its _rcuidle form unless running
 * in NMI context, then always calls tracer_preempt_off() with the same
 * two addresses.
 */
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
        if (!in_nmi())
                trace_preempt_disable_rcuidle(a0, a1);
        tracer_preempt_off(a0, a1);
}
#endif

























    1 















    1 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H

/*
 * NOTE(review): feature-test macro; presumably tells the generic jump-label
 * code that this architecture supports batched updates - confirm at its users.
 */
#define HAVE_JUMP_LABEL_BATCH

/*
 * Size in bytes of a jump-label site.  This matches the 5-byte jmp (0xe9
 * opcode plus a 32-bit displacement) emitted by arch_static_branch_jump()
 * and by the STATIC_JUMP_IF_* assembler macros below.
 */
#define JUMP_LABEL_NOP_SIZE 5

/*
 * Byte sequence emitted at a site that initially falls through.  The NOP
 * flavour depends on CONFIG_X86_64; both names are expected to come from
 * <asm/nops.h>, which is included further down (macro expansion is lazy,
 * so the later include is sufficient).
 */
#ifdef CONFIG_X86_64
# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
#else
# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
#endif

#include <asm/asm.h>
#include <asm/nops.h>

#ifndef __ASSEMBLY__

#include <linux/stringify.h>
#include <linux/types.h>

/*
 * arch_static_branch - emit a jump-label site that initially falls through
 * @key:    static key this site belongs to; passed as an immediate operand
 * @branch: immediate flag that is added to the key address in the table entry
 *
 * Emits the STATIC_KEY_INIT_NOP bytes at local label 1 and records an entry
 * in the __jump_table section: two 32-bit self-relative offsets (the site
 * at 1b and the l_yes target) followed by a pointer-sized self-relative
 * value of key + branch.
 *
 * Returns false on the fall-through path and true when control arrives at
 * l_yes.  As emitted here only the NOP is in place, so l_yes is reached only
 * once the site has been rewritten into a jump (presumably by the jump-label
 * patching code, which is not part of this header).
 */
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
        asm_volatile_goto("1:"
                ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
                ".pushsection __jump_table,  \"aw\" \n\t"
                _ASM_ALIGN "\n\t"
                ".long 1b - ., %l[l_yes] - . \n\t"
                _ASM_PTR "%c0 + %c1 - .\n\t"
                ".popsection \n\t"
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

/*
 * arch_static_branch_jump - emit a jump-label site that initially jumps
 * @key:    static key this site belongs to; passed as an immediate operand
 * @branch: immediate flag that is added to the key address in the table entry
 *
 * Emits a 5-byte jmp at local label 1: opcode 0xe9 followed by a 32-bit
 * displacement to l_yes measured from label 2, i.e. from the end of the
 * instruction.  The __jump_table entry has the same layout as the one
 * written by arch_static_branch(): two 32-bit self-relative offsets (site,
 * target) and a pointer-sized self-relative key + branch.
 *
 * Returns true when control arrives at l_yes, which is what the emitted jmp
 * does, and false on the fall-through path, which is reached only once the
 * site has been rewritten (presumably into a NOP by the jump-label patching
 * code, which is not part of this header).
 */
static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
{
        asm_volatile_goto("1:"
                ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
                "2:\n\t"
                ".pushsection __jump_table,  \"aw\" \n\t"
                _ASM_ALIGN "\n\t"
                ".long 1b - ., %l[l_yes] - . \n\t"
                _ASM_PTR "%c0 + %c1 - .\n\t"
                ".popsection \n\t"
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

#else        /* __ASSEMBLY__ */

/*
 * STATIC_JUMP_IF_TRUE target, key, def
 *
 * Assembly-side jump-label site.  When \def is non-zero a 5-byte jmp to
 * \target (0xe9 plus a 32-bit displacement) is emitted, otherwise the
 * STATIC_KEY_INIT_NOP bytes.  In both cases a __jump_table entry is
 * recorded: 32-bit self-relative offsets of the site and of \target,
 * followed by a pointer-sized self-relative \key.
 */
.macro STATIC_JUMP_IF_TRUE target, key, def
.Lstatic_jump_\@:
        .if \def
        /* Equivalent to "jmp.d32 \target" */
        .byte                0xe9
        .long                \target - .Lstatic_jump_after_\@
.Lstatic_jump_after_\@:
        .else
        .byte                STATIC_KEY_INIT_NOP
        .endif
        .pushsection __jump_table, "aw"
        _ASM_ALIGN
        .long                .Lstatic_jump_\@ - ., \target - .
        _ASM_PTR        \key - .
        .popsection
.endm

/*
 * STATIC_JUMP_IF_FALSE target, key, def
 *
 * Counterpart of STATIC_JUMP_IF_TRUE with the initial instruction inverted:
 * when \def is non-zero the STATIC_KEY_INIT_NOP bytes are emitted, otherwise
 * a 5-byte jmp to \target (0xe9 plus a 32-bit displacement).  The
 * __jump_table entry records \key + 1 rather than \key.
 * NOTE(review): the "+ 1" presumably plays the role of the `branch' operand
 * that arch_static_branch() adds to the key address - confirm.
 */
.macro STATIC_JUMP_IF_FALSE target, key, def
.Lstatic_jump_\@:
        .if \def
        .byte                STATIC_KEY_INIT_NOP
        .else
        /* Equivalent to "jmp.d32 \target" */
        .byte                0xe9
        .long                \target - .Lstatic_jump_after_\@
.Lstatic_jump_after_\@:
        .endif
        .pushsection __jump_table, "aw"
        _ASM_ALIGN
        .long                .Lstatic_jump_\@ - ., \target - .
        _ASM_PTR        \key + 1 - .
        .popsection
.endm

#endif        /* __ASSEMBLY__ */

#endif















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_NETLINK_H
#define __NET_NETLINK_H

#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/jiffies.h>
#include <linux/in6.h>

/* ========================================================================
 *         Netlink Messages and Attributes Interface (As Seen On TV)
 * ------------------------------------------------------------------------
 *                          Messages Interface
 * ------------------------------------------------------------------------
 *
 * Message Format:
 *    <--- nlmsg_total_size(payload)  --->
 *    <-- nlmsg_msg_size(payload) ->
 *   +----------+- - -+-------------+- - -+-------- - -
 *   | nlmsghdr | Pad |   Payload   | Pad | nlmsghdr
 *   +----------+- - -+-------------+- - -+-------- - -
 *   nlmsg_data(nlh)---^                   ^
 *   nlmsg_next(nlh)-----------------------+
 *
 * Payload Format:
 *    <---------------------- nlmsg_len(nlh) --------------------->
 *    <------ hdrlen ------>       <- nlmsg_attrlen(nlh, hdrlen) ->
 *   +----------------------+- - -+--------------------------------+
 *   |     Family Header    | Pad |           Attributes           |
 *   +----------------------+- - -+--------------------------------+
 *   nlmsg_attrdata(nlh, hdrlen)---^
 *
 * Data Structures:
 *   struct nlmsghdr                        netlink message header
 *
 * Message Construction:
 *   nlmsg_new()                        create a new netlink message
 *   nlmsg_put()                        add a netlink message to an skb
 *   nlmsg_put_answer()                        callback based nlmsg_put()
 *   nlmsg_end()                        finalize netlink message
 *   nlmsg_get_pos()                        return current position in message
 *   nlmsg_trim()                        trim part of message
 *   nlmsg_cancel()                        cancel message construction
 *   nlmsg_free()                        free a netlink message
 *
 * Message Sending:
 *   nlmsg_multicast()                        multicast message to several groups
 *   nlmsg_unicast()                        unicast a message to a single socket
 *   nlmsg_notify()                        send notification message
 *
 * Message Length Calculations:
 *   nlmsg_msg_size(payload)                length of message w/o padding
 *   nlmsg_total_size(payload)                length of message w/ padding
 *   nlmsg_padlen(payload)                length of padding at tail
 *
 * Message Payload Access:
 *   nlmsg_data(nlh)                        head of message payload
 *   nlmsg_len(nlh)                        length of message payload
 *   nlmsg_attrdata(nlh, hdrlen)        head of attributes data
 *   nlmsg_attrlen(nlh, hdrlen)                length of attributes data
 *
 * Message Parsing:
 *   nlmsg_ok(nlh, remaining)                does nlh fit into remaining bytes?
 *   nlmsg_next(nlh, remaining)                get next netlink message
 *   nlmsg_parse()                        parse attributes of a message
 *   nlmsg_find_attr()                        find an attribute in a message
 *   nlmsg_for_each_msg()                loop over all messages
 *   nlmsg_validate()                        validate netlink message incl. attrs
 *   nlmsg_for_each_attr()                loop over all attributes
 *
 * Misc:
 *   nlmsg_report()                        report back to application?
 *
 * ------------------------------------------------------------------------
 *                          Attributes Interface
 * ------------------------------------------------------------------------
 *
 * Attribute Format:
 *    <------- nla_total_size(payload) ------->
 *    <---- nla_attr_size(payload) ----->
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *   |  Header  | Pad |     Payload      | Pad |  Header
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *                     <- nla_len(nla) ->      ^
 *   nla_data(nla)----^                        |
 *   nla_next(nla)-----------------------------'
 *
 * Data Structures:
 *   struct nlattr                        netlink attribute header
 *
 * Attribute Construction:
 *   nla_reserve(skb, type, len)        reserve room for an attribute
 *   nla_reserve_nohdr(skb, len)        reserve room for an attribute w/o hdr
 *   nla_put(skb, type, len, data)        add attribute to skb
 *   nla_put_nohdr(skb, len, data)        add attribute w/o hdr
 *   nla_append(skb, len, data)                append data to skb
 *
 * Attribute Construction for Basic Types:
 *   nla_put_u8(skb, type, value)        add u8 attribute to skb
 *   nla_put_u16(skb, type, value)        add u16 attribute to skb
 *   nla_put_u32(skb, type, value)        add u32 attribute to skb
 *   nla_put_u64_64bit(skb, type,
 *                     value, padattr)        add u64 attribute to skb
 *   nla_put_s8(skb, type, value)        add s8 attribute to skb
 *   nla_put_s16(skb, type, value)        add s16 attribute to skb
 *   nla_put_s32(skb, type, value)        add s32 attribute to skb
 *   nla_put_s64(skb, type, value,
 *               padattr)                add s64 attribute to skb
 *   nla_put_string(skb, type, str)        add string attribute to skb
 *   nla_put_flag(skb, type)                add flag attribute to skb
 *   nla_put_msecs(skb, type, jiffies,
 *                 padattr)                add msecs attribute to skb
 *   nla_put_in_addr(skb, type, addr)        add IPv4 address attribute to skb
 *   nla_put_in6_addr(skb, type, addr)        add IPv6 address attribute to skb
 *
 * Nested Attributes Construction:
 *   nla_nest_start(skb, type)                start a nested attribute
 *   nla_nest_end(skb, nla)                finalize a nested attribute
 *   nla_nest_cancel(skb, nla)                cancel nested attribute construction
 *
 * Attribute Length Calculations:
 *   nla_attr_size(payload)                length of attribute w/o padding
 *   nla_total_size(payload)                length of attribute w/ padding
 *   nla_padlen(payload)                length of padding
 *
 * Attribute Payload Access:
 *   nla_data(nla)                        head of attribute payload
 *   nla_len(nla)                        length of attribute payload
 *
 * Attribute Payload Access for Basic Types:
 *   nla_get_u8(nla)                        get payload for a u8 attribute
 *   nla_get_u16(nla)                        get payload for a u16 attribute
 *   nla_get_u32(nla)                        get payload for a u32 attribute
 *   nla_get_u64(nla)                        get payload for a u64 attribute
 *   nla_get_s8(nla)                        get payload for a s8 attribute
 *   nla_get_s16(nla)                        get payload for a s16 attribute
 *   nla_get_s32(nla)                        get payload for a s32 attribute
 *   nla_get_s64(nla)                        get payload for a s64 attribute
 *   nla_get_flag(nla)                        return 1 if flag is true
 *   nla_get_msecs(nla)                        get payload for a msecs attribute
 *
 * Attribute Misc:
 *   nla_memcpy(dest, nla, count)        copy attribute into memory
 *   nla_memcmp(nla, data, size)        compare attribute with memory area
 *   nla_strlcpy(dst, nla, size)        copy attribute to a sized string
 *   nla_strcmp(nla, str)                compare attribute with string
 *
 * Attribute Parsing:
 *   nla_ok(nla, remaining)                does nla fit into remaining bytes?
 *   nla_next(nla, remaining)                get next netlink attribute
 *   nla_validate()                        validate a stream of attributes
 *   nla_validate_nested()                validate a stream of nested attributes
 *   nla_find()                                find attribute in stream of attributes
 *   nla_find_nested()                        find attribute in nested attributes
 *   nla_parse()                        parse and validate stream of attrs
 *   nla_parse_nested()                        parse nested attributes
 *   nla_for_each_attr()                loop over all attributes
 *   nla_for_each_nested()                loop over the nested attributes
 *=========================================================================
 */

 /**
  * Standard attribute types to specify validation policy
  */
enum {
        NLA_UNSPEC,
        NLA_U8,
        NLA_U16,
        NLA_U32,
        NLA_U64,
        NLA_STRING,
        NLA_FLAG,
        NLA_MSECS,
        NLA_NESTED,
        NLA_NESTED_ARRAY,
        NLA_NUL_STRING,
        NLA_BINARY,
        NLA_S8,
        NLA_S16,
        NLA_S32,
        NLA_S64,
        NLA_BITFIELD32,
        NLA_REJECT,
        __NLA_TYPE_MAX,
};

#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)

/*
 * Minimum and maximum for an unsigned integer attribute.  Referenced through
 * the `range' pointer of struct nla_policy when its validation_type is
 * NLA_VALIDATE_RANGE_PTR.
 */
struct netlink_range_validation {
        u64 min, max;
};

/*
 * Signed counterpart of struct netlink_range_validation.  Referenced through
 * the `range_signed' pointer of struct nla_policy when its validation_type
 * is NLA_VALIDATE_RANGE_PTR.
 */
struct netlink_range_validation_signed {
        s64 min, max;
};

/*
 * Values for the `validation_type' field of struct nla_policy: the kind of
 * validation done in addition to the type-specific checks.  The comment
 * above struct nla_policy explains which union member each one relies on.
 */
enum nla_policy_validation {
        NLA_VALIDATE_NONE,
        NLA_VALIDATE_RANGE,        /* uses both `min' and `max' */
        NLA_VALIDATE_RANGE_WARN_TOO_LONG,
        NLA_VALIDATE_MIN,        /* uses `min' only */
        NLA_VALIDATE_MAX,        /* uses `max' only */
        NLA_VALIDATE_MASK,        /* NOTE(review): presumably uses `mask' - confirm */
        NLA_VALIDATE_RANGE_PTR,        /* limits come from `range' / `range_signed' */
        NLA_VALIDATE_FUNCTION,        /* uses the `validate' callback */
};

/**
 * struct nla_policy - attribute validation policy
 * @type: Type of attribute or NLA_UNSPEC
 * @validation_type: type of attribute validation done in addition to
 *        type-specific validation (e.g. range, function call), see
 *        &enum nla_policy_validation
 * @len: Type specific length of payload
 *
 * Policies are defined as arrays of this struct, the array must be
 * accessible by attribute type up to the highest identifier to be expected.
 *
 * Meaning of `len' field:
 *    NLA_STRING           Maximum length of string
 *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
 *    NLA_FLAG             Unused
 *    NLA_BINARY           Maximum length of attribute payload
 *                         (but see also below with the validation type)
 *    NLA_NESTED,
 *    NLA_NESTED_ARRAY     Length verification is done by checking len of
 *                         nested header (or empty); len field is used if
 *                         nested_policy is also used, for the max attr
 *                         number in the nested policy.
 *    NLA_U8, NLA_U16,
 *    NLA_U32, NLA_U64,
 *    NLA_S8, NLA_S16,
 *    NLA_S32, NLA_S64,
 *    NLA_MSECS            Leaving the length field zero will verify the
 *                         given type fits, using it verifies minimum length
 *                         just like "All other"
 *    NLA_BITFIELD32       Unused
 *    NLA_REJECT           Unused
 *    All other            Minimum length of attribute payload
 *
 * Meaning of validation union:
 *    NLA_BITFIELD32       This is a 32-bit bitmap/bitselector attribute and
 *                         `bitfield32_valid' is the u32 value of valid flags
 *    NLA_REJECT           This attribute is always rejected and `reject_message'
 *                         may point to a string to report as the error instead
 *                         of the generic one in extended ACK.
 *    NLA_NESTED           `nested_policy' points to a nested policy to
 *                         validate, must also set `len' to the max attribute
 *                         number. Use the provided NLA_POLICY_NESTED() macro.
 *                         Note that nla_parse() will validate, but of course not
 *                         parse, the nested sub-policies.
 *    NLA_NESTED_ARRAY     `nested_policy' points to a nested policy to validate,
 *                         must also set `len' to the max attribute number. Use
 *                         the provided NLA_POLICY_NESTED_ARRAY() macro.
 *                         The difference to NLA_NESTED is the structure:
 *                         NLA_NESTED has the nested attributes directly inside
 *                         while an array has the nested attributes at another
 *                         level down and the attribute types directly in the
 *                         nesting don't matter.
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64,
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              The `min' and `max' fields are used depending on the
 *                         validation_type field, if that is min/max/range then
 *                         the min, max or both are used (respectively) to check
 *                         the value of the integer attribute.
 *                         Note that in the interest of code simplicity and
 *                         struct size both limits are s16, so you cannot
 *                         enforce a range that doesn't fall within the range
 *                         of s16 - do that as usual in the code instead.
 *                         Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and
 *                         NLA_POLICY_RANGE() macros.
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range' must be a pointer
 *                         to a struct netlink_range_validation that indicates
 *                         the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE().
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range_signed' must be a
 *                         pointer to a struct netlink_range_validation_signed
 *                         that indicates the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE_SIGNED().
 *
 *    NLA_BINARY           If the validation type is like the ones for integers
 *                         above, then the min/max length (not value like for
 *                         integers) of the attribute is enforced.
 *
 *    All other            Unused - but note that it's a union
 *
 * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN:
 *    NLA_BINARY           Validation function called for the attribute.
 *    All other            Unused - but note that it's a union
 *
 * Example:
 *
 * static const u32 myvalidflags = 0xff231023;
 *
 * static const struct nla_policy my_policy[ATTR_MAX+1] = {
 *        [ATTR_FOO] = { .type = NLA_U16 },
 *        [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
 *        [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)),
 *        [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags),
 * };
 */
struct nla_policy {
        /* Attribute type this entry applies to (NLA_* value). */
        u8                type;
        /* Selects which union member below is meaningful (NLA_VALIDATE_*). */
        u8                validation_type;
        u16                len;
        /*
         * Exactly one member is meaningful for a given entry; they all
         * share storage, so a policy initializer must set at most one.
         */
        union {
                /* Set by NLA_POLICY_BITFIELD32(). */
                const u32 bitfield32_valid;
                /* Set by NLA_POLICY_MASK(). */
                const u32 mask;
                const char *reject_message;
                /* Set by NLA_POLICY_NESTED() / NLA_POLICY_NESTED_ARRAY(). */
                const struct nla_policy *nested_policy;
                /* Set by NLA_POLICY_FULL_RANGE() (NLA_VALIDATE_RANGE_PTR). */
                struct netlink_range_validation *range;
                /* Set by NLA_POLICY_FULL_RANGE_SIGNED() (NLA_VALIDATE_RANGE_PTR). */
                struct netlink_range_validation_signed *range_signed;
                /*
                 * Inline limits set by NLA_POLICY_MIN(), NLA_POLICY_MAX() and
                 * NLA_POLICY_RANGE(); both are s16, so wider ranges need the
                 * pointer variants above.
                 */
                struct {
                        s16 min, max;
                };
                /* Set by NLA_POLICY_VALIDATE_FN(). */
                int (*validate)(const struct nlattr *attr,
                                struct netlink_ext_ack *extack);
                /* This entry is special, and used for the attribute at index 0
                 * only, and specifies special data about the policy, namely it
                 * specifies the "boundary type" where strict length validation
                 * starts for any attribute types >= this value, also, strict
                 * nesting validation starts here.
                 *
                 * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT
                 * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to
                 * get the previous pure { .len = xyz } behaviour. The advantage
                 * of this is that types not specified in the policy will be
                 * rejected.
                 *
                 * For completely new families it should be set to 1 so that the
                 * validation is enforced for all attributes. For existing ones
                 * it should be set at least when new attributes are added to
                 * the enum used by the policy, and be set to the new value that
                 * was added to enforce strict validation from thereon.
                 */
                u16 strict_start_type;
        };
};

/*
 * Policy entries for Ethernet addresses: binary attributes whose length is
 * ETH_ALEN. The _COMPAT variant expands to NLA_POLICY_EXACT_LEN_WARN().
 */
#define NLA_POLICY_ETH_ADDR                NLA_POLICY_EXACT_LEN(ETH_ALEN)
#define NLA_POLICY_ETH_ADDR_COMPAT        NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)

/*
 * Nested policy initializers. The underscore-prefixed forms take the
 * maximum attribute type explicitly and store it in .len; the plain forms
 * derive it from ARRAY_SIZE(policy) - 1, so `policy' must be a real array
 * (not a pointer) at the point of use.
 */
#define _NLA_POLICY_NESTED(maxattr, policy) \
        { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr }
#define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
        { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr }
#define NLA_POLICY_NESTED(policy) \
        _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy)
#define NLA_POLICY_NESTED_ARRAY(policy) \
        _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy)
/* NLA_BITFIELD32 entry; `valid' is stored in .bitfield32_valid. */
#define NLA_POLICY_BITFIELD32(valid) \
        { .type = NLA_BITFIELD32, .bitfield32_valid = valid }

/*
 * Compile-time type classification helpers for the NLA_POLICY_*() macros.
 *
 * `tp' is expected to be an NLA_* constant. It is parenthesized at every
 * use so that an expression argument (e.g. a conditional) keeps its
 * meaning next to `==' and `+'. Note that `tp' is still expanded more than
 * once, so it must not have side effects.
 */
#define __NLA_IS_UINT_TYPE(tp)                                                \
        ((tp) == NLA_U8 || (tp) == NLA_U16 ||                                \
         (tp) == NLA_U32 || (tp) == NLA_U64)
#define __NLA_IS_SINT_TYPE(tp)                                                \
        ((tp) == NLA_S8 || (tp) == NLA_S16 ||                                \
         (tp) == NLA_S32 || (tp) == NLA_S64)

/*
 * Each NLA_ENSURE_*() expands to `tp' itself, plus a BUILD_BUG_ON_ZERO()
 * term that breaks the build when `tp' is not in the permitted set.
 */
#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
/* Unsigned integer types only. */
#define NLA_ENSURE_UINT_TYPE(tp)                        \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + (tp))
/* Unsigned integer types, NLA_MSECS or NLA_BINARY. */
#define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp)                \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||        \
                      (tp) == NLA_MSECS ||                \
                      (tp) == NLA_BINARY) + (tp))
/* Signed integer types only. */
#define NLA_ENSURE_SINT_TYPE(tp)                        \
        (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + (tp))
/* Any integer type, NLA_MSECS or NLA_BINARY. */
#define NLA_ENSURE_INT_OR_BINARY_TYPE(tp)                \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||                \
                      __NLA_IS_SINT_TYPE(tp) ||                \
                      (tp) == NLA_MSECS ||                \
                      (tp) == NLA_BINARY) + (tp))
/*
 * Rejects the types that NLA_POLICY_VALIDATE_FN() must not be combined
 * with: NLA_BITFIELD32, NLA_REJECT, NLA_NESTED and NLA_NESTED_ARRAY.
 */
#define NLA_ENSURE_NO_VALIDATION_PTR(tp)                \
        (__NLA_ENSURE((tp) != NLA_BITFIELD32 &&                \
                      (tp) != NLA_REJECT &&                \
                      (tp) != NLA_NESTED &&                \
                      (tp) != NLA_NESTED_ARRAY) + (tp))

/*
 * Inclusive [_min, _max] limit stored inline in the policy entry. The
 * .min/.max members are s16, so both limits must fit in that type.
 */
#define NLA_POLICY_RANGE(tp, _min, _max) {                \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE,                \
        .min = _min,                                        \
        .max = _max                                        \
}

/*
 * Range given by pointer; `_range' is stored in .range, which is a
 * struct netlink_range_validation pointer.
 */
#define NLA_POLICY_FULL_RANGE(tp, _range) {                \
        .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range = _range,                                \
}

/*
 * Signed counterpart of NLA_POLICY_FULL_RANGE(); `_range' is stored in
 * .range_signed (struct netlink_range_validation_signed pointer).
 */
#define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) {        \
        .type = NLA_ENSURE_SINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range_signed = _range,                                \
}

/* Lower limit only; .max is left at its zero-initialized value. */
#define NLA_POLICY_MIN(tp, _min) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MIN,                \
        .min = _min,                                        \
}

/* Upper limit only; .min is left at its zero-initialized value. */
#define NLA_POLICY_MAX(tp, _max) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MAX,                \
        .max = _max,                                        \
}

/* Bit mask for unsigned integer attributes; `_mask' is stored in .mask. */
#define NLA_POLICY_MASK(tp, _mask) {                        \
        .type = NLA_ENSURE_UINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_MASK,                \
        .mask = _mask,                                        \
}

/*
 * Custom validation callback. The optional trailing argument becomes .len;
 * when it is omitted, `__VA_ARGS__ + 0' evaluates to 0.
 */
#define NLA_POLICY_VALIDATE_FN(tp, fn, ...) {                \
        .type = NLA_ENSURE_NO_VALIDATION_PTR(tp),        \
        .validation_type = NLA_VALIDATE_FUNCTION,        \
        .validate = fn,                                        \
        .len = __VA_ARGS__ + 0,                                \
}

/*
 * Length policies for NLA_BINARY attributes. For NLA_BINARY the min/max
 * limits apply to the attribute length rather than to a value.
 * NLA_POLICY_EXACT_LEN_WARN() uses NLA_VALIDATE_RANGE_WARN_TOO_LONG instead
 * of NLA_VALIDATE_RANGE.
 */
#define NLA_POLICY_EXACT_LEN(_len)        NLA_POLICY_RANGE(NLA_BINARY, _len, _len)
#define NLA_POLICY_EXACT_LEN_WARN(_len) {                        \
        .type = NLA_BINARY,                                        \
        .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG,        \
        .min = _len,                                                \
        .max = _len                                                \
}
#define NLA_POLICY_MIN_LEN(_len)        NLA_POLICY_MIN(NLA_BINARY, _len)

/**
 * struct nl_info - netlink source information
 * @nlh: Netlink message header of original request
 * @nl_net: Network namespace
 * @portid: Netlink PORTID of requesting application
 * @skip_notify: Skip netlink notifications to user space (one-bit flag)
 * @skip_notify_kernel: Skip selected in-kernel notifications (one-bit flag)
 *
 * The two skip flags are single-bit fields packed into one u8.
 */
struct nl_info {
        struct nlmsghdr                *nlh;
        struct net                *nl_net;
        u32                        portid;
        u8                        skip_notify:1,
                                skip_notify_kernel:1;
};

/**
 * enum netlink_validation - netlink message/attribute validation levels
 * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about
 *        extra data at the end of the message, attributes being longer than
 *        they should be, or unknown attributes being present.
 * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing.
 * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING
 *        this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict().
 * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy.
 *        This can safely be set by the kernel when the given policy has no
 *        NLA_UNSPEC anymore, and can thus be used to ensure policy entries
 *        are enforced going forward.
 * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
 *        U8, U16, U32 must have exact size, etc.)
 * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY)
 *        and unset for other policies.
 *
 * Every value except NL_VALIDATE_LIBERAL is a distinct bit, so the values
 * can be OR-ed together into a validation mask.
 */
enum netlink_validation {
        NL_VALIDATE_LIBERAL = 0,
        NL_VALIDATE_TRAILING = BIT(0),
        NL_VALIDATE_MAXTYPE = BIT(1),
        NL_VALIDATE_UNSPEC = BIT(2),
        NL_VALIDATE_STRICT_ATTRS = BIT(3),
        NL_VALIDATE_NESTED = BIT(4),
};

/*
 * Validation mask used by the *_deprecated_strict() helpers: reject
 * trailing data and attribute types above maxtype, nothing more.
 */
#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
                                       NL_VALIDATE_MAXTYPE)
/* Validation mask with every enum netlink_validation bit set. */
#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
                            NL_VALIDATE_MAXTYPE |\
                            NL_VALIDATE_UNSPEC |\
                            NL_VALIDATE_STRICT_ATTRS |\
                            NL_VALIDATE_NESTED)

/*
 * Out-of-line entry points. Only the prototypes appear here; the
 * definitions are provided elsewhere.
 */
int netlink_rcv_skb(struct sk_buff *skb,
                    int (*cb)(struct sk_buff *, struct nlmsghdr *,
                              struct netlink_ext_ack *));
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
                 unsigned int group, int report, gfp_t flags);

/*
 * Core attribute validation and parsing. @validate is a mask of
 * enum netlink_validation bits; the inline nla_parse()/nla_validate()
 * style wrappers in this header pass a fixed mask.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
                   const struct nla_policy *policy, unsigned int validate,
                   struct netlink_ext_ack *extack);
int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
                int len, const struct nla_policy *policy, unsigned int validate,
                struct netlink_ext_ack *extack);
int nla_policy_len(const struct nla_policy *, int);
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype);
size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize);
char *nla_strdup(const struct nlattr *nla, gfp_t flags);
int nla_memcpy(void *dest, const struct nlattr *src, int count);
int nla_memcmp(const struct nlattr *nla, const void *data, size_t size);
int nla_strcmp(const struct nlattr *nla, const char *str);
/*
 * Helpers operating on an skb under construction. The __-prefixed
 * variants return a pointer or void rather than an int status.
 */
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                   int attrlen, int padattr);
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                 int attrlen, int padattr);
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
               const void *data);
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                     const void *data, int padattr);
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                  const void *data, int padattr);
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_append(struct sk_buff *skb, int attrlen, const void *data);

/**************************************************************************
 * Netlink Messages
 **************************************************************************/

/**
 * nlmsg_msg_size - size of a netlink message without tail padding
 * @payload: number of payload bytes
 *
 * Returns the payload length plus NLMSG_HDRLEN.
 */
static inline int nlmsg_msg_size(int payload)
{
        int size = payload;

        size += NLMSG_HDRLEN;
        return size;
}

/**
 * nlmsg_total_size - size of a netlink message with tail padding
 * @payload: number of payload bytes
 *
 * Returns nlmsg_msg_size(@payload) rounded up by NLMSG_ALIGN().
 */
static inline int nlmsg_total_size(int payload)
{
        int unpadded = nlmsg_msg_size(payload);

        return NLMSG_ALIGN(unpadded);
}

/**
 * nlmsg_padlen - number of padding bytes at the end of a message
 * @payload: number of payload bytes
 *
 * Returns the difference between the padded and the unpadded message size.
 */
static inline int nlmsg_padlen(int payload)
{
        int padded = nlmsg_total_size(payload);
        int unpadded = nlmsg_msg_size(payload);

        return padded - unpadded;
}

/**
 * nlmsg_data - start of the message payload
 * @nlh: netlink message header
 *
 * Returns a pointer NLMSG_HDRLEN bytes past @nlh.
 */
static inline void *nlmsg_data(const struct nlmsghdr *nlh)
{
        unsigned char *base = (unsigned char *) nlh;

        return base + NLMSG_HDRLEN;
}

/**
 * nlmsg_len - payload length of a message
 * @nlh: netlink message header
 *
 * Returns the header's nlmsg_len field minus NLMSG_HDRLEN.
 */
static inline int nlmsg_len(const struct nlmsghdr *nlh)
{
        unsigned int total = nlh->nlmsg_len;

        return total - NLMSG_HDRLEN;
}

/**
 * nlmsg_attrdata - start of the attribute stream in a message
 * @nlh: netlink message header
 * @hdrlen: length of the family specific header
 *
 * Returns a pointer to the first byte after the family header, whose
 * length is rounded up by NLMSG_ALIGN().
 */
static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh,
                                            int hdrlen)
{
        return (struct nlattr *) ((unsigned char *) nlmsg_data(nlh) +
                                  NLMSG_ALIGN(hdrlen));
}

/**
 * nlmsg_attrlen - length of the attribute stream in a message
 * @nlh: netlink message header
 * @hdrlen: length of the family specific header
 *
 * Returns the payload length minus the NLMSG_ALIGN()ed family header
 * length.
 */
static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen)
{
        int payload = nlmsg_len(nlh);

        return payload - NLMSG_ALIGN(hdrlen);
}

/**
 * nlmsg_ok - does a complete netlink message fit in the remaining bytes?
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Returns non-zero only if @remaining can hold a header, the header's
 * nlmsg_len is at least a header, and nlmsg_len does not exceed
 * @remaining.
 */
static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining)
{
        /* Not even room for a header: @nlh must not be dereferenced. */
        if (remaining < (int) sizeof(struct nlmsghdr))
                return 0;

        if (nlh->nlmsg_len < sizeof(struct nlmsghdr))
                return 0;

        return nlh->nlmsg_len <= remaining;
}

/**
 * nlmsg_next - advance to the following message in a stream
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Subtracts the NLMSG_ALIGN()ed size of the current message from
 * *@remaining and returns a pointer that many bytes past @nlh.
 */
static inline struct nlmsghdr *
nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
{
        int step = NLMSG_ALIGN(nlh->nlmsg_len);
        unsigned char *cursor = (unsigned char *) nlh;

        *remaining = *remaining - step;
        cursor += step;

        return (struct nlmsghdr *) cursor;
}

/**
 * nla_parse - parse an attribute stream into a table, strictly validated
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each attribute of the stream in @tb, indexed by
 * attribute type. A policy must be given, attributes with a type above
 * @maxtype are rejected, and validation is as strict as possible
 * (NL_VALIDATE_STRICT).
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_parse(struct nlattr **tb, int maxtype,
                            const struct nlattr *head, int len,
                            const struct nla_policy *policy,
                            struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, strictness,
                           extack);
}

/**
 * nla_parse_deprecated - parse an attribute stream into a table, liberally
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each attribute of the stream in @tb, indexed by
 * attribute type. Attributes with a type above @maxtype are ignored, and
 * policy entries are not always strictly validated (only for new
 * attributes). Uses NL_VALIDATE_LIBERAL.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype,
                                       const struct nlattr *head, int len,
                                       const struct nla_policy *policy,
                                       struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_LIBERAL;

        return __nla_parse(tb, maxtype, head, len, policy, strictness,
                           extack);
}

/**
 * nla_parse_deprecated_strict - parse an attribute stream, old strict mode
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each attribute of the stream in @tb, indexed by
 * attribute type. Attributes with a type above @maxtype and trailing data
 * are rejected, but the policy itself is not completely strictly validated
 * (only for new attributes). Uses NL_VALIDATE_DEPRECATED_STRICT.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype,
                                              const struct nlattr *head,
                                              int len,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_DEPRECATED_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, strictness,
                           extack);
}

/**
 * __nlmsg_parse - parse the attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Fails with -EINVAL, and sets an extack message, when the message is too
 * short to contain the family specific header. Otherwise the attribute
 * stream following that header is handed to __nla_parse().
 *
 * See nla_parse()
 */
static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                                struct nlattr *tb[], int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack)
{
        const struct nlattr *attrs;
        int attrs_len;

        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
                NL_SET_ERR_MSG(extack, "Invalid header length");
                return -EINVAL;
        }

        attrs = nlmsg_attrdata(nlh, hdrlen);
        attrs_len = nlmsg_attrlen(nlh, hdrlen);

        return __nla_parse(tb, maxtype, attrs, attrs_len, policy, validate,
                           extack);
}

/**
 * nlmsg_parse - parse the attributes of a netlink message, strictly
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_STRICT.
 *
 * See nla_parse()
 */
static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, strictness,
                             extack);
}

/**
 * nlmsg_parse_deprecated - parse the attributes of a netlink message, liberally
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_LIBERAL.
 *
 * See nla_parse_deprecated()
 */
static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen,
                                         struct nlattr *tb[], int maxtype,
                                         const struct nla_policy *policy,
                                         struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_LIBERAL;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, strictness,
                             extack);
}

/**
 * nlmsg_parse_deprecated_strict - parse message attributes, old strict mode
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_DEPRECATED_STRICT.
 *
 * See nla_parse_deprecated_strict()
 */
static inline int
nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_DEPRECATED_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, strictness,
                             extack);
}

/**
 * nlmsg_find_attr - look up an attribute in a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @attrtype: type of attribute to look for
 *
 * Returns the first attribute which matches the specified type.
 */
static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh,
                                             int hdrlen, int attrtype)
{
        const struct nlattr *attrs = nlmsg_attrdata(nlh, hdrlen);
        int attrs_len = nlmsg_attrlen(nlh, hdrlen);

        return nla_find(attrs, attrs_len, attrtype);
}

/**
 * nla_validate_deprecated - validate an attribute stream in liberal mode
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy using
 * NL_VALIDATE_LIBERAL.
 * See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_validate_deprecated(const struct nlattr *head, int len,
                                          int maxtype,
                                          const struct nla_policy *policy,
                                          struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_LIBERAL;

        return __nla_validate(head, len, maxtype, policy, strictness, extack);
}

/**
 * nla_validate - validate an attribute stream in strict mode
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy using
 * NL_VALIDATE_STRICT.
 * See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_validate(const struct nlattr *head, int len, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_STRICT;

        return __nla_validate(head, len, maxtype, policy, strictness, extack);
}

/**
 * nlmsg_validate_deprecated - validate a netlink message including attributes
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Returns -EINVAL, without setting an extack message, when the message is
 * too short to contain the family specific header. Otherwise returns the
 * result of validating the attribute stream with NL_VALIDATE_LIBERAL.
 */
static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh,
                                            int hdrlen, int maxtype,
                                            const struct nla_policy *policy,
                                            struct netlink_ext_ack *extack)
{
        const struct nlattr *attrs;
        int attrs_len;

        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
                return -EINVAL;

        attrs = nlmsg_attrdata(nlh, hdrlen);
        attrs_len = nlmsg_attrlen(nlh, hdrlen);

        return __nla_validate(attrs, attrs_len, maxtype, policy,
                              NL_VALIDATE_LIBERAL, extack);
}



/**
 * nlmsg_report - was a report back to the application requested?
 * @nlh: netlink message header
 *
 * Returns 1 if NLM_F_ECHO is set in the message flags, 0 otherwise.
 */
static inline int nlmsg_report(const struct nlmsghdr *nlh)
{
        if (nlh->nlmsg_flags & NLM_F_ECHO)
                return 1;

        return 0;
}

/**
 * nlmsg_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_attr() over the attribute stream that follows
 * the family specific header. @nlh and @hdrlen each appear twice in the
 * expansion, so they should be free of side effects.
 */
#define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
        nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \
                          nlmsg_attrlen(nlh, hdrlen), rem)

/**
 * nlmsg_put - append a new netlink message to an skb
 * @skb: socket buffer to store message in
 * @portid: netlink PORTID of requesting application
 * @seq: sequence number of message
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * Returns NULL if the tailroom of the skb is smaller than the padded
 * message size for @payload; otherwise returns the result of
 * __nlmsg_put().
 */
static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                                         int type, int payload, int flags)
{
        struct nlmsghdr *nlh = NULL;

        if (unlikely(skb_tailroom(skb) < nlmsg_total_size(payload)))
                return nlh;

        nlh = __nlmsg_put(skb, portid, seq, type, payload, flags);
        return nlh;
}

/**
 * nlmsg_put_answer - append a callback based netlink message to an skb
 * @skb: socket buffer to store message in
 * @cb: netlink callback
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * The port id is taken from the control block of @cb->skb and the
 * sequence number from @cb->nlh; everything else is passed to
 * nlmsg_put() unchanged.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the message header and payload.
 */
static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb,
                                                struct netlink_callback *cb,
                                                int type, int payload,
                                                int flags)
{
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
                        type, payload, flags);
        return nlh;
}

/**
 * nlmsg_new - allocate an skb for a new netlink message
 * @payload: size of the message payload
 * @flags: the type of memory to allocate.
 *
 * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known
 * and a good default is needed.
 *
 * The allocation size is nlmsg_total_size(@payload). That helper takes an
 * int, so @payload is converted from size_t to int on the way in.
 */
static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags)
{
        int skb_size = nlmsg_total_size(payload);

        return alloc_skb(skb_size, flags);
}

/**
 * nlmsg_end - finalize a netlink message
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Sets the header's nlmsg_len to the distance between @nlh and the
 * current skb tail, so that appended attributes are included. Only
 * necessary if attributes have been added to the message.
 */
static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        unsigned char *tail = skb_tail_pointer(skb);
        unsigned char *start = (unsigned char *)nlh;

        nlh->nlmsg_len = tail - start;
}

/**
 * nlmsg_get_pos - current write position in a netlink message
 * @skb: socket buffer the message is stored in
 *
 * Returns the skb tail pointer, i.e. the current end of the message.
 */
static inline void *nlmsg_get_pos(struct sk_buff *skb)
{
        void *tail = skb_tail_pointer(skb);

        return tail;
}

/**
 * nlmsg_trim - cut a message back to a previously saved position
 * @skb: socket buffer the message is stored in
 * @mark: mark to trim to
 *
 * Does nothing when @mark is NULL. Otherwise the skb is trimmed so that
 * it ends at @mark. A @mark below skb->data triggers WARN_ON(), but the
 * trim is still carried out.
 */
static inline void nlmsg_trim(struct sk_buff *skb, const void *mark)
{
        unsigned char *pos = (unsigned char *) mark;

        if (!mark)
                return;

        WARN_ON(pos < skb->data);
        skb_trim(skb, pos - skb->data);
}

/**
 * nlmsg_cancel - abandon a netlink message under construction
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Trims the skb back to @nlh, which removes the complete message
 * including all of its attributes.
 */
static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        const void *msg_start = nlh;

        nlmsg_trim(skb, msg_start);
}

/**
 * nlmsg_free - free a netlink message
 * @skb: socket buffer of netlink message
 *
 * Thin wrapper that hands @skb to kfree_skb().
 */
static inline void nlmsg_free(struct sk_buff *skb)
{
        kfree_skb(skb);
}

/**
 * nlmsg_multicast - multicast a netlink message
 * @sk: netlink socket to spread messages to
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: multicast group id
 * @flags: allocation flags
 *
 * Records @group as the destination group in the skb control block and
 * then broadcasts the message. Positive results from netlink_broadcast()
 * are reported as 0; zero and negative results are returned unchanged.
 */
static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb,
                                  u32 portid, unsigned int group, gfp_t flags)
{
        int ret;

        NETLINK_CB(skb).dst_group = group;
        ret = netlink_broadcast(sk, skb, portid, group, flags);

        return ret > 0 ? 0 : ret;
}

/**
 * nlmsg_unicast - unicast a netlink message
 * @sk: netlink socket to spread message to
 * @skb: netlink message as socket buffer
 * @portid: netlink portid of the destination socket
 *
 * Sends with MSG_DONTWAIT. Positive results from netlink_unicast() are
 * reported as 0; zero and negative results are returned unchanged.
 */
static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid)
{
        int ret = netlink_unicast(sk, skb, portid, MSG_DONTWAIT);

        return ret > 0 ? 0 : ret;
}

/**
 * nlmsg_for_each_msg - iterate over a stream of messages
 * @pos: loop counter, set to current message
 * @head: head of message stream
 * @len: length of message stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to a for loop header. The loop runs while nlmsg_ok() accepts
 * the current message and advances with nlmsg_next(). @pos and @rem are
 * assigned to and expanded several times, so they must be plain lvalues.
 */
#define nlmsg_for_each_msg(pos, head, len, rem) \
        for (pos = head, rem = len; \
             nlmsg_ok(pos, rem); \
             pos = nlmsg_next(pos, &(rem)))

/**
 * nl_dump_check_consistent - flag a dump whose sequence number changed
 * @cb: netlink callback structure that stores the sequence number
 * @nlh: netlink message header to write the flag to
 *
 * Compares the current sequence (generation) number in @cb with the one
 * seen on the previous call. If a previous one was recorded and it
 * differs, NLM_F_DUMP_INTR is set in the message header. The current
 * number is then remembered for the next call.
 *
 * The correct way to use it is to set cb->seq to the generation counter when
 * all locks for dumping have been acquired, and then call this function for
 * each message that is generated.
 *
 * Note that due to initialisation concerns, 0 is an invalid sequence number
 * and must not be used by code that uses this functionality.
 */
static inline void
nl_dump_check_consistent(struct netlink_callback *cb,
                         struct nlmsghdr *nlh)
{
        /* A prev_seq of 0 means no earlier call has recorded a value. */
        if (cb->prev_seq != 0 && cb->prev_seq != cb->seq)
                nlh->nlmsg_flags = nlh->nlmsg_flags | NLM_F_DUMP_INTR;

        cb->prev_seq = cb->seq;
}

/**************************************************************************
 * Netlink Attributes
 **************************************************************************/

/**
 * nla_attr_size - size of an attribute without tail padding
 * @payload: length of payload
 *
 * Returns the payload length plus NLA_HDRLEN.
 */
static inline int nla_attr_size(int payload)
{
        int size = payload;

        size += NLA_HDRLEN;
        return size;
}

/**
 * nla_total_size - size of an attribute with tail padding
 * @payload: length of payload
 *
 * Returns nla_attr_size(@payload) rounded up by NLA_ALIGN().
 */
static inline int nla_total_size(int payload)
{
        int unpadded = nla_attr_size(payload);

        return NLA_ALIGN(unpadded);
}

/**
 * nla_padlen - number of padding bytes at the end of an attribute
 * @payload: length of payload
 *
 * Returns the difference between the padded and the unpadded attribute
 * size.
 */
static inline int nla_padlen(int payload)
{
        int padded = nla_total_size(payload);
        int unpadded = nla_attr_size(payload);

        return padded - unpadded;
}

/**
 * nla_type - attribute type with the flag bits removed
 * @nla: netlink attribute
 *
 * Returns the attribute's nla_type field masked with NLA_TYPE_MASK.
 */
static inline int nla_type(const struct nlattr *nla)
{
        int raw = nla->nla_type;

        return raw & NLA_TYPE_MASK;
}

/**
 * nla_data - start of the attribute payload
 * @nla: netlink attribute
 *
 * Returns a pointer NLA_HDRLEN bytes past @nla.
 */
static inline void *nla_data(const struct nlattr *nla)
{
        char *base = (char *) nla;

        return base + NLA_HDRLEN;
}

/**
 * nla_len - payload length of an attribute
 * @nla: netlink attribute
 *
 * Returns the attribute's nla_len field minus NLA_HDRLEN.
 */
static inline int nla_len(const struct nlattr *nla)
{
        int total = nla->nla_len;

        return total - NLA_HDRLEN;
}

/**
 * nla_ok - does a complete attribute fit in the remaining bytes?
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Returns non-zero only if @remaining can hold an attribute header, the
 * attribute's nla_len is at least a header, and nla_len does not exceed
 * @remaining.
 */
static inline int nla_ok(const struct nlattr *nla, int remaining)
{
        /* Not even room for a header: @nla must not be dereferenced. */
        if (remaining < (int) sizeof(*nla))
                return 0;

        if (nla->nla_len < sizeof(*nla))
                return 0;

        return nla->nla_len <= remaining;
}

/**
 * nla_next - advance to the following attribute in a stream
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Subtracts the NLA_ALIGN()ed size of the current attribute from
 * *@remaining and returns a pointer that many bytes past @nla.
 */
static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
{
        unsigned int step = NLA_ALIGN(nla->nla_len);
        char *cursor = (char *) nla;

        *remaining = *remaining - step;
        cursor += step;

        return (struct nlattr *) cursor;
}

/**
 * nla_find_nested - look up an attribute inside a nested attribute
 * @nla: attribute containing the nested attributes
 * @attrtype: type of attribute to look for
 *
 * Runs nla_find() over the payload of @nla and returns its result.
 */
static inline struct nlattr *
nla_find_nested(const struct nlattr *nla, int attrtype)
{
        const struct nlattr *head = nla_data(nla);
        int len = nla_len(nla);

        return nla_find(head, len, attrtype);
}

/**
 * nla_parse_nested - parse nested attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Rejects @nla with -EINVAL (and an extack message) when NLA_F_NESTED is
 * not set in its type; otherwise parses the payload of @nla with
 * NL_VALIDATE_STRICT. See nla_parse()
 */
static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
                                   const struct nlattr *nla,
                                   const struct nla_policy *policy,
                                   struct netlink_ext_ack *extack)
{
        if (nla->nla_type & NLA_F_NESTED)
                return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla),
                                   policy, NL_VALIDATE_STRICT, extack);

        NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing");
        return -EINVAL;
}

/**
 * nla_parse_nested_deprecated - parse nested attributes, liberal validation
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Parses the payload of @nla with NL_VALIDATE_LIBERAL and does not check
 * for NLA_F_NESTED. See nla_parse_deprecated()
 */
static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype,
                                              const struct nlattr *nla,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const struct nlattr *head = nla_data(nla);
        int len = nla_len(nla);

        return __nla_parse(tb, maxtype, head, len, policy,
                           NL_VALIDATE_LIBERAL, extack);
}

/**
 * nla_put_u8 - append a u8 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value)
{
        /* local copy works around GCC PR81715 with asan-stack=1 */
        u8 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_u16 - append a u16 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value)
{
        /* hand nla_put() the address of a local copy, not of the parameter */
        u16 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_be16 - append a __be16 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value)
{
        /* hand nla_put() the address of a local copy, not of the parameter */
        __be16 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_net16 - append a 16-bit network byte order attribute
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be16() with NLA_F_NET_BYTEORDER ORed into @attrtype.
 */
static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value)
{
        __be16 payload = value;
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;

        return nla_put_be16(skb, flagged_type, payload);
}

/**
 * nla_put_le16 - append a __le16 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value)
{
        /* hand nla_put() the address of a local copy, not of the parameter */
        __le16 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_u32 - append a u32 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
{
        /* hand nla_put() the address of a local copy, not of the parameter */
        u32 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_be32 - append a __be32 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value)
{
        /* hand nla_put() the address of a local copy, not of the parameter */
        __be32 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_net32 - append a 32-bit network byte order attribute
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be32() with NLA_F_NET_BYTEORDER ORed into @attrtype.
 */
static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value)
{
        __be32 payload = value;
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;

        return nla_put_be32(skb, flagged_type, payload);
}

/**
 * nla_put_le32 - append a __le32 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value)
{
        /* hand nla_put() the address of a local copy, not of the parameter */
        __le32 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_u64_64bit - append a u64 attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns the result of nla_put_64bit().
 */
static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype,
                                    u64 value, int padattr)
{
        /* hand nla_put_64bit() the address of a local copy */
        u64 payload = value;

        return nla_put_64bit(skb, attrtype, sizeof(payload), &payload,
                             padattr);
}

/**
 * nla_put_be64 - append a __be64 attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns the result of nla_put_64bit().
 */
static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
                               int padattr)
{
        /* hand nla_put_64bit() the address of a local copy */
        __be64 payload = value;

        return nla_put_64bit(skb, attrtype, sizeof(payload), &payload,
                             padattr);
}

/**
 * nla_put_net64 - append a 64-bit network byte order attribute and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Same as nla_put_be64() with NLA_F_NET_BYTEORDER ORed into @attrtype.
 */
static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
                                int padattr)
{
        __be64 payload = value;
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;

        return nla_put_be64(skb, flagged_type, payload, padattr);
}

/**
 * nla_put_le64 - append a __le64 attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns the result of nla_put_64bit().
 */
static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value,
                               int padattr)
{
        /* hand nla_put_64bit() the address of a local copy */
        __le64 payload = value;

        return nla_put_64bit(skb, attrtype, sizeof(payload), &payload,
                             padattr);
}

/**
 * nla_put_s8 - append a s8 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value)
{
        /* hand nla_put() the address of a local copy, not of the parameter */
        s8 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_s16 - append a s16 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value)
{
        /* hand nla_put() the address of a local copy, not of the parameter */
        s16 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_s32 - append a s32 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns the result of nla_put().
 */
static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value)
{
        /* hand nla_put() the address of a local copy, not of the parameter */
        s32 payload = value;

        return nla_put(skb, attrtype, sizeof(payload), &payload);
}

/**
 * nla_put_s64 - append a s64 attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns the result of nla_put_64bit().
 */
static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value,
                              int padattr)
{
        /* hand nla_put_64bit() the address of a local copy */
        s64 payload = value;

        return nla_put_64bit(skb, attrtype, sizeof(payload), &payload,
                             padattr);
}

/**
 * nla_put_string - append a string attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @str: NUL terminated string
 *
 * The payload is strlen(@str) + 1 bytes, i.e. it includes the
 * terminating NUL. Returns the result of nla_put().
 */
static inline int nla_put_string(struct sk_buff *skb, int attrtype,
                                 const char *str)
{
        size_t len_with_nul = strlen(str) + 1;

        return nla_put(skb, attrtype, len_with_nul, str);
}

/**
 * nla_put_flag - Add a flag netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 *
 * A flag attribute carries no payload: nla_put() is called with a length
 * of 0 and a NULL data pointer. Returns the result of nla_put().
 */
static inline int nla_put_flag(struct sk_buff *skb, int attrtype)
{
        return nla_put(skb, attrtype, 0, NULL);
}

/**
 * nla_put_msecs - append a msecs attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @njiffies: number of jiffies to convert to msecs
 * @padattr: attribute type for the padding
 *
 * Converts @njiffies with jiffies_to_msecs() and stores the result as a
 * u64 payload. Returns the result of nla_put_64bit().
 */
static inline int nla_put_msecs(struct sk_buff *skb, int attrtype,
                                unsigned long njiffies, int padattr)
{
        u64 msecs = jiffies_to_msecs(njiffies);

        return nla_put_64bit(skb, attrtype, sizeof(msecs), &msecs, padattr);
}

/**
 * nla_put_in_addr - append an IPv4 address attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv4 address
 *
 * Returns the result of nla_put_be32().
 */
static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype,
                                  __be32 addr)
{
        __be32 ipv4 = addr;

        return nla_put_be32(skb, attrtype, ipv4);
}

/**
 * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket
 * buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv6 address
 */
static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype,
                                   const struct in6_addr *addr)
{
        return nla_put(skb, attrtype, sizeof(*addr), addr);
}

/**
 * nla_put_bitfield32 - append a bitfield32 attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: value carrying bits
 * @selector: selector of valid bits
 *
 * Packs @value and @selector, in that order, into a struct nla_bitfield32
 * and emits it as the payload. Returns the result of nla_put().
 */
static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype,
                                     __u32 value, __u32 selector)
{
        struct nla_bitfield32 bits = { value, selector, };

        return nla_put(skb, attrtype, sizeof(bits), &bits);
}

/**
 * nla_get_u32 - read the payload of a u32 attribute
 * @nla: u32 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline u32 nla_get_u32(const struct nlattr *nla)
{
        const u32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be32 - read the payload of a __be32 attribute
 * @nla: __be32 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline __be32 nla_get_be32(const struct nlattr *nla)
{
        const __be32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le32 - read the payload of a __le32 attribute
 * @nla: __le32 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline __le32 nla_get_le32(const struct nlattr *nla)
{
        const __le32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u16 - read the payload of a u16 attribute
 * @nla: u16 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline u16 nla_get_u16(const struct nlattr *nla)
{
        const u16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be16 - read the payload of a __be16 attribute
 * @nla: __be16 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline __be16 nla_get_be16(const struct nlattr *nla)
{
        const __be16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le16 - read the payload of a __le16 attribute
 * @nla: __le16 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline __le16 nla_get_le16(const struct nlattr *nla)
{
        const __le16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u8 - read the payload of a u8 attribute
 * @nla: u8 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline u8 nla_get_u8(const struct nlattr *nla)
{
        const u8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u64 - read the payload of a u64 attribute
 * @nla: u64 netlink attribute
 *
 * The value is copied out with nla_memcpy() rather than loaded through
 * a cast pointer.
 */
static inline u64 nla_get_u64(const struct nlattr *nla)
{
        u64 value;

        nla_memcpy(&value, nla, sizeof(value));
        return value;
}

/**
 * nla_get_be64 - read the payload of a __be64 attribute
 * @nla: __be64 netlink attribute
 *
 * The value is copied out with nla_memcpy() rather than loaded through
 * a cast pointer.
 */
static inline __be64 nla_get_be64(const struct nlattr *nla)
{
        __be64 value;

        nla_memcpy(&value, nla, sizeof(value));
        return value;
}

/**
 * nla_get_le64 - return payload of __le64 attribute
 * @nla: __le64 netlink attribute
 *
 * The payload is copied out with nla_memcpy(), exactly like
 * nla_get_u64(), nla_get_be64() and nla_get_s64(), instead of being
 * loaded through a cast pointer: nla_data() is not guaranteed to be
 * 64-bit aligned (see nla_need_padding_for_64bit()), so a direct 64-bit
 * load could be a misaligned access.
 */
static inline __le64 nla_get_le64(const struct nlattr *nla)
{
        __le64 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));

        return tmp;
}

/**
 * nla_get_s32 - read the payload of a s32 attribute
 * @nla: s32 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline s32 nla_get_s32(const struct nlattr *nla)
{
        const s32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s16 - read the payload of a s16 attribute
 * @nla: s16 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline s16 nla_get_s16(const struct nlattr *nla)
{
        const s16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s8 - read the payload of a s8 attribute
 * @nla: s8 netlink attribute
 *
 * Loads the value directly from nla_data(@nla).
 */
static inline s8 nla_get_s8(const struct nlattr *nla)
{
        const s8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s64 - read the payload of a s64 attribute
 * @nla: s64 netlink attribute
 *
 * The value is copied out with nla_memcpy() rather than loaded through
 * a cast pointer.
 */
static inline s64 nla_get_s64(const struct nlattr *nla)
{
        s64 value;

        nla_memcpy(&value, nla, sizeof(value));
        return value;
}

/**
 * nla_get_flag - report whether a flag attribute is present
 * @nla: flag netlink attribute
 *
 * Returns 1 when @nla is a non-NULL pointer and 0 when it is NULL; the
 * attribute itself is never dereferenced.
 */
static inline int nla_get_flag(const struct nlattr *nla)
{
        return nla != NULL;
}

/**
 * nla_get_msecs - read a msecs attribute and convert it to jiffies
 * @nla: msecs netlink attribute
 *
 * Reads the u64 payload, casts it to unsigned long and returns the
 * result of msecs_to_jiffies() on that value.
 */
static inline unsigned long nla_get_msecs(const struct nlattr *nla)
{
        u64 raw_msecs = nla_get_u64(nla);
        unsigned long narrowed = (unsigned long) raw_msecs;

        return msecs_to_jiffies(narrowed);
}

/**
 * nla_get_in_addr - read the payload of an IPv4 address attribute
 * @nla: IPv4 address netlink attribute
 *
 * Loads the __be32 value directly from nla_data(@nla).
 */
static inline __be32 nla_get_in_addr(const struct nlattr *nla)
{
        const __be32 *ipv4 = nla_data(nla);

        return *ipv4;
}

/**
 * nla_get_in6_addr - return payload of IPv6 address attribute
 * @nla: IPv6 address netlink attribute
 */
static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
{
        struct in6_addr tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_get_bitfield32 - return payload of 32 bitfield attribute
 * @nla: nla_bitfield32 attribute
 */
static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla)
{
        struct nla_bitfield32 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_memdup - duplicate an attribute's payload with kmemdup()
 * @src: netlink attribute to duplicate from
 * @gfp: GFP mask
 *
 * Returns the result of kmemdup() on nla_len(@src) bytes starting at
 * nla_data(@src).
 */
static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp)
{
        const void *payload = nla_data(src);
        int len = nla_len(src);

        return kmemdup(payload, len, gfp);
}

/**
 * nla_nest_start_noflag - open a new level of nested attributes
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * Kept for APIs that never marked their nest attributes with
 * NLA_F_NESTED. New APIs should use nla_nest_start(), which sets the flag.
 *
 * The container is the empty attribute emitted at the skb tail as it was
 * on entry. Returns the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb,
                                                   int attrtype)
{
        /* remember the tail before nla_put() moves it */
        struct nlattr *container = (struct nlattr *)skb_tail_pointer(skb);
        int err = nla_put(skb, attrtype, 0, NULL);

        return err < 0 ? NULL : container;
}

/**
 * nla_nest_start - open a new level of nested attributes, with NLA_F_NESTED
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * Identical to nla_nest_start_noflag() except that NLA_F_NESTED is ORed
 * into @attrtype first. This is the preferred function to use in new code.
 *
 * Returns the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
{
        int nested_type = attrtype | NLA_F_NESTED;

        return nla_nest_start_noflag(skb, nested_type);
}

/**
 * nla_nest_end - finalize nesting of attributes
 * @skb: socket buffer the attributes are stored in
 * @start: container attribute
 *
 * Rewrites the container's nla_len so that it spans from @start to the
 * current skb tail, i.e. it covers all appended attributes.
 *
 * Returns the total data length of the skb.
 */
static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start)
{
        unsigned char *tail = skb_tail_pointer(skb);

        start->nla_len = tail - (unsigned char *)start;

        return skb->len;
}

/**
 * nla_nest_cancel - Cancel nesting of attributes
 * @skb: socket buffer the message is stored in
 * @start: container attribute
 *
 * Hands @start to nlmsg_trim(), which is expected to drop the container
 * attribute together with everything nested in it. This function itself
 * returns nothing.
 */
static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
{
        nlmsg_trim(skb, start);
}

/**
 * __nla_validate_nested - validate a stream of nested attributes
 * @start: container attribute
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Validates all attributes in the nested attribute stream against the
 * specified policy. Attributes with a type exceeding maxtype will be
 * ignored. See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int __nla_validate_nested(const struct nlattr *start, int maxtype,
                                        const struct nla_policy *policy,
                                        unsigned int validate,
                                        struct netlink_ext_ack *extack)
{
        const struct nlattr *head = nla_data(start);
        int len = nla_len(start);

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nla_validate_nested - validate nested attributes, strict mode
 * @start: container attribute
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nla_validate_nested() with NL_VALIDATE_STRICT and returns its
 * result.
 */
static inline int
nla_validate_nested(const struct nlattr *start, int maxtype,
                    const struct nla_policy *policy,
                    struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_STRICT;

        return __nla_validate_nested(start, maxtype, policy, strictness,
                                     extack);
}

/**
 * nla_validate_nested_deprecated - validate nested attributes, liberal mode
 * @start: container attribute
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nla_validate_nested() with NL_VALIDATE_LIBERAL and returns its
 * result.
 */
static inline int
nla_validate_nested_deprecated(const struct nlattr *start, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        const unsigned int strictness = NL_VALIDATE_LIBERAL;

        return __nla_validate_nested(start, maxtype, policy, strictness,
                                     extack);
}

/**
 * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute
 * @skb: socket buffer the message is stored in
 *
 * Return true if padding is needed to align the next attribute (nla_data()) to
 * a 64-bit aligned area. Always false when
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined.
 */
static inline bool nla_need_padding_for_64bit(struct sk_buff *skb)
{
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        unsigned long tail = (unsigned long)skb_tail_pointer(skb);

        /* The nlattr header is 4 bytes in size, so padding is wanted
         * exactly when the tail _is_ 8-byte aligned: a NOP attribute
         * plus the header of the next attribute then puts nla_data()
         * on an 8-byte boundary.
         */
        return IS_ALIGNED(tail, 8);
#else
        return false;
#endif
}

/**
 * nla_align_64bit - 64-bit align the nla_data() of next attribute
 * @skb: socket buffer the message is stored in
 * @padattr: attribute type for the padding
 *
 * When nla_need_padding_for_64bit() asks for it, reserve an empty
 * attribute of type @padattr so that the next attribute's nla_data()
 * area is 64-bit aligned. Nothing is emitted on architectures that
 * define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
 *
 * Returns zero on success or -EMSGSIZE when nla_reserve() fails.
 */
static inline int nla_align_64bit(struct sk_buff *skb, int padattr)
{
        if (!nla_need_padding_for_64bit(skb))
                return 0;

        if (!nla_reserve(skb, padattr, 0))
                return -EMSGSIZE;

        return 0;
}

/**
 * nla_total_size_64bit - total length of attribute including padding
 * @payload: length of payload
 *
 * Unless CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined, the aligned
 * size of one empty attribute is added on top of the aligned attribute
 * size.
 */
static inline int nla_total_size_64bit(int payload)
{
        int total = NLA_ALIGN(nla_attr_size(payload));

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        total += NLA_ALIGN(nla_attr_size(0));
#endif

        return total;
}

/**
 * nla_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to a for statement header: the loop runs for as long as
 * nla_ok(@pos, @rem) holds and advances with nla_next(), which also
 * decrements @rem. Both @pos and @rem are assigned to, so they must be
 * lvalues.
 */
#define nla_for_each_attr(pos, head, len, rem) \
        for (pos = head, rem = len; \
             nla_ok(pos, rem); \
             pos = nla_next(pos, &(rem)))

/**
 * nla_for_each_nested - iterate over nested attributes
 * @pos: loop counter, set to current attribute
 * @nla: attribute containing the nested attributes
 * @rem: initialized to nla_len(@nla), holds bytes currently remaining in
 *       stream
 *
 * Shorthand for nla_for_each_attr() over the payload of @nla.
 */
#define nla_for_each_nested(pos, nla, rem) \
        nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem)

/**
 * nla_is_last - test whether an attribute is the last one in a stream
 * @nla: attribute to test
 * @rem: bytes remaining in stream
 *
 * Returns true when the attribute's nla_len equals @rem.
 */
static inline bool nla_is_last(const struct nlattr *nla, int rem)
{
        int attr_len = nla->nla_len;

        return attr_len == rem;
}

/*
 * Range accessors for a policy entry. Prototypes only; no definitions
 * appear in this part of the file.
 */
void nla_get_range_unsigned(const struct nla_policy *pt,
                            struct netlink_range_validation *range);
void nla_get_range_signed(const struct nla_policy *pt,
                          struct netlink_range_validation_signed *range);

/* Only declared here, so users of this header can hold pointers to it. */
struct netlink_policy_dump_state;

/*
 * Policy dump interface (prototypes only). Every function operates on a
 * struct netlink_policy_dump_state, a struct nla_policy, or both.
 */
int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate,
                                   const struct nla_policy *policy,
                                   unsigned int maxtype);
int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state,
                                       const struct nla_policy *policy,
                                       unsigned int maxtype);
bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state);
int netlink_policy_dump_write(struct sk_buff *skb,
                              struct netlink_policy_dump_state *state);
int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt);
int netlink_policy_dump_write_attr(struct sk_buff *skb,
                                   const struct nla_policy *pt,
                                   int nestattr);
void netlink_policy_dump_free(struct netlink_policy_dump_state *state);

#endif



















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Berkeley style UIO structures        -        Alan Cox 1994.
 */
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H

#include <linux/kernel.h>
#include <linux/thread_info.h>
#include <uapi/linux/uio.h>

struct page;
struct pipe_inode_info;

/*
 * A (base, length) segment descriptor whose base pointer must be a
 * kernel pointer, as the note on iov_base says.
 */
struct kvec {
        void *iov_base; /* and that should *never* hold a userland pointer */
        size_t iov_len; /* presumably the segment length in bytes - confirm */
};

/*
 * Kinds of iov_iter. Each value is a distinct single bit (4 through 64);
 * iov_iter_type() recovers the kind from iov_iter.type by masking off
 * READ | WRITE.
 */
enum iter_type {
        /* iter types */
        ITER_IOVEC = 4,
        ITER_KVEC = 8,
        ITER_BVEC = 16,
        ITER_PIPE = 32,
        ITER_DISCARD = 64,
};

/*
 * Snapshot of the three iov_iter fields that iov_iter_save_state()
 * copies: iov_offset, count and nr_segs.
 */
struct iov_iter_state {
        size_t iov_offset;
        size_t count;
        unsigned long nr_segs;
};

/*
 * Iterator state. The kind of iterator and the READ/WRITE direction are
 * both packed into @type; see iov_iter_type() and iov_iter_rw().
 */
struct iov_iter {
        /*
         * Bit 0 is the read/write bit, set if we're writing.
         * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
         * the caller isn't expecting to drop a page reference when done.
         */
        unsigned int type;
        /* added to iov->iov_base by iov_iter_iovec() */
        size_t iov_offset;
        /* returned by iov_iter_count(), capped by iov_iter_truncate() */
        size_t count;
        /* presumably the member in use depends on the iterator kind - confirm */
        union {
                const struct iovec *iov;
                const struct kvec *kvec;
                const struct bio_vec *bvec;
                struct pipe_inode_info *pipe;
        };
        /* nr_segs shares storage with the head/start_head pair */
        union {
                unsigned long nr_segs;
                struct {
                        unsigned int head;
                        unsigned int start_head;
                };
        };
};

/* Kind of iterator: i->type with the READ | WRITE bits masked off. */
static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{
        unsigned int bits = i->type;

        return bits & ~(READ | WRITE);
}

/*
 * Copy the iov_offset, count and nr_segs fields of @iter into @state.
 * No other part of the iterator is recorded.
 */
static inline void iov_iter_save_state(struct iov_iter *iter,
                                       struct iov_iter_state *state)
{
        state->nr_segs = iter->nr_segs;
        state->count = iter->count;
        state->iov_offset = iter->iov_offset;
}

static inline bool iter_is_iovec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_IOVEC;
}

static inline bool iov_iter_is_kvec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_KVEC;
}

static inline bool iov_iter_is_bvec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_BVEC;
}

static inline bool iov_iter_is_pipe(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_PIPE;
}

static inline bool iov_iter_is_discard(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_DISCARD;
}

/* Direction of the iterator: the READ | WRITE bits of i->type. */
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
        unsigned int bits = i->type;

        return bits & (READ | WRITE);
}

/*
 * Sum of the iov_len fields of the first @nr_segs entries of @iov.
 *
 * NOTE: do not call this before every segment length has been
 * validated; the individual lengths can overflow a size_t when added
 * together.
 */
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
        size_t total = 0;

        while (nr_segs--)
                total += (iov++)->iov_len;

        return total;
}

static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
{
        return (struct iovec) {
                .iov_base = iter->iov->iov_base + iter->iov_offset,
                .iov_len = min(iter->count,
                               iter->iov->iov_len - iter->iov_offset),
        };
}

/*
 * Iterator primitives. Prototypes only; no definitions appear in this
 * part of the file.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
size_t iov_iter_single_seg_count(const struct iov_iter *i);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);

/*
 * Underscore-prefixed workers. The copy_*() inline wrappers in this
 * header call them after check_copy_size() has accepted the buffer.
 */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i);

/*
 * Checked front end for _copy_to_iter(): returns 0 without copying when
 * check_copy_size() rejects (@addr, @bytes).
 */
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, true)))
                return 0;

        return _copy_to_iter(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter(): returns 0 without copying
 * when check_copy_size() rejects (@addr, @bytes).
 */
static __always_inline __must_check
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return 0;

        return _copy_from_iter(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter_full(): returns false without
 * copying when check_copy_size() rejects (@addr, @bytes).
 */
static __always_inline __must_check
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return false;

        return _copy_from_iter_full(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter_nocache(): returns 0 without
 * copying when check_copy_size() rejects (@addr, @bytes).
 */
static __always_inline __must_check
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return 0;

        return _copy_from_iter_nocache(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter_full_nocache(): returns false
 * without copying when check_copy_size() rejects (@addr, @bytes).
 */
static __always_inline __must_check
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return false;

        return _copy_from_iter_full_nocache(addr, bytes, i);
}

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * Note, users like pmem that depend on the stricter semantics of
 * copy_from_iter_flushcache() than copy_from_iter_nocache() must check for
 * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
 * destination is flushed from the cache on return.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#else
/* No dedicated implementation: the name aliases the nocache variant. */
#define _copy_from_iter_flushcache _copy_from_iter_nocache
#endif

/*
 * _copy_mc_to_iter() has its own prototype only when
 * CONFIG_ARCH_HAS_COPY_MC is set; otherwise the name aliases
 * _copy_to_iter().
 */
#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_mc_to_iter _copy_to_iter
#endif

/*
 * Checked front end for _copy_from_iter_flushcache(): returns 0 without
 * copying when check_copy_size() rejects (@addr, @bytes).
 */
static __always_inline __must_check
size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, false)))
                return 0;

        return _copy_from_iter_flushcache(addr, bytes, i);
}

/*
 * Checked front end for _copy_mc_to_iter(): returns 0 without copying
 * when check_copy_size() rejects (@addr, @bytes).
 */
static __always_inline __must_check
size_t copy_mc_to_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (unlikely(!check_copy_size(addr, bytes, true)))
                return 0;

        return _copy_mc_to_iter(addr, bytes, i);
}

/*
 * Iterator set-up, query and page helpers. Prototypes only; no
 * definitions appear in this part of the file.
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
                        unsigned long nr_segs, size_t count);
void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe,
                        size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
                        size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
                        size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
/* takes the struct iov_iter_state that iov_iter_save_state() fills in */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);

/* Return the iterator's current ->count field. */
static inline size_t iov_iter_count(const struct iov_iter *i)
{
        const size_t remaining = i->count;

        return remaining;
}

/*
 * Clamp the iterator to at most @count bytes.  @count is an upper limit,
 * not a new size: if it is greater than or equal to the current i->count
 * the iterator is left untouched.
 */
static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
{
        /*
         * @count need not fit in size_t: the comparison widens both
         * operands to u64, and any value that would be truncated by the
         * conversion in the assignment below is by definition larger than
         * every size_t value, including the old i->count, so the
         * assignment is never reached for it.
         */
        if (i->count <= count)
                return;

        i->count = count;
}

/*
 * Undo an earlier truncation by setting i->count back to @count.  The caller
 * must not pass more than the iterator held before it was shrunk; nothing
 * here checks that.
 */
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
{
        i->count = count;
}

/*
 * Checksum accumulator paired with an offset.
 * NOTE(review): presumably the object csum_and_copy_to_iter() receives through
 * its void *csstate argument - confirm against the implementation.
 */
struct csum_state {
        __wsum csum;    /* checksum value */
        size_t off;     /* offset; exact meaning not visible here - TODO confirm */
};

/* Copy-with-checksum / copy-with-hash helpers; defined out of line. */
size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i);
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
                struct iov_iter *i);

/* Helpers taking __user iovec arrays or buffers; defined out of line. */
struct iovec *iovec_from_user(const struct iovec __user *uvector,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat);
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i);
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat);
int import_single_range(int type, void __user *buf, size_t len,
                 struct iovec *iov, struct iov_iter *i);

/* Takes a callback @f, invoked with a struct kvec and the opaque @context. */
int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
                            int (*f)(struct kvec *vec, void *context),
                            void *context);

#endif

























































































































































































































































































































    1 





    1 


    1 


    1 


































































































    1 












































    1 

    1 


































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* include/asm-generic/tlb.h
 *
 *        Generic TLB shootdown code
 *
 * Copyright 2001 Red Hat, Inc.
 * Based on code from mm/memory.c Copyright Linus Torvalds and others.
 *
 * Copyright 2011 Red Hat, Inc., Peter Zijlstra
 */
#ifndef _ASM_GENERIC__TLB_H
#define _ASM_GENERIC__TLB_H

#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/hugetlb_inline.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>

/*
 * Blindly accessing user memory from NMI context can be dangerous
 * if we're in the middle of switching the current user task or switching
 * the loaded mm.
 *
 * An architecture may define nmi_uaccess_okay() before this point; when it
 * does not, the fallback below always evaluates to true.
 */
#ifndef nmi_uaccess_okay
# define nmi_uaccess_okay() true
#endif

#ifdef CONFIG_MMU

/*
 * Generic MMU-gather implementation.
 *
 * The mmu_gather data structure is used by the mm code to implement the
 * correct and efficient ordering of freeing pages and TLB invalidations.
 *
 * This correct ordering is:
 *
 *  1) unhook page
 *  2) TLB invalidate page
 *  3) free page
 *
 * That is, we must never free a page before we have ensured there are no live
 * translations left to it. Otherwise it might be possible to observe (or
 * worse, change) the page content after it has been reused.
 *
 * The mmu_gather API consists of:
 *
 *  - tlb_gather_mmu() / tlb_finish_mmu(); start and finish a mmu_gather
 *
 *    Finish in particular will issue a (final) TLB invalidate and free
 *    all (remaining) queued pages.
 *
 *  - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA
 *
 *    Defaults to flushing at tlb_end_vma() to reset the range; helps when
 *    there are large holes between the VMAs.
 *
 *  - tlb_remove_table()
 *
 *    tlb_remove_table() is the basic primitive to free page-table directories
 *    (__p*_free_tlb()).  In its most primitive form it is an alias for
 *    tlb_remove_page() below, for when page directories are pages and have no
 *    additional constraints.
 *
 *    See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE.
 *
 *  - tlb_remove_page() / __tlb_remove_page()
 *  - tlb_remove_page_size() / __tlb_remove_page_size()
 *
 *    __tlb_remove_page_size() is the basic primitive that queues a page for
 *    freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
 *    boolean indicating if the queue is (now) full and a call to
 *    tlb_flush_mmu() is required.
 *
 *    tlb_remove_page() and tlb_remove_page_size() imply the call to
 *    tlb_flush_mmu() when required and have no return value.
 *
 *  - tlb_change_page_size()
 *
 *    call before __tlb_remove_page*() to set the current page-size; implies a
 *    possible tlb_flush_mmu() call.
 *
 *  - tlb_flush_mmu() / tlb_flush_mmu_tlbonly()
 *
 *    tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets
 *                              related state, like the range)
 *
 *    tlb_flush_mmu() - in addition to the above TLB invalidate, also frees
 *                        whatever pages are still batched.
 *
 *  - mmu_gather::fullmm
 *
 *    A flag set by tlb_gather_mmu() to indicate we're going to free
 *    the entire mm; this allows a number of optimizations.
 *
 *    - We can ignore tlb_{start,end}_vma(); because we don't
 *      care about ranges. Everything will be shot down.
 *
 *    - (RISC) architectures that use ASIDs can cycle to a new ASID
 *      and delay the invalidation until ASID space runs out.
 *
 *  - mmu_gather::need_flush_all
 *
 *    A flag that can be set by the arch code if it wants to force
 *    flush the entire TLB irrespective of the range. For instance
 *    x86-PAE needs this when changing top-level entries.
 *
 * And allows the architecture to provide and implement tlb_flush():
 *
 * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make
 * use of:
 *
 *  - mmu_gather::start / mmu_gather::end
 *
 *    which provides the range that needs to be flushed to cover the pages to
 *    be freed.
 *
 *  - mmu_gather::freed_tables
 *
 *    set when we freed page table pages
 *
 *  - tlb_get_unmap_shift() / tlb_get_unmap_size()
 *
 *    returns the smallest TLB entry size unmapped in this range.
 *
 * If an architecture does not provide tlb_flush() a default implementation
 * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is
 * specified, in which case we'll default to flush_tlb_mm().
 *
 * Additionally there are a few opt-in features:
 *
 *  MMU_GATHER_PAGE_SIZE
 *
 *  This ensures we call tlb_flush() every time tlb_change_page_size() actually
 *  changes the size and provides mmu_gather::page_size to tlb_flush().
 *
 *  This might be useful if your architecture has size specific TLB
 *  invalidation instructions.
 *
 *  MMU_GATHER_TABLE_FREE
 *
 *  This provides tlb_remove_table(), to be used instead of tlb_remove_page()
 *  for page directories (__p*_free_tlb()).
 *
 *  Useful if your architecture has non-page page directories.
 *
 *  When used, an architecture is expected to provide __tlb_remove_table()
 *  which does the actual freeing of these pages.
 *
 *  MMU_GATHER_RCU_TABLE_FREE
 *
 *  Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see
 *  comment below).
 *
 *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
 *  and therefore doesn't naturally serialize with software page-table walkers.
 *
 *  MMU_GATHER_NO_RANGE
 *
 *  Use this if your architecture lacks an efficient flush_tlb_range().
 *
 *  MMU_GATHER_NO_GATHER
 *
 *  If the option is set the mmu_gather will not track individual pages for
 *  delayed page free anymore. A platform that enables the option needs to
 *  provide its own implementation of the __tlb_remove_page_size() function to
 *  free pages.
 *
 *  This is useful if your architecture already flushes TLB entries in the
 *  various ptep_get_and_clear() functions.
 */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * One batch of page-table pointers queued for freeing.
 *
 * @rcu:    RCU callback head, only present with
 *          CONFIG_MMU_GATHER_RCU_TABLE_FREE.
 * @nr:     entry count for @tables (presumably the number of slots in use -
 *          confirm against the code that fills the batch).
 * @tables: trailing array of table pointers.  Declared as a C99 flexible
 *          array member rather than a GNU zero-length array; sizeof() of the
 *          struct, and therefore MAX_TABLE_BATCH, is unchanged.
 */
struct mmu_table_batch {
#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
        struct rcu_head                rcu;
#endif
        unsigned int                nr;
        void                        *tables[];
};

/*
 * Number of table pointers that fit in one page once the
 * struct mmu_table_batch header has been subtracted.
 */
#define MAX_TABLE_BATCH                \
        ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))

/* Out-of-line implementation used when CONFIG_MMU_GATHER_TABLE_FREE is set. */
extern void tlb_remove_table(struct mmu_gather *tlb, void *table);

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based
 * page directories and we can use the normal page batching to free them:
 * tlb_remove_table() is simply an alias for tlb_remove_page().
 */
#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page))

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
/*
 * This allows an architecture that does not use the linux page-tables for
 * hardware to skip the TLBI when freeing page tables.
 *
 * An architecture may define tlb_needs_table_invalidate() itself; the
 * fallback below evaluates to (true).
 */
#ifndef tlb_needs_table_invalidate
#define tlb_needs_table_invalidate() (true)
#endif

/* Defined out of line; only built with CONFIG_MMU_GATHER_RCU_TABLE_FREE. */
void tlb_remove_table_sync_one(void);

#else

#ifdef tlb_needs_table_invalidate
#error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
#endif

/* Empty stand-in used when CONFIG_MMU_GATHER_RCU_TABLE_FREE is not set. */
static inline void tlb_remove_table_sync_one(void)
{
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */


#ifndef CONFIG_MMU_GATHER_NO_GATHER
/*
 * If we can't allocate a page to make a big batch of page pointers
 * to work on, then just handle a few from the on-stack structure.
 */
#define MMU_GATHER_BUNDLE        8

/*
 * One batch of page pointers queued for freeing; batches are chained
 * through @next.
 * NOTE(review): @nr / @max look like "entries used" / "capacity of pages[]";
 * the code that fills them is not in this header - confirm.
 */
struct mmu_gather_batch {
        struct mmu_gather_batch        *next;
        unsigned int                nr;
        unsigned int                max;
        struct page                *pages[0];        /* trailing storage for the page pointers */
};

/*
 * Number of page pointers that fit in one page once the
 * struct mmu_gather_batch header has been subtracted.
 */
#define MAX_GATHER_BATCH        \
        ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))

/*
 * Limit the maximum number of mmu_gather batches to reduce a risk of soft
 * lockups for non-preemptible kernels on huge machines when a lot of memory
 * is zapped during unmapping.
 * 10K pages freed at once should be safe even without a preemption point.
 */
#define MAX_GATHER_BATCH_COUNT        (10000UL/MAX_GATHER_BATCH)

/*
 * Queue @page for freeing.  Per the API overview at the top of this file the
 * return value tells the caller whether the queue is (now) full and
 * tlb_flush_mmu() must be called.
 */
extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                                   int page_size);
#endif

/*
 * struct mmu_gather is an opaque type used by the mm code for passing around
 * any data needed by arch specific code for tlb_remove_page.
 */
struct mmu_gather {
        /* address space being operated on; handed to the flush primitives */
        struct mm_struct        *mm;

#ifdef CONFIG_MMU_GATHER_TABLE_FREE
        /* current batch of page-table pointers (see struct mmu_table_batch) */
        struct mmu_table_batch        *batch;
#endif

        /*
         * range accumulated by __tlb_adjust_range() and cleared again by
         * __tlb_reset_range(); this is what tlb_flush() has to cover
         */
        unsigned long                start;
        unsigned long                end;
        /*
         * we are in the middle of an operation to clear
         * a full mm and can make some optimizations
         */
        unsigned int                fullmm : 1;

        /*
         * we have performed an operation which
         * requires a complete flush of the tlb
         */
        unsigned int                need_flush_all : 1;

        /*
         * we have removed page directories
         */
        unsigned int                freed_tables : 1;

        /*
         * at which levels have we cleared entries?
         * (set by tlb_flush_{pte,pmd,pud,p4d}_range())
         */
        unsigned int                cleared_ptes : 1;
        unsigned int                cleared_pmds : 1;
        unsigned int                cleared_puds : 1;
        unsigned int                cleared_p4ds : 1;

        /*
         * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma
         * (written by tlb_update_vma_flags(), not reset by
         * __tlb_reset_range())
         */
        unsigned int                vma_exec : 1;
        unsigned int                vma_huge : 1;

        /*
         * NOTE(review): presumably the number of batches allocated so far,
         * bounded by MAX_GATHER_BATCH_COUNT - not used in this header, confirm.
         */
        unsigned int                batch_count;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /*
         * page batching state; NOTE(review): __pages directly follows
         * local (whose pages[] has no storage of its own), so it presumably
         * serves as the MMU_GATHER_BUNDLE-entry on-stack fallback - confirm.
         */
        struct mmu_gather_batch *active;
        struct mmu_gather_batch        local;
        struct page                *__pages[MMU_GATHER_BUNDLE];

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        /*
         * current page size, maintained by tlb_change_page_size().
         * NOTE(review): only declared when MMU_GATHER_NO_GATHER is unset,
         * while tlb_change_page_size() uses it whenever MMU_GATHER_PAGE_SIZE
         * is set - confirm the two options are never combined.
         */
        unsigned int page_size;
#endif
#endif
};

/*
 * Defined out of line.  Per the API overview at the top of this file: does
 * the TLB invalidate and also frees whatever pages are still batched.
 */
void tlb_flush_mmu(struct mmu_gather *tlb);

/*
 * Grow the pending flush range [tlb->start, tlb->end) so that it also covers
 * [address, address + range_size).
 *
 * @range_size is unsigned long so that the unsigned long sizes handed down
 * by the tlb_flush_{pte,pmd,pud,p4d}_range() helpers are not truncated: with
 * an unsigned int parameter a size of 4 GiB or more would wrap and leave
 * tlb->end short of the real end of the range.  Smaller values behave
 * exactly as before.
 */
static inline void __tlb_adjust_range(struct mmu_gather *tlb,
                                      unsigned long address,
                                      unsigned long range_size)
{
        tlb->start = min(tlb->start, address);
        tlb->end = max(tlb->end, address + range_size);
}

/*
 * Put the gather back into its "nothing pending" state: reset the flush
 * range and clear freed_tables plus all cleared_* level bits.
 *
 * For a fullmm gather both ends are set to ~0; otherwise the range becomes
 * the empty [TASK_SIZE, 0) so that the next __tlb_adjust_range() call
 * replaces both bounds.
 */
static inline void __tlb_reset_range(struct mmu_gather *tlb)
{
        if (!tlb->fullmm) {
                tlb->start = TASK_SIZE;
                tlb->end = 0;
        } else {
                tlb->end = ~0;
                tlb->start = tlb->end;
        }

        tlb->cleared_p4ds = 0;
        tlb->cleared_puds = 0;
        tlb->cleared_pmds = 0;
        tlb->cleared_ptes = 0;
        tlb->freed_tables = 0;
        /*
         * The mmu_gather::vma_* fields are deliberately left alone:
         * tlb_start_vma() is not called again after an intermediate flush,
         * so nothing would set them back.
         */
}

#ifdef CONFIG_MMU_GATHER_NO_RANGE

#if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma)
#error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma()
#endif

/*
 * MMU_GATHER_NO_RANGE flavour of tlb_flush().
 *
 * An architecture without an efficient range flush gains nothing from
 * intermediate flushes at tlb_end_vma() or from tracking page granularity,
 * so the only question is whether anything was gathered at all: a non-zero
 * tlb->end triggers a flush of the whole mm, a zero one does nothing.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        if (!tlb->end)
                return;

        flush_tlb_mm(tlb->mm);
}

/*
 * Nothing to record: the MMU_GATHER_NO_RANGE tlb_flush() never looks at the
 * mmu_gather::vma_* flags.
 */
static inline void
tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
}

/*
 * Empty tlb_end_vma() for MMU_GATHER_NO_RANGE.  The self-referential define
 * marks the name as provided so the generic "#ifndef tlb_end_vma" version
 * further down is not compiled.
 */
#define tlb_end_vma tlb_end_vma
static inline void tlb_end_vma(struct mmu_gather *tlb,
                               struct vm_area_struct *vma)
{
}

#else /* CONFIG_MMU_GATHER_NO_RANGE */

#ifndef tlb_flush

#if defined(tlb_start_vma) || defined(tlb_end_vma)
#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma()
#endif

/*
 * Default tlb_flush(), used when the architecture supplies none of its own
 * but has a reasonably efficient flush_tlb_range().
 *
 * A fullmm gather, or one with need_flush_all set, flushes the whole mm.
 * Otherwise, if a range was gathered (tlb->end != 0), flush_tlb_range() is
 * called on [tlb->start, tlb->end) with a temporary on-stack VMA that
 * carries only the mm and the VM_EXEC / VM_HUGETLB bits remembered from
 * tlb_start_vma().
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        if (tlb->fullmm || tlb->need_flush_all) {
                flush_tlb_mm(tlb->mm);
                return;
        }

        if (tlb->end) {
                struct vm_area_struct vma = {
                        .vm_mm = tlb->mm,
                        .vm_flags = (tlb->vma_exec ? VM_EXEC    : 0) |
                                    (tlb->vma_huge ? VM_HUGETLB : 0),
                };

                flush_tlb_range(&vma, tlb->start, tlb->end);
        }
}

/*
 * Remember the two VMA properties the default tlb_flush() passes on to
 * flush_tlb_range(): whether @vma is executable and whether it is a hugetlb
 * mapping.
 */
static inline void
tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        /*
         * Some flush_tlb_range() implementations inspect VM_HUGETLB (tile,
         * mips-4k) and then flush only large pages.
         *
         * Implementations that flush the I-TLB also flush the D-TLB (tile,
         * xtensa, arm), so adding VM_EXEC to an existing range is harmless.
         *
         * tlb_end_vma() is relied upon to issue a flush, so the batch is
         * empty whenever these values are overwritten.
         */
        tlb->vma_exec = (vma->vm_flags & VM_EXEC) != 0;
        tlb->vma_huge = is_vm_hugetlb_page(vma);
}

#else

/*
 * The architecture defines its own tlb_flush(), so the generic code records
 * no VMA flags here.
 */
static inline void
tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
}

#endif

#endif /* CONFIG_MMU_GATHER_NO_RANGE */

/*
 * Invalidate the TLB for whatever has been gathered so far and reset the
 * range state; batched pages are not freed here.
 */
static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
        /*
         * Every caller of __tlb_adjust_range() also sets at least one of
         * these bits, so if all are clear there is nothing to flush.
         */
        if (!tlb->freed_tables && !tlb->cleared_ptes && !tlb->cleared_pmds &&
            !tlb->cleared_puds && !tlb->cleared_p4ds)
                return;

        tlb_flush(tlb);
        mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
        __tlb_reset_range(tlb);
}

/*
 * Queue @page (of @page_size bytes) for freeing and call tlb_flush_mmu()
 * when __tlb_remove_page_size() returns true.
 */
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
                                        struct page *page, int page_size)
{
        if (!__tlb_remove_page_size(tlb, page, page_size))
                return;

        tlb_flush_mmu(tlb);
}

/*
 * PAGE_SIZE shorthand for __tlb_remove_page_size(); the result of that call
 * is passed straight back to the caller.
 */
static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
        return __tlb_remove_page_size(tlb, page, PAGE_SIZE);
}

/*
 * tlb_remove_page - queue a PAGE_SIZE page for freeing.
 *
 * Similar to __tlb_remove_page() but goes through tlb_remove_page_size(),
 * which calls tlb_flush_mmu() itself when required, so there is no return
 * value for the caller to act on.
 */
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
        /*
         * Plain call: "return <expr>;" is not permitted in a function
         * returning void, even when <expr> itself has type void.
         */
        tlb_remove_page_size(tlb, page, PAGE_SIZE);
}

/*
 * Set the page size used for subsequent removals.
 *
 * Only does anything with CONFIG_MMU_GATHER_PAGE_SIZE.  When a non-zero
 * size was already recorded and the new one differs, the gather is flushed
 * first via tlb_flush_mmu() - except for fullmm or need_flush_all gathers,
 * where the flush is skipped.  The new size is stored in every case.
 */
static inline void tlb_change_page_size(struct mmu_gather *tlb,
                                        unsigned int page_size)
{
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        const unsigned int prev_size = tlb->page_size;

        if (prev_size && prev_size != page_size &&
            !tlb->fullmm && !tlb->need_flush_all)
                tlb_flush_mmu(tlb);

        tlb->page_size = page_size;
#endif
}

/*
 * Return the shift of the lowest page-table level at which entries were
 * cleared: PTE first, then PMD, PUD and P4D.  PAGE_SHIFT is also the answer
 * when no cleared_* bit is set at all.
 */
static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb)
{
        unsigned long shift = PAGE_SHIFT;

        if (!tlb->cleared_ptes) {
                if (tlb->cleared_pmds)
                        shift = PMD_SHIFT;
                else if (tlb->cleared_puds)
                        shift = PUD_SHIFT;
                else if (tlb->cleared_p4ds)
                        shift = P4D_SHIFT;
        }

        return shift;
}

/* Size in bytes corresponding to tlb_get_unmap_shift(): 1UL << shift. */
static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
{
        const unsigned long shift = tlb_get_unmap_shift(tlb);

        return 1UL << shift;
}

/*
 * In the case of tlb vma handling, we can optimise these away in the
 * case where we're doing a full MM flush.  When we're doing a munmap,
 * the vmas are adjusted to only cover the region to be torn down.
 */
#ifndef tlb_start_vma
/*
 * Default tlb_start_vma(): for anything but a fullmm gather, record the
 * VMA's flags via tlb_update_vma_flags() and flush the cache for the whole
 * [vm_start, vm_end) range.  A fullmm gather skips both steps.
 */
static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        if (!tlb->fullmm) {
                tlb_update_vma_flags(tlb, vma);
                flush_cache_range(vma, vma->vm_start, vma->vm_end);
        }
}
#endif

#ifndef tlb_end_vma
/*
 * Default tlb_end_vma(): flush the TLB at the end of each VMA unless this
 * is a fullmm gather.
 */
static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        /*
         * Flushing and resetting the range at every VMA boundary keeps the
         * range from growing across the unused space between consecutive
         * VMAs; the mmu_gather::vma_* flags set in tlb_start_vma() depend
         * on this flush as well.
         */
        if (!tlb->fullmm)
                tlb_flush_mmu_tlbonly(tlb);
}
#endif

/*
 * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,
 * and set corresponding cleared_*.
 */
/*
 * Note that PTE-level entries were cleared in [address, address + size):
 * sets cleared_ptes and widens the pending flush range.
 */
static inline void tlb_flush_pte_range(struct mmu_gather *tlb,
                                       unsigned long address,
                                       unsigned long size)
{
        tlb->cleared_ptes = 1;
        __tlb_adjust_range(tlb, address, size);
}

/*
 * Note that PMD-level entries were cleared in [address, address + size):
 * sets cleared_pmds and widens the pending flush range.
 */
static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
                                       unsigned long address,
                                       unsigned long size)
{
        tlb->cleared_pmds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/*
 * Note that PUD-level entries were cleared in [address, address + size):
 * sets cleared_puds and widens the pending flush range.
 */
static inline void tlb_flush_pud_range(struct mmu_gather *tlb,
                                       unsigned long address,
                                       unsigned long size)
{
        tlb->cleared_puds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/*
 * Note that P4D-level entries were cleared in [address, address + size):
 * sets cleared_p4ds and widens the pending flush range.
 */
static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
                                       unsigned long address,
                                       unsigned long size)
{
        tlb->cleared_p4ds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Architecture hook; expands to nothing unless the architecture defines it. */
#ifndef __tlb_remove_tlb_entry
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#endif

/**
 * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
 *
 * Record the fact that pte's were really unmapped by updating the range,
 * so we can later optimise away the tlb invalidate.   This helps when
 * userspace is unmapping already-unmapped pages, which happens quite a lot.
 *
 * Expands to a PAGE_SIZE tlb_flush_pte_range() at @address followed by the
 * __tlb_remove_tlb_entry() hook.
 */
#define tlb_remove_tlb_entry(tlb, ptep, address)                \
        do {                                                        \
                tlb_flush_pte_range(tlb, address, PAGE_SIZE);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/*
 * Huge-page variant of tlb_remove_tlb_entry(): the size is taken from
 * huge_page_size(h), and the range helper is chosen by comparing that size
 * against P4D_SIZE, PUD_SIZE and PMD_SIZE (largest first), falling back to
 * the PTE-level helper.  The __tlb_remove_tlb_entry() hook runs afterwards.
 */
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)        \
        do {                                                        \
                unsigned long _sz = huge_page_size(h);                \
                if (_sz >= P4D_SIZE)                                \
                        tlb_flush_p4d_range(tlb, address, _sz);        \
                else if (_sz >= PUD_SIZE)                        \
                        tlb_flush_pud_range(tlb, address, _sz);        \
                else if (_sz >= PMD_SIZE)                        \
                        tlb_flush_pmd_range(tlb, address, _sz);        \
                else                                                \
                        tlb_flush_pte_range(tlb, address, _sz);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/**
 * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
 * This is a nop so far, because only x86 needs it.
 *
 * (The "nop" is the __tlb_remove_pmd_tlb_entry() hook defined directly
 * below; tlb_remove_pmd_tlb_entry() itself still records a HPAGE_PMD_SIZE
 * range via tlb_flush_pmd_range().)
 */
#ifndef __tlb_remove_pmd_tlb_entry
#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
#endif

#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)                        \
        do {                                                                \
                tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE);        \
                __tlb_remove_pmd_tlb_entry(tlb, pmdp, address);                \
        } while (0)

/**
 * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb
 * invalidation. This is a nop so far, because only x86 needs it.
 *
 * (The "nop" is the __tlb_remove_pud_tlb_entry() hook defined directly
 * below; tlb_remove_pud_tlb_entry() itself still records a HPAGE_PUD_SIZE
 * range via tlb_flush_pud_range().)
 */
#ifndef __tlb_remove_pud_tlb_entry
#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0)
#endif

#define tlb_remove_pud_tlb_entry(tlb, pudp, address)                        \
        do {                                                                \
                tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE);        \
                __tlb_remove_pud_tlb_entry(tlb, pudp, address);                \
        } while (0)

/*
 * For things like page tables caches (ie caching addresses "inside" the
 * page tables, like x86 does), for legacy reasons, flushing an
 * individual page had better flush the page table caches behind it. This
 * is definitely how x86 works, for example. And if you have an
 * architected non-legacy page table cache (which I'm not aware of
 * anybody actually doing), you're going to have some architecturally
 * explicit flushing for that, likely *separate* from a regular TLB entry
 * flush, and thus you'd need more than just some range expansion..
 *
 * So if we ever find an architecture
 * that would want something that odd, I think it is up to that
 * architecture to do its own odd thing, not cause pain for others
 * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com
 *
 * For now w.r.t page table cache, mark the range_size as PAGE_SIZE
 */

#ifndef pte_free_tlb
/*
 * pte_free_tlb - hand a PTE table to the architecture's __pte_free_tlb().
 *
 * Before doing so, records a PAGE_SIZE range at @address through
 * tlb_flush_pmd_range() (which sets cleared_pmds) and sets freed_tables.
 *
 * @tlb is parenthesized where it is dereferenced so that any pointer
 * expression, e.g. "&gather", expands correctly.
 */
#define pte_free_tlb(tlb, ptep, address)                        \
        do {                                                        \
                tlb_flush_pmd_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pte_free_tlb(tlb, ptep, address);                \
        } while (0)
#endif

#ifndef pmd_free_tlb
/*
 * pmd_free_tlb - hand a PMD table to the architecture's __pmd_free_tlb().
 *
 * Before doing so, records a PAGE_SIZE range at @address through
 * tlb_flush_pud_range() (which sets cleared_puds) and sets freed_tables.
 *
 * @tlb is parenthesized where it is dereferenced so that any pointer
 * expression, e.g. "&gather", expands correctly.
 */
#define pmd_free_tlb(tlb, pmdp, address)                        \
        do {                                                        \
                tlb_flush_pud_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pmd_free_tlb(tlb, pmdp, address);                \
        } while (0)
#endif

#ifndef pud_free_tlb
/*
 * pud_free_tlb - hand a PUD table to the architecture's __pud_free_tlb().
 *
 * Before doing so, records a PAGE_SIZE range at @address through
 * tlb_flush_p4d_range() (which sets cleared_p4ds) and sets freed_tables.
 *
 * @tlb is parenthesized where it is dereferenced so that any pointer
 * expression, e.g. "&gather", expands correctly.
 */
#define pud_free_tlb(tlb, pudp, address)                        \
        do {                                                        \
                tlb_flush_p4d_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pud_free_tlb(tlb, pudp, address);                \
        } while (0)
#endif

#ifndef p4d_free_tlb
/*
 * p4d_free_tlb - hand a P4D table to the architecture's __p4d_free_tlb().
 *
 * Before doing so, widens the flush range by PAGE_SIZE at @address with
 * __tlb_adjust_range() directly (no cleared_* bit is set at this level) and
 * sets freed_tables.
 *
 * The table argument is named p4dp to match what it is passed on as, and
 * @tlb is parenthesized where it is dereferenced so that any pointer
 * expression, e.g. "&gather", expands correctly.
 */
#define p4d_free_tlb(tlb, p4dp, address)                        \
        do {                                                        \
                __tlb_adjust_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __p4d_free_tlb(tlb, p4dp, address);                \
        } while (0)
#endif

#endif /* CONFIG_MMU */

#endif /* _ASM_GENERIC__TLB_H */
































































































    1 

































































































































































































































































































































































    1 



    1 





    1 













    1 
    1 



    1 













































































































































































































































    1 
    1 

    1 


    1 

































































































































































































    1 











    1 

    1 






















































    1 









    1 











    1 
    1 







    1 





































    1 



    1 
    1 


    1 

    1 



    1 













































    1 



    1 

    1 

    1 


    1 
    1 






















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 *
 * Basic principles:
 *
 * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and
 * the kernel to set one of a handful of 'caching type' attributes for physical
 * memory ranges: uncached, write-combining, write-through, write-protected,
 * and the most commonly used and default attribute: write-back caching.
 *
 * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
 * a hardware interface to enumerate a limited number of physical memory ranges
 * and set their caching attributes explicitly, programmed into the CPU via MSRs.
 * Even modern CPUs have MTRRs enabled - but these are typically not touched
 * by the kernel or by user-space (such as the X server), we rely on PAT for any
 * additional cache attribute logic.
 *
 * PAT doesn't work via explicit memory ranges, but uses page table entries to add
 * cache attribute information to the mapped memory range: there's 3 bits used,
 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
 * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
 *
 * ( There's a metric ton of finer details, such as compatibility with CPU quirks
 *   that only support 4 types of PAT entries, and interaction with MTRRs, see
 *   below for details. )
 */

#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/debugfs.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/fcntl.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/memtype.h>
#include <asm/io.h>

#include "memtype.h"
#include "../mm_internal.h"
#include "../../../mm/internal.h"        /* is_cow_mapping() */

#undef pr_fmt
#define pr_fmt(fmt) "" fmt

/* Set by pat_init() once the boot-CPU path (pat_bp_init()) has been run. */
static bool __read_mostly pat_bp_initialized;
/* Set by pat_disable(); starts out true when CONFIG_X86_PAT is not enabled. */
static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
/* Set by pat_bp_init() after the PAT MSR was written; reported by pat_enabled(). */
static bool __read_mostly pat_bp_enabled;
/* Set by __init_cache_modes() once the cache mode tables have been updated. */
static bool __read_mostly pat_cm_initialized;

/*
 * PAT support is enabled by default, but can be disabled for
 * various user-requested or hardware-forced reasons:
 */
void pat_disable(const char *msg_reason)
{
        /* Already disabled: keep the first reason that was logged. */
        if (pat_disabled)
                return;

        if (!pat_bp_initialized) {
                /* Still before boot-CPU init: record the decision and say why. */
                pat_disabled = true;
                pr_info("x86/PAT: %s\n", msg_reason);
                return;
        }

        /* Too late: the boot CPU has already gone through pat_init(). */
        WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
}

/*
 * Handler for the "nopat" early boot parameter: asks pat_disable() to turn
 * PAT off. The parameter value in @str is not looked at.
 */
static int __init nopat(char *str)
{
        pat_disable("PAT support disabled via boot option.");
        return 0;
}
early_param("nopat", nopat);

/*
 * Report whether PAT is in use. This reflects pat_bp_enabled, which is only
 * set once pat_bp_init() has written the PAT MSR on the boot CPU.
 */
bool pat_enabled(void)
{
        return pat_bp_enabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);

int pat_debug_enable;

/*
 * Handler for the "debugpat" boot parameter: switches on pat_debug_enable.
 * The parameter value in @str is not looked at.
 */
static int __init pat_debug_setup(char *str)
{
        pat_debug_enable = 1;
        return 1;
}
__setup("debugpat", pat_debug_setup);

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses page flags arch_1 and uncached together to keep track of
 * memory type of pages that have backing page struct.
 *
 * X86 PAT supports 4 different memory types:
 *  - _PAGE_CACHE_MODE_WB
 *  - _PAGE_CACHE_MODE_WC
 *  - _PAGE_CACHE_MODE_UC_MINUS
 *  - _PAGE_CACHE_MODE_WT
 *
 * _PAGE_CACHE_MODE_WB is the default type.
 */

/*
 * Encoding of the tracked memory type in page->flags: two bits
 * (PG_uncached, PG_arch_1) give four combinations, with "both clear"
 * standing for the default WB type.
 */
#define _PGMT_WB                0
#define _PGMT_WC                (1UL << PG_arch_1)
#define _PGMT_UC_MINUS                (1UL << PG_uncached)
#define _PGMT_WT                (1UL << PG_uncached | 1UL << PG_arch_1)
/* All bits used for the encoding, and its complement for clearing them. */
#define _PGMT_MASK                (1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK        (~_PGMT_MASK)

/* Decode the memory type recorded in the _PGMT_MASK bits of @pg's flags. */
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
        switch (pg->flags & _PGMT_MASK) {
        case _PGMT_WB:
                return _PAGE_CACHE_MODE_WB;
        case _PGMT_WC:
                return _PAGE_CACHE_MODE_WC;
        case _PGMT_UC_MINUS:
                return _PAGE_CACHE_MODE_UC_MINUS;
        default:
                /* The only remaining combination is both bits set. */
                return _PAGE_CACHE_MODE_WT;
        }
}

/*
 * Record @memtype in the _PGMT_MASK bits of @pg's flags, leaving all other
 * flag bits as they are. Types without an encoding are stored as WB.
 */
static inline void set_page_memtype(struct page *pg,
                                    enum page_cache_mode memtype)
{
        unsigned long type_bits = _PGMT_WB;

        if (memtype == _PAGE_CACHE_MODE_WC)
                type_bits = _PGMT_WC;
        else if (memtype == _PAGE_CACHE_MODE_UC_MINUS)
                type_bits = _PGMT_UC_MINUS;
        else if (memtype == _PAGE_CACHE_MODE_WT)
                type_bits = _PGMT_WT;

        /* Retry until the flags word did not change underneath us. */
        for (;;) {
                unsigned long seen = pg->flags;
                unsigned long wanted = (seen & _PGMT_CLEAR_MASK) | type_bits;

                if (cmpxchg(&pg->flags, seen, wanted) == seen)
                        break;
        }
}
#else
/*
 * !CONFIG_X86_PAT variant: no memory type is tracked in page flags, so
 * there is nothing to decode and -1 is returned.
 */
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
        return -1;
}
/* !CONFIG_X86_PAT variant: nothing is tracked in page flags, so do nothing. */
static inline void set_page_memtype(struct page *pg,
                                    enum page_cache_mode memtype)
{
}
#endif

/*
 * Memory type values as they appear in one entry of the PAT MSR.
 * The values 2 and 3 are not used here.
 */
enum {
        PAT_UC       = 0,        /* Uncached */
        PAT_WC       = 1,        /* Write Combining */
        PAT_WT       = 4,        /* Write Through */
        PAT_WP       = 5,        /* Write Protected */
        PAT_WB       = 6,        /* Write Back, the default */
        PAT_UC_MINUS = 7,        /* Uncached, but an MTRR may override it */
};

#define CM(c) (_PAGE_CACHE_MODE_ ## c)

static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
{
        enum page_cache_mode cache;
        char *cache_mode;

        switch (pat_val) {
        case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
        case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
        case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
        case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
        case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
        case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
        default:           cache = CM(WB);       cache_mode = "WB  "; break;
        }

        memcpy(msg, cache_mode, 4);

        return cache;
}

#undef CM

/*
 * Update the cache mode to pgprot translation tables according to PAT
 * configuration.
 * Using lower indices is preferred, so we start with highest index.
 */
static void __init_cache_modes(u64 pat)
{
        /* Eight 4-character labels plus the terminating NUL. */
        char pat_msg[8 * 4 + 1];
        int slot = 8;

        WARN_ON_ONCE(pat_cm_initialized);

        pat_msg[sizeof(pat_msg) - 1] = 0;

        /* Walk slots 7..0 so that lower indices are handled last. */
        while (slot-- > 0) {
                unsigned entry = (pat >> (slot * 8)) & 7;
                enum page_cache_mode mode;

                mode = pat_get_cache_mode(entry, pat_msg + 4 * slot);
                update_cache_mode_entry(slot, mode);
        }
        pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);

        pat_cm_initialized = true;
}

#define PAT(x, y)        ((u64)PAT_ ## y << ((x)*8))

/*
 * Boot-CPU part of pat_init(): program @pat into the PAT MSR and set up the
 * cache mode tables, unless the CPU or the firmware rules PAT out.
 */
static void pat_bp_init(u64 pat)
{
        u64 firmware_pat;

        if (!boot_cpu_has(X86_FEATURE_PAT)) {
                pat_disable("PAT not supported by the CPU.");
                return;
        }

        /* An all-zero MSR is taken to mean that the firmware disabled PAT. */
        rdmsrl(MSR_IA32_CR_PAT, firmware_pat);
        if (firmware_pat == 0) {
                pat_disable("PAT support disabled by the firmware.");
                return;
        }

        wrmsrl(MSR_IA32_CR_PAT, pat);
        pat_bp_enabled = true;

        __init_cache_modes(pat);
}

/* Secondary-CPU part of pat_init(): load @pat into this CPU's PAT MSR. */
static void pat_ap_init(u64 pat)
{
        /*
         * Getting here without the feature means the boot CPU already
         * switched to PAT while this secondary CPU cannot follow, and
         * there is no way to undo PAT.
         */
        if (!boot_cpu_has(X86_FEATURE_PAT))
                panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");

        wrmsrl(MSR_IA32_CR_PAT, pat);
}

/*
 * Set up the cache mode tables from whatever the PAT MSR currently holds,
 * or from an emulated layout when there is no usable PAT. Does nothing if
 * the tables were already initialized.
 */
void init_cache_modes(void)
{
        u64 pat = 0;

        if (pat_cm_initialized)
                return;

        if (boot_cpu_has(X86_FEATURE_PAT)) {
                /*
                 * The CPU has PAT: make the tables agree with the MSR
                 * contents. This covers the "nopat" boot option as well as
                 * virtual machines that offer PAT without MTRRs; Xen in
                 * particular programs the PAT MSR in its own way.
                 *
                 * A value of 0 read from the MSR is considered invalid and
                 * handled below like the no-PAT case.
                 */
                rdmsrl(MSR_IA32_CR_PAT, pat);
        }

        if (pat == 0) {
                /*
                 * No PAT: emulate the table that results from the two cache
                 * bits PWT (Write Through) and PCD (Cache Disable) alone,
                 * which is also what the BIOS sets up by default.
                 *
                 * PTE encoding:
                 *
                 *       PCD
                 *       |PWT  PAT
                 *       ||    slot
                 *       00    0    WB : _PAGE_CACHE_MODE_WB
                 *       01    1    WT : _PAGE_CACHE_MODE_WT
                 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
                 *       11    3    UC : _PAGE_CACHE_MODE_UC
                 *
                 * NOTE: Requests for WC or WP end up as UC- through the
                 * default contents of __cachemode2pte_tbl[].
                 */
                pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
                      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
        }

        __init_cache_modes(pat);
}

/**
 * pat_init - Initialize the PAT MSR and PAT table on the current CPU
 *
 * This function initializes PAT MSR and PAT table with an OS-defined value
 * to enable additional cache attributes, WC, WT and WP.
 *
 * This function must be called on all CPUs using the specific sequence of
 * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
 * procedure for PAT.
 */
void pat_init(void)
{
        u64 pat;
        struct cpuinfo_x86 *c = &boot_cpu_data;

#ifndef CONFIG_X86_PAT
        pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
#endif

        if (pat_disabled)
                return;

        if ((c->x86_vendor == X86_VENDOR_INTEL) &&
            (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
             ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
                /*
                 * PAT support with the lower four entries. Intel Pentium 2,
                 * 3, M, and 4 are affected by PAT errata, which makes the
                 * upper four entries unusable. To be on the safe side, we don't
                 * use those.
                 *
                 *  PTE encoding:
                 *      PAT
                 *      |PCD
                 *      ||PWT  PAT
                 *      |||    slot
                 *      000    0    WB : _PAGE_CACHE_MODE_WB
                 *      001    1    WC : _PAGE_CACHE_MODE_WC
                 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
                 *      011    3    UC : _PAGE_CACHE_MODE_UC
                 * PAT bit unused
                 *
                 * NOTE: When WT or WP is used, it is redirected to UC- per
                 * the default setup in __cachemode2pte_tbl[].
                 */
                pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
                      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
        } else {
                /*
                 * Full PAT support.  We put WT in slot 7 to improve
                 * robustness in the presence of errata that might cause
                 * the high PAT bit to be ignored.  This way, a buggy slot 7
                 * access will hit slot 3, and slot 3 is UC, so at worst
                 * we lose performance without causing a correctness issue.
                 * Pentium 4 erratum N46 is an example for such an erratum,
                 * although we try not to use PAT at all on affected CPUs.
                 *
                 *  PTE encoding:
                 *      PAT
                 *      |PCD
                 *      ||PWT  PAT
                 *      |||    slot
                 *      000    0    WB : _PAGE_CACHE_MODE_WB
                 *      001    1    WC : _PAGE_CACHE_MODE_WC
                 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
                 *      011    3    UC : _PAGE_CACHE_MODE_UC
                 *      100    4    WB : Reserved
                 *      101    5    WP : _PAGE_CACHE_MODE_WP
                 *      110    6    UC-: Reserved
                 *      111    7    WT : _PAGE_CACHE_MODE_WT
                 *
                 * The reserved slots are unused, but mapped to their
                 * corresponding types in the presence of PAT errata.
                 */
                pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
                      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
        }

        if (!pat_bp_initialized) {
                pat_bp_init(pat);
                pat_bp_initialized = true;
        } else {
                pat_ap_init(pat);
        }
}

#undef PAT

static DEFINE_SPINLOCK(memtype_lock);        /* protects memtype accesses */

/*
 * Does intersection of PAT memory type and MTRR memory type and returns
 * the resulting memory type as PAT understands it.
 * (Type in pat and mtrr will not have same value)
 * The intersection is based on "Effective Memory Type" tables in IA-32
 * SDM vol 3a
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
                                     enum page_cache_mode req_type)
{
        u8 mtrr_type, uniform;

        /* The MTRR hint only matters when the PAT request is for WB. */
        if (req_type != _PAGE_CACHE_MODE_WB)
                return req_type;

        /* WB is granted only if the MTRRs say WB too; otherwise use UC-. */
        mtrr_type = mtrr_type_lookup(start, end, &uniform);

        return mtrr_type == MTRR_TYPE_WRBACK ? _PAGE_CACHE_MODE_WB
                                             : _PAGE_CACHE_MODE_UC_MINUS;
}

/* Bookkeeping for one walk over the system RAM chunks of a pfn range. */
struct pagerange_state {
        /* First pfn not yet accounted for by the walk. */
        unsigned long cur_pfn;
        /* Non-zero once a RAM chunk with at least one page was seen. */
        int ram;
        /* Non-zero once a chunk started beyond cur_pfn, i.e. a gap was seen. */
        int not_ram;
};

/*
 * Callback for walk_system_ram_range(): account for one RAM chunk in the
 * struct pagerange_state passed via @arg. Returns non-zero as soon as both
 * RAM and a gap have been seen.
 */
static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
        struct pagerange_state *st = arg;

        /* A chunk starting past the expected pfn means part of the range is not RAM. */
        if (initial_pfn > st->cur_pfn)
                st->not_ram |= 1;

        if (total_nr_pages > 0)
                st->ram |= 1;

        st->cur_pfn = initial_pfn + total_nr_pages;

        return st->ram && st->not_ram;
}

/*
 * Classify the physical range [@start, @end).
 *
 * Returns 1 if RAM was found, 0 if no RAM was found, and -1 if the walk
 * was stopped because both RAM and non-RAM were seen.
 */
static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
        struct pagerange_state state = {
                .cur_pfn = start_pfn,
                .ram = 0,
                .not_ram = 0,
        };
        int ret = 0;

        /*
         * For legacy reasons the legacy ISA region counts as non-RAM, so
         * the walk never starts below its end. This lets /dev/mem users
         * map parts of the ISA region even when those parts are listed
         * (or not listed at all) with different e820 types
         * (RAM/reserved/..).
         */
        if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
                start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;

        if (start_pfn < end_pfn)
                ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
                                            &state, pagerange_is_ram_callback);

        /* A positive value is the callback reporting a mixed range. */
        if (ret > 0)
                return -1;

        return state.ram ? 1 : 0;
}

/*
 * For RAM pages, we use page flags to mark the pages with appropriate type.
 * The page flags are limited to four types, WB (default), WC, WT and UC-.
 * WP request fails with -EINVAL, and UC gets redirected to UC-.  Setting
 * a new memory type is only allowed for a page mapped with the default WB
 * type.
 *
 * Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
                                  enum page_cache_mode req_type,
                                  enum page_cache_mode *new_type)
{
        const u64 first_pfn = start >> PAGE_SHIFT;
        const u64 end_pfn = end >> PAGE_SHIFT;
        u64 pfn;

        switch (req_type) {
        case _PAGE_CACHE_MODE_WP:
                /* WP has no page flag encoding: refuse, suggesting UC-. */
                if (new_type)
                        *new_type = _PAGE_CACHE_MODE_UC_MINUS;
                return -EINVAL;
        case _PAGE_CACHE_MODE_UC:
                /* We do not support strong UC */
                WARN_ON_ONCE(1);
                req_type = _PAGE_CACHE_MODE_UC_MINUS;
                break;
        default:
                break;
        }

        /* Pass 1: every page must still carry the default WB type. */
        for (pfn = first_pfn; pfn < end_pfn; pfn++) {
                enum page_cache_mode cur = get_page_memtype(pfn_to_page(pfn));

                if (cur == _PAGE_CACHE_MODE_WB)
                        continue;

                pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
                        start, end - 1, cur, req_type);
                if (new_type)
                        *new_type = cur;

                return -EBUSY;
        }

        if (new_type)
                *new_type = req_type;

        /* Pass 2: no conflict was found, so tag all pages. */
        for (pfn = first_pfn; pfn < end_pfn; pfn++)
                set_page_memtype(pfn_to_page(pfn), req_type);

        return 0;
}

/* Put every RAM page in [@start, @end) back to the default WB type. */
static int free_ram_pages_type(u64 start, u64 end)
{
        const u64 end_pfn = end >> PAGE_SHIFT;
        u64 pfn = start >> PAGE_SHIFT;

        while (pfn < end_pfn) {
                set_page_memtype(pfn_to_page(pfn), _PAGE_CACHE_MODE_WB);
                pfn++;
        }

        return 0;
}

static u64 sanitize_phys(u64 address)
{
        /*
         * Decoy addresses are not present for 32-bit builds, see
         * set_mce_nospec(), so there the address is used as is.
         */
        if (!IS_ENABLED(CONFIG_X86_64))
                return address;

        /*
         * When the memtype of pages containing poison is changed, a
         * "decoy" virtual address (bit 63 clear) may have been passed to
         * set_memory_X(). __pa() on such an address yields a physical
         * address with bit 63 set; masking strips that again.
         */
        return address & __PHYSICAL_MASK;
}

/*
 * req_type typically has one of the:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 * - _PAGE_CACHE_MODE_WT
 *
 * If new_type is NULL, function will return an error if it cannot reserve the
 * region with req_type. If new_type is non-NULL, function will return
 * available type in new_type in case of no error. In case of any error
 * it will return a negative return value.
 */
int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
                    enum page_cache_mode *new_type)
{
        enum page_cache_mode actual_type;
        struct memtype *entry_new;
        int is_range_ram;
        int err;

        /*
         * @end is exclusive while sanitize_phys() expects an inclusive
         * address, hence the detour via the last byte of the range.
         */
        start = sanitize_phys(start);
        end = sanitize_phys(end - 1) + 1;
        if (start >= end) {
                WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
                                start, end - 1, cattr_name(req_type));
                return -EINVAL;
        }

        /* Without PAT this is identical to the plain page table setting. */
        if (!pat_enabled()) {
                if (new_type)
                        *new_type = req_type;
                return 0;
        }

        /* Low ISA region is always mapped WB in page table. No need to track */
        if (x86_platform.is_untracked_pat_range(start, end)) {
                if (new_type)
                        *new_type = _PAGE_CACHE_MODE_WB;
                return 0;
        }

        /*
         * Consult the MTRRs for a type hint. This is an optimization for
         * /dev/mem mmap'ers into WB memory (BIOS tools and ACPI tools):
         * a WB request stays WB for WB memory and becomes UC_MINUS
         * otherwise.
         */
        actual_type = pat_x_mtrr_type(start, end, req_type);
        if (new_type)
                *new_type = actual_type;

        /* RAM is tracked through page flags; a mixed range is rejected. */
        is_range_ram = pat_pagerange_is_ram(start, end);
        if (is_range_ram == 1)
                return reserve_ram_pages_type(start, end, req_type, new_type);
        if (is_range_ram < 0)
                return -EINVAL;

        /* Everything else gets an entry in the memtype tracking structure. */
        entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
        if (!entry_new)
                return -ENOMEM;

        entry_new->start = start;
        entry_new->end = end;
        entry_new->type = actual_type;

        spin_lock(&memtype_lock);

        err = memtype_check_insert(entry_new, new_type);
        if (err != 0) {
                pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
                        start, end - 1,
                        cattr_name(entry_new->type), cattr_name(req_type));
                kfree(entry_new);
                spin_unlock(&memtype_lock);

                return err;
        }

        spin_unlock(&memtype_lock);

        dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
                start, end - 1, cattr_name(entry_new->type), cattr_name(req_type),
                new_type ? cattr_name(*new_type) : "-");

        return err;
}

/*
 * Release the memory type reservation for the physical range [@start, @end).
 *
 * RAM ranges are reset to the default type through their page flags, other
 * ranges have their tracking entry erased and freed.
 *
 * Returns 0 on success (including when PAT is disabled or the range is not
 * tracked), -EINVAL if the range mixes RAM and non-RAM or no matching entry
 * could be erased.
 */
int memtype_free(u64 start, u64 end)
{
        int is_range_ram;
        struct memtype *entry_old;

        if (!pat_enabled())
                return 0;

        start = sanitize_phys(start);

        /*
         * The end address passed into this function is exclusive, but
         * sanitize_phys() expects an inclusive address. Sanitize the last
         * byte of the range instead, exactly as memtype_reserve() does, so
         * that both functions agree on the range even when @end lies just
         * past the physical address mask.
         */
        end = sanitize_phys(end - 1) + 1;

        /* Low ISA region is always mapped WB. No need to track */
        if (x86_platform.is_untracked_pat_range(start, end))
                return 0;

        is_range_ram = pat_pagerange_is_ram(start, end);
        if (is_range_ram == 1)
                return free_ram_pages_type(start, end);
        if (is_range_ram < 0)
                return -EINVAL;

        spin_lock(&memtype_lock);
        entry_old = memtype_erase(start, end);
        spin_unlock(&memtype_lock);

        if (IS_ERR(entry_old)) {
                pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
                        current->comm, current->pid, start, end - 1);
                return -EINVAL;
        }

        kfree(entry_old);

        dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1);

        return 0;
}


/**
 * lookup_memtype - Looks up the memory type for a physical address
 * @paddr: physical address of which memory type needs to be looked up
 *
 * Only to be called when PAT is enabled
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_WT.
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{
        enum page_cache_mode found;
        struct memtype *entry;

        /* Untracked ranges are reported as WB. */
        if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
                return _PAGE_CACHE_MODE_WB;

        /* For RAM the type is kept in the page flags. */
        if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE))
                return get_page_memtype(pfn_to_page(paddr >> PAGE_SHIFT));

        /* Anything else is looked up among the tracked entries; no entry means UC-. */
        spin_lock(&memtype_lock);
        entry = memtype_lookup(paddr);
        found = entry ? entry->type : _PAGE_CACHE_MODE_UC_MINUS;
        spin_unlock(&memtype_lock);

        return found;
}

/**
 * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
 * of @pfn cannot be overridden by UC MTRR memory type.
 *
 * Only to be called when PAT is enabled.
 *
 * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
 * Returns false in other cases.
 */
bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
{
        switch (lookup_memtype(PFN_PHYS(pfn))) {
        case _PAGE_CACHE_MODE_UC:
        case _PAGE_CACHE_MODE_UC_MINUS:
        case _PAGE_CACHE_MODE_WC:
                return true;
        default:
                return false;
        }
}
EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);

/**
 * memtype_reserve_io - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype, with requested type. On success, requested
 * or any other compatible type that was available for the region is returned
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int memtype_reserve_io(resource_size_t start, resource_size_t end,
                        enum page_cache_mode *type)
{
        const resource_size_t size = end - start;
        const enum page_cache_mode req_type = *type;
        enum page_cache_mode new_type;
        int ret;

        WARN_ON_ONCE(iomem_map_sanity_check(start, size));

        ret = memtype_reserve(start, end, req_type, &new_type);
        if (ret)
                return ret;

        /*
         * The reservation stands only if the granted type is acceptable
         * for the request and the kernel mapping could be brought in
         * line with it; otherwise undo it.
         */
        if (!is_new_memtype_allowed(start, size, req_type, new_type) ||
            memtype_kernel_map_sync(start, size, new_type) < 0) {
                memtype_free(start, end);
                return -EBUSY;
        }

        *type = new_type;
        return 0;
}

/**
 * memtype_free_io - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void memtype_free_io(resource_size_t start, resource_size_t end)
{
        /* Plain forwarder; the return value of memtype_free() is dropped. */
        memtype_free(start, end);
}

/*
 * Reserve [@start, @start + @size) with a write-combining request via
 * memtype_reserve_io() and pass its return value on. The type that was
 * actually granted is not reported back to the caller.
 */
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
        enum page_cache_mode type = _PAGE_CACHE_MODE_WC;

        return memtype_reserve_io(start, start + size, &type);
}
EXPORT_SYMBOL(arch_io_reserve_memtype_wc);

/* Release the reservation for [@start, @start + @size) via memtype_free_io(). */
void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
{
        memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);

/*
 * Return @vma_prot unchanged if phys_mem_access_encrypted() accepts the
 * range starting at @pfn, or its decrypted variant otherwise. @file is
 * not used.
 */
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t vma_prot)
{
        if (phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
                return vma_prot;

        return pgprot_decrypted(vma_prot);
}

#ifdef CONFIG_STRICT_DEVMEM
/*
 * With STRICT_DEVMEM the check is already done in drivers/char/mem.c,
 * so every range is accepted here.
 */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
        return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
        u64 from = ((u64)pfn) << PAGE_SHIFT;
        u64 to = from + size;
        u64 cursor = from;

        if (!pat_enabled())
                return 1;

        while (cursor < to) {
                if (!devmem_is_allowed(pfn))
                        return 0;
                cursor += PAGE_SIZE;
                pfn++;
        }
        return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

/*
 * Decide whether the range may be mapped and, if so, put the cache mode
 * into *@vma_prot: UC- when @file was opened with O_DSYNC, WB otherwise.
 * Returns 1 if allowed, 0 if not (in which case *@vma_prot is untouched).
 */
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t *vma_prot)
{
        enum page_cache_mode pcm;

        if (!range_is_allowed(pfn, size))
                return 0;

        pcm = (file->f_flags & O_DSYNC) ? _PAGE_CACHE_MODE_UC_MINUS
                                        : _PAGE_CACHE_MODE_WB;

        /* Swap only the cache bits, all other protection bits stay. */
        *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
                             cachemode2protval(pcm));
        return 1;
}

/*
 * If the physical range [base, base + size) is part of the kernel identity
 * mapping, switch that mapping to cache mode @pcm.
 *
 * Returns 0 when nothing had to be done or the change succeeded, -EINVAL
 * when ioremap_change_attr() failed.
 */
int memtype_kernel_map_sync(u64 base, unsigned long size,
                            enum page_cache_mode pcm)
{
        unsigned long id_sz = size;

        /* Entirely above the identity-mapped area: nothing to sync. */
        if (base > __pa(high_memory-1))
                return 0;

        /*
         * Holes inside the identity range (the PCI space, for example) are
         * not mapped, so only RAM pages are considered.
         */
        if (!page_is_ram(base >> PAGE_SHIFT))
                return 0;

        /* Clip the length so it does not run past high_memory. */
        if (__pa(high_memory-1) <= base + size)
                id_sz = __pa(high_memory) - base;

        if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) >= 0)
                return 0;

        pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
                current->comm, current->pid,
                cattr_name(pcm),
                base, (unsigned long long)(base + size-1));
        return -EINVAL;
}

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserved non RAM regions only and after successful memtype_reserve,
 * this func also keeps identity mapping (if any) in sync with this new prot.
 *
 * @paddr:       start of the physical range
 * @size:        length of the range in bytes
 * @vma_prot:    in: requested protection; out: may have its cache-mode bits
 *               replaced by the mode that was actually granted
 * @strict_prot: when non-zero, a granted mode that differs from the
 *               requested one is treated as an error
 *
 * Returns 0 on success, the error from memtype_reserve(), or -EINVAL.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
                                int strict_prot)
{
        int is_ram = 0;
        int ret;
        enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
        enum page_cache_mode pcm = want_pcm;

        /* Any non-zero result takes the RAM path below. */
        is_ram = pat_pagerange_is_ram(paddr, paddr + size);

        /*
         * reserve_pfn_range() for RAM pages. We do not refcount to keep
         * track of number of mappings of RAM pages. We can assert that
         * the type requested matches the type of first page in the range.
         */
        if (is_ram) {
                if (!pat_enabled())
                        return 0;

                pcm = lookup_memtype(paddr);
                if (want_pcm != pcm) {
                        /* Warn, then fall back to the type already in use. */
                        pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
                                current->comm, current->pid,
                                cattr_name(want_pcm),
                                (unsigned long long)paddr,
                                (unsigned long long)(paddr + size - 1),
                                cattr_name(pcm));
                        *vma_prot = __pgprot((pgprot_val(*vma_prot) &
                                             (~_PAGE_CACHE_MASK)) |
                                             cachemode2protval(pcm));
                }
                return 0;
        }

        /* Non-RAM: take a reservation; pcm receives the granted type. */
        ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm);
        if (ret)
                return ret;

        if (pcm != want_pcm) {
                if (strict_prot ||
                    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
                        /* Undo the reservation before reporting failure. */
                        memtype_free(paddr, paddr + size);
                        pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
                               current->comm, current->pid,
                               cattr_name(want_pcm),
                               (unsigned long long)paddr,
                               (unsigned long long)(paddr + size - 1),
                               cattr_name(pcm));
                        return -EINVAL;
                }
                /*
                 * We allow returning different type than the one requested in
                 * non strict case.
                 */
                *vma_prot = __pgprot((pgprot_val(*vma_prot) &
                                      (~_PAGE_CACHE_MASK)) |
                                     cachemode2protval(pcm));
        }

        /* Keep the identity mapping in sync; drop the reservation on error. */
        if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
                memtype_free(paddr, paddr + size);
                return -EINVAL;
        }
        return 0;
}

/*
 * Internal interface to free a range of physical memory.
 * Only ranges for which pat_pagerange_is_ram() returns 0 are passed on to
 * memtype_free(); anything else is left alone.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
        if (pat_pagerange_is_ram(paddr, paddr + size) != 0)
                return;

        memtype_free(paddr, paddr + size);
}

/*
 * Recover the starting physical address (and optionally the protection)
 * that track_pfn_remap() used for a VM_PAT @vma.
 *
 * @paddr receives the physical address. @pgprot, when non-NULL, receives
 * the protection. Returns 0 on success, -EINVAL otherwise.
 */
static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
                pgprot_t *pgprot)
{
        unsigned long prot;

        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));

        /*
         * For most mappings the starting PFN and cachemode can be read back
         * from the page tables. A COW mapping may by now have anon folios
         * mapped, in which case follow_phys() fails.
         */
        if (follow_phys(vma, vma->vm_start, 0, &prot, paddr) == 0) {
                if (pgprot)
                        *pgprot = __pgprot(prot);
                return 0;
        }

        /* A failed lookup on a non-COW mapping is unexpected. */
        if (!is_cow_mapping(vma->vm_flags)) {
                WARN_ON_ONCE(1);
                return -EINVAL;
        }

        /*
         * COW fallback: vma->vm_pgoff yields the PFN, see
         * remap_pfn_range_notrack(). The cachemode cannot be recovered this
         * way, so callers that need it (fork) must fail for now.
         */
        if (pgprot)
                return -EINVAL;

        *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
        return 0;
}

/*
 * track_pfn_copy is called when a vma covering a pfnmap is copied through
 * copy_page_range().
 *
 * For a VM_PAT vma, the starting address and protection are looked up with
 * get_pat_info() and the whole vma range is reserved with one strict
 * reserve_pfn_range() call. Other vmas need no tracking and return 0.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{
        unsigned long vma_size = vma->vm_end - vma->vm_start;
        resource_size_t paddr;
        pgprot_t pgprot;

        if (!(vma->vm_flags & VM_PAT))
                return 0;

        if (get_pat_info(vma, &paddr, &pgprot))
                return -EINVAL;

        /* reserve the whole chunk covered by vma. */
        return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
}

/*
 * @prot is the protection requested for the new mapping. When no vma is
 * given, or [addr, addr + size) spans the whole vma, the complete
 * pfn + size range is reserved with a single non-strict reserve_pfn_range()
 * call and the vma (if any) is flagged VM_PAT on success.
 *
 * For partial ranges nothing is reserved: with PAT enabled, all pages must
 * share one memtype, which is then written into *@prot.
 *
 * Returns 0 on success or a negative errno.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                    unsigned long pfn, unsigned long addr, unsigned long size)
{
        resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
        enum page_cache_mode pcm;
        bool whole_range;

        whole_range = !vma || (addr == vma->vm_start &&
                               size == (vma->vm_end - vma->vm_start));

        /* reserve the whole chunk starting from paddr */
        if (whole_range) {
                int ret = reserve_pfn_range(paddr, size, prot, 0);

                if (ret == 0 && vma)
                        vma->vm_flags |= VM_PAT;
                return ret;
        }

        if (!pat_enabled())
                return 0;

        /* Anything smaller than the vma gets its prot from the lookup. */
        pcm = lookup_memtype(paddr);

        /* The remaining pages must all have the same memtype. */
        for (; size > PAGE_SIZE; size -= PAGE_SIZE) {
                paddr += PAGE_SIZE;
                if (pcm != lookup_memtype(paddr))
                        return -EINVAL;
        }

        *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
                         cachemode2protval(pcm));

        return 0;
}

/*
 * With PAT enabled, replace the cache-mode bits of *@prot with the memtype
 * looked up for @pfn. Does nothing when PAT is disabled. @vma is not used.
 */
void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
{
        if (pat_enabled()) {
                enum page_cache_mode pcm = lookup_memtype(pfn_t_to_phys(pfn));

                *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
                                 cachemode2protval(pcm));
        }
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 *
 * @vma may be NULL when a specific region is given. A non-NULL @vma without
 * VM_PAT is ignored; otherwise VM_PAT is cleared once the range is freed.
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                 unsigned long size)
{
        resource_size_t paddr;

        if (vma && !(vma->vm_flags & VM_PAT))
                return;

        /* free the chunk starting from pfn or the whole chunk */
        paddr = (resource_size_t)pfn << PAGE_SHIFT;
        if (!paddr && !size) {
                /*
                 * The whole-vma form has to look the range up through the
                 * vma; without one there is nothing to untrack, and
                 * get_pat_info() would dereference a NULL pointer.
                 */
                if (!vma)
                        return;
                if (get_pat_info(vma, &paddr, NULL))
                        return;
                size = vma->vm_end - vma->vm_start;
        }
        free_pfn_range(paddr, size);
        if (vma)
                vma->vm_flags &= ~VM_PAT;
}

/*
 * untrack_pfn_moved is called while mremapping a pfnmap to a new region,
 * on the old vma, after its pfnmap page table has been removed. The new vma
 * carries the pfnmap to the same pfn & cache type with VM_PAT set, so only
 * the flag on the old vma is dropped here.
 */
void untrack_pfn_moved(struct vm_area_struct *vma)
{
        vma->vm_flags = vma->vm_flags & ~VM_PAT;
}

/*
 * Return @prot with its cache-mode bits set to write-combining.
 *
 * Any cache-mode bits already present in @prot are cleared first, as every
 * other cache-mode update in this file does; merely OR-ing the new bits in
 * would yield a mixed encoding when @prot is not write-back.
 */
pgprot_t pgprot_writecombine(pgprot_t prot)
{
        return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) |
                        cachemode2protval(_PAGE_CACHE_MODE_WC));
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

/*
 * Return @prot with its cache-mode bits set to write-through.
 *
 * Any cache-mode bits already present in @prot are cleared first, as every
 * other cache-mode update in this file does; merely OR-ing the new bits in
 * would yield a mixed encoding when @prot is not write-back.
 */
pgprot_t pgprot_writethrough(pgprot_t prot)
{
        return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) |
                        cachemode2protval(_PAGE_CACHE_MODE_WT));
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/*
 * Allocate a temporary printout entry that is handed from
 * seq_start()/seq_next() to seq_show(), and fill it with a copy of the
 * element at position @pos (copied under memtype_lock).
 *
 * Returns the entry, which the caller must kfree(), or NULL when the
 * allocation fails or memtype_copy_nth_element() reports an error.
 */
static struct memtype *memtype_get_idx(loff_t pos)
{
        struct memtype *snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
        int err;

        if (!snapshot)
                return NULL;

        spin_lock(&memtype_lock);
        err = memtype_copy_nth_element(snapshot, pos);
        spin_unlock(&memtype_lock);

        if (!err)
                return snapshot;

        /* Nothing was copied: release the unused entry. */
        kfree(snapshot);
        return NULL;
}

/*
 * seq_file ->start: at position 0, advance to position 1 and emit the list
 * header, then return a copy of the entry at *@pos (NULL if there is none).
 */
static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (!*pos) {
                *pos += 1;
                seq_puts(seq, "PAT memtype list:\n");
        }

        return memtype_get_idx(*pos);
}

/*
 * seq_file ->next: free the entry just shown, advance *@pos and return a
 * copy of the entry at the new position (NULL if there is none).
 */
static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        loff_t next = *pos + 1;

        kfree(v);
        *pos = next;

        return memtype_get_idx(next);
}

/* seq_file ->stop: free the entry passed in as @v. */
static void memtype_seq_stop(struct seq_file *seq, void *v)
{
        kfree(v);
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
        struct memtype *entry_print = (struct memtype *)v;

        seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n",
                        entry_print->start,
                        entry_print->end,
                        cattr_name(entry_print->type));

        return 0;
}

/* Iterator callbacks for the memtype list; installed by memtype_seq_open(). */
static const struct seq_operations memtype_seq_ops = {
        .start = memtype_seq_start,
        .next  = memtype_seq_next,
        .stop  = memtype_seq_stop,
        .show  = memtype_seq_show,
};

/* ->open handler: attach memtype_seq_ops to @file via seq_open(). */
static int memtype_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &memtype_seq_ops);
}

/* File operations of the "pat_memtype_list" debugfs file (seq_file based). */
static const struct file_operations memtype_fops = {
        .open    = memtype_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

/*
 * Create the owner-readable "pat_memtype_list" debugfs file under
 * arch_debugfs_dir when PAT is enabled. Always returns 0.
 */
static int __init pat_memtype_list_init(void)
{
        if (!pat_enabled())
                return 0;

        debugfs_create_file("pat_memtype_list", S_IRUSR,
                            arch_debugfs_dir, NULL, &memtype_fops);
        return 0;
}
late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */





































    1 
















    1 








    1 





    1 

    1 







    1 




    1 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2014 Davidlohr Bueso.
 */
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmacache.h>

/*
 * Hash based on the pmd of addr if configured with MMU, which provides a good
 * hit rate for workloads with spatial locality.  Otherwise, use pages.
 *
 * The macro argument is parenthesized so that any expression can be passed
 * without the shift binding to only part of it.
 */
#ifdef CONFIG_MMU
#define VMACACHE_SHIFT        PMD_SHIFT
#else
#define VMACACHE_SHIFT        PAGE_SHIFT
#endif
#define VMACACHE_HASH(addr) (((addr) >> VMACACHE_SHIFT) & VMACACHE_MASK)

/*
 * This task may be accessing a foreign mm via (for example)
 * get_user_pages()->find_vma().  The vmacache is task-local and this
 * task's vmacache pertains to a different mm (ie, its own).  There is
 * nothing we can do here.
 *
 * Also handle the case where a kernel thread has adopted this mm via
 * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm.
 */
static inline bool vmacache_valid_mm(struct mm_struct *mm)
{
        return current->mm == mm && !(current->flags & PF_KTHREAD);
}

/*
 * Store @newvma in current's vmacache slot selected by hashing @addr,
 * provided the cache is applicable to @newvma's mm.
 */
void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
{
        if (!vmacache_valid_mm(newvma->vm_mm))
                return;

        current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
}

static bool vmacache_valid(struct mm_struct *mm)
{
        struct task_struct *curr;

        if (!vmacache_valid_mm(mm))
                return false;

        curr = current;
        if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
                /*
                 * First attempt will always be invalid, initialize
                 * the new cache for this task here.
                 */
                curr->vmacache.seqnum = mm->vmacache_seqnum;
                vmacache_flush(curr);
                return false;
        }
        return true;
}

/*
 * Look up the vma containing @addr in current's vmacache. All
 * VMACACHE_SIZE slots are probed, starting at the slot @addr hashes to and
 * wrapping around. Returns the cached vma or NULL on a miss.
 */
struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
        int slot = VMACACHE_HASH(addr);
        int probes;

        count_vm_vmacache_event(VMACACHE_FIND_CALLS);

        if (!vmacache_valid(mm))
                return NULL;

        for (probes = VMACACHE_SIZE; probes > 0; probes--) {
                struct vm_area_struct *vma = current->vmacache.vmas[slot];

                /* Advance to the next slot up front, wrapping at the end. */
                slot = (slot + 1 == VMACACHE_SIZE) ? 0 : slot + 1;

                if (!vma)
                        continue;
#ifdef CONFIG_DEBUG_VM_VMACACHE
                if (WARN_ON_ONCE(vma->vm_mm != mm))
                        break;
#endif
                if (addr >= vma->vm_start && addr < vma->vm_end) {
                        count_vm_vmacache_event(VMACACHE_FIND_HITS);
                        return vma;
                }
        }

        return NULL;
}

#ifndef CONFIG_MMU
/*
 * Look up a vma that spans exactly [start, end) in current's vmacache. All
 * VMACACHE_SIZE slots are probed, starting at the slot @start hashes to and
 * wrapping around. Returns the cached vma or NULL on a miss.
 */
struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        int slot = VMACACHE_HASH(start);
        int remaining = VMACACHE_SIZE;

        count_vm_vmacache_event(VMACACHE_FIND_CALLS);

        if (!vmacache_valid(mm))
                return NULL;

        while (remaining-- > 0) {
                struct vm_area_struct *vma = current->vmacache.vmas[slot];

                if (vma && vma->vm_start == start && vma->vm_end == end) {
                        count_vm_vmacache_event(VMACACHE_FIND_HITS);
                        return vma;
                }
                slot = (slot + 1 == VMACACHE_SIZE) ? 0 : slot + 1;
        }

        return NULL;
}
#endif




















































































































    1 































    1 



























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Security server interface.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 *
 */

#ifndef _SELINUX_SECURITY_H_
#define _SELINUX_SECURITY_H_

#include <linux/compiler.h>
#include <linux/dcache.h>
#include <linux/magic.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "flask.h"
#include "policycap.h"

#define SECSID_NULL                        0x00000000 /* unspecified SID */
#define SECSID_WILD                        0xffffffff /* wildcard SID */
#define SECCLASS_NULL                        0x0000 /* no class */

/* Identify specific policy version changes */
#define POLICYDB_VERSION_BASE                15
#define POLICYDB_VERSION_BOOL                16
#define POLICYDB_VERSION_IPV6                17
#define POLICYDB_VERSION_NLCLASS        18
#define POLICYDB_VERSION_VALIDATETRANS        19
#define POLICYDB_VERSION_MLS                19
#define POLICYDB_VERSION_AVTAB                20
#define POLICYDB_VERSION_RANGETRANS        21
#define POLICYDB_VERSION_POLCAP                22
#define POLICYDB_VERSION_PERMISSIVE        23
#define POLICYDB_VERSION_BOUNDARY        24
#define POLICYDB_VERSION_FILENAME_TRANS        25
#define POLICYDB_VERSION_ROLETRANS        26
#define POLICYDB_VERSION_NEW_OBJECT_DEFAULTS        27
#define POLICYDB_VERSION_DEFAULT_TYPE        28
#define POLICYDB_VERSION_CONSTRAINT_NAMES        29
#define POLICYDB_VERSION_XPERMS_IOCTL        30
#define POLICYDB_VERSION_INFINIBAND                31
#define POLICYDB_VERSION_GLBLUB                32
#define POLICYDB_VERSION_COMP_FTRANS        33 /* compressed filename transitions */

/* Range of policy versions we understand */
#define POLICYDB_VERSION_MIN   POLICYDB_VERSION_BASE
#define POLICYDB_VERSION_MAX   POLICYDB_VERSION_COMP_FTRANS

/* Mask for just the mount related flags */
#define SE_MNTMASK        0x0f
/* Super block security struct flags for mount options */
/* BE CAREFUL, these need to be the low order bits for selinux_get_mnt_opts */
#define CONTEXT_MNT        0x01
#define FSCONTEXT_MNT        0x02
#define ROOTCONTEXT_MNT        0x04
#define DEFCONTEXT_MNT        0x08
#define SBLABEL_MNT        0x10
/* Non-mount related flags */
#define SE_SBINITIALIZED        0x0100
#define SE_SBPROC                0x0200
#define SE_SBGENFS                0x0400
#define SE_SBGENFS_XATTR        0x0800

#define CONTEXT_STR        "context"
#define FSCONTEXT_STR        "fscontext"
#define ROOTCONTEXT_STR        "rootcontext"
#define DEFCONTEXT_STR        "defcontext"
#define SECLABEL_STR "seclabel"

struct netlbl_lsm_secattr;

extern int selinux_enabled_boot;

/*
 * type_datum properties
 * available at the kernel policy version >= POLICYDB_VERSION_BOUNDARY
 */
#define TYPEDATUM_PROPERTY_PRIMARY        0x0001
#define TYPEDATUM_PROPERTY_ATTRIBUTE        0x0002

/* limitation of boundary depth  */
#define POLICYDB_BOUNDS_MAXDEPTH        4

struct selinux_avc;
struct selinux_policy;

/*
 * Global SELinux state. Most flags are read and written through the
 * accessor helpers defined in this header rather than directly.
 */
struct selinux_state {
#ifdef CONFIG_SECURITY_SELINUX_DISABLE
        /* see selinux_disabled() / selinux_mark_disabled() */
        bool disabled;
#endif
#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
        /* see enforcing_enabled() / enforcing_set() */
        bool enforcing;
#endif
        /* see checkreqprot_get() / checkreqprot_set() */
        bool checkreqprot;
        /* see selinux_initialized() / selinux_mark_initialized() */
        bool initialized;
        /* indexed by POLICYDB_CAPABILITY_*; see selinux_policycap_*() */
        bool policycap[__POLICYDB_CAPABILITY_MAX];

        /* presumably the page behind selinux_kernel_status_page() -- confirm */
        struct page *status_page;
        struct mutex status_lock;

        struct selinux_avc *avc;
        struct selinux_policy __rcu *policy;
        struct mutex policy_mutex;
} __randomize_layout;

void selinux_avc_init(struct selinux_avc **avc);

extern struct selinux_state selinux_state;

/*
 * Return the state's initialized flag, read with acquire semantics to pair
 * with the release store in selinux_mark_initialized().
 */
static inline bool selinux_initialized(const struct selinux_state *state)
{
        /* do a synchronized load to avoid race conditions */
        return smp_load_acquire(&state->initialized);
}

/*
 * Set the state's initialized flag with release semantics, pairing with the
 * acquire load in selinux_initialized().
 */
static inline void selinux_mark_initialized(struct selinux_state *state)
{
        /* do a synchronized write to avoid race conditions */
        smp_store_release(&state->initialized, true);
}

#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
/* Return the current value of the enforcing flag. */
static inline bool enforcing_enabled(struct selinux_state *state)
{
        return READ_ONCE(state->enforcing);
}

/* Set the enforcing flag to @value. */
static inline void enforcing_set(struct selinux_state *state, bool value)
{
        WRITE_ONCE(state->enforcing, value);
}
#else
/* Without CONFIG_SECURITY_SELINUX_DEVELOP enforcing is always reported on. */
static inline bool enforcing_enabled(struct selinux_state *state)
{
        return true;
}

/* Without CONFIG_SECURITY_SELINUX_DEVELOP the flag does not exist: no-op. */
static inline void enforcing_set(struct selinux_state *state, bool value)
{
}
#endif

/* Return the current value of the checkreqprot flag. */
static inline bool checkreqprot_get(const struct selinux_state *state)
{
        return READ_ONCE(state->checkreqprot);
}

/* Set the checkreqprot flag to @value. */
static inline void checkreqprot_set(struct selinux_state *state, bool value)
{
        WRITE_ONCE(state->checkreqprot, value);
}

#ifdef CONFIG_SECURITY_SELINUX_DISABLE
/* Return the current value of the disabled flag. */
static inline bool selinux_disabled(struct selinux_state *state)
{
        return READ_ONCE(state->disabled);
}

/* Set the disabled flag; there is no helper here to clear it again. */
static inline void selinux_mark_disabled(struct selinux_state *state)
{
        WRITE_ONCE(state->disabled, true);
}
#else
/* Without CONFIG_SECURITY_SELINUX_DISABLE, never reported as disabled. */
static inline bool selinux_disabled(struct selinux_state *state)
{
        return false;
}
#endif

/* Return the global state's NETPEER policy capability flag. */
static inline bool selinux_policycap_netpeer(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_NETPEER]);
}

/* Return the global state's OPENPERM policy capability flag. */
static inline bool selinux_policycap_openperm(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_OPENPERM]);
}

/* Return the global state's EXTSOCKCLASS policy capability flag. */
static inline bool selinux_policycap_extsockclass(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_EXTSOCKCLASS]);
}

/* Return the global state's ALWAYSNETWORK policy capability flag. */
static inline bool selinux_policycap_alwaysnetwork(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_ALWAYSNETWORK]);
}

/* Return the global state's CGROUPSECLABEL policy capability flag. */
static inline bool selinux_policycap_cgroupseclabel(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_CGROUPSECLABEL]);
}

/* Return the global state's NNP_NOSUID_TRANSITION policy capability flag. */
static inline bool selinux_policycap_nnp_nosuid_transition(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION]);
}

/* Return the global state's GENFS_SECLABEL_SYMLINKS policy capability flag. */
static inline bool selinux_policycap_genfs_seclabel_symlinks(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS]);
}

/* Return the global state's IOCTL_SKIP_CLOEXEC policy capability flag. */
static inline bool selinux_policycap_ioctl_skip_cloexec(void)
{
        return READ_ONCE(selinux_state.policycap[POLICYDB_CAPABILITY_IOCTL_SKIP_CLOEXEC]);
}

struct selinux_policy_convert_data;

/*
 * State of a policy load in progress, passed from security_load_policy() to
 * selinux_policy_commit() or selinux_policy_cancel().
 */
struct selinux_load_state {
        struct selinux_policy *policy;
        struct selinux_policy_convert_data *convert_data;
};

int security_mls_enabled(struct selinux_state *state);
int security_load_policy(struct selinux_state *state,
                         void *data, size_t len,
                         struct selinux_load_state *load_state);
void selinux_policy_commit(struct selinux_state *state,
                           struct selinux_load_state *load_state);
void selinux_policy_cancel(struct selinux_state *state,
                           struct selinux_load_state *load_state);
int security_read_policy(struct selinux_state *state,
                         void **data, size_t *len);

int security_policycap_supported(struct selinux_state *state,
                                 unsigned int req_cap);

#define SEL_VEC_MAX 32
/* Access vector decision, filled in by security_compute_av(). */
struct av_decision {
        u32 allowed;
        u32 auditallow;
        u32 auditdeny;
        u32 seqno;
        u32 flags;        /* AVD_FLAGS_* */
};

#define XPERMS_ALLOWED 1
#define XPERMS_AUDITALLOW 2
#define XPERMS_DONTAUDIT 4

/*
 * Extended-permission bitmap helpers. @perms is an array of 32-bit words and
 * @x a bit number: the bit lives in word x >> 5 at position x & 0x1f.
 *
 * Arguments are parenthesized so arbitrary expressions can be passed, and
 * the mask is built from an unsigned constant so that setting bit 31 does
 * not shift into the sign bit of an int. @x is evaluated twice; do not pass
 * expressions with side effects.
 */
#define security_xperm_set(perms, x) ((perms)[(x) >> 5] |= 1U << ((x) & 0x1f))
#define security_xperm_test(perms, x) (1 & ((perms)[(x) >> 5] >> ((x) & 0x1f)))
/*
 * 256-bit bitmap (8 x 32 bits), in the layout addressed by
 * security_xperm_set() / security_xperm_test().
 */
struct extended_perms_data {
        u32 p[8];
};

/*
 * Extended-permission decision, filled in by
 * security_compute_xperms_decision().
 */
struct extended_perms_decision {
        u8 used;        /* presumably a mask of XPERMS_* flags -- confirm */
        u8 driver;
        struct extended_perms_data *allowed;
        struct extended_perms_data *auditallow;
        struct extended_perms_data *dontaudit;
};

/* Extended-permission summary returned by security_compute_av(). */
struct extended_perms {
        u16 len;        /* length associated decision chain */
        struct extended_perms_data drivers; /* flag drivers that are used */
};

/* definitions of av_decision.flags */
#define AVD_FLAGS_PERMISSIVE        0x0001

void security_compute_av(struct selinux_state *state,
                         u32 ssid, u32 tsid,
                         u16 tclass, struct av_decision *avd,
                         struct extended_perms *xperms);

void security_compute_xperms_decision(struct selinux_state *state,
                                      u32 ssid, u32 tsid, u16 tclass,
                                      u8 driver,
                                      struct extended_perms_decision *xpermd);

void security_compute_av_user(struct selinux_state *state,
                              u32 ssid, u32 tsid,
                              u16 tclass, struct av_decision *avd);

int security_transition_sid(struct selinux_state *state,
                            u32 ssid, u32 tsid, u16 tclass,
                            const struct qstr *qstr, u32 *out_sid);

int security_transition_sid_user(struct selinux_state *state,
                                 u32 ssid, u32 tsid, u16 tclass,
                                 const char *objname, u32 *out_sid);

int security_member_sid(struct selinux_state *state, u32 ssid, u32 tsid,
                        u16 tclass, u32 *out_sid);

int security_change_sid(struct selinux_state *state, u32 ssid, u32 tsid,
                        u16 tclass, u32 *out_sid);

int security_sid_to_context(struct selinux_state *state, u32 sid,
                            char **scontext, u32 *scontext_len);

int security_sid_to_context_force(struct selinux_state *state,
                                  u32 sid, char **scontext, u32 *scontext_len);

int security_sid_to_context_inval(struct selinux_state *state,
                                  u32 sid, char **scontext, u32 *scontext_len);

int security_context_to_sid(struct selinux_state *state,
                            const char *scontext, u32 scontext_len,
                            u32 *out_sid, gfp_t gfp);

int security_context_str_to_sid(struct selinux_state *state,
                                const char *scontext, u32 *out_sid, gfp_t gfp);

int security_context_to_sid_default(struct selinux_state *state,
                                    const char *scontext, u32 scontext_len,
                                    u32 *out_sid, u32 def_sid, gfp_t gfp_flags);

int security_context_to_sid_force(struct selinux_state *state,
                                  const char *scontext, u32 scontext_len,
                                  u32 *sid);

int security_get_user_sids(struct selinux_state *state,
                           u32 callsid, char *username,
                           u32 **sids, u32 *nel);

int security_port_sid(struct selinux_state *state,
                      u8 protocol, u16 port, u32 *out_sid);

int security_ib_pkey_sid(struct selinux_state *state,
                         u64 subnet_prefix, u16 pkey_num, u32 *out_sid);

int security_ib_endport_sid(struct selinux_state *state,
                            const char *dev_name, u8 port_num, u32 *out_sid);

int security_netif_sid(struct selinux_state *state,
                       char *name, u32 *if_sid);

int security_node_sid(struct selinux_state *state,
                      u16 domain, void *addr, u32 addrlen,
                      u32 *out_sid);

int security_validate_transition(struct selinux_state *state,
                                 u32 oldsid, u32 newsid, u32 tasksid,
                                 u16 tclass);

int security_validate_transition_user(struct selinux_state *state,
                                      u32 oldsid, u32 newsid, u32 tasksid,
                                      u16 tclass);

int security_bounded_transition(struct selinux_state *state,
                                u32 oldsid, u32 newsid);

int security_sid_mls_copy(struct selinux_state *state,
                          u32 sid, u32 mls_sid, u32 *new_sid);

int security_net_peersid_resolve(struct selinux_state *state,
                                 u32 nlbl_sid, u32 nlbl_type,
                                 u32 xfrm_sid,
                                 u32 *peer_sid);

int security_get_classes(struct selinux_policy *policy,
                         char ***classes, int *nclasses);
int security_get_permissions(struct selinux_policy *policy,
                             char *class, char ***perms, int *nperms);
int security_get_reject_unknown(struct selinux_state *state);
int security_get_allow_unknown(struct selinux_state *state);

#define SECURITY_FS_USE_XATTR                1 /* use xattr */
#define SECURITY_FS_USE_TRANS                2 /* use transition SIDs, e.g. devpts/tmpfs */
#define SECURITY_FS_USE_TASK                3 /* use task SIDs, e.g. pipefs/sockfs */
#define SECURITY_FS_USE_GENFS                4 /* use the genfs support */
#define SECURITY_FS_USE_NONE                5 /* no labeling support */
#define SECURITY_FS_USE_MNTPOINT        6 /* use mountpoint labeling */
#define SECURITY_FS_USE_NATIVE                7 /* use native label support */
#define SECURITY_FS_USE_MAX                7 /* Highest SECURITY_FS_USE_XXX */

int security_fs_use(struct selinux_state *state, struct super_block *sb);

int security_genfs_sid(struct selinux_state *state,
                       const char *fstype, char *name, u16 sclass,
                       u32 *sid);

int selinux_policy_genfs_sid(struct selinux_policy *policy,
                       const char *fstype, char *name, u16 sclass,
                       u32 *sid);

/*
 * NetLabel <-> SID conversion.
 *
 * With CONFIG_NETLABEL the real functions are declared here and defined
 * elsewhere.  Without it, inline stubs are provided that always fail:
 * the secattr-to-SID direction returns -EIDRM and the SID-to-secattr
 * direction returns -ENOENT; neither stub touches its arguments.
 */
#ifdef CONFIG_NETLABEL
int security_netlbl_secattr_to_sid(struct selinux_state *state,
                                   struct netlbl_lsm_secattr *secattr,
                                   u32 *sid);

int security_netlbl_sid_to_secattr(struct selinux_state *state,
                                   u32 sid,
                                   struct netlbl_lsm_secattr *secattr);
#else
/* Stub: always fails with -EIDRM; *sid is left unmodified. */
static inline int security_netlbl_secattr_to_sid(struct selinux_state *state,
                                            struct netlbl_lsm_secattr *secattr,
                                            u32 *sid)
{
        return -EIDRM;
}

/* Stub: always fails with -ENOENT; *secattr is left unmodified. */
static inline int security_netlbl_sid_to_secattr(struct selinux_state *state,
                                         u32 sid,
                                         struct netlbl_lsm_secattr *secattr)
{
        return -ENOENT;
}
#endif /* CONFIG_NETLABEL */

/*
 * Takes only a SID (no selinux_state) and returns a constant string;
 * presumably the context string of an initial SID - confirm.
 */
const char *security_get_initial_sid_context(u32 sid);

/*
 * status notifier using mmap interface
 *
 * selinux_kernel_status_page() hands back a struct page for the given
 * state.
 */
extern struct page *selinux_kernel_status_page(struct selinux_state *state);

/*
 * Status record made of five u32 fields.  The struct is declared __packed.
 * NOTE(review): presumably this is the layout of the page returned by
 * selinux_kernel_status_page() - confirm.
 */
#define SELINUX_KERNEL_STATUS_VERSION        1
struct selinux_kernel_status {
        u32        version;        /* version number of this structure */
        u32        sequence;        /* sequence number of seqlock logic */
        u32        enforcing;        /* current setting of enforcing mode */
        u32        policyload;        /* times of policy reloaded */
        u32        deny_unknown;        /* current setting of deny_unknown */
        /*
         * The version > 0 supports above members.
         */
} __packed;

/*
 * Status updaters: each takes the state plus the new value (enforcing flag
 * or policy sequence number).
 */
extern void selinux_status_update_setenforce(struct selinux_state *state,
                                             int enforcing);
extern void selinux_status_update_policyload(struct selinux_state *state,
                                             int seqno);
/* Lifecycle hooks and selinuxfs objects defined elsewhere. */
extern void selinux_complete_init(void);
extern int selinux_disable(struct selinux_state *state);
extern void exit_sel_fs(void);
extern struct path selinux_null;
extern struct vfsmount *selinuxfs_mount;
/* Notifications for enforcing-mode changes and policy loads. */
extern void selnl_notify_setenforce(int val);
extern void selnl_notify_policyload(u32 seqno);
/* Looks up @nlmsg_type for class @sclass; the result is stored via @perm. */
extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm);

/* Cache initializers and sidtab hash statistics (written into @page). */
extern void avtab_cache_init(void);
extern void ebitmap_cache_init(void);
extern void hashtab_cache_init(void);
extern int security_sidtab_hash_stats(struct selinux_state *state, char *page);

#endif /* _SELINUX_SECURITY_H_ */




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BH_H
#define _LINUX_BH_H

#include <linux/preempt.h>

/*
 * __local_bh_disable_ip - low-level primitive behind local_bh_disable().
 *
 * With CONFIG_TRACE_IRQFLAGS the function is defined out of line and only
 * declared here.  Otherwise it is inlined: @cnt is added to the preempt
 * count, followed by barrier().  The inline variant does not use @ip.
 */
#ifdef CONFIG_TRACE_IRQFLAGS
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
        preempt_count_add(cnt);
        /* keep the count update ahead of the code that follows */
        barrier();
}
#endif

/*
 * local_bh_disable - disable bottom halves.
 *
 * Forwards to __local_bh_disable_ip() with the caller's instruction
 * pointer (_THIS_IP_) and a count of SOFTIRQ_DISABLE_OFFSET.
 */
static inline void local_bh_disable(void)
{
        __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * Out-of-line enable primitives; __local_bh_enable_ip() takes the same
 * (ip, cnt) pair as __local_bh_disable_ip().
 */
extern void _local_bh_enable(void);
extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);

/*
 * local_bh_enable_ip - enable bottom halves on behalf of address @ip.
 *
 * Forwards to __local_bh_enable_ip() with the caller-supplied @ip and a
 * count of SOFTIRQ_DISABLE_OFFSET.
 */
static inline void local_bh_enable_ip(unsigned long ip)
{
        __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * local_bh_enable - counterpart of local_bh_disable().
 *
 * Forwards to __local_bh_enable_ip() with _THIS_IP_ and the same
 * SOFTIRQ_DISABLE_OFFSET count that local_bh_disable() passes.
 */
static inline void local_bh_enable(void)
{
        __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

#endif /* _LINUX_BH_H */
















































































    1 


    1 






    1 





    1 



    1 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/anon_inodes.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 *  Thanks to Arnd Bergmann for code review and suggestions.
 *  More changes for Thomas Gleixner suggestions.
 *
 */

#include <linux/cred.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/anon_inodes.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>

/*
 * Mount of anon_inodefs and the single inode shared by every anonymous
 * file.  Both are assigned in anon_inode_init().
 */
static struct vfsmount *anon_inode_mnt __read_mostly;
static struct inode *anon_inode_inode;

/*
 * anon_inodefs_dname() is called from d_path().
 *
 * Formats the dentry's name as "anon_inode:<name>" into @buffer (at most
 * @buflen bytes) via dynamic_dname() and returns whatever that helper
 * returns.
 */
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
                                dentry->d_name.name);
}

/* Dentry operations for anon_inodefs: only the name formatter is overridden. */
static const struct dentry_operations anon_inodefs_dentry_operations = {
        .d_dname        = anon_inodefs_dname,
};

static int anon_inodefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
        if (!ctx)
                return -ENOMEM;
        ctx->dops = &anon_inodefs_dentry_operations;
        return 0;
}

/*
 * Filesystem type for "anon_inodefs".  Contexts are initialised by
 * anon_inodefs_init_fs_context() and superblocks are torn down with
 * kill_anon_super().
 */
static struct file_system_type anon_inode_fs_type = {
        .name                = "anon_inodefs",
        .init_fs_context = anon_inodefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};

/**
 * anon_inode_getfile - create a file on top of the shared anonymous inode
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags; only the O_ACCMODE and O_NONBLOCK bits are used
 *
 * Every file produced here hangs off the one anon_inode inode, so callers
 * that do not need a full-fledged inode avoid allocating one and share the
 * file/inode/dentry setup code.  A module reference on @fops->owner (when
 * set) and an inode reference are taken; both are dropped again if the file
 * cannot be allocated.
 *
 * Returns the new file, or an error pointer: -ENODEV if the shared inode is
 * itself an error pointer, -ENOENT if the owning module cannot be pinned,
 * otherwise the error reported by alloc_file_pseudo().
 */
struct file *anon_inode_getfile(const char *name,
                                const struct file_operations *fops,
                                void *priv, int flags)
{
        struct file *anon_file;

        if (IS_ERR(anon_inode_inode))
                return ERR_PTR(-ENODEV);

        if (fops->owner && !try_module_get(fops->owner))
                return ERR_PTR(-ENOENT);

        /*
         * The anon_inode inode always has a non-zero reference count,
         * which is what makes a plain ihold() safe here.
         */
        ihold(anon_inode_inode);
        anon_file = alloc_file_pseudo(anon_inode_inode, anon_inode_mnt, name,
                                      flags & (O_ACCMODE | O_NONBLOCK), fops);
        if (IS_ERR(anon_file)) {
                /* undo the inode and module references taken above */
                iput(anon_inode_inode);
                module_put(fops->owner);
                return anon_file;
        }

        anon_file->f_mapping = anon_inode_inode->i_mapping;
        anon_file->private_data = priv;

        return anon_file;
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);

/**
 * anon_inode_getfd - create an anonymous-inode file and install it in a
 *                    new file descriptor
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 *
 * Reserves an unused descriptor with @flags, builds the file through
 * anon_inode_getfile() and installs it.  As with anon_inode_getfile(), all
 * files created this way share a single inode.  If the file cannot be
 * created the reserved descriptor is released again.
 *
 * Returns the new descriptor or a negative error code.
 */
int anon_inode_getfd(const char *name, const struct file_operations *fops,
                     void *priv, int flags)
{
        struct file *anon_file;
        int fd;

        fd = get_unused_fd_flags(flags);
        if (fd < 0)
                return fd;

        anon_file = anon_inode_getfile(name, fops, priv, flags);
        if (IS_ERR(anon_file)) {
                put_unused_fd(fd);
                return PTR_ERR(anon_file);
        }

        fd_install(fd, anon_file);

        return fd;
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);

/*
 * Boot-time setup, registered through fs_initcall(): mount anon_inodefs
 * in-kernel and allocate the one inode that all anonymous files share.
 * Failure of either step is fatal and ends in panic(); otherwise 0 is
 * returned.
 */
static int __init anon_inode_init(void)
{
        anon_inode_mnt = kern_mount(&anon_inode_fs_type);
        if (IS_ERR(anon_inode_mnt))
                panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));

        /* the shared inode lives on the superblock of the mount created above */
        anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
        if (IS_ERR(anon_inode_inode))
                panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));

        return 0;
}

fs_initcall(anon_inode_init);



























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
// SPDX-License-Identifier: GPL-2.0
/*
 * This is a maximally equidistributed combined Tausworthe generator
 * based on code from GNU Scientific Library 1.5 (30 Jun 2004)
 *
 * lfsr113 version:
 *
 * x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n)
 *
 * s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n <<  6) ^ s1_n) >> 13))
 * s2_{n+1} = (((s2_n & 4294967288) <<  2) ^ (((s2_n <<  2) ^ s2_n) >> 27))
 * s3_{n+1} = (((s3_n & 4294967280) <<  7) ^ (((s3_n << 13) ^ s3_n) >> 21))
 * s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n <<  3) ^ s4_n) >> 12))
 *
 * The period of this generator is about 2^113 (see erratum paper).
 *
 * From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe
 * Generators", Mathematics of Computation, 65, 213 (1996), 203--213:
 * http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps
 * ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps
 *
 * There is an erratum in the paper "Tables of Maximally Equidistributed
 * Combined LFSR Generators", Mathematics of Computation, 68, 225 (1999),
 * 261--269: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps
 *
 *      ... the k_j most significant bits of z_j must be non-zero,
 *      for each j. (Note: this restriction also applies to the
 *      computer code given in [4], but was mistakenly not mentioned
 *      in that paper.)
 *
 * This affects the seeding procedure by imposing the requirement
 * s1 > 1, s2 > 7, s3 > 15, s4 > 127.
 */

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <asm/unaligned.h>

/**
 *        prandom_u32_state - seeded pseudo-random number generator.
 *        @state: pointer to state structure holding seeded state.
 *
 *        Advances each of the four state words by one Tausworthe step and
 *        returns the XOR of the updated words.
 *
 *        This is used for pseudo-randomness with no outside seeding.
 *        For more random results, use prandom_u32().
 */
u32 prandom_u32_state(struct rnd_state *state)
{
/*
 * One Tausworthe step on word s: a and b are the shifts of the feedback
 * term, c masks the bits that survive, d is the left shift of the masked
 * word.  Every argument and the whole expansion are parenthesized so the
 * macro (which stays defined after this function) is safe inside any
 * larger expression.
 */
#define TAUSWORTHE(s, a, b, c, d) \
        ((((s) & (c)) << (d)) ^ ((((s) << (a)) ^ (s)) >> (b)))
        state->s1 = TAUSWORTHE(state->s1,  6U, 13U, 4294967294U, 18U);
        state->s2 = TAUSWORTHE(state->s2,  2U, 27U, 4294967288U,  2U);
        state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U,  7U);
        state->s4 = TAUSWORTHE(state->s4,  3U, 12U, 4294967168U, 13U);

        return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4);
}
EXPORT_SYMBOL(prandom_u32_state);

/**
 *        prandom_bytes_state - fill a buffer from a caller-owned generator state
 *
 *        @state: pointer to state structure holding seeded state.
 *        @buf: where to copy the pseudo-random bytes to
 *        @bytes: the requested number of bytes
 *
 *        Whole 32-bit words are stored with put_unaligned(), so @buf needs
 *        no particular alignment.  A trailing remainder shorter than a word
 *        is taken from one more generator output, least significant byte
 *        first.
 *
 *        This is used for pseudo-randomness with no outside seeding.
 *        For more random results, use prandom_bytes().
 */
void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes)
{
        u8 *out = buf;
        u32 tail;

        for (; bytes >= sizeof(u32); bytes -= sizeof(u32)) {
                put_unaligned(prandom_u32_state(state), (u32 *) out);
                out += sizeof(u32);
        }

        if (bytes == 0)
                return;

        /* one extra word supplies the remaining 1..3 bytes */
        tail = prandom_u32_state(state);
        while (bytes > 0) {
                *out++ = (u8) tail;
                tail >>= BITS_PER_BYTE;
                bytes--;
        }
}
EXPORT_SYMBOL(prandom_bytes_state);

/*
 * Step the generator ten times and discard the outputs, so that a freshly
 * seeded state satisfies the recurrence condition before it is used.
 */
static void prandom_warmup(struct rnd_state *state)
{
        int round;

        for (round = 0; round < 10; round++)
                prandom_u32_state(state);
}

/*
 * Seed the rnd_state of every possible CPU in @pcpu_state: four words from
 * get_random_bytes() are passed through __seed() with the per-word values
 * 2, 8, 16 and 128, then the state is warmed up.
 */
void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct rnd_state *rs = per_cpu_ptr(pcpu_state, cpu);
                u32 raw[4];

                get_random_bytes(raw, sizeof(raw));

                rs->s1 = __seed(raw[0],   2U);
                rs->s2 = __seed(raw[1],   8U);
                rs->s3 = __seed(raw[2],  16U);
                rs->s4 = __seed(raw[3], 128U);

                prandom_warmup(rs);
        }
}
EXPORT_SYMBOL(prandom_seed_full_state);

#ifdef CONFIG_RANDOM32_SELFTEST
/*
 * Seed boundary vectors: for each @seed, @result is the first output
 * expected from prandom_u32_state() after prandom_seed_early() (without
 * hardware mixing) and prandom_warmup(), as checked by
 * prandom_state_selftest().
 */
static struct prandom_test1 {
        u32 seed;
        u32 result;
} test1[] = {
        { 1U, 3484351685U },
        { 2U, 2623130059U },
        { 3U, 3125133893U },
        { 4U,  984847254U },
};

/*
 * Long-run vectors: after seeding with @seed and warming up,
 * prandom_state_selftest() discards @iteration - 1 outputs and expects the
 * next one to equal @result.
 */
static struct prandom_test2 {
        u32 seed;
        u32 iteration;
        u32 result;
} test2[] = {
        /* Test cases against taus113 from GSL library. */
        {  931557656U, 959U, 2975593782U },
        { 1339693295U, 876U, 3887776532U },
        { 1545556285U, 961U, 1615538833U },
        {  601730776U, 723U, 1776162651U },
        { 1027516047U, 687U,  511983079U },
        {  416526298U, 700U,  916156552U },
        { 1395522032U, 652U, 2222063676U },
        {  366221443U, 617U, 2992857763U },
        { 1539836965U, 714U, 3783265725U },
        {  556206671U, 994U,  799626459U },
        {  684907218U, 799U,  367789491U },
        { 2121230701U, 931U, 2115467001U },
        { 1668516451U, 644U, 3620590685U },
        {  768046066U, 883U, 2034077390U },
        { 1989159136U, 833U, 1195767305U },
        {  536585145U, 996U, 3577259204U },
        { 1008129373U, 642U, 1478080776U },
        { 1740775604U, 939U, 1264980372U },
        { 1967883163U, 508U,   10734624U },
        { 1923019697U, 730U, 3821419629U },
        {  442079932U, 560U, 3440032343U },
        { 1961302714U, 845U,  841962572U },
        { 2030205964U, 962U, 1325144227U },
        { 1160407529U, 507U,  240940858U },
        {  635482502U, 779U, 4200489746U },
        { 1252788931U, 699U,  867195434U },
        { 1961817131U, 719U,  668237657U },
        { 1071468216U, 983U,  917876630U },
        { 1281848367U, 932U, 1003100039U },
        {  582537119U, 780U, 1127273778U },
        { 1973672777U, 853U, 1071368872U },
        { 1896756996U, 762U, 1127851055U },
        {  847917054U, 500U, 1717499075U },
        { 1240520510U, 951U, 2849576657U },
        { 1685071682U, 567U, 1961810396U },
        { 1516232129U, 557U,    3173877U },
        { 1208118903U, 612U, 1613145022U },
        { 1817269927U, 693U, 4279122573U },
        { 1510091701U, 717U,  638191229U },
        {  365916850U, 807U,  600424314U },
        {  399324359U, 702U, 1803598116U },
        { 1318480274U, 779U, 2074237022U },
        {  697758115U, 840U, 1483639402U },
        { 1696507773U, 840U,  577415447U },
        { 2081979121U, 981U, 3041486449U },
        {  955646687U, 742U, 3846494357U },
        { 1250683506U, 749U,  836419859U },
        {  595003102U, 534U,  366794109U },
        {   47485338U, 558U, 3521120834U },
        {  619433479U, 610U, 3991783875U },
        {  704096520U, 518U, 4139493852U },
        { 1712224984U, 606U, 2393312003U },
        { 1318233152U, 922U, 3880361134U },
        {  855572992U, 761U, 1472974787U },
        {   64721421U, 703U,  683860550U },
        {  678931758U, 840U,  380616043U },
        {  692711973U, 778U, 1382361947U },
        {  677703619U, 530U, 2826914161U },
        {   92393223U, 586U, 1522128471U },
        { 1222592920U, 743U, 3466726667U },
        {  358288986U, 695U, 1091956998U },
        { 1935056945U, 958U,  514864477U },
        {  735675993U, 990U, 1294239989U },
        { 1560089402U, 897U, 2238551287U },
        {   70616361U, 829U,   22483098U },
        {  368234700U, 731U, 2913875084U },
        {   20221190U, 879U, 1564152970U },
        {  539444654U, 682U, 1835141259U },
        { 1314987297U, 840U, 1801114136U },
        { 2019295544U, 645U, 3286438930U },
        {  469023838U, 716U, 1637918202U },
        { 1843754496U, 653U, 2562092152U },
        {  400672036U, 809U, 4264212785U },
        {  404722249U, 965U, 2704116999U },
        {  600702209U, 758U,  584979986U },
        {  519953954U, 667U, 2574436237U },
        { 1658071126U, 694U, 2214569490U },
        {  420480037U, 749U, 3430010866U },
        {  690103647U, 969U, 3700758083U },
        { 1029424799U, 937U, 3787746841U },
        { 2012608669U, 506U, 3362628973U },
        { 1535432887U, 998U,   42610943U },
        { 1330635533U, 857U, 3040806504U },
        { 1223800550U, 539U, 3954229517U },
        { 1322411537U, 680U, 3223250324U },
        { 1877847898U, 945U, 2915147143U },
        { 1646356099U, 874U,  965988280U },
        {  805687536U, 744U, 4032277920U },
        { 1948093210U, 633U, 1346597684U },
        {  392609744U, 783U, 1636083295U },
        {  690241304U, 770U, 1201031298U },
        { 1360302965U, 696U, 1665394461U },
        { 1220090946U, 780U, 1316922812U },
        {  447092251U, 500U, 3438743375U },
        { 1613868791U, 592U,  828546883U },
        {  523430951U, 548U, 2552392304U },
        {  726692899U, 810U, 1656872867U },
        { 1364340021U, 836U, 3710513486U },
        { 1986257729U, 931U,  935013962U },
        {  407983964U, 921U,  728767059U },
};

/*
 * Fetch one word from the architecture's random sources: the seed source is
 * tried first and the regular one only if that fails.  Returns 0 when
 * neither call succeeds, since val keeps its initial value (assuming a
 * failed call leaves it untouched - confirm).
 */
static u32 __extract_hwseed(void)
{
        unsigned int val = 0;

        /*
         * Short-circuit ||: the second source is queried only on failure
         * of the first.  The combined result is deliberately discarded.
         */
        (void)(arch_get_random_seed_int(&val) ||
               arch_get_random_int(&val));

        return val;
}

/*
 * Derive all four state words from a single 32-bit @seed.
 *
 * The words are chained: s1 comes from LCG(seed), and each following word
 * from LCG() of the word just written, so the assignment order matters.
 * When @mix_with_hwseed is true every word is additionally XORed with a
 * fresh __extract_hwseed() value; otherwise the XOR term is 0.
 * NOTE(review): the second __seed() argument (2, 8, 16, 128) presumably
 * enforces the per-word minimum from the file header (s1 > 1, s2 > 7,
 * s3 > 15, s4 > 127) - confirm against __seed().
 */
static void prandom_seed_early(struct rnd_state *state, u32 seed,
                               bool mix_with_hwseed)
{
#define LCG(x)         ((x) * 69069U)        /* super-duper LCG */
#define HWSEED() (mix_with_hwseed ? __extract_hwseed() : 0)
        state->s1 = __seed(HWSEED() ^ LCG(seed),        2U);
        state->s2 = __seed(HWSEED() ^ LCG(state->s1),   8U);
        state->s3 = __seed(HWSEED() ^ LCG(state->s2),  16U);
        state->s4 = __seed(HWSEED() ^ LCG(state->s3), 128U);
}

/*
 * Boot-time self test of the Tausworthe generator, run as a core initcall.
 *
 * Pass 1 checks the first post-warmup output for every test1[] seed.
 * Pass 2 checks, for every test2[] entry, the output produced at the
 * recorded iteration.  Outcomes are only logged; the function always
 * returns 0.
 */
static int __init prandom_state_selftest(void)
{
        int idx, step, failed = 0, total = 0;
        bool boundary_failed = false;

        for (idx = 0; idx < ARRAY_SIZE(test1); idx++) {
                struct rnd_state st;

                prandom_seed_early(&st, test1[idx].seed, false);
                prandom_warmup(&st);

                if (prandom_u32_state(&st) != test1[idx].result)
                        boundary_failed = true;
        }

        if (!boundary_failed)
                pr_info("prandom: seed boundary self test passed\n");
        else
                pr_warn("prandom: seed boundary self test failed\n");

        for (idx = 0; idx < ARRAY_SIZE(test2); idx++) {
                struct rnd_state st;

                prandom_seed_early(&st, test2[idx].seed, false);
                prandom_warmup(&st);

                /* throw away everything before the output under test */
                for (step = 0; step < test2[idx].iteration - 1; step++)
                        prandom_u32_state(&st);

                if (prandom_u32_state(&st) != test2[idx].result)
                        failed++;

                total++;
                cond_resched();
        }

        if (!failed)
                pr_info("prandom: %d self tests passed\n", total);
        else
                pr_warn("prandom: %d/%d self tests failed\n", failed, total);
        return 0;
}
core_initcall(prandom_state_selftest);
#endif

/*
 * The prandom_u32() implementation is now completely separate from the
 * prandom_state() functions, which are retained (for now) for compatibility.
 *
 * Because of (ab)use in the networking code for choosing random TCP/UDP port
 * numbers, which open DoS possibilities if guessable, we want something
 * stronger than a standard PRNG.  But the performance requirements of
 * the network code do not allow robust crypto for this application.
 *
 * So this is a homebrew Junior Spaceman implementation, based on the
 * lowest-latency trustworthy crypto primitive available, SipHash.
 * (The authors of SipHash have not been consulted about this abuse of
 * their work.)
 *
 * Standard SipHash-2-4 uses 2n+4 rounds to hash n words of input to
 * one word of output.  This abbreviated version uses 2 rounds per word
 * of output.
 */

/*
 * Generator state: the four words that PRND_SIPROUND() operates on.  Each
 * is an unsigned long, so the state is four machine words wide.
 */
struct siprand_state {
        unsigned long v0;
        unsigned long v1;
        unsigned long v2;
        unsigned long v3;
};

/* Per-CPU generator state used by prandom_u32() and prandom_bytes(). */
static DEFINE_PER_CPU(struct siprand_state, net_rand_state) __latent_entropy;
/*
 * Per-CPU noise word that siprand_u32() folds into every output; exported,
 * so it can be referenced from outside this file.
 */
DEFINE_PER_CPU(unsigned long, net_rand_noise);
EXPORT_PER_CPU_SYMBOL(net_rand_noise);

/*
 * This is the core CPRNG function.  As "pseudorandom", this is not used
 * for truly valuable things, just intended to be a PITA to guess.
 * For maximum speed, we do just two SipHash rounds per word.  This is
 * the same rate as 4 rounds per 64 bits that SipHash normally uses,
 * so hopefully it's reasonably secure.
 *
 * There are two changes from the official SipHash finalization:
 * - We omit some constants XORed with v2 in the SipHash spec as irrelevant;
 *   they are there only to make the output rounds distinct from the input
 *   rounds, and this application has no input rounds.
 * - Rather than returning v0^v1^v2^v3, return v1+v3.
 *   If you look at the SipHash round, the last operation on v3 is
 *   "v3 ^= v0", so "v0 ^ v3" just undoes that, a waste of time.
 *   Likewise "v1 ^= v2".  (The rotate of v2 makes a difference, but
 *   it still cancels out half of the bits in v2 for no benefit.)
 *   Second, since the last combining operation was xor, continue the
 *   pattern of alternating xor/add for a tiny bit of extra non-linearity.
 */
static inline u32 siprand_u32(struct siprand_state *s)
{
        unsigned long a = s->v0;
        unsigned long b = s->v1;
        unsigned long c = s->v2;
        unsigned long d = s->v3;
        unsigned long noise = raw_cpu_read(net_rand_noise);

        /* fold the noise word in around two rounds */
        d ^= noise;
        PRND_SIPROUND(a, b, c, d);
        PRND_SIPROUND(a, b, c, d);
        a ^= noise;

        /* write the advanced state back before producing the output */
        s->v0 = a;
        s->v1 = b;
        s->v2 = c;
        s->v3 = d;

        return b + d;
}


/**
 *        prandom_u32 - pseudo random number generator
 *
 *        Produces one 32 bit pseudo-random number from the current CPU's
 *        net_rand_state, using a fast algorithm suitable for simulation.
 *        The state is accessed between get_cpu_ptr() and put_cpu_ptr().
 *        This algorithm is NOT considered safe for cryptographic use.
 */
u32 prandom_u32(void)
{
        struct siprand_state *cpu_state;
        u32 value;

        cpu_state = get_cpu_ptr(&net_rand_state);
        value = siprand_u32(cpu_state);
        put_cpu_ptr(&net_rand_state);

        return value;
}
EXPORT_SYMBOL(prandom_u32);

/**
 *        prandom_bytes - get the requested number of pseudo-random bytes
 *        @buf: where to copy the pseudo-random bytes to
 *        @bytes: the requested number of bytes
 *
 *        The whole buffer is filled from the current CPU's net_rand_state
 *        between one get_cpu_ptr()/put_cpu_ptr() pair.  Full words are
 *        stored with put_unaligned(); a trailing remainder is taken from
 *        one more output, least significant byte first.
 */
void prandom_bytes(void *buf, size_t bytes)
{
        struct siprand_state *cpu_state = get_cpu_ptr(&net_rand_state);
        u8 *out = buf;

        for (; bytes >= sizeof(u32); bytes -= sizeof(u32)) {
                put_unaligned(siprand_u32(cpu_state), (u32 *)out);
                out += sizeof(u32);
        }

        if (bytes > 0) {
                u32 tail = siprand_u32(cpu_state);

                for (; bytes > 0; bytes--) {
                        *out++ = (u8)tail;
                        tail >>= BITS_PER_BYTE;
                }
        }
        put_cpu_ptr(&net_rand_state);
}
EXPORT_SYMBOL(prandom_bytes);

/**
 *        prandom_seed - add entropy to pseudo random number generator
 *        @entropy: entropy value
 *
 *        Mixes @entropy into the device randomness pool and into the
 *        net_rand_state of every possible CPU.  The mixing step is repeated
 *        until none of the four state words is zero.
 *
 *        The "entropy" is actually our IP address (the only caller is
 *        the network code), not for unpredictability, but to ensure that
 *        different machines are initialized differently.
 */
void prandom_seed(u32 entropy)
{
        int cpu;

        add_device_randomness(&entropy, sizeof(entropy));

        for_each_possible_cpu(cpu) {
                struct siprand_state *st = per_cpu_ptr(&net_rand_state, cpu);
                unsigned long a = st->v0, b = st->v1;
                unsigned long c = st->v2, d = st->v3;

                for (;;) {
                        d ^= entropy;
                        PRND_SIPROUND(a, b, c, d);
                        PRND_SIPROUND(a, b, c, d);
                        a ^= entropy;

                        /* stop once every word is non-zero */
                        if (likely(a && b && c && d))
                                break;
                }

                WRITE_ONCE(st->v0, a);
                WRITE_ONCE(st->v1, b);
                WRITE_ONCE(st->v2, c);
                WRITE_ONCE(st->v3, d);
        }
}
EXPORT_SYMBOL(prandom_seed);

/*
 *        Core initcall that gives every possible CPU an initial, weak
 *        state so that the prandom_u32() engine can be started.
 *
 *        The starting words come from arch_get_random_long(), with jiffies
 *        and random_get_entropy() as fallbacks.  The CPU number is then
 *        folded in, one CPU after another, so each CPU ends up with a
 *        different state.
 */
static int __init prandom_init_early(void)
{
        unsigned long a, b, c, d;
        int cpu;

        if (!arch_get_random_long(&a))
                a = jiffies;
        if (!arch_get_random_long(&b))
                b = random_get_entropy();
        c = a ^ PRND_K0;
        d = b ^ PRND_K1;

        for_each_possible_cpu(cpu) {
                struct siprand_state *st = per_cpu_ptr(&net_rand_state, cpu);

                d ^= cpu;
                PRND_SIPROUND(a, b, c, d);
                PRND_SIPROUND(a, b, c, d);
                a ^= cpu;

                st->v0 = a;
                st->v1 = b;
                st->v2 = c;
                st->v3 = d;
        }

        return 0;
}
core_initcall(prandom_init_early);


/*
 * Stronger reseeding when available, and periodically thereafter.
 *
 * prandom_reseed() is declared ahead of its definition because
 * DEFINE_TIMER() needs the callback, and the callback re-arms seed_timer.
 */
static void prandom_reseed(struct timer_list *unused);

static DEFINE_TIMER(seed_timer, prandom_reseed);

/*
 * Timer callback: give every possible CPU a fresh state drawn from
 * get_random_long(), then re-arm seed_timer for the next reseed.
 * @unused is ignored.
 */
static void prandom_reseed(struct timer_list *unused)
{
        unsigned long next;
        int cpu;

        /*
         * Each CPU's PRNG is rekeyed with 128 bits of key.  The target
         * CPUs are not locked out while this happens; somewhat random
         * results are, after all, the point.
         */
        for_each_possible_cpu(cpu) {
                struct siprand_state *st;
                unsigned long a, b, c, d;
#if BITS_PER_LONG == 32
                int extra;
#endif

                a = get_random_long();
                c = a ^ PRND_K0;
                b = get_random_long();
                d = b ^ PRND_K1;
#if BITS_PER_LONG == 32
                /*
                 * Two longs are only 64 bits of key on 32-bit machines.
                 * Hash in two more words to approximate a 128-bit key;
                 * the hash is not that strong, but this rules out a
                 * trivial 64-bit brute force.
                 */
                for (extra = 0; extra < 2; extra++) {
                        unsigned long m = get_random_long();

                        d ^= m;
                        PRND_SIPROUND(a, b, c, d);
                        PRND_SIPROUND(a, b, c, d);
                        a ^= m;
                }
#endif
                /*
                 * A race between this write and the target CPU writing
                 * its own state back could in theory assemble the
                 * all-zero SipHash fixed point.  Never storing a zero
                 * word rules that out.
                 */
                if (!a)
                        a = -1ul;
                if (!b)
                        b = -1ul;
                if (!c)
                        c = -1ul;
                if (!d)
                        d = -1ul;

                st = per_cpu_ptr(&net_rand_state, cpu);
                WRITE_ONCE(st->v0, a);
                WRITE_ONCE(st->v1, b);
                WRITE_ONCE(st->v2, c);
                WRITE_ONCE(st->v3, d);
        }

        /* reseed every ~60 seconds, in [40 .. 80) interval with slack */
        next = round_jiffies(jiffies + 40 * HZ + prandom_u32_max(40 * HZ));
        mod_timer(&seed_timer, next);
}

/*
 * The random ready callback can be called from almost any interrupt.
 * To avoid worrying about whether it's safe to delay that interrupt
 * long enough to seed all CPUs, just schedule an immediate timer event.
 *
 * Notifier callback: @nb, @action and @data are all ignored.  Always
 * returns 0.
 */
static int prandom_timer_start(struct notifier_block *nb,
                               unsigned long action, void *data)
{
        /* expiry == now: the reseed runs from the timer as soon as possible */
        mod_timer(&seed_timer, jiffies);
        return 0;
}

#ifdef CONFIG_RANDOM32_SELFTEST
/* Principle: True 32-bit random numbers will all have 16 differing bits on
 * average. For each 32-bit number, there are 601M numbers differing by 16
 * bits, and 89% of the numbers differ by at least 12 bits. Note that more
 * than 16 differing bits also implies a correlation with inverted bits. Thus
 * we take 1024 random numbers and compare each of them to the other ones,
 * counting the deviation of correlated bits to 16. Constants report 32,
 * counters 32-log2(TEST_SIZE), and pure randoms, around 6 or lower. With the
 * u32 total, TEST_SIZE may be as large as 4096 samples.
 */
#define TEST_SIZE 1024
/*
 * Self test of prandom_u32(), registered as a core initcall.
 *
 * Draws TEST_SIZE samples, XORs every ordered pair of distinct samples and
 * accumulates the squared deviation of the differing-bit count from 16.
 * The outcome is only reported through the kernel log: the function
 * returns 0 whether the test passes, fails, or cannot allocate its buffer.
 */
static int __init prandom32_state_selftest(void)
{
        unsigned int x, y, bits, samples;
        u32 xor, flip;
        u32 total;
        u32 *data;

        data = kmalloc(sizeof(*data) * TEST_SIZE, GFP_KERNEL);
        /* No memory for the sample buffer: skip the test quietly. */
        if (!data)
                return 0;

        for (samples = 0; samples < TEST_SIZE; samples++)
                data[samples] = prandom_u32();

        /*
         * flip collects every bit position that differed in at least one
         * pair; total sums (bits - 16)^2 over all pairs.  bits is unsigned,
         * so bits - 16 may wrap, but the square is still correct modulo
         * 2^32.
         */
        flip = total = 0;
        for (x = 0; x < samples; x++) {
                for (y = 0; y < samples; y++) {
                        if (x == y)
                                continue;
                        xor = data[x] ^ data[y];
                        flip |= xor;
                        bits = hweight32(xor);
                        total += (bits - 16) * (bits - 16);
                }
        }

        /* We'll return the average deviation as 2*sqrt(corr/samples), which
         * is also sqrt(4*corr/samples) which provides a better resolution.
         */
        bits = int_sqrt(total / (samples * (samples - 1)) * 4);
        /*
         * ~flip is the set of bit positions that never changed across the
         * samples; data[0] & ~flip is the value those bits are stuck at.
         * Each message is kept on one source line so it can be grepped for,
         * and the opening parenthesis of the failure message is now closed.
         */
        if (bits > 6)
                pr_warn("prandom32: self test failed (at least %u bits correlated, fixed_mask=%#x fixed_value=%#x)\n",
                        bits, ~flip, data[0] & ~flip);
        else
                pr_info("prandom32: self test passed (less than %u bits correlated)\n",
                        bits+1);
        kfree(data);
        return 0;
}
core_initcall(prandom32_state_selftest);
#endif /*  CONFIG_RANDOM32_SELFTEST */

/*
 * Late initcall: arrange for periodic full reseeding to start as soon as
 * strong random numbers are available.
 *
 * Registers prandom_timer_start() as a random-ready notifier.  If the
 * registration reports -EALREADY, the callback is invoked directly and
 * success is returned; any other result is passed back unchanged.
 */
static int __init prandom_init_late(void)
{
        static struct notifier_block random_ready = {
                .notifier_call = prandom_timer_start
        };
        int ret;

        ret = register_random_ready_notifier(&random_ready);
        if (ret != -EALREADY)
                return ret;

        /* NOTE(review): -EALREADY presumably means "already ready" - confirm. */
        prandom_timer_start(&random_ready, 0, NULL);
        return 0;
}
late_initcall(prandom_init_late);







































































































































































































    1 











    1 





































    1 
















































































































































































































































































































































































































































































































    1 






    1 












    1 




















































    1 










































    1 
    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































    1 









    1 



    1 











    1 


    1 























    1 









    1 

    1 































































    1 

    1 
    1 














    1 
    1 





    1 


    1 

    1 

    1 

    1 

    1 

    1 

    1 

























    1 


    1 










    1 







    1 



    1 






    1 








    1 

























    1 
    1 







































































































































































































































































    1 

























    1 


























    1 















    1 

    1 


    1 
    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dcache.c
 *
 * Complete reimplementation
 * (C) 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/*
 * Notes on the allocation strategy:
 *
 * The dcache is a master of the icache - whenever a dcache entry
 * exists, the inode will always exist. "iput()" is done either when
 * the dcache entry is deleted or garbage collected.
 */

#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"

/*
 * Usage:
 * dcache->d_inode->i_lock protects:
 *   - i_dentry, d_u.d_alias, d_inode of aliases
 * dcache_hash_bucket lock protects:
 *   - the dcache hash table
 * s_roots bl list spinlock protects:
 *   - the s_roots list (see __d_drop)
 * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
 *   - d_name
 *   - d_lru
 *   - d_count
 *   - d_unhashed()
 *   - d_parent and d_subdirs
 *   - childrens' d_child and d_parent
 *   - d_u.d_alias, d_inode
 *
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
 *     dentry->d_sb->s_dentry_lru_lock
 *     dcache_hash_bucket lock
 *     s_roots lock
 *
 * If there is an ancestor relationship:
 * dentry->d_parent->...->d_parent->d_lock
 *   ...
 *     dentry->d_parent->d_lock
 *       dentry->d_lock
 *
 * If no ancestor relationship:
 * arbitrary, since it's serialized on rename_lock
 */
/*
 * VFS cache pressure setting, initialized to 100 and exported to GPL
 * modules.  NOTE(review): presumably adjusted through a sysctl, going by
 * the name - its consumers are not visible in this part of the file.
 */
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);

/*
 * Global seqlock.  Per the lock-ordering notes, d_lock ordering between
 * dentries that have no ancestor relationship is serialized on it.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);

EXPORT_SYMBOL(rename_lock);

/* Slab cache that dentries are handed back to by __d_free(). */
static struct kmem_cache *dentry_cache __read_mostly;

/* Ready-made qstr constants for the empty name and for "/". */
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);

/*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups. Somebody should try
 * to make this good - I've just made it work.
 *
 * This hash-function tries to avoid losing too many bits of hash
 * information, yet avoid using a prime hash-size or similar.
 */

/* Right shift that d_hash() applies to a name hash to get a bucket index. */
static unsigned int d_hash_shift __read_mostly;

/* Base of the bucket array that d_hash() indexes into. */
static struct hlist_bl_head *dentry_hashtable __read_mostly;

/*
 * Return the hash bucket for @hash: the hash is shifted right by
 * d_hash_shift and the result indexes dentry_hashtable.
 */
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
        unsigned int bucket = hash >> d_hash_shift;

        return &dentry_hashtable[bucket];
}

/* The in-lookup hash table has 1 << IN_LOOKUP_SHIFT (1024) buckets. */
#define IN_LOOKUP_SHIFT 10
static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];

/*
 * Pick the in-lookup bucket for a (parent, name hash) pair.  The parent
 * pointer, divided by L1_CACHE_BYTES, is mixed into the hash before it is
 * reduced to IN_LOOKUP_SHIFT bits with hash_32().
 */
static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
                                        unsigned int hash)
{
        unsigned long salt = (unsigned long) parent / L1_CACHE_BYTES;

        hash += salt;
        return &in_lookup_hashtable[hash_32(hash, IN_LOOKUP_SHIFT)];
}


/*
 * Statistics gathering.
 *
 * dentry_stat is the global snapshot; only age_limit is given an explicit
 * initial value here.  Its nr_dentry, nr_unused and nr_negative fields are
 * refreshed by proc_nr_dentry() when that function is built.
 */
struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
};

/*
 * Per-CPU counters behind those statistics.  nr_dentry_unused and
 * nr_dentry_negative are bumped in d_lru_add(); nr_dentry_negative is also
 * bumped in __d_clear_type_and_inode() for a dentry with DCACHE_LRU_LIST
 * set.
 */
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Here we resort to our own counters instead of using generic per-cpu counters
 * for consistency with what the vfs inode code does. We are expected to harvest
 * better code and performance by having our own specialized counters.
 *
 * Please note that the loop is done over all possible CPUs, not over all online
 * CPUs. The reason for this is that we don't want to play games with CPUs going
 * on and off. If one of them goes off, we will just keep their counters.
 *
 * glommer: See cffbc8a for details, and if you ever intend to change this,
 * please update all vfs counters to match.
 */
static long get_nr_dentry(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_unused, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_negative(void)
{
        int i;
        long sum = 0;

        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_negative, i);
        return sum < 0 ? 0 : sum;
}

/*
 * sysctl handler: refresh the three counters in dentry_stat from the
 * per-CPU sums, then hand every argument unchanged to
 * proc_doulongvec_minmax() and return its result.
 * NOTE(review): @table presumably points at dentry_stat - confirm against
 * the sysctl table that registers this handler.
 */
int proc_nr_dentry(struct ctl_table *table, int write, void *buffer,
                   size_t *lenp, loff_t *ppos)
{
        const long total = get_nr_dentry();
        const long unused = get_nr_dentry_unused();
        const long negative = get_nr_dentry_negative();

        dentry_stat.nr_dentry = total;
        dentry_stat.nr_unused = unused;
        dentry_stat.nr_negative = negative;

        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif

/*
 * Compare 2 name strings, return 0 if they match, otherwise non-zero.
 * The strings are both count bytes long, and count is non-zero.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>
/*
 * NOTE! 'cs' comes from a dentry, so it has an
 * aligned allocation for this particular component. We don't
 * strictly need the load_unaligned_zeropad() safety, but it
 * doesn't hurt either.
 *
 * In contrast, 'ct' and 'tcount' can be from a pathname, and do
 * need the careful unaligned handling.
 */
/*
 * Word-at-a-time comparison of the first @tcount bytes of @cs and @ct.
 * Returns 0 when they match and 1 when they differ.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        unsigned long a,b,mask;

        for (;;) {
                /*
                 * Load a word from each string before testing the length:
                 * the masked compare after the loop reuses a and b.
                 */
                a = read_word_at_a_time(cs);
                b = load_unaligned_zeropad(ct);
                /* Less than a full word remains: finish with the mask. */
                if (tcount < sizeof(unsigned long))
                        break;
                if (unlikely(a != b))
                        return 1;
                cs += sizeof(unsigned long);
                ct += sizeof(unsigned long);
                tcount -= sizeof(unsigned long);
                /* Nothing left after this word, and every word matched. */
                if (!tcount)
                        return 0;
        }
        /* Compare only the bytes selected by the mask for the remainder. */
        mask = bytemask_from_count(tcount);
        return unlikely(!!((a ^ b) & mask));
}

#else

/*
 * Byte-at-a-time fallback: compare the first @tcount bytes of @cs and @ct.
 * Returns 0 if all of them match and 1 at the first mismatch.  @tcount
 * must be non-zero, because a byte is compared before it is checked.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        for (;;) {
                if (*cs++ != *ct++)
                        return 1;
                if (--tcount == 0)
                        return 0;
        }
}

#endif

static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
        /*
         * An RCU walk can race with a rename, so the name pointer is
         * fetched exactly once, with READ_ONCE().
         *
         * The length may not have been loaded atomically with respect
         * to such a rename.  That is harmless: the RCU walk checks the
         * sequence count later and catches the race.  The buffer is
         * not overrun either, because the pointer itself is read
         * atomically and a dentry name is guaranteed to be terminated
         * with a NUL byte.
         *
         * So even with a wrong 'len' the comparison stops early: the
         * ct/tcount data cannot contain a NUL, so it cannot match.
         */
        return dentry_string_cmp(READ_ONCE(dentry->d_name.name), ct, tcount);
}

/*
 * Header placed in front of a dentry name that is not stored in the
 * dentry's own d_iname buffer; d_name.name then points at ->name (see
 * external_name() and dname_external()).
 */
struct external_name {
        union {
                /* reference count: taken by name snapshots, dropped on release/free */
                atomic_t count;
                /* RCU head handed to kfree_rcu() once the last reference is gone */
                struct rcu_head head;
        } u;
        /* the name bytes themselves */
        unsigned char name[];
};

/*
 * Recover the struct external_name that contains the name d_name.name
 * points at.  Only meaningful when that name is external.
 */
static inline struct external_name *external_name(struct dentry *dentry)
{
        const unsigned char *str = dentry->d_name.name;

        return container_of(str, struct external_name, name[0]);
}

/*
 * RCU callback (also called directly by dentry_free()): map the rcu_head
 * back to its dentry and return the dentry to dentry_cache.
 */
static void __d_free(struct rcu_head *head)
{
        kmem_cache_free(dentry_cache,
                        container_of(head, struct dentry, d_u.d_rcu));
}

/*
 * RCU callback for a dentry with an external name: free the name first,
 * then return the dentry to dentry_cache.
 */
static void __d_free_external(struct rcu_head *head)
{
        struct dentry *victim = container_of(head, struct dentry, d_u.d_rcu);
        struct external_name *ext = external_name(victim);

        kfree(ext);
        kmem_cache_free(dentry_cache, victim);
}

/* Non-zero when the name lives outside the dentry's inline d_iname buffer. */
static inline int dname_external(const struct dentry *dentry)
{
        const unsigned char *inline_buf = dentry->d_iname;

        return dentry->d_name.name != inline_buf;
}

/*
 * Capture the current name of @dentry into @name, under d_lock.
 *
 * An inline name is copied (NUL terminator included) into
 * @name->inline_name, and the snapshot is pointed at that copy.  An
 * external name is shared instead: the snapshot keeps the pointer and
 * takes a reference on the external name's count.
 */
void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        name->name = dentry->d_name;
        if (likely(!dname_external(dentry))) {
                memcpy(name->inline_name, dentry->d_iname,
                       dentry->d_name.len + 1);
                name->name.name = name->inline_name;
        } else {
                atomic_inc(&external_name(dentry)->u.count);
        }
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(take_dentry_name_snapshot);

/*
 * Drop a snapshot taken by take_dentry_name_snapshot().  A snapshot that
 * points at its own inline copy needs no work.  Otherwise the reference
 * on the external name is dropped, and the name is freed via kfree_rcu()
 * when that was the last one.
 */
void release_dentry_name_snapshot(struct name_snapshot *name)
{
        struct external_name *ext;

        if (likely(name->name.name == name->inline_name))
                return;

        ext = container_of(name->name.name, struct external_name, name[0]);
        if (unlikely(atomic_dec_and_test(&ext->u.count)))
                kfree_rcu(ext, u.head);
}
EXPORT_SYMBOL(release_dentry_name_snapshot);

/*
 * Attach @inode to @dentry and replace the entry-type bits of d_flags
 * (DCACHE_ENTRY_TYPE, plus DCACHE_FALLTHRU) with @type_flags.  d_inode is
 * stored first and the new flags are published with a release store, so
 * the flags never become visible ahead of the inode pointer.
 */
static inline void __d_set_inode_and_type(struct dentry *dentry,
                                          struct inode *inode,
                                          unsigned type_flags)
{
        const unsigned stale = DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU;
        unsigned updated;

        dentry->d_inode = inode;
        updated = (READ_ONCE(dentry->d_flags) & ~stale) | type_flags;
        smp_store_release(&dentry->d_flags, updated);
}

/*
 * Make @dentry negative: clear the entry-type bits (and DCACHE_FALLTHRU)
 * in d_flags, then reset d_inode to NULL.  If the dentry has
 * DCACHE_LRU_LIST set, the per-CPU negative-dentry counter is bumped.
 */
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
        unsigned cleared = READ_ONCE(dentry->d_flags) &
                           ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);

        WRITE_ONCE(dentry->d_flags, cleared);
        dentry->d_inode = NULL;
        if (!(dentry->d_flags & DCACHE_LRU_LIST))
                return;
        this_cpu_inc(nr_dentry_negative);
}

/*
 * Free @dentry, warning if it is still hashed on an alias list.
 *
 * If the name is external and this drops its last reference, the name and
 * the dentry are freed together after an RCU grace period.  Otherwise the
 * dentry alone is freed: at once when DCACHE_NORCU is set, or through
 * call_rcu() when it is not.
 */
static void dentry_free(struct dentry *dentry)
{
        struct rcu_head *rcu = &dentry->d_u.d_rcu;

        WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
        if (unlikely(dname_external(dentry))) {
                struct external_name *ext = external_name(dentry);

                if (likely(atomic_dec_and_test(&ext->u.count))) {
                        call_rcu(rcu, __d_free_external);
                        return;
                }
        }
        /* a dentry that was never visible to RCU may be freed immediately */
        if (dentry->d_flags & DCACHE_NORCU) {
                __d_free(rcu);
                return;
        }
        call_rcu(rcu, __d_free);
}

/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 *
 * Per the __releases annotations, this is entered with dentry->d_lock and
 * the inode's i_lock held.  Both are dropped here, before the inode
 * reference is put.
 */
static void dentry_unlink_inode(struct dentry * dentry)
        __releases(dentry->d_lock)
        __releases(dentry->d_inode->i_lock)
{
        /* d_inode is cleared below; keep the pointer for unlock and iput */
        struct inode *inode = dentry->d_inode;

        /*
         * Clear the type and inode and unlink the dentry from the alias
         * list inside a d_seq write section.
         */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
        raw_write_seqcount_end(&dentry->d_seq);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        /* the inode has no links left: send the inode-removal notification */
        if (!inode->i_nlink)
                fsnotify_inoderemove(inode);
        /* let the filesystem drop the inode if it provides d_iput() */
        if (dentry->d_op && dentry->d_op->d_iput)
                dentry->d_op->d_iput(dentry, inode);
        else
                iput(inode);
}

/*
 * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
 * is in use - which includes both the "real" per-superblock
 * LRU list _and_ the DCACHE_SHRINK_LIST use.
 *
 * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
 * on the shrink list (ie not on the superblock LRU list).
 *
 * The per-cpu "nr_dentry_unused" counters are updated with
 * the DCACHE_LRU_LIST bit.
 *
 * The per-cpu "nr_dentry_negative" counters are only updated
 * when deleted from or added to the per-superblock LRU list, not
 * from/to the shrink list. That is to avoid an unneeded dec/inc
 * pair when moving from LRU to shrink list in select_collect().
 *
 * These helper functions make sure we always follow the
 * rules. d_lock must be held by the caller.
 */
/* Warn once unless the dentry's LRU_LIST/SHRINK_LIST flag bits are exactly @x. */
#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
static void d_lru_add(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, 0);
        dentry->d_flags |= DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_inc(nr_dentry_negative);
        WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

static void d_lru_del(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        this_cpu_dec(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

/*
 * Unlink @dentry from the shrink list it is on and clear both list flags.
 * Only the unused counter is adjusted; the negative counter is not
 * touched for shrink-list moves.
 */
static void d_shrink_del(struct dentry *dentry)
{
        const unsigned int list_bits = DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;

        D_FLAG_VERIFY(dentry, list_bits);

        list_del_init(&dentry->d_lru);
        dentry->d_flags &= ~list_bits;
        this_cpu_dec(nr_dentry_unused);
}

/*
 * Link @dentry, which must currently be on no list, onto the private
 * shrink list @list and set both list flags.  Only the unused counter is
 * adjusted.
 */
static void d_shrink_add(struct dentry *dentry, struct list_head *list)
{
        const unsigned int list_bits = DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;

        D_FLAG_VERIFY(dentry, 0);

        list_add(&dentry->d_lru, list);
        dentry->d_flags |= list_bits;
        this_cpu_inc(nr_dentry_unused);
}

/*
 * These can only be called under the global LRU lock, ie during the
 * callback for freeing the LRU list. "isolate" removes it from the
 * LRU lists entirely, while shrink_move moves it to the indicated
 * private list.
 */
/*
 * LRU-walk helper: remove @dentry from the LRU list @lru entirely,
 * dropping the LRU flag and the matching per-cpu counts.
 */
static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);

        /* Counters first; the negative test does not look at the list bits. */
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        this_cpu_dec(nr_dentry_unused);

        dentry->d_flags &= ~DCACHE_LRU_LIST;
        list_lru_isolate(lru, &dentry->d_lru);
}

/*
 * LRU-walk helper: move @dentry from the LRU list @lru to the private
 * shrink list @list.  DCACHE_LRU_LIST stays set and DCACHE_SHRINK_LIST is
 * added; a negative dentry stops being counted as negative because it
 * leaves the per-superblock LRU.
 */
static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
                              struct list_head *list)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);

        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);

        dentry->d_flags |= DCACHE_SHRINK_LIST;
        list_lru_isolate_move(lru, &dentry->d_lru, list);
}

/**
 * d_drop - drop a dentry
 * @dentry: dentry to drop
 *
 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
 * be found through a VFS lookup any more. Note that this is different from
 * deleting the dentry - d_delete will try to mark the dentry negative if
 * possible, giving a successful _negative_ lookup, while d_drop will
 * just make the cache lookup fail.
 *
 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
 * reason (NFS timeouts or autofs deletes).
 *
 * __d_drop requires dentry->d_lock
 * ___d_drop doesn't mark dentry as "unhashed"
 *   (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
 */
static void ___d_drop(struct dentry *dentry)
{
        struct hlist_bl_head *bucket;

        /*
         * Hashed dentries are normally on the dentry hashtable,
         * with the exception of those newly allocated by
         * d_obtain_root, which are always IS_ROOT:
         */
        bucket = unlikely(IS_ROOT(dentry)) ? &dentry->d_sb->s_roots
                                           : d_hash(dentry->d_name.hash);

        /* Unlink from the chain under that chain's bit lock. */
        hlist_bl_lock(bucket);
        __hlist_bl_del(&dentry->d_hash);
        hlist_bl_unlock(bucket);
}

void __d_drop(struct dentry *dentry)
{
        /* Already unhashed: nothing to do. */
        if (d_unhashed(dentry))
                return;

        ___d_drop(dentry);
        /* ___d_drop() leaves ->pprev poisoned; NULL marks the dentry unhashed. */
        dentry->d_hash.pprev = NULL;
        write_seqcount_invalidate(&dentry->d_seq);
}
EXPORT_SYMBOL(__d_drop);

void d_drop(struct dentry *dentry)
{
        /* Locked wrapper: __d_drop() requires dentry->d_lock. */
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_drop);

/*
 * Flag @dentry as killed and, if it is linked on a list of children,
 * unlink it from @parent's ->d_subdirs.  The stale ->d_child.next left
 * behind is then advanced past any cursor entries (see below).
 */
static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
{
        struct dentry *next;
        /*
         * Inform d_walk() and shrink_dentry_list() that we are no longer
         * attached to the dentry tree
         */
        dentry->d_flags |= DCACHE_DENTRY_KILLED;
        /* Not linked as anybody's child: only the flag was needed. */
        if (unlikely(list_empty(&dentry->d_child)))
                return;
        __list_del_entry(&dentry->d_child);
        /*
         * Cursors can move around the list of children.  While we'd been
         * a normal list member, it didn't matter - ->d_child.next would've
         * been updated.  However, from now on it won't be and for the
         * things like d_walk() it might end up with a nasty surprise.
         * Normally d_walk() doesn't care about cursors moving around -
         * ->d_lock on parent prevents that and since a cursor has no children
         * of its own, we get through it without ever unlocking the parent.
         * There is one exception, though - if we ascend from a child that
         * gets killed as soon as we unlock it, the next sibling is found
         * using the value left in its ->d_child.next.  And if _that_
         * pointed to a cursor, and cursor got moved (e.g. by lseek())
         * before d_walk() regains parent->d_lock, we'll end up skipping
         * everything the cursor had been moved past.
         *
         * Solution: make sure that the pointer left behind in ->d_child.next
         * points to something that won't be moving around.  I.e. skip the
         * cursors.
         */
        /* Stop at the list head or at the first non-cursor sibling. */
        while (dentry->d_child.next != &parent->d_subdirs) {
                next = list_entry(dentry->d_child.next, struct dentry, d_child);
                if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
                        break;
                dentry->d_child.next = next->d_child.next;
        }
}

/*
 * Tear down @dentry: mark it dead, take it off the LRU and the hash,
 * unlink it from its parent, release its inode and finally free it.
 *
 * The function unlocks dentry->d_lock, the parent's ->d_lock (for a
 * non-root dentry) and, via dentry_unlink_inode(), the inode's ->i_lock,
 * so the caller is expected to hold all of those on entry.
 */
static void __dentry_kill(struct dentry *dentry)
{
        struct dentry *parent = NULL;
        bool can_free = true;
        if (!IS_ROOT(dentry))
                parent = dentry->d_parent;

        /*
         * The dentry is now unrecoverably dead to the world.
         */
        lockref_mark_dead(&dentry->d_lockref);

        /*
         * inform the fs via d_prune that this dentry is about to be
         * unhashed and destroyed.
         */
        if (dentry->d_flags & DCACHE_OP_PRUNE)
                dentry->d_op->d_prune(dentry);

        /*
         * Only take it off the per-superblock LRU here; a dentry on a
         * shrink list is left for that list's owner (see below).
         */
        if (dentry->d_flags & DCACHE_LRU_LIST) {
                if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
                        d_lru_del(dentry);
        }
        /* if it was on the hash then remove it */
        __d_drop(dentry);
        dentry_unlist(dentry, parent);
        if (parent)
                spin_unlock(&parent->d_lock);
        /* Either path below drops dentry->d_lock. */
        if (dentry->d_inode)
                dentry_unlink_inode(dentry);
        else
                spin_unlock(&dentry->d_lock);
        this_cpu_dec(nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);

        /*
         * Still on somebody's shrink list: don't free it here.  Set
         * DCACHE_MAY_FREE so that shrink_dentry_list() frees it once it
         * takes the dentry off its list.
         */
        spin_lock(&dentry->d_lock);
        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                dentry->d_flags |= DCACHE_MAY_FREE;
                can_free = false;
        }
        spin_unlock(&dentry->d_lock);
        if (likely(can_free))
                dentry_free(dentry);
        cond_resched();
}

/*
 * Slow path of parent locking.  Entered with dentry->d_lock held: drops
 * it, locks the current parent (retrying if ->d_parent changes while we
 * wait), then re-takes dentry->d_lock nested inside the parent's lock.
 *
 * Returns the locked parent, or NULL if the dentry turned out to be its
 * own parent; in that case the lock taken on the "parent" is
 * dentry->d_lock itself, so the dentry is locked on return either way.
 */
static struct dentry *__lock_parent(struct dentry *dentry)
{
        struct dentry *parent;
        /* RCU read-side section spans the window in which no ->d_lock is held. */
        rcu_read_lock();
        spin_unlock(&dentry->d_lock);
again:
        parent = READ_ONCE(dentry->d_parent);
        spin_lock(&parent->d_lock);
        /*
         * We can't blindly lock dentry until we are sure
         * that we won't violate the locking order.
         * Any changes of dentry->d_parent must have
         * been done with parent->d_lock held, so
         * spin_lock() above is enough of a barrier
         * for checking if it's still our child.
         */
        if (unlikely(parent != dentry->d_parent)) {
                spin_unlock(&parent->d_lock);
                goto again;
        }
        rcu_read_unlock();
        if (parent != dentry)
                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        else
                parent = NULL;
        return parent;
}

/*
 * Lock the parent of an already locked @dentry.  Returns the locked
 * parent, or NULL for a root dentry.  A trylock is attempted first; on
 * contention __lock_parent() takes over.
 */
static inline struct dentry *lock_parent(struct dentry *dentry)
{
        struct dentry *parent = dentry->d_parent;

        if (IS_ROOT(dentry))
                return NULL;

        if (unlikely(!spin_trylock(&parent->d_lock)))
                return __lock_parent(dentry);

        return parent;
}

/*
 * Decide whether a dentry whose reference is being dropped should be
 * kept in the cache.
 *
 * Returns false, leaving the refcount untouched, when the dentry should
 * be killed.  Returns true after dropping the reference and making sure
 * the dentry is on the LRU (adding it, or marking it referenced if it
 * already was there).
 */
static inline bool retain_dentry(struct dentry *dentry)
{
        WARN_ON(d_in_lookup(dentry));

        /* Unreachable? Get rid of it */
        if (unlikely(d_unhashed(dentry)))
                return false;

        if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
                return false;

        /* Ask the filesystem, if it provides ->d_delete(). */
        if (unlikely(dentry->d_flags & DCACHE_OP_DELETE) &&
            dentry->d_op->d_delete(dentry))
                return false;

        if (unlikely(dentry->d_flags & DCACHE_DONTCACHE))
                return false;

        /* retain; LRU fodder */
        dentry->d_lockref.count--;
        if (likely(dentry->d_flags & DCACHE_LRU_LIST)) {
                if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
                        dentry->d_flags |= DCACHE_REFERENCED;
        } else {
                d_lru_add(dentry);
        }
        return true;
}

/*
 * Flag every dentry aliasing @inode with DCACHE_DONTCACHE and set
 * I_DONTCACHE on the inode itself, all under inode->i_lock.
 */
void d_mark_dontcache(struct inode *inode)
{
        struct dentry *alias;

        spin_lock(&inode->i_lock);

        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                /* ->d_flags is modified under the alias's own ->d_lock */
                spin_lock(&alias->d_lock);
                alias->d_flags |= DCACHE_DONTCACHE;
                spin_unlock(&alias->d_lock);
        }
        inode->i_state |= I_DONTCACHE;

        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_mark_dontcache);

/*
 * Finish off a dentry we've decided to kill.
 * dentry->d_lock must be held, returns with it unlocked.
 * Returns dentry requiring refcount drop, or NULL if we're done.
 */
static struct dentry *dentry_kill(struct dentry *dentry)
        __releases(dentry->d_lock)
{
        struct inode *inode = dentry->d_inode;
        struct dentry *parent = NULL;

        /*
         * Fast path: get ->i_lock and the parent's ->d_lock with trylocks,
         * so dentry->d_lock never has to be dropped.
         */
        if (inode && unlikely(!spin_trylock(&inode->i_lock)))
                goto slow_positive;

        if (!IS_ROOT(dentry)) {
                parent = dentry->d_parent;
                if (unlikely(!spin_trylock(&parent->d_lock))) {
                        /*
                         * __lock_parent() drops and re-takes dentry->d_lock,
                         * so everything must be rechecked at got_locks.
                         */
                        parent = __lock_parent(dentry);
                        if (likely(inode || !dentry->d_inode))
                                goto got_locks;
                        /* negative that became positive */
                        if (parent)
                                spin_unlock(&parent->d_lock);
                        inode = dentry->d_inode;
                        goto slow_positive;
                }
        }
        __dentry_kill(dentry);
        return parent;

slow_positive:
        /* Drop ->d_lock so that ->i_lock is taken first, then re-lock. */
        spin_unlock(&dentry->d_lock);
        spin_lock(&inode->i_lock);
        spin_lock(&dentry->d_lock);
        parent = lock_parent(dentry);
got_locks:
        /*
         * ->d_lock was dropped on the way here.  If the count is no longer
         * 1, just drop our reference; otherwise redo the retain decision.
         */
        if (unlikely(dentry->d_lockref.count != 1)) {
                dentry->d_lockref.count--;
        } else if (likely(!retain_dentry(dentry))) {
                __dentry_kill(dentry);
                return parent;
        }
        /* we are keeping it, after all */
        if (inode)
                spin_unlock(&inode->i_lock);
        if (parent)
                spin_unlock(&parent->d_lock);
        spin_unlock(&dentry->d_lock);
        return NULL;
}

/*
 * Try to do a lockless dput(), and return whether that was successful.
 *
 * If unsuccessful, we return false, having already taken the dentry lock.
 *
 * The caller needs to hold the RCU read lock, so that the dentry is
 * guaranteed to stay around even if the refcount goes down to zero!
 */
static inline bool fast_dput(struct dentry *dentry)
{
        int ret;
        unsigned int d_flags;

        /*
         * If we have a d_op->d_delete() operation, we should not
         * let the dentry count go to zero, so use "put_or_lock".
         */
        if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
                return lockref_put_or_lock(&dentry->d_lockref);

        /*
         * .. otherwise, we can try to just decrement the
         * lockref optimistically.
         */
        ret = lockref_put_return(&dentry->d_lockref);

        /*
         * If the lockref_put_return() failed due to the lock being held
         * by somebody else, the fast path has failed. We will need to
         * get the lock, and then check the count again.
         */
        if (unlikely(ret < 0)) {
                spin_lock(&dentry->d_lock);
                /*
                 * A count of zero or less here means there is no reference
                 * left to drop: warn and report "done" to the caller.
                 */
                if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
                        spin_unlock(&dentry->d_lock);
                        return true;
                }
                dentry->d_lockref.count--;
                goto locked;
        }

        /*
         * If we weren't the last ref, we're done.
         */
        if (ret)
                return true;

        /*
         * Careful, careful. The reference count went down
         * to zero, but we don't hold the dentry lock, so
         * somebody else could get it again, and do another
         * dput(), and we need to not race with that.
         *
         * However, there is a very special and common case
         * where we don't care, because there is nothing to
         * do: the dentry is still hashed, it does not have
         * a 'delete' op, and it's referenced and already on
         * the LRU list.
         *
         * NOTE! Since we aren't locked, these values are
         * not "stable". However, it is sufficient that at
         * some point after we dropped the reference the
         * dentry was hashed and the flags had the proper
         * value. Other dentry users may have re-gotten
         * a reference to the dentry and change that, but
         * our work is done - we can leave the dentry
         * around with a zero refcount.
         */
        smp_rmb();
        d_flags = READ_ONCE(dentry->d_flags);
        d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;

        /* Nothing to do? Dropping the reference was all we needed? */
        if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
                return true;

        /*
         * Not the fast normal case? Get the lock. We've already decremented
         * the refcount, but we'll need to re-check the situation after
         * getting the lock.
         */
        spin_lock(&dentry->d_lock);

        /*
         * Did somebody else grab a reference to it in the meantime, and
         * we're no longer the last user after all? Alternatively, somebody
         * else could have killed it and marked it dead. Either way, we
         * don't need to do anything else.
         */
locked:
        if (dentry->d_lockref.count) {
                spin_unlock(&dentry->d_lock);
                return true;
        }

        /*
         * Re-get the reference we optimistically dropped. We hold the
         * lock, and we just tested that it was zero, so we can just
         * set it to 1.
         */
        dentry->d_lockref.count = 1;
        return false;
}


/* 
 * This is dput
 *
 * This is complicated by the fact that we do not want to put
 * dentries that are no longer on any hash chain on the unused
 * list: we'd much rather just get rid of them immediately.
 *
 * However, that implies that we have to traverse the dentry
 * tree upwards to the parents which might _also_ now be
 * scheduled for deletion (it may have been only waiting for
 * its last child to go away).
 *
 * This tail recursion is done by hand as we don't want to depend
 * on the compiler to always get this right (gcc generally doesn't).
 * Real recursion would eat up our stack space.
 */

/*
 * dput - release a dentry
 * @dentry: dentry to release 
 *
 * Release a dentry. This will drop the usage count and if appropriate
 * call the dentry unlink method as well as removing it from the queues and
 * releasing its resources. If the parent dentries were scheduled for release
 * they too may now get deleted.
 */
void dput(struct dentry *dentry)
{
        while (dentry) {
                might_sleep();

                rcu_read_lock();
                if (likely(fast_dput(dentry))) {
                        rcu_read_unlock();
                        return;
                }

                /* Slow case: now with the dentry lock held */
                rcu_read_unlock();

                if (likely(retain_dentry(dentry))) {
                        spin_unlock(&dentry->d_lock);
                        return;
                }

                dentry = dentry_kill(dentry);
        }
}
EXPORT_SYMBOL(dput);

/*
 * Drop a reference to @dentry under its ->d_lock.  If that was the last
 * one and the dentry is not already on some shrink list, queue it on
 * @list instead of killing it here.
 */
static void __dput_to_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                /* let the owner of the list it's on deal with it */
                --dentry->d_lockref.count;
                return;
        }

        if (dentry->d_flags & DCACHE_LRU_LIST)
                d_lru_del(dentry);
        if (--dentry->d_lockref.count == 0)
                d_shrink_add(dentry, list);
}

void dput_to_list(struct dentry *dentry, struct list_head *list)
{
        rcu_read_lock();
        if (likely(fast_dput(dentry))) {
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();
        if (!retain_dentry(dentry))
                __dput_to_list(dentry, list);
        spin_unlock(&dentry->d_lock);
}

/* This must be called with d_lock held */
static inline void __dget_dlock(struct dentry *dentry)
{
        /* Plain increment of the lockref count; no locking is done here. */
        dentry->d_lockref.count++;
}

/* Take a reference on @dentry through the lockref API. */
static inline void __dget(struct dentry *dentry)
{
        lockref_get(&dentry->d_lockref);
}

/*
 * Return @dentry's parent with a reference taken on it.
 *
 * A lockless attempt is made first, validated against dentry->d_seq.
 * If that fails, the parent is looked up again and pinned under its
 * ->d_lock.
 */
struct dentry *dget_parent(struct dentry *dentry)
{
        int gotref;
        struct dentry *ret;
        unsigned seq;

        /*
         * Do optimistic parent lookup without any
         * locking.
         */
        rcu_read_lock();
        seq = raw_seqcount_begin(&dentry->d_seq);
        ret = READ_ONCE(dentry->d_parent);
        gotref = lockref_get_not_zero(&ret->d_lockref);
        rcu_read_unlock();
        if (likely(gotref)) {
                /* The reference only counts if ->d_seq did not change meanwhile. */
                if (!read_seqcount_retry(&dentry->d_seq, seq))
                        return ret;
                /* ->d_seq changed: drop the reference and use the locked path. */
                dput(ret);
        }

repeat:
        /*
         * Don't need rcu_dereference because we re-check it was correct under
         * the lock.
         */
        rcu_read_lock();
        ret = dentry->d_parent;
        spin_lock(&ret->d_lock);
        if (unlikely(ret != dentry->d_parent)) {
                spin_unlock(&ret->d_lock);
                rcu_read_unlock();
                goto repeat;
        }
        rcu_read_unlock();
        /* A parent with a zero refcount at this point is treated as a fatal bug. */
        BUG_ON(!ret->d_lockref.count);
        ret->d_lockref.count++;
        spin_unlock(&ret->d_lock);
        return ret;
}
EXPORT_SYMBOL(dget_parent);

/*
 * Grab a reference to the first dentry on @inode's alias list, or return
 * NULL if the list is empty.  No locking is done here.
 */
static struct dentry * __d_find_any_alias(struct inode *inode)
{
        struct dentry *alias = NULL;

        if (!hlist_empty(&inode->i_dentry)) {
                alias = hlist_entry(inode->i_dentry.first, struct dentry,
                                    d_u.d_alias);
                __dget(alias);
        }
        return alias;
}

/**
 * d_find_any_alias - find any alias for a given inode
 * @inode: inode to find an alias for
 *
 * If any aliases exist for the given inode, take and return a
 * reference for one of them.  If no aliases exist, return %NULL.
 */
struct dentry *d_find_any_alias(struct inode *inode)
{
        struct dentry *de;

        /* The alias list is looked at under inode->i_lock. */
        spin_lock(&inode->i_lock);
        de = __d_find_any_alias(inode);
        spin_unlock(&inode->i_lock);
        return de;
}
EXPORT_SYMBOL(d_find_any_alias);

/**
 * d_find_alias - grab a hashed alias of inode
 * @inode: inode in question
 *
 * If inode has a hashed alias, or is a directory and has any alias,
 * acquire the reference to alias and return it. Otherwise return NULL.
 * Notice that if inode is a directory there can be only one alias and
 * it can be unhashed only if it has no children, or if it is the root
 * of a filesystem, or if the directory was renamed and d_revalidate
 * was the first vfs operation to notice.
 *
 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
 * any other hashed alias over that one.
 */
static struct dentry *__d_find_alias(struct inode *inode)
{
        struct dentry *alias;

        /* Directories: any alias will do. */
        if (S_ISDIR(inode->i_mode))
                return __d_find_any_alias(inode);

        /* Otherwise return the first alias that is still hashed. */
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                bool hashed;

                spin_lock(&alias->d_lock);
                hashed = !d_unhashed(alias);
                if (hashed)
                        __dget_dlock(alias);
                spin_unlock(&alias->d_lock);

                if (hashed)
                        return alias;
        }
        return NULL;
}

struct dentry *d_find_alias(struct inode *inode)
{
        struct dentry *de;

        /* Unlocked peek: skip taking ->i_lock if there are no aliases. */
        if (hlist_empty(&inode->i_dentry))
                return NULL;

        spin_lock(&inode->i_lock);
        de = __d_find_alias(inode);
        spin_unlock(&inode->i_lock);
        return de;
}
EXPORT_SYMBOL(d_find_alias);

/*
 *        Try to kill dentries associated with this inode.
 * WARNING: you must own a reference to inode.
 */
void d_prune_aliases(struct inode *inode)
{
        struct dentry *dentry;
restart:
        spin_lock(&inode->i_lock);
        hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
                spin_lock(&dentry->d_lock);
                /* Only aliases with a zero refcount are candidates. */
                if (!dentry->d_lockref.count) {
                        struct dentry *parent = lock_parent(dentry);
                        /*
                         * lock_parent() may have dropped ->d_lock for a
                         * while, so the count is checked again.
                         */
                        if (likely(!dentry->d_lockref.count)) {
                                /*
                                 * __dentry_kill() releases ->i_lock and both
                                 * ->d_lock's; rescan the list from the start.
                                 */
                                __dentry_kill(dentry);
                                dput(parent);
                                goto restart;
                        }
                        if (parent)
                                spin_unlock(&parent->d_lock);
                }
                spin_unlock(&dentry->d_lock);
        }
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_prune_aliases);

/*
 * Lock a dentry from shrink list.
 * Called under rcu_read_lock() and dentry->d_lock; the former
 * guarantees that nothing we access will be freed under us.
 * Note that dentry is *not* protected from concurrent dentry_kill(),
 * d_delete(), etc.
 *
 * Return false if dentry has been disrupted or grabbed, leaving
 * the caller to kick it off-list.  Otherwise, return true and have
 * that dentry's inode and parent both locked.
 */
static bool shrink_lock_dentry(struct dentry *dentry)
{
        struct inode *inode;
        struct dentry *parent;

        /* Somebody holds a reference: not ours to kill. */
        if (dentry->d_lockref.count)
                return false;

        inode = dentry->d_inode;
        if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
                /*
                 * Contended: drop ->d_lock, take ->i_lock first, re-take
                 * ->d_lock and then revalidate what may have changed.
                 */
                spin_unlock(&dentry->d_lock);
                spin_lock(&inode->i_lock);
                spin_lock(&dentry->d_lock);
                if (unlikely(dentry->d_lockref.count))
                        goto out;
                /* changed inode means that somebody had grabbed it */
                if (unlikely(inode != dentry->d_inode))
                        goto out;
        }

        parent = dentry->d_parent;
        if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
                return true;

        /*
         * Parent trylock failed: drop ->d_lock, lock the parent, then
         * re-lock the dentry nested inside it, rechecking as we go.
         */
        spin_unlock(&dentry->d_lock);
        spin_lock(&parent->d_lock);
        if (unlikely(parent != dentry->d_parent)) {
                spin_unlock(&parent->d_lock);
                spin_lock(&dentry->d_lock);
                goto out;
        }
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        if (likely(!dentry->d_lockref.count))
                return true;
        spin_unlock(&parent->d_lock);
out:
        /* Failure: only dentry->d_lock is still held when we return. */
        if (inode)
                spin_unlock(&inode->i_lock);
        return false;
}

/*
 * Dispose of every dentry on the private shrink list @list, working from
 * the tail.  Parents whose last reference goes away in the process are
 * queued on the same list by __dput_to_list(), so the loop handles them
 * as well.
 */
void shrink_dentry_list(struct list_head *list)
{
        while (!list_empty(list)) {
                struct dentry *dentry, *parent;

                dentry = list_entry(list->prev, struct dentry, d_lru);
                spin_lock(&dentry->d_lock);
                rcu_read_lock();
                if (!shrink_lock_dentry(dentry)) {
                        /*
                         * Busy or already being killed: just take it off
                         * our list.  With a negative count and
                         * DCACHE_MAY_FREE set (__dentry_kill() sets that
                         * flag when it leaves the freeing to the shrink
                         * list owner), the final free is done here.
                         */
                        bool can_free = false;
                        rcu_read_unlock();
                        d_shrink_del(dentry);
                        if (dentry->d_lockref.count < 0)
                                can_free = dentry->d_flags & DCACHE_MAY_FREE;
                        spin_unlock(&dentry->d_lock);
                        if (can_free)
                                dentry_free(dentry);
                        continue;
                }
                rcu_read_unlock();
                d_shrink_del(dentry);
                parent = dentry->d_parent;
                /* Drop the reference on the parent; it may get queued on @list. */
                if (parent != dentry)
                        __dput_to_list(parent, list);
                __dentry_kill(dentry);
        }
}

/*
 * LRU walk callback used by prune_dcache_sb().  @arg is the list head
 * that collects dentries to dispose of.  Depending on the dentry's state
 * it is skipped, dropped from the LRU, rotated, or moved to that list.
 */
static enum lru_status dentry_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct list_head *freeable = arg;
        struct dentry        *dentry = container_of(item, struct dentry, d_lru);


        /*
         * we are inverting the lru lock/dentry->d_lock here,
         * so use a trylock. If we fail to get the lock, just skip
         * it
         */
        if (!spin_trylock(&dentry->d_lock))
                return LRU_SKIP;

        /*
         * Referenced dentries are still in use. If they have active
         * counts, just remove them from the LRU. Otherwise give them
         * another pass through the LRU.
         */
        if (dentry->d_lockref.count) {
                d_lru_isolate(lru, dentry);
                spin_unlock(&dentry->d_lock);
                return LRU_REMOVED;
        }

        if (dentry->d_flags & DCACHE_REFERENCED) {
                /* Clear the flag so that the next pass can reclaim it. */
                dentry->d_flags &= ~DCACHE_REFERENCED;
                spin_unlock(&dentry->d_lock);

                /*
                 * The list move itself will be made by the common LRU code. At
                 * this point, we've dropped the dentry->d_lock but keep the
                 * lru lock. This is safe to do, since every list movement is
                 * protected by the lru lock even if both locks are held.
                 *
                 * This is guaranteed by the fact that all LRU management
                 * functions are intermediated by the LRU API calls like
                 * list_lru_add and list_lru_del. List movement in this file
                 * only ever occur through this functions or through callbacks
                 * like this one, that are called from the LRU API.
                 *
                 * The only exceptions to this are functions like
                 * shrink_dentry_list, and code that first checks for the
                 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
                 * operating only with stack provided lists after they are
                 * properly isolated from the main list.  It is thus, always a
                 * local access.
                 */
                return LRU_ROTATE;
        }

        /* Unreferenced and unused: hand it to the caller's dispose list. */
        d_lru_shrink_move(lru, dentry, freeable);
        spin_unlock(&dentry->d_lock);

        return LRU_REMOVED;
}

/**
 * prune_dcache_sb - shrink the dcache
 * @sb: superblock
 * @sc: shrink control, passed to list_lru_shrink_walk()
 *
 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
 * is done when we need more memory and called from the superblock shrinker
 * function.
 *
 * This function may fail to free any resources if all the dentries are in
 * use.
 */
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
        /* Private list that dentry_lru_isolate() moves victims onto. */
        LIST_HEAD(dispose);
        long freed;

        /* The return value is whatever the LRU walk reports. */
        freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
                                     dentry_lru_isolate, &dispose);
        shrink_dentry_list(&dispose);
        return freed;
}

/*
 * LRU walk callback used by shrink_dcache_sb(): every dentry whose
 * ->d_lock can be taken is moved to the dispose list passed in @arg,
 * whatever its refcount or flags.
 */
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct dentry *victim = container_of(item, struct dentry, d_lru);
        struct list_head *dispose = arg;

        /*
         * we are inverting the lru lock/dentry->d_lock here,
         * so use a trylock. If we fail to get the lock, just skip
         * it
         */
        if (!spin_trylock(&victim->d_lock))
                return LRU_SKIP;

        d_lru_shrink_move(lru, victim, dispose);
        spin_unlock(&victim->d_lock);
        return LRU_REMOVED;
}


/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
 * Shrink the dcache for the specified super block. This is used to free
 * the dcache before unmounting a file system.
 */
void shrink_dcache_sb(struct super_block *sb)
{
        /*
         * Work in batches: isolate up to 1024 LRU entries onto a private
         * list, dispose of them, and repeat until the LRU reports empty.
         */
        for (;;) {
                LIST_HEAD(dispose);

                list_lru_walk(&sb->s_dentry_lru,
                        dentry_lru_isolate_shrink, &dispose, 1024);
                shrink_dentry_list(&dispose);

                if (!(list_lru_count(&sb->s_dentry_lru) > 0))
                        break;
        }
}
EXPORT_SYMBOL(shrink_dcache_sb);

/**
 * enum d_walk_ret - action to take during tree walk
 * @D_WALK_CONTINUE:        continue walk
 * @D_WALK_QUIT:        quit walk
 * @D_WALK_NORETRY:        quit when retry is needed
 * @D_WALK_SKIP:        skip this dentry and its children
 */
enum d_walk_ret {
        D_WALK_CONTINUE,
        D_WALK_QUIT,
        D_WALK_NORETRY,
        D_WALK_SKIP,
};

/**
 * d_walk - walk the dentry tree
 * @parent:        start of walk
 * @data:        data passed to @enter()
 * @enter:        callback when first entering the dentry
 *
 * The @enter() callbacks are called with d_lock held.
 */
static void d_walk(struct dentry *parent, void *data,
                   enum d_walk_ret (*enter)(void *, struct dentry *))
{
        struct dentry *this_parent;
        struct list_head *next;
        unsigned seq = 0;
        enum d_walk_ret ret;
        bool retry = true;

again:
        read_seqbegin_or_lock(&rename_lock, &seq);
        this_parent = parent;
        spin_lock(&this_parent->d_lock);

        ret = enter(data, this_parent);
        switch (ret) {
        case D_WALK_CONTINUE:
                break;
        case D_WALK_QUIT:
        case D_WALK_SKIP:
                goto out_unlock;
        case D_WALK_NORETRY:
                retry = false;
                break;
        }
repeat:
        next = this_parent->d_subdirs.next;
resume:
        while (next != &this_parent->d_subdirs) {
                struct list_head *tmp = next;
                struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
                next = tmp->next;

                if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
                        continue;

                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

                ret = enter(data, dentry);
                switch (ret) {
                case D_WALK_CONTINUE:
                        break;
                case D_WALK_QUIT:
                        spin_unlock(&dentry->d_lock);
                        goto out_unlock;
                case D_WALK_NORETRY:
                        retry = false;
                        break;
                case D_WALK_SKIP:
                        spin_unlock(&dentry->d_lock);
                        continue;
                }

                if (!list_empty(&dentry->d_subdirs)) {
                        spin_unlock(&this_parent->d_lock);
                        spin_release(&dentry->d_lock.dep_map, _RET_IP_);
                        this_parent = dentry;
                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
                spin_unlock(&dentry->d_lock);
        }
        /*
         * All done at this level ... ascend and resume the search.
         */
        rcu_read_lock();
ascend:
        if (this_parent != parent) {
                struct dentry *child = this_parent;
                this_parent = child->d_parent;

                spin_unlock(&child->d_lock);
                spin_lock(&this_parent->d_lock);

                /* might go back up the wrong parent if we have had a rename. */
                if (need_seqretry(&rename_lock, seq))
                        goto rename_retry;
                /* go into the first sibling still alive */
                do {
                        next = child->d_child.next;
                        if (next == &this_parent->d_subdirs)
                                goto ascend;
                        child = list_entry(next, struct dentry, d_child);
                } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
                rcu_read_unlock();
                goto resume;
        }
        if (need_seqretry(&rename_lock, seq))
                goto rename_retry;
        rcu_read_unlock();

out_unlock:
        spin_unlock(&this_parent->d_lock);
        done_seqretry(&rename_lock, seq);
        return;

rename_retry:
        spin_unlock(&this_parent->d_lock);
        rcu_read_unlock();
        BUG_ON(seq & 1);
        if (!retry)
                return;
        seq = 1;
        goto again;
}

/* Walk state shared between path_has_submounts() and path_check_mount(). */
struct check_mount {
        struct vfsmount *mnt;        /* mount paired with each visited dentry */
        unsigned int mounted;        /* set to 1 once a mountpoint is found */
};

static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
        struct check_mount *info = data;
        struct path path = { .mnt = info->mnt, .dentry = dentry };

        if (likely(!d_mountpoint(dentry)))
                return D_WALK_CONTINUE;
        if (__path_is_mountpoint(&path)) {
                info->mounted = 1;
                return D_WALK_QUIT;
        }
        return D_WALK_CONTINUE;
}

/**
 * path_has_submounts - check for mounts over a dentry in the
 *                      current namespace.
 * @parent: path to check.
 *
 * Return true if the parent or its subdirectories contain
 * a mount point in the current namespace.
 */
int path_has_submounts(const struct path *parent)
{
        struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };

        read_seqlock_excl(&mount_lock);
        d_walk(parent->dentry, &data, path_check_mount);
        read_sequnlock_excl(&mount_lock);

        return data.mounted;
}
EXPORT_SYMBOL(path_has_submounts);

/*
 * Called by mount code to set a mountpoint and check if the mountpoint is
 * reachable (e.g. NFS can unhash a directory dentry and then the complete
 * subtree can become unreachable).
 *
 * Only one of d_invalidate() and d_set_mounted() must succeed.  For
 * this reason take rename_lock and d_lock on dentry and ancestors.
 */
int d_set_mounted(struct dentry *dentry)
{
        struct dentry *ancestor;
        int ret = -ENOENT;

        write_seqlock(&rename_lock);

        /* Every ancestor below the root must still be hashed. */
        ancestor = dentry->d_parent;
        while (!IS_ROOT(ancestor)) {
                /* Need exclusion wrt. d_invalidate() */
                spin_lock(&ancestor->d_lock);
                if (unlikely(d_unhashed(ancestor))) {
                        spin_unlock(&ancestor->d_lock);
                        goto out;
                }
                spin_unlock(&ancestor->d_lock);
                ancestor = ancestor->d_parent;
        }

        spin_lock(&dentry->d_lock);
        if (d_unlinked(dentry)) {
                /* ret keeps its initial -ENOENT */
        } else if (d_mountpoint(dentry)) {
                ret = -EBUSY;
        } else {
                dentry->d_flags |= DCACHE_MOUNTED;
                ret = 0;
        }
        spin_unlock(&dentry->d_lock);
out:
        write_sequnlock(&rename_lock);
        return ret;
}

/*
 * Search the dentry child list of the specified parent,
 * and move any unused dentries to the end of the unused
 * list for prune_dcache(). We descend to the next level
 * whenever the d_subdirs list is non-empty and continue
 * searching.
 *
 * It returns zero iff there are no unused children,
 * otherwise  it returns the number of children moved to
 * the end of the unused list. This may not be the total
 * number of unused children, because select_parent can
 * drop the lock and return early due to latency
 * constraints.
 */

/* State shared between shrink_dcache_parent() and its d_walk() callbacks. */
struct select_data {
        struct dentry *start;        /* root of the walk; the callbacks skip it */
        /* The two walks use the same storage for different purposes. */
        union {
                long found;        /* counter bumped by select_collect() */
                struct dentry *victim;        /* set by select_collect2() */
        };
        struct list_head dispose;        /* dentries queued via d_shrink_add() */
};

/*
 * d_walk() callback for the first pass of shrink_dcache_parent(): queue
 * unreferenced dentries on the dispose list and count what was found.
 */
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
{
        struct select_data *sel = _data;

        /* The starting dentry itself is never collected. */
        if (sel->start == dentry)
                return D_WALK_CONTINUE;

        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                /* Already on a shrink list: count it only. */
                sel->found++;
        } else {
                if (dentry->d_flags & DCACHE_LRU_LIST)
                        d_lru_del(dentry);
                if (!dentry->d_lockref.count) {
                        d_shrink_add(dentry, &sel->dispose);
                        sel->found++;
                }
        }

        /*
         * Once something has been queued the caller may be returned to
         * (this guarantees forward progress); it will come back for the
         * rest.
         */
        if (list_empty(&sel->dispose))
                return D_WALK_CONTINUE;
        if (need_resched())
                return D_WALK_QUIT;
        return D_WALK_NORETRY;
}

/*
 * d_walk() callback for the second pass of shrink_dcache_parent(): pick an
 * unreferenced dentry that already sits on a shrink list as the victim,
 * otherwise collect unreferenced dentries as in the first pass.
 */
static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
{
        struct select_data *sel = _data;

        if (sel->start == dentry)
                return D_WALK_CONTINUE;

        if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
                if (dentry->d_flags & DCACHE_LRU_LIST)
                        d_lru_del(dentry);
                if (!dentry->d_lockref.count)
                        d_shrink_add(dentry, &sel->dispose);
        } else if (!dentry->d_lockref.count) {
                /*
                 * The RCU read lock taken here is still held when the
                 * walk returns; the caller has to release it.
                 */
                rcu_read_lock();
                sel->victim = dentry;
                return D_WALK_QUIT;
        }

        /*
         * Once something has been queued the caller may be returned to
         * (this guarantees forward progress); it will come back for the
         * rest.
         */
        if (list_empty(&sel->dispose))
                return D_WALK_CONTINUE;
        if (need_resched())
                return D_WALK_QUIT;
        return D_WALK_NORETRY;
}

/**
 * shrink_dcache_parent - prune dcache
 * @parent: parent of entries to prune
 *
 * Prune the dcache to remove unused children of the parent dentry.
 */
void shrink_dcache_parent(struct dentry *parent)
{
        for (;;) {
                /* Fresh state on every iteration; found/victim start out zeroed. */
                struct select_data data = {.start = parent};

                INIT_LIST_HEAD(&data.dispose);
                /* First pass: queue unreferenced dentries and count candidates. */
                d_walk(parent, &data, select_collect);

                if (!list_empty(&data.dispose)) {
                        shrink_dentry_list(&data.dispose);
                        continue;
                }

                cond_resched();
                /* Nothing queued and nothing counted: we are done. */
                if (!data.found)
                        break;
                /* found and victim share a union, so reset before the second walk. */
                data.victim = NULL;
                d_walk(parent, &data, select_collect2);
                if (data.victim) {
                        /* NB: this local shadows the function argument. */
                        struct dentry *parent;
                        spin_lock(&data.victim->d_lock);
                        /*
                         * select_collect2() took rcu_read_lock() before
                         * returning the victim; both branches release it.
                         */
                        if (!shrink_lock_dentry(data.victim)) {
                                spin_unlock(&data.victim->d_lock);
                                rcu_read_unlock();
                        } else {
                                rcu_read_unlock();
                                parent = data.victim->d_parent;
                                if (parent != data.victim)
                                        __dput_to_list(parent, &data.dispose);
                                __dentry_kill(data.victim);
                        }
                }
                /* The second pass may have queued dentries as well. */
                if (!list_empty(&data.dispose))
                        shrink_dentry_list(&data.dispose);
        }
}
EXPORT_SYMBOL(shrink_dcache_parent);

/*
 * d_walk() callback used by do_one_tree(): complain about every leaf dentry
 * that is still present.  @_data is the root of the tree being torn down.
 */
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
        unsigned long ino;

        /* it has busy descendents; complain about those instead */
        if (!list_empty(&dentry->d_subdirs))
                return D_WALK_CONTINUE;

        /* root with refcount 1 is fine */
        if (dentry == _data && dentry->d_lockref.count == 1)
                return D_WALK_CONTINUE;

        ino = dentry->d_inode ? dentry->d_inode->i_ino : 0UL;
        printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
                        " still in use (%d) [unmount of %s %s]\n",
               dentry, ino, dentry,
               dentry->d_lockref.count,
               dentry->d_sb->s_type->name,
               dentry->d_sb->s_id);
        WARN_ON(1);

        /* Never stop early: the rest of the tree is still checked. */
        return D_WALK_CONTINUE;
}

/*
 * Tear down one dentry tree rooted at @dentry: prune its unused
 * descendants, report whatever is still in use through umount_check(),
 * then d_drop() the root and dput() it.
 */
static void do_one_tree(struct dentry *dentry)
{
        shrink_dcache_parent(dentry);
        /* @dentry doubles as callback data so umount_check() can spot the root. */
        d_walk(dentry, dentry, umount_check);
        d_drop(dentry);
        dput(dentry);
}

/*
 * destroy the dentries attached to a superblock on unmounting
 */
void shrink_dcache_for_umount(struct super_block *sb)
{
        struct dentry *root;

        /* A successful trylock means the caller did not hold s_umount. */
        WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");

        /* Detach the main root from the superblock, then tear its tree down. */
        root = sb->s_root;
        sb->s_root = NULL;
        do_one_tree(root);

        /* Tear down the trees on ->s_roots until the list is empty. */
        while (!hlist_bl_empty(&sb->s_roots)) {
                root = hlist_bl_entry(hlist_bl_first(&sb->s_roots),
                                      struct dentry, d_hash);
                do_one_tree(dget(root));
        }
}

/*
 * d_walk() callback for d_invalidate(): hand the first mountpoint found
 * back through @_data, with a reference taken on it.
 */
static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
        struct dentry **result = _data;

        if (!d_mountpoint(dentry))
                return D_WALK_CONTINUE;

        /* d_walk() calls us with ->d_lock held, hence the _dlock variant. */
        __dget_dlock(dentry);
        *result = dentry;
        return D_WALK_QUIT;
}

/**
 * d_invalidate - detach submounts, prune dcache, and drop
 * @dentry: dentry to invalidate (aka detach, prune and drop)
 */
void d_invalidate(struct dentry *dentry)
{
        bool had_submounts = false;
        spin_lock(&dentry->d_lock);
        if (d_unhashed(dentry)) {
                spin_unlock(&dentry->d_lock);
                return;
        }
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);

        /* Negative dentries can be dropped without further checks */
        if (!dentry->d_inode)
                return;

        shrink_dcache_parent(dentry);
        for (;;) {
                struct dentry *victim = NULL;
                d_walk(dentry, &victim, find_submount);
                if (!victim) {
                        if (had_submounts)
                                shrink_dcache_parent(dentry);
                        return;
                }
                had_submounts = true;
                detach_mounts(victim);
                dput(victim);
        }
}
EXPORT_SYMBOL(d_invalidate);

/**
 * __d_alloc        -        allocate a dcache entry
 * @sb: filesystem it will belong to
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 */
 
static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
        struct dentry *dentry;
        char *dname;
        int err;

        dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
        if (!dentry)
                return NULL;

        /*
         * We guarantee that the inline name is always NUL-terminated.
         * This way the memcpy() done by the name switching in rename
         * will still always have a NUL at the end, even if we might
         * be overwriting an internal NUL character
         */
        dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
        if (unlikely(!name)) {
                /* No name supplied: fall back to slash_name, stored inline. */
                name = &slash_name;
                dname = dentry->d_iname;
        } else if (name->len > DNAME_INLINE_LEN-1) {
                /*
                 * Too long for the inline buffer: allocate an external
                 * name.  offsetof(..., name[1]) leaves room for the
                 * terminating NUL on top of name->len bytes.
                 */
                size_t size = offsetof(struct external_name, name[1]);
                struct external_name *p = kmalloc(size + name->len,
                                                  GFP_KERNEL_ACCOUNT |
                                                  __GFP_RECLAIMABLE);
                if (!p) {
                        kmem_cache_free(dentry_cache, dentry); 
                        return NULL;
                }
                atomic_set(&p->u.count, 1);
                dname = p->name;
        } else  {
                dname = dentry->d_iname;
        }        

        dentry->d_name.len = name->len;
        dentry->d_name.hash = name->hash;
        memcpy(dname, name->name, name->len);
        dname[name->len] = 0;

        /* Make sure we always see the terminating NUL character */
        smp_store_release(&dentry->d_name.name, dname); /* ^^^ */

        /* Starts with one reference, no inode, and itself as parent. */
        dentry->d_lockref.count = 1;
        dentry->d_flags = 0;
        spin_lock_init(&dentry->d_lock);
        seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
        dentry->d_inode = NULL;
        dentry->d_parent = dentry;
        dentry->d_sb = sb;
        dentry->d_op = NULL;
        dentry->d_fsdata = NULL;
        INIT_HLIST_BL_NODE(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_LIST_HEAD(&dentry->d_subdirs);
        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_LIST_HEAD(&dentry->d_child);
        /* d_op and d_flags were cleared above, as d_set_d_op() expects. */
        d_set_d_op(dentry, dentry->d_sb->s_d_op);

        if (dentry->d_op && dentry->d_op->d_init) {
                err = dentry->d_op->d_init(dentry);
                if (err) {
                        /* Undo both allocations; the error code itself is not reported. */
                        if (dname_external(dentry))
                                kfree(external_name(dentry));
                        kmem_cache_free(dentry_cache, dentry);
                        return NULL;
                }
        }

        /* Only dentries that are actually returned are counted. */
        this_cpu_inc(nr_dentry);

        return dentry;
}

/**
 * d_alloc        -        allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 */
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
        struct dentry *child = __d_alloc(parent->d_sb, name);

        if (child) {
                spin_lock(&parent->d_lock);
                /*
                 * Only the parent needs locking: nobody else can see the
                 * new child yet, so it is not subject to concurrency.
                 */
                __dget_dlock(parent);
                child->d_parent = parent;
                list_add(&child->d_child, &parent->d_subdirs);
                spin_unlock(&parent->d_lock);
        }

        /* NULL when the allocation failed. */
        return child;
}
EXPORT_SYMBOL(d_alloc);

/*
 * Allocate a dentry on @sb without a parent or a caller-supplied name;
 * __d_alloc() substitutes slash_name when it is given a NULL name.
 * Returns NULL on allocation failure.
 */
struct dentry *d_alloc_anon(struct super_block *sb)
{
        return __d_alloc(sb, NULL);
}
EXPORT_SYMBOL(d_alloc_anon);

/*
 * Allocate a cursor dentry for @parent: flagged DCACHE_DENTRY_CURSOR and
 * holding a reference on @parent.  The cursor is not added to the
 * parent's ->d_subdirs list here.
 */
struct dentry *d_alloc_cursor(struct dentry * parent)
{
        struct dentry *cursor = d_alloc_anon(parent->d_sb);

        if (!cursor)
                return cursor;

        cursor->d_flags |= DCACHE_DENTRY_CURSOR;
        cursor->d_parent = dget(parent);
        return cursor;
}

/**
 * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
 * @sb: the superblock
 * @name: qstr of the name
 *
 * For a filesystem that just pins its dentries in memory and never
 * performs lookups at all, return an unhashed IS_ROOT dentry.
 * This is used for pipes, sockets et.al. - the stuff that should
 * never be anyone's children or parents.  Unlike all other
 * dentries, these will not have RCU delay between dropping the
 * last reference and freeing them.
 *
 * The only user is alloc_file_pseudo() and that's what should
 * be considered a public interface.  Don't use directly.
 */
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
        struct dentry *pseudo;

        pseudo = __d_alloc(sb, name);
        /* Flag the dentry DCACHE_NORCU; a failed allocation is passed back as is. */
        if (likely(pseudo))
                pseudo->d_flags |= DCACHE_NORCU;

        return pseudo;
}

struct dentry *d_alloc_name(struct dentry *parent, const char *name)
{
        struct qstr q;

        q.name = name;
        q.hash_len = hashlen_string(parent, name);
        return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);

/*
 * d_set_d_op - attach a dentry_operations table to a dentry
 * @dentry: dentry to set up; expected to have no operations yet
 * @op: operations table, may be NULL
 *
 * Stores @op in ->d_op and mirrors the presence of each method in the
 * matching DCACHE_OP_* bit of ->d_flags.  Warns once if the dentry
 * already has an operations table or already carries any of the
 * DCACHE_OP_* bits this function is about to manage.
 */
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
        WARN_ON_ONCE(dentry->d_op);
        /* Cover every flag set below, DCACHE_OP_PRUNE included. */
        WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH        |
                                DCACHE_OP_COMPARE        |
                                DCACHE_OP_REVALIDATE        |
                                DCACHE_OP_WEAK_REVALIDATE        |
                                DCACHE_OP_DELETE        |
                                DCACHE_OP_PRUNE        |
                                DCACHE_OP_REAL));
        dentry->d_op = op;
        if (!op)
                return;
        if (op->d_hash)
                dentry->d_flags |= DCACHE_OP_HASH;
        if (op->d_compare)
                dentry->d_flags |= DCACHE_OP_COMPARE;
        if (op->d_revalidate)
                dentry->d_flags |= DCACHE_OP_REVALIDATE;
        if (op->d_weak_revalidate)
                dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
        if (op->d_delete)
                dentry->d_flags |= DCACHE_OP_DELETE;
        if (op->d_prune)
                dentry->d_flags |= DCACHE_OP_PRUNE;
        if (op->d_real)
                dentry->d_flags |= DCACHE_OP_REAL;
}
EXPORT_SYMBOL(d_set_d_op);


/*
 * d_set_fallthru - Mark a dentry as falling through to a lower layer
 * @dentry - The dentry to mark
 *
 * Mark a dentry as falling through to the lower layer (as set with
 * d_pin_lower()).  This flag may be recorded on the medium.
 */
void d_set_fallthru(struct dentry *dentry)
{
        /* The flag is set under ->d_lock. */
        spin_lock(&dentry->d_lock);
        dentry->d_flags |= DCACHE_FALLTHRU;
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_set_fallthru);

/*
 * Work out the DCACHE_*_TYPE bits (plus DCACHE_NEED_AUTOMOUNT) that a
 * dentry should carry for @inode.  A NULL inode yields DCACHE_MISS_TYPE.
 * As a side effect, IOP_LOOKUP / IOP_NOFOLLOW are recorded in
 * inode->i_opflags once the corresponding fact has been established.
 */
static unsigned d_flags_for_inode(struct inode *inode)
{
        unsigned type;

        if (!inode)
                return DCACHE_MISS_TYPE;

        if (S_ISDIR(inode->i_mode)) {
                type = DCACHE_DIRECTORY_TYPE;
                if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
                        /* A directory without ->lookup is an autodir. */
                        if (unlikely(!inode->i_op->lookup))
                                type = DCACHE_AUTODIR_TYPE;
                        else
                                inode->i_opflags |= IOP_LOOKUP;
                }
        } else {
                /* Regular vs. special, unless it turns out to be a symlink. */
                type = S_ISREG(inode->i_mode) ? DCACHE_REGULAR_TYPE
                                              : DCACHE_SPECIAL_TYPE;
                if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
                        if (unlikely(inode->i_op->get_link))
                                type = DCACHE_SYMLINK_TYPE;
                        else
                                inode->i_opflags |= IOP_NOFOLLOW;
                }
        }

        if (unlikely(IS_AUTOMOUNT(inode)))
                type |= DCACHE_NEED_AUTOMOUNT;
        return type;
}

/*
 * Attach @inode to @dentry: link the dentry into the inode's alias list
 * and publish the inode pointer and type flags under a d_seq write
 * section.  Both callers in this file hold inode->i_lock around the call.
 */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
        /* Type flags are computed before ->d_lock is taken. */
        unsigned add_flags = d_flags_for_inode(inode);
        WARN_ON(d_in_lookup(dentry));

        spin_lock(&dentry->d_lock);
        /*
         * Decrement negative dentry count if it was in the LRU list.
         */
        if (dentry->d_flags & DCACHE_LRU_LIST)
                this_cpu_dec(nr_dentry_negative);
        hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        /* The inode/type update is bracketed by a d_seq write section. */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_set_inode_and_type(dentry, inode, add_flags);
        raw_write_seqcount_end(&dentry->d_seq);
        fsnotify_update_flags(dentry);
        spin_unlock(&dentry->d_lock);
}

/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */
 
void d_instantiate(struct dentry *entry, struct inode * inode)
{
        /* The dentry must not already be on an inode's alias list. */
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));

        /* With no inode there is nothing to attach. */
        if (!inode)
                return;

        security_d_instantiate(entry, inode);

        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate);

/*
 * This should be equivalent to d_instantiate() + unlock_new_inode(),
 * with lockdep-related part of unlock_new_inode() done before
 * anything else.  Use that instead of open-coding d_instantiate()/
 * unlock_new_inode() combinations.
 */
void d_instantiate_new(struct dentry *entry, struct inode *inode)
{
        /* Unlike d_instantiate(), a NULL inode is a bug here. */
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
        BUG_ON(!inode);
        /* Lockdep annotation is done first, before anything else. */
        lockdep_annotate_inode_mutex_key(inode);
        security_d_instantiate(entry, inode);
        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        /* The inode is expected to still be marked new at this point. */
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW & ~I_CREATING;
        /*
         * NOTE(review): the barrier presumably orders the i_state update
         * before the wakeup of __I_NEW waiters - confirm against
         * unlock_new_inode().
         */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate_new);

/*
 * Allocate an anonymous dentry for @root_inode and attach the inode to it.
 * Returns NULL if @root_inode is NULL or the allocation fails; in the
 * latter case the inode reference is dropped with iput().
 */
struct dentry *d_make_root(struct inode *root_inode)
{
        struct dentry *root;

        if (!root_inode)
                return NULL;

        root = d_alloc_anon(root_inode->i_sb);
        if (!root) {
                iput(root_inode);
                return root;
        }

        d_instantiate(root, root_inode);
        return root;
}
EXPORT_SYMBOL(d_make_root);

/*
 * Attach @inode to the freshly allocated @dentry unless the inode already
 * has an alias.  In that case the existing alias is returned and both the
 * new dentry and the inode reference are dropped.  With @disconnected
 * false the dentry is additionally put on the superblock's ->s_roots list.
 */
static struct dentry *__d_instantiate_anon(struct dentry *dentry,
                                           struct inode *inode,
                                           bool disconnected)
{
        struct dentry *alias;
        unsigned type_flags;

        security_d_instantiate(dentry, inode);
        spin_lock(&inode->i_lock);

        alias = __d_find_any_alias(inode);
        if (alias) {
                /* Somebody beat us to it: use their dentry instead. */
                spin_unlock(&inode->i_lock);
                dput(dentry);
                iput(inode);
                return alias;
        }

        /* attach a disconnected dentry */
        type_flags = d_flags_for_inode(inode);
        if (disconnected)
                type_flags |= DCACHE_DISCONNECTED;

        spin_lock(&dentry->d_lock);
        __d_set_inode_and_type(dentry, inode, type_flags);
        hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        if (!disconnected) {
                hlist_bl_lock(&dentry->d_sb->s_roots);
                hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots);
                hlist_bl_unlock(&dentry->d_sb->s_roots);
        }
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);

        return dentry;
}

/*
 * Attach @inode to @dentry as a disconnected dentry.  If
 * __d_find_any_alias() already finds an alias for the inode, that alias is
 * returned instead and @dentry and the inode reference are dropped.
 */
struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode)
{
        return __d_instantiate_anon(dentry, inode, true);
}
EXPORT_SYMBOL(d_instantiate_anon);

/*
 * Common helper behind d_obtain_alias() and d_obtain_root(): return an
 * existing alias of @inode if there is one, otherwise allocate an
 * anonymous dentry and attach the inode to it.  A NULL @inode becomes
 * ERR_PTR(-ESTALE) and an ERR_PTR @inode is passed through.  On every
 * path that does not hand the inode to a new dentry, the inode reference
 * is dropped.
 */
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
        struct dentry *alias;
        struct dentry *fresh;

        if (!inode)
                return ERR_PTR(-ESTALE);
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        alias = d_find_any_alias(inode);
        if (!alias) {
                fresh = d_alloc_anon(inode->i_sb);
                if (fresh)
                        return __d_instantiate_anon(fresh, inode, disconnected);
                alias = ERR_PTR(-ENOMEM);
        }

        /* Existing alias or allocation failure: the inode ref is not consumed. */
        iput(inode);
        return alias;
}

/**
 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
 * similar open by handle operations.  The returned dentry may be anonymous,
 * or may have a full name (if the inode was already in the cache).
 *
 * When called on a directory inode, we must ensure that the inode only ever
 * has one dentry.  If a dentry is found, that is returned instead of
 * allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is released.
 * To make it easier to use in export operations a %NULL or IS_ERR inode may
 * be passed in and the error will be propagated to the return value,
 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_alias(struct inode *inode)
{
        /* true: a newly allocated dentry is marked DCACHE_DISCONNECTED. */
        return __d_obtain_alias(inode, true);
}
EXPORT_SYMBOL(d_obtain_alias);

/**
 * d_obtain_root - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain an IS_ROOT dentry for the root of a filesystem.
 *
 * We must ensure that directory inodes only ever have one dentry.  If a
 * dentry is found, that is returned instead of allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is
 * released.  A %NULL or IS_ERR inode may be passed in and the error will
 * be propagated to the return value, with a %NULL @inode replaced by
 * ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_root(struct inode *inode)
{
        /* false: a newly allocated dentry goes on ->s_roots, not DISCONNECTED. */
        return __d_obtain_alias(inode, false);
}
EXPORT_SYMBOL(d_obtain_root);

/**
 * d_add_ci - lookup or allocate new dentry with case-exact name
 * @inode:  the inode case-insensitive lookup has found
 * @dentry: the negative dentry that was passed to the parent's lookup func
 * @name:   the case-exact name to be associated with the returned dentry
 *
 * This is to avoid filling the dcache with case-insensitive names to the
 * same inode, only the actual correct case is stored in the dcache for
 * case-insensitive filesystems.
 *
 * For a case-insensitive lookup match and if the case-exact dentry
 * already exists in the dcache, use it and return it.
 *
 * If no entry exists with the exact case name, allocate new dentry with
 * the exact case, and return the spliced entry.
 */
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
                        struct qstr *name)
{
        struct dentry *exact, *spliced;

        /*
         * If a dentry with the case-exact name is already hashed, hand
         * that one back (an error pointer from the lookup is passed on
         * the same way).
         */
        exact = d_hash_and_lookup(dentry->d_parent, name);
        if (exact)
                goto drop_inode;

        /* Otherwise allocate one, matching how @dentry itself was created. */
        if (!d_in_lookup(dentry)) {
                exact = d_alloc(dentry->d_parent, name);
                if (!exact) {
                        exact = ERR_PTR(-ENOMEM);
                        goto drop_inode;
                }
        } else {
                exact = d_alloc_parallel(dentry->d_parent, name,
                                         dentry->d_wait);
                /* Error, or an already completed dentry: nothing to splice. */
                if (IS_ERR(exact) || !d_in_lookup(exact))
                        goto drop_inode;
        }

        spliced = d_splice_alias(inode, exact);
        if (!spliced)
                return exact;

        /* d_splice_alias() produced a different result; drop ours. */
        dput(exact);
        return spliced;

drop_inode:
        iput(inode);
        return exact;
}
EXPORT_SYMBOL(d_add_ci);


/*
 * Does @dentry carry the name @name?  Uses the parent's ->d_compare()
 * when DCACHE_OP_COMPARE is set on @parent, otherwise a length check
 * followed by dentry_cmp().
 */
static inline bool d_same_name(const struct dentry *dentry,
                                const struct dentry *parent,
                                const struct qstr *name)
{
        /* Common case: no filesystem-specific comparison. */
        if (likely(!(parent->d_flags & DCACHE_OP_COMPARE)))
                return dentry->d_name.len == name->len &&
                       dentry_cmp(dentry, name->name, name->len) == 0;

        return parent->d_op->d_compare(dentry,
                                       dentry->d_name.len, dentry->d_name.name,
                                       name) == 0;
}

/**
 * __d_lookup_rcu - search for a dentry (racy, store-free)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * @seqp: returns d_seq value at the point where the dentry was found
 * Returns: dentry, or NULL
 *
 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
 * resolution (store-free path walking) design described in
 * Documentation/filesystems/path-lookup.txt.
 *
 * This is not to be used outside core vfs.
 *
 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
 * held, and rcu_read_lock held. The returned dentry must not be stored into
 * without taking d_lock and checking d_seq sequence count against @seq
 * returned here.
 *
 * A refcount may be taken on the found dentry with the d_rcu_to_refcount
 * function.
 *
 * Alternatively, __d_lookup_rcu may be called again to look up the child of
 * the returned dentry, so long as its parent's seqlock is checked after the
 * child is looked up. Thus, an interlocking stepping of sequence lock checks
 * is formed, giving integrity down the path walk.
 *
 * NOTE! The caller *has* to check the resulting dentry against the sequence
 * number we've returned before using any of the resulting dentry state!
 */
struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                const struct qstr *name,
                                unsigned *seqp)
{
        /* Combined hash/length word of the name we are looking for. */
        u64 hashlen = name->hash_len;
        const unsigned char *str = name->name;
        /* Hash chain selected by the hash half of hashlen. */
        struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
        struct hlist_bl_node *node;
        struct dentry *dentry;

        /*
         * Note: There is significant duplication with __d_lookup() which is
         * required to prevent single threaded performance regressions
         * especially on architectures where smp_rmb (in seqcounts) are costly.
         * Keep the two functions in sync.
         */

        /*
         * The hash list is protected using RCU.
         *
         * Carefully use d_seq when comparing a candidate dentry, to avoid
         * races with d_move().
         *
         * It is possible that concurrent renames can mess up our list
         * walk here and result in missing our dentry, resulting in the
         * false-negative result. d_lookup() protects against concurrent
         * renames using rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                unsigned seq;

seqretry:
                /*
                 * The dentry sequence count protects us from concurrent
                 * renames, and thus protects parent and name fields.
                 *
                 * The caller must perform a seqcount check in order
                 * to do anything useful with the returned dentry.
                 *
                 * NOTE! We do a "raw" seqcount_begin here. That means that
                 * we don't wait for the sequence count to stabilize if it
                 * is in the middle of a sequence change. If we do the slow
                 * dentry compare, we will do seqretries until it is stable,
                 * and if we end up with a successful lookup, we actually
                 * want to exit RCU lookup anyway.
                 *
                 * Note that raw_seqcount_begin still *does* smp_rmb(), so
                 * we are still guaranteed NUL-termination of ->d_name.name.
                 */
                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
                        continue;
                if (d_unhashed(dentry))
                        continue;

                if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
                        int tlen;
                        const char *tname;
                        /*
                         * Only the hash half is checked here; length and
                         * bytes are left to the parent's ->d_compare().
                         */
                        if (dentry->d_name.hash != hashlen_hash(hashlen))
                                continue;
                        tlen = dentry->d_name.len;
                        tname = dentry->d_name.name;
                        /* we want a consistent (name,len) pair */
                        if (read_seqcount_retry(&dentry->d_seq, seq)) {
                                cpu_relax();
                                goto seqretry;
                        }
                        if (parent->d_op->d_compare(dentry,
                                                    tlen, tname, name) != 0)
                                continue;
                } else {
                        /*
                         * hash_len is compared as a whole, so a difference
                         * in either the hash or the length rejects the
                         * candidate before the byte compare.
                         */
                        if (dentry->d_name.hash_len != hashlen)
                                continue;
                        if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
                                continue;
                }
                /* Hand back the sampled count for the caller to validate. */
                *seqp = seq;
                return dentry;
        }
        /* Nothing on the chain matched; *seqp is left untouched. */
        return NULL;
}

/**
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * d_lookup searches the children of the parent dentry for the name in
 * question. If the dentry is found its reference count is incremented and the
 * dentry is returned. The caller must use dput to free the entry when it has
 * finished using it. %NULL is returned if the dentry does not exist.
 */
struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
{
        struct dentry *hit;
        unsigned rseq;

        /*
         * A hit from __d_lookup() is returned at once.  A miss is only
         * trusted if rename_lock did not change while we were searching;
         * otherwise the search is repeated.
         */
        for (;;) {
                rseq = read_seqbegin(&rename_lock);
                hit = __d_lookup(parent, name);
                if (hit || !read_seqretry(&rename_lock, rseq))
                        return hit;
        }
}
EXPORT_SYMBOL(d_lookup);

/**
 * __d_lookup - search for a dentry (racy)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * __d_lookup is like d_lookup, however it may (rarely) return a
 * false-negative result due to unrelated rename activity.
 *
 * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
 * however it must be used carefully, eg. with a following d_lookup in
 * the case of failure.
 *
 * __d_lookup callers must be commented.
 */
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
        unsigned int hash = name->hash;
        struct hlist_bl_head *bucket = d_hash(hash);
        struct hlist_bl_node *pos;
        struct dentry *match = NULL;
        struct dentry *cand;

        /*
         * Note: this walk largely duplicates __d_lookup_rcu.  The
         * duplication is deliberate, to prevent single threaded performance
         * regressions, especially on architectures where smp_rmb (in
         * seqcounts) is costly.  Keep the two functions in sync.
         */

        /*
         * The hash chain is walked under RCU.  Each candidate is examined
         * with its d_lock held, to avoid races with d_move().
         *
         * Concurrent renames can still disturb the walk and make us miss
         * the dentry (a false negative); d_lookup() guards against that
         * with the rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        rcu_read_lock();

        hlist_bl_for_each_entry_rcu(cand, pos, bucket, d_hash) {
                /* Cheap filter without the lock: the hash must match. */
                if (cand->d_name.hash != hash)
                        continue;

                spin_lock(&cand->d_lock);
                if (cand->d_parent == parent &&
                    !d_unhashed(cand) &&
                    d_same_name(cand, parent, name)) {
                        /* Take the reference while d_lock is still held. */
                        cand->d_lockref.count++;
                        match = cand;
                }
                spin_unlock(&cand->d_lock);

                if (match)
                        break;
        }

        rcu_read_unlock();

        return match;
}

/**
 * d_hash_and_lookup - hash the qstr then search for a dentry
 * @dir: Directory to search in
 * @name: qstr of name we wish to find
 *
 * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
 */
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
{
        int rc;

        /*
         * The standard hash is always computed first: a filesystem's
         * d_op->d_hash() routine may choose to leave the value unchanged.
         */
        name->hash = full_name_hash(dir, name->name, name->len);

        /* No fs-specific hash function: look up with the standard hash. */
        if (!(dir->d_flags & DCACHE_OP_HASH))
                return d_lookup(dir, name);

        rc = dir->d_op->d_hash(dir, name);
        if (unlikely(rc < 0))
                return ERR_PTR(rc);

        return d_lookup(dir, name);
}
EXPORT_SYMBOL(d_hash_and_lookup);

/*
 * When a file is deleted, we have two options:
 * - turn this dentry into a negative dentry
 * - unhash this dentry and free it.
 *
 * Usually, we want to just turn this into
 * a negative dentry, but if anybody else is
 * currently using the dentry or the inode
 * we can't do that and we fall back on removing
 * it from the hash queues and waiting for
 * it to be deleted later when it has no users
 */
 
/**
 * d_delete - delete a dentry
 * @dentry: The dentry to delete
 *
 * Turn the dentry into a negative dentry if possible, otherwise
 * remove it from the hash queues so it can be deleted later
 */
 
void d_delete(struct dentry * dentry)
{
        struct inode *inode = dentry->d_inode;

        /* Lock order: the inode's i_lock first, then the dentry's d_lock. */
        spin_lock(&inode->i_lock);
        spin_lock(&dentry->d_lock);

        if (dentry->d_lockref.count != 1) {
                /*
                 * Somebody else still holds a reference: only take the
                 * dentry out of the hash and release both locks here.
                 */
                __d_drop(dentry);
                spin_unlock(&dentry->d_lock);
                spin_unlock(&inode->i_lock);
                return;
        }

        /*
         * We are the only user, so the dentry can be detached from its
         * inode.  No unlock calls follow on this path.
         * NOTE(review): dentry_unlink_inode() presumably drops d_lock and
         * i_lock itself; it is not visible here - confirm.
         */
        dentry->d_flags &= ~DCACHE_CANT_MOUNT;
        dentry_unlink_inode(dentry);
}
EXPORT_SYMBOL(d_delete);

/*
 * Insert @entry at the head of the hash chain chosen by its name hash.
 * The chain's own lock is taken around the insertion.
 */
static void __d_rehash(struct dentry *entry)
{
        struct hlist_bl_head *bucket;

        bucket = d_hash(entry->d_name.hash);

        hlist_bl_lock(bucket);
        hlist_bl_add_head_rcu(&entry->d_hash, bucket);
        hlist_bl_unlock(bucket);
}

/**
 * d_rehash        - add an entry back to the hash
 * @entry: dentry to add to the hash
 *
 * Adds a dentry to the hash according to its name.
 */
 
void d_rehash(struct dentry * entry)
{
        /* Hold the dentry's d_lock while __d_rehash() puts it on its chain. */
        spin_lock(&entry->d_lock);
        __d_rehash(entry);
        spin_unlock(&entry->d_lock);
}
EXPORT_SYMBOL(d_rehash);

/*
 * Begin an add to directory @dir: spin until ->i_dir_seq is even and we
 * have advanced it by one ourselves with cmpxchg().  Returns the even value
 * that was replaced, for the caller to pass to end_dir_add().
 */
static inline unsigned start_dir_add(struct inode *dir)
{
        unsigned seen;

        for (;;) {
                seen = dir->i_dir_seq;
                /* An odd value means another add is in progress. */
                if ((seen & 1) == 0 &&
                    cmpxchg(&dir->i_dir_seq, seen, seen + 1) == seen)
                        break;
                cpu_relax();
        }

        return seen;
}

/*
 * Finish an add to directory @dir begun with start_dir_add().  @n is the
 * (even) value that call returned; the counter is set to @n + 2 with a
 * release store, which makes it even again.
 */
static inline void end_dir_add(struct inode *dir, unsigned n)
{
        smp_store_release(&dir->i_dir_seq, n + 2);
}

/*
 * Sleep until @dentry is no longer in-lookup.  ->d_lock is dropped around
 * each schedule() and re-taken afterwards, so it is held again on return.
 */
static void d_wait_lookup(struct dentry *dentry)
{
        DECLARE_WAITQUEUE(wait, current);

        /* Nothing to wait for. */
        if (!d_in_lookup(dentry))
                return;

        add_wait_queue(dentry->d_wait, &wait);
        do {
                /* Mark ourselves sleeping before the lock is released. */
                set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock(&dentry->d_lock);
                schedule();
                spin_lock(&dentry->d_lock);
        } while (d_in_lookup(dentry));
}

/*
 * d_alloc_parallel - find or set up a dentry for a lookup of @name in @parent
 * @parent: directory dentry the name is looked up in
 * @name: name (with its hash already computed) being looked up
 * @wq: wait queue head stored in the new dentry's ->d_wait if it is inserted
 *
 * A candidate dentry is allocated up front.  The function then returns one of:
 *  - ERR_PTR(-ENOMEM) if that allocation failed;
 *  - an already hashed dentry found by __d_lookup_rcu(), with a reference
 *    taken (the candidate is released with dput());
 *  - a dentry found on the in-lookup chain, after waiting for it to leave
 *    the in-lookup state and re-checking that it still matches (the
 *    candidate is released with dput());
 *  - the candidate itself, flagged DCACHE_PAR_LOOKUP and added to the
 *    in-lookup chain.
 * Whenever a race is detected the sequence restarts from "retry".
 */
struct dentry *d_alloc_parallel(struct dentry *parent,
                                const struct qstr *name,
                                wait_queue_head_t *wq)
{
        unsigned int hash = name->hash;
        struct hlist_bl_head *b = in_lookup_hash(parent, hash);
        struct hlist_bl_node *node;
        struct dentry *new = d_alloc(parent, name);
        struct dentry *dentry;
        unsigned seq, r_seq, d_seq;

        if (unlikely(!new))
                return ERR_PTR(-ENOMEM);

retry:
        rcu_read_lock();
        /*
         * Sample the directory's add counter and rename_lock before the
         * lockless hash lookup, so that the checks below can tell whether
         * anything changed in between.
         */
        seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
        r_seq = read_seqbegin(&rename_lock);
        dentry = __d_lookup_rcu(parent, name, &d_seq);
        if (unlikely(dentry)) {
                /* Found in the main hash: try to pin it. */
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }
                /* It changed since __d_lookup_rcu() sampled d_seq: redo. */
                if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
                        rcu_read_unlock();
                        dput(dentry);
                        goto retry;
                }
                rcu_read_unlock();
                /* The candidate is not needed. */
                dput(new);
                return dentry;
        }
        /* A miss is only trusted if no rename happened meanwhile. */
        if (unlikely(read_seqretry(&rename_lock, r_seq))) {
                rcu_read_unlock();
                goto retry;
        }

        /* Odd counter: an add was in progress (see start_dir_add()). */
        if (unlikely(seq & 1)) {
                rcu_read_unlock();
                goto retry;
        }

        hlist_bl_lock(b);
        /* Re-check the counter now that the in-lookup chain is locked. */
        if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
                hlist_bl_unlock(b);
                rcu_read_unlock();
                goto retry;
        }
        /*
         * No changes for the parent since the beginning of d_lookup().
         * Since all removals from the chain happen with hlist_bl_lock(),
         * any potential in-lookup matches are going to stay here until
         * we unlock the chain.  All fields are stable in everything
         * we encounter.
         */
        hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
                if (dentry->d_name.hash != hash)
                        continue;
                if (dentry->d_parent != parent)
                        continue;
                if (!d_same_name(dentry, parent, name))
                        continue;
                hlist_bl_unlock(b);
                /* now we can try to grab a reference */
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }

                rcu_read_unlock();
                /*
                 * somebody is likely to be still doing lookup for it;
                 * wait for them to finish
                 */
                spin_lock(&dentry->d_lock);
                d_wait_lookup(dentry);
                /*
                 * it's not in-lookup anymore; in principle we should repeat
                 * everything from dcache lookup, but it's likely to be what
                 * d_lookup() would've found anyway.  If it is, just return it;
                 * otherwise we really have to repeat the whole thing.
                 */
                if (unlikely(dentry->d_name.hash != hash))
                        goto mismatch;
                if (unlikely(dentry->d_parent != parent))
                        goto mismatch;
                if (unlikely(d_unhashed(dentry)))
                        goto mismatch;
                if (unlikely(!d_same_name(dentry, parent, name)))
                        goto mismatch;
                /* OK, it *is* a hashed match; return it */
                spin_unlock(&dentry->d_lock);
                dput(new);
                return dentry;
        }
        rcu_read_unlock();
        /* we can't take ->d_lock here; it's OK, though. */
        new->d_flags |= DCACHE_PAR_LOOKUP;
        new->d_wait = wq;
        /* Publish the candidate on the in-lookup chain, still locked. */
        hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
        hlist_bl_unlock(b);
        return new;
mismatch:
        /* The dentry we waited for is no longer a match: drop it, redo. */
        spin_unlock(&dentry->d_lock);
        dput(dentry);
        goto retry;
}
EXPORT_SYMBOL(d_alloc_parallel);

/*
 * End the in-lookup state of @dentry: under the in-lookup chain lock, clear
 * DCACHE_PAR_LOOKUP, unlink the dentry from the chain, wake everybody
 * waiting on ->d_wait and clear that pointer.  Afterwards the alias node
 * and the LRU list head are re-initialised.
 */
void __d_lookup_done(struct dentry *dentry)
{
        struct hlist_bl_head *chain;

        chain = in_lookup_hash(dentry->d_parent, dentry->d_name.hash);

        hlist_bl_lock(chain);
        dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
        __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
        wake_up_all(dentry->d_wait);
        dentry->d_wait = NULL;
        hlist_bl_unlock(chain);

        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_LIST_HEAD(&dentry->d_lru);
}
EXPORT_SYMBOL(__d_lookup_done);

/*
 * Attach @inode (which may be NULL) to @dentry and put the dentry on its
 * hash chain.
 *
 * inode->i_lock held if inode is non-NULL; it is released here before
 * returning.
 */

static inline void __d_add(struct dentry *dentry, struct inode *inode)
{
        struct inode *dir = NULL;
        unsigned n;
        spin_lock(&dentry->d_lock);
        if (unlikely(d_in_lookup(dentry))) {
                /*
                 * Still in-lookup: finish the lookup, with the parent
                 * directory's add counter bumped around the rest of the
                 * work.  n is only set, and only used, when dir is set.
                 */
                dir = dentry->d_parent->d_inode;
                n = start_dir_add(dir);
                __d_lookup_done(dentry);
        }
        if (inode) {
                unsigned add_flags = d_flags_for_inode(inode);
                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
                /* Inode and type are updated inside a d_seq write section. */
                raw_write_seqcount_begin(&dentry->d_seq);
                __d_set_inode_and_type(dentry, inode, add_flags);
                raw_write_seqcount_end(&dentry->d_seq);
                fsnotify_update_flags(dentry);
        }
        __d_rehash(dentry);
        if (dir)
                end_dir_add(dir, n);
        spin_unlock(&dentry->d_lock);
        if (inode)
                spin_unlock(&inode->i_lock);
}

/**
 * d_add - add dentry to hash queues
 * @entry: dentry to add
 * @inode: The inode to attach to this dentry
 *
 * This adds the entry to the hash queues and initializes @inode.
 * The entry was actually filled in earlier during d_alloc().
 */

void d_add(struct dentry *entry, struct inode *inode)
{
        if (inode) {
                security_d_instantiate(entry, inode);
                /* __d_add() expects i_lock held and releases it itself. */
                spin_lock(&inode->i_lock);
        }
        __d_add(entry, inode);
}
EXPORT_SYMBOL(d_add);

/**
 * d_exact_alias - find and hash an exact unhashed alias
 * @entry: dentry to add
 * @inode: The inode to go with this dentry
 *
 * If an unhashed dentry with the same name/parent and desired
 * inode already exists, hash and return it.  Otherwise, return
 * NULL.
 *
 * Parent directory should be locked.
 */
struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
{
        struct dentry *found = NULL;
        struct dentry *alias;
        unsigned int hash = entry->d_name.hash;

        spin_lock(&inode->i_lock);
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                /*
                 * alias->d_lock is not needed for these checks: aliases
                 * with d_parent == entry->d_parent are not subject to name
                 * or parent changes, because the parent inode i_mutex is
                 * held.
                 */
                if (alias->d_name.hash != hash ||
                    alias->d_parent != entry->d_parent ||
                    !d_same_name(alias, entry->d_parent, &entry->d_name))
                        continue;

                /*
                 * Only the first alias with matching name and parent is
                 * considered.  It is returned, with a reference taken and
                 * put back on the hash, only if it is currently unhashed.
                 */
                spin_lock(&alias->d_lock);
                if (d_unhashed(alias)) {
                        __dget_dlock(alias);
                        __d_rehash(alias);
                        found = alias;
                }
                spin_unlock(&alias->d_lock);
                break;
        }
        spin_unlock(&inode->i_lock);

        return found;
}
EXPORT_SYMBOL(d_exact_alias);

/*
 * Exchange the names of @dentry and @target.  Each name is either stored
 * inline in ->d_iname or in external storage (dname_external()), which
 * gives the four cases handled below.  The hash/length word is swapped
 * last, in every case.
 */
static void swap_names(struct dentry *dentry, struct dentry *target)
{
        if (unlikely(dname_external(target))) {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * Both external: swap the pointers
                         */
                        swap(target->d_name.name, dentry->d_name.name);
                } else {
                        /*
                         * dentry:internal, target:external.  Steal target's
                         * storage and make target internal.
                         */
                        memcpy(target->d_iname, dentry->d_name.name,
                                        dentry->d_name.len + 1);
                        dentry->d_name.name = target->d_name.name;
                        target->d_name.name = target->d_iname;
                }
        } else {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * dentry:external, target:internal.  Give dentry's
                         * storage to target and make dentry internal
                         */
                        memcpy(dentry->d_iname, target->d_name.name,
                                        target->d_name.len + 1);
                        target->d_name.name = dentry->d_name.name;
                        dentry->d_name.name = dentry->d_iname;
                } else {
                        /*
                         * Both are internal.
                         */
                        unsigned int i;
                        /* The inline buffers are swapped one long at a time. */
                        BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
                        for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
                                swap(((long *) &dentry->d_iname)[i],
                                     ((long *) &target->d_iname)[i]);
                        }
                }
        }
        swap(dentry->d_name.hash_len, target->d_name.hash_len);
}

/*
 * Give @dentry the same name as @target.  An external name is shared by
 * taking another reference on it; an inline name is copied into @dentry's
 * own inline buffer.  The reference @dentry held on its previous external
 * name, if it had one, is dropped at the end.
 */
static void copy_name(struct dentry *dentry, struct dentry *target)
{
        struct external_name *stale = NULL;

        /* Must be captured before ->d_name is overwritten below. */
        if (unlikely(dname_external(dentry)))
                stale = external_name(dentry);

        if (likely(!dname_external(target))) {
                memcpy(dentry->d_iname, target->d_name.name,
                       target->d_name.len + 1);
                dentry->d_name.name = dentry->d_iname;
                dentry->d_name.hash_len = target->d_name.hash_len;
        } else {
                atomic_inc(&external_name(target)->u.count);
                dentry->d_name = target->d_name;
        }

        if (!stale)
                return;

        /* Last reference gone: free the old name after a grace period. */
        if (likely(atomic_dec_and_test(&stale->u.count)))
                kfree_rcu(stale, u.head);
}

/*
 * __d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 * @exchange: exchange the two dentries
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. Caller must hold
 * rename_lock, the i_mutex of the source and target directories,
 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
 */
static void __d_move(struct dentry *dentry, struct dentry *target,
                     bool exchange)
{
        struct dentry *old_parent, *p;
        struct inode *dir = NULL;
        unsigned n;

        WARN_ON(!dentry->d_inode);
        if (WARN_ON(dentry == target))
                return;

        BUG_ON(d_ancestor(target, dentry));
        old_parent = dentry->d_parent;
        p = d_ancestor(old_parent, target);
        /*
         * Lock the parents first.  Which ones, and in which order, depends
         * on how dentry's old parent and target are related in the tree.
         */
        if (IS_ROOT(dentry)) {
                /* dentry is its own parent: only target's parent to lock. */
                BUG_ON(p);
                spin_lock(&target->d_parent->d_lock);
        } else if (!p) {
                /* target is not a descendent of dentry->d_parent */
                spin_lock(&target->d_parent->d_lock);
                spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
        } else {
                /*
                 * target is below old_parent.  If p == target then target
                 * is a direct child of old_parent and one lock covers both.
                 */
                BUG_ON(p == dentry);
                spin_lock(&old_parent->d_lock);
                if (p != target)
                        spin_lock_nested(&target->d_parent->d_lock,
                                        DENTRY_D_LOCK_NESTED);
        }
        /* Then the two dentries themselves, dentry before target. */
        spin_lock_nested(&dentry->d_lock, 2);
        spin_lock_nested(&target->d_lock, 3);

        if (unlikely(d_in_lookup(target))) {
                /*
                 * target is still in-lookup: finish that lookup, with its
                 * directory's add counter bumped until end_dir_add() below.
                 */
                dir = target->d_parent->d_inode;
                n = start_dir_add(dir);
                __d_lookup_done(target);
        }

        write_seqcount_begin(&dentry->d_seq);
        write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);

        /* unhash both */
        if (!d_unhashed(dentry))
                ___d_drop(dentry);
        if (!d_unhashed(target))
                ___d_drop(target);

        /* ... and switch them in the tree */
        dentry->d_parent = target->d_parent;
        if (!exchange) {
                /* Plain move: dentry takes target's name, target stays out. */
                copy_name(dentry, target);
                /* NOTE(review): presumably marks target as unhashed - confirm. */
                target->d_hash.pprev = NULL;
                /* The new parent gains a reference, the old one loses one. */
                dentry->d_parent->d_lockref.count++;
                if (dentry != old_parent) /* wasn't IS_ROOT */
                        WARN_ON(!--old_parent->d_lockref.count);
        } else {
                /* Exchange: target takes dentry's old place and name. */
                target->d_parent = old_parent;
                swap_names(dentry, target);
                list_move(&target->d_child, &target->d_parent->d_subdirs);
                __d_rehash(target);
                fsnotify_update_flags(target);
        }
        list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
        __d_rehash(dentry);
        fsnotify_update_flags(dentry);
        fscrypt_handle_d_move(dentry);

        write_seqcount_end(&target->d_seq);
        write_seqcount_end(&dentry->d_seq);

        if (dir)
                end_dir_add(dir, n);

        /*
         * dentry->d_parent is now what used to be target's parent; unlock
         * it unless it is the same dentry as old_parent, which is unlocked
         * next (and was never locked if dentry was IS_ROOT).
         */
        if (dentry->d_parent != old_parent)
                spin_unlock(&dentry->d_parent->d_lock);
        if (dentry != old_parent)
                spin_unlock(&old_parent->d_lock);
        spin_unlock(&target->d_lock);
        spin_unlock(&dentry->d_lock);
}

/*
 * d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. See the locking
 * requirements for __d_move.
 */
void d_move(struct dentry *dentry, struct dentry *target)
{
        /* __d_move() requires rename_lock held for writing. */
        write_seqlock(&rename_lock);
        __d_move(dentry, target, false);
        write_sequnlock(&rename_lock);
}
EXPORT_SYMBOL(d_move);

/*
 * d_exchange - exchange two dentries
 * @dentry1: first dentry
 * @dentry2: second dentry
 */
void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
{
        write_seqlock(&rename_lock);

        /* Both dentries are expected to be positive and not IS_ROOT. */
        WARN_ON(!dentry1->d_inode);
        WARN_ON(!dentry2->d_inode);
        WARN_ON(IS_ROOT(dentry1));
        WARN_ON(IS_ROOT(dentry2));

        /* exchange == true: the two swap places and names. */
        __d_move(dentry1, dentry2, true);

        write_sequnlock(&rename_lock);
}

/**
 * d_ancestor - search for an ancestor
 * @p1: ancestor dentry
 * @p2: child dentry
 *
 * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
 * an ancestor of p2, else NULL.
 */
struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
{
        struct dentry *cur = p2;

        /* Climb from p2 towards the root, looking for a child of p1. */
        while (!IS_ROOT(cur)) {
                if (cur->d_parent == p1)
                        return cur;
                cur = cur->d_parent;
        }

        return NULL;
}

/*
 * This helper attempts to cope with remotely renamed directories
 *
 * It assumes that the caller is already holding
 * dentry->d_parent->d_inode->i_mutex, and rename_lock
 *
 * Note: If ever the locking in lock_rename() changes, then please
 * remember to update this too...
 */
static int __d_unalias(struct inode *inode,
                struct dentry *dentry, struct dentry *alias)
{
        struct mutex *held_mutex = NULL;
        struct rw_semaphore *held_rwsem = NULL;
        int ret = -ESTALE;

        /*
         * When alias and dentry share a parent no extra locks are needed.
         * Otherwise take the locks a rename would (see lock_rename()), but
         * only with trylock: if either attempt fails, give up and return
         * -ESTALE.
         */
        if (alias->d_parent != dentry->d_parent) {
                if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
                        goto out_err;
                held_mutex = &dentry->d_sb->s_vfs_rename_mutex;

                if (!inode_trylock_shared(alias->d_parent->d_inode))
                        goto out_err;
                /* Remember the rwsem now: __d_move() re-parents alias. */
                held_rwsem = &alias->d_parent->d_inode->i_rwsem;
        }

        __d_move(alias, dentry, false);
        ret = 0;

out_err:
        /* Release whatever was taken, in reverse order. */
        if (held_rwsem)
                up_read(held_rwsem);
        if (held_mutex)
                mutex_unlock(held_mutex);
        return ret;
}

/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode:  the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * If inode is a directory and has an IS_ROOT alias, then d_move that in
 * place of the given dentry and return it, else simply d_add the inode
 * to the dentry and return NULL.
 *
 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
 * we should error out: directories can't have multiple aliases.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned.  Otherwise NULL
 * is returned.  This matches the expected return value of ->lookup.
 *
 * Cluster filesystems may call this function with a negative, hashed dentry.
 * In that case, we know that the inode will be a regular file, and also this
 * will only occur during atomic_open. So we need to check for the dentry
 * being already hashed only in the final case.
 */
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
        /* An error pointer passed as the inode is handed straight back. */
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        BUG_ON(!d_unhashed(dentry));

        /* No inode: just hash @dentry without one. */
        if (!inode)
                goto out;

        security_d_instantiate(dentry, inode);
        /* On the plain path below, __d_add() releases this i_lock. */
        spin_lock(&inode->i_lock);
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *new = __d_find_any_alias(inode);
                if (unlikely(new)) {
                        /* The reference to new ensures it remains an alias */
                        spin_unlock(&inode->i_lock);
                        write_seqlock(&rename_lock);
                        if (unlikely(d_ancestor(new, dentry))) {
                                /*
                                 * The existing alias is an ancestor of
                                 * @dentry; moving it here would create a
                                 * loop, so fail with -ELOOP.
                                 */
                                write_sequnlock(&rename_lock);
                                dput(new);
                                new = ERR_PTR(-ELOOP);
                                pr_warn_ratelimited(
                                        "VFS: Lookup of '%s' in %s %s"
                                        " would have caused loop\n",
                                        dentry->d_name.name,
                                        inode->i_sb->s_type->name,
                                        inode->i_sb->s_id);
                        } else if (!IS_ROOT(new)) {
                                /*
                                 * The alias is attached elsewhere in the
                                 * tree: try to move it here.  Its old parent
                                 * is pinned across the move and released
                                 * afterwards.
                                 */
                                struct dentry *old_parent = dget(new->d_parent);
                                int err = __d_unalias(inode, dentry, new);
                                write_sequnlock(&rename_lock);
                                if (err) {
                                        dput(new);
                                        new = ERR_PTR(err);
                                }
                                dput(old_parent);
                        } else {
                                /* IS_ROOT alias: move it into place. */
                                __d_move(new, dentry, false);
                                write_sequnlock(&rename_lock);
                        }
                        /*
                         * @dentry was not attached to the inode on any of
                         * the paths above.
                         * NOTE(review): presumably this drops the reference
                         * the caller passed in - confirm.
                         */
                        iput(inode);
                        return new;
                }
        }
out:
        __d_add(dentry, inode);
        return NULL;
}
EXPORT_SYMBOL(d_splice_alias);

/*
 * Test whether new_dentry is a subdirectory of old_dentry.
 *
 * Trivially implemented using the dcache structure
 */

/**
 * is_subdir - is new dentry a subdirectory of old_dentry
 * @new_dentry: new dentry
 * @old_dentry: old dentry
 *
 * Returns true if new_dentry is a subdirectory of old_dentry (at any depth)
 * or is old_dentry itself.
 * Returns false otherwise.
 * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
 */
  
bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
        bool result;
        unsigned seq;

        if (new_dentry == old_dentry)
                return true;

        do {
                /* for restarting inner loop in case of seq retry */
                seq = read_seqbegin(&rename_lock);
                /*
                 * Need rcu_readlock to protect against the d_parent trashing
                 * due to d_move
                 */
                rcu_read_lock();
                if (d_ancestor(old_dentry, new_dentry))
                        result = true;
                else
                        result = false;
                rcu_read_unlock();
        } while (read_seqretry(&rename_lock, seq));

        return result;
}
EXPORT_SYMBOL(is_subdir);

/*
 * d_walk() callback used by d_genocide().  @data is the root of the walk,
 * which is itself left untouched.  Unhashed or negative dentries are
 * skipped; every other dentry has its reference count decremented once and
 * is flagged DCACHE_GENOCIDE so that it is not decremented again.
 */
static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
{
        struct dentry *root = data;

        if (dentry == root)
                return D_WALK_CONTINUE;

        if (d_unhashed(dentry) || !dentry->d_inode)
                return D_WALK_SKIP;

        /* Already handled on an earlier visit. */
        if (dentry->d_flags & DCACHE_GENOCIDE)
                return D_WALK_CONTINUE;

        dentry->d_flags |= DCACHE_GENOCIDE;
        dentry->d_lockref.count--;
        return D_WALK_CONTINUE;
}

/*
 * Run d_genocide_kill() over the tree walked by d_walk() from @parent.
 * @parent is also passed as the callback data, so the callback leaves
 * @parent itself alone.
 */
void d_genocide(struct dentry *parent)
{
        d_walk(parent, parent, d_genocide_kill);
}

EXPORT_SYMBOL(d_genocide);

/*
 * Set up @dentry for @inode: drop the inode's link count by one, rename the
 * dentry to "#<inode number>" in its inline name buffer, and instantiate it
 * with @inode.
 */
void d_tmpfile(struct dentry *dentry, struct inode *inode)
{
        inode_dec_link_count(inode);

        /*
         * The dentry must use its inline name buffer, must not be on an
         * alias list, and must be unlinked.
         */
        BUG_ON(dentry->d_name.name != dentry->d_iname);
        BUG_ON(!hlist_unhashed(&dentry->d_u.d_alias));
        BUG_ON(!d_unlinked(dentry));

        /* The name is rewritten under the parent's and the dentry's d_lock. */
        spin_lock(&dentry->d_parent->d_lock);
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
                                     (unsigned long long)inode->i_ino);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dentry->d_parent->d_lock);

        d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);

/*
 * Value of the "dhash_entries=" boot parameter; passed to
 * alloc_large_system_hash() when the dentry hash table is allocated.
 */
static __initdata unsigned long dhash_entries;

/*
 * Parse the "dhash_entries=" value with base auto-detection.
 * Returns 0 when no value string was supplied, 1 once it has been stored.
 */
static int __init set_dhash_entries(char *str)
{
        char *end;

        if (str) {
                dhash_entries = simple_strtoul(str, &end, 0);
                return 1;
        }
        return 0;
}
__setup("dhash_entries=", set_dhash_entries);

/*
 * Early allocation of the dentry hash table.  Nothing is allocated here
 * when hashdist is set; dcache_init() handles that case.
 */
static void __init dcache_init_early(void)
{
        /*
         * Hashes distributed across NUMA nodes must wait until vmalloc
         * space is available, so only allocate now when hashdist is off.
         */
        if (!hashdist) {
                dentry_hashtable =
                        alloc_large_system_hash("Dentry cache",
                                                sizeof(struct hlist_bl_head),
                                                dhash_entries,
                                                13,
                                                HASH_EARLY | HASH_ZERO,
                                                &d_hash_shift,
                                                NULL,
                                                0,
                                                0);
                /* Convert the returned shift into its 32-bit complement. */
                d_hash_shift = 32 - d_hash_shift;
        }
}

/*
 * Create the dentry slab cache and, when hashdist is set, allocate the
 * dentry hash table that dcache_init_early() left unallocated.
 */
static void __init dcache_init(void)
{
        /*
         * No constructor is used: stable state such as the lists could be
         * set up by one, but given the cache nature of the dcache it is
         * probably not worth it.
         */
        dentry_cache = KMEM_CACHE_USERCOPY(dentry,
                SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
                d_iname);

        /* Without hashdist the table was already set up in dcache_init_early. */
        if (hashdist) {
                dentry_hashtable =
                        alloc_large_system_hash("Dentry cache",
                                                sizeof(struct hlist_bl_head),
                                                dhash_entries,
                                                13,
                                                HASH_ZERO,
                                                &d_hash_shift,
                                                NULL,
                                                0,
                                                0);
                /* Convert the returned shift into its 32-bit complement. */
                d_hash_shift = 32 - d_hash_shift;
        }
}

/*
 * SLAB cache for __getname() consumers.  Created in vfs_caches_init()
 * with PATH_MAX-sized objects.
 */
struct kmem_cache *names_cachep __read_mostly;
EXPORT_SYMBOL(names_cachep);

/*
 * Early VFS cache setup: initialise the in-lookup hash buckets, then run
 * the early dcache and inode initialisers.
 */
void __init vfs_caches_init_early(void)
{
        int slot = 0;

        /* Initialise every bucket head of the in-lookup hash table. */
        while (slot < ARRAY_SIZE(in_lookup_hashtable)) {
                INIT_HLIST_BL_HEAD(&in_lookup_hashtable[slot]);
                slot++;
        }

        dcache_init_early();
        inode_init_early();
}

/*
 * Main VFS cache setup: create the names cache, then run the remaining
 * subsystem initialisers in a fixed order.
 */
void __init vfs_caches_init(void)
{
        /* PATH_MAX-sized objects; usercopy region is offset 0, size PATH_MAX. */
        names_cachep = kmem_cache_create_usercopy("names_cache",
                                                  PATH_MAX, 0,
                                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                                  0, PATH_MAX,
                                                  NULL);

        dcache_init();
        inode_init();

        files_init();
        files_maxfiles_init();

        mnt_init();
        bdev_cache_init();
        chrdev_init();
}




































    1 



    1 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#ifdef CONFIG_PREEMPTIRQ_TRACEPOINTS

#undef TRACE_SYSTEM
#define TRACE_SYSTEM preemptirq

#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PREEMPTIRQ_H

#include <linux/ktime.h>
#include <linux/tracepoint.h>
#include <linux/string.h>
#include <asm/sections.h>

/*
 * Event class shared by the irq and preempt enable/disable events.
 *
 * Both addresses are recorded as signed 32-bit offsets from _stext
 * rather than as full unsigned long values; TP_printk adds _stext back
 * to rebuild the pointers before formatting them with %pS.
 */
DECLARE_EVENT_CLASS(preemptirq_template,

        TP_PROTO(unsigned long ip, unsigned long parent_ip),

        TP_ARGS(ip, parent_ip),

        TP_STRUCT__entry(
                __field(s32, caller_offs)
                __field(s32, parent_offs)
        ),

        TP_fast_assign(
                __entry->caller_offs = (s32)(ip - (unsigned long)_stext);
                __entry->parent_offs = (s32)(parent_ip - (unsigned long)_stext);
        ),

        TP_printk("caller=%pS parent=%pS",
                  (void *)((unsigned long)(_stext) + __entry->caller_offs),
                  (void *)((unsigned long)(_stext) + __entry->parent_offs))
);

/*
 * With CONFIG_TRACE_IRQFLAGS, irq_disable and irq_enable are instances of
 * preemptirq_template.  Otherwise the trace_irq_*() calls, including the
 * _rcuidle variants, are defined as empty macros.
 */
#ifdef CONFIG_TRACE_IRQFLAGS
DEFINE_EVENT(preemptirq_template, irq_disable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));

DEFINE_EVENT(preemptirq_template, irq_enable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));
#else
#define trace_irq_enable(...)
#define trace_irq_disable(...)
#define trace_irq_enable_rcuidle(...)
#define trace_irq_disable_rcuidle(...)
#endif

/*
 * With CONFIG_TRACE_PREEMPT_TOGGLE, preempt_disable and preempt_enable are
 * instances of preemptirq_template.  Otherwise the trace_preempt_*()
 * calls, including the _rcuidle variants, are defined as empty macros.
 */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
DEFINE_EVENT(preemptirq_template, preempt_disable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));

DEFINE_EVENT(preemptirq_template, preempt_enable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));
#else
#define trace_preempt_enable(...)
#define trace_preempt_disable(...)
#define trace_preempt_enable_rcuidle(...)
#define trace_preempt_disable_rcuidle(...)
#endif

#endif /* _TRACE_PREEMPTIRQ_H */

#include <trace/define_trace.h>

#else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */
/*
 * CONFIG_PREEMPTIRQ_TRACEPOINTS is off: every irq/preempt trace call,
 * including the _rcuidle variants, expands to nothing.
 */
#define trace_irq_enable(...)
#define trace_irq_disable(...)
#define trace_irq_enable_rcuidle(...)
#define trace_irq_disable_rcuidle(...)
#define trace_preempt_enable(...)
#define trace_preempt_disable(...)
#define trace_preempt_enable_rcuidle(...)
#define trace_preempt_disable_rcuidle(...)
#endif



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ext4

#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXT4_H

#include <linux/writeback.h>
#include <linux/tracepoint.h>

struct ext4_allocation_context;
struct ext4_allocation_request;
struct ext4_extent;
struct ext4_prealloc_space;
struct ext4_inode_info;
struct mpage_da_data;
struct ext4_map_blocks;
struct extent_status;
struct ext4_fsmap;
struct partial_cluster;

/*
 * Map a VFS inode pointer to the struct ext4_inode_info that embeds it
 * as its vfs_inode member.
 */
#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))

/*
 * Format a mask of EXT4_MB_* allocator flags as short names, passing "|"
 * to __print_flags() as the delimiter.
 */
#define show_mballoc_flags(flags) __print_flags(flags, "|",        \
        { EXT4_MB_HINT_MERGE,                "HINT_MERGE" },                \
        { EXT4_MB_HINT_RESERVED,        "HINT_RESV" },                \
        { EXT4_MB_HINT_METADATA,        "HINT_MDATA" },                \
        { EXT4_MB_HINT_FIRST,                "HINT_FIRST" },                \
        { EXT4_MB_HINT_BEST,                "HINT_BEST" },                \
        { EXT4_MB_HINT_DATA,                "HINT_DATA" },                \
        { EXT4_MB_HINT_NOPREALLOC,        "HINT_NOPREALLOC" },        \
        { EXT4_MB_HINT_GROUP_ALLOC,        "HINT_GRP_ALLOC" },        \
        { EXT4_MB_HINT_GOAL_ONLY,        "HINT_GOAL_ONLY" },        \
        { EXT4_MB_HINT_TRY_GOAL,        "HINT_TRY_GOAL" },        \
        { EXT4_MB_DELALLOC_RESERVED,        "DELALLOC_RESV" },        \
        { EXT4_MB_STREAM_ALLOC,                "STREAM_ALLOC" },        \
        { EXT4_MB_USE_ROOT_BLOCKS,        "USE_ROOT_BLKS" },        \
        { EXT4_MB_USE_RESERVED,                "USE_RESV" },                \
        { EXT4_MB_STRICT_CHECK,                "STRICT_CHECK" })

/*
 * Format a mask of EXT4_GET_BLOCKS_* flags (plus EXT4_EX_NOCACHE) as
 * short names, passing "|" to __print_flags() as the delimiter.
 */
#define show_map_flags(flags) __print_flags(flags, "|",                        \
        { EXT4_GET_BLOCKS_CREATE,                "CREATE" },                \
        { EXT4_GET_BLOCKS_UNWRIT_EXT,                "UNWRIT" },                \
        { EXT4_GET_BLOCKS_DELALLOC_RESERVE,        "DELALLOC" },                \
        { EXT4_GET_BLOCKS_PRE_IO,                "PRE_IO" },                \
        { EXT4_GET_BLOCKS_CONVERT,                "CONVERT" },                \
        { EXT4_GET_BLOCKS_METADATA_NOFAIL,        "METADATA_NOFAIL" },        \
        { EXT4_GET_BLOCKS_NO_NORMALIZE,                "NO_NORMALIZE" },        \
        { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,        "CONVERT_UNWRITTEN" },  \
        { EXT4_GET_BLOCKS_ZERO,                        "ZERO" },                \
        { EXT4_GET_BLOCKS_IO_SUBMIT,                "IO_SUBMIT" },                \
        { EXT4_EX_NOCACHE,                        "EX_NOCACHE" })

/*
 * __print_flags() requires that all enum values be wrapped in the
 * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
 * ring buffer.
 */
TRACE_DEFINE_ENUM(BH_New);
TRACE_DEFINE_ENUM(BH_Mapped);
TRACE_DEFINE_ENUM(BH_Unwritten);
TRACE_DEFINE_ENUM(BH_Boundary);

/*
 * Format a mask of EXT4_MAP_* flags as single letters (N, M, U, B),
 * passing "" to __print_flags() as the delimiter.
 */
#define show_mflags(flags) __print_flags(flags, "",        \
        { EXT4_MAP_NEW,                "N" },                        \
        { EXT4_MAP_MAPPED,        "M" },                        \
        { EXT4_MAP_UNWRITTEN,        "U" },                        \
        { EXT4_MAP_BOUNDARY,        "B" })

/*
 * Format a mask of EXT4_FREE_BLOCKS_* flags as short names, passing "|"
 * to __print_flags() as the delimiter.
 */
#define show_free_flags(flags) __print_flags(flags, "|",        \
        { EXT4_FREE_BLOCKS_METADATA,                "METADATA" },        \
        { EXT4_FREE_BLOCKS_FORGET,                "FORGET" },        \
        { EXT4_FREE_BLOCKS_VALIDATED,                "VALIDATED" },        \
        { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE,        "NO_QUOTA" },        \
        { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\
        { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER,        "LAST_CLUSTER" })

/* Export the ES_*_B enum values through TRACE_DEFINE_ENUM. */
TRACE_DEFINE_ENUM(ES_WRITTEN_B);
TRACE_DEFINE_ENUM(ES_UNWRITTEN_B);
TRACE_DEFINE_ENUM(ES_DELAYED_B);
TRACE_DEFINE_ENUM(ES_HOLE_B);
TRACE_DEFINE_ENUM(ES_REFERENCED_B);

/*
 * Format a mask of EXTENT_STATUS_* flags as single letters (W, U, D, H,
 * R), passing "" to __print_flags() as the delimiter.
 */
#define show_extent_status(status) __print_flags(status, "",        \
        { EXTENT_STATUS_WRITTEN,        "W" },                        \
        { EXTENT_STATUS_UNWRITTEN,        "U" },                        \
        { EXTENT_STATUS_DELAYED,        "D" },                        \
        { EXTENT_STATUS_HOLE,                "H" },                        \
        { EXTENT_STATUS_REFERENCED,        "R" })

/*
 * Format a mask of FALLOC_FL_* mode flags as names, passing "|" to
 * __print_flags() as the delimiter.
 */
#define show_falloc_mode(mode) __print_flags(mode, "|",                \
        { FALLOC_FL_KEEP_SIZE,                "KEEP_SIZE"},                \
        { FALLOC_FL_PUNCH_HOLE,                "PUNCH_HOLE"},                \
        { FALLOC_FL_NO_HIDE_STALE,        "NO_HIDE_STALE"},        \
        { FALLOC_FL_COLLAPSE_RANGE,        "COLLAPSE_RANGE"},        \
        { FALLOC_FL_ZERO_RANGE,                "ZERO_RANGE"})

TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);

/*
 * show_fc_reason - map an EXT4_FC_REASON_* value to its printable name.
 *
 * Expands to __print_symbolic(), i.e. @reason is matched as a single value
 * against the table rather than being decoded as a bit mask.
 * EXT4_FC_REASON_MAX has no entry in the table.
 */
#define show_fc_reason(reason)                                                \
        __print_symbolic(reason,                                        \
                { EXT4_FC_REASON_XATTR,                "XATTR"},                \
                { EXT4_FC_REASON_CROSS_RENAME,        "CROSS_RENAME"},        \
                { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \
                { EXT4_FC_REASON_NOMEM,        "NO_MEM"},                        \
                { EXT4_FC_REASON_SWAP_BOOT,        "SWAP_BOOT"},                \
                { EXT4_FC_REASON_RESIZE,        "RESIZE"},                \
                { EXT4_FC_REASON_RENAME_DIR,        "RENAME_DIR"},                \
                { EXT4_FC_REASON_FALLOC_RANGE,        "FALLOC_RANGE"},        \
                { EXT4_FC_REASON_INODE_JOURNAL_DATA,        "INODE_JOURNAL_DATA"}, \
                { EXT4_FC_REASON_ENCRYPTED_FILENAME,        "ENCRYPTED_FILENAME"})

/*
 * ext4_other_inode_update_time - record an inode together with a second,
 * caller-supplied inode number.
 *
 * @inode:    inode whose device, number, owner uid/gid and mode are captured
 * @orig_ino: extra inode number stored verbatim in the record
 *
 * NOTE(review): how @orig_ino relates to @inode is decided by the call
 * site, which is not visible here.
 */
TRACE_EVENT(ext4_other_inode_update_time,

        TP_PROTO(struct inode *inode, ino_t orig_ino),

        TP_ARGS(inode, orig_ino),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ino_t, orig_ino)
                __field(uid_t, uid)
                __field(gid_t, gid)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                /* Filled in record-field order. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->orig_ino = orig_ino;
                __entry->uid = i_uid_read(inode);
                __entry->gid = i_gid_read(inode);
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->orig_ino,
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->uid,
                  __entry->gid)
);

/*
 * ext4_free_inode - record identity, ownership and size of an inode.
 *
 * @inode: inode whose device, number, owner uid/gid, i_blocks and mode
 *         are captured
 */
TRACE_EVENT(ext4_free_inode,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(uid_t, uid)
                __field(gid_t, gid)
                __field(__u64, blocks)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                /* Identity. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;

                /* Ownership. */
                __entry->uid = i_uid_read(inode);
                __entry->gid = i_gid_read(inode);

                /* Size and type bits. */
                __entry->blocks = inode->i_blocks;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->uid,
                  __entry->gid,
                  __entry->blocks)
);

/*
 * ext4_request_inode - record a directory and a requested mode.
 *
 * @dir:  directory inode; its device and inode number are captured
 * @mode: mode value; stored in a 16-bit field, so only the low 16 bits
 *        of the int are kept
 */
TRACE_EVENT(ext4_request_inode,

        TP_PROTO(struct inode *dir, int mode),

        TP_ARGS(dir, mode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, dir)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = dir->i_sb->s_dev;
                __entry->dir = dir->i_ino;
                __entry->mode = mode;
        ),

        TP_printk("dev %d,%d dir %lu mode 0%o",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->dir,
                  __entry->mode)
);

/*
 * ext4_allocate_inode - record an inode, a directory and a mode.
 *
 * @inode: inode supplying the device and the "ino" field
 * @dir:   directory inode supplying the "dir" field
 * @mode:  mode value; stored in a 16-bit field
 */
TRACE_EVENT(ext4_allocate_inode,

        TP_PROTO(struct inode *inode, struct inode *dir, int mode),

        TP_ARGS(inode, dir, mode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ino_t, dir)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                /* The device comes from @inode, not from @dir. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->dir = dir->i_ino;
                __entry->mode = mode;
        ),

        TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->dir,
                  __entry->mode)
);

/*
 * ext4_evict_inode - record an inode and its link count.
 *
 * @inode: inode whose device, number and i_nlink are captured
 */
TRACE_EVENT(ext4_evict_inode,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, nlink)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->nlink = inode->i_nlink;
        ),

        TP_printk("dev %d,%d ino %lu nlink %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->nlink)
);

/*
 * ext4_drop_inode - record an inode and a caller-supplied drop value.
 *
 * @inode: inode whose device and number are captured
 * @drop:  integer stored verbatim in the record
 */
TRACE_EVENT(ext4_drop_inode,

        TP_PROTO(struct inode *inode, int drop),

        TP_ARGS(inode, drop),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, drop)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->drop = drop;
        ),

        TP_printk("dev %d,%d ino %lu drop %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->drop)
);

/*
 * ext4_nfs_commit_metadata - record which inode the event refers to.
 *
 * @inode: inode whose device and number are captured
 */
TRACE_EVENT(ext4_nfs_commit_metadata,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * ext4_mark_inode_dirty - record an inode and a code address.
 *
 * @inode: inode whose device and number are captured
 * @IP:    address stored as "ip" and printed with %pS under the label
 *         "caller"
 */
TRACE_EVENT(ext4_mark_inode_dirty,

        TP_PROTO(struct inode *inode, unsigned long IP),

        TP_ARGS(inode, IP),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned long, ip)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->ip = IP;
        ),

        TP_printk("dev %d,%d ino %lu caller %pS",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (void *)__entry->ip)
);

/*
 * ext4_begin_ordered_truncate - record an inode and a new size.
 *
 * @inode:    inode whose device and number are captured
 * @new_size: size value stored verbatim and printed as a signed 64-bit
 *            number
 */
TRACE_EVENT(ext4_begin_ordered_truncate,

        TP_PROTO(struct inode *inode, loff_t new_size),

        TP_ARGS(inode, new_size),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, new_size)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->new_size = new_size;
        ),

        TP_printk("dev %d,%d ino %lu new_size %lld",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->new_size)
);

/*
 * ext4__write_begin - event class shared by the write-begin tracepoints
 * (ext4_write_begin and ext4_da_write_begin are defined from it).
 *
 * @inode: inode whose device and number are captured
 * @pos:   file position, stored verbatim
 * @len:   length, stored verbatim
 * @flags: flags value, stored and printed as a plain unsigned number
 */
DECLARE_EVENT_CLASS(ext4__write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int flags),

        TP_ARGS(inode, pos, len, flags),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, pos)
                __field(unsigned int, len)
                __field(unsigned int, flags)
        ),

        TP_fast_assign(
                /* Target inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;

                /* Request parameters, copied as given. */
                __entry->pos = pos;
                __entry->len = len;
                __entry->flags = flags;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos,
                  __entry->len,
                  __entry->flags)
);

/*
 * ext4_write_begin: tracepoint defined from the ext4__write_begin class;
 * it takes the same arguments and reuses that class's record and format.
 */
DEFINE_EVENT(ext4__write_begin, ext4_write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int flags),

        TP_ARGS(inode, pos, len, flags)
);

/*
 * ext4_da_write_begin: tracepoint defined from the ext4__write_begin class;
 * it takes the same arguments and reuses that class's record and format.
 */
DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int flags),

        TP_ARGS(inode, pos, len, flags)
);

/*
 * ext4__write_end - event class shared by the write-end tracepoints
 * (ext4_write_end, ext4_journalled_write_end and ext4_da_write_end are
 * defined from it).
 *
 * @inode:  inode whose device and number are captured
 * @pos:    file position, stored verbatim
 * @len:    length, stored verbatim
 * @copied: copied count, stored verbatim
 */
DECLARE_EVENT_CLASS(ext4__write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, pos)
                __field(unsigned int, len)
                __field(unsigned int, copied)
        ),

        TP_fast_assign(
                /* Target inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;

                /* Request parameters, copied as given. */
                __entry->pos = pos;
                __entry->len = len;
                __entry->copied = copied;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos,
                  __entry->len,
                  __entry->copied)
);

/*
 * ext4_write_end: tracepoint defined from the ext4__write_end class;
 * it takes the same arguments and reuses that class's record and format.
 */
DEFINE_EVENT(ext4__write_end, ext4_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/*
 * ext4_journalled_write_end: tracepoint defined from the ext4__write_end
 * class; it takes the same arguments and reuses that class's record and
 * format.
 */
DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/*
 * ext4_da_write_end: tracepoint defined from the ext4__write_end class;
 * it takes the same arguments and reuses that class's record and format.
 */
DEFINE_EVENT(ext4__write_end, ext4_da_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/*
 * ext4_writepages - record an inode and the writeback parameters passed
 * with it.
 *
 * @inode: inode supplying device, inode number and the mapping's
 *         writeback_index
 * @wbc:   writeback control block supplying nr_to_write, pages_skipped,
 *         range_start/range_end, sync_mode, for_kupdate and range_cyclic
 */
TRACE_EVENT(ext4_writepages,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(long, nr_to_write)
                __field(long, pages_skipped)
                __field(loff_t, range_start)
                __field(loff_t, range_end)
                __field(pgoff_t, writeback_index)
                __field(int, sync_mode)
                __field(char, for_kupdate)
                __field(char, range_cyclic)
        ),

        TP_fast_assign(
                /* Values read from the inode and its mapping. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->writeback_index = inode->i_mapping->writeback_index;

                /* Values read from the writeback control block. */
                __entry->nr_to_write = wbc->nr_to_write;
                __entry->pages_skipped = wbc->pages_skipped;
                __entry->range_start = wbc->range_start;
                __entry->range_end = wbc->range_end;
                __entry->sync_mode = wbc->sync_mode;
                __entry->for_kupdate = wbc->for_kupdate;
                __entry->range_cyclic = wbc->range_cyclic;
        ),

        TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
                  "range_start %lld range_end %lld sync_mode %d "
                  "for_kupdate %d range_cyclic %d writeback_index %lu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->nr_to_write,
                  __entry->pages_skipped,
                  __entry->range_start,
                  __entry->range_end,
                  __entry->sync_mode,
                  __entry->for_kupdate,
                  __entry->range_cyclic,
                  (unsigned long) __entry->writeback_index)
);

/*
 * ext4_da_write_pages - record an inode, a first page index and two
 * writeback parameters.
 *
 * @inode:      inode whose device and number are captured
 * @first_page: page index stored verbatim
 * @wbc:        writeback control block supplying nr_to_write and sync_mode
 */
TRACE_EVENT(ext4_da_write_pages,

        TP_PROTO(struct inode *inode, pgoff_t first_page,
                 struct writeback_control *wbc),

        TP_ARGS(inode, first_page, wbc),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(pgoff_t, first_page)
                __field(long, nr_to_write)
                __field(int, sync_mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->first_page = first_page;

                /* Values read from the writeback control block. */
                __entry->nr_to_write = wbc->nr_to_write;
                __entry->sync_mode = wbc->sync_mode;
        ),

        TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
                  "sync_mode %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->first_page,
                  __entry->nr_to_write,
                  __entry->sync_mode)
);

/*
 * ext4_da_write_pages_extent - record an inode and a block mapping.
 *
 * @inode: inode whose device and number are captured
 * @map:   mapping supplying m_lblk, m_len and m_flags; the flags are
 *         decoded with show_mflags() when the event is printed
 */
TRACE_EVENT(ext4_da_write_pages_extent,

        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),

        TP_ARGS(inode, map),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, lblk)
                __field(__u32, len)
                __field(__u32, flags)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;

                /* Values read from the mapping descriptor. */
                __entry->lblk = map->m_lblk;
                __entry->len = map->m_len;
                __entry->flags = map->m_flags;
        ),

        TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->len,
                  show_mflags(__entry->flags))
);

/*
 * ext4_writepages_result - record the outcome values of a writepages call.
 *
 * @inode:         inode supplying device, inode number and the mapping's
 *                 writeback_index
 * @wbc:           writeback control block supplying pages_skipped and
 *                 sync_mode
 * @ret:           result code, stored verbatim
 * @pages_written: page count, stored verbatim
 */
TRACE_EVENT(ext4_writepages_result,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                 int ret, int pages_written),

        TP_ARGS(inode, wbc, ret, pages_written),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, ret)
                __field(int, pages_written)
                __field(long, pages_skipped)
                __field(pgoff_t, writeback_index)
                __field(int, sync_mode)
        ),

        TP_fast_assign(
                /* Values read from the inode and its mapping. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->writeback_index = inode->i_mapping->writeback_index;

                /* Caller-supplied results. */
                __entry->ret = ret;
                __entry->pages_written = pages_written;

                /* Values read from the writeback control block. */
                __entry->pages_skipped = wbc->pages_skipped;
                __entry->sync_mode = wbc->sync_mode;
        ),

        TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
                  "sync_mode %d writeback_index %lu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret,
                  __entry->pages_written,
                  __entry->pages_skipped,
                  __entry->sync_mode,
                  (unsigned long) __entry->writeback_index)
);

/*
 * ext4__page_op - event class for single-page operations (ext4_writepage,
 * ext4_readpage and ext4_releasepage are defined from it).
 *
 * @page: page whose index is captured; the device and inode number are
 *        taken from the inode reached through page->mapping->host, so the
 *        page's mapping is dereferenced unconditionally
 */
DECLARE_EVENT_CLASS(ext4__page_op,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(pgoff_t, index)
        ),

        TP_fast_assign(
                struct inode *host = page->mapping->host;

                __entry->dev = host->i_sb->s_dev;
                __entry->ino = host->i_ino;
                __entry->index = page->index;
        ),

        TP_printk("dev %d,%d ino %lu page_index %lu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index)
);

/*
 * ext4_writepage: tracepoint defined from the ext4__page_op class; it takes
 * the same argument and reuses that class's record and format.
 */
DEFINE_EVENT(ext4__page_op, ext4_writepage,

        TP_PROTO(struct page *page),

        TP_ARGS(page)
);

/*
 * ext4_readpage: tracepoint defined from the ext4__page_op class; it takes
 * the same argument and reuses that class's record and format.
 */
DEFINE_EVENT(ext4__page_op, ext4_readpage,

        TP_PROTO(struct page *page),

        TP_ARGS(page)
);

/*
 * ext4_releasepage: tracepoint defined from the ext4__page_op class; it
 * takes the same argument and reuses that class's record and format.
 */
DEFINE_EVENT(ext4__page_op, ext4_releasepage,

        TP_PROTO(struct page *page),

        TP_ARGS(page)
);

/*
 * ext4_invalidatepage_op - event class for page invalidation
 * (ext4_invalidatepage and ext4_journalled_invalidatepage are defined
 * from it).
 *
 * @page:   page whose index is captured; the device and inode number are
 *          taken from the inode reached through page->mapping->host
 * @offset: offset value, stored verbatim
 * @length: length value, stored verbatim
 */
DECLARE_EVENT_CLASS(ext4_invalidatepage_op,

        TP_PROTO(struct page *page, unsigned int offset, unsigned int length),

        TP_ARGS(page, offset, length),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(pgoff_t, index)
                __field(unsigned int, offset)
                __field(unsigned int, length)
        ),

        TP_fast_assign(
                struct inode *host = page->mapping->host;

                __entry->dev = host->i_sb->s_dev;
                __entry->ino = host->i_ino;
                __entry->index = page->index;
                __entry->offset = offset;
                __entry->length = length;
        ),

        TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index,
                  __entry->offset,
                  __entry->length)
);

/*
 * ext4_invalidatepage: tracepoint defined from the ext4_invalidatepage_op
 * class; it takes the same arguments and reuses that class's record and
 * format.
 */
DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage,
        TP_PROTO(struct page *page, unsigned int offset, unsigned int length),

        TP_ARGS(page, offset, length)
);

/*
 * ext4_journalled_invalidatepage: tracepoint defined from the
 * ext4_invalidatepage_op class; it takes the same arguments and reuses
 * that class's record and format.
 */
DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage,
        TP_PROTO(struct page *page, unsigned int offset, unsigned int length),

        TP_ARGS(page, offset, length)
);

/*
 * ext4_discard_blocks - record a block range on a filesystem.
 *
 * @sb:    superblock supplying the device number
 * @blk:   starting block value, stored verbatim
 * @count: block count, stored verbatim
 */
TRACE_EVENT(ext4_discard_blocks,

        TP_PROTO(struct super_block *sb, unsigned long long blk,
                 unsigned long long count),

        TP_ARGS(sb, blk, count),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u64, blk)
                __field(__u64, count)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->blk = blk;
                __entry->count = count;
        ),

        TP_printk("dev %d,%d blk %llu count %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->blk,
                  __entry->count)
);

/*
 * ext4__mb_new_pa - event class for new preallocation-space events
 * (ext4_mb_new_inode_pa and ext4_mb_new_group_pa are defined from it).
 *
 * @ac: allocation context supplying the device (ac_sb) and the inode
 *      number (ac_inode); ac_inode is dereferenced unconditionally
 * @pa: preallocation descriptor supplying pa_pstart, pa_lstart and pa_len
 *
 * Note that the printed order is pstart, len, lstart, which differs from
 * the order of the fields in the record.
 */
DECLARE_EVENT_CLASS(ext4__mb_new_pa,

        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, pa_pstart)
                __field(__u64, pa_lstart)
                __field(__u32, pa_len)
        ),

        TP_fast_assign(
                /* Values read from the allocation context. */
                __entry->dev = ac->ac_sb->s_dev;
                __entry->ino = ac->ac_inode->i_ino;

                /* Values read from the preallocation descriptor. */
                __entry->pa_pstart = pa->pa_pstart;
                __entry->pa_lstart = pa->pa_lstart;
                __entry->pa_len = pa->pa_len;
        ),

        TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pa_pstart,
                  __entry->pa_len,
                  __entry->pa_lstart)
);

/*
 * ext4_mb_new_inode_pa: tracepoint defined from the ext4__mb_new_pa class;
 * it takes the same arguments and reuses that class's record and format.
 */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,

        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa)
);

/*
 * ext4_mb_new_group_pa: tracepoint defined from the ext4__mb_new_pa class;
 * it takes the same arguments and reuses that class's record and format.
 */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,

        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa)
);

/*
 * ext4_mb_release_inode_pa - record a block range together with the inode
 * that a preallocation descriptor points at.
 *
 * @pa:    preallocation descriptor; device and inode number come from
 *         pa->pa_inode, which is dereferenced unconditionally
 * @block: block value, stored verbatim
 * @count: block count, stored verbatim
 */
TRACE_EVENT(ext4_mb_release_inode_pa,

        TP_PROTO(struct ext4_prealloc_space *pa,
                 unsigned long long block, unsigned int count),

        TP_ARGS(pa, block, count),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(__u32, count)
        ),

        TP_fast_assign(
                struct inode *inode = pa->pa_inode;

                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->block = block;
                __entry->count = count;
        ),

        TP_printk("dev %d,%d ino %lu block %llu count %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->block,
                  __entry->count)
);

/*
 * ext4_mb_release_group_pa - record the physical start and length of a
 * preallocation descriptor.
 *
 * @sb: superblock supplying the device number
 * @pa: preallocation descriptor supplying pa_pstart and pa_len
 */
TRACE_EVENT(ext4_mb_release_group_pa,

        TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa),

        TP_ARGS(sb, pa),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u64, pa_pstart)
                __field(__u32, pa_len)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->pa_pstart = pa->pa_pstart;
                __entry->pa_len = pa->pa_len;
        ),

        TP_printk("dev %d,%d pstart %llu len %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->pa_pstart,
                  __entry->pa_len)
);

/*
 * ext4_discard_preallocations - record an inode with two counters.
 *
 * @inode:  inode whose device and number are captured
 * @len:    value stored verbatim and printed as "len: %u"
 * @needed: value stored verbatim
 *
 * NOTE(review): this is the only format in this group that puts a colon
 * after a field label ("len:"); left unchanged so that existing parsers of
 * the trace output keep working.
 */
TRACE_EVENT(ext4_discard_preallocations,

        TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),

        TP_ARGS(inode, len, needed),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned int, len)
                __field(unsigned int, needed)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->len = len;
                __entry->needed = needed;
        ),

        TP_printk("dev %d,%d ino %lu len: %u needed %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->len,
                  __entry->needed)
);

/*
 * ext4_mb_discard_preallocations - record a filesystem and a "needed"
 * count.
 *
 * @sb:     superblock supplying the device number
 * @needed: value stored verbatim
 */
TRACE_EVENT(ext4_mb_discard_preallocations,

        TP_PROTO(struct super_block *sb, int needed),

        TP_ARGS(sb, needed),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, needed)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->needed = needed;
        ),

        TP_printk("dev %d,%d needed %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->needed)
);

/*
 * ext4_request_blocks - record the contents of a block allocation request.
 *
 * @ar: allocation request; the device and inode number come from
 *      ar->inode, and len, logical, goal, lleft, lright, pleft, pright and
 *      flags are copied from the request itself
 *
 * The flags are decoded with show_mballoc_flags() when the event is
 * printed. The format string carries no trailing blank, matching
 * ext4_allocate_blocks.
 */
TRACE_EVENT(ext4_request_blocks,
        TP_PROTO(struct ext4_allocation_request *ar),

        TP_ARGS(ar),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        unsigned int, len                )
                __field(        __u32,  logical                        )
                __field(        __u32,        lleft                        )
                __field(        __u32,        lright                        )
                __field(        __u64,        goal                        )
                __field(        __u64,        pleft                        )
                __field(        __u64,        pright                        )
                __field(        unsigned int, flags                )
        ),

        TP_fast_assign(
                __entry->dev        = ar->inode->i_sb->s_dev;
                __entry->ino        = ar->inode->i_ino;
                __entry->len        = ar->len;
                __entry->logical = ar->logical;
                __entry->goal        = ar->goal;
                __entry->lleft        = ar->lleft;
                __entry->lright        = ar->lright;
                __entry->pleft        = ar->pleft;
                __entry->pright        = ar->pright;
                __entry->flags        = ar->flags;
        ),

        /* No trailing space after the last conversion. */
        TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu "
                  "lleft %u lright %u pleft %llu pright %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags),
                  __entry->len, __entry->logical, __entry->goal,
                  __entry->lleft, __entry->lright, __entry->pleft,
                  __entry->pright)
);

/*
 * ext4_allocate_blocks - record an allocation request together with a
 * block number.
 *
 * @ar:    allocation request; the device and inode number come from
 *         ar->inode, and len, logical, goal, lleft, lright, pleft, pright
 *         and flags are copied from the request itself
 * @block: block value, stored verbatim
 *
 * The flags are decoded with show_mballoc_flags() when the event is
 * printed.
 */
TRACE_EVENT(ext4_allocate_blocks,

        TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block),

        TP_ARGS(ar, block),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(unsigned int, len)
                __field(__u32, logical)
                __field(__u32, lleft)
                __field(__u32, lright)
                __field(__u64, goal)
                __field(__u64, pleft)
                __field(__u64, pright)
                __field(unsigned int, flags)
        ),

        TP_fast_assign(
                struct inode *inode = ar->inode;

                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->block = block;

                /* Remaining values are copied in record-field order. */
                __entry->len = ar->len;
                __entry->logical = ar->logical;
                __entry->lleft = ar->lleft;
                __entry->lright = ar->lright;
                __entry->goal = ar->goal;
                __entry->pleft = ar->pleft;
                __entry->pright = ar->pright;
                __entry->flags = ar->flags;
        ),

        TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u "
                  "goal %llu lleft %u lright %u pleft %llu pright %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  show_mballoc_flags(__entry->flags),
                  __entry->len,
                  __entry->block,
                  __entry->logical,
                  __entry->goal,
                  __entry->lleft,
                  __entry->lright,
                  __entry->pleft,
                  __entry->pright)
);

/*
 * ext4_free_blocks - record a block range and flags for an inode.
 *
 * @inode: inode whose device, number and mode are captured
 * @block: block value, stored verbatim
 * @count: block count, stored verbatim
 * @flags: flag mask, decoded with show_free_flags() when the event is
 *         printed
 */
TRACE_EVENT(ext4_free_blocks,

        TP_PROTO(struct inode *inode, __u64 block, unsigned long count,
                 int flags),

        TP_ARGS(inode, block, count, flags),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(unsigned long, count)
                __field(int, flags)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                /* Values read from the inode. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->mode = inode->i_mode;

                /* Caller-supplied values. */
                __entry->block = block;
                __entry->count = count;
                __entry->flags = flags;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->block,
                  __entry->count,
                  show_free_flags(__entry->flags))
);

/*
 * ext4_sync_file_enter - record the file and parent directory of a sync
 * request.
 *
 * @file:     file being referenced; its dentry supplies the device, the
 *            file's inode number and the inode number of the dentry's
 *            parent
 * @datasync: integer stored verbatim
 *
 * The format string carries no trailing blank, matching
 * ext4_sync_file_exit.
 */
TRACE_EVENT(ext4_sync_file_enter,
        TP_PROTO(struct file *file, int datasync),

        TP_ARGS(file, datasync),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        parent                        )
                __field(        int,        datasync                )
        ),

        TP_fast_assign(
                struct dentry *dentry = file->f_path.dentry;

                __entry->dev                = dentry->d_sb->s_dev;
                __entry->ino                = d_inode(dentry)->i_ino;
                __entry->datasync        = datasync;
                /* d_parent is read directly, without any extra locking here. */
                __entry->parent                = d_inode(dentry->d_parent)->i_ino;
        ),

        /* No trailing space after the last conversion. */
        TP_printk("dev %d,%d ino %lu parent %lu datasync %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->parent, __entry->datasync)
);

/*
 * ext4_sync_file_exit - tracepoint recording an inode and a result code.
 *
 * Stores the inode's device and inode number plus the caller-supplied
 * 'ret' value, printed as a signed decimal.
 */
TRACE_EVENT(ext4_sync_file_exit,
        TP_PROTO(struct inode *inode, int ret),

        TP_ARGS(inode, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        ret                        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->ret                = ret;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
);

/*
 * ext4_sync_fs - tracepoint recording a superblock's device and a wait flag.
 *
 * Only sb->s_dev and the caller-supplied 'wait' value are captured.
 */
TRACE_EVENT(ext4_sync_fs,
        TP_PROTO(struct super_block *sb, int wait),

        TP_ARGS(sb, wait),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        wait                        )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->wait        = wait;
        ),

        TP_printk("dev %d,%d wait %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->wait)
);

/*
 * ext4_alloc_da_blocks - tracepoint snapshotting an inode's reserved data
 * block count.
 *
 * Records the inode's device and inode number and the current value of
 * EXT4_I(inode)->i_reserved_data_blocks, printed as
 * "reserved_data_blocks".
 */
TRACE_EVENT(ext4_alloc_da_blocks,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field( unsigned int,        data_blocks                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
        ),

        TP_printk("dev %d,%d ino %lu reserved_data_blocks %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->data_blocks)
);

/*
 * ext4_mballoc_alloc - tracepoint dumping an ext4_allocation_context.
 *
 * Three extents are copied out of the context: ac_o_ex ("orig"),
 * ac_g_ex ("goal") and ac_f_ex ("result").  Each is printed as
 * group/start/len@logical.  The remaining counters are printed as:
 *   blks   <- ac_found
 *   grps   <- ac_groups_scanned
 *   cr     <- ac_criteria
 *   flags  <- ac_flags, rendered via show_mballoc_flags()
 *   tail   <- ac_tail
 *   broken <- (1 << ac_buddy) when ac_buddy is non-zero, otherwise 0
 *
 * NOTE(review): the *_len fields are declared int but printed with %u.
 * NOTE(review): found/groups/buddy/flags/tail are stored in __u16 and cr
 * in __u8; the widths of the source members are not visible here, so any
 * wider value would be truncated - confirm against the struct definition.
 */
TRACE_EVENT(ext4_mballoc_alloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u32,         orig_logical                )
                __field(          int,        orig_start                )
                __field(        __u32,         orig_group                )
                __field(          int,        orig_len                )
                __field(        __u32,         goal_logical                )
                __field(          int,        goal_start                )
                __field(        __u32,         goal_group                )
                __field(          int,        goal_len                )
                __field(        __u32,         result_logical                )
                __field(          int,        result_start                )
                __field(        __u32,         result_group                )
                __field(          int,        result_len                )
                __field(        __u16,        found                        )
                __field(        __u16,        groups                        )
                __field(        __u16,        buddy                        )
                __field(        __u16,        flags                        )
                __field(        __u16,        tail                        )
                __field(        __u8,        cr                        )
        ),

        TP_fast_assign(
                __entry->dev                = ac->ac_inode->i_sb->s_dev;
                __entry->ino                = ac->ac_inode->i_ino;
                __entry->orig_logical        = ac->ac_o_ex.fe_logical;
                __entry->orig_start        = ac->ac_o_ex.fe_start;
                __entry->orig_group        = ac->ac_o_ex.fe_group;
                __entry->orig_len        = ac->ac_o_ex.fe_len;
                __entry->goal_logical        = ac->ac_g_ex.fe_logical;
                __entry->goal_start        = ac->ac_g_ex.fe_start;
                __entry->goal_group        = ac->ac_g_ex.fe_group;
                __entry->goal_len        = ac->ac_g_ex.fe_len;
                __entry->result_logical        = ac->ac_f_ex.fe_logical;
                __entry->result_start        = ac->ac_f_ex.fe_start;
                __entry->result_group        = ac->ac_f_ex.fe_group;
                __entry->result_len        = ac->ac_f_ex.fe_len;
                __entry->found                = ac->ac_found;
                __entry->flags                = ac->ac_flags;
                __entry->groups                = ac->ac_groups_scanned;
                __entry->buddy                = ac->ac_buddy;
                __entry->tail                = ac->ac_tail;
                __entry->cr                = ac->ac_criteria;
        ),

        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
                  "result %u/%d/%u@%u blks %u grps %u cr %u flags %s "
                  "tail %u broken %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start,
                  __entry->orig_len, __entry->orig_logical,
                  __entry->goal_group, __entry->goal_start,
                  __entry->goal_len, __entry->goal_logical,
                  __entry->result_group, __entry->result_start,
                  __entry->result_len, __entry->result_logical,
                  __entry->found, __entry->groups, __entry->cr,
                  show_mballoc_flags(__entry->flags), __entry->tail,
                  __entry->buddy ? 1 << __entry->buddy : 0)
);

/*
 * ext4_mballoc_prealloc - tracepoint recording two extents of an
 * ext4_allocation_context.
 *
 * "orig" is copied from ac_o_ex and "result" from ac_b_ex (not ac_f_ex,
 * which ext4_mballoc_alloc uses for its result).  Both are printed as
 * group/start/len@logical.
 *
 * NOTE(review): the *_len fields are declared int but printed with %u.
 */
TRACE_EVENT(ext4_mballoc_prealloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u32,         orig_logical                )
                __field(          int,        orig_start                )
                __field(        __u32,         orig_group                )
                __field(          int,        orig_len                )
                __field(        __u32,         result_logical                )
                __field(          int,        result_start                )
                __field(        __u32,         result_group                )
                __field(          int,        result_len                )
        ),

        TP_fast_assign(
                __entry->dev                = ac->ac_inode->i_sb->s_dev;
                __entry->ino                = ac->ac_inode->i_ino;
                __entry->orig_logical        = ac->ac_o_ex.fe_logical;
                __entry->orig_start        = ac->ac_o_ex.fe_start;
                __entry->orig_group        = ac->ac_o_ex.fe_group;
                __entry->orig_len        = ac->ac_o_ex.fe_len;
                __entry->result_logical        = ac->ac_b_ex.fe_logical;
                __entry->result_start        = ac->ac_b_ex.fe_start;
                __entry->result_group        = ac->ac_b_ex.fe_group;
                __entry->result_len        = ac->ac_b_ex.fe_len;
        ),

        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start,
                  __entry->orig_len, __entry->orig_logical,
                  __entry->result_group, __entry->result_start,
                  __entry->result_len, __entry->result_logical)
);

/*
 * ext4__mballoc - event class describing one extent as group/start/len.
 *
 * The device comes from 'sb' rather than from the inode, so 'inode' may
 * be NULL; in that case the inode number is recorded as 0.  The printed
 * line ends with a trailing space (it is part of the format string).
 */
DECLARE_EVENT_CLASS(ext4__mballoc,
        TP_PROTO(struct super_block *sb,
                 struct inode *inode,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(          int,        result_start                )
                __field(        __u32,         result_group                )
                __field(          int,        result_len                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                /* A NULL inode is tolerated and reported as inode 0. */
                __entry->ino                = inode ? inode->i_ino : 0;
                __entry->result_start        = start;
                __entry->result_group        = group;
                __entry->result_len        = len;
        ),

        TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->result_group, __entry->result_start,
                  __entry->result_len)
);

/*
 * ext4_mballoc_discard - event instance of the ext4__mballoc class; it
 * takes the same arguments and uses the class's record layout and format.
 */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,

        TP_PROTO(struct super_block *sb,
                 struct inode *inode,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len)
);

/*
 * ext4_mballoc_free - event instance of the ext4__mballoc class; it takes
 * the same arguments and uses the class's record layout and format.
 */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,

        TP_PROTO(struct super_block *sb,
                 struct inode *inode,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len)
);

/*
 * ext4_forget - tracepoint recording a block number for an inode together
 * with an is_metadata flag.
 *
 * Captures the inode's device, inode number and i_mode (printed in octal)
 * plus the caller-supplied 'is_metadata' and 'block' values.
 */
TRACE_EVENT(ext4_forget,
        TP_PROTO(struct inode *inode, int is_metadata, __u64 block),

        TP_ARGS(inode, is_metadata, block),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        block                        )
                __field(        int,        is_metadata                )
                __field(        __u16,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->block        = block;
                __entry->is_metadata = is_metadata;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->is_metadata, __entry->block)
);

/*
 * ext4_da_update_reserve_space - tracepoint recording reservation state
 * for an inode along with used_blocks and quota_claim.
 *
 * i_blocks is read from the inode and reserved_data_blocks from
 * EXT4_I(inode)->i_reserved_data_blocks at the time the event fires;
 * 'used_blocks' and 'quota_claim' are recorded exactly as passed in.
 */
TRACE_EVENT(ext4_da_update_reserve_space,
        TP_PROTO(struct inode *inode, int used_blocks, int quota_claim),

        TP_ARGS(inode, used_blocks, quota_claim),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        i_blocks                )
                __field(        int,        used_blocks                )
                __field(        int,        reserved_data_blocks        )
                __field(        int,        quota_claim                )
                __field(        __u16,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->used_blocks = used_blocks;
                __entry->reserved_data_blocks =
                                EXT4_I(inode)->i_reserved_data_blocks;
                __entry->quota_claim = quota_claim;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
                  "reserved_data_blocks %d quota_claim %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->i_blocks,
                  __entry->used_blocks, __entry->reserved_data_blocks,
                  __entry->quota_claim)
);

/*
 * ext4_da_reserve_space - tracepoint snapshotting an inode's block
 * accounting.
 *
 * Takes only the inode; records its device, inode number, i_mode,
 * i_blocks and the current EXT4_I(inode)->i_reserved_data_blocks.
 */
TRACE_EVENT(ext4_da_reserve_space,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        i_blocks                )
                __field(        int,        reserved_data_blocks        )
                __field(        __u16,  mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->i_blocks,
                  __entry->reserved_data_blocks)
);

/*
 * ext4_da_release_space - tracepoint recording an inode's block
 * accounting together with a freed_blocks count.
 *
 * 'freed_blocks' is stored as passed in; i_blocks, i_mode and
 * EXT4_I(inode)->i_reserved_data_blocks are read from the inode when the
 * event fires.
 */
TRACE_EVENT(ext4_da_release_space,
        TP_PROTO(struct inode *inode, int freed_blocks),

        TP_ARGS(inode, freed_blocks),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        i_blocks                )
                __field(        int,        freed_blocks                )
                __field(        int,        reserved_data_blocks        )
                __field(        __u16,  mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->freed_blocks = freed_blocks;
                __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->i_blocks,
                  __entry->freed_blocks, __entry->reserved_data_blocks)
);

/*
 * ext4__bitmap_load - event class recording a device and a group number.
 *
 * 'group' arrives as unsigned long but is stored in a __u32 field, so
 * only the low 32 bits are kept in the trace record.
 */
DECLARE_EVENT_CLASS(ext4__bitmap_load,
        TP_PROTO(struct super_block *sb, unsigned long group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->group)
);

/*
 * ext4_mb_bitmap_load - event instance of the ext4__bitmap_load class
 * (device + group number).
 */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,

        TP_PROTO(struct super_block *sb, unsigned long group),

        TP_ARGS(sb, group)
);

/*
 * ext4_mb_buddy_bitmap_load - event instance of the ext4__bitmap_load
 * class (device + group number).
 */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load,

        TP_PROTO(struct super_block *sb, unsigned long group),

        TP_ARGS(sb, group)
);

/*
 * ext4_load_inode_bitmap - event instance of the ext4__bitmap_load class
 * (device + group number).
 */
DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap,

        TP_PROTO(struct super_block *sb, unsigned long group),

        TP_ARGS(sb, group)
);

/*
 * ext4_read_block_bitmap_load - standalone tracepoint recording a device,
 * a group number and a prefetch flag.
 *
 * Declared as its own TRACE_EVENT rather than an ext4__bitmap_load
 * instance because it carries the extra 'prefetch' field.  As in that
 * class, 'group' is narrowed from unsigned long to __u32.  The bool is
 * printed with %d.
 */
TRACE_EVENT(ext4_read_block_bitmap_load,
        TP_PROTO(struct super_block *sb, unsigned long group, bool prefetch),

        TP_ARGS(sb, group, prefetch),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
                __field(        bool,        prefetch                )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
                __entry->prefetch = prefetch;
        ),

        TP_printk("dev %d,%d group %u prefetch %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->group, __entry->prefetch)
);

/*
 * ext4_direct_IO_enter - tracepoint recording an inode plus an I/O
 * position, length and rw value.
 *
 * The 'offset' argument is stored and printed under the name "pos".
 * 'rw' is recorded as a plain int and printed with %d.
 */
TRACE_EVENT(ext4_direct_IO_enter,
        TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw),

        TP_ARGS(inode, offset, len, rw),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned long,        len                )
                __field(        int,        rw                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = offset;
                __entry->len        = len;
                __entry->rw        = rw;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->rw)
);

/*
 * ext4_direct_IO_exit - tracepoint recording the same fields as
 * ext4_direct_IO_enter (inode, pos, len, rw) plus a 'ret' value.
 *
 * The 'offset' argument is stored and printed under the name "pos".
 */
TRACE_EVENT(ext4_direct_IO_exit,
        TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
                 int rw, int ret),

        TP_ARGS(inode, offset, len, rw, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned long,        len                )
                __field(        int,        rw                        )
                __field(        int,        ret                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = offset;
                __entry->len        = len;
                __entry->rw        = rw;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len,
                  __entry->rw, __entry->ret)
);

/*
 * ext4__fallocate_mode - event class recording an inode, a byte range
 * (offset, len) and a mode value.
 *
 * 'mode' is stored as an int and rendered through show_falloc_mode() at
 * print time.
 */
DECLARE_EVENT_CLASS(ext4__fallocate_mode,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

        TP_ARGS(inode, offset, len, mode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        offset                        )
                __field(        loff_t, len                        )
                __field(        int,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
                __entry->mode        = mode;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len,
                  show_falloc_mode(__entry->mode))
);

/*
 * ext4_fallocate_enter - event instance of the ext4__fallocate_mode class
 * (inode, offset, len, mode).
 */
DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,

        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

        TP_ARGS(inode, offset, len, mode)
);

/*
 * ext4_punch_hole - event instance of the ext4__fallocate_mode class
 * (inode, offset, len, mode).
 */
DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,

        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

        TP_ARGS(inode, offset, len, mode)
);

/*
 * ext4_zero_range - event instance of the ext4__fallocate_mode class
 * (inode, offset, len, mode).
 */
DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,

        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

        TP_ARGS(inode, offset, len, mode)
);

/*
 * ext4_fallocate_exit - tracepoint recording an inode, a position, a
 * block count and a result code.
 *
 * Unlike the ext4__fallocate_mode events, this one reports 'max_blocks'
 * (stored as "blocks") instead of a byte length, and carries 'ret'
 * instead of a mode.  The 'offset' argument is stored as "pos".
 */
TRACE_EVENT(ext4_fallocate_exit,
        TP_PROTO(struct inode *inode, loff_t offset,
                 unsigned int max_blocks, int ret),

        TP_ARGS(inode, offset, max_blocks, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned int,        blocks                )
                __field(        int,         ret                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = offset;
                __entry->blocks        = max_blocks;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->blocks,
                  __entry->ret)
);

/*
 * ext4_unlink_enter - tracepoint recording a dentry's inode, its size and
 * the parent directory's inode number.
 *
 * The device, inode number and i_size are read through d_inode(dentry);
 * the parent inode number comes from the separate 'parent' argument, not
 * from dentry->d_parent.
 */
TRACE_EVENT(ext4_unlink_enter,
        TP_PROTO(struct inode *parent, struct dentry *dentry),

        TP_ARGS(parent, dentry),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        parent                        )
                __field(        loff_t,        size                        )
        ),

        TP_fast_assign(
                __entry->dev                = dentry->d_sb->s_dev;
                __entry->ino                = d_inode(dentry)->i_ino;
                __entry->parent                = parent->i_ino;
                __entry->size                = d_inode(dentry)->i_size;
        ),

        TP_printk("dev %d,%d ino %lu size %lld parent %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->size,
                  (unsigned long) __entry->parent)
);

/*
 * ext4_unlink_exit - tracepoint recording a dentry's device and inode
 * number together with a 'ret' value.
 *
 * The inode is obtained via d_inode(dentry) when the event fires.
 */
TRACE_EVENT(ext4_unlink_exit,
        TP_PROTO(struct dentry *dentry, int ret),

        TP_ARGS(dentry, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        ret                        )
        ),

        TP_fast_assign(
                __entry->dev                = dentry->d_sb->s_dev;
                __entry->ino                = d_inode(dentry)->i_ino;
                __entry->ret                = ret;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
);

/*
 * ext4__truncate - event class recording an inode's device, inode number
 * and current i_blocks value (printed as "blocks").
 */
DECLARE_EVENT_CLASS(ext4__truncate,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        __u64,                blocks                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->blocks        = inode->i_blocks;
        ),

        TP_printk("dev %d,%d ino %lu blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->blocks)
);

/*
 * ext4_truncate_enter - event instance of the ext4__truncate class
 * (inode + i_blocks snapshot).
 */
DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * ext4_truncate_exit - event instance of the ext4__truncate class
 * (inode + i_blocks snapshot).
 */
DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * ext4_ext_convert_to_initialized_enter - tracepoint recording a map
 * request and one extent.
 *
 * 'ux' is the unwritten extent.
 *
 * From 'map' the logical block and length are recorded as m_lblk/m_len.
 * From 'ux' the starting logical block (ee_block converted from
 * little-endian), the length returned by ext4_ext_get_actual_len() and
 * the physical block returned by ext4_ext_pblock() are recorded as
 * u_lblk/u_len/u_pblk.
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux),

        TP_ARGS(inode, map, ux),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        m_lblk        )
                __field(        unsigned,        m_len        )
                __field(        ext4_lblk_t,        u_lblk        )
                __field(        unsigned,        u_len        )
                __field(        ext4_fsblk_t,        u_pblk        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->m_lblk                = map->m_lblk;
                __entry->m_len                = map->m_len;
                __entry->u_lblk                = le32_to_cpu(ux->ee_block);
                __entry->u_len                = ext4_ext_get_actual_len(ux);
                __entry->u_pblk                = ext4_ext_pblock(ux);
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
                  "u_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->m_lblk, __entry->m_len,
                  __entry->u_lblk, __entry->u_len, __entry->u_pblk)
);

/*
 * ext4_ext_convert_to_initialized_fastpath - tracepoint recording a map
 * request and two extents.
 *
 * 'ux' is the unwritten extent.
 * 'ix' is the initialized extent to which blocks are transferred.
 *
 * For each extent the starting logical block (ee_block converted from
 * little-endian), the length from ext4_ext_get_actual_len() and the
 * physical block from ext4_ext_pblock() are recorded, prefixed u_ and i_
 * respectively.  The printed line ends with a trailing space (it is part
 * of the format string).
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux, struct ext4_extent *ix),

        TP_ARGS(inode, map, ux, ix),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        m_lblk        )
                __field(        unsigned,        m_len        )
                __field(        ext4_lblk_t,        u_lblk        )
                __field(        unsigned,        u_len        )
                __field(        ext4_fsblk_t,        u_pblk        )
                __field(        ext4_lblk_t,        i_lblk        )
                __field(        unsigned,        i_len        )
                __field(        ext4_fsblk_t,        i_pblk        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->m_lblk                = map->m_lblk;
                __entry->m_len                = map->m_len;
                __entry->u_lblk                = le32_to_cpu(ux->ee_block);
                __entry->u_len                = ext4_ext_get_actual_len(ux);
                __entry->u_pblk                = ext4_ext_pblock(ux);
                __entry->i_lblk                = le32_to_cpu(ix->ee_block);
                __entry->i_len                = ext4_ext_get_actual_len(ix);
                __entry->i_pblk                = ext4_ext_pblock(ix);
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
                  "u_lblk %u u_len %u u_pblk %llu "
                  "i_lblk %u i_len %u i_pblk %llu ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->m_lblk, __entry->m_len,
                  __entry->u_lblk, __entry->u_len, __entry->u_pblk,
                  __entry->i_lblk, __entry->i_len, __entry->i_pblk)
);

/*
 * ext4__map_blocks_enter - event class recording a block-mapping request:
 * inode, logical block, length and request flags.
 *
 * The flags are rendered through show_map_flags() at print time.
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned int len, unsigned int flags),

        TP_ARGS(inode, lblk, len, flags),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->lblk        = lblk;
                __entry->len        = len;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len, show_map_flags(__entry->flags))
);

/*
 * ext4_ext_map_blocks_enter - event instance of the
 * ext4__map_blocks_enter class.  The prototype spells the types as plain
 * 'unsigned', which is the same type as the class's 'unsigned int'.
 */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned len, unsigned flags),

        TP_ARGS(inode, lblk, len, flags)
);

/*
 * ext4_ind_map_blocks_enter - event instance of the
 * ext4__map_blocks_enter class.  The prototype spells the types as plain
 * 'unsigned', which is the same type as the class's 'unsigned int'.
 */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned len, unsigned flags),

        TP_ARGS(inode, lblk, len, flags)
);

/*
 * ext4__map_blocks_exit - event class recording the outcome of a
 * block-mapping request.
 *
 * Two distinct flag words are captured: 'flags' is the caller-supplied
 * argument (printed via show_map_flags()), while 'mflags' is copied from
 * map->m_flags (printed via show_mflags()).  The physical block, logical
 * block and length are all read from 'map'; 'ret' is stored as passed in.
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(inode, flags, map, ret),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        unsigned int,        flags                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        mflags                )
                __field(        int,                ret                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->flags        = flags;
                __entry->pblk        = map->m_pblk;
                __entry->lblk        = map->m_lblk;
                __entry->len        = map->m_len;
                __entry->mflags        = map->m_flags;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u "
                  "mflags %s ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  show_map_flags(__entry->flags), __entry->lblk, __entry->pblk,
                  __entry->len, show_mflags(__entry->mflags), __entry->ret)
);

/*
 * ext4_ext_map_blocks_exit: an event of class ext4__map_blocks_exit.
 * Arguments are forwarded unchanged to the class: inode, flags, map, ret.
 */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
        TP_PROTO(struct inode *inode,
                 unsigned int flags,
                 struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(inode, flags, map, ret)
);

/*
 * ext4_ind_map_blocks_exit: an event of class ext4__map_blocks_exit.
 * Arguments are forwarded unchanged to the class: inode, flags, map, ret.
 */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
        TP_PROTO(struct inode *inode,
                 unsigned int flags,
                 struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(inode, flags, map, ret)
);

/*
 * ext4_ext_load_extent: records the device and inode number of @inode
 * together with the @lblk / @pblk pair supplied by the caller.
 */
TRACE_EVENT(ext4_ext_load_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk),

        TP_ARGS(inode, lblk, pblk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_fsblk_t, pblk)
                __field(ext4_lblk_t, lblk)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = lblk;
                __entry->pblk = pblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->pblk)
);

/*
 * ext4_load_inode: records the device number of @sb (sb->s_dev) and the
 * inode number @ino passed in by the caller.
 */
TRACE_EVENT(ext4_load_inode,
        TP_PROTO(struct super_block *sb, unsigned long ino),

        TP_ARGS(sb, ino),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                )
                __field(        ino_t,        ino                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->ino                = ino;
        ),

        /*
         * The inode number is cast to unsigned long, so it must be printed
         * with %lu; %ld would show large inode numbers as negative values.
         */
        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * ext4_journal_start: records the device number of @sb, the three integer
 * arguments @blocks, @rsv_blocks and @revoke_creds, and the address @IP,
 * which is printed symbolically with %pS as "caller".
 */
TRACE_EVENT(ext4_journal_start,
        TP_PROTO(struct super_block *sb,
                 int blocks,
                 int rsv_blocks,
                 int revoke_creds,
                 unsigned long IP),

        TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, IP),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(unsigned long, ip)
                __field(int, blocks)
                __field(int, rsv_blocks)
                __field(int, revoke_creds)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->blocks = blocks;
                __entry->rsv_blocks = rsv_blocks;
                __entry->revoke_creds = revoke_creds;
                __entry->ip = IP;
        ),

        TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d, "
                  "caller %pS",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->blocks,
                  __entry->rsv_blocks,
                  __entry->revoke_creds,
                  (void *)__entry->ip)
);

/*
 * ext4_journal_start_reserved: records the device number of @sb, the
 * @blocks count and the address @IP, which is printed symbolically with
 * %pS as "caller".
 */
TRACE_EVENT(ext4_journal_start_reserved,
        TP_PROTO(struct super_block *sb, int blocks, unsigned long IP),

        TP_ARGS(sb, blocks, IP),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(unsigned long,        ip                        )
                __field(          int,        blocks                        )
        ),

        TP_fast_assign(
                __entry->dev                 = sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
        ),

        /*
         * The comma follows the value ("blocks %d, caller ..."), the same
         * "name value, name value" layout used by ext4_journal_start.
         */
        TP_printk("dev %d,%d blocks %d, caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->blocks, (void *)__entry->ip)
);

/*
 * Event class ext4__trim.
 *
 * Unlike most events in this file, the device number is split into major
 * and minor parts when the event is recorded (not when it is printed).
 * @group, @start and @len are stored as given.
 */
DECLARE_EVENT_CLASS(ext4__trim,
        TP_PROTO(struct super_block *sb, ext4_group_t group,
                 ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len),

        TP_STRUCT__entry(
                __field(int, dev_major)
                __field(int, dev_minor)
                __field(__u32, group)
                __field(int, start)
                __field(int, len)
        ),

        TP_fast_assign(
                __entry->dev_major = MAJOR(sb->s_dev);
                __entry->dev_minor = MINOR(sb->s_dev);
                __entry->group = group;
                __entry->start = start;
                __entry->len = len;
        ),

        TP_printk("dev %d,%d group %u, start %d, len %d",
                  __entry->dev_major,
                  __entry->dev_minor,
                  __entry->group,
                  __entry->start,
                  __entry->len)
);

/*
 * ext4_trim_extent: an event of class ext4__trim.
 * Arguments are forwarded unchanged to the class: sb, group, start, len.
 */
DEFINE_EVENT(ext4__trim, ext4_trim_extent,
        TP_PROTO(struct super_block *sb, ext4_group_t group,
                 ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/*
 * ext4_trim_all_free: an event of class ext4__trim.
 * Arguments are forwarded unchanged to the class: sb, group, start, len.
 */
DEFINE_EVENT(ext4__trim, ext4_trim_all_free,
        TP_PROTO(struct super_block *sb, ext4_group_t group,
                 ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/*
 * ext4_ext_handle_unwritten_extents: records the device and inode number
 * of @inode, the m_lblk / m_pblk / m_len members of @map, the caller's
 * @flags (decoded with show_map_flags() at print time), and the
 * @allocated and @newblock values.
 */
TRACE_EVENT(ext4_ext_handle_unwritten_extents,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags,
                 unsigned int allocated, ext4_fsblk_t newblock),

        TP_ARGS(inode, map, flags, allocated, newblock),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        int,                flags                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        allocated        )
                __field(        ext4_fsblk_t,        newblk                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->flags                = flags;
                __entry->lblk                = map->m_lblk;
                __entry->pblk                = map->m_pblk;
                __entry->len                = map->m_len;
                __entry->allocated        = allocated;
                __entry->newblk                = newblock;
        ),

        /* "allocated" is an unsigned int, so it is printed with %u. */
        TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s "
                  "allocated %u newblock %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
                  __entry->len, show_map_flags(__entry->flags),
                  (unsigned int) __entry->allocated,
                  (unsigned long long) __entry->newblk)
);

/*
 * ext4_get_implied_cluster_alloc_exit: records the device number of @sb,
 * the m_flags / m_lblk / m_pblk / m_len members of @map, and @ret.
 * m_flags is decoded with show_mflags() at print time.
 */
TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
        TP_PROTO(struct super_block *sb,
                 struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(sb, map, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(unsigned int, flags)
                __field(ext4_lblk_t, lblk)
                __field(ext4_fsblk_t, pblk)
                __field(unsigned int, len)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->lblk = map->m_lblk;
                __entry->pblk = map->m_pblk;
                __entry->len = map->m_len;
                __entry->flags = map->m_flags;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->lblk,
                  (unsigned long long) __entry->pblk,
                  __entry->len,
                  show_mflags(__entry->flags),
                  __entry->ret)
);

/*
 * ext4_ext_put_in_cache: records the device and inode number of @inode
 * plus the caller-supplied @lblk, @len and @start values.
 */
TRACE_EVENT(ext4_ext_put_in_cache,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 unsigned int len,
                 ext4_fsblk_t start),

        TP_ARGS(inode, lblk, len, start),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(unsigned int, len)
                __field(ext4_fsblk_t, start)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->start = start;
                __entry->lblk = lblk;
                __entry->len = len;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk, __entry->len,
                  (unsigned long long) __entry->start)
);

/*
 * ext4_ext_in_cache: records the device and inode number of @inode, the
 * @lblk that was passed in, and the integer @ret.
 */
TRACE_EVENT(ext4_ext_in_cache,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret),

        TP_ARGS(inode, lblk, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->ret = ret;
                __entry->lblk = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk, __entry->ret)
);

/*
 * ext4_find_delalloc_range: records the device and inode number of @inode,
 * the [@from, @to] pair, the @reverse and @found integers, and @found_blk
 * (printed in parentheses as "blk").
 */
TRACE_EVENT(ext4_find_delalloc_range,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t from, ext4_lblk_t to,
                 int reverse, int found,
                 ext4_lblk_t found_blk),

        TP_ARGS(inode, from, to, reverse, found, found_blk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, from)
                __field(ext4_lblk_t, to)
                __field(int, reverse)
                __field(int, found)
                __field(ext4_lblk_t, found_blk)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;

                /* search parameters */
                __entry->from = from;
                __entry->to = to;
                __entry->reverse = reverse;

                /* search outcome */
                __entry->found = found;
                __entry->found_blk = found_blk;
        ),

        TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d "
                  "(blk = %u)",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->from,
                  (unsigned) __entry->to,
                  __entry->reverse,
                  __entry->found,
                  (unsigned) __entry->found_blk)
);

/*
 * ext4_get_reserved_cluster_alloc: records the device and inode number of
 * @inode plus the caller-supplied @lblk and @len.
 */
TRACE_EVENT(ext4_get_reserved_cluster_alloc,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 unsigned int len),

        TP_ARGS(inode, lblk, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(unsigned int, len)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->len = len;
                __entry->lblk = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk, __entry->len)
);

/*
 * ext4_ext_show_extent: records the device and inode number of @inode and
 * the caller-supplied @lblk, @pblk and (unsigned short) @len.
 */
TRACE_EVENT(ext4_ext_show_extent,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 ext4_fsblk_t pblk,
                 unsigned short len),

        TP_ARGS(inode, lblk, pblk, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_fsblk_t, pblk)
                __field(ext4_lblk_t, lblk)
                __field(unsigned short, len)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = lblk;
                __entry->pblk = pblk;
                __entry->len = len;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk,
                  (unsigned long long) __entry->pblk,
                  (unsigned short) __entry->len)
);

/*
 * ext4_remove_blocks: records the device and inode number of @inode, the
 * [@from, @to] pair, a decoded copy of extent @ex (physical block via
 * ext4_ext_pblock(), logical block via le32_to_cpu(ex->ee_block), length
 * via ext4_ext_get_actual_len()), and the pclu / lblk / state members of
 * the partial cluster @pc.
 *
 * NOTE(review): @to is declared ext4_fsblk_t in TP_PROTO but is stored in
 * an ext4_lblk_t field; the prototype is left untouched here so existing
 * probes keep matching - confirm whether it should be ext4_lblk_t.
 */
TRACE_EVENT(ext4_remove_blocks,
        TP_PROTO(struct inode *inode, struct ext4_extent *ex,
                 ext4_lblk_t from, ext4_fsblk_t to,
                 struct partial_cluster *pc),

        TP_ARGS(inode, ex, from, to, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        from        )
                __field(        ext4_lblk_t,        to        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        unsigned short,        ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->from                = from;
                __entry->to                = to;
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        /*
         * The first literal ends with a space so that the concatenated
         * format reads "...%u] from %u..." rather than "...%u]from %u...".
         */
        TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u] "
                  "from %u to %u partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (unsigned) __entry->from,
                  (unsigned) __entry->to,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_leaf: records the device and inode number of @inode, the
 * @start block, a decoded copy of extent @ex (logical block via
 * le32_to_cpu(ex->ee_block), physical block via ext4_ext_pblock(), length
 * via ext4_ext_get_actual_len()), and the pclu / lblk / state members of
 * the partial cluster @pc.
 *
 * NOTE(review): ee_len is stored as a signed short here but printed via an
 * (unsigned short) cast; the field type is left as-is so the recorded
 * event layout does not change.
 */
TRACE_EVENT(ext4_ext_rm_leaf,
        TP_PROTO(struct inode *inode, ext4_lblk_t start,
                 struct ext4_extent *ex,
                 struct partial_cluster *pc),

        TP_ARGS(inode, start, ex, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        start        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                __field(        short,                ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->start                = start;
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        /*
         * The first literal ends with a space so that the concatenated
         * format reads "...%u] partial [..." rather than "...%u]partial [".
         */
        TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u] "
                  "partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_idx: records the device and inode number of @inode and the
 * @pblk value, which is printed as "index_pblk".
 */
TRACE_EVENT(ext4_ext_rm_idx,
        TP_PROTO(struct inode *inode, ext4_fsblk_t pblk),

        TP_ARGS(inode, pblk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_fsblk_t, pblk)
        ),

        TP_fast_assign(
                __entry->pblk = pblk;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu index_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long long) __entry->pblk)
);

/*
 * ext4_ext_remove_space: records the device and inode number of @inode,
 * the @start / @end pair (printed as "since" / "end") and @depth.
 */
TRACE_EVENT(ext4_ext_remove_space,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t start,
                 ext4_lblk_t end,
                 int depth),

        TP_ARGS(inode, start, end, depth),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, start)
                __field(ext4_lblk_t, end)
                __field(int, depth)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->depth = depth;
                __entry->start = start;
                __entry->end = end;
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u depth %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start, (unsigned) __entry->end,
                  __entry->depth)
);

/*
 * ext4_ext_remove_space_done: records the device and inode number of
 * @inode, the @start / @end pair (printed as "since" / "end"), @depth, the
 * pclu / lblk / state members of the partial cluster @pc, and @eh_entries
 * converted from little-endian with le16_to_cpu() (printed as
 * "remaining_entries").
 */
TRACE_EVENT(ext4_ext_remove_space_done,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t start, ext4_lblk_t end, int depth,
                 struct partial_cluster *pc, __le16 eh_entries),

        TP_ARGS(inode, start, end, depth, pc, eh_entries),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, start)
                __field(ext4_lblk_t, end)
                __field(int, depth)
                __field(ext4_fsblk_t, pc_pclu)
                __field(ext4_lblk_t, pc_lblk)
                __field(int, pc_state)
                __field(unsigned short, eh_entries)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;

                /* requested range and tree depth */
                __entry->start = start;
                __entry->end = end;
                __entry->depth = depth;

                /* partial cluster state */
                __entry->pc_pclu = pc->pclu;
                __entry->pc_lblk = pc->lblk;
                __entry->pc_state = pc->state;

                /* stored in CPU byte order */
                __entry->eh_entries = le16_to_cpu(eh_entries);
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u depth %d "
                  "partial [pclu %lld lblk %u state %d] "
                  "remaining_entries %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start, (unsigned) __entry->end,
                  __entry->depth,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state,
                  (unsigned short) __entry->eh_entries)
);

/*
 * Event class ext4__es_extent.
 *
 * Records the device and inode number of @inode and, from @es, its
 * es_lblk and es_len members plus the values returned by
 * ext4_es_show_pblock() and ext4_es_status(). The status is decoded with
 * show_extent_status() at print time.
 */
DECLARE_EVENT_CLASS(ext4__es_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(ext4_lblk_t, len)
                __field(ext4_fsblk_t, pblk)
                __field(char, status)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = es->es_lblk;
                __entry->len = es->es_len;
                __entry->status = ext4_es_status(es);
                __entry->pblk = ext4_es_show_pblock(es);
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->len,
                  __entry->pblk,
                  show_extent_status(__entry->status))
);

/*
 * ext4_es_insert_extent: an event of class ext4__es_extent.
 * Arguments are forwarded unchanged to the class: inode, es.
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
        TP_PROTO(struct inode *inode,
                 struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_cache_extent: an event of class ext4__es_extent.
 * Arguments are forwarded unchanged to the class: inode, es.
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent,
        TP_PROTO(struct inode *inode,
                 struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_remove_extent: records the device and inode number of @inode and
 * the @lblk / @len pair. Both arrive as ext4_lblk_t but are stored in
 * loff_t fields and printed with %lld.
 */
TRACE_EVENT(ext4_es_remove_extent,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk,
                 ext4_lblk_t len),

        TP_ARGS(inode, lblk, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, lblk)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->len = len;
                __entry->lblk = lblk;
        ),

        TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->len)
);

/*
 * ext4_es_find_extent_range_enter: records the device and inode number of
 * @inode and the @lblk passed by the caller.
 */
TRACE_EVENT(ext4_es_find_extent_range_enter,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
        ),

        TP_fast_assign(
                __entry->lblk = lblk;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk)
);

/*
 * ext4_es_find_extent_range_exit: records the device and inode number of
 * @inode and, from @es, its es_lblk and es_len members plus the values
 * returned by ext4_es_show_pblock() and ext4_es_status(). The status is
 * decoded with show_extent_status() at print time.
 */
TRACE_EVENT(ext4_es_find_extent_range_exit,
        TP_PROTO(struct inode *inode,
                 struct extent_status *es),

        TP_ARGS(inode, es),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(ext4_lblk_t, len)
                __field(ext4_fsblk_t, pblk)
                __field(char, status)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->lblk = es->es_lblk;
                __entry->len = es->es_len;
                __entry->status = ext4_es_status(es);
                __entry->pblk = ext4_es_show_pblock(es);
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->len,
                  __entry->pblk,
                  show_extent_status(__entry->status))
);

/*
 * ext4_es_lookup_extent_enter: records the device and inode number of
 * @inode and the @lblk passed by the caller.
 */
TRACE_EVENT(ext4_es_lookup_extent_enter,
        TP_PROTO(struct inode *inode,
                 ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
        ),

        TP_fast_assign(
                __entry->lblk = lblk;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk)
);

/*
 * ext4_es_lookup_extent_exit: records the device and inode number of
 * @inode, the es_lblk / es_len members of @es, the values returned by
 * ext4_es_show_pblock() and ext4_es_status(), and the @found flag.
 *
 * At print time the physical block and the status are replaced by 0 when
 * "found" is zero; lblk and len are always printed as recorded.
 */
TRACE_EVENT(ext4_es_lookup_extent_exit,
        TP_PROTO(struct inode *inode, struct extent_status *es, int found),

        TP_ARGS(inode, es, found),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(ext4_lblk_t, len)
                __field(ext4_fsblk_t, pblk)
                __field(char, status)
                __field(int, found)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->found = found;
                __entry->lblk = es->es_lblk;
                __entry->len = es->es_len;
                __entry->pblk = ext4_es_show_pblock(es);
                __entry->status = ext4_es_status(es);
        ),

        TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->found,
                  __entry->lblk,
                  __entry->len,
                  __entry->found ? __entry->pblk : 0,
                  show_extent_status(__entry->found ? __entry->status : 0))
);

/*
 * Event class ext4__es_shrink_enter.
 *
 * Records the device number of @sb together with the @nr_to_scan and
 * @cache_cnt integers supplied by the caller.
 */
DECLARE_EVENT_CLASS(ext4__es_shrink_enter,
        TP_PROTO(struct super_block *sb,
                 int nr_to_scan,
                 int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nr_to_scan)
                __field(int, cache_cnt)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->cache_cnt = cache_cnt;
                __entry->nr_to_scan = nr_to_scan;
        ),

        TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->nr_to_scan,
                  __entry->cache_cnt)
);

/*
 * ext4_es_shrink_count: an event of class ext4__es_shrink_enter.
 * Arguments are forwarded unchanged to the class: sb, nr_to_scan, cache_cnt.
 */
DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count,
        TP_PROTO(struct super_block *sb,
                 int nr_to_scan,
                 int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

/*
 * ext4_es_shrink_scan_enter: an event of class ext4__es_shrink_enter.
 * Arguments are forwarded unchanged to the class: sb, nr_to_scan, cache_cnt.
 */
DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter,
        TP_PROTO(struct super_block *sb,
                 int nr_to_scan,
                 int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

/*
 * ext4_es_shrink_scan_exit: records the device number of @sb together with
 * the @nr_shrunk and @cache_cnt integers supplied by the caller.
 */
TRACE_EVENT(ext4_es_shrink_scan_exit,
        TP_PROTO(struct super_block *sb,
                 int nr_shrunk,
                 int cache_cnt),

        TP_ARGS(sb, nr_shrunk, cache_cnt),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nr_shrunk)
                __field(int, cache_cnt)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->cache_cnt = cache_cnt;
                __entry->nr_shrunk = nr_shrunk;
        ),

        TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->nr_shrunk,
                  __entry->cache_cnt)
);

/*
 * ext4_collapse_range
 *
 * Records the device and inode number of @inode together with the @offset
 * and @len arguments (both loff_t, printed as signed 64-bit values).
 */
TRACE_EVENT(ext4_collapse_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t,        offset)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len)
);

/*
 * ext4_insert_range
 *
 * Records the device and inode number of @inode together with the @offset
 * and @len arguments.  The record layout and output format are the same as
 * those of ext4_collapse_range.
 */
TRACE_EVENT(ext4_insert_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t,        offset)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len)
);

/*
 * ext4_es_shrink
 *
 * Records the device number of @sb plus @nr_shrunk, @nr_skipped and
 * @retried as passed in.  @scan_time is not stored verbatim: the recorded
 * value is scan_time / 1000.
 * NOTE(review): the division presumably converts nanoseconds to
 * microseconds; the unit of @scan_time is not visible here - confirm
 * against the caller.
 */
TRACE_EVENT(ext4_es_shrink,
        TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
                 int nr_skipped, int retried),

        TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        int,                nr_shrunk        )
                __field(        unsigned long long, scan_time        )
                __field(        int,                nr_skipped        )
                __field(        int,                retried                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->nr_shrunk        = nr_shrunk;
                __entry->scan_time        = div_u64(scan_time, 1000);
                __entry->nr_skipped        = nr_skipped;
                __entry->retried        = retried;
        ),

        TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu "
                  "nr_skipped %d retried %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
                  __entry->scan_time, __entry->nr_skipped, __entry->retried)
);

/*
 * ext4_es_insert_delayed_block
 *
 * Records the device and inode number of @inode, the logical start block
 * and length of the extent @es (es_lblk / es_len), the values returned by
 * ext4_es_show_pblock(es) and ext4_es_status(es), and the @allocated flag.
 * The status is decoded for display with show_extent_status().
 */
TRACE_EVENT(ext4_es_insert_delayed_block,
        TP_PROTO(struct inode *inode, struct extent_status *es,
                 bool allocated),

        TP_ARGS(inode, es, allocated),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char,                status                )
                __field(        bool,                allocated        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->lblk                = es->es_lblk;
                __entry->len                = es->es_len;
                __entry->pblk                = ext4_es_show_pblock(es);
                __entry->status                = ext4_es_status(es);
                __entry->allocated        = allocated;
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
                  "allocated %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len,
                  __entry->pblk, show_extent_status(__entry->status),
                  __entry->allocated)
);

/* fsmap traces */

/*
 * Event class ext4_fsmap_class.
 *
 * Records the block device number of @sb (sb->s_bdev->bd_dev), @keydev
 * after conversion with new_decode_dev(), and the @agno, @bno, @len and
 * @owner values as passed in.
 *
 * The format string deliberately has no trailing newline, like every other
 * TP_printk() in this file: the trace output code terminates each event
 * line itself, so an embedded "\n" would produce an empty line after every
 * event.
 */
DECLARE_EVENT_CLASS(ext4_fsmap_class,
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,
                 u64 owner),
        TP_ARGS(sb, keydev, agno, bno, len, owner),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u32, agno)
                __field(u64, bno)
                __field(u64, len)
                __field(u64, owner)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(keydev);
                __entry->agno = agno;
                __entry->bno = bno;
                __entry->len = len;
                __entry->owner = owner;
        ),
        TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->agno,
                  __entry->bno,
                  __entry->len,
                  __entry->owner)
)
/*
 * DEFINE_FSMAP_EVENT(name) - define tracepoint @name as an instance of
 * ext4_fsmap_class, using that class's prototype and arguments.
 */
#define DEFINE_FSMAP_EVENT(name) \
DEFINE_EVENT(ext4_fsmap_class, name, \
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \
                 u64 owner), \
        TP_ARGS(sb, keydev, agno, bno, len, owner))
DEFINE_FSMAP_EVENT(ext4_fsmap_low_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_high_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_mapping);

/*
 * Event class ext4_getfsmap_class.
 *
 * Records the block device number of @sb (sb->s_bdev->bd_dev) and a copy
 * of the following members of @fsmap: fmr_device (converted with
 * new_decode_dev()), fmr_physical, fmr_length, fmr_owner and fmr_flags.
 *
 * The format string deliberately has no trailing newline, like every other
 * TP_printk() in this file: the trace output code terminates each event
 * line itself, so an embedded "\n" would produce an empty line after every
 * event.
 */
DECLARE_EVENT_CLASS(ext4_getfsmap_class,
        TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap),
        TP_ARGS(sb, fsmap),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u64, block)
                __field(u64, len)
                __field(u64, owner)
                __field(u64, flags)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(fsmap->fmr_device);
                __entry->block = fsmap->fmr_physical;
                __entry->len = fsmap->fmr_length;
                __entry->owner = fsmap->fmr_owner;
                __entry->flags = fsmap->fmr_flags;
        ),
        TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->block,
                  __entry->len,
                  __entry->owner,
                  __entry->flags)
)
/*
 * DEFINE_GETFSMAP_EVENT(name) - define tracepoint @name as an instance of
 * ext4_getfsmap_class, using that class's prototype and arguments.
 */
#define DEFINE_GETFSMAP_EVENT(name) \
DEFINE_EVENT(ext4_getfsmap_class, name, \
        TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \
        TP_ARGS(sb, fsmap))
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping);

/*
 * ext4_shutdown
 *
 * Records the device number of @sb and the @flags argument.
 * NOTE(review): @flags arrives as unsigned long but is stored in an
 * 'unsigned' field, so on targets where long is wider than int the upper
 * bits are not recorded.  Presumably the flag values are small - confirm
 * against the caller.
 */
TRACE_EVENT(ext4_shutdown,
        TP_PROTO(struct super_block *sb, unsigned long flags),

        TP_ARGS(sb, flags),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(     unsigned,        flags                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->flags)
);

/*
 * ext4_error
 *
 * Records the device number of @sb, the @function pointer and @line.
 * Only the pointer value of @function is stored in the record (not a copy
 * of the string), and it is dereferenced by the "%s" conversion when the
 * event is formatted.
 * NOTE(review): this presumes @function points at storage that outlives
 * the event (e.g. a __func__ string) - confirm against the callers.
 */
TRACE_EVENT(ext4_error,
        TP_PROTO(struct super_block *sb, const char *function,
                 unsigned int line),

        TP_ARGS(sb, function, line),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field( const char *,        function                )
                __field(     unsigned,        line                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->function = function;
                __entry->line        = line;
        ),

        TP_printk("dev %d,%d function %s line %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->function, __entry->line)
);

/*
 * ext4_prefetch_bitmaps
 *
 * Records the device number of @sb and the @group, @next and
 * @prefetch_ios arguments; @prefetch_ios is stored in the 'ios' field.
 */
TRACE_EVENT(ext4_prefetch_bitmaps,
            TP_PROTO(struct super_block *sb, ext4_group_t group,
                     ext4_group_t next, unsigned int prefetch_ios),

        TP_ARGS(sb, group, next, prefetch_ios),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
                __field(        __u32,        next                        )
                __field(        __u32,        ios                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
                __entry->next        = next;
                __entry->ios        = prefetch_ios;
        ),

        TP_printk("dev %d,%d group %u next %u ios %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->group, __entry->next, __entry->ios)
);

/*
 * ext4_lazy_itable_init
 *
 * Records the device number of @sb and the @group argument.
 */
TRACE_EVENT(ext4_lazy_itable_init,
            TP_PROTO(struct super_block *sb, ext4_group_t group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group)
);

/*
 * ext4_fc_replay_scan
 *
 * Records the device number of @sb and the @error and @off arguments, and
 * prints them as "FC scan pass on dev MAJOR,MINOR: error E, off O".
 */
TRACE_EVENT(ext4_fc_replay_scan,
        TP_PROTO(struct super_block *sb, int error, int off),

        TP_ARGS(sb, error, off),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, error)
                __field(int, off)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->error = error;
                __entry->off = off;
        ),

        TP_printk("FC scan pass on dev %d,%d: error %d, off %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->error, __entry->off)
);

/*
 * ext4_fc_replay
 *
 * Records the device number of @sb and the @tag, @ino, @priv1 and @priv2
 * arguments.  @priv1 and @priv2 are printed under the labels "data1" and
 * "data2"; their meaning is not defined here.
 */
TRACE_EVENT(ext4_fc_replay,
        TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2),

        TP_ARGS(sb, tag, ino, priv1, priv2),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, tag)
                __field(int, ino)
                __field(int, priv1)
                __field(int, priv2)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->tag = tag;
                __entry->ino = ino;
                __entry->priv1 = priv1;
                __entry->priv2 = priv2;
        ),

        TP_printk("FC Replay %d,%d: tag %d, ino %d, data1 %d, data2 %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tag, __entry->ino, __entry->priv1, __entry->priv2)
);

/*
 * ext4_fc_commit_start
 *
 * Records only the device number of @sb.
 */
TRACE_EVENT(ext4_fc_commit_start,
        TP_PROTO(struct super_block *sb),

        TP_ARGS(sb),

        TP_STRUCT__entry(
                __field(dev_t, dev)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
        ),

        TP_printk("fast_commit started on dev %d,%d",
                  MAJOR(__entry->dev), MINOR(__entry->dev))
);

/*
 * ext4_fc_commit_stop
 *
 * Records the device number of @sb and the @nblks and @reason arguments,
 * plus a snapshot of three counters read from EXT4_SB(sb)->s_fc_stats at
 * the time the event fires: fc_num_commits, fc_ineligible_commits and
 * fc_numblks.
 * NOTE(review): the counters are stored in int fields, whereas the
 * ext4_fc_stats event stores the same counters as unsigned long - confirm
 * that narrowing is acceptable here.
 */
TRACE_EVENT(ext4_fc_commit_stop,
            TP_PROTO(struct super_block *sb, int nblks, int reason),

        TP_ARGS(sb, nblks, reason),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nblks)
                __field(int, reason)
                __field(int, num_fc)
                __field(int, num_fc_ineligible)
                __field(int, nblks_agg)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->nblks = nblks;
                __entry->reason = reason;
                __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
                __entry->num_fc_ineligible =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks;
        ),

        TP_printk("fc on [%d,%d] nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nblks, __entry->reason, __entry->num_fc,
                  __entry->num_fc_ineligible, __entry->nblks_agg)
);

/*
 * FC_REASON_NAME_STAT(reason) - expand to the two TP_printk() arguments for
 * one ineligibility reason: its display name (via show_fc_reason()) and the
 * count recorded for it in the event's fc_ineligible_rc[] array.
 */
#define FC_REASON_NAME_STAT(reason)                                        \
        show_fc_reason(reason),                                                \
        __entry->fc_ineligible_rc[reason]

/*
 * ext4_fc_stats
 *
 * Records the device number of @sb and a snapshot of
 * EXT4_SB(sb)->s_fc_stats: every fc_ineligible_reason_count[] slot
 * (EXT4_FC_REASON_MAX of them) plus fc_num_commits, fc_ineligible_commits
 * and fc_numblks.
 *
 * The output lists ten "name:count" pairs followed by the three totals.
 * A ", " separates the last pair from "num_commits:" so the final count
 * does not run into the following label.
 */
TRACE_EVENT(ext4_fc_stats,
        TP_PROTO(struct super_block *sb),

        TP_ARGS(sb),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX)
                __field(unsigned long, fc_commits)
                __field(unsigned long, fc_ineligible_commits)
                __field(unsigned long, fc_numblks)
        ),

        TP_fast_assign(
                int i;

                __entry->dev = sb->s_dev;
                for (i = 0; i < EXT4_FC_REASON_MAX; i++) {
                        __entry->fc_ineligible_rc[i] =
                                EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i];
                }
                __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
                __entry->fc_ineligible_commits =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks;
        ),

        TP_printk("dev %d,%d fc ineligible reasons:\n"
                  "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, "
                  "num_commits:%lu, ineligible: %lu, numblks: %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME),
                  __entry->fc_commits, __entry->fc_ineligible_commits,
                  __entry->fc_numblks)
);

/*
 * DEFINE_TRACE_DENTRY_EVENT(__type) - define tracepoint
 * ext4_fc_track_<__type>.
 *
 * Each generated event records the device and inode number of @inode and
 * the @ret value (as 'error'); @dentry is part of the prototype but is not
 * recorded.  The stringified __type is appended to the output as
 * "fc_<__type>".
 *
 * The inode number is stored as ino_t and printed with %lu, as the other
 * inode-based events in this file do; an int field would truncate large
 * inode numbers or print them as negative values.
 */
#define DEFINE_TRACE_DENTRY_EVENT(__type)                                \
        TRACE_EVENT(ext4_fc_track_##__type,                                \
            TP_PROTO(struct inode *inode, struct dentry *dentry, int ret), \
                                                                        \
            TP_ARGS(inode, dentry, ret),                                \
                                                                        \
            TP_STRUCT__entry(                                                \
                    __field(dev_t, dev)                                        \
                    __field(ino_t, ino)                                        \
                    __field(int, error)                                        \
                    ),                                                        \
                                                                        \
            TP_fast_assign(                                                \
                    __entry->dev = inode->i_sb->s_dev;                        \
                    __entry->ino = inode->i_ino;                        \
                    __entry->error = ret;                                \
                    ),                                                        \
                                                                        \
            TP_printk("dev %d:%d, inode %lu, error %d, fc_%s",                \
                      MAJOR(__entry->dev), MINOR(__entry->dev),                \
                      (unsigned long) __entry->ino, __entry->error,        \
                      #__type)                                                \
        )

DEFINE_TRACE_DENTRY_EVENT(create);
DEFINE_TRACE_DENTRY_EVENT(link);
DEFINE_TRACE_DENTRY_EVENT(unlink);

/*
 * ext4_fc_track_inode
 *
 * Records the device and inode number of @inode and the @ret value (as
 * 'error').
 *
 * The inode number is stored as ino_t and printed with %lu, as the other
 * inode-based events in this file do; an int field would truncate large
 * inode numbers or print them as negative values.
 */
TRACE_EVENT(ext4_fc_track_inode,
            TP_PROTO(struct inode *inode, int ret),

            TP_ARGS(inode, ret),

            TP_STRUCT__entry(
                    __field(dev_t, dev)
                    __field(ino_t, ino)
                    __field(int, error)
                    ),

            TP_fast_assign(
                    __entry->dev = inode->i_sb->s_dev;
                    __entry->ino = inode->i_ino;
                    __entry->error = ret;
                    ),

            TP_printk("dev %d:%d, inode %lu, error %d",
                      MAJOR(__entry->dev), MINOR(__entry->dev),
                      (unsigned long) __entry->ino, __entry->error)
        );

/*
 * ext4_fc_track_range
 *
 * Records the device and inode number of @inode, the @start and @end
 * arguments, and the @ret value (as 'error').
 *
 * The inode number is stored as ino_t and printed with %lu, as the other
 * inode-based events in this file do; an int field would truncate large
 * inode numbers or print them as negative values.
 */
TRACE_EVENT(ext4_fc_track_range,
            TP_PROTO(struct inode *inode, long start, long end, int ret),

            TP_ARGS(inode, start, end, ret),

            TP_STRUCT__entry(
                    __field(dev_t, dev)
                    __field(ino_t, ino)
                    __field(long, start)
                    __field(long, end)
                    __field(int, error)
                    ),

            TP_fast_assign(
                    __entry->dev = inode->i_sb->s_dev;
                    __entry->ino = inode->i_ino;
                    __entry->start = start;
                    __entry->end = end;
                    __entry->error = ret;
                    ),

            TP_printk("dev %d:%d, inode %lu, error %d, start %ld, end %ld",
                      MAJOR(__entry->dev), MINOR(__entry->dev),
                      (unsigned long) __entry->ino, __entry->error,
                      __entry->start, __entry->end)
        );

#endif /* _TRACE_EXT4_H */

/* This part must be outside protection */
#include <trace/define_trace.h>





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/posix_acl.h

  (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/


#ifndef __LINUX_POSIX_ACL_H
#define __LINUX_POSIX_ACL_H

#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <uapi/linux/posix_acl.h>

/*
 * One entry of a POSIX ACL.
 *
 * NOTE(review): e_tag presumably holds one of the ACL tag constants from
 * <uapi/linux/posix_acl.h> and e_perm the permission bits for that entry;
 * neither is established by this header - confirm against posix_acl.c.
 */
struct posix_acl_entry {
        short                        e_tag;
        unsigned short                e_perm;
        /* Qualifier of the entry: e_uid and e_gid share the same storage. */
        union {
                kuid_t                e_uid;
                kgid_t                e_gid;
        };
};

/*
 * A reference-counted POSIX ACL: a header followed by a_count entries.
 */
struct posix_acl {
        /* Taken by posix_acl_dup(), dropped by posix_acl_release(). */
        refcount_t                a_refcount;
        /* Passed to kfree_rcu() when the last reference is released. */
        struct rcu_head                a_rcu;
        /* Number of elements in a_entries[] (see FOREACH_ACL_ENTRY). */
        unsigned int                a_count;
        /* Flexible array member holding the entries themselves. */
        struct posix_acl_entry        a_entries[];
};

/*
 * FOREACH_ACL_ENTRY(pa, acl, pe) - loop header that walks every entry of
 * @acl.  @pa is the cursor (set to each element of acl->a_entries in turn)
 * and @pe is set to one past the last entry; both must be assignable
 * pointers to struct posix_acl_entry.  @acl is dereferenced, so it must
 * not be NULL.
 */
#define FOREACH_ACL_ENTRY(pa, acl, pe) \
        for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++)


/*
 * Take an additional reference on an ACL.
 *
 * A NULL @acl is accepted and simply handed back.  Otherwise the
 * reference count is incremented and the same pointer is returned.
 */
static inline struct posix_acl *
posix_acl_dup(struct posix_acl *acl)
{
        if (!acl)
                return NULL;

        refcount_inc(&acl->a_refcount);
        return acl;
}

/*
 * Drop one reference on an ACL.
 *
 * A NULL @acl is ignored.  When the reference being dropped is the last
 * one, the ACL is handed to kfree_rcu() using its a_rcu member.
 */
static inline void
posix_acl_release(struct posix_acl *acl)
{
        if (!acl)
                return;

        if (refcount_dec_and_test(&acl->a_refcount))
                kfree_rcu(acl, a_rcu);
}


/*
 * posix_acl.c
 *
 * Declarations of the ACL helpers; these are declared regardless of
 * CONFIG_FS_POSIX_ACL.  The parameters are unnamed here - see the
 * definitions for their meaning.
 */

extern void posix_acl_init(struct posix_acl *, int);
extern struct posix_acl *posix_acl_alloc(int, gfp_t);
extern int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
extern int posix_acl_permission(struct inode *, const struct posix_acl *, int);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);

extern struct posix_acl *get_posix_acl(struct inode *, int);
extern int set_posix_acl(struct inode *, int, struct posix_acl *);

#ifdef CONFIG_FS_POSIX_ACL
/*
 * Declarations that only exist when POSIX ACL support is configured in.
 * The #else branch below supplies no-op stand-ins for some of them
 * (posix_acl_chmod, posix_acl_create, simple_set_acl, simple_acl_create,
 * forget_all_cached_acls) but not for the others.
 */
extern int posix_acl_chmod(struct inode *, umode_t);
extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
                struct posix_acl **);
extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **);

extern int simple_set_acl(struct inode *, struct posix_acl *, int);
extern int simple_acl_create(struct inode *, struct inode *);

/* Accessors for the ACL pointers cached in the inode. */
struct posix_acl *get_cached_acl(struct inode *inode, int type);
struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
void forget_cached_acl(struct inode *inode, int type);
void forget_all_cached_acls(struct inode *inode);

/*
 * Set both ACL pointers of @inode (i_acl and i_default_acl) to NULL.
 */
static inline void cache_no_acl(struct inode *inode)
{
        inode->i_default_acl = inode->i_acl = NULL;
}
#else
/*
 * Stand-in used when CONFIG_FS_POSIX_ACL is not set: ignores @inode and
 * @mode and always returns 0.
 */
static inline int posix_acl_chmod(struct inode *inode, umode_t mode)
{
        return 0;
}

/* Without CONFIG_FS_POSIX_ACL, simple_set_acl expands to NULL instead of naming a function. */
#define simple_set_acl                NULL

/*
 * Stand-in used when CONFIG_FS_POSIX_ACL is not set: ignores @dir and
 * @inode and always returns 0.
 */
static inline int simple_acl_create(struct inode *dir, struct inode *inode)
{
        return 0;
}
/* Stand-in used when CONFIG_FS_POSIX_ACL is not set: does nothing. */
static inline void cache_no_acl(struct inode *inode)
{
}

/*
 * Stand-in used when CONFIG_FS_POSIX_ACL is not set.
 *
 * Stores NULL through both @default_acl and @acl and returns 0; @inode and
 * @mode are not touched.
 */
static inline int posix_acl_create(struct inode *inode, umode_t *mode,
                struct posix_acl **default_acl, struct posix_acl **acl)
{
        *acl = NULL;
        *default_acl = NULL;

        return 0;
}

/* Stand-in used when CONFIG_FS_POSIX_ACL is not set: does nothing. */
static inline void forget_all_cached_acls(struct inode *inode)
{
}
#endif /* CONFIG_FS_POSIX_ACL */

/* Declared regardless of CONFIG_FS_POSIX_ACL; returns the ACL of kind @type for @inode. */
struct posix_acl *get_acl(struct inode *inode, int type);

#endif  /* __LINUX_POSIX_ACL_H */







































































































































































































































































































































































































































    1 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/prefetch.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <trace/events/writeback.h>
#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget()
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

/* Mask hash() applies to its result to select a bucket index. */
static unsigned int i_hash_mask __read_mostly;
/* Right-shift amount hash() uses when folding the intermediate value. */
static unsigned int i_hash_shift __read_mostly;
/* Bucket array of the inode hash; indexed with the value hash() returns. */
static struct hlist_head *inode_hashtable __read_mostly;
/* Taken around inode_hashtable / inode->i_hash updates (see locking rules above). */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

/*
 * Empty aops. Can be used for the cases where the user does not
 * define any of the address_space operations.  inode_init_always()
 * installs it as the initial ->a_ops of every inode's i_data mapping.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

/*
 * Statistics gathering..
 */
/* Snapshot refreshed by proc_nr_inodes() from the per-CPU counters below. */
struct inodes_stat_t inodes_stat;

/* Incremented in inode_init_always(), decremented in __destroy_inode(). */
static DEFINE_PER_CPU(unsigned long, nr_inodes);
/* Tracks inodes sitting on a superblock LRU (see inode_lru_list_add/del). */
static DEFINE_PER_CPU(unsigned long, nr_unused);

/* Slab cache used by alloc_inode() when the fs has no ->alloc_inode. */
static struct kmem_cache *inode_cachep __read_mostly;

static long get_nr_inodes(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_inodes, i);
        return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_unused, i);
        return sum < 0 ? 0 : sum;
}

/*
 * Estimate of "dirty" inodes: allocated inodes minus those counted as
 * unused.  Never returns a negative value.
 */
long get_nr_dirty_inodes(void)
{
        /* not actually dirty inodes, but a wild approximation */
        long in_use = get_nr_inodes() - get_nr_inodes_unused();

        if (in_use <= 0)
                return 0;
        return in_use;
}

/*
 * Handle nr_inode sysctl
 */
#ifdef CONFIG_SYSCTL
/*
 * sysctl handler: refresh inodes_stat.nr_inodes and inodes_stat.nr_unused
 * from the per-CPU counters, then pass the request on to
 * proc_doulongvec_minmax() and return whatever it returns.
 */
int proc_nr_inodes(struct ctl_table *table, int write,
                   void *buffer, size_t *lenp, loff_t *ppos)
{
        inodes_stat.nr_inodes = get_nr_inodes();
        inodes_stat.nr_unused = get_nr_inodes_unused();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif

/*
 * ->open method of the default file_operations that inode_init_always()
 * installs: every open attempt fails with -ENXIO.
 */
static int no_open(struct inode *inode, struct file *file)
{
        return -ENXIO;
}

/**
 * inode_init_always - perform inode structure initialisation
 * @sb: superblock inode belongs to
 * @inode: inode to initialise
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 *
 * Return: 0 on success, -ENOMEM if security_inode_alloc() reports failure.
 * The per-CPU nr_inodes counter is only incremented on success.
 */
int inode_init_always(struct super_block *sb, struct inode *inode)
{
        /* Defaults shared by all inodes until the filesystem installs its own. */
        static const struct inode_operations empty_iops;
        static const struct file_operations no_open_fops = {.open = no_open};
        struct address_space *const mapping = &inode->i_data;

        inode->i_sb = sb;
        inode->i_blkbits = sb->s_blocksize_bits;
        inode->i_flags = 0;
        atomic64_set(&inode->i_sequence, 0);
        /* The reference count starts at 1. */
        atomic_set(&inode->i_count, 1);
        inode->i_op = &empty_iops;
        inode->i_fop = &no_open_fops;
        inode->__i_nlink = 1;
        inode->i_opflags = 0;
        if (sb->s_xattr)
                inode->i_opflags |= IOP_XATTR;
        i_uid_write(inode, 0);
        i_gid_write(inode, 0);
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_write_hint = WRITE_LIFE_NOT_SET;
        inode->i_blocks = 0;
        inode->i_bytes = 0;
        inode->i_generation = 0;
        inode->i_pipe = NULL;
        inode->i_bdev = NULL;
        inode->i_cdev = NULL;
        inode->i_link = NULL;
        inode->i_dir_seq = 0;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;

#ifdef CONFIG_CGROUP_WRITEBACK
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
#endif

        /* Locks get per-filesystem-type lockdep classes. */
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);

        atomic_set(&inode->i_dio_count, 0);

        /* Reset the embedded address_space (i_data) and point i_mapping at it. */
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
        mapping->flags = 0;
        if (sb->s_type->fs_flags & FS_THP_SUPPORT)
                __set_bit(AS_THP_SUPPORT, &mapping->flags);
        mapping->wb_err = 0;
        atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        atomic_set(&mapping->nr_thps, 0);
#endif
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->private_data = NULL;
        mapping->writeback_index = 0;
        inode->i_private = NULL;
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);        /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
        inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
#endif
        inode->i_flctx = NULL;

        /* Any non-zero result from the security hook is reported as -ENOMEM. */
        if (unlikely(security_inode_alloc(inode)))
                return -ENOMEM;
        this_cpu_inc(nr_inodes);

        return 0;
}
EXPORT_SYMBOL(inode_init_always);

/*
 * Return @inode straight to inode_cachep.  Also used by i_callback()
 * as the fallback when the inode has no ->free_inode callback set.
 */
void free_inode_nonrcu(struct inode *inode)
{
        kmem_cache_free(inode_cachep, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);

/*
 * Free the inode that embeds @head as its i_rcu member.  Registered with
 * call_rcu() by destroy_inode() and called directly by alloc_inode() on
 * its failure path.  Uses the inode's ->free_inode callback when one was
 * stored, otherwise frees to the generic inode slab.
 */
static void i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);

        if (!inode->free_inode) {
                free_inode_nonrcu(inode);
                return;
        }

        inode->free_inode(inode);
}

/*
 * Allocate an inode for @sb, through the filesystem's ->alloc_inode when
 * it has one and from inode_cachep otherwise, then run inode_init_always()
 * on it.
 *
 * Returns the initialised inode, or NULL if allocation or initialisation
 * failed.  On an initialisation failure the inode is torn down again here.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
        const struct super_operations *ops = sb->s_op;
        struct inode *inode = ops->alloc_inode ? ops->alloc_inode(sb) :
                        kmem_cache_alloc(inode_cachep, GFP_KERNEL);

        if (!inode)
                return NULL;

        if (likely(!inode_init_always(sb, inode)))
                return inode;

        /*
         * Initialisation failed.  Mirror destroy_inode(): give the fs its
         * ->destroy_inode call, and stop there when it provides no
         * ->free_inode.
         */
        if (ops->destroy_inode) {
                ops->destroy_inode(inode);
                if (!ops->free_inode)
                        return NULL;
        }

        /* Free immediately through i_callback() rather than via call_rcu(). */
        inode->free_inode = ops->free_inode;
        i_callback(&inode->i_rcu);
        return NULL;
}

void __destroy_inode(struct inode *inode)
{
        BUG_ON(inode_has_buffers(inode));
        inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode);
        if (!inode->i_nlink) {
                WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

#ifdef CONFIG_FS_POSIX_ACL
        if (inode->i_acl && !is_uncached_acl(inode->i_acl))
                posix_acl_release(inode->i_acl);
        if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
                posix_acl_release(inode->i_default_acl);
#endif
        this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);

/*
 * Tear down @inode and arrange for its memory to be freed.  The inode
 * must already be off the LRU.  After the generic teardown the
 * filesystem's ->destroy_inode is called if present; unless that was
 * the filesystem's only hook, the inode is then queued for freeing via
 * call_rcu() and i_callback().
 */
static void destroy_inode(struct inode *inode)
{
        const struct super_operations *ops = inode->i_sb->s_op;

        BUG_ON(!list_empty(&inode->i_lru));
        __destroy_inode(inode);

        if (ops->destroy_inode)
                ops->destroy_inode(inode);

        /*
         * ->destroy_inode without ->free_inode: nothing more to do here,
         * and @inode must not be touched again.
         */
        if (ops->destroy_inode && !ops->free_inode)
                return;

        inode->free_inode = ops->free_inode;
        call_rcu(&inode->i_rcu, i_callback);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  In cases
 * where we are attempting to track writes to the
 * filesystem, a decrement to zero means an imminent
 * write when the file is truncated and actually unlinked
 * on the filesystem.
 */
void drop_nlink(struct inode *inode)
{
        /* dropping a link that does not exist is a caller bug */
        WARN_ON(!inode->i_nlink);

        inode->__i_nlink--;
        if (inode->i_nlink)
                return;

        /* last link gone: count the inode in the sb's s_remove_count */
        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  See
 * drop_nlink() for why we care about i_nlink hitting zero.
 */
void clear_nlink(struct inode *inode)
{
        /* already zero: s_remove_count has been bumped before, do nothing */
        if (!inode->i_nlink)
                return;

        inode->__i_nlink = 0;
        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(clear_nlink);

/**
 * set_nlink - directly set an inode's link count
 * @inode: inode
 * @nlink: new nlink (should be non-zero)
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.
 */
void set_nlink(struct inode *inode, unsigned int nlink)
{
        /* setting to zero is exactly clear_nlink() */
        if (nlink == 0) {
                clear_nlink(inode);
                return;
        }

        /* Yes, some filesystems do change nlink from zero to one */
        if (!inode->i_nlink)
                atomic_long_dec(&inode->i_sb->s_remove_count);

        inode->__i_nlink = nlink;
}
EXPORT_SYMBOL(set_nlink);

/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  It is the
 * counterpart of drop_nlink(): when the count is raised
 * from zero the inode is taken back out of the superblock's
 * s_remove_count, and a warning is issued unless the inode
 * is marked I_LINKABLE.
 */
void inc_nlink(struct inode *inode)
{
        if (unlikely(!inode->i_nlink)) {
                WARN_ON((inode->i_state & I_LINKABLE) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

        inode->__i_nlink++;
}
EXPORT_SYMBOL(inc_nlink);

/*
 * Initialise the members of @mapping that are set up once rather than
 * on every inode_init_always(): the i_pages xarray, i_mmap_rwsem, the
 * private_list head with its private_lock, and the i_mmap tree root.
 * Does not zero @mapping; callers do that themselves.
 */
static void __address_space_init_once(struct address_space *mapping)
{
        xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
        init_rwsem(&mapping->i_mmap_rwsem);
        INIT_LIST_HEAD(&mapping->private_list);
        spin_lock_init(&mapping->private_lock);
        mapping->i_mmap = RB_ROOT_CACHED;
}

/*
 * Zero the whole of @mapping, then perform the one-time member
 * initialisation in __address_space_init_once().
 */
void address_space_init_once(struct address_space *mapping)
{
        memset(mapping, 0, sizeof(*mapping));
        __address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so let the slab be aware of that.
 *
 * Zeroes the whole inode, then sets up its list/hash linkage
 * members and the embedded i_data address_space.
 */
void inode_init_once(struct inode *inode)
{
        memset(inode, 0, sizeof(*inode));
        INIT_HLIST_NODE(&inode->i_hash);
        INIT_LIST_HEAD(&inode->i_devices);
        INIT_LIST_HEAD(&inode->i_io_list);
        INIT_LIST_HEAD(&inode->i_wb_list);
        INIT_LIST_HEAD(&inode->i_lru);
        /* the memset above already zeroed i_data, so skip the zeroing variant */
        __address_space_init_once(&inode->i_data);
        i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);

/*
 * void-pointer adapter around inode_init_once().
 * NOTE(review): presumably the slab constructor for inode_cachep; the
 * registration is not visible in this part of the file.
 */
static void init_once(void *foo)
{
        inode_init_once(foo);
}

/*
 * Take a reference on @inode by incrementing i_count.
 *
 * inode->i_lock must be held
 */
void __iget(struct inode *inode)
{
        atomic_inc(&inode->i_count);
}

/*
 * Take one more reference on an inode the caller already holds a
 * reference to.  Warns if the count after the increment is below 2,
 * i.e. the caller did not actually hold one.
 */
void ihold(struct inode *inode)
{
        int refs = atomic_inc_return(&inode->i_count);

        WARN_ON(refs < 2);
}
EXPORT_SYMBOL(ihold);

/*
 * Put @inode on its superblock's LRU.  When list_lru_add() reports
 * success the nr_unused counter is bumped; otherwise (presumably the
 * inode was already on the list) it is marked I_REFERENCED instead.
 */
static void inode_lru_list_add(struct inode *inode)
{
        struct list_lru *lru = &inode->i_sb->s_inode_lru;

        if (!list_lru_add(lru, &inode->i_lru)) {
                inode->i_state |= I_REFERENCED;
                return;
        }

        this_cpu_inc(nr_unused);
}

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * The inode is added only when it has none of the dirty, I_SYNC,
 * I_FREEING or I_WILL_FREE state bits, its reference count is zero
 * and its superblock is still SB_ACTIVE.
 *
 * Needs inode->i_lock held.
 */
void inode_add_lru(struct inode *inode)
{
        if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
                return;

        if (atomic_read(&inode->i_count))
                return;

        if (!(inode->i_sb->s_flags & SB_ACTIVE))
                return;

        inode_lru_list_add(inode);
}


/*
 * Take @inode off its superblock's LRU; nr_unused is decremented only
 * when list_lru_del() reports that it actually removed the entry.
 */
static void inode_lru_list_del(struct inode *inode)
{
        struct list_lru *lru = &inode->i_sb->s_inode_lru;

        if (list_lru_del(lru, &inode->i_lru))
                this_cpu_dec(nr_unused);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 */
void inode_sb_list_add(struct inode *inode)
{
        spin_lock(&inode->i_sb->s_inode_list_lock);
        list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
        spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

static inline void inode_sb_list_del(struct inode *inode)
{
        if (!list_empty(&inode->i_sb_list)) {
                spin_lock(&inode->i_sb->s_inode_list_lock);
                list_del_init(&inode->i_sb_list);
                spin_unlock(&inode->i_sb->s_inode_list_lock);
        }
}

/*
 * Map (@sb, @hashval) to a bucket index into inode_hashtable.  The
 * superblock address is mixed in so that equal @hashval values on
 * different superblocks spread over different buckets.
 */
static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
        unsigned long mixed, folded;

        /*
         * '/' binds tighter than '^': only the (GOLDEN_RATIO_PRIME +
         * hashval) term is divided by L1_CACHE_BYTES.
         */
        mixed = (hashval * (unsigned long)sb) ^
                ((GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES);
        folded = mixed ^ ((mixed ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);

        return folded & i_hash_mask;
}

/**
 *        __insert_inode_hash - hash an inode
 *        @inode: unhashed inode
 *        @hashval: unsigned long value used to locate this object in the
 *                inode_hashtable.
 *
 *        Add an inode to the inode hash for this superblock.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
        /* bucket is computed before any lock is taken */
        struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);

        /* lock order: inode_hash_lock, then inode->i_lock */
        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_add_head_rcu(&inode->i_hash, b);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);

/**
 *        __remove_inode_hash - remove an inode from the hash
 *        @inode: inode to unhash
 *
 *        Unlink the inode from the inode hash table and reinitialise
 *        its i_hash node.  Takes inode_hash_lock, then inode->i_lock.
 */
void __remove_inode_hash(struct inode *inode)
{
        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_del_init_rcu(&inode->i_hash);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

/*
 * Final sanity checks on an inode that is being freed, then mark it
 * I_FREEING | I_CLEAR.  BUGs if the mapping still has pages or
 * exceptional entries, if private_list or i_wb_list is non-empty, if
 * I_FREEING is not set, or if I_CLEAR is already set.
 */
void clear_inode(struct inode *inode)
{
        /*
         * We have to cycle the i_pages lock here because reclaim can be in the
         * process of removing the last page (in __delete_from_page_cache())
         * and we must not free the mapping under it.
         */
        xa_lock_irq(&inode->i_data.i_pages);
        BUG_ON(inode->i_data.nrpages);
        BUG_ON(inode->i_data.nrexceptional);
        xa_unlock_irq(&inode->i_data.i_pages);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
        BUG_ON(!list_empty(&inode->i_wb_list));
        /* don't need i_lock here, no concurrent mods to i_state */
        inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);

/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to. We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode). The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache. This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 */
static void evict(struct inode *inode)
{
        const struct super_operations *op = inode->i_sb->s_op;

        /* caller contract: already I_FREEING and already off the LRU */
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(!list_empty(&inode->i_lru));

        if (!list_empty(&inode->i_io_list))
                inode_io_list_del(inode);

        inode_sb_list_del(inode);

        /*
         * Wait for flusher thread to be done with the inode so that filesystem
         * does not start destroying it while writeback is still running. Since
         * the inode has I_FREEING set, flusher thread won't start new work on
         * the inode.  We just have to wait for running writeback to finish.
         */
        inode_wait_for_writeback(inode);

        /*
         * Filesystem-specific eviction if provided; otherwise the generic
         * default of dropping the page cache and calling clear_inode().
         */
        if (op->evict_inode) {
                op->evict_inode(inode);
        } else {
                truncate_inode_pages_final(&inode->i_data);
                clear_inode(inode);
        }
        if (S_ISBLK(inode->i_mode) && inode->i_bdev)
                bd_forget(inode);
        if (S_ISCHR(inode->i_mode) && inode->i_cdev)
                cd_forget(inode);

        remove_inode_hash(inode);

        /*
         * Wake anyone sleeping on the __I_NEW bit of i_state, and insist
         * that the eviction path left the state at exactly
         * I_FREEING | I_CLEAR.
         */
        spin_lock(&inode->i_lock);
        wake_up_bit(&inode->i_state, __I_NEW);
        BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
        spin_unlock(&inode->i_lock);

        destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * Dispose-list gets a local list with local inodes in it, so it doesn't
 * need to worry about list corruption and SMP locks.
 */
static void dispose_list(struct list_head *head)
{
        for (;;) {
                struct inode *victim;

                if (list_empty(head))
                        break;

                /* entries are linked through i_lru; unlink before evicting */
                victim = list_first_entry(head, struct inode, i_lru);
                list_del_init(&victim->i_lru);

                evict(victim);
                cond_resched();
        }
}

/**
 * evict_inodes        - evict all evictable inodes for a superblock
 * @sb:                superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained.  This is
 * called by superblock shutdown after having SB_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
 */
void evict_inodes(struct super_block *sb)
{
        struct inode *inode, *next;
        /* private list of inodes claimed for eviction, linked via i_lru */
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                /* checked without i_lock: inodes with references are skipped */
                if (atomic_read(&inode->i_count))
                        continue;

                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /* claim the inode: mark it I_FREEING and pull it off the LRU */
                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                list_add(&inode->i_lru, &dispose);

                /*
                 * We can have a ton of inodes to evict at unmount time given
                 * enough memory, check to see if we need to go to sleep for a
                 * bit so we don't livelock.
                 */
                if (need_resched()) {
                        spin_unlock(&sb->s_inode_list_lock);
                        cond_resched();
                        /* evict what has been collected, then rescan from the head */
                        dispose_list(&dispose);
                        goto again;
                }
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);

/**
 * invalidate_inodes        - attempt to free all inodes on a superblock
 * @sb:                superblock to operate on
 * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Attempts to free all inodes for a given superblock.  If there were any
 * busy inodes return a non-zero value, else zero.
 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
 * them as busy.
 */
int invalidate_inodes(struct super_block *sb, bool kill_dirty)
{
        /* set to 1 as soon as any inode has to be left behind */
        int busy = 0;
        struct inode *inode, *next;
        /* private list of inodes claimed for eviction, linked via i_lru */
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                spin_lock(&inode->i_lock);
                /* new or already-dying inodes are skipped and not counted as busy */
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* dirty inodes count as busy unless the caller asked to kill them */
                if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
                        spin_unlock(&inode->i_lock);
                        busy = 1;
                        continue;
                }
                /* still referenced */
                if (atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        busy = 1;
                        continue;
                }

                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                list_add(&inode->i_lru, &dispose);
                /* same reschedule-and-restart scheme as evict_inodes() */
                if (need_resched()) {
                        spin_unlock(&sb->s_inode_list_lock);
                        cond_resched();
                        dispose_list(&dispose);
                        goto again;
                }
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);

        return busy;
}

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  If the inode has metadata buffers attached to
 * mapping->private_list then try to remove them.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        /* @arg is the caller's list that collects inodes to be evicted */
        struct list_head *freeable = arg;
        struct inode        *inode = container_of(item, struct inode, i_lru);

        /*
         * we are inverting the lru lock/inode->i_lock here, so use a trylock.
         * If we fail to get the lock, just skip it.
         */
        if (!spin_trylock(&inode->i_lock))
                return LRU_SKIP;

        /*
         * Referenced inodes, and inodes with any state bit other than
         * I_REFERENCED set, cannot be reclaimed now.  Take them off the
         * LRU and out of the nr_unused count.
         */
        if (atomic_read(&inode->i_count) ||
            (inode->i_state & ~I_REFERENCED)) {
                list_lru_isolate(lru, &inode->i_lru);
                spin_unlock(&inode->i_lock);
                this_cpu_dec(nr_unused);
                return LRU_REMOVED;
        }

        /* recently referenced inodes get one more pass */
        if (inode->i_state & I_REFERENCED) {
                inode->i_state &= ~I_REFERENCED;
                spin_unlock(&inode->i_lock);
                return LRU_ROTATE;
        }

        /*
         * Buffers or page cache still attached: pin the inode, drop both
         * locks, try to strip them, then retake the LRU lock and ask the
         * walker to retry this entry.
         */
        if (inode_has_buffers(inode) || inode->i_data.nrpages) {
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(lru_lock);
                if (remove_inode_buffers(inode)) {
                        unsigned long reap;
                        reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
                        if (current_is_kswapd())
                                __count_vm_events(KSWAPD_INODESTEAL, reap);
                        else
                                __count_vm_events(PGINODESTEAL, reap);
                        if (current->reclaim_state)
                                current->reclaim_state->reclaimed_slab += reap;
                }
                iput(inode);
                spin_lock(lru_lock);
                return LRU_RETRY;
        }

        /* reclaimable: mark it I_FREEING and move it onto the caller's list */
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        list_lru_isolate_move(lru, &inode->i_lru, freeable);
        spin_unlock(&inode->i_lock);

        this_cpu_dec(nr_unused);
        return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a number of inodes
 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 * then are freed outside inode_lock by dispose_list().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(freeable);
        long freed;

        freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
                                     inode_lru_isolate, &freeable);
        dispose_list(&freeable);
        return freed;
}

static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Called with the inode lock held.
 * NOTE(review): each candidate's inode->i_lock is acquired below, so
 * "the inode lock" here presumably means inode_hash_lock - confirm
 * against the callers.
 *
 * Scan the hash chain @head for an inode that belongs to @sb and for
 * which @test(inode, @data) returns non-zero.
 *
 * Returns the inode with an extra reference taken, NULL if the chain
 * holds no match, or ERR_PTR(-ESTALE) if the match is marked I_CREATING.
 */
static struct inode *find_inode(struct super_block *sb,
                                struct hlist_head *head,
                                int (*test)(struct inode *, void *),
                                void *data)
{
        struct inode *inode = NULL;

repeat:
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_sb != sb)
                        continue;
                if (!test(inode, data))
                        continue;
                spin_lock(&inode->i_lock);
                /*
                 * The match is being torn down: wait for it, then restart
                 * the scan from the head of the chain.  i_lock is not
                 * released here, so __wait_on_freeing_inode() presumably
                 * drops it - TODO confirm.
                 */
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        return ERR_PTR(-ESTALE);
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                return inode;
        }
        return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 *
 * Instead of calling a match callback it compares i_ino and i_sb directly.
 * The callers visible in this file (iget_locked(), ilookup()) hold
 * inode_hash_lock around the call.
 *
 * Returns the matching inode with a reference taken through __iget(), NULL
 * when the chain holds no match, or ERR_PTR(-ESTALE) when the match still has
 * I_CREATING set.
 */
static struct inode *find_inode_fast(struct super_block *sb,
                                struct hlist_head *head, unsigned long ino)
{
        struct inode *inode = NULL;

repeat:
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_ino != ino)
                        continue;
                if (inode->i_sb != sb)
                        continue;
                /* i_state is examined under i_lock. */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        /*
                         * The match is on its way out: wait, then rescan the
                         * chain from the top.
                         * NOTE(review): i_lock is not released on this path,
                         * so __wait_on_freeing_inode() presumably drops it -
                         * confirm against its definition.
                         */
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        return ERR_PTR(-ESTALE);
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                return inode;
        }
        return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

/*
 * Hand out the next inode number from this CPU's private range. The per-cpu
 * slot is accessed between get_cpu_var() and put_cpu_var(). Never returns 0.
 */
unsigned int get_next_ino(void)
{
        unsigned int *cpu_last = &get_cpu_var(last_ino);
        unsigned int ino = *cpu_last;

#ifdef CONFIG_SMP
        /* Private range used up: reserve a fresh batch from the shared counter. */
        if (unlikely(!(ino & (LAST_INO_BATCH-1)))) {
                static atomic_t shared_last_ino;
                int batch_end = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

                ino = batch_end - LAST_INO_BATCH;
        }
#endif

        /* get_next_ino should not provide a 0 inode number */
        if (unlikely(++ino == 0))
                ino++;

        *cpu_last = ino;
        put_cpu_var(last_ino);
        return ino;
}
EXPORT_SYMBOL(get_next_ino);

/**
 *        new_inode_pseudo         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for given superblock.
 *        Inode wont be chained in superblock s_inodes list
 *        This means :
 *        - fs can't be unmount
 *        - quotas, fsnotify, writeback can't work
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
        struct inode *inode = alloc_inode(sb);

        if (inode) {
                spin_lock(&inode->i_lock);
                inode->i_state = 0;
                spin_unlock(&inode->i_lock);
                INIT_LIST_HEAD(&inode->i_sb_list);
        }
        return inode;
}

/**
 *        new_inode         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for given superblock. The default gfp_mask
 *        for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
 *        If HIGHMEM pages are unsuitable or it is known that pages allocated
 *        for the page cache are not reclaimable or migratable,
 *        mapping_set_gfp_mask() must be called with suitable flags on the
 *        newly created inode's mapping
 *
 */
struct inode *new_inode(struct super_block *sb)
{
        struct inode *inode;

        spin_lock_prefetch(&sb->s_inode_list_lock);

        inode = new_inode_pseudo(sb);
        if (inode)
                inode_sb_list_add(inode);
        return inode;
}
EXPORT_SYMBOL(new_inode);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Give a directory inode's i_rwsem the filesystem type's directory lock
 * class. Non-directories are left alone, and so is any i_rwsem whose class
 * is no longer the filesystem type's default i_mutex_key.
 */
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
        struct file_system_type *type;

        if (!S_ISDIR(inode->i_mode))
                return;

        type = inode->i_sb->s_type;

        /* Set new key only if filesystem hasn't already changed it */
        if (!lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key))
                return;

        /*
         * The rwsem is re-initialised before being re-classed; nothing here
         * verifies that it is currently unheld.
         */
        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem,
                          &type->i_mutex_dir_key);
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif

/**
 * unlock_new_inode - clear the I_NEW state and wake up any waiters
 * @inode:        new inode to unlock
 *
 * Called when the inode is fully initialised to clear the new state of the
 * inode and wake up anyone waiting for the inode to finish initialisation.
 * I_CREATING is cleared together with I_NEW. Warns if I_NEW was not set.
 */
void unlock_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW & ~I_CREATING;
        /*
         * Full barrier between clearing the bits and the wake-up.
         * NOTE(review): presumably the ordering wake_up_bit() requires
         * against the waiter's check of the bit - confirm.
         */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);

/*
 * discard_new_inode - clear I_NEW, wake up any waiters and drop a reference
 * @inode:        new inode being abandoned
 *
 * Same sequence as unlock_new_inode() except that only I_NEW is cleared
 * (I_CREATING is left as it is) and the inode is passed to iput() afterwards.
 * Warns if I_NEW was not set.
 */
void discard_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW;
        /* Full barrier between clearing the bit and the wake-up. */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
        /* iput() is called only after i_lock has been released. */
        iput(inode);
}
EXPORT_SYMBOL(discard_new_inode);

/**
 * lock_two_inodes - lock two inodes (may be regular files but also dirs)
 *
 * Lock any non-NULL argument. The caller must make sure that if he is passing
 * in two directories, one is not ancestor of the other.  Zero, one or two
 * objects may be locked by this function. If both arguments refer to the same
 * inode it is locked only once.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 * @subclass1: inode lock subclass for the first lock obtained
 * @subclass2: inode lock subclass for the second lock obtained
 */
void lock_two_inodes(struct inode *inode1, struct inode *inode2,
                     unsigned subclass1, unsigned subclass2)
{
        if (!inode1 || !inode2) {
                /*
                 * Make sure @subclass1 will be used for the acquired lock.
                 * This is not strictly necessary (no current caller cares) but
                 * let's keep things consistent.
                 */
                if (!inode1)
                        swap(inode1, inode2);
                goto lock;
        }

        /*
         * If one object is directory and the other is not, we must make sure
         * to lock directory first as the other object may be its child.
         * When both are of the same kind, the lower address is locked first.
         */
        if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) {
                if (inode1 > inode2)
                        swap(inode1, inode2);
        } else if (!S_ISDIR(inode1->i_mode))
                swap(inode1, inode2);
lock:
        /* From here on inode1 is the first lock taken and inode2 the second. */
        if (inode1)
                inode_lock_nested(inode1, subclass1);
        if (inode2 && inode2 != inode1)
                inode_lock_nested(inode2, subclass2);
}

/**
 * lock_two_nondirectories - take two i_mutexes on non-directory objects
 *
 * Lock any non-NULL argument that is not a directory, lower address first.
 * Zero, one or two objects may be locked by this function; if both arguments
 * are the same inode it is locked once.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        struct inode *first = inode1;
        struct inode *second = inode2;

        /* Order the pair by address so every caller locks in the same order. */
        if (first > second) {
                first = inode2;
                second = inode1;
        }

        if (first && !S_ISDIR(first->i_mode))
                inode_lock(first);

        if (second && second != first && !S_ISDIR(second->i_mode))
                inode_lock_nested(second, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        if (inode1 && !S_ISDIR(inode1->i_mode))
                inode_unlock(inode1);
        if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
                inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * inode_insert5 - obtain an inode from a mounted file system
 * @inode:        pre-allocated inode to use for insert to cache
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if it is present, return it with an increased reference count. This is
 * a variant of iget5_locked() for callers that don't want to fail on memory
 * allocation of inode.
 *
 * If the inode is not in cache, insert the pre-allocated inode to cache and
 * return it locked, hashed, and with the I_NEW flag set. The file system gets
 * to fill it in before unlocking it via unlock_new_inode().
 *
 * Returns NULL when the cached match is still being created (find_inode()
 * reported an error pointer) or when @set returns non-zero.
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                            int (*test)(struct inode *, void *),
                            int (*set)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
        struct inode *old;
        /* Sampled once up front, before any lock is taken. */
        bool creating = inode->i_state & I_CREATING;

again:
        spin_lock(&inode_hash_lock);
        old = find_inode(inode->i_sb, head, test, data);
        if (unlikely(old)) {
                /*
                 * Uhhuh, somebody else created the same inode under us.
                 * Use the old inode instead of the preallocated one.
                 */
                spin_unlock(&inode_hash_lock);
                if (IS_ERR(old))
                        return NULL;
                wait_on_inode(old);
                /* It was unhashed while we waited: drop it and look again. */
                if (unlikely(inode_unhashed(old))) {
                        iput(old);
                        goto again;
                }
                return old;
        }

        /* @set is optional; a non-zero return aborts the insert. */
        if (set && unlikely(set(inode, data))) {
                inode = NULL;
                goto unlock;
        }

        /*
         * Return the locked inode with I_NEW set, the
         * caller is responsible for filling in the contents
         */
        spin_lock(&inode->i_lock);
        inode->i_state |= I_NEW;
        hlist_add_head_rcu(&inode->i_hash, head);
        spin_unlock(&inode->i_lock);
        /* Inodes that arrived with I_CREATING set are not added to the sb list here. */
        if (!creating)
                inode_sb_list_add(inode);
unlock:
        spin_unlock(&inode_hash_lock);

        return inode;
}
EXPORT_SYMBOL(inode_insert5);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present it is return it with an increased reference count. This is
 * a generalized version of iget_locked() for file systems where the inode
 * number is not sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *), void *data)
{
        struct inode *inode = ilookup5(sb, hashval, test, data);

        if (!inode) {
                struct inode *new = alloc_inode(sb);

                if (new) {
                        new->i_state = 0;
                        inode = inode_insert5(new, hashval, test, set, data);
                        if (unlikely(inode != new))
                                destroy_inode(new);
                }
        }
        return inode;
}
EXPORT_SYMBOL(iget5_locked);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @ino:        inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count. This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set.  The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Returns NULL if the allocation fails or if the cached inode is still being
 * created (find_inode_fast() reported an error pointer).
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;
again:
        /* First pass: look the inode up in the cache. */
        spin_lock(&inode_hash_lock);
        inode = find_inode_fast(sb, head, ino);
        spin_unlock(&inode_hash_lock);
        if (inode) {
                if (IS_ERR(inode))
                        return NULL;
                wait_on_inode(inode);
                /* It was unhashed while we waited: drop it and start over. */
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
                return inode;
        }

        /* Not cached: allocate with the hash lock dropped. */
        inode = alloc_inode(sb);
        if (inode) {
                struct inode *old;

                spin_lock(&inode_hash_lock);
                /* We released the lock, so.. */
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        inode->i_ino = ino;
                        spin_lock(&inode->i_lock);
                        inode->i_state = I_NEW;
                        hlist_add_head_rcu(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        inode_sb_list_add(inode);
                        spin_unlock(&inode_hash_lock);

                        /* Return the locked inode with I_NEW set, the
                         * caller is responsible for filling in the contents
                         */
                        return inode;
                }

                /*
                 * Uhhuh, somebody else created the same inode under
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
                spin_unlock(&inode_hash_lock);
                destroy_inode(inode);
                if (IS_ERR(old))
                        return NULL;
                inode = old;
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        /* Either the existing inode, or NULL if alloc_inode() failed. */
        return inode;
}

/*
 * Walk the hash chain for (@sb, @ino) looking for an inode that already
 * carries this number on this superblock. A hit means the number is taken
 * and must not be handed out again.
 *
 * Returns 1 if the inode number is unique, 0 if it is not.
 */
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *cur;

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_ino != ino || cur->i_sb != sb)
                        continue;
                return 0;
        }

        return 1;
}

/**
 *        iunique - get a unique inode number
 *        @sb: superblock
 *        @max_reserved: highest reserved inode number
 *
 *        Obtain an inode number that is unique on the system for a given
 *        superblock. This is used by file systems that have no natural
 *        permanent inode numbering system. An inode number is returned that
 *        is higher than the reserved limit but unique.
 *
 *        Candidates come from a single static counter that is advanced under
 *        a spinlock until test_inode_iunique() accepts one.
 *
 *        BUGS:
 *        With a large number of inodes live on the file system this function
 *        currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
        /*
         * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
         * error if st_ino won't fit in target struct field. Use 32bit counter
         * here to attempt to avoid that.
         */
        static DEFINE_SPINLOCK(iunique_lock);
        static unsigned int counter;
        ino_t candidate;

        rcu_read_lock();
        spin_lock(&iunique_lock);
        for (;;) {
                /* Never hand out a number inside the reserved range. */
                if (counter <= max_reserved)
                        counter = max_reserved + 1;
                candidate = counter++;
                if (test_inode_iunique(sb, candidate))
                        break;
        }
        spin_unlock(&iunique_lock);
        rcu_read_unlock();

        return candidate;
}
EXPORT_SYMBOL(iunique);

/*
 * igrab - take a reference on an inode unless it is being freed
 * @inode: inode to grab
 *
 * Returns @inode with a reference taken through __iget(), or NULL when
 * I_FREEING or I_WILL_FREE is set in its i_state.
 */
struct inode *igrab(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                /*
                 * Handle the case where s_op->clear_inode has not been
                 * called yet, and somebody is calling igrab
                 * while the inode is getting freed.
                 */
                spin_unlock(&inode->i_lock);
                return NULL;
        }

        __iget(inode);
        spin_unlock(&inode->i_lock);
        return inode;
}
EXPORT_SYMBOL(igrab);

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache.
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count. An error pointer from find_inode() is reported as NULL.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode.  You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *found;

        spin_lock(&inode_hash_lock);
        found = find_inode(sb, bucket, test, data);
        spin_unlock(&inode_hash_lock);

        if (IS_ERR(found))
                return NULL;
        return found;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return the inode with an incremented
 * reference count.  Waits on I_NEW before returning the inode.
 * returned with an incremented reference count.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *inode;
again:
        inode = ilookup5_nowait(sb, hashval, test, data);
        if (inode) {
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @ino:        inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count. The
 * inode is waited on first; if it is unhashed after the wait, it is dropped
 * and the lookup is repeated. Returns NULL when nothing is cached or when
 * find_inode_fast() reports an error pointer.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);

        for (;;) {
                struct inode *inode;

                spin_lock(&inode_hash_lock);
                inode = find_inode_fast(sb, bucket, ino);
                spin_unlock(&inode_hash_lock);

                if (!inode || IS_ERR(inode))
                        return NULL;

                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        continue;
                }
                return inode;
        }
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @match:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped.  The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1.  It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is an even more generalized version of ilookup5() for when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller can not increment
 * the reference count because the resulting iput() might cause an
 * inode eviction.  The tradeoff is that the @match function must be
 * very carefully implemented.
 */
struct inode *find_inode_nowait(struct super_block *sb,
                                unsigned long hashval,
                                int (*match)(struct inode *, unsigned long,
                                             void *),
                                void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *inode, *found = NULL;

        spin_lock(&inode_hash_lock);
        hlist_for_each_entry(inode, bucket, i_hash) {
                int verdict;

                if (inode->i_sb != sb)
                        continue;

                verdict = match(inode, hashval, data);
                if (!verdict)
                        continue;

                /* Any non-zero verdict ends the walk; only 1 yields a result. */
                if (verdict == 1)
                        found = inode;
                break;
        }
        spin_unlock(&inode_hash_lock);

        return found;
}
EXPORT_SYMBOL(find_inode_nowait);

/**
 * find_inode_rcu - find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @hashval:        Key to hash
 * @test:        Function to test match on an inode
 * @data:        Data for test function
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * where the helper function @test will return 0 if the inode does not match
 * and 1 if it does.  The @test function must be responsible for taking the
 * i_lock spin_lock and checking i_state for an inode being freed or being
 * initialized. Inodes whose locklessly sampled i_state already shows
 * I_FREEING or I_WILL_FREE are skipped before @test is consulted.
 *
 * If successful, this will return the inode for which the @test function
 * returned 1 and NULL otherwise.
 *
 * The @test function is not permitted to take a ref on any inode presented.
 * It is also not permitted to sleep.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
                             int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *inode;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_rcu() usage");

        hlist_for_each_entry_rcu(inode, bucket, i_hash) {
                if (inode->i_sb != sb)
                        continue;
                if (READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))
                        continue;
                if (test(inode, data))
                        return inode;
        }

        return NULL;
}
EXPORT_SYMBOL(find_inode_rcu);

/**
 * find_inode_by_ino_rcu - Find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @ino:        The inode number to match
 *
 * Search the inode cache for an inode that belongs to @sb and whose inode
 * number is @ino. Inodes whose i_state has I_FREEING or I_WILL_FREE set are
 * skipped; i_state is sampled locklessly with READ_ONCE(), so the check is
 * only a snapshot.
 *
 * If successful, this will return the matching inode and NULL otherwise.
 * No reference is taken on the returned inode.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
                                    unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_by_ino_rcu() usage");

        hlist_for_each_entry_rcu(inode, head, i_hash) {
                if (inode->i_ino == ino &&
                    inode->i_sb == sb &&
                    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
                        return inode;
        }
        return NULL;
}
EXPORT_SYMBOL(find_inode_by_ino_rcu);

/*
 * insert_inode_locked - hash a new inode under its own i_ino
 * @inode: inode to insert; i_sb and i_ino must already be set
 *
 * Returns 0 once @inode has been added to the hash with I_NEW | I_CREATING
 * set. Returns -EBUSY when a live inode with the same number on the same
 * superblock is itself I_CREATING, or is still hashed after it has been
 * waited on. If that inode turns out to be unhashed after the wait, the
 * search is repeated.
 */
int insert_inode_locked(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        ino_t ino = inode->i_ino;
        struct hlist_head *head = inode_hashtable + hash(sb, ino);

        while (1) {
                struct inode *old = NULL;
                spin_lock(&inode_hash_lock);
                hlist_for_each_entry(old, head, i_hash) {
                        if (old->i_ino != ino)
                                continue;
                        if (old->i_sb != sb)
                                continue;
                        spin_lock(&old->i_lock);
                        /* Inodes already being freed do not count as a clash. */
                        if (old->i_state & (I_FREEING|I_WILL_FREE)) {
                                spin_unlock(&old->i_lock);
                                continue;
                        }
                        /* Leave the loop with old->i_lock still held. */
                        break;
                }
                /* old is NULL here when the walk ended without a break. */
                if (likely(!old)) {
                        spin_lock(&inode->i_lock);
                        inode->i_state |= I_NEW | I_CREATING;
                        hlist_add_head_rcu(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return 0;
                }
                if (unlikely(old->i_state & I_CREATING)) {
                        spin_unlock(&old->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return -EBUSY;
                }
                /* Pin the clashing inode, then wait on it with both locks dropped. */
                __iget(old);
                spin_unlock(&old->i_lock);
                spin_unlock(&inode_hash_lock);
                wait_on_inode(old);
                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
                /* It was unhashed in the meantime: drop it and search again. */
                iput(old);
        }
}
EXPORT_SYMBOL(insert_inode_locked);

/*
 * insert_inode_locked4 - hash a new inode using a caller-supplied match test
 * @inode:   inode to insert
 * @hashval: hash value to insert under
 * @test:    callback used for comparisons between inodes
 * @data:    opaque data pointer to pass to @test
 *
 * Marks @inode I_CREATING and hands it to inode_insert5() without a @set
 * callback. Returns 0 if @inode itself was inserted. Otherwise whatever
 * inode_insert5() returned is passed to iput() and -EBUSY is returned.
 */
int insert_inode_locked4(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *winner;

        inode->i_state |= I_CREATING;
        winner = inode_insert5(inode, hashval, test, NULL, data);
        if (winner == inode)
                return 0;

        iput(winner);
        return -EBUSY;
}
EXPORT_SYMBOL(insert_inode_locked4);


/*
 * generic_delete_inode - unconditionally return 1
 * @inode: unused
 *
 * NOTE(review): the signature matches the ->drop_inode hook called from
 * iput_final(), where a non-zero result leads to eviction; presumably this
 * is meant to be installed as that hook - confirm against its users.
 */
int generic_delete_inode(struct inode *inode)
{
        return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict inode, do so.  Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 *
 * Entered with inode->i_lock held (iput() gets here through
 * atomic_dec_and_lock()); the lock is released on every path.
 */
static void iput_final(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        const struct super_operations *op = inode->i_sb->s_op;
        unsigned long state;
        int drop;

        WARN_ON(inode->i_state & I_NEW);

        if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);

        /*
         * Keep the inode cached: the fs did not ask for a drop, caching is
         * not disabled for this inode and the superblock is still active.
         */
        if (!drop &&
            !(inode->i_state & I_DONTCACHE) &&
            (sb->s_flags & SB_ACTIVE)) {
                inode_add_lru(inode);
                spin_unlock(&inode->i_lock);
                return;
        }

        state = inode->i_state;
        if (!drop) {
                /*
                 * Not dropped by the fs but not cacheable either: write the
                 * inode out first. I_WILL_FREE stays set while i_lock is
                 * released for the writeback.
                 */
                WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
                spin_unlock(&inode->i_lock);

                write_inode_now(inode, 1);

                spin_lock(&inode->i_lock);
                /* Re-read the state: the lock was dropped in between. */
                state = inode->i_state;
                WARN_ON(state & I_NEW);
                state &= ~I_WILL_FREE;
        }

        /* Mark the inode as being freed and take it off the LRU before evicting. */
        WRITE_ONCE(inode->i_state, state | I_FREEING);
        if (!list_empty(&inode->i_lru))
                inode_lru_list_del(inode);
        spin_unlock(&inode->i_lock);

        evict(inode);
}

/**
 *        iput        - put an inode
 *        @inode: inode to put
 *
 *        Puts an inode, dropping its usage count. If the inode use count hits
 *        zero, the inode is then freed and may also be destroyed.
 *        A NULL @inode is accepted and ignored.
 *
 *        Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
        if (!inode)
                return;
        BUG_ON(inode->i_state & I_CLEAR);
retry:
        /* The branch is taken only for the final reference, with i_lock then held. */
        if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
                if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
                        /*
                         * Still linked and carrying I_DIRTY_TIME: take the
                         * reference back, mark the inode dirty with the lock
                         * released, then retry the final put.
                         */
                        atomic_inc(&inode->i_count);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_lazytime_iput(inode);
                        mark_inode_dirty_sync(inode);
                        goto retry;
                }
                /* iput_final() releases i_lock. */
                iput_final(inode);
        }
}
EXPORT_SYMBOL(iput);

#ifdef CONFIG_BLOCK
/**
 *        bmap        - find a block number in a file
 *        @inode:  inode owning the block number being requested
 *        @block: pointer containing the block to find
 *
 *        Replaces the value in ``*block`` with the block number on the device
 *        that corresponds to the requested block number in the file.
 *        That is, asked for block 4 of inode 1 the function will replace the
 *        4 in ``*block`` with the disk block, relative to the disk start, that
 *        holds that block of the file.
 *
 *        Returns -EINVAL if the mapping's address space operations provide no
 *        bmap method, 0 otherwise. If the mapping falls into a hole, returns 0
 *        and ``*block`` is also set to 0.
 */
int bmap(struct inode *inode, sector_t *block)
{
        struct address_space *mapping = inode->i_mapping;

        if (!mapping->a_ops->bmap)
                return -EINVAL;

        *block = mapping->a_ops->bmap(mapping, *block);
        return 0;
}
EXPORT_SYMBOL(bmap);
#endif

/*
 * Relative atime policy.  Returns 1 when atime should be updated and 0
 * when the update can be skipped.
 *
 * On a mount without MNT_RELATIME the answer is always 1.  Otherwise
 * atime is only updated if it is not newer than mtime or ctime, or if
 * at least a day has passed since the last atime update.
 */
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
                             struct timespec64 now)
{
        /* relatime is not in effect for this mount */
        if (!(mnt->mnt_flags & MNT_RELATIME))
                return 1;

        /* mtime or ctime is not older than the recorded atime */
        if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0 ||
            timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
                return 1;

        /* update only once the recorded atime is a day (or more) old */
        return (long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60;
}

/*
 * Apply the timestamp/version changes selected by @flags to @inode and
 * mark the inode dirty.
 *
 * S_ATIME, S_CTIME and S_MTIME copy *@time into the matching inode
 * field; S_VERSION asks inode_maybe_inc_iversion() to bump the version.
 * The inode is always marked with I_DIRTY_TIME.  I_DIRTY_SYNC is added
 * when the version bump reported true, or when any timestamp was
 * requested and the superblock does not have SB_LAZYTIME set.
 *
 * Always returns 0.
 */
int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
        bool need_sync = false;

        if (flags & S_ATIME)
                inode->i_atime = *time;
        if (flags & S_VERSION)
                need_sync = inode_maybe_inc_iversion(inode, false);
        if (flags & S_CTIME)
                inode->i_ctime = *time;
        if (flags & S_MTIME)
                inode->i_mtime = *time;

        /* Without lazytime, a timestamp change alone forces a sync. */
        if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
            !(inode->i_sb->s_flags & SB_LAZYTIME))
                need_sync = true;

        __mark_inode_dirty(inode, need_sync ? (I_DIRTY_TIME | I_DIRTY_SYNC)
                                            : I_DIRTY_TIME);
        return 0;
}
EXPORT_SYMBOL(generic_update_time);

/*
 * Does the actual work of updating an inode's times or version: the
 * filesystem's ->update_time() operation is used when one is provided,
 * generic_update_time() otherwise.  mnt_want_write() must have been
 * called before calling this.
 */
int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
        if (!inode->i_op->update_time)
                return generic_update_time(inode, time, flags);

        return inode->i_op->update_time(inode, time, flags);
}
EXPORT_SYMBOL(inode_update_time);

/**
 *        atime_needs_update        -        decide whether atime should be updated
 *        @path: the &struct path through which the inode is accessed
 *        @inode: inode whose access time would be updated
 *
 *        touch_atime() calls this before attempting an update.  No update
 *        is wanted when the inode is flagged S_NOATIME, has an unmapped id
 *        or is IS_NOATIME(); when the inode is a directory and either the
 *        superblock (SB_NODIRATIME) or the mount (MNT_NODIRATIME) disables
 *        directory atime; when the mount is MNT_NOATIME; when the relatime
 *        policy allows skipping; or when atime already equals the current
 *        time.
 *
 *        Returns true if atime should be updated, false otherwise.
 */
bool atime_needs_update(const struct path *path, struct inode *inode)
{
        struct vfsmount *mnt = path->mnt;
        struct timespec64 now;

        /*
         * Per-inode opt-outs.  Regarding unmapped ids: atime updates
         * will likely cause i_uid and i_gid to be written back
         * improperly if their true value is unknown to the vfs.
         */
        if ((inode->i_flags & S_NOATIME) || HAS_UNMAPPED_ID(inode) ||
            IS_NOATIME(inode))
                return false;

        /* Superblock-level opt-out for directories. */
        if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
                return false;

        /* Mount-level opt-outs. */
        if ((mnt->mnt_flags & MNT_NOATIME) ||
            ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
                return false;

        now = current_time(inode);
        if (!relatime_need_update(mnt, inode, now))
                return false;

        /* Nothing to do when the stored atime is already "now". */
        return !timespec64_equal(&inode->i_atime, &now);
}

/*
 * Update the access time of the inode behind @path, if
 * atime_needs_update() says an update is wanted and write access to
 * both the superblock and the mount can be obtained without blocking
 * on the superblock.  All failures are silently ignored.
 */
void touch_atime(const struct path *path)
{
        struct inode *inode = d_inode(path->dentry);
        struct vfsmount *mnt = path->mnt;
        struct timespec64 now;

        if (!atime_needs_update(path, inode))
                return;

        if (!sb_start_write_trylock(inode->i_sb))
                return;

        if (__mnt_want_write(mnt) == 0) {
                /*
                 * File systems can error out when updating inodes if
                 * they need to allocate new space to modify an inode
                 * (such is the case for Btrfs), but since we touch
                 * atime while walking down the path we really don't
                 * care if we failed to update the atime of the file,
                 * so the return value is ignored.  We may also fail on
                 * filesystems that have the ability to make parts of
                 * the fs read only, e.g. subvolumes in Btrfs.
                 */
                now = current_time(inode);
                inode_update_time(inode, &now, S_ATIME);
                __mnt_drop_write(mnt);
        }

        sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);

/*
 * Compute the mask of changes for notify_change() that a write or
 * truncate must trigger on @dentry.
 *
 * Returns 0 if nothing has to be changed, a positive mask otherwise,
 * or a negative value on error (the change should be denied).
 */
int dentry_needs_remove_privs(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        int kill_mask;
        int lsm;

        if (IS_NOSEC(inode))
                return 0;

        kill_mask = setattr_should_drop_suidgid(inode);

        lsm = security_inode_need_killpriv(dentry);
        if (lsm < 0)
                return lsm;

        /* A positive answer from the security hook adds ATTR_KILL_PRIV. */
        return lsm ? (kill_mask | ATTR_KILL_PRIV) : kill_mask;
}

static int __remove_privs(struct dentry *dentry, int kill)
{
        struct iattr newattrs;

        newattrs.ia_valid = ATTR_FORCE | kill;
        /*
         * Note we call this on write, so notify_change will not
         * encounter any conflicting delegations:
         */
        return notify_change(dentry, &newattrs, NULL);
}

/*
 * Remove special file privileges (suid, capabilities) when a file is
 * written to or truncated.
 *
 * Returns 0 on success or when there is nothing to do, a negative
 * value if computing or applying the removal failed.
 */
int file_remove_privs(struct file *file)
{
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = file_inode(file);
        int mask;
        int err;

        /*
         * Fast path: nothing security related on this inode, or not a
         * regular file at all (e.g. blkdev inodes).  blkdev_write_iter()
         * for example might get here trying to remove privs, which it
         * is not allowed to.
         */
        if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
                return 0;

        mask = dentry_needs_remove_privs(dentry);
        if (mask < 0)
                return mask;

        err = mask ? __remove_privs(dentry, mask) : 0;
        if (err)
                return err;

        inode_has_no_xattr(inode);
        return 0;
}
EXPORT_SYMBOL(file_remove_privs);

/**
 *        file_update_time        -        update mtime and ctime
 *        @file: file accessed
 *
 *        Bring the mtime and ctime of the file's inode up to the current
 *        time and mark the inode for writeback.  This is meant exclusively
 *        for the file write path of filesystems.  A filesystem may opt out
 *        with the S_NOCMTIME inode flag, e.g. a network filesystem whose
 *        timestamps are handled by the server.
 *
 *        Returns 0 when nothing needed updating or write access to the
 *        mount could not be obtained; otherwise the result of the update,
 *        which can be an error for file systems that need to allocate
 *        space in order to update an inode.
 */

int file_update_time(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct timespec64 now;
        int pending = 0;
        int err;

        /* First try to exhaust all avenues to not sync */
        if (IS_NOCMTIME(inode))
                return 0;

        now = current_time(inode);

        if (!timespec64_equal(&inode->i_mtime, &now))
                pending |= S_MTIME;
        if (!timespec64_equal(&inode->i_ctime, &now))
                pending |= S_CTIME;
        if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
                pending |= S_VERSION;

        if (pending == 0)
                return 0;

        /* Finally allowed to write? Takes lock. */
        if (__mnt_want_write_file(file))
                return 0;

        err = inode_update_time(inode, &now, pending);
        __mnt_drop_write_file(file);

        return err;
}
EXPORT_SYMBOL(file_update_time);

/*
 * Common bookkeeping for a file that is about to be modified: strip
 * privileges, then update the timestamps unless the file was opened
 * with FMODE_NOCMTIME.
 *
 * Caller must hold the file's inode lock.
 */
int file_modified(struct file *file)
{
        /*
         * Clear the security bits if the process is not being run by
         * root.  This keeps people from modifying setuid and setgid
         * binaries.
         */
        int err = file_remove_privs(file);

        if (err)
                return err;

        if (unlikely(file->f_mode & FMODE_NOCMTIME))
                return 0;

        return file_update_time(file);
}
EXPORT_SYMBOL(file_modified);

/*
 * Returns 1 if @inode is IS_SYNC(), or is a directory that is
 * IS_DIRSYNC(); 0 otherwise.
 */
int inode_needs_sync(struct inode *inode)
{
        return IS_SYNC(inode) ||
               (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode));
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found.  This function waits
 * until the deletion _might_ have completed.  Callers are responsible
 * for rechecking the inode state.
 *
 * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
 *
 * Locking: both inode->i_lock and inode_hash_lock are released here, so
 * the caller is expected to hold them on entry.  Only inode_hash_lock
 * is re-taken before returning; i_lock stays dropped.
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
        wait_queue_head_t *wq;
        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
        wq = bit_waitqueue(&inode->i_state, __I_NEW);
        /* Get on the wait queue before the locks are dropped. */
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
        /* Sleep once; there is no re-check loop, hence "_might_" above. */
        schedule();
        finish_wait(wq, &wait.wq_entry);
        spin_lock(&inode_hash_lock);
}

/* Value of the "ihash_entries=" boot parameter; 0 if it was not given. */
static __initdata unsigned long ihash_entries;

/*
 * Handler for the "ihash_entries=" boot parameter.  Returns 0 when no
 * string is supplied, 1 once the value has been parsed.
 */
static int __init set_ihash_entries(char *str)
{
        if (str) {
                ihash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }

        return 0;
}
__setup("ihash_entries=", set_ihash_entries);

/*
 * Early initialization of the inode hash table.
 *
 * If hashes are distributed across NUMA nodes (hashdist), nothing is
 * done here: the allocation is deferred to inode_init(), when vmalloc
 * space is available.
 */
void __init inode_init_early(void)
{
        if (!hashdist)
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries,
                                                14,
                                                HASH_EARLY | HASH_ZERO,
                                                &i_hash_shift,
                                                &i_hash_mask,
                                                0,
                                                0);
}

/*
 * Create the inode slab cache and, when hashes are distributed
 * (hashdist), allocate the inode hash table that inode_init_early()
 * left for later.
 */
void __init inode_init(void)
{
        /* inode slab cache */
        inode_cachep = kmem_cache_create("inode_cache",
                                         sizeof(struct inode),
                                         0,
                                         (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
                                         SLAB_MEM_SPREAD|SLAB_ACCOUNT),
                                         init_once);

        /* Without hashdist the hash was already set up in inode_init_early */
        if (hashdist)
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries,
                                                14,
                                                HASH_ZERO,
                                                &i_hash_shift,
                                                &i_hash_mask,
                                                0,
                                                0);
}

/*
 * Set up @inode as a special file of type @mode.
 *
 * i_mode is always set.  Character and block devices get their default
 * file operations and @rdev; FIFOs get pipefifo_fops; sockets keep the
 * file operations they already have.  Any other mode is reported with
 * a debug message.
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
        inode->i_mode = mode;

        if (S_ISCHR(mode) || S_ISBLK(mode)) {
                inode->i_fop = S_ISCHR(mode) ? &def_chr_fops : &def_blk_fops;
                inode->i_rdev = rdev;
                return;
        }

        if (S_ISFIFO(mode)) {
                inode->i_fop = &pipefifo_fops;
                return;
        }

        if (S_ISSOCK(mode))
                return;        /* leave it no_open_fops */

        printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
                          " inode %s:%lu\n", mode, inode->i_sb->s_id,
                          inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);

/**
 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
 * @inode: New inode
 * @dir: Directory inode (may be NULL)
 * @mode: mode of the new inode
 *
 * The owner is the caller's fsuid.  The group is inherited from @dir
 * when @dir has S_ISGID set, and is the caller's fsgid otherwise.  A
 * new directory created inside a setgid directory also gets S_ISGID.
 */
void inode_init_owner(struct inode *inode, const struct inode *dir,
                        umode_t mode)
{
        inode->i_uid = current_fsuid();

        if (!dir || !(dir->i_mode & S_ISGID)) {
                inode->i_gid = current_fsgid();
        } else {
                inode->i_gid = dir->i_gid;

                /* Directories are special, and always inherit S_ISGID */
                if (S_ISDIR(mode))
                        mode |= S_ISGID;
        }

        inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);

/**
 * inode_owner_or_capable - check current task permissions to inode
 * @inode: inode being checked
 *
 * Return true if current owns the file (its fsuid equals the inode's
 * uid), or has CAP_FOWNER in its user namespace and the inode owner
 * uid is mapped in that namespace.
 */
bool inode_owner_or_capable(const struct inode *inode)
{
        struct user_namespace *ns;

        /* Owner check first; the capability is only consulted on a miss. */
        if (uid_eq(current_fsuid(), inode->i_uid))
                return true;

        ns = current_user_ns();

        return kuid_has_mapping(ns, inode->i_uid) &&
               ns_capable(ns, CAP_FOWNER);
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */
/*
 * Sleep in TASK_UNINTERRUPTIBLE state until inode->i_dio_count reads as
 * zero.  The wait queue is the bit wait queue for __I_DIO_WAKEUP in
 * inode->i_state; whoever drops the count to zero is presumably
 * expected to issue a wakeup on that bit -- confirm against the code
 * that decrements i_dio_count.
 */
static void __inode_dio_wait(struct inode *inode)
{
        wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
        DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

        do {
                /*
                 * Queue up first, then look at the counter, and only
                 * sleep if it is still non-zero.
                 */
                prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&inode->i_dio_count))
                        schedule();
        } while (atomic_read(&inode->i_dio_count));
        finish_wait(wq, &q.wq_entry);
}

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Blocks until every pending direct I/O request on @inode has
 * finished, so that a truncate or equivalent operation can proceed.
 * Returns immediately when i_dio_count is already zero.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
        if (!atomic_read(&inode->i_dio_count))
                return;

        __inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);

/*
 * inode_set_flags - atomically set some inode flags
 *
 * Updates the bits of inode->i_flags selected by @mask to the values
 * given in @flags.  @flags must not contain bits outside @mask; a
 * one-time warning is issued if it does.
 *
 * Note: the caller should be holding i_mutex, or else be sure that
 * they have exclusive access to the inode structure (i.e., while the
 * inode is being instantiated).  The reason for the cmpxchg() loop
 * --- which wouldn't be necessary if all code paths which modify
 * i_flags actually followed this rule, is that there is at least one
 * code path which doesn't today so we use cmpxchg() out of an abundance
 * of caution.  (The loop is not visible here; presumably it lives
 * inside set_mask_bits() -- confirm.)
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
 */
void inode_set_flags(struct inode *inode, unsigned int flags,
                     unsigned int mask)
{
        /* Catch callers that pass flag bits not covered by the mask. */
        WARN_ON_ONCE(flags & ~mask);
        set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);

/*
 * Set the GFP mask of @inode's mapping to GFP_USER.  As the name
 * suggests, this is presumably meant to keep the mapping's pages out of
 * highmem -- confirm against the GFP_USER definition.
 */
void inode_nohighmem(struct inode *inode)
{
        mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);

/**
 * timestamp_truncate - Truncate timespec to a granularity
 * @t: Timespec
 * @inode: inode being updated
 *
 * Clamp the seconds of @t to the range supported by the filesystem
 * containing @inode and round the nanoseconds down to its time
 * granularity.  When the seconds end up on either end of the supported
 * range, the nanoseconds are cleared.  The granularity must not be 0
 * nor greater than a second (NSEC_PER_SEC, or 10^9 ns); an invalid
 * value triggers a warning and leaves the nanoseconds as they are.
 *
 * Return: the adjusted timespec.
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned int gran = sb->s_time_gran;

        t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
        if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
                t.tv_nsec = 0;

        /*
         * Division is avoided in the common cases: 1 ns needs no work
         * at all and 1 s simply clears the nanoseconds.
         */
        if (gran == 0 || gran > NSEC_PER_SEC)
                WARN(1, "invalid file time granularity: %u", gran);
        else if (gran == NSEC_PER_SEC)
                t.tv_nsec = 0;
        else if (gran != 1)
                t.tv_nsec -= t.tv_nsec % gran;

        return t;
}
EXPORT_SYMBOL(timestamp_truncate);

/**
 * current_time - Return FS time
 * @inode: inode.
 *
 * Read the coarse real-time clock and truncate the result to the time
 * granularity supported by the filesystem of @inode.
 *
 * @inode itself must not be NULL.  If inode->i_sb is NULL the function
 * warns and returns the time without truncation.
 */
struct timespec64 current_time(struct inode *inode)
{
        struct timespec64 now;

        ktime_get_coarse_real_ts64(&now);

        if (unlikely(!inode->i_sb)) {
                WARN(1, "current_time() called with uninitialized super_block in the inode");
        } else {
                now = timestamp_truncate(now, inode);
        }

        return now;
}
EXPORT_SYMBOL(current_time);

/*
 * Generic validation of FS_IOC_SETFLAGS values; invalid configurations
 * are rejected.
 *
 * Returns -EPERM when the append-only or immutable flag would change
 * and the caller lacks CAP_LINUX_IMMUTABLE; otherwise the result of
 * fscrypt_prepare_setflags().
 *
 * Note: the caller should be holding i_mutex, or else be sure that they have
 * exclusive access to the inode structure.
 */
int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
                             unsigned int flags)
{
        unsigned int changed = flags ^ oldflags;

        /*
         * The IMMUTABLE and APPEND_ONLY flags can only be changed by
         * the relevant capability.  (The xor form of this test is
         * thanks to Pauline Middelink.)
         */
        if ((changed & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;

        return fscrypt_prepare_setflags(inode, oldflags, flags);
}
EXPORT_SYMBOL(vfs_ioc_setflags_prepare);

/*
 * Generic validation of FS_IOC_FSSETXATTR values; invalid
 * configurations are rejected.  On success the extent size flags in
 * @fa may have been cleared, see the end of the function.
 *
 * Returns 0 if the new attributes are acceptable, -EPERM when a
 * required capability is missing and -EINVAL for an invalid request.
 *
 * Note: the caller should be holding i_mutex, or else be sure that they have
 * exclusive access to the inode structure.
 */
int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
                             struct fsxattr *fa)
{
        unsigned int changed = old_fa->fsx_xflags ^ fa->fsx_xflags;

        /*
         * Can't modify an immutable/append-only file unless we have
         * appropriate permission.
         */
        if ((changed & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND)) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;

        /*
         * Project Quota ID state is only allowed to change from within
         * the init namespace.  The restriction is enforced only when
         * the quota ID state is actually being changed; everything
         * else is allowed in user namespaces.
         */
        if (current_user_ns() != &init_user_ns &&
            (old_fa->fsx_projid != fa->fsx_projid ||
             (changed & FS_XFLAG_PROJINHERIT)))
                return -EINVAL;

        /* Extent size hints: regular files only. */
        if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
                return -EINVAL;

        /* Inherited extent size hints: directories only. */
        if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
            !S_ISDIR(inode->i_mode))
                return -EINVAL;

        /*
         * The CoW extent size hint and the DAX flag are both only
         * valid on regular files and directories.
         */
        if ((fa->fsx_xflags & (FS_XFLAG_COWEXTSIZE | FS_XFLAG_DAX)) &&
            !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
                return -EINVAL;

        /* Extent size hints of zero turn off the flags. */
        if (!fa->fsx_extsize)
                fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
        if (!fa->fsx_cowextsize)
                fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;

        return 0;
}
EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);

/**
 * inode_set_ctime_current - set the ctime to current_time
 * @inode: inode
 *
 * Set the inode->i_ctime to the current value for the inode. Returns
 * the current value that was assigned to i_ctime.
 */
struct timespec64 inode_set_ctime_current(struct inode *inode)
{
        struct timespec64 now = current_time(inode);

        inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
        return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);

/**
 * in_group_or_capable - check whether caller is CAP_FSETID privileged
 * @inode:        inode to check
 * @gid:        the new/current gid of @inode
 *
 * Check whether @gid is in the caller's group list or, failing that,
 * whether the caller is privileged with CAP_FSETID over @inode. This
 * can be used to determine whether the setgid bit can be kept or must
 * be dropped.
 *
 * Return: true if the caller is sufficiently privileged, false if not.
 */
bool in_group_or_capable(const struct inode *inode, kgid_t gid)
{
        /* The capability is only consulted when group membership fails. */
        return in_group_p(gid) ||
               capable_wrt_inode_uidgid(inode, CAP_FSETID);
}

/**
 * mode_strip_sgid - handle the sgid bit for non-directories
 * @dir: parent directory inode (may be NULL)
 * @mode: mode of the file to be created in @dir
 *
 * S_ISGID is stripped from @mode only when all of the following hold:
 * @mode has both S_ISGID and S_IXGRP raised, @mode does not describe a
 * directory, @dir has S_ISGID raised, and the caller is neither in the
 * group of the parent directory nor privileged with CAP_FSETID over
 * it.  In every other case @mode is returned unchanged.
 *
 * Return: the new mode to use for the file
 */
umode_t mode_strip_sgid(const struct inode *dir, umode_t mode)
{
        /* Only setgid + group-executable non-directories are of interest. */
        if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP) ||
            S_ISDIR(mode))
                return mode;

        if (dir && (dir->i_mode & S_ISGID) &&
            !in_group_or_capable(dir, dir->i_gid))
                mode &= ~S_ISGID;

        return mode;
}
EXPORT_SYMBOL(mode_strip_sgid);












































































































































































































































































































    1 
    1 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Kernel Probes (KProbes)
 *  kernel/kprobes.c
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct        Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *                Probes initial implementation (includes suggestions from
 *                Rusty Russell).
 * 2004-Aug        Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
 *                hlists and exceptions notifier as suggested by Andi Kleen.
 * 2004-July        Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *                interface to access function arguments.
 * 2004-Sep        Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
 *                exceptions notifier to be first on the priority list.
 * 2005-May        Hien Nguyen <hien@us.ibm.com>, Jim Keniston
 *                <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
 *                <prasanna@in.ibm.com> added function-return probes.
 */
#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/export.h>
#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sysctl.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
#include <linux/perf_event.h>
#include <linux/static_call.h>

#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
#include <linux/uaccess.h>

/* Each kprobe hash table below has (1 << KPROBE_HASH_BITS) = 64 buckets. */
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)


/* NOTE(review): presumably set once kprobes setup is done; the writer is not in this part of the file. */
static int kprobes_initialized;
/*
 * kprobe_table can be accessed by
 * - normal hlist traversal and RCU add/del while kprobe_mutex is held,
 * or
 * - RCU hlist traversal with preemption disabled (breakpoint handlers).
 */
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];

/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_all_disarmed;

/* This protects kprobe_table and optimizing_list */
static DEFINE_MUTEX(kprobe_mutex);
/* Per-CPU pointer written by set_kprobe_instance() and reset_kprobe_instance(). */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
/* One cacheline-aligned raw spinlock per hash index; see kretprobe_table_lock_ptr(). */
static struct {
        raw_spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];

/*
 * Default (weak) symbol lookup: resolve @name through kallsyms and hand the
 * address back as an opcode pointer. The second argument is ignored here.
 */
kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
                                        unsigned int __unused)
{
        unsigned long sym_addr = kallsyms_lookup_name(name);

        return (kprobe_opcode_t *)sym_addr;
}

/* Return the lock stored at index @hash of kretprobe_table_locks[]. */
static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
{
        raw_spinlock_t *bucket_lock = &kretprobe_table_locks[hash].lock;

        return bucket_lock;
}

/* Blacklist -- list head whose entries are struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);

#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
 * kprobe->ainsn.insn points to the copy of the instruction to be
 * single-stepped. x86_64, POWER4 and above have no-exec support and
 * stepping on the instruction on a vmalloced/kmalloced/data page
 * is a recipe for disaster
 *
 * One struct kprobe_insn_page describes one page of instruction slots
 * belonging to a struct kprobe_insn_cache.
 */
struct kprobe_insn_page {
        /* Node on the owning cache's ->pages list */
        struct list_head list;
        kprobe_opcode_t *insns;                /* Page of instruction slots */
        /* Cache this page was allocated for */
        struct kprobe_insn_cache *cache;
        /* Number of slots that are not SLOT_CLEAN (used plus dirty) */
        int nused;
        /* Number of slots marked SLOT_DIRTY since the last garbage collection */
        int ngarbage;
        /* Per-slot state, one enum kprobe_slot_state value per slot */
        char slot_used[];
};

/* Bytes needed for a struct kprobe_insn_page whose slot_used[] has @slots entries. */
#define KPROBE_INSN_PAGE_SIZE(slots)                        \
        (offsetof(struct kprobe_insn_page, slot_used) +        \
         (sizeof(char) * (slots)))

/* Number of instruction slots of cache @c that fit into one page. */
static int slots_per_page(struct kprobe_insn_cache *c)
{
        size_t slot_bytes = c->insn_size * sizeof(kprobe_opcode_t);

        return PAGE_SIZE / slot_bytes;
}

/* State of one entry in kprobe_insn_page::slot_used[]. */
enum kprobe_slot_state {
        /* Free: __get_insn_slot() may hand it out */
        SLOT_CLEAN = 0,
        /* Released with the dirty flag: reclaimed later by collect_garbage_slots() */
        SLOT_DIRTY = 1,
        /* Currently handed out */
        SLOT_USED = 2,
};

/* Default (weak) page allocator for instruction slots: one page from module_alloc(). */
void __weak *alloc_insn_page(void)
{
        void *page = module_alloc(PAGE_SIZE);

        return page;
}

/* Default (weak) counterpart of alloc_insn_page(): give @page back via module_memfree(). */
void __weak free_insn_page(void *page)
{
        module_memfree(page);
}

/*
 * Instruction-slot cache using the default page allocator. Each slot spans
 * MAX_INSN_SIZE kprobe_opcode_t units and the cache starts with no pages.
 */
struct kprobe_insn_cache kprobe_insn_slots = {
        .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
        .alloc = alloc_insn_page,
        .free = free_insn_page,
        .sym = KPROBE_INSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
        .insn_size = MAX_INSN_SIZE,
        .nr_garbage = 0,
};
/* Forward declaration: __get_insn_slot() calls this before its definition. */
static int collect_garbage_slots(struct kprobe_insn_cache *c);

/**
 * __get_insn_slot() - Find a slot on an executable page for an instruction.
 * @c: instruction-slot cache to take the slot from
 *
 * We allocate an executable page if there's no room on existing ones.
 * Dirty slots are garbage-collected first when the cache has any.
 *
 * Return: address of the reserved slot, or NULL when neither the page
 * descriptor nor the page itself could be allocated.
 */
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
        struct kprobe_insn_page *kip;
        kprobe_opcode_t *slot = NULL;

        /* Since the slot array is not protected by rcu, we need a mutex */
        mutex_lock(&c->mutex);
 retry:
        rcu_read_lock();
        list_for_each_entry_rcu(kip, &c->pages, list) {
                if (kip->nused < slots_per_page(c)) {
                        int i;
                        /* The page claims a free slot: take the first clean one */
                        for (i = 0; i < slots_per_page(c); i++) {
                                if (kip->slot_used[i] == SLOT_CLEAN) {
                                        kip->slot_used[i] = SLOT_USED;
                                        kip->nused++;
                                        slot = kip->insns + (i * c->insn_size);
                                        rcu_read_unlock();
                                        goto out;
                                }
                        }
                        /* kip->nused is broken. Fix it. */
                        kip->nused = slots_per_page(c);
                        WARN_ON(1);
                }
        }
        rcu_read_unlock();

        /* If there are any garbage slots, collect it and try again. */
        if (c->nr_garbage && collect_garbage_slots(c) == 0)
                goto retry;

        /* All out of space.  Need to allocate a new page. */
        kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
        if (!kip)
                goto out;

        /*
         * Use module_alloc so this page is within +/- 2GB of where the
         * kernel image and loaded module images reside. This is required
         * so x86_64 can correctly handle the %rip-relative fixups.
         */
        kip->insns = c->alloc();
        if (!kip->insns) {
                kfree(kip);
                goto out;
        }
        /* Fresh page: every slot clean except slot 0, which the caller gets */
        INIT_LIST_HEAD(&kip->list);
        memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
        kip->slot_used[0] = SLOT_USED;
        kip->nused = 1;
        kip->ngarbage = 0;
        kip->cache = c;
        list_add_rcu(&kip->list, &c->pages);
        slot = kip->insns;

        /* Record the perf ksymbol register event after adding the page */
        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
                           PAGE_SIZE, false, c->sym);
out:
        mutex_unlock(&c->mutex);
        return slot;
}

/*
 * Mark slot @idx of page @kip clean and drop the page's use count.
 *
 * Return 1 if that leaves the page without any used slot (all of its
 * garbage is collected), otherwise 0. An emptied page is also freed here
 * unless it is the only page left on the list.
 */
static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
        kip->slot_used[idx] = SLOT_CLEAN;
        kip->nused--;
        if (kip->nused == 0) {
                /*
                 * Page is no longer in use.  Free it unless
                 * it's the last one.  We keep the last one
                 * so as not to have to set it up again the
                 * next time somebody inserts a probe.
                 */
                if (!list_is_singular(&kip->list)) {
                        /*
                         * Record perf ksymbol unregister event before removing
                         * the page.
                         */
                        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
                                           (unsigned long)kip->insns, PAGE_SIZE, true,
                                           kip->cache->sym);
                        list_del_rcu(&kip->list);
                        /* Let RCU walkers of the page list finish before freeing */
                        synchronize_rcu();
                        kip->cache->free(kip->insns);
                        kfree(kip);
                }
                return 1;
        }
        return 0;
}

/*
 * Reclaim every SLOT_DIRTY slot of cache @c and reset the garbage counters.
 * Always returns 0.
 */
static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
        struct kprobe_insn_page *page, *tmp;
        int slot;

        /* Ensure no-one is interrupted on the garbages */
        synchronize_rcu();

        list_for_each_entry_safe(page, tmp, &c->pages, list) {
                if (!page->ngarbage)
                        continue;

                /* we will collect all garbages */
                page->ngarbage = 0;
                for (slot = 0; slot < slots_per_page(c); slot++) {
                        if (page->slot_used[slot] != SLOT_DIRTY)
                                continue;
                        /* Non-zero means the page is now empty (and may be freed) */
                        if (collect_one_slot(page, slot))
                                break;
                }
        }
        c->nr_garbage = 0;
        return 0;
}

/*
 * Give @slot back to cache @c.
 *
 * With @dirty set the slot is only marked SLOT_DIRTY and reclaimed later by
 * garbage collection, which is triggered once more than a page worth of
 * slots are dirty. Otherwise the slot is reclaimed immediately. A slot that
 * belongs to no page of @c triggers a warning and is ignored.
 */
void __free_insn_slot(struct kprobe_insn_cache *c,
                      kprobe_opcode_t *slot, int dirty)
{
        struct kprobe_insn_page *pos, *owner = NULL;
        long idx = 0;

        mutex_lock(&c->mutex);

        /* Look for the page whose slot range contains @slot */
        rcu_read_lock();
        list_for_each_entry_rcu(pos, &c->pages, list) {
                idx = ((long)slot - (long)pos->insns) /
                        (c->insn_size * sizeof(kprobe_opcode_t));
                if (idx >= 0 && idx < slots_per_page(c)) {
                        owner = pos;
                        break;
                }
        }
        if (!owner) {
                /* Could not find this slot. */
                WARN_ON(1);
        }
        rcu_read_unlock();

        if (!owner)
                goto unlock;

        /* Mark and sweep: this may sleep */
        /* Check double free */
        WARN_ON(owner->slot_used[idx] != SLOT_USED);
        if (!dirty) {
                collect_one_slot(owner, idx);
                goto unlock;
        }

        owner->slot_used[idx] = SLOT_DIRTY;
        owner->ngarbage++;
        c->nr_garbage++;
        if (c->nr_garbage > slots_per_page(c))
                collect_garbage_slots(c);
unlock:
        mutex_unlock(&c->mutex);
}

/*
 * Tell whether @addr lies inside one of the instruction-slot pages of
 * cache @c. This will be used for checking whether the address on a
 * stack is on a text area or not.
 */
bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
{
        struct kprobe_insn_page *page;
        bool found = false;

        rcu_read_lock();
        list_for_each_entry_rcu(page, &c->pages, list) {
                /* Skip pages whose [insns, insns + PAGE_SIZE) range misses addr */
                if (addr < (unsigned long)page->insns ||
                    addr >= (unsigned long)page->insns + PAGE_SIZE)
                        continue;

                found = true;
                break;
        }
        rcu_read_unlock();

        return found;
}

/*
 * Describe one instruction page of cache @c as a kallsyms-style symbol.
 *
 * @symnum counts down once for every page visited, including the page that
 * is finally reported; the page seen when it reads zero is the one reported.
 * For that page @sym receives the cache's symbol name, @type is set to 't'
 * and @value to the page's start address.
 *
 * NOTE(review): assumes @sym points to at least KSYM_NAME_LEN bytes --
 * confirm against callers.
 *
 * Returns 0 when a page was reported, -ERANGE when the list ran out first.
 */
int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
                             unsigned long *value, char *type, char *sym)
{
        struct kprobe_insn_page *kip;
        int ret = -ERANGE;

        rcu_read_lock();
        list_for_each_entry_rcu(kip, &c->pages, list) {
                if ((*symnum)--)
                        continue;
                /*
                 * strscpy() rather than the deprecated strlcpy(): it does not
                 * read the source beyond what it copies and always terminates.
                 */
                strscpy(sym, c->sym, KSYM_NAME_LEN);
                *type = 't';
                *value = (unsigned long)kip->insns;
                ret = 0;
                break;
        }
        rcu_read_unlock();

        return ret;
}

#ifdef CONFIG_OPTPROBES
/*
 * For optimized_kprobe buffer. Same default page allocator as
 * kprobe_insn_slots, but with its own symbol name and page list.
 */
struct kprobe_insn_cache kprobe_optinsn_slots = {
        .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
        .alloc = alloc_insn_page,
        .free = free_insn_page,
        .sym = KPROBE_OPTINSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
        /* .insn_size is initialized later */
        .nr_garbage = 0,
};
#endif
#endif

/*
 * Record @kp in this CPU's kprobe_instance slot.
 * We have preemption disabled.. so it is safe to use __ versions
 */
static inline void set_kprobe_instance(struct kprobe *kp)
{
        __this_cpu_write(kprobe_instance, kp);
}

/* Clear this CPU's kprobe_instance slot (counterpart of set_kprobe_instance()). */
static inline void reset_kprobe_instance(void)
{
        __this_cpu_write(kprobe_instance, NULL);
}

/*
 * Look up the kprobe registered at @addr in kprobe_table.
 * Returns the matching probe, or NULL if there is none.
 *
 * This routine is called either:
 *         - under the kprobe_mutex - during kprobe_[un]register()
 *                                 OR
 *         - with preemption disabled - from arch/xxx/kernel/kprobes.c
 */
struct kprobe *get_kprobe(void *addr)
{
        struct hlist_head *bucket = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
        struct kprobe *cur;

        hlist_for_each_entry_rcu(cur, bucket, hlist,
                                 lockdep_is_held(&kprobe_mutex)) {
                if (cur->addr != addr)
                        continue;
                return cur;
        }

        return NULL;
}
NOKPROBE_SYMBOL(get_kprobe);

/* Forward declaration: only its address is needed by kprobe_aggrprobe(). */
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);

/*
 * Return true if the kprobe is an aggregator, i.e. its pre_handler is
 * aggr_pre_handler.
 */
static inline int kprobe_aggrprobe(struct kprobe *p)
{
        return p->pre_handler == aggr_pre_handler;
}

/*
 * Return true(!0) if the kprobe is unused: it must be an aggregator, be
 * disabled, and have an empty ->list.
 */
static inline int kprobe_unused(struct kprobe *p)
{
        if (!kprobe_aggrprobe(p))
                return 0;
        if (!kprobe_disabled(p))
                return 0;

        return list_empty(&p->list) ? 1 : 0;
}

/*
 * Keep all fields in the kprobe consistent: copy the saved opcode and the
 * arch-specific instruction data from @ap into @p.
 */
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
        memcpy(&p->ainsn, &ap->ainsn, sizeof(p->ainsn));
        memcpy(&p->opcode, &ap->opcode, sizeof(p->opcode));
}

#ifdef CONFIG_OPTPROBES
/*
 * Gate checked by optimize_kprobe() and do_optimize_kprobes().
 * NOTE: change this value only with kprobe_mutex held
 */
static bool kprobes_allow_optimization;

/*
 * Run the pre_handler of every enabled kprobe chained on @p->list; the
 * handlers' return values are ignored. The per-CPU kprobe_instance is set
 * around each call and cleared after every list entry.
 * This must be called from arch-dep optimized caller.
 */
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        struct kprobe *child;

        list_for_each_entry_rcu(child, &p->list, list) {
                bool runnable = child->pre_handler &&
                                likely(!kprobe_disabled(child));

                if (runnable) {
                        set_kprobe_instance(child);
                        child->pre_handler(child, regs);
                }
                reset_kprobe_instance();
        }
}
NOKPROBE_SYMBOL(opt_pre_handler);

/*
 * Free optimized instructions and optimized_kprobe: release the arch
 * resources of the optimized probe embedding @p, then free the container.
 */
static void free_aggr_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op = container_of(p, struct optimized_kprobe, kp);

        arch_remove_optimized_kprobe(op);
        arch_remove_kprobe(p);
        kfree(op);
}

/*
 * Return true(!0) if the kprobe is ready for optimization: it must be an
 * aggregator whose optimized instructions the arch code reports as prepared.
 */
static inline int kprobe_optready(struct kprobe *p)
{
        struct optimized_kprobe *op;

        if (!kprobe_aggrprobe(p))
                return 0;

        op = container_of(p, struct optimized_kprobe, kp);
        return arch_prepared_optinsn(&op->optinsn);
}

/*
 * Return true if the kprobe is disarmed. Note: p must be on hash list.
 * A plain kprobe is disarmed when it is disabled; an aggr/opt probe must
 * additionally not be queued on any (un)optimizing list.
 */
bool kprobe_disarmed(struct kprobe *p)
{
        struct optimized_kprobe *op;

        if (kprobe_aggrprobe(p)) {
                op = container_of(p, struct optimized_kprobe, kp);
                return kprobe_disabled(p) && list_empty(&op->list);
        }

        /* Not an aggr/opt probe: the disabled state alone decides */
        return kprobe_disabled(p);
}

/*
 * Return true(!0) if the probe is queued on (un)optimizing lists, i.e. it
 * is an aggregator whose optimized_kprobe list node is linked somewhere.
 */
static int kprobe_queued(struct kprobe *p)
{
        struct optimized_kprobe *op;

        if (!kprobe_aggrprobe(p))
                return 0;

        op = container_of(p, struct optimized_kprobe, kp);
        return list_empty(&op->list) ? 0 : 1;
}

/*
 * Return an optimized kprobe whose optimizing code replaces
 * instructions including addr (exclude breakpoint), or NULL if there is
 * no such probe.
 */
static struct kprobe *get_optimized_kprobe(unsigned long addr)
{
        struct optimized_kprobe *op;
        struct kprobe *p = NULL;
        int off;

        /* Offset 0 is skipped on purpose, since that is a breakpoint case. */
        for (off = 1; off < MAX_OPTIMIZED_LENGTH; off++) {
                p = get_kprobe((void *)(addr - off));
                if (p)
                        break;
        }

        if (!p || !kprobe_optready(p))
                return NULL;

        op = container_of(p, struct optimized_kprobe, kp);
        if (!arch_within_optimized_kprobe(op, addr))
                return NULL;

        return p;
}

/*
 * Optimization staging lists, protected by kprobe_mutex:
 * - optimizing_list: probes queued by optimize_kprobe()
 * - unoptimizing_list: probes queued by unoptimize_kprobe()
 * - freeing_list: probes walked by do_unoptimize_kprobes() and reclaimed
 *   by do_free_cleaned_kprobes()
 */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);

static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/* Delay passed to schedule_delayed_work() by kick_kprobe_optimizer() */
#define OPTIMIZE_DELAY 5

/*
 * Optimize (replace a breakpoint with a jump) kprobes listed on
 * optimizing_list. The caller must hold text_mutex and the cpu hotplug
 * read lock.
 */
static void do_optimize_kprobes(void)
{
        lockdep_assert_held(&text_mutex);
        /*
         * Optimization and unoptimization look at online_cpus through
         * stop_machine(), while cpu-hotplug modifies online_cpus and takes
         * text_mutex as well. Taking text_mutex here without excluding
         * cpu-hotplug could therefore deadlock: cpu-hotplug waits for
         * text_mutex while stop_machine() cannot complete because
         * online_cpus has changed. The caller avoids this by locking cpu
         * hotplug outside of text_mutex.
         */
        lockdep_assert_cpus_held();

        /* Optimization is never done when disarmed, disallowed, or idle */
        if (!kprobes_all_disarmed && kprobes_allow_optimization &&
            !list_empty(&optimizing_list))
                arch_optimize_kprobes(&optimizing_list);
}

/*
 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
 * if need) kprobes listed on unoptimizing_list.
 *
 * The caller must hold text_mutex and the cpu hotplug read lock. Unused
 * probes stay on freeing_list (unhashed) for do_free_cleaned_kprobes();
 * every other entry is taken off the list again.
 */
static void do_unoptimize_kprobes(void)
{
        struct optimized_kprobe *op, *tmp;

        lockdep_assert_held(&text_mutex);
        /* See comment in do_optimize_kprobes() */
        lockdep_assert_cpus_held();

        /* NOTE(review): the arch hook presumably moves handled entries onto freeing_list -- confirm */
        if (!list_empty(&unoptimizing_list))
                arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);

        /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
        list_for_each_entry_safe(op, tmp, &freeing_list, list) {
                /* Switching from detour code to origin */
                op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
                /* Disarm probes if marked disabled and not gone */
                if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))
                        arch_disarm_kprobe(&op->kp);
                if (kprobe_unused(&op->kp)) {
                        /*
                         * Remove unused probes from hash list. After waiting
                         * for synchronization, these probes are reclaimed.
                         * (reclaiming is done by do_free_cleaned_kprobes.)
                         */
                        hlist_del_rcu(&op->kp.hlist);
                } else
                        list_del_init(&op->list);
        }
}

/* Reclaim all kprobes on the freeing_list */
static void do_free_cleaned_kprobes(void)
{
        struct optimized_kprobe *cur, *nxt;

        list_for_each_entry_safe(cur, nxt, &freeing_list, list) {
                bool still_used;

                list_del_init(&cur->list);

                /*
                 * A probe that is still in use must not show up here. If it
                 * does, warn once and keep it on the kprobes hash list
                 * instead of freeing it.
                 */
                still_used = WARN_ON_ONCE(!kprobe_unused(&cur->kp));
                if (!still_used)
                        free_aggr_kprobe(&cur->kp);
        }
}

/*
 * Start optimizer after OPTIMIZE_DELAY passed: schedule optimizing_work,
 * whose handler is kprobe_optimizer().
 */
static void kick_kprobe_optimizer(void)
{
        schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}

/*
 * Kprobe jump optimizer: work handler of optimizing_work.
 * @work: the work item being run (unused)
 *
 * Locks are taken in the order kprobe_mutex, cpu hotplug read lock,
 * text_mutex; the hotplug lock and text_mutex are dropped before the work
 * is possibly re-queued, kprobe_mutex after.
 */
static void kprobe_optimizer(struct work_struct *work)
{
        mutex_lock(&kprobe_mutex);
        cpus_read_lock();
        mutex_lock(&text_mutex);

        /*
         * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
         * kprobes before waiting for quiescence period.
         */
        do_unoptimize_kprobes();

        /*
         * Step 2: Wait for quiescence period to ensure all potentially
         * preempted tasks to have normally scheduled. Because optprobe
         * may modify multiple instructions, there is a chance that Nth
         * instruction is preempted. In that case, such tasks can return
         * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
         * Note that on non-preemptive kernel, this is transparently converted
         * to synchronize_sched() to wait for all interrupts to have completed.
         */
        synchronize_rcu_tasks();

        /* Step 3: Optimize kprobes after quiescence period */
        do_optimize_kprobes();

        /* Step 4: Free cleaned kprobes after quiescence period */
        do_free_cleaned_kprobes();

        mutex_unlock(&text_mutex);
        cpus_read_unlock();

        /* Step 5: Kick optimizer again if needed */
        if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
                kick_kprobe_optimizer();

        mutex_unlock(&kprobe_mutex);
}

/*
 * Wait for completing optimization and unoptimization: return only once
 * both optimizing_list and unoptimizing_list are empty.
 */
void wait_for_kprobe_optimizer(void)
{
        mutex_lock(&kprobe_mutex);

        while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
                /* Drop the mutex: kprobe_optimizer() takes it itself */
                mutex_unlock(&kprobe_mutex);

                /* this will also make optimizing_work execute immediately */
                flush_delayed_work(&optimizing_work);
                /* @optimizing_work might not have been queued yet, relax */
                cpu_relax();

                mutex_lock(&kprobe_mutex);
        }

        mutex_unlock(&kprobe_mutex);
}

/* Return true if @op is currently linked on unoptimizing_list. */
bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
        struct optimized_kprobe *pos;
        bool queued = false;

        list_for_each_entry(pos, &unoptimizing_list, list) {
                if (pos == op) {
                        queued = true;
                        break;
                }
        }

        return queued;
}

/*
 * Optimize kprobe if p is ready to be optimized.
 *
 * The probe is flagged KPROBE_FLAG_OPTIMIZED and queued on optimizing_list;
 * the code itself is changed later by the delayed optimizer work. A probe
 * that is already optimized but waiting on unoptimizing_list is simply
 * taken off that list again.
 */
static void optimize_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op;

        /* Check if the kprobe is disabled or not ready for optimization. */
        if (!kprobe_optready(p) || !kprobes_allow_optimization ||
            (kprobe_disabled(p) || kprobes_all_disarmed))
                return;

        /* kprobes with post_handler can not be optimized */
        if (p->post_handler)
                return;

        op = container_of(p, struct optimized_kprobe, kp);

        /* Check there is no other kprobes at the optimized instructions */
        if (arch_check_optimized_kprobe(op) < 0)
                return;

        /* Check if it is already optimized. */
        if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
                if (optprobe_queued_unopt(op)) {
                        /* This is under unoptimizing. Just dequeue the probe */
                        list_del_init(&op->list);
                }
                return;
        }
        op->kp.flags |= KPROBE_FLAG_OPTIMIZED;

        /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
        if (WARN_ON_ONCE(!list_empty(&op->list)))
                return;

        list_add(&op->list, &optimizing_list);
        kick_kprobe_optimizer();
}

/*
 * Short cut to direct unoptimizing: unoptimize @op through the arch hook
 * right away and clear its KPROBE_FLAG_OPTIMIZED flag. The caller must hold
 * the cpu hotplug read lock.
 */
static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
        lockdep_assert_cpus_held();
        arch_unoptimize_kprobe(op);
        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
}

/*
 * Unoptimize a kprobe if p is optimized.
 * @p: probe to unoptimize
 * @force: unoptimize the code immediately instead of queueing the probe
 *         for the delayed optimizer work
 */
static void unoptimize_kprobe(struct kprobe *p, bool force)
{
        struct optimized_kprobe *op;

        if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
                return; /* This is not an optprobe nor optimized */

        op = container_of(p, struct optimized_kprobe, kp);
        if (!kprobe_optimized(p))
                return;

        /* Already queued on one of the staging lists */
        if (!list_empty(&op->list)) {
                if (optprobe_queued_unopt(op)) {
                        /* Queued in unoptimizing queue */
                        if (force) {
                                /*
                                 * Forcibly unoptimize the kprobe here, and queue it
                                 * in the freeing list for release afterwards.
                                 */
                                force_unoptimize_kprobe(op);
                                list_move(&op->list, &freeing_list);
                        }
                } else {
                        /* Dequeue from the optimizing queue */
                        list_del_init(&op->list);
                        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
                }
                return;
        }

        /* Optimized kprobe case */
        if (force) {
                /* Forcibly update the code: this is a special case */
                force_unoptimize_kprobe(op);
        } else {
                list_add(&op->list, &unoptimizing_list);
                kick_kprobe_optimizer();
        }
}

/*
 * Cancel unoptimizing for reusing.
 *
 * Re-enables @ap and asks for it to be optimized again. Returns 0 on
 * success, or -EINVAL when the probe is not ready for optimization; the
 * disabled flag has already been cleared in that case.
 */
static int reuse_unused_kprobe(struct kprobe *ap)
{
        struct optimized_kprobe *op;

        /*
         * Unused kprobe MUST be on the way of delayed unoptimizing (means
         * there is still a relative jump) and disabled.
         */
        op = container_of(ap, struct optimized_kprobe, kp);
        WARN_ON_ONCE(list_empty(&op->list));
        /* Enable the probe again */
        ap->flags &= ~KPROBE_FLAG_DISABLED;
        /* Optimize it again (remove from op->list) */
        if (!kprobe_optready(ap))
                return -EINVAL;

        optimize_kprobe(ap);
        return 0;
}

/*
 * Remove optimized instructions.
 *
 * Takes the optimized_kprobe embedding @p off whatever (un)optimization
 * queue it is on, clears KPROBE_FLAG_OPTIMIZED and finally calls
 * arch_remove_optimized_kprobe() on it.
 *
 * NOTE(review): assumes @p is embedded in a struct optimized_kprobe, since
 * container_of() is applied unconditionally -- confirm against callers.
 */
static void kill_optimized_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op;

        op = container_of(p, struct optimized_kprobe, kp);
        if (!list_empty(&op->list))
                /* Dequeue from the (un)optimization queue */
                list_del_init(&op->list);
        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;

        if (kprobe_unused(p)) {
                /*
                 * Unused kprobe is on unoptimizing or freeing list. We move it
                 * to freeing_list and let the kprobe_optimizer() remove it from
                 * the kprobe hash list and free it.
                 *
                 * NOTE(review): when op->list was queued, the list_del_init()
                 * above has already unlinked it, so whether
                 * optprobe_queued_unopt() can still report true here depends
                 * on how that helper determines membership -- confirm.
                 */
                if (optprobe_queued_unopt(op))
                        list_move(&op->list, &freeing_list);
        }

        /* Don't touch the code, because it is already freed. */
        arch_remove_optimized_kprobe(op);
}

/*
 * Call arch_prepare_optimized_kprobe() for @op unless @p is an ftrace-based
 * kprobe. The arch helper's return value is not examined here.
 */
static inline
void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
{
        if (kprobe_ftrace(p))
                return;

        arch_prepare_optimized_kprobe(op, p);
}

/*
 * Try to prepare optimized instructions for @p, which is treated as the
 * kprobe embedded in a struct optimized_kprobe.
 */
static void prepare_optimized_kprobe(struct kprobe *p)
{
        __prepare_optimized_kprobe(container_of(p, struct optimized_kprobe, kp),
                                   p);
}

/*
 * Allocate a zeroed optimized_kprobe for the address of @p and try to
 * prepare its optimized instructions.
 *
 * Returns the embedded kprobe, or NULL if the allocation failed.
 */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op = kzalloc(sizeof(*op), GFP_KERNEL);

        if (!op)
                return NULL;

        op->kp.addr = p->addr;
        INIT_LIST_HEAD(&op->list);
        __prepare_optimized_kprobe(op, p);

        return &op->kp;
}

static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);

/*
 * Wrap @p in a freshly allocated optimized_kprobe and queue it for
 * optimization. If allocation or arch preparation fails, @p simply stays a
 * regular kprobe.
 *
 * NOTE: p must be a normal registered kprobe
 */
static void try_to_optimize_kprobe(struct kprobe *p)
{
        struct optimized_kprobe *op;
        struct kprobe *ap;

        /* ftrace-based kprobes can not be optimized */
        if (kprobe_ftrace(p))
                return;

        /* For preparing optimization, jump_label_text_reserved() is called */
        cpus_read_lock();
        jump_label_lock();
        mutex_lock(&text_mutex);

        ap = alloc_aggr_kprobe(p);
        if (ap) {
                op = container_of(ap, struct optimized_kprobe, kp);
                if (arch_prepared_optinsn(&op->optinsn)) {
                        init_aggr_kprobe(ap, p);
                        /* This just kicks optimizer thread */
                        optimize_kprobe(ap);
                } else {
                        /* Optimization setup failed: fall back to kprobe */
                        arch_remove_optimized_kprobe(op);
                        kfree(op);
                }
        }

        mutex_unlock(&text_mutex);
        jump_label_unlock();
        cpus_read_unlock();
}

static void optimize_all_kprobes(void)
{
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i;

        mutex_lock(&kprobe_mutex);
        /* If optimization is already allowed, just return */
        if (kprobes_allow_optimization)
                goto out;

        cpus_read_lock();
        kprobes_allow_optimization = true;
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry(p, head, hlist)
                        if (!kprobe_disabled(p))
                                optimize_kprobe(p);
        }
        cpus_read_unlock();
        printk(KERN_INFO "Kprobes globally optimized\n");
out:
        mutex_unlock(&kprobe_mutex);
}

#ifdef CONFIG_SYSCTL
/*
 * Prohibit optimization globally and queue every non-disabled probe in
 * kprobe_table for (non-forced) unoptimization, then wait for the optimizer
 * to finish. Does nothing if optimization is already prohibited.
 */
static void unoptimize_all_kprobes(void)
{
        struct hlist_head *bucket;
        struct kprobe *p;
        unsigned int idx;

        mutex_lock(&kprobe_mutex);

        /* Already prohibited: nothing to do */
        if (!kprobes_allow_optimization) {
                mutex_unlock(&kprobe_mutex);
                return;
        }

        cpus_read_lock();
        kprobes_allow_optimization = false;
        for (idx = 0; idx < KPROBE_TABLE_SIZE; idx++) {
                bucket = &kprobe_table[idx];
                hlist_for_each_entry(p, bucket, hlist) {
                        if (kprobe_disabled(p))
                                continue;
                        unoptimize_kprobe(p, false);
                }
        }
        cpus_read_unlock();
        mutex_unlock(&kprobe_mutex);

        /* The wait happens after kprobe_mutex has been dropped */
        wait_for_kprobe_optimizer();
        printk(KERN_INFO "Kprobes globally unoptimized\n");
}

static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
/*
 * sysctl handler for the global optimization switch.
 *
 * Under kprobe_sysctl_mutex: mirrors kprobes_allow_optimization into
 * sysctl_kprobes_optimization, lets proc_dointvec_minmax() process the
 * request, then (un)optimizes all kprobes according to the resulting value.
 *
 * NOTE(review): presumably @table points at sysctl_kprobes_optimization;
 * the table definition is not visible here. The (un)optimize step runs
 * whatever proc_dointvec_minmax() returned.
 *
 * Returns the value returned by proc_dointvec_minmax().
 */
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
                                      void *buffer, size_t *length,
                                      loff_t *ppos)
{
        int err;

        mutex_lock(&kprobe_sysctl_mutex);
        sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
        err = proc_dointvec_minmax(table, write, buffer, length, ppos);

        if (!sysctl_kprobes_optimization)
                unoptimize_all_kprobes();
        else
                optimize_all_kprobes();
        mutex_unlock(&kprobe_sysctl_mutex);

        return err;
}
#endif /* CONFIG_SYSCTL */

/*
 * Put a breakpoint for a probe. Must be called with text_mutex locked.
 *
 * If an optimized kprobe is found for the address of @p, it is forcibly
 * unoptimized first. @p is then armed and handed to optimize_kprobe().
 */
static void __arm_kprobe(struct kprobe *p)
{
        struct kprobe *blocker;

        /* Collision with another optimized kprobe: fall back to unoptimized */
        blocker = get_optimized_kprobe((unsigned long)p->addr);
        if (unlikely(blocker))
                unoptimize_kprobe(blocker, true);

        arch_arm_kprobe(p);
        /* Try to optimize (add kprobe to a list) */
        optimize_kprobe(p);
}

/*
 * Remove the breakpoint of a probe. Must be called with text_mutex locked.
 *
 * @p:     probe to disarm.
 * @reopt: when true, an optimized kprobe found at the address of @p after
 *         disarming is passed to optimize_kprobe() again.
 */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
        struct kprobe *blocked;

        /* Try to unoptimize; forced when all kprobes are disarmed */
        unoptimize_kprobe(p, kprobes_all_disarmed);

        /* While @p is still queued, the breakpoint is left alone */
        if (kprobe_queued(p))
                return;

        arch_disarm_kprobe(p);

        /* If another kprobe was blocked, optimize it. */
        blocked = get_optimized_kprobe((unsigned long)p->addr);
        if (unlikely(blocked) && reopt)
                optimize_kprobe(blocked);

        /* TODO: reoptimize others after unoptimized this probe */
}

#else /* !CONFIG_OPTPROBES */

/*
 * Stubs used when CONFIG_OPTPROBES is disabled: the optimization hooks
 * expand to empty statements, arming/disarming goes straight to the arch
 * helpers, and "disarmed" is the same as "disabled".
 */
#define optimize_kprobe(p)                        do {} while (0)
#define unoptimize_kprobe(p, f)                        do {} while (0)
#define kill_optimized_kprobe(p)                do {} while (0)
#define prepare_optimized_kprobe(p)                do {} while (0)
#define try_to_optimize_kprobe(p)                do {} while (0)
#define __arm_kprobe(p)                                arch_arm_kprobe(p)
#define __disarm_kprobe(p, o)                        arch_disarm_kprobe(p)
#define kprobe_disarmed(p)                        kprobe_disabled(p)
#define wait_for_kprobe_optimizer()                do {} while (0)

/*
 * !CONFIG_OPTPROBES variant: reusing an unused aggrprobe is never expected,
 * so this only logs an error and always returns -EINVAL. @ap is not touched.
 */
static int reuse_unused_kprobe(struct kprobe *ap)
{
        /*
         * If the optimized kprobe is NOT supported, the aggr kprobe is
         * released at the same time that the last aggregated kprobe is
         * unregistered.
         * Thus there should be no chance to reuse unused kprobe.
         */
        printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
        return -EINVAL;
}

/*
 * !CONFIG_OPTPROBES variant: hand @p to arch_remove_kprobe() and then
 * kfree() the kprobe itself.
 */
static void free_aggr_kprobe(struct kprobe *p)
{
        arch_remove_kprobe(p);
        kfree(p);
}

/*
 * !CONFIG_OPTPROBES variant: an aggrprobe is just a zeroed struct kprobe.
 * @p is unused. Returns NULL if the allocation fails.
 */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
        struct kprobe *ap = kzalloc(sizeof(*ap), GFP_KERNEL);

        return ap;
}
#endif /* CONFIG_OPTPROBES */

#ifdef CONFIG_KPROBES_ON_FTRACE
/* ftrace_ops used by arm/disarm_kprobe_ftrace() for probes without a post_handler */
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
        .func = kprobe_ftrace_handler,
        .flags = FTRACE_OPS_FL_SAVE_REGS,
};

/* ftrace_ops used for probes that have a post_handler; adds IPMODIFY */
static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
        .func = kprobe_ftrace_handler,
        .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};

/*
 * Number of probes currently armed through each ops; passed as @cnt to
 * __arm_kprobe_ftrace() / __disarm_kprobe_ftrace().
 */
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;

/*
 * Prepare @p through the ftrace-specific arch helper when it is flagged as
 * an ftrace-based kprobe, and through arch_prepare_kprobe() otherwise.
 * The caller must ensure p->addr is really on ftrace in the former case.
 *
 * Returns whatever the selected arch helper returns.
 */
static int prepare_kprobe(struct kprobe *p)
{
        if (kprobe_ftrace(p))
                return arch_prepare_kprobe_ftrace(p);

        return arch_prepare_kprobe(p);
}

/*
 * Arm @p through ftrace. Caller must lock kprobe_mutex.
 *
 * @p:   probe whose address is added to the filter of @ops.
 * @ops: ftrace_ops to use; registered when the first probe is armed.
 * @cnt: number of probes currently armed through @ops; incremented on
 *       success.
 *
 * Returns 0 on success or the error from ftrace.
 */
static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
                               int *cnt)
{
        int ret;

        ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
        if (ret) {
                pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
                         p->addr, ret);
                return ret;
        }

        if (*cnt == 0) {
                ret = register_ftrace_function(ops);
                if (ret) {
                        pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
                        /*
                         * At this point, since ops is not registered, we
                         * should be safe from registering empty filter.
                         */
                        ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
                        return ret;
                }
        }

        (*cnt)++;
        return 0;
}

/*
 * Arm an ftrace-based kprobe. Probes with a post_handler go through the
 * IPMODIFY ops and its counter, all others through the plain ops.
 */
static int arm_kprobe_ftrace(struct kprobe *p)
{
        if (p->post_handler != NULL)
                return __arm_kprobe_ftrace(p, &kprobe_ipmodify_ops,
                                           &kprobe_ipmodify_enabled);

        return __arm_kprobe_ftrace(p, &kprobe_ftrace_ops,
                                   &kprobe_ftrace_enabled);
}

/*
 * Disarm @p from ftrace. Caller must lock kprobe_mutex.
 *
 * @p:   probe whose address is removed from the filter of @ops.
 * @ops: ftrace_ops the probe was armed with; unregistered when @p is the
 *       last armed probe.
 * @cnt: number of probes currently armed through @ops; decremented unless
 *       unregistering fails.
 *
 * Returns the result of the filter update, or the unregister error.
 */
static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
                                  int *cnt)
{
        int ret;

        /* Last armed probe: the ops itself is unregistered first */
        if (*cnt == 1) {
                ret = unregister_ftrace_function(ops);
                if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
                        return ret;
        }

        --*cnt;

        ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
        WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
                  p->addr, ret);
        return ret;
}

/*
 * Disarm an ftrace-based kprobe, selecting the ops and counter the same way
 * arm_kprobe_ftrace() does: IPMODIFY ops when @p has a post_handler.
 */
static int disarm_kprobe_ftrace(struct kprobe *p)
{
        if (p->post_handler != NULL)
                return __disarm_kprobe_ftrace(p, &kprobe_ipmodify_ops,
                                              &kprobe_ipmodify_enabled);

        return __disarm_kprobe_ftrace(p, &kprobe_ftrace_ops,
                                      &kprobe_ftrace_enabled);
}
#else        /* !CONFIG_KPROBES_ON_FTRACE */
/* !CONFIG_KPROBES_ON_FTRACE: preparation always goes to arch_prepare_kprobe() */
static inline int prepare_kprobe(struct kprobe *p)
{
        return arch_prepare_kprobe(p);
}

/* !CONFIG_KPROBES_ON_FTRACE: arming through ftrace always fails with -ENODEV */
static inline int arm_kprobe_ftrace(struct kprobe *p)
{
        return -ENODEV;
}

/* !CONFIG_KPROBES_ON_FTRACE: disarming through ftrace always fails with -ENODEV */
static inline int disarm_kprobe_ftrace(struct kprobe *p)
{
        return -ENODEV;
}
#endif

/*
 * Arm a kprobe with text_mutex.
 *
 * ftrace-based probes are delegated to arm_kprobe_ftrace() and its result
 * is returned. All other probes are armed by __arm_kprobe() while holding
 * the cpus read lock and text_mutex (taken in that order); that path
 * always returns 0.
 */
static int arm_kprobe(struct kprobe *kp)
{
        if (unlikely(kprobe_ftrace(kp)))
                return arm_kprobe_ftrace(kp);

        cpus_read_lock();
        mutex_lock(&text_mutex);
        __arm_kprobe(kp);
        mutex_unlock(&text_mutex);
        cpus_read_unlock();

        return 0;
}

/*
 * Disarm a kprobe with text_mutex.
 *
 * ftrace-based probes are delegated to disarm_kprobe_ftrace() and its
 * result is returned (@reopt is not used on that path). All other probes
 * are disarmed by __disarm_kprobe(kp, reopt) while holding the cpus read
 * lock and text_mutex (taken in that order); that path always returns 0.
 */
static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
        if (unlikely(kprobe_ftrace(kp)))
                return disarm_kprobe_ftrace(kp);

        cpus_read_lock();
        mutex_lock(&text_mutex);
        __disarm_kprobe(kp, reopt);
        mutex_unlock(&text_mutex);
        cpus_read_unlock();

        return 0;
}

/*
 * Aggregate handlers for multiple kprobes support - these handlers
 * take care of invoking the individual kprobe handlers on p->list
 */

/*
 * Run the pre_handler of every enabled probe on p->list.
 *
 * Returns 1 as soon as one pre_handler returns nonzero (the remaining
 * probes are not called), otherwise 0.
 */
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        struct kprobe *kp;

        list_for_each_entry_rcu(kp, &p->list, list) {
                if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
                        /* Record which child probe's handler is running */
                        set_kprobe_instance(kp);
                        /*
                         * Note: on this early return the instance set above
                         * is not reset.
                         */
                        if (kp->pre_handler(kp, regs))
                                return 1;
                }
                reset_kprobe_instance();
        }
        return 0;
}
NOKPROBE_SYMBOL(aggr_pre_handler);

/*
 * Run the post_handler of every enabled probe on p->list, marking each one
 * as the current kprobe instance for the duration of its handler.
 */
static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
                              unsigned long flags)
{
        struct kprobe *kp;

        list_for_each_entry_rcu(kp, &p->list, list) {
                /* Skip probes without a post_handler and disabled ones */
                if (!kp->post_handler || unlikely(kprobe_disabled(kp)))
                        continue;

                set_kprobe_instance(kp);
                kp->post_handler(kp, regs, flags);
                reset_kprobe_instance();
        }
}
NOKPROBE_SYMBOL(aggr_post_handler);

/*
 * Fault handler of an aggrprobe.
 *
 * If the fault happened while a user-specified probe handler was running
 * (i.e. this CPU's kprobe_instance is set), only that probe's fault_handler
 * is invoked.
 *
 * Returns 1 if that fault_handler returned nonzero, otherwise 0.
 */
static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
                              int trapnr)
{
        struct kprobe *cur = __this_cpu_read(kprobe_instance);

        if (!cur || !cur->fault_handler)
                return 0;

        return cur->fault_handler(cur, regs, trapnr) ? 1 : 0;
}
NOKPROBE_SYMBOL(aggr_fault_handler);

/*
 * Bump the nmissed counter of @p. For an aggrprobe the counter of every
 * probe on p->list is incremented instead of the one of @p itself.
 */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
        struct kprobe *kp;

        if (kprobe_aggrprobe(p)) {
                list_for_each_entry_rcu(kp, &p->list, list)
                        kp->nmissed++;
                return;
        }

        p->nmissed++;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);

/*
 * Take @ri off the hash list it is on and give it back: to the
 * free_instances list of its kretprobe (under rp->lock) when ri->rp is
 * still set, or to kfree_rcu() when it is NULL.
 */
static void recycle_rp_inst(struct kretprobe_instance *ri)
{
        struct kretprobe *rp = ri->rp;

        /* remove rp inst off the rprobe_inst_table */
        hlist_del(&ri->hlist);
        INIT_HLIST_NODE(&ri->hlist);

        if (unlikely(!rp)) {
                kfree_rcu(ri, rcu);
                return;
        }

        raw_spin_lock(&rp->lock);
        hlist_add_head(&ri->hlist, &rp->free_instances);
        raw_spin_unlock(&rp->lock);
}
NOKPROBE_SYMBOL(recycle_rp_inst);

/*
 * Look up the kretprobe_inst_table bucket for @tsk, store it in *@head and
 * take that bucket's lock with interrupts saved into *@flags.
 */
static void kretprobe_hash_lock(struct task_struct *tsk,
                         struct hlist_head **head, unsigned long *flags)
__acquires(hlist_lock)
{
        unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
        raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);

        *head = &kretprobe_inst_table[hash];

        /*
         * The nested annotation is a workaround that will soon not be
         * needed: other protections, which lockdep is unaware of, make sure
         * the same lock is not taken twice on the same CPU. The subclass
         * differentiates acquisition in NMI context.
         */
        raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
}
NOKPROBE_SYMBOL(kretprobe_hash_lock);

/*
 * Take the kretprobe table lock for bucket @hash, saving the interrupt
 * state into *@flags.
 */
static void kretprobe_table_lock(unsigned long hash,
                                 unsigned long *flags)
__acquires(hlist_lock)
{
        raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
        /*
         * Nested is a workaround that will soon not be needed.
         * There's other protections that make sure the same lock
         * is not taken on the same CPU that lockdep is unaware of.
         * Differentiate when it is taken in NMI context.
         */
        raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
}
NOKPROBE_SYMBOL(kretprobe_table_lock);

/*
 * Release the kretprobe table lock of the bucket @tsk hashes to and restore
 * the interrupt state from *@flags.
 */
static void kretprobe_hash_unlock(struct task_struct *tsk,
                           unsigned long *flags)
__releases(hlist_lock)
{
        raw_spinlock_t *hlist_lock =
                kretprobe_table_lock_ptr(hash_ptr(tsk, KPROBE_HASH_BITS));

        raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_hash_unlock);

/*
 * Release the kretprobe table lock for bucket @hash and restore the
 * interrupt state from *@flags.
 */
static void kretprobe_table_unlock(unsigned long hash,
                                   unsigned long *flags)
__releases(hlist_lock)
{
        raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
        raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);

/*
 * Placeholder kprobe that kprobe_busy_begin() installs as this CPU's
 * current_kprobe. Its address is set to get_kprobe.
 */
static struct kprobe kprobe_busy = {
        .addr = (void *) get_kprobe,
};

void kprobe_busy_begin(void)
{
        struct kprobe_ctlblk *kcb;

        preempt_disable();
        __this_cpu_write(current_kprobe, &kprobe_busy);
        kcb = get_kprobe_ctlblk();
        kcb->kprobe_status = KPROBE_HIT_ACTIVE;
}

/*
 * Counterpart of kprobe_busy_begin(): clear this CPU's current_kprobe and
 * re-enable preemption.
 */
void kprobe_busy_end(void)
{
        __this_cpu_write(current_kprobe, NULL);
        preempt_enable();
}

/*
 * Called from finish_task_switch when task @tk becomes dead, so that any
 * function-return probe instances still associated with it can be
 * recycled. Such leftover instances belong to probed functions that were
 * entered but will never return.
 */
void kprobe_flush_task(struct task_struct *tk)
{
        struct kretprobe_instance *ri;
        struct hlist_head *bucket;
        struct hlist_node *next;
        unsigned long flags = 0;
        unsigned long hash;

        /* Early boot.  kretprobe_table_locks not yet initialized. */
        if (unlikely(!kprobes_initialized))
                return;

        kprobe_busy_begin();

        hash = hash_ptr(tk, KPROBE_HASH_BITS);
        bucket = &kretprobe_inst_table[hash];

        kretprobe_table_lock(hash, &flags);
        hlist_for_each_entry_safe(ri, next, bucket, hlist) {
                /* The bucket may also hold instances of other tasks */
                if (ri->task != tk)
                        continue;
                recycle_rp_inst(ri);
        }
        kretprobe_table_unlock(hash, &flags);

        kprobe_busy_end();
}
NOKPROBE_SYMBOL(kprobe_flush_task);

/* Unlink and kfree() every instance on the free_instances list of @rp. */
static inline void free_rp_inst(struct kretprobe *rp)
{
        struct kretprobe_instance *inst;
        struct hlist_node *tmp;

        hlist_for_each_entry_safe(inst, tmp, &rp->free_instances, hlist) {
                hlist_del(&inst->hlist);
                kfree(inst);
        }
}

/*
 * Detach @rp from every instance in kretprobe_inst_table that still points
 * at it (ri->rp is set to NULL), then free the instances on its
 * free_instances list.
 */
static void cleanup_rp_inst(struct kretprobe *rp)
{
        struct kretprobe_instance *ri;
        struct hlist_head *bucket;
        struct hlist_node *tmp;
        unsigned long flags;
        unsigned long hash;

        /* To avoid recursive kretprobe by NMI, set kprobe busy here */
        kprobe_busy_begin();
        for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
                kretprobe_table_lock(hash, &flags);
                bucket = &kretprobe_inst_table[hash];
                hlist_for_each_entry_safe(ri, tmp, bucket, hlist) {
                        if (ri->rp != rp)
                                continue;
                        ri->rp = NULL;
                }
                kretprobe_table_unlock(hash, &flags);
        }
        kprobe_busy_end();

        free_rp_inst(rp);
}
NOKPROBE_SYMBOL(cleanup_rp_inst);

/*
 * Add the new probe @p to ap->list.
 *
 * When @p has a post_handler, @ap is first forcibly unoptimized (falling
 * back to a normal kprobe) and, after the insertion, gets
 * aggr_post_handler installed if it had no post_handler yet.
 *
 * Always returns 0.
 */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
        if (p->post_handler)
                unoptimize_kprobe(ap, true);

        list_add_rcu(&p->list, &ap->list);

        if (!ap->post_handler && p->post_handler)
                ap->post_handler = aggr_post_handler;

        return 0;
}

/*
 * Fill in the required fields of the "manager kprobe". Replace the
 * earlier kprobe in the hlist with the manager kprobe
 *
 * @ap: the aggrprobe ("manager kprobe") being initialized.
 * @p:  the existing probe; it becomes the first entry of ap->list and its
 *      hash-list slot is taken over by @ap.
 */
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
        /* Copy p's insn slot to ap */
        copy_kprobe(p, ap);
        flush_insn_slot(ap);
        ap->addr = p->addr;
        /* Inherit p's flags, except that ap does not start out optimized */
        ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
        ap->pre_handler = aggr_pre_handler;
        ap->fault_handler = aggr_fault_handler;
        /* We don't care the kprobe which has gone. */
        if (p->post_handler && !kprobe_gone(p))
                ap->post_handler = aggr_post_handler;

        INIT_LIST_HEAD(&ap->list);
        INIT_HLIST_NODE(&ap->hlist);

        /* ap is fully set up before it replaces p in the hash list */
        list_add_rcu(&p->list, &ap->list);
        hlist_replace_rcu(&p->hlist, &ap->hlist);
}

/*
 * This is the second or subsequent kprobe at the address - handle
 * the intricacies
 *
 * @orig_p: the probe already found at the address; either a plain kprobe
 *          (a new aggrprobe is then created around it) or an aggrprobe.
 * @p:      the new probe to add.
 *
 * Returns 0 on success, -ENOMEM if no aggrprobe could be allocated, or the
 * error from reuse_unused_kprobe(), arch_prepare_kprobe() or arm_kprobe().
 */
static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
        int ret = 0;
        struct kprobe *ap = orig_p;

        cpus_read_lock();

        /* For preparing optimization, jump_label_text_reserved() is called */
        jump_label_lock();
        mutex_lock(&text_mutex);

        if (!kprobe_aggrprobe(orig_p)) {
                /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
                ap = alloc_aggr_kprobe(orig_p);
                if (!ap) {
                        ret = -ENOMEM;
                        goto out;
                }
                init_aggr_kprobe(ap, orig_p);
        } else if (kprobe_unused(ap)) {
                /* This probe is going to die. Rescue it */
                ret = reuse_unused_kprobe(ap);
                if (ret)
                        goto out;
        }

        if (kprobe_gone(ap)) {
                /*
                 * Attempting to insert new probe at the same location that
                 * had a probe in the module vaddr area which already
                 * freed. So, the instruction slot has already been
                 * released. We need a new slot for the new probe.
                 */
                ret = arch_prepare_kprobe(ap);
                if (ret)
                        /*
                         * Even if fail to allocate new slot, don't need to
                         * free aggr_probe. It will be used next time, or
                         * freed by unregister_kprobe.
                         */
                        goto out;

                /* Prepare optimized instructions if possible. */
                prepare_optimized_kprobe(ap);

                /*
                 * Clear gone flag to prevent allocating new slot again, and
                 * set disabled flag because it is not armed yet.
                 */
                ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
                            | KPROBE_FLAG_DISABLED;
        }

        /* Copy ap's insn slot to p */
        copy_kprobe(ap, p);
        ret = add_new_kprobe(ap, p);

out:
        mutex_unlock(&text_mutex);
        jump_label_unlock();
        cpus_read_unlock();

        /*
         * The locks above are dropped before arming: arm_kprobe() takes
         * the cpus read lock and text_mutex itself.
         */
        if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
                ap->flags &= ~KPROBE_FLAG_DISABLED;
                if (!kprobes_all_disarmed) {
                        /* Arm the breakpoint again. */
                        ret = arm_kprobe(ap);
                        if (ret) {
                                /* Roll back: disable ap again and unlink p */
                                ap->flags |= KPROBE_FLAG_DISABLED;
                                list_del_rcu(&p->list);
                                synchronize_rcu();
                        }
                }
        }
        return ret;
}

/*
 * Weak default: report whether @addr lies inside
 * [__kprobes_text_start, __kprobes_text_end). The __kprobes marked
 * functions and entry code must not be probed.
 */
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
        unsigned long start = (unsigned long)__kprobes_text_start;
        unsigned long end = (unsigned long)__kprobes_text_end;

        return start <= addr && addr < end;
}

/*
 * Return true if @addr is rejected by the arch blacklist check or falls
 * inside [start_addr, end_addr) of any kprobe_blacklist entry.
 */
static bool __within_kprobe_blacklist(unsigned long addr)
{
        struct kprobe_blacklist_entry *entry;

        if (arch_within_kprobe_blacklist(addr))
                return true;

        /*
         * If there exists a kprobe_blacklist, verify and
         * fail any probe registration in the prohibited area
         */
        list_for_each_entry(entry, &kprobe_blacklist, list) {
                if (entry->start_addr <= addr && addr < entry->end_addr)
                        return true;
        }

        return false;
}

/*
 * Return true if @addr is blacklisted, either directly or through the
 * un-suffixed form of its symbol: when the symbol name contains a '.', the
 * part before the first '.' is looked up and that address is checked too.
 */
bool within_kprobe_blacklist(unsigned long addr)
{
        char symname[KSYM_NAME_LEN], *dot;

        if (__within_kprobe_blacklist(addr))
                return true;

        /* Check if the address is on a suffixed-symbol */
        if (lookup_symbol_name(addr, symname))
                return false;

        dot = strchr(symname, '.');
        if (!dot)
                return false;

        /* Cut the suffix off and retry with the base symbol */
        *dot = '\0';
        addr = (unsigned long)kprobe_lookup_name(symname, 0);
        if (!addr)
                return false;

        return __within_kprobe_blacklist(addr);
}

/*
 * Resolve a probe address from either a raw address or a symbol name.
 *
 * If a symbol_name argument is given, look it up and add the offset field
 * to it, so that a relative address to a symbol can be specified; with a
 * raw @addr the offset is added to it directly.
 *
 * Returns the resulting address, or an encoded error: -EINVAL when both or
 * neither of @symbol_name and @addr are given or the result is NULL,
 * -ENOENT when the symbol lookup fails.
 */
static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
                        const char *symbol_name, unsigned int offset)
{
        /* Exactly one of @symbol_name and @addr must be supplied */
        if (!symbol_name == !addr)
                return ERR_PTR(-EINVAL);

        if (symbol_name) {
                addr = kprobe_lookup_name(symbol_name, offset);
                if (!addr)
                        return ERR_PTR(-ENOENT);
        }

        addr = (kprobe_opcode_t *)((char *)addr + offset);
        if (!addr)
                return ERR_PTR(-EINVAL);

        return addr;
}

/*
 * Resolve the probe address of @p from its addr, symbol_name and offset
 * fields via _kprobe_addr(); returns that function's result (an address or
 * an ERR_PTR-encoded error).
 */
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
        return _kprobe_addr(p->addr, p->symbol_name, p->offset);
}

/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
        struct kprobe *ap, *list_p;

        lockdep_assert_held(&kprobe_mutex);

        ap = get_kprobe(p->addr);
        if (unlikely(!ap))
                return NULL;

        if (p != ap) {
                list_for_each_entry(list_p, &ap->list, list)
                        if (list_p == p)
                        /* kprobe p is a valid probe */
                                goto valid;
                return NULL;
        }
valid:
        return ap;
}

/*
 * Detect re-registration: returns -EINVAL if @p is already a valid
 * registered kprobe, 0 otherwise. Takes kprobe_mutex for the lookup.
 */
static inline int check_kprobe_rereg(struct kprobe *p)
{
        int ret;

        mutex_lock(&kprobe_mutex);
        ret = __get_valid_kprobe(p) ? -EINVAL : 0;
        mutex_unlock(&kprobe_mutex);

        return ret;
}

/*
 * Weak default check of p->addr against ftrace locations.
 *
 * Returns 0 if the address is not an ftrace location. Otherwise, with
 * CONFIG_KPROBES_ON_FTRACE, returns -EILSEQ when p->addr differs from the
 * address reported by ftrace_location(), or sets KPROBE_FLAG_FTRACE on @p
 * and returns 0; without it, returns -EINVAL.
 */
int __weak arch_check_ftrace_location(struct kprobe *p)
{
        unsigned long ftrace_addr = ftrace_location((unsigned long)p->addr);

        if (!ftrace_addr)
                return 0;

#ifdef CONFIG_KPROBES_ON_FTRACE
        /* Given address is not on the instruction boundary */
        if ((unsigned long)p->addr != ftrace_addr)
                return -EILSEQ;

        p->flags |= KPROBE_FLAG_FTRACE;
        return 0;
#else        /* !CONFIG_KPROBES_ON_FTRACE */
        return -EINVAL;
#endif
}

/*
 * Tell whether @addr resolves to a CFI preamble symbol, i.e. a symbol whose
 * name starts with "__cfi_" or "__pfx_".
 *
 * Returns false when no symbol name can be looked up for @addr.
 */
static bool is_cfi_preamble_symbol(unsigned long addr)
{
        char symbuf[KSYM_NAME_LEN];

        if (lookup_symbol_name(addr, symbuf))
                return false;

        /*
         * str_has_prefix() takes (string, prefix): the looked-up symbol name
         * is the string under test, the literal is the prefix.
         */
        return str_has_prefix(symbuf, "__cfi_") ||
               str_has_prefix(symbuf, "__pfx_");
}

/*
 * Check whether p->addr is a location that may be probed.
 *
 * @p:          probe to check; arch_check_ftrace_location() may update its
 *              flags.
 * @probed_mod: set to the module containing p->addr (with a reference
 *              taken) or NULL. It is only written once the address checks
 *              have passed; on the earlier failure paths it is left
 *              untouched.
 *
 * Returns 0 if the address is acceptable, the error from
 * arch_check_ftrace_location(), -EINVAL for a reserved or non-text
 * address, or -ENOENT if the module is going away or the address is in
 * module init text of a module that is no longer COMING.
 */
static int check_kprobe_address_safe(struct kprobe *p,
                                     struct module **probed_mod)
{
        int ret;

        ret = arch_check_ftrace_location(p);
        if (ret)
                return ret;
        jump_label_lock();
        preempt_disable();

        /* Ensure it is not in reserved area nor out of text */
        if (!(core_kernel_text((unsigned long) p->addr) ||
            is_module_text_address((unsigned long) p->addr)) ||
            in_gate_area_no_mm((unsigned long) p->addr) ||
            within_kprobe_blacklist((unsigned long) p->addr) ||
            jump_label_text_reserved(p->addr, p->addr) ||
            static_call_text_reserved(p->addr, p->addr) ||
            find_bug((unsigned long)p->addr) ||
            is_cfi_preamble_symbol((unsigned long)p->addr)) {
                ret = -EINVAL;
                goto out;
        }

        /* Check if are we probing a module */
        *probed_mod = __module_text_address((unsigned long) p->addr);
        if (*probed_mod) {
                /*
                 * We must hold a refcount of the probed module while updating
                 * its code to prohibit unexpected unloading.
                 */
                if (unlikely(!try_module_get(*probed_mod))) {
                        ret = -ENOENT;
                        goto out;
                }

                /*
                 * If the module freed .init.text, we couldn't insert
                 * kprobes in there.
                 */
                if (within_module_init((unsigned long)p->addr, *probed_mod) &&
                    (*probed_mod)->state != MODULE_STATE_COMING) {
                        /* Drop the reference taken just above */
                        module_put(*probed_mod);
                        *probed_mod = NULL;
                        ret = -ENOENT;
                }
        }
out:
        preempt_enable();
        jump_label_unlock();

        return ret;
}

/*
 * Register a kprobe.
 *
 * Resolves the probe address from @p's addr/symbol_name/offset, rejects
 * re-registration and unsafe addresses, then either aggregates @p with a
 * probe already present at that address or prepares, hashes and (unless
 * disabled or all kprobes are disarmed) arms it as a new probe, finally
 * trying to optimize it.
 *
 * Returns 0 on success or a negative error code from one of the steps.
 */
int register_kprobe(struct kprobe *p)
{
        int ret;
        struct kprobe *old_p;
        struct module *probed_mod;
        kprobe_opcode_t *addr;

        /* Adjust probe address from symbol */
        addr = kprobe_addr(p);
        if (IS_ERR(addr))
                return PTR_ERR(addr);
        p->addr = addr;

        ret = check_kprobe_rereg(p);
        if (ret)
                return ret;

        /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
        p->flags &= KPROBE_FLAG_DISABLED;
        p->nmissed = 0;
        INIT_LIST_HEAD(&p->list);

        /* probed_mod is only read below when this check succeeded */
        ret = check_kprobe_address_safe(p, &probed_mod);
        if (ret)
                return ret;

        mutex_lock(&kprobe_mutex);

        old_p = get_kprobe(p->addr);
        if (old_p) {
                /* Since this may unoptimize old_p, locking text_mutex. */
                ret = register_aggr_kprobe(old_p, p);
                goto out;
        }

        cpus_read_lock();
        /* Prevent text modification */
        mutex_lock(&text_mutex);
        ret = prepare_kprobe(p);
        mutex_unlock(&text_mutex);
        cpus_read_unlock();
        if (ret)
                goto out;

        INIT_HLIST_NODE(&p->hlist);
        hlist_add_head_rcu(&p->hlist,
                       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);

        if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
                ret = arm_kprobe(p);
                if (ret) {
                        /* Arming failed: take p out of the hash table again */
                        hlist_del_rcu(&p->hlist);
                        synchronize_rcu();
                        goto out;
                }
        }

        /* Try to optimize kprobe */
        try_to_optimize_kprobe(p);
out:
        mutex_unlock(&kprobe_mutex);

        /* Release the module reference taken by check_kprobe_address_safe() */
        if (probed_mod)
                module_put(probed_mod);

        return ret;
}
EXPORT_SYMBOL_GPL(register_kprobe);

/* Check if all probes on the aggrprobe are disabled */
static int aggr_kprobe_disabled(struct kprobe *ap)
{
        struct kprobe *kp;

        lockdep_assert_held(&kprobe_mutex);

        list_for_each_entry(kp, &ap->list, list)
                if (!kprobe_disabled(kp))
                        /*
                         * There is an active probe on the list.
                         * We can't disable this ap.
                         */
                        return 0;

        return 1;
}

/*
 * Disable one kprobe: Make sure called under kprobe_mutex is locked
 *
 * Returns the kprobe found in kprobe_table for @p (which is @p itself or
 * its aggrprobe), ERR_PTR(-EINVAL) if @p is not a registered probe, or an
 * ERR_PTR carrying the error from disarm_kprobe().
 */
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
        struct kprobe *orig_p;
        int ret;

        /* Get an original kprobe for return */
        orig_p = __get_valid_kprobe(p);
        if (unlikely(orig_p == NULL))
                return ERR_PTR(-EINVAL);

        if (!kprobe_disabled(p)) {
                /* Disable probe if it is a child probe */
                if (p != orig_p)
                        p->flags |= KPROBE_FLAG_DISABLED;

                /* Try to disarm and disable this/parent probe */
                if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
                        /*
                         * Don't be lazy here.  Even if 'kprobes_all_disarmed'
                         * is false, 'orig_p' might not have been armed yet.
                         * Note arm_all_kprobes() __tries__ to arm all kprobes
                         * on the best effort basis.
                         */
                        if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
                                ret = disarm_kprobe(orig_p, true);
                                if (ret) {
                                        /* Disarm failed: undo the child's disabled flag */
                                        p->flags &= ~KPROBE_FLAG_DISABLED;
                                        return ERR_PTR(ret);
                                }
                        }
                        orig_p->flags |= KPROBE_FLAG_DISABLED;
                }
        }

        return orig_p;
}

/*
 * Unregister a kprobe without a scheduler synchronization.
 *
 * Disables @p and then either removes the registered probe from the hash
 * list (independent probe, or last child of a disarmed aggrprobe) or just
 * unlinks @p from its aggrprobe's child list.
 *
 * Returns 0 on success, or the negative error reported by
 * __disable_kprobe(). The callers visible in this file hold kprobe_mutex
 * around this call and run synchronize_rcu() afterwards.
 */
static int __unregister_kprobe_top(struct kprobe *p)
{
        struct kprobe *ap, *list_p;

        /* Disable kprobe. This will disarm it if needed. */
        ap = __disable_kprobe(p);
        if (IS_ERR(ap))
                return PTR_ERR(ap);

        if (ap == p)
                /*
                 * This probe is an independent (and non-optimized) kprobe
                 * (not an aggrprobe). Remove from the hash list.
                 */
                goto disarmed;

        /* The following code expects this probe to be an aggrprobe */
        WARN_ON(!kprobe_aggrprobe(ap));

        if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
                /*
                 * The probe may still be !disarmed here if it is under
                 * delayed unoptimizing.
                 */
                goto disarmed;
        else {
                /* If disabling probe has special handlers, update aggrprobe */
                if (p->post_handler && !kprobe_gone(p)) {
                        /* Keep ap's post_handler if another child still has one. */
                        list_for_each_entry(list_p, &ap->list, list) {
                                if ((list_p != p) && (list_p->post_handler))
                                        goto noclean;
                        }
                        /*
                         * For the kprobe-on-ftrace case, we keep the
                         * post_handler setting to identify this aggrprobe
                         * armed with kprobe_ipmodify_ops.
                         */
                        if (!kprobe_ftrace(ap))
                                ap->post_handler = NULL;
                }
noclean:
                /*
                 * Remove from the aggrprobe: this path will do nothing in
                 * __unregister_kprobe_bottom().
                 */
                list_del_rcu(&p->list);
                if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
                        /*
                         * Try to optimize this probe again, because post
                         * handler may have been changed.
                         */
                        optimize_kprobe(ap);
        }
        return 0;

disarmed:
        /* Unhash the registered probe (@p itself or its aggrprobe). */
        hlist_del_rcu(&ap->hlist);
        return 0;
}

/*
 * Second half of unregistration: release what __unregister_kprobe_top()
 * left behind for @p. The callers in this file invoke it after
 * synchronize_rcu().
 */
static void __unregister_kprobe_bottom(struct kprobe *p)
{
        struct kprobe *aggr;

        /* An independent kprobe only owns its arch-specific resources. */
        if (list_empty(&p->list)) {
                arch_remove_kprobe(p);
                return;
        }

        /* Still chained with siblings (or already unlinked): nothing to do. */
        if (!list_is_singular(&p->list))
                return;

        /* @p is the last child: unlink it and free the aggrprobe. */
        aggr = list_entry(p->list.next, struct kprobe, list);
        list_del(&p->list);
        free_aggr_kprobe(aggr);
}

/*
 * Register @num kprobes from the array @kps.
 *
 * Returns -EINVAL when @num is not positive. If one registration fails,
 * the probes registered so far are unregistered again and that error is
 * returned; otherwise the result of the last registration is returned.
 */
int register_kprobes(struct kprobe **kps, int num)
{
        int idx, err = 0;

        if (num <= 0)
                return -EINVAL;

        for (idx = 0; idx < num; idx++) {
                err = register_kprobe(kps[idx]);
                if (err >= 0)
                        continue;

                /* Roll back everything registered before the failure. */
                if (idx > 0)
                        unregister_kprobes(kps, idx);
                return err;
        }

        return err;
}
EXPORT_SYMBOL_GPL(register_kprobes);

/* Unregister a single kprobe by handing it to unregister_kprobes(). */
void unregister_kprobe(struct kprobe *p)
{
        unregister_kprobes(&p, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);

/*
 * Unregister @num kprobes from the array @kps.
 *
 * All probes are unhooked under kprobe_mutex first; a probe whose top-half
 * fails gets its ->addr cleared so the bottom-half skips it. A single
 * synchronize_rcu() then covers the whole batch before resources are freed.
 */
void unregister_kprobes(struct kprobe **kps, int num)
{
        int idx;

        if (num <= 0)
                return;

        mutex_lock(&kprobe_mutex);
        for (idx = 0; idx < num; idx++) {
                if (__unregister_kprobe_top(kps[idx]) >= 0)
                        continue;
                /* Mark as "not unregistered" for the second pass. */
                kps[idx]->addr = NULL;
        }
        mutex_unlock(&kprobe_mutex);

        synchronize_rcu();

        for (idx = 0; idx < num; idx++) {
                if (!kps[idx]->addr)
                        continue;
                __unregister_kprobe_bottom(kps[idx]);
        }
}
EXPORT_SYMBOL_GPL(unregister_kprobes);

/*
 * Default notifier callback used by kprobe_exceptions_nb: ignores the
 * event and returns NOTIFY_DONE. Declared __weak so that another
 * definition can replace it.
 */
int __weak kprobe_exceptions_notify(struct notifier_block *self,
                                        unsigned long val, void *data)
{
        return NOTIFY_DONE;
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);

/*
 * Notifier block registered with register_die_notifier() in init_kprobes().
 * It uses the maximum priority value so that kprobes is notified first.
 */
static struct notifier_block kprobe_exceptions_nb = {
        .notifier_call = kprobe_exceptions_notify,
        .priority = 0x7fffffff /* we need to be notified first */
};

/*
 * Default (weak) conversion of a blacklist entry to a code address: the
 * pointer value is returned unchanged. Declared __weak so that another
 * definition can replace it.
 */
unsigned long __weak arch_deref_entry_point(void *entry)
{
        return (unsigned long)entry;
}

#ifdef CONFIG_KRETPROBES

/*
 * Run the return handlers of the current task for the frame identified by
 * @frame_pointer and find the real return address.
 *
 * @regs:               registers passed on to each kretprobe handler
 * @trampoline_address: address of the trampoline; an instance whose
 *                      ret_addr equals it does not hold the real address
 * @frame_pointer:      frame the returning instances must match
 *
 * The instance list of the current task is walked twice under the
 * kretprobe hash lock: once to find the real return address, once to call
 * the handlers and recycle the instances. Returns the real return address;
 * BUG()s if none was found.
 */
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
                                             void *trampoline_address,
                                             void *frame_pointer)
{
        struct kretprobe_instance *ri = NULL, *last = NULL;
        struct hlist_head *head;
        struct hlist_node *tmp;
        unsigned long flags;
        kprobe_opcode_t *correct_ret_addr = NULL;
        bool skipped = false;

        kretprobe_hash_lock(current, &head, &flags);

        /*
         * It is possible to have multiple instances associated with a given
         * task either because multiple functions in the call path have
         * return probes installed on them, and/or more than one
         * return probe was registered for a target function.
         *
         * We can handle this because:
         *     - instances are always pushed into the head of the list
         *     - when multiple return probes are registered for the same
         *         function, the (chronologically) first instance's ret_addr
         *         will be the real return address, and all the rest will
         *         point to kretprobe_trampoline.
         */
        hlist_for_each_entry(ri, head, hlist) {
                if (ri->task != current)
                        /* another task is sharing our hash bucket */
                        continue;
                /*
                 * Return probes must be pushed on this hash list in the
                 * correct order (same as return order) so that they can be
                 * popped correctly. However, if we find an entry pushed in
                 * the incorrect order, this means we found a function which
                 * should not be probed, because the wrong-order entry was
                 * pushed on the path of processing another kretprobe itself.
                 */
                if (ri->fp != frame_pointer) {
                        if (!skipped)
                                pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n");
                        skipped = true;
                        continue;
                }

                correct_ret_addr = ri->ret_addr;
                if (skipped)
                        pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n",
                                ri->rp->kp.addr);

                if (correct_ret_addr != trampoline_address)
                        /*
                         * This is the real return address. Any other
                         * instances associated with this task are for
                         * other calls deeper on the call stack
                         */
                        break;
        }

        BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address));
        /* Remember where the first pass stopped; the second pass ends there. */
        last = ri;

        hlist_for_each_entry_safe(ri, tmp, head, hlist) {
                if (ri->task != current)
                        /* another task is sharing our hash bucket */
                        continue;
                if (ri->fp != frame_pointer)
                        continue;

                if (ri->rp && ri->rp->handler) {
                        struct kprobe *prev = kprobe_running();

                        /* Expose this kretprobe as the running kprobe during the handler. */
                        __this_cpu_write(current_kprobe, &ri->rp->kp);
                        ri->ret_addr = correct_ret_addr;
                        ri->rp->handler(ri, regs);
                        __this_cpu_write(current_kprobe, prev);
                }

                recycle_rp_inst(ri);

                if (ri == last)
                        break;
        }

        kretprobe_hash_unlock(current, &flags);

        return (unsigned long)correct_ret_addr;
}
NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)

/*
 * This kprobe pre_handler is registered with every kretprobe. When the
 * probe hits, it sets up the return probe.
 *
 * It takes one instance from the kretprobe's free list, runs the optional
 * entry_handler, lets the architecture prepare the return hook and queues
 * the instance on the per-task hash table. If entry_handler returns
 * non-zero the instance goes back to the free list; if no free instance is
 * available, rp->nmissed is incremented. Always returns 0.
 */
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
        struct kretprobe *rp = container_of(p, struct kretprobe, kp);
        unsigned long hash, flags = 0;
        struct kretprobe_instance *ri;

        /* TODO: consider to only swap the RA after the last pre_handler fired */
        hash = hash_ptr(current, KPROBE_HASH_BITS);
        /*
         * Nested is a workaround that will soon not be needed.
         * There's other protections that make sure the same lock
         * is not taken on the same CPU that lockdep is unaware of.
         */
        raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
        if (!hlist_empty(&rp->free_instances)) {
                /* Take the first free instance while holding rp->lock. */
                ri = hlist_entry(rp->free_instances.first,
                                struct kretprobe_instance, hlist);
                hlist_del(&ri->hlist);
                raw_spin_unlock_irqrestore(&rp->lock, flags);

                ri->rp = rp;
                ri->task = current;

                if (rp->entry_handler && rp->entry_handler(ri, regs)) {
                        /* entry_handler rejected this hit: give the instance back. */
                        raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
                        hlist_add_head(&ri->hlist, &rp->free_instances);
                        raw_spin_unlock_irqrestore(&rp->lock, flags);
                        return 0;
                }

                arch_prepare_kretprobe(ri, regs);

                /* XXX(hch): why is there no hlist_move_head? */
                INIT_HLIST_NODE(&ri->hlist);
                kretprobe_table_lock(hash, &flags);
                hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
                kretprobe_table_unlock(hash, &flags);
        } else {
                /* No free instance left: count the missed hit. */
                rp->nmissed++;
                raw_spin_unlock_irqrestore(&rp->lock, flags);
        }
        return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);

/*
 * Default (weak) function-entry test: only a zero offset from the symbol
 * counts as the function entry.
 */
bool __weak arch_kprobe_on_func_entry(unsigned long offset)
{
        return offset == 0;
}

/**
 * kprobe_on_func_entry() -- check whether given address is function entry
 * @addr: Target address
 * @sym:  Target symbol name
 * @offset: The offset from the symbol or the address
 *
 * Resolve @addr+@offset or @sym+@offset to a probe address and test
 * whether it is the entry address of a function.
 *
 * Return: 0 if it is the function entry, -EINVAL if it is not, -ENOENT if
 * the symbol size/offset lookup for the resolved address fails, or the
 * error produced while resolving the address. Caller must pass @addr or
 * @sym (either one must be NULL), or this returns -EINVAL.
 */
int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
        kprobe_opcode_t *target;
        unsigned long sym_off;

        target = _kprobe_addr(addr, sym, offset);
        if (IS_ERR(target))
                return PTR_ERR(target);

        /* Recompute the offset relative to the symbol containing the target. */
        sym_off = offset;
        if (!kallsyms_lookup_size_offset((unsigned long)target, NULL, &sym_off))
                return -ENOENT;

        return arch_kprobe_on_func_entry(sym_off) ? 0 : -EINVAL;
}

/*
 * Register a kretprobe.
 *
 * Validates that the probe point is a function entry and not on the
 * kretprobe blacklist, pre-allocates rp->maxactive instances and finally
 * registers the entry kprobe.
 *
 * Returns 0 on success; -EINVAL for a re-registered or blacklisted probe,
 * -E2BIG when rp->data_size exceeds KRETPROBE_MAX_DATA_SIZE, -ENOMEM when
 * an instance cannot be allocated, or the error of the address checks or
 * of register_kprobe().
 */
int register_kretprobe(struct kretprobe *rp)
{
        struct kretprobe_instance *ri;
        void *entry;
        int n, err;

        err = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
        if (err)
                return err;

        /* If only rp->kp.addr is specified, check reregistering kprobes */
        if (rp->kp.addr && check_kprobe_rereg(&rp->kp))
                return -EINVAL;

        if (kretprobe_blacklist_size) {
                entry = kprobe_addr(&rp->kp);
                if (IS_ERR(entry))
                        return PTR_ERR(entry);

                /* The blacklist is terminated by an entry without a name. */
                for (n = 0; kretprobe_blacklist[n].name; n++)
                        if (kretprobe_blacklist[n].addr == entry)
                                return -EINVAL;
        }

        if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
                return -E2BIG;

        rp->kp.pre_handler = pre_handler_kretprobe;
        rp->kp.post_handler = NULL;
        rp->kp.fault_handler = NULL;

        /* Pick a default instance count when the caller gave none. */
        if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPTION
                rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
                rp->maxactive = num_possible_cpus();
#endif
        }

        /* Pre-allocate memory for max kretprobe instances */
        raw_spin_lock_init(&rp->lock);
        INIT_HLIST_HEAD(&rp->free_instances);
        for (n = 0; n < rp->maxactive; n++) {
                ri = kmalloc(sizeof(*ri) + rp->data_size, GFP_KERNEL);
                if (!ri) {
                        free_rp_inst(rp);
                        return -ENOMEM;
                }
                INIT_HLIST_NODE(&ri->hlist);
                hlist_add_head(&ri->hlist, &rp->free_instances);
        }

        rp->nmissed = 0;

        /* Establish function entry probe point */
        err = register_kprobe(&rp->kp);
        if (err)
                free_rp_inst(rp);

        return err;
}
EXPORT_SYMBOL_GPL(register_kretprobe);

/*
 * Register @num kretprobes from the array @rps.
 *
 * Returns -EINVAL when @num is not positive. If one registration fails,
 * the kretprobes registered so far are unregistered again and that error
 * is returned; otherwise the result of the last registration is returned.
 */
int register_kretprobes(struct kretprobe **rps, int num)
{
        int idx, err = 0;

        if (num <= 0)
                return -EINVAL;

        for (idx = 0; idx < num; idx++) {
                err = register_kretprobe(rps[idx]);
                if (err >= 0)
                        continue;

                /* Roll back everything registered before the failure. */
                if (idx > 0)
                        unregister_kretprobes(rps, idx);
                return err;
        }

        return err;
}
EXPORT_SYMBOL_GPL(register_kretprobes);

/* Unregister a single kretprobe by handing it to unregister_kretprobes(). */
void unregister_kretprobe(struct kretprobe *rp)
{
        unregister_kretprobes(&rp, 1);
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);

/*
 * Unregister @num kretprobes from the array @rps.
 *
 * The entry kprobes are unhooked under kprobe_mutex first; a kretprobe
 * whose top-half fails gets kp.addr cleared so the second pass skips it.
 * After one synchronize_rcu() the remaining probes are released and their
 * instances cleaned up.
 */
void unregister_kretprobes(struct kretprobe **rps, int num)
{
        struct kretprobe *rp;
        int idx;

        if (num <= 0)
                return;

        mutex_lock(&kprobe_mutex);
        for (idx = 0; idx < num; idx++) {
                rp = rps[idx];
                if (__unregister_kprobe_top(&rp->kp) < 0)
                        rp->kp.addr = NULL;
        }
        mutex_unlock(&kprobe_mutex);

        synchronize_rcu();

        for (idx = 0; idx < num; idx++) {
                rp = rps[idx];
                if (!rp->kp.addr)
                        continue;
                __unregister_kprobe_bottom(&rp->kp);
                cleanup_rp_inst(rp);
        }
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);

#else /* CONFIG_KRETPROBES */
/* Stub for builds without CONFIG_KRETPROBES: registration always fails with -ENOSYS. */
int register_kretprobe(struct kretprobe *rp)
{
        return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobe);

/* Stub for builds without CONFIG_KRETPROBES: registration always fails with -ENOSYS. */
int register_kretprobes(struct kretprobe **rps, int num)
{
        return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobes);

/* Stub for builds without CONFIG_KRETPROBES: nothing to unregister. */
void unregister_kretprobe(struct kretprobe *rp)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);

/* Stub for builds without CONFIG_KRETPROBES: nothing to unregister. */
void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);

/*
 * Stub for builds without CONFIG_KRETPROBES. It keeps the symbol available
 * for code outside the #ifdef (report_probe() compares against it) and
 * always returns 0.
 */
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
        return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);

#endif /* CONFIG_KRETPROBES */

/*
 * Set the kprobe gone and remove its instruction buffer.
 *
 * For an aggrprobe every chained child is marked GONE as well. Must be
 * called with kprobe_mutex held. Warns once and returns without touching
 * anything if @p is already marked GONE.
 */
static void kill_kprobe(struct kprobe *p)
{
        struct kprobe *kp;

        lockdep_assert_held(&kprobe_mutex);

        if (WARN_ON_ONCE(kprobe_gone(p)))
                return;

        /*
         * The module is going away. We should disarm the kprobe which
         * is using ftrace, because ftrace framework is still available at
         * MODULE_STATE_GOING notification.
         *
         * This has to happen before KPROBE_FLAG_GONE is set and before the
         * aggrprobe's post_handler is cleared: kprobe_disabled() also
         * reports a GONE probe as disabled, so checking it afterwards would
         * never disarm an enabled ftrace-based probe.
         */
        if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
                disarm_kprobe_ftrace(p);

        p->flags |= KPROBE_FLAG_GONE;
        if (kprobe_aggrprobe(p)) {
                /*
                 * If this is an aggr_kprobe, we have to list all the
                 * chained probes and mark them GONE.
                 */
                list_for_each_entry(kp, &p->list, list)
                        kp->flags |= KPROBE_FLAG_GONE;
                p->post_handler = NULL;
                kill_optimized_kprobe(p);
        }
        /*
         * Here, we can remove insn_slot safely, because no thread calls
         * the original probed function (which will be freed soon) any more.
         */
        arch_remove_kprobe(p);
}

/*
 * Disable one kprobe.
 *
 * Takes kprobe_mutex around __disable_kprobe(). Returns 0 on success or
 * the negative error encoded in the pointer __disable_kprobe() returned.
 */
int disable_kprobe(struct kprobe *kp)
{
        struct kprobe *p;
        int ret;

        mutex_lock(&kprobe_mutex);
        p = __disable_kprobe(kp);
        ret = IS_ERR(p) ? PTR_ERR(p) : 0;
        mutex_unlock(&kprobe_mutex);

        return ret;
}
EXPORT_SYMBOL_GPL(disable_kprobe);

/*
 * Enable one kprobe.
 *
 * Returns 0 on success, -EINVAL when @kp is not a valid registered probe
 * or is already gone, or the error from arm_kprobe(). On an arming failure
 * the DISABLED flags are restored.
 */
int enable_kprobe(struct kprobe *kp)
{
        struct kprobe *p;
        int ret = -EINVAL;

        mutex_lock(&kprobe_mutex);

        /* Check whether specified probe is valid. */
        p = __get_valid_kprobe(kp);
        if (unlikely(!p))
                goto unlock;

        /* This kprobe has gone, we couldn't enable it. */
        if (kprobe_gone(kp))
                goto unlock;

        ret = 0;

        /* A child probe carries its own DISABLED flag. */
        if (p != kp)
                kp->flags &= ~KPROBE_FLAG_DISABLED;

        /* Nothing to arm if probes are globally disarmed or p is already enabled. */
        if (kprobes_all_disarmed || !kprobe_disabled(p))
                goto unlock;

        p->flags &= ~KPROBE_FLAG_DISABLED;
        ret = arm_kprobe(p);
        if (ret) {
                /* Arming failed: put the flags back. */
                p->flags |= KPROBE_FLAG_DISABLED;
                if (p != kp)
                        kp->flags |= KPROBE_FLAG_DISABLED;
        }

unlock:
        mutex_unlock(&kprobe_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(enable_kprobe);

/*
 * Print the symbol name, offset and address of @kp at error level.
 * Callers must NOT use this on the usual path; it is only meant for
 * critical cases.
 */
void dump_kprobe(struct kprobe *kp)
{
        pr_err("Dumping kprobe:\n");
        pr_err("Name: %s\nOffset: %x\nAddress: %pS\n",
               kp->symbol_name, kp->offset, kp->addr);
}
NOKPROBE_SYMBOL(dump_kprobe);

/*
 * Add the symbol at @entry to the kprobe blacklist.
 *
 * The new blacklist range is [@entry, @entry + symbol size). Returns the
 * symbol size (cast to int) on success, -EINVAL when @entry is not a
 * kernel text address or its symbol cannot be looked up, or -ENOMEM when
 * the entry cannot be allocated.
 */
int kprobe_add_ksym_blacklist(unsigned long entry)
{
        struct kprobe_blacklist_entry *node;
        unsigned long sym_off = 0, sym_size = 0;

        if (!kernel_text_address(entry))
                return -EINVAL;
        if (!kallsyms_lookup_size_offset(entry, &sym_size, &sym_off))
                return -EINVAL;

        node = kmalloc(sizeof(*node), GFP_KERNEL);
        if (!node)
                return -ENOMEM;

        node->start_addr = entry;
        node->end_addr = entry + sym_size;
        INIT_LIST_HEAD(&node->list);
        list_add_tail(&node->list, &kprobe_blacklist);

        return (int)sym_size;
}

/*
 * Add all symbols in the area [@start, @end) to the kprobe blacklist.
 *
 * Walks the area symbol by symbol, advancing by the size each added symbol
 * reports. Returns 0 on success or the first negative error from
 * kprobe_add_ksym_blacklist().
 */
int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
{
        unsigned long cur = start;
        int step;

        while (cur < end) {
                step = kprobe_add_ksym_blacklist(cur);
                if (step < 0)
                        return step;

                /* In case of alias symbol (size 0), still move forward. */
                if (step == 0)
                        step = 1;
                cur += step;
        }

        return 0;
}

/*
 * Remove from the kprobe blacklist every entry whose start address lies in
 * [@start, @end), freeing the removed entries.
 */
static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
{
        struct kprobe_blacklist_entry *cur, *next;

        list_for_each_entry_safe(cur, next, &kprobe_blacklist, list) {
                if (cur->start_addr >= start && cur->start_addr < end) {
                        list_del(&cur->list);
                        kfree(cur);
                }
        }
}

/*
 * Remove the blacklist entries that start exactly at @entry, by removing
 * the one-byte area [@entry, @entry + 1).
 */
static void kprobe_remove_ksym_blacklist(unsigned long entry)
{
        kprobe_remove_area_blacklist(entry, entry + 1);
}

/*
 * Default (weak) arch hook for kprobe_get_kallsym(): provides no symbols
 * and always returns -ERANGE. Declared __weak so that another definition
 * can replace it.
 */
int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
                                   char *type, char *sym)
{
        return -ERANGE;
}

/*
 * Look up the kprobe-owned symbol number @symnum.
 *
 * The instruction-slot cache, the optimized-instruction-slot cache (each
 * only when configured) and the arch hook are asked in turn; the first one
 * that returns 0 wins. Returns 0 when a provider filled @value, @type and
 * @sym, otherwise -ERANGE.
 *
 * NOTE(review): @symnum is passed by address to every provider, so a
 * provider may adjust it before the next one runs -- confirm against
 * kprobe_cache_get_kallsym().
 */
int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                       char *sym)
{
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
        if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
                return 0;
#ifdef CONFIG_OPTPROBES
        if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
                return 0;
#endif
#endif
        if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
                return 0;
        return -ERANGE;
}

/*
 * Default (weak) arch hook called at the end of
 * populate_kprobe_blacklist(): adds nothing and returns 0.
 */
int __init __weak arch_populate_kprobe_blacklist(void)
{
        return 0;
}

/*
 * Lookup and populate the kprobe_blacklist.
 *
 * Unlike the kretprobe blacklist, we'll need to determine
 * the range of addresses that belong to the said functions,
 * since a kprobe need not necessarily be at the beginning
 * of a function.
 *
 * @start/@end delimit an array of entry values. Entries that
 * kprobe_add_ksym_blacklist() rejects with -EINVAL are skipped; any other
 * error aborts. The __kprobes_text and noinstr text areas are blacklisted
 * afterwards, followed by the arch-specific additions.
 */
static int __init populate_kprobe_blacklist(unsigned long *start,
                                             unsigned long *end)
{
        unsigned long *slot;
        unsigned long addr;
        int ret;

        for (slot = start; slot < end; slot++) {
                addr = arch_deref_entry_point((void *)*slot);
                ret = kprobe_add_ksym_blacklist(addr);
                if (ret < 0 && ret != -EINVAL)
                        return ret;
        }

        /* Symbols in __kprobes_text are blacklisted */
        ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
                                        (unsigned long)__kprobes_text_end);
        if (ret)
                return ret;

        /* Symbols in noinstr section are blacklisted */
        ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start,
                                        (unsigned long)__noinstr_text_end);
        if (ret)
                return ret;

        return arch_populate_kprobe_blacklist();
}

/*
 * Blacklist everything a module asks not to be probed: its explicit
 * kprobe_blacklist entries, its kprobes text area and its noinstr text
 * area. Failures of the individual additions are not reported.
 */
static void add_module_kprobe_blacklist(struct module *mod)
{
        unsigned long base;
        int idx;

        if (mod->kprobe_blacklist)
                for (idx = 0; idx < mod->num_kprobe_blacklist; idx++)
                        kprobe_add_ksym_blacklist(mod->kprobe_blacklist[idx]);

        base = (unsigned long)mod->kprobes_text_start;
        if (base)
                kprobe_add_area_blacklist(base, base + mod->kprobes_text_size);

        base = (unsigned long)mod->noinstr_text_start;
        if (base)
                kprobe_add_area_blacklist(base, base + mod->noinstr_text_size);
}

/*
 * Undo add_module_kprobe_blacklist(): drop the module's explicit
 * kprobe_blacklist entries, its kprobes text area and its noinstr text
 * area from the kprobe blacklist.
 */
static void remove_module_kprobe_blacklist(struct module *mod)
{
        unsigned long base;
        int idx;

        if (mod->kprobe_blacklist)
                for (idx = 0; idx < mod->num_kprobe_blacklist; idx++)
                        kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[idx]);

        base = (unsigned long)mod->kprobes_text_start;
        if (base)
                kprobe_remove_area_blacklist(base, base + mod->kprobes_text_size);

        base = (unsigned long)mod->noinstr_text_start;
        if (base)
                kprobe_remove_area_blacklist(base, base + mod->noinstr_text_size);
}

/*
 * Module notifier callback, checking kprobes on the module.
 *
 * MODULE_STATE_COMING: add the module's blacklist entries.
 * MODULE_STATE_LIVE:   kill probes placed in the module's init text.
 * MODULE_STATE_GOING:  kill probes placed in the module's init or core
 *                      text and remove the module's blacklist entries.
 *
 * Always returns NOTIFY_DONE.
 */
static int kprobes_module_callback(struct notifier_block *nb,
                                   unsigned long val, void *data)
{
        struct module *mod = data;
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i;
        /* Core text is only checked when the whole module goes away. */
        int checkcore = (val == MODULE_STATE_GOING);

        if (val == MODULE_STATE_COMING) {
                mutex_lock(&kprobe_mutex);
                add_module_kprobe_blacklist(mod);
                mutex_unlock(&kprobe_mutex);
        }
        if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
                return NOTIFY_DONE;

        /*
         * When MODULE_STATE_GOING was notified, both of module .text and
         * .init.text sections would be freed. When MODULE_STATE_LIVE was
         * notified, only .init.text section would be freed. We need to
         * disable kprobes which have been inserted in the sections.
         */
        mutex_lock(&kprobe_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry(p, head, hlist) {
                        if (kprobe_gone(p))
                                continue;

                        if (within_module_init((unsigned long)p->addr, mod) ||
                            (checkcore &&
                             within_module_core((unsigned long)p->addr, mod))) {
                                /*
                                 * The vaddr this probe is installed at will
                                 * soon be vfreed but not synced to disk.
                                 * Hence, disarming the breakpoint isn't
                                 * needed.
                                 *
                                 * Note, this will also move any optimized probes
                                 * that are pending to be removed from their
                                 * corresponding lists to the freeing_list and
                                 * will not be touched by the delayed
                                 * kprobe_optimizer work handler.
                                 */
                                kill_kprobe(p);
                        }
                }
        }
        if (val == MODULE_STATE_GOING)
                remove_module_kprobe_blacklist(mod);
        mutex_unlock(&kprobe_mutex);
        return NOTIFY_DONE;
}

/* Notifier block registered with register_module_notifier() in init_kprobes(). */
static struct notifier_block kprobe_module_nb = {
        .notifier_call = kprobes_module_callback,
        .priority = 0
};

/* Markers of _kprobe_blacklist section */
extern unsigned long __start_kprobe_blacklist[];
extern unsigned long __stop_kprobe_blacklist[];

void kprobe_free_init_mem(void)
{
        void *start = (void *)(&__init_begin);
        void *end = (void *)(&__init_end);
        struct hlist_head *head;
        struct kprobe *p;
        int i;

        mutex_lock(&kprobe_mutex);

        /* Kill all kprobes on initmem */
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry(p, head, hlist) {
                        if (start <= (void *)p->addr && (void *)p->addr < end)
                                kill_kprobe(p);
                }
        }

        mutex_unlock(&kprobe_mutex);
}

/*
 * Early initcall that sets up the kprobes subsystem.
 *
 * Initializes the hash tables and their locks, populates the kprobe
 * blacklist (a failure is only logged), resolves the kretprobe blacklist
 * names to addresses, then runs the arch init and registers the die and
 * module notifiers. kprobes_initialized is set only if all of the latter
 * succeed. Returns 0 or the first error from those three steps.
 */
static int __init init_kprobes(void)
{
        int i, err = 0;

        /* FIXME allocate the probe table, currently defined statically */
        /* initialize all list heads */
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                INIT_HLIST_HEAD(&kprobe_table[i]);
                INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
                raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
        }

        err = populate_kprobe_blacklist(__start_kprobe_blacklist,
                                        __stop_kprobe_blacklist);
        /* Not fatal: initialization continues with an incomplete blacklist. */
        if (err) {
                pr_err("kprobes: failed to populate blacklist: %d\n", err);
                pr_err("Please take care of using kprobes.\n");
        }

        if (kretprobe_blacklist_size) {
                /* lookup the function address from its name */
                for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
                        kretprobe_blacklist[i].addr =
                                kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
                        /* NOTE(review): this printk carries no explicit log level. */
                        if (!kretprobe_blacklist[i].addr)
                                printk("kretprobe: lookup failed: %s\n",
                                       kretprobe_blacklist[i].name);
                }
        }

        /* By default, kprobes are armed */
        kprobes_all_disarmed = false;

#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
        /* Init kprobe_optinsn_slots for allocation */
        kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
#endif

        /* Each step runs only if every previous one succeeded. */
        err = arch_init_kprobes();
        if (!err)
                err = register_die_notifier(&kprobe_exceptions_nb);
        if (!err)
                err = register_module_notifier(&kprobe_module_nb);

        kprobes_initialized = (err == 0);

        if (!err)
                init_test_probes();
        return err;
}
early_initcall(init_kprobes);

#if defined(CONFIG_OPTPROBES)
/*
 * Subsys initcall (CONFIG_OPTPROBES only) that turns on kprobe
 * optimization. Always returns 0.
 */
static int __init init_optprobes(void)
{
        /*
         * Enable kprobe optimization - this kicks the optimizer which
         * depends on synchronize_rcu_tasks() and ksoftirqd, that is
         * not spawned in early initcall. So delay the optimization.
         */
        optimize_all_kprobes();

        return 0;
}
subsys_initcall(init_optprobes);
#endif

#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
                const char *sym, int offset, char *modname, struct kprobe *pp)
{
        char *kprobe_type;
        void *addr = p->addr;

        if (p->pre_handler == pre_handler_kretprobe)
                kprobe_type = "r";
        else
                kprobe_type = "k";

        if (!kallsyms_show_value(pi->file->f_cred))
                addr = NULL;

        if (sym)
                seq_printf(pi, "%px  %s  %s+0x%x  %s ",
                        addr, kprobe_type, sym, offset,
                        (modname ? modname : " "));
        else        /* try to use %pS */
                seq_printf(pi, "%px  %s  %pS ",
                        addr, kprobe_type, p->addr);

        if (!pp)
                pp = p;
        seq_printf(pi, "%s%s%s%s\n",
                (kprobe_gone(p) ? "[GONE]" : ""),
                ((kprobe_disabled(p) && !kprobe_gone(p)) ?  "[DISABLED]" : ""),
                (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
                (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}

/*
 * seq_file start callback: the position itself is the iterator, valid
 * while it is below KPROBE_TABLE_SIZE.
 */
static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
{
        if (*pos < KPROBE_TABLE_SIZE)
                return pos;

        return NULL;
}

/*
 * seq_file next callback: advance the position by one and stop once it
 * reaches KPROBE_TABLE_SIZE.
 */
static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
        loff_t next = *pos + 1;

        *pos = next;
        return (next < KPROBE_TABLE_SIZE) ? pos : NULL;
}

/* seq_file stop callback: the iterator holds no state, so there is nothing to release. */
static void kprobe_seq_stop(struct seq_file *f, void *v)
{
        /* Nothing to do */
}

/*
 * seq_file ->show callback for the probe list: report every probe hashed
 * into the kprobe_table bucket selected by the iterator @v.
 *
 * For an aggregated probe, each child on its list gets its own line,
 * with the aggregator passed to report_probe() as the state source.
 * Always returns 0.
 */
static int show_kprobe_addr(struct seq_file *pi, void *v)
{
        unsigned int idx = *(loff_t *)v;
        struct hlist_head *bucket = &kprobe_table[idx];
        struct kprobe *probe, *child;
        const char *sym = NULL;
        unsigned long offset = 0;
        char *modname, namebuf[KSYM_NAME_LEN];

        /*
         * NOTE(review): preemption is disabled around the _rcu list walks,
         * presumably as their read-side protection - confirm.
         */
        preempt_disable();
        hlist_for_each_entry_rcu(probe, bucket, hlist) {
                sym = kallsyms_lookup((unsigned long)probe->addr, NULL,
                                        &offset, &modname, namebuf);
                if (!kprobe_aggrprobe(probe)) {
                        report_probe(pi, probe, sym, offset, modname, NULL);
                        continue;
                }

                list_for_each_entry_rcu(child, &probe->list, list)
                        report_probe(pi, child, sym, offset, modname, probe);
        }
        preempt_enable();

        return 0;
}

/*
 * seq_file iterator for the probe list: one ->show call per kprobe_table
 * bucket.
 */
static const struct seq_operations kprobes_sops = {
        .start = kprobe_seq_start,
        .next  = kprobe_seq_next,
        .stop  = kprobe_seq_stop,
        .show  = show_kprobe_addr
};

/*
 * NOTE(review): presumably this is what defines kprobes_fops (used by
 * debugfs_kprobe_init()) on top of kprobes_sops - confirm against the
 * DEFINE_SEQ_ATTRIBUTE definition.
 */
DEFINE_SEQ_ATTRIBUTE(kprobes);

/* kprobes/blacklist -- shows which functions can not be probed */
/*
 * seq_file ->start callback for the blacklist file. Takes kprobe_mutex,
 * which kprobe_blacklist_seq_stop() releases, and positions the iterator
 * at entry *pos of kprobe_blacklist.
 */
static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
{
        mutex_lock(&kprobe_mutex);
        return seq_list_start(&kprobe_blacklist, *pos);
}

/*
 * seq_file ->next callback for the blacklist file: advance the iterator
 * @v along kprobe_blacklist using seq_list_next().
 */
static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
        return seq_list_next(v, &kprobe_blacklist, pos);
}

/*
 * seq_file ->show callback for the blacklist file: print one entry as
 * "0x<start>-0x<end>\t<symbol>". Always returns 0.
 */
static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
{
        struct kprobe_blacklist_entry *ent =
                list_entry(v, struct kprobe_blacklist_entry, list);
        void *sym_addr = (void *)ent->start_addr;
        void *range_start = NULL;
        void *range_end = NULL;

        /*
         * If /proc/kallsyms is not showing kernel addresses, don't show
         * them here either: both bounds stay NULL and only the symbol
         * is printed from the real start address.
         */
        if (kallsyms_show_value(m->file->f_cred)) {
                range_start = sym_addr;
                range_end = (void *)ent->end_addr;
        }

        seq_printf(m, "0x%px-0x%px\t%ps\n", range_start, range_end, sym_addr);
        return 0;
}

/*
 * seq_file ->stop callback for the blacklist file: drop the kprobe_mutex
 * taken in kprobe_blacklist_seq_start().
 */
static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v)
{
        mutex_unlock(&kprobe_mutex);
}

/*
 * seq_file iterator for the blacklist file; kprobe_mutex is held from
 * ->start to ->stop.
 */
static const struct seq_operations kprobe_blacklist_sops = {
        .start = kprobe_blacklist_seq_start,
        .next  = kprobe_blacklist_seq_next,
        .stop  = kprobe_blacklist_seq_stop,
        .show  = kprobe_blacklist_seq_show,
};
/*
 * NOTE(review): presumably defines kprobe_blacklist_fops (used by
 * debugfs_kprobe_init()) - confirm against DEFINE_SEQ_ATTRIBUTE.
 */
DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist);

/*
 * Globally enable kprobes: clear kprobes_all_disarmed and arm every probe
 * in kprobe_table that is not individually disabled. Runs under
 * kprobe_mutex.
 *
 * Arming is best-effort: a failure on one probe is counted and the walk
 * continues. Returns 0 when kprobes were already armed or every probe was
 * armed, otherwise the error code of the last arm_kprobe() failure.
 */
static int arm_all_kprobes(void)
{
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i, total = 0, errors = 0;
        int err, ret = 0;

        mutex_lock(&kprobe_mutex);

        /* If kprobes are armed, just return */
        if (!kprobes_all_disarmed)
                goto already_enabled;

        /*
         * optimize_kprobe() called by arm_kprobe() checks
         * kprobes_all_disarmed, so set kprobes_all_disarmed before
         * arm_kprobe.
         */
        kprobes_all_disarmed = false;
        /* Arming kprobes doesn't optimize kprobe itself */
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                /* Arm all kprobes on a best-effort basis */
                hlist_for_each_entry(p, head, hlist) {
                        if (!kprobe_disabled(p)) {
                                err = arm_kprobe(p);
                                if (err)  {
                                        errors++;
                                        ret = err;
                                }
                                total++;
                        }
                }
        }

        /* The counters are unsigned int, so they are printed with %u. */
        if (errors)
                pr_warn("Kprobes globally enabled, but failed to arm %u out of %u probes\n",
                        errors, total);
        else
                pr_info("Kprobes globally enabled\n");

already_enabled:
        mutex_unlock(&kprobe_mutex);
        return ret;
}

/*
 * Globally disable kprobes: set kprobes_all_disarmed and disarm every
 * probe in kprobe_table that is neither an arch trampoline probe nor
 * individually disabled. The table walk runs under kprobe_mutex.
 *
 * Disarming is best-effort: a failure on one probe is counted and the
 * walk continues. After the mutex is dropped, the function waits for the
 * kprobe optimizer before returning.
 *
 * Returns 0 when kprobes were already disarmed or every probe was
 * disarmed, otherwise the error code of the last disarm_kprobe() failure.
 */
static int disarm_all_kprobes(void)
{
        struct hlist_head *head;
        struct kprobe *p;
        unsigned int i, total = 0, errors = 0;
        int err, ret = 0;

        mutex_lock(&kprobe_mutex);

        /* If kprobes are already disarmed, just return */
        if (kprobes_all_disarmed) {
                mutex_unlock(&kprobe_mutex);
                return 0;
        }

        kprobes_all_disarmed = true;

        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                /* Disarm all kprobes on a best-effort basis */
                hlist_for_each_entry(p, head, hlist) {
                        if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) {
                                err = disarm_kprobe(p, false);
                                if (err) {
                                        errors++;
                                        ret = err;
                                }
                                total++;
                        }
                }
        }

        /* The counters are unsigned int, so they are printed with %u. */
        if (errors)
                pr_warn("Kprobes globally disabled, but failed to disarm %u out of %u probes\n",
                        errors, total);
        else
                pr_info("Kprobes globally disabled\n");

        mutex_unlock(&kprobe_mutex);

        /* Wait for disarming all kprobes by optimizer */
        wait_for_kprobe_optimizer();

        return ret;
}

/*
 * XXX: The debugfs bool file interface doesn't allow for callbacks
 * when the bool state is switched. We can reuse that facility when
 * available
 */
static ssize_t read_enabled_file_bool(struct file *file,
               char __user *user_buf, size_t count, loff_t *ppos)
{
        /*
         * "1\n" while kprobes are armed, "0\n" while they are globally
         * disarmed. Only the two visible bytes are offered to the reader;
         * the trailing NUL is never copied out.
         */
        char buf[3] = { kprobes_all_disarmed ? '0' : '1', '\n', 0x00 };

        return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}

/*
 * Write handler for the "enabled" file. Only the first byte written
 * matters: 'y', 'Y' or '1' arms all kprobes, 'n', 'N' or '0' disarms
 * them.
 *
 * Returns @count on success, -EFAULT if the user buffer cannot be read,
 * -EINVAL for any other first byte (including an empty write), or the
 * error reported by arm_all_kprobes()/disarm_all_kprobes().
 */
static ssize_t write_enabled_file_bool(struct file *file,
               const char __user *user_buf, size_t count, loff_t *ppos)
{
        char kbuf[32];
        size_t len = min(count, (sizeof(kbuf)-1));
        int err;

        if (copy_from_user(kbuf, user_buf, len))
                return -EFAULT;

        /* Terminate, so an empty write leaves kbuf[0] == '\0'. */
        kbuf[len] = '\0';

        switch (kbuf[0]) {
        case '1':
        case 'y':
        case 'Y':
                err = arm_all_kprobes();
                break;
        case '0':
        case 'n':
        case 'N':
                err = disarm_all_kprobes();
                break;
        default:
                return -EINVAL;
        }

        if (err)
                return err;

        /* Report the whole write as consumed, whatever its length. */
        return count;
}

/*
 * File operations of the debugfs "enabled" file created in
 * debugfs_kprobe_init(): reading reports the global armed state, writing
 * arms or disarms all kprobes.
 */
static const struct file_operations fops_kp = {
        .read =         read_enabled_file_bool,
        .write =        write_enabled_file_bool,
        .llseek =        default_llseek,
};

/*
 * Create the debugfs "kprobes" directory and its three files:
 *   list      (0400) - registered probes
 *   enabled   (0600) - global arm/disarm switch
 *   blacklist (0400) - functions that can not be probed
 *
 * The return values of the debugfs calls are not checked; always
 * returns 0.
 */
static int __init debugfs_kprobe_init(void)
{
        struct dentry *kprobes_dir = debugfs_create_dir("kprobes", NULL);

        debugfs_create_file("list", 0400, kprobes_dir, NULL, &kprobes_fops);
        debugfs_create_file("enabled", 0600, kprobes_dir, NULL, &fops_kp);
        debugfs_create_file("blacklist", 0400, kprobes_dir, NULL,
                            &kprobe_blacklist_fops);

        return 0;
}

late_initcall(debugfs_kprobe_init);
#endif /* CONFIG_DEBUG_FS */















    1 




    1 


    1 







































    1 







    1 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/interval_tree.c - interval tree for mapping->i_mmap
 *
 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>
#include <linux/interval_tree_generic.h>

/* First page offset covered by @v: the low end of its interval. */
static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
{
        return v->vm_pgoff;
}

/*
 * Last page offset covered by @v: the high end of its interval. The
 * bound is inclusive, hence the "- 1" after adding vma_pages(v).
 */
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
        return v->vm_pgoff + vma_pages(v) - 1;
}

/*
 * Instantiate the generic interval tree for struct vm_area_struct: nodes
 * are linked through shared.rb, the per-subtree maximum is kept in
 * shared.rb_subtree_last, and the interval bounds come from
 * vma_start_pgoff()/vma_last_pgoff(). Generated names carry the
 * vma_interval_tree prefix (e.g. vma_interval_tree_augment).
 * NOTE(review): the empty argument before the prefix is presumably the
 * storage-class slot (the anon_vma_chain instance passes "static inline"
 * there) - confirm against <linux/interval_tree_generic.h>.
 */
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
                     unsigned long, shared.rb_subtree_last,
                     vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)

/*
 * Insert @node immediately after @prev in the interval tree @root.
 *
 * Both VMAs must start at the same page offset (checked with
 * VM_BUG_ON_VMA). @node becomes either the right child of @prev, or the
 * leftmost node of @prev's right subtree.
 */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root)
{
        struct rb_node **link;
        struct vm_area_struct *parent = prev;
        unsigned long last = vma_last_pgoff(node);

        VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);

        if (prev->shared.rb.rb_right) {
                struct rb_node *cur = prev->shared.rb.rb_right;

                /*
                 * Walk down the left spine of prev's right subtree. Every
                 * node visited will have @node below it, so raise its
                 * cached subtree maximum where @node's end exceeds it.
                 */
                do {
                        parent = rb_entry(cur, struct vm_area_struct,
                                          shared.rb);
                        if (parent->shared.rb_subtree_last < last)
                                parent->shared.rb_subtree_last = last;
                        cur = parent->shared.rb.rb_left;
                } while (cur);

                link = &parent->shared.rb.rb_left;
        } else {
                /* No right subtree: hang @node directly off @prev. */
                link = &prev->shared.rb.rb_right;
        }

        node->shared.rb_subtree_last = last;
        rb_link_node(&node->shared.rb, &parent->shared.rb, link);
        rb_insert_augmented(&node->shared.rb, &root->rb_root,
                            &vma_interval_tree_augment);
}

/* Low end of @avc's interval: the start page offset of its VMA. */
static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
{
        return vma_start_pgoff(avc->vma);
}

/* High (inclusive) end of @avc's interval: the last page offset of its VMA. */
static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
{
        return vma_last_pgoff(avc->vma);
}

/*
 * Instantiate the generic interval tree for struct anon_vma_chain: nodes
 * are linked through rb, the per-subtree maximum is kept in
 * rb_subtree_last, and the interval bounds come from
 * avc_start_pgoff()/avc_last_pgoff(). The generated functions are
 * "static inline" with the __anon_vma_interval_tree prefix; the
 * anon_vma_interval_tree_*() functions below wrap them.
 */
INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
                     avc_start_pgoff, avc_last_pgoff,
                     static inline, __anon_vma_interval_tree)

/*
 * Insert @node into the anon_vma interval tree @root.
 *
 * With CONFIG_DEBUG_VM_RB, the interval bounds in effect at insertion
 * time are recorded in @node first, so anon_vma_interval_tree_verify()
 * can later detect that they changed.
 */
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root)
{
#ifdef CONFIG_DEBUG_VM_RB
        node->cached_vma_start = avc_start_pgoff(node);
        node->cached_vma_last = avc_last_pgoff(node);
#endif
        __anon_vma_interval_tree_insert(node, root);
}

/*
 * Remove @node from the anon_vma interval tree @root. Non-static wrapper
 * around the generated __anon_vma_interval_tree_remove().
 */
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root)
{
        __anon_vma_interval_tree_remove(node, root);
}

/*
 * Start an iteration over @root for the page-offset range [@first, @last].
 * Non-static wrapper around the generated
 * __anon_vma_interval_tree_iter_first(), whose result is returned
 * unchanged.
 * NOTE(review): presumably yields the first overlapping node, or NULL if
 * there is none - confirm against <linux/interval_tree_generic.h>.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long first, unsigned long last)
{
        return __anon_vma_interval_tree_iter_first(root, first, last);
}

/*
 * Continue an iteration over [@first, @last] from @node. Non-static
 * wrapper around the generated __anon_vma_interval_tree_iter_next(),
 * whose result is returned unchanged.
 * NOTE(review): presumably yields the next overlapping node after @node,
 * or NULL when done - confirm against <linux/interval_tree_generic.h>.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
                                 unsigned long first, unsigned long last)
{
        return __anon_vma_interval_tree_iter_next(node, first, last);
}

#ifdef CONFIG_DEBUG_VM_RB
/*
 * Debug check (CONFIG_DEBUG_VM_RB only): warn once if @node's current
 * interval bounds differ from the ones cached by
 * anon_vma_interval_tree_insert().
 */
void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
{
        WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
        WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
}
#endif































































































































    1 








    1 










    1 



    1 









    1 


    1 











    1 


    1 

    1 
    1 
    1 
    1 



















































































































    1 


    1 


































































































































    1 














    1 


















    1 

















    1 




























    1 







    1 
    1 











    1 





    1 

    1 

    1 


    1 




    1 
    1 

















    1 


    1 










































































    1 


    1 






    1 

    1 



    1 



    1 









    1 









    1 
    1 









    1 












    1 



























    1 









    1 




















































































    1 
    1 














    1 



    1 
    1 


    1 



    1 



    1 

    1 




    1 



    1 
    1 


    1 





    1 


    1 



    1 



















    1 

















    1 

    1 




    1 







    1 

































































    1 
    1 






















    1 


























































































































    1 












    1 



    1 
    1 
    1 




    1 
    1 
    1 





















    1 







    1 









































































































































    1 















    1 






    1 













    1 




























    1 




    1 


    1 



    1 




    1 



    1 





    1 



    1 






    1 



















    1 






    1 

    1 








    1 




    1 
    1 

    1 







    1 








    1 




    1 

    1 





    1 

    1 







    1 
























    1 









    1 

    1 
    1 
    1 










    1 

    1 




    1 


























    1 



    1 
    1 










































    1 
    1 



    1 


    1 




    1 












    1 

















    1 


    1 
























    1 


















    1 









    1 





    1 


    1 



    1 




    1 
    1 





















    1 






    1 
    1 

    1 































    1 



    1 
    1 
    1 


    1 

    1 


    1 








    1 
    1 










    1 













































































































































































































































































































































































    1 




    1 











    1 



    1 

    1 


    1 
    1 











    1 
    1 


    1 

    1 


    1 

    1 



    1 

    1 


    1 
    1 





































































































































































































































































































































    1 

    1 

    1 
















    1 


    1 









    1 
    1 
    1 




    1 















    1 
    1 





    1 
    1 





    1 


























    1 





    1 



    1 


    1 



    1 



    1 



    1 


    1 



    1 


    1 


    1 












    1 


























    1 


    1 












    1 






    1 









    1 







    1 


    1 






    1 
    1 
























    1 















    1 


    1 




    1 





    1 
















































































































































































































































































































































































































































































    1 






    1 
































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code        <alan@lxorguk.ukuu.org.uk>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>

#include "internal.h"

/*
 * Fallback for architectures that do not provide their own
 * arch_mmap_check(): the arguments are ignored and the check always
 * evaluates to 0.
 */
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)        (0)
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

/*
 * Boolean knob registered as the core parameter "ignore_rlimit_data" with
 * mode 0644.
 * NOTE(review): the name suggests it relaxes RLIMIT_DATA enforcement; the
 * code that reads it is outside this section - confirm there.
 */
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);

static void unmap_region(struct mm_struct *mm,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                unsigned long start, unsigned long end);

/*
 * Description of the effects of mapping type and prot in the current
 * implementation.  This is due to the limited x86 page protection hardware.
 * The expected behavior is in parens:
 *
 * map_type        prot
 *                PROT_NONE        PROT_READ        PROT_WRITE        PROT_EXEC
 * MAP_SHARED        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (yes) yes        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * MAP_PRIVATE        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (copy) copy        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * The table below is indexed by
 * vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED), see vm_get_page_prot().
 * The __Pxxx entries are presumably the private and the __Sxxx entries the
 * shared variants - TODO confirm against the VM_* bit values.
 */
pgprot_t protection_map[16] __ro_after_init = {
        __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
        __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};

#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
/*
 * Default arch_filter_pgprot() used when CONFIG_ARCH_HAS_FILTER_PGPROT is
 * not set: the protection value is returned unchanged.
 */
static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
{
        return prot;
}
#endif

/*
 * Build the page protection that corresponds to @vm_flags.
 *
 * The generic part comes from protection_map[], selected by the
 * read/write/exec/shared bits of @vm_flags; the architecture may contribute
 * extra bits through arch_vm_get_page_prot().  The OR of both is finally
 * passed through arch_filter_pgprot() and returned.
 */
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
        unsigned long idx = vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED);
        pgprot_t generic_prot = protection_map[idx];
        pgprot_t arch_prot = arch_vm_get_page_prot(vm_flags);
        pgprot_t combined;

        combined = __pgprot(pgprot_val(generic_prot) | pgprot_val(arch_prot));

        return arch_filter_pgprot(combined);
}
EXPORT_SYMBOL(vm_get_page_prot);

/*
 * Apply the protection derived from @vm_flags on top of @oldprot using
 * pgprot_modify() and return the result.
 */
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
        pgprot_t flags_prot = vm_get_page_prot(vm_flags);

        return pgprot_modify(oldprot, flags_prot);
}

/*
 * Recompute vma->vm_page_prot from vma->vm_flags.
 *
 * If vma_wants_writenotify() says so for the freshly computed protection,
 * the protection is computed a second time with VM_SHARED masked out of the
 * flags (the vma's own vm_flags are not modified).
 */
void vma_set_page_prot(struct vm_area_struct *vma)
{
        unsigned long flags = vma->vm_flags;
        pgprot_t prot = vm_pgprot_modify(vma->vm_page_prot, flags);

        if (vma_wants_writenotify(vma, prot))
                prot = vm_pgprot_modify(prot, flags & ~VM_SHARED);

        /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
        WRITE_ONCE(vma->vm_page_prot, prot);
}

/*
 * Detach @vma from @mapping's i_mmap interval tree.
 *
 * Before the removal, allow_write_access() is called on @file for
 * VM_DENYWRITE vmas and mapping_unmap_writable() on @mapping for VM_SHARED
 * vmas.  The tree removal itself is bracketed by
 * flush_dcache_mmap_lock()/unlock().
 *
 * Requires inode->i_mapping->i_mmap_rwsem
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                struct file *file, struct address_space *mapping)
{
        if (vma->vm_flags & VM_DENYWRITE)
                allow_write_access(file);
        if (vma->vm_flags & VM_SHARED)
                mapping_unmap_writable(mapping);

        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_remove(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * Take a file-backed vma out of its mapping's interval tree so that rmap
 * and vmtruncate no longer see it; this is done before the vma's page
 * tables are freed.  Nothing happens for a vma without a backing file.
 * The mapping's i_mmap lock is held for writing around the removal.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        if (!file)
                return;

        mapping = file->f_mapping;
        i_mmap_lock_write(mapping);
        __remove_shared_vm_struct(vma, file, mapping);
        i_mmap_unlock_write(mapping);
}

/*
 * Close a vm structure and free it, returning the next.
 *
 * In order: the ->close() operation is invoked if the vma has one, the
 * reference on vma->vm_file (if any) is dropped, the memory policy
 * reference is put, and the vma itself is freed.  The successor is read
 * up front because @vma must not be touched once it has been freed.
 * May sleep.
 */
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
        struct vm_area_struct *next = vma->vm_next;

        might_sleep();
        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);
        if (vma->vm_file)
                fput(vma->vm_file);
        mpol_put(vma_policy(vma));
        vm_area_free(vma);
        return next;
}

static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
                struct list_head *uf);
/*
 * brk() system call: move the end of the calling process's heap to @brk.
 *
 * Returns the requested @brk on success and the previous mm->brk if the
 * request is rejected or fails; -EINTR is returned if taking the mmap write
 * lock was interrupted by a fatal signal.
 *
 * Outline:
 *  - reject values below the minimum break and values that would exceed
 *    RLIMIT_DATA;
 *  - if the page-aligned old and new breaks are equal, only mm->brk changes;
 *  - shrinking unmaps [newbrk, oldbrk) through __do_munmap(), which may
 *    leave mmap_lock downgraded to read mode (return value 1);
 *  - growing fails if the new range (plus one page) would run into the next
 *    vma's start gap, otherwise the range is mapped through do_brk_flags().
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
        unsigned long retval;
        unsigned long newbrk, oldbrk, origbrk;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *next;
        unsigned long min_brk;
        bool populate;
        bool downgraded = false;
        LIST_HEAD(uf);

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* Remembered so that every failure path can report/restore it. */
        origbrk = mm->brk;

#ifdef CONFIG_COMPAT_BRK
        /*
         * CONFIG_COMPAT_BRK can still be overridden by setting
         * randomize_va_space to 2, which will still cause mm->start_brk
         * to be arbitrarily shifted
         */
        if (current->brk_randomized)
                min_brk = mm->start_brk;
        else
                min_brk = mm->end_data;
#else
        min_brk = mm->start_brk;
#endif
        if (brk < min_brk)
                goto out;

        /*
         * Check against rlimit here. If this check is done later, after the
         * test of oldbrk with newbrk, then it can escape the test and let
         * the data segment grow beyond its set limit in the case where the
         * limit is not page aligned -Ram Gupta
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                              mm->end_data, mm->start_data))
                goto out;

        newbrk = PAGE_ALIGN(brk);
        oldbrk = PAGE_ALIGN(mm->brk);
        /* Same page-aligned end: no mapping change, just record the value. */
        if (oldbrk == newbrk) {
                mm->brk = brk;
                goto success;
        }

        /*
         * Always allow shrinking brk.
         * __do_munmap() may downgrade mmap_lock to read.
         */
        if (brk <= mm->brk) {
                int ret;

                /*
                 * mm->brk must be protected by write mmap_lock, so update it
                 * before downgrading mmap_lock. When __do_munmap() fails,
                 * mm->brk will be restored from origbrk.
                 */
                mm->brk = brk;
                ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
                if (ret < 0) {
                        mm->brk = origbrk;
                        goto out;
                } else if (ret == 1) {
                        /* Lock is now held for read; unlock accordingly. */
                        downgraded = true;
                }
                goto success;
        }

        /* Check against existing mmap mappings. */
        next = find_vma(mm, oldbrk);
        if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
                goto out;

        /* Ok, looks good - let it rip. */
        if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
                goto out;
        mm->brk = brk;

success:
        /* Only a grown heap with VM_LOCKED default flags is populated. */
        populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
        if (downgraded)
                mmap_read_unlock(mm);
        else
                mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(oldbrk, newbrk - oldbrk);
        return brk;

out:
        /* Failure: the lock is still held for write on every path here. */
        retval = origbrk;
        mmap_write_unlock(mm);
        return retval;
}

/*
 * Size of the free gap immediately below @vma.
 *
 * Without a predecessor this is simply vm_start_gap(vma).  Otherwise it is
 * the distance from the predecessor's vm_end_gap() up to vm_start_gap(vma),
 * clamped to zero when the two (guard-gap adjusted) ranges meet or overlap.
 */
static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
{
        unsigned long gap_end, gap_start;

        /*
         * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
         * allow two stack_guard_gaps between them here, and when choosing
         * an unmapped area; whereas when expanding we only require one.
         * That's a little inconsistent, but keeps the code here simpler.
         */
        gap_end = vm_start_gap(vma);
        if (!vma->vm_prev)
                return gap_end;

        gap_start = vm_end_gap(vma->vm_prev);
        if (gap_end <= gap_start)
                return 0;

        return gap_end - gap_start;
}

#ifdef CONFIG_DEBUG_VM_RB
/*
 * Debug helper: recompute what @vma's rb_subtree_gap ought to be, i.e. the
 * largest of the vma's own gap and the cached rb_subtree_gap values of its
 * left and right rbtree children (when present).
 */
static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
        unsigned long best = vma_compute_gap(vma);
        unsigned long child_gap;
        struct rb_node *child;

        child = vma->vm_rb.rb_left;
        if (child) {
                child_gap = rb_entry(child, struct vm_area_struct,
                                     vm_rb)->rb_subtree_gap;
                if (best < child_gap)
                        best = child_gap;
        }

        child = vma->vm_rb.rb_right;
        if (child) {
                child_gap = rb_entry(child, struct vm_area_struct,
                                     vm_rb)->rb_subtree_gap;
                if (best < child_gap)
                        best = child_gap;
        }

        return best;
}

/*
 * Debug helper: walk @mm's vma rbtree in order and sanity-check it.
 *
 * For each vma it verifies that start addresses do not go backwards, that
 * the vma does not start before the previous one ended, that vm_start is
 * not above vm_end, and that the cached rb_subtree_gap matches a fresh
 * computation.  Afterwards the tree is walked backwards from the last node
 * and the two node counts are compared.
 *
 * Returns the number of vmas visited, or -1 if any inconsistency was
 * reported (each one is logged with pr_emerg()).
 */
static int browse_rb(struct mm_struct *mm)
{
        struct rb_root *root = &mm->mm_rb;
        int i = 0, j, bug = 0;
        struct rb_node *nd, *pn = NULL;
        unsigned long prev = 0, pend = 0;

        for (nd = rb_first(root); nd; nd = rb_next(nd)) {
                struct vm_area_struct *vma;
                vma = rb_entry(nd, struct vm_area_struct, vm_rb);
                if (vma->vm_start < prev) {
                        pr_emerg("vm_start %lx < prev %lx\n",
                                  vma->vm_start, prev);
                        bug = 1;
                }
                if (vma->vm_start < pend) {
                        pr_emerg("vm_start %lx < pend %lx\n",
                                  vma->vm_start, pend);
                        bug = 1;
                }
                if (vma->vm_start > vma->vm_end) {
                        pr_emerg("vm_start %lx > vm_end %lx\n",
                                  vma->vm_start, vma->vm_end);
                        bug = 1;
                }
                /*
                 * NOTE(review): the gap comparison is done under
                 * page_table_lock; presumably some gap updates happen under
                 * that lock rather than mmap_lock - confirm with the writers.
                 */
                spin_lock(&mm->page_table_lock);
                if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
                        pr_emerg("free gap %lx, correct %lx\n",
                               vma->rb_subtree_gap,
                               vma_compute_subtree_gap(vma));
                        bug = 1;
                }
                spin_unlock(&mm->page_table_lock);
                i++;
                /* Remember the last node so we can walk back from it. */
                pn = nd;
                prev = vma->vm_start;
                pend = vma->vm_end;
        }
        /* Count again in reverse; both directions must agree. */
        j = 0;
        for (nd = pn; nd; nd = rb_prev(nd))
                j++;
        if (i != j) {
                pr_emerg("backwards %d, forwards %d\n", j, i);
                bug = 1;
        }
        return bug ? -1 : i;
}

/*
 * Debug helper: visit every vma in the rbtree @root and trigger
 * VM_BUG_ON_VMA() if its cached rb_subtree_gap differs from a fresh
 * computation.  The vma passed as @ignore (may be NULL) is exempt from the
 * check.
 */
static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
{
        struct rb_node *node = rb_first(root);

        while (node) {
                struct vm_area_struct *vma =
                        rb_entry(node, struct vm_area_struct, vm_rb);

                VM_BUG_ON_VMA(vma != ignore &&
                        vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
                        vma);

                node = rb_next(node);
        }
}

/*
 * Debug helper: cross-check the bookkeeping of @mm.
 *
 * Walks the vma list, verifying each vma's anon_vma interval tree entries
 * (under the anon_vma read lock) and counting the vmas.  The count must
 * match mm->map_count, the vm_end_gap() of the last vma must match
 * mm->highest_vm_end, and the rbtree walk done by browse_rb() must yield
 * the same count.  Any mismatch is logged and ends in VM_BUG_ON_MM().
 */
static void validate_mm(struct mm_struct *mm)
{
        int bug = 0;
        int i = 0;
        unsigned long highest_address = 0;
        struct vm_area_struct *vma = mm->mmap;

        while (vma) {
                struct anon_vma *anon_vma = vma->anon_vma;
                struct anon_vma_chain *avc;

                if (anon_vma) {
                        anon_vma_lock_read(anon_vma);
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                anon_vma_interval_tree_verify(avc);
                        anon_vma_unlock_read(anon_vma);
                }

                /* Overwritten each iteration: ends up as the last vma's value. */
                highest_address = vm_end_gap(vma);
                vma = vma->vm_next;
                i++;
        }
        if (i != mm->map_count) {
                pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
                bug = 1;
        }
        if (highest_address != mm->highest_vm_end) {
                pr_emerg("mm->highest_vm_end %lx, found %lx\n",
                          mm->highest_vm_end, highest_address);
                bug = 1;
        }
        i = browse_rb(mm);
        if (i != mm->map_count) {
                /* -1 means browse_rb() already reported the problem itself. */
                if (i != -1)
                        pr_emerg("map_count %d rb %d\n", mm->map_count, i);
                bug = 1;
        }
        VM_BUG_ON_MM(bug, mm);
}
#else
#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif

/*
 * Instantiate the static augmented-rbtree callbacks "vma_gap_callbacks" for
 * struct vm_area_struct nodes linked through vm_rb.  The augmented value is
 * the unsigned long field rb_subtree_gap, and a node's own contribution is
 * computed by vma_compute_gap().
 */
RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
                         struct vm_area_struct, vm_rb,
                         unsigned long, rb_subtree_gap, vma_compute_gap)

/*
 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
 * vma->vm_prev->vm_end values changed, without modifying the vma's position
 * in the rbtree.
 *
 * This is a thin wrapper around the propagate callback generated by
 * RB_DECLARE_CALLBACKS_MAX(), invoked on @vma's node with a NULL stop node.
 */
static void vma_gap_update(struct vm_area_struct *vma)
{
        /*
         * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
         * a callback function that does exactly what we want.
         */
        vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}

/*
 * Insert @vma's already-linked rb node into the augmented rbtree @root,
 * using vma_gap_callbacks.  With CONFIG_DEBUG_VM_RB the whole tree is
 * validated first (no vma is exempted).
 */
static inline void vma_rb_insert(struct vm_area_struct *vma,
                                 struct rb_root *root)
{
        /* All rb_subtree_gap values must be consistent prior to insertion */
        validate_mm_rb(root, NULL);

        rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}

/*
 * Remove @vma's node from the augmented rbtree @root using
 * vma_gap_callbacks.  No validation is performed here; see
 * vma_rb_erase_ignore() for the checked variant.
 */
static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
{
        /*
         * Note rb_erase_augmented is a fairly large inline function,
         * so make sure we instantiate it only once with our desired
         * augmented rbtree callbacks.
         */
        rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}

/*
 * Validate the rbtree @root (skipping the gap check for @ignore), then
 * erase @vma from it.  The validation is a no-op unless CONFIG_DEBUG_VM_RB
 * is enabled.
 */
static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
                                                struct rb_root *root,
                                                struct vm_area_struct *ignore)
{
        /*
         * All rb_subtree_gap values must be consistent prior to erase,
         * with the possible exception of
         *
         * a. the "next" vma being erased if next->vm_start was reduced in
         *    __vma_adjust() -> __vma_unlink()
         * b. the vma being erased in detach_vmas_to_be_unmapped() ->
         *    vma_rb_erase()
         */
        validate_mm_rb(root, ignore);

        __vma_rb_erase(vma, root);
}

/*
 * Erase @vma from the rbtree @root; the vma being erased is itself the one
 * exempted from the pre-erase gap validation.
 */
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
                                         struct rb_root *root)
{
        vma_rb_erase_ignore(vma, root, vma);
}

/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 * vma must be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_lock and by
 * the root anon_vma's mutex.
 */
static inline void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

static inline void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

/*
 * Locate the rbtree insertion point for a new vma covering [@addr, @end).
 *
 * @mm:        address space to search
 * @addr:      start of the candidate range
 * @end:       end of the candidate range
 * @pprev:     set to the last vma visited that ends at or below @addr,
 *             or NULL if there was none
 * @rb_link:   set to the empty child slot where the new node belongs
 * @rb_parent: set to the node owning that slot (NULL for an empty tree)
 *
 * Returns 0 on success.  Returns -ENOMEM, without writing any of the output
 * parameters, as soon as a visited vma overlaps the range.
 */
static int find_vma_links(struct mm_struct *mm, unsigned long addr,
                unsigned long end, struct vm_area_struct **pprev,
                struct rb_node ***rb_link, struct rb_node **rb_parent)
{
        struct rb_node **slot = &mm->mm_rb.rb_node;
        struct rb_node *parent = NULL;
        struct rb_node *below = NULL;

        while (*slot) {
                struct vm_area_struct *cur;

                parent = *slot;
                cur = rb_entry(parent, struct vm_area_struct, vm_rb);

                if (cur->vm_end <= addr) {
                        /* cur lies entirely below the range: go right. */
                        below = parent;
                        slot = &parent->rb_right;
                        continue;
                }

                /* Fail if an existing vma overlaps the area */
                if (cur->vm_start < end)
                        return -ENOMEM;

                slot = &parent->rb_left;
        }

        if (below)
                *pprev = rb_entry(below, struct vm_area_struct, vm_rb);
        else
                *pprev = NULL;
        *rb_link = slot;
        *rb_parent = parent;
        return 0;
}

/*
 * vma_next() - Step to the following VMA in the mm's list.
 * @mm: The mm_struct.
 * @vma: The current vma, or NULL.
 *
 * A NULL @vma means "before the first one", so the head of the mm's vma
 * list is returned in that case.
 *
 * Returns: The VMA following @vma (may be NULL).
 */
static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
                                         struct vm_area_struct *vma)
{
        return vma ? vma->vm_next : mm->mmap;
}

/*
 * munmap_vma_range() - munmap VMAs that overlap a range.
 * @mm: The mm struct
 * @start: The start of the range.
 * @len: The length of the range.
 * @pprev: pointer to the pointer that will be set to previous vm_area_struct
 * @link: set to the rbtree slot for a vma covering the range
 * @parent: set to the rbtree parent of that slot
 * @uf: list head handed through to do_munmap()
 *
 * As long as find_vma_links() reports something in the way of
 * [@start, @start + @len), unmap the range and look again.  When the range
 * is clear, @pprev, @link and @parent hold what find_vma_links() returned.
 *
 * Returns: -ENOMEM on munmap failure or 0 on success.
 */
static inline int
munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
                 struct vm_area_struct **pprev, struct rb_node ***link,
                 struct rb_node **parent, struct list_head *uf)
{
        for (;;) {
                if (!find_vma_links(mm, start, start + len, pprev, link,
                                    parent))
                        return 0;

                if (do_munmap(mm, start, len, uf))
                        return -ENOMEM;
        }
}
/*
 * Count the pages of [@addr, @end) that are covered by existing vmas of
 * @mm.  Returns 0 when nothing intersects the range.
 */
static unsigned long count_vma_pages_range(struct mm_struct *mm,
                unsigned long addr, unsigned long end)
{
        struct vm_area_struct *vma = find_vma_intersection(mm, addr, end);
        unsigned long covered;

        if (!vma)
                return 0;

        /* The first overlapping vma may begin below addr; clamp both ends. */
        covered = (min(end, vma->vm_end) -
                max(addr, vma->vm_start)) >> PAGE_SHIFT;

        /* Later vmas start above addr, so only their end needs clamping. */
        vma = vma->vm_next;
        while (vma && vma->vm_start <= end) {
                covered += (min(end, vma->vm_end) - vma->vm_start) >> PAGE_SHIFT;
                vma = vma->vm_next;
        }

        return covered;
}

/*
 * Link @vma into @mm's augmented rbtree at the slot described by @rb_link /
 * @rb_parent (as produced by find_vma_links()).
 *
 * vma->vm_next is consulted, so the caller is expected to have put @vma on
 * the mm's vma list already (see __vma_link()).  If @vma is the last vma,
 * mm->highest_vm_end is updated instead of a successor's gap.
 */
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
                struct rb_node **rb_link, struct rb_node *rb_parent)
{
        /* Update tracking information for the gap following the new vma. */
        if (vma->vm_next)
                vma_gap_update(vma->vm_next);
        else
                mm->highest_vm_end = vm_end_gap(vma);

        /*
         * vma->vm_prev wasn't known when we followed the rbtree to find the
         * correct insertion point for that vma. As a result, we could not
         * update the vma vm_rb parents rb_subtree_gap values on the way down.
         * So, we first insert the vma with a zero rb_subtree_gap value
         * (to be consistent with what we did on the way down), and then
         * immediately update the gap to the correct value. Finally we
         * rebalance the rbtree after all augmented values have been set.
         */
        rb_link_node(&vma->vm_rb, rb_parent, rb_link);
        vma->rb_subtree_gap = 0;
        vma_gap_update(vma);
        vma_rb_insert(vma, &mm->mm_rb);
}

/*
 * Add a file-backed @vma to its mapping's i_mmap interval tree; a vma
 * without a file is left alone.
 *
 * Beforehand, put_write_access() is called on the file's inode for
 * VM_DENYWRITE vmas and mapping_allow_writable() on the mapping for
 * VM_SHARED vmas.  The tree insertion is bracketed by
 * flush_dcache_mmap_lock()/unlock().  Taking the i_mmap lock is left to the
 * caller; it is not acquired here.
 */
static void __vma_link_file(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        if (!file)
                return;

        mapping = file->f_mapping;

        if (vma->vm_flags & VM_DENYWRITE)
                put_write_access(file_inode(file));
        if (vma->vm_flags & VM_SHARED)
                mapping_allow_writable(mapping);

        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_insert(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * Link @vma into @mm's vma list (after @prev) and then into its rbtree at
 * the slot given by @rb_link / @rb_parent.  The list insertion comes first
 * because __vma_link_rb() reads vma->vm_next.  mm->map_count is not touched
 * here.
 */
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
        struct vm_area_struct *prev, struct rb_node **rb_link,
        struct rb_node *rb_parent)
{
        __vma_link_list(mm, vma, prev);
        __vma_link_rb(mm, vma, rb_link, rb_parent);
}

/*
 * Fully link a new @vma into @mm: vma list, rbtree and, for file-backed
 * vmas, the file mapping's interval tree.  The mapping's i_mmap lock is
 * held for writing across the linking when the vma has a file.  Afterwards
 * mm->map_count is incremented and the mm is validated (debug builds only).
 */
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct vm_area_struct *prev, struct rb_node **rb_link,
                        struct rb_node *rb_parent)
{
        struct address_space *mapping = NULL;

        if (vma->vm_file) {
                mapping = vma->vm_file->f_mapping;
                i_mmap_lock_write(mapping);
        }

        __vma_link(mm, vma, prev, rb_link, rb_parent);
        __vma_link_file(vma);

        /* mapping is non-NULL only if the lock was taken above. */
        if (mapping)
                i_mmap_unlock_write(mapping);

        mm->map_count++;
        validate_mm(mm);
}

/*
 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
 * mm's list and rbtree.  It has already been inserted into the interval tree.
 *
 * The range [vma->vm_start, vma->vm_end) must not overlap any existing vma;
 * an overlap reported by find_vma_links() is treated as a kernel bug.
 * mm->map_count is incremented on insertion.
 */
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
        struct vm_area_struct *prev;
        struct rb_node **rb_link, *rb_parent;

        if (find_vma_links(mm, vma->vm_start, vma->vm_end,
                           &prev, &rb_link, &rb_parent))
                BUG();
        __vma_link(mm, vma, prev, rb_link, rb_parent);
        mm->map_count++;
}

/*
 * Remove @vma from @mm's rbtree and vma list and invalidate the vma cache.
 * @ignore names the vma that is exempt from the pre-erase rbtree gap
 * validation (see vma_rb_erase_ignore()).  mm->map_count is not adjusted
 * here.
 */
static __always_inline void __vma_unlink(struct mm_struct *mm,
                                                struct vm_area_struct *vma,
                                                struct vm_area_struct *ignore)
{
        vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
        __vma_unlink_list(mm, vma);
        /* Kill the cache */
        vmacache_invalidate(mm);
}

/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 *
 * @vma:    the vma to adjust
 * @start:  new vm_start, written to the vma being adjusted (which is the
 *          original "next" after the remove_next == 3 swap)
 * @end:    new vm_end; trimmed to next->vm_end when "next" is being removed
 * @pgoff:  new vm_pgoff
 * @insert: vma to link in before the locks are dropped (split_vma), or NULL
 * @expand: compared against "next" to detect the swap case; otherwise only
 *          used by VM_WARN_ON sanity checks
 *
 * Returns 0 on success, or the error returned by anon_vma_clone().
 */
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
        unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
        struct vm_area_struct *expand)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
        struct address_space *mapping = NULL;
        struct rb_root_cached *root = NULL;
        struct anon_vma *anon_vma = NULL;
        struct file *file = vma->vm_file;
        bool start_changed = false, end_changed = false;
        long adjust_next = 0;
        int remove_next = 0;

        if (next && !insert) {
                struct vm_area_struct *exporter = NULL, *importer = NULL;

                if (end >= next->vm_end) {
                        /*
                         * vma expands, overlapping all the next, and
                         * perhaps the one after too (mprotect case 6).
                         * The only other cases that get here are
                         * case 1, case 7 and case 8.
                         */
                        if (next == expand) {
                                /*
                                 * The only case where we don't expand "vma"
                                 * and we expand "next" instead is case 8.
                                 */
                                VM_WARN_ON(end != next->vm_end);
                                /*
                                 * remove_next == 3 means we're
                                 * removing "vma" and that to do so we
                                 * swapped "vma" and "next".
                                 */
                                remove_next = 3;
                                VM_WARN_ON(file != next->vm_file);
                                swap(vma, next);
                        } else {
                                VM_WARN_ON(expand != vma);
                                /*
                                 * case 1, 6, 7, remove_next == 2 is case 6,
                                 * remove_next == 1 is case 1 or 7.
                                 */
                                remove_next = 1 + (end > next->vm_end);
                                VM_WARN_ON(remove_next == 2 &&
                                           end != next->vm_next->vm_end);
                                /* trim end to next, for case 6 first pass */
                                end = next->vm_end;
                        }

                        exporter = next;
                        importer = vma;

                        /*
                         * If next doesn't have anon_vma, import from vma after
                         * next, if the vma overlaps with it.
                         */
                        if (remove_next == 2 && !next->anon_vma)
                                exporter = next->vm_next;

                } else if (end > next->vm_start) {
                        /*
                         * vma expands, overlapping part of the next:
                         * mprotect case 5 shifting the boundary up.
                         */
                        adjust_next = (end - next->vm_start);
                        exporter = next;
                        importer = vma;
                        VM_WARN_ON(expand != importer);
                } else if (end < vma->vm_end) {
                        /*
                         * vma shrinks, and !insert tells it's not
                         * split_vma inserting another: so it must be
                         * mprotect case 4 shifting the boundary down.
                         */
                        adjust_next = -(vma->vm_end - end);
                        exporter = vma;
                        importer = next;
                        VM_WARN_ON(expand != importer);
                }

                /*
                 * Easily overlooked: when mprotect shifts the boundary,
                 * make sure the expanding vma has anon_vma set if the
                 * shrinking vma had, to cover any anon pages imported.
                 */
                if (exporter && exporter->anon_vma && !importer->anon_vma) {
                        int error;

                        importer->anon_vma = exporter->anon_vma;
                        error = anon_vma_clone(importer, exporter);
                        if (error)
                                return error;
                }
        }
        /*
         * Re-entered once from the bottom of the function when
         * remove_next == 2 (case 6), to remove the second following vma.
         */
again:
        vma_adjust_trans_huge(orig_vma, start, end, adjust_next);

        if (file) {
                mapping = file->f_mapping;
                root = &mapping->i_mmap;
                uprobe_munmap(vma, vma->vm_start, vma->vm_end);

                if (adjust_next)
                        uprobe_munmap(next, next->vm_start, next->vm_end);

                i_mmap_lock_write(mapping);
                if (insert) {
                        /*
                         * Put into interval tree now, so instantiated pages
                         * are visible to arm/parisc __flush_dcache_page
                         * throughout; but we cannot insert into address
                         * space until vma start or end is updated.
                         */
                        __vma_link_file(insert);
                }
        }

        /*
         * Choose the anon_vma to lock: vma's own, or failing that next's
         * when next's boundary is being moved as well.
         */
        anon_vma = vma->anon_vma;
        if (!anon_vma && adjust_next)
                anon_vma = next->anon_vma;
        if (anon_vma) {
                VM_WARN_ON(adjust_next && next->anon_vma &&
                           anon_vma != next->anon_vma);
                anon_vma_lock_write(anon_vma);
                anon_vma_interval_tree_pre_update_vma(vma);
                if (adjust_next)
                        anon_vma_interval_tree_pre_update_vma(next);
        }

        /*
         * Take the affected vmas out of the file's interval tree while
         * their start/end/pgoff are rewritten; they are re-inserted below.
         */
        if (file) {
                flush_dcache_mmap_lock(mapping);
                vma_interval_tree_remove(vma, root);
                if (adjust_next)
                        vma_interval_tree_remove(next, root);
        }

        if (start != vma->vm_start) {
                vma->vm_start = start;
                start_changed = true;
        }
        if (end != vma->vm_end) {
                vma->vm_end = end;
                end_changed = true;
        }
        vma->vm_pgoff = pgoff;
        if (adjust_next) {
                /* adjust_next is a signed byte count; negative in case 4. */
                next->vm_start += adjust_next;
                next->vm_pgoff += adjust_next >> PAGE_SHIFT;
        }

        if (file) {
                if (adjust_next)
                        vma_interval_tree_insert(next, root);
                vma_interval_tree_insert(vma, root);
                flush_dcache_mmap_unlock(mapping);
        }

        if (remove_next) {
                /*
                 * vma_merge has merged next into vma, and needs
                 * us to remove next before dropping the locks.
                 */
                if (remove_next != 3)
                        __vma_unlink(mm, next, next);
                else
                        /*
                         * vma is not before next if they've been
                         * swapped.
                         *
                         * pre-swap() next->vm_start was reduced so
                         * tell validate_mm_rb to ignore pre-swap()
                         * "next" (which is stored in post-swap()
                         * "vma").
                         */
                        __vma_unlink(mm, next, vma);
                if (file)
                        __remove_shared_vm_struct(next, file, mapping);
        } else if (insert) {
                /*
                 * split_vma has split insert from vma, and needs
                 * us to insert it before dropping the locks
                 * (it may either follow vma or precede it).
                 */
                __insert_vm_struct(mm, insert);
        } else {
                if (start_changed)
                        vma_gap_update(vma);
                if (end_changed) {
                        if (!next)
                                mm->highest_vm_end = vm_end_gap(vma);
                        else if (!adjust_next)
                                vma_gap_update(next);
                }
        }

        if (anon_vma) {
                anon_vma_interval_tree_post_update_vma(vma);
                if (adjust_next)
                        anon_vma_interval_tree_post_update_vma(next);
                anon_vma_unlock_write(anon_vma);
        }

        if (file) {
                i_mmap_unlock_write(mapping);
                uprobe_mmap(vma);

                if (adjust_next)
                        uprobe_mmap(next);
        }

        if (remove_next) {
                /* Release what the removed vma held, then free it. */
                if (file) {
                        uprobe_munmap(next, next->vm_start, next->vm_end);
                        fput(file);
                }
                if (next->anon_vma)
                        anon_vma_merge(vma, next);
                mm->map_count--;
                mpol_put(vma_policy(next));
                vm_area_free(next);
                /*
                 * In mprotect's case 6 (see comments on vma_merge),
                 * we must remove another next too. It would clutter
                 * up the code too much to do both in one go.
                 */
                if (remove_next != 3) {
                        /*
                         * If "next" was removed and vma->vm_end was
                         * expanded (up) over it, in turn
                         * "next->vm_prev->vm_end" changed and the
                         * "vma->vm_next" gap must be updated.
                         */
                        next = vma->vm_next;
                } else {
                        /*
                         * For the scope of the comment "next" and
                         * "vma" considered pre-swap(): if "vma" was
                         * removed, next->vm_start was expanded (down)
                         * over it and the "next" gap must be updated.
                         * Because of the swap() the post-swap() "vma"
                         * actually points to pre-swap() "next"
                         * (post-swap() "next" as opposed is now a
                         * dangling pointer).
                         */
                        next = vma;
                }
                if (remove_next == 2) {
                        /* Second pass: extend over, and remove, the new next. */
                        remove_next = 1;
                        end = next->vm_end;
                        goto again;
                }
                else if (next)
                        vma_gap_update(next);
                else {
                        /*
                         * If remove_next == 2 we obviously can't
                         * reach this path.
                         *
                         * If remove_next == 3 we can't reach this
                         * path because pre-swap() next is always not
                         * NULL. pre-swap() "next" is not being
                         * removed and its next->vm_end is not altered
                         * (and furthermore "end" already matches
                         * next->vm_end in remove_next == 3).
                         *
                         * We reach this only in the remove_next == 1
                         * case if the "next" vma that was removed was
                         * the highest vma of the mm. However in such
                         * case next->vm_end == "end" and the extended
                         * "vma" has vma->vm_end == next->vm_end so
                         * mm->highest_vm_end doesn't need any update
                         * in remove_next == 1 case.
                         */
                        VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
                }
        }
        if (insert && file)
                uprobe_mmap(insert);

        validate_mm(mm);

        return 0;
}

/*
 * Decide whether @vma may be merged with a mapping described by
 * (@file, @vm_flags, @vm_userfaultfd_ctx).  Returns 1 if so, 0 otherwise.
 *
 * A vma with a ->close operation is never merged: the driver probably has
 * per-vma resources to release.
 */
static inline int is_mergeable_vma(struct vm_area_struct *vma,
                                struct file *file, unsigned long vm_flags,
                                struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
        /*
         * VM_SOFTDIRTY is masked out of the comparison so that it never
         * blocks a merge; the caller is expected to mark the merged vma
         * dirty.  Comparing it would only force extra vmas to be created
         * where an existing one could have been extended.
         */
        const unsigned long flag_diff =
                (vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY;

        return !flag_diff &&
               vma->vm_file == file &&
               !(vma->vm_ops && vma->vm_ops->close) &&
               is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx);
}

/*
 * Returns 1 when the two anon_vmas do not prevent a merge, 0 otherwise.
 * @vma may be NULL, in which case the anon_vma_chain test is skipped.
 */
static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
                                        struct anon_vma *anon_vma2,
                                        struct vm_area_struct *vma)
{
        /* Both assigned: they have to be the very same anon_vma. */
        if (anon_vma1 && anon_vma2)
                return anon_vma1 == anon_vma2;

        /*
         * At least one side is unassigned.  The list_is_singular() test
         * avoids merging a VMA cloned from parents, which helps anon_vma
         * lock scalability.
         */
        if (!vma || list_is_singular(&vma->anon_vma_chain))
                return 1;

        return anon_vma1 == anon_vma2;
}

/*
 * Can a mapping described by (vm_flags, anon_vma, file, vm_pgoff) be merged
 * in front of @vma, i.e. at a lower virtual address and file offset?
 * Returns 1 if so, 0 otherwise.
 *
 * Two vmas with different non-NULL anon_vmas cannot be merged, and neither
 * can two sharing an anon_vma whose offsets do not line up.
 *
 * Wrap-around of the merged mapping past the end of the pagecache indices
 * (16TB on ia32) is not checked here: do_mmap() refuses mmaps which wrap,
 * and mmaps covering the final page at index -1UL.
 */
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
                     struct anon_vma *anon_vma, struct file *file,
                     pgoff_t vm_pgoff,
                     struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
        if (!is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx))
                return 0;
        if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma))
                return 0;

        /* @vm_pgoff is the offset at which @vma itself must start. */
        return vma->vm_pgoff == vm_pgoff;
}

/*
 * Can a mapping described by (vm_flags, anon_vma, file, vm_pgoff) be merged
 * beyond @vma, i.e. at a higher virtual address and file offset?
 * Returns 1 if so, 0 otherwise.
 *
 * Two vmas with different non-NULL anon_vmas cannot be merged, and neither
 * can two sharing an anon_vma whose offsets do not line up.
 */
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
                    struct anon_vma *anon_vma, struct file *file,
                    pgoff_t vm_pgoff,
                    struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
        pgoff_t end_pgoff;

        if (!is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx))
                return 0;
        if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma))
                return 0;

        /* The new range must begin at the page offset where @vma ends. */
        end_pgoff = vma->vm_pgoff + vma_pages(vma);
        return end_pgoff == vm_pgoff;
}

/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
 * whether that can be merged with its predecessor or its successor.
 * Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where AAAA is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
 *
 *     AAAA             AAAA                   AAAA
 *    PPPPPPNNNNNN    PPPPPPNNNNNN       PPPPPPNNNNNN
 *    cannot merge    might become       might become
 *                    PPNNNNNNNNNN       PPPPPPPPPPNN
 *    mmap, brk or    case 4 below       case 5 below
 *    mremap move:
 *                        AAAA               AAAA
 *                    PPPP    NNNN       PPPPNNNNXXXX
 *                    might become       might become
 *                    PPPPPPPPPPPP 1 or  PPPPPPPPPPPP 6 or
 *                    PPPPPPPPNNNN 2 or  PPPPPPPPXXXX 7 or
 *                    PPPPNNNNNNNN 3     PPPPXXXXXXXX 8
 *
 * It is important for case 8 that the vma NNNN overlapping the
 * region AAAA is never going to be extended over XXXX. Instead XXXX must
 * be extended in region AAAA and NNNN must be removed. This way in
 * all cases where vma_merge succeeds, the moment vma_adjust drops the
 * rmap_locks, the properties of the merged vma will be already
 * correct for the whole merged range. Some of those properties like
 * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
 * be correct for the whole merged range immediately after the
 * rmap_locks are released. Otherwise if XXXX would be removed and
 * NNNN would be extended over the XXXX range, remove_migration_ptes
 * or other rmap walkers (if working on addresses beyond the "end"
 * parameter) may establish ptes with the wrong permissions of NNNN
 * instead of the right permissions of XXXX.
 *
 * Returns the vma that now covers the merged range, or NULL if no merge
 * was possible (including VM_SPECIAL flags) or __vma_adjust() failed.
 */
struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        struct vm_area_struct *prev, unsigned long addr,
                        unsigned long end, unsigned long vm_flags,
                        struct anon_vma *anon_vma, struct file *file,
                        pgoff_t pgoff, struct mempolicy *policy,
                        struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
        /* Length of [addr, end) in pages, used to derive next's pgoff. */
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        struct vm_area_struct *area, *next;
        int err;

        /*
         * We later require that vma->vm_flags == vm_flags,
         * so this tests vma->vm_flags & VM_SPECIAL, too.
         */
        if (vm_flags & VM_SPECIAL)
                return NULL;

        /*
         * "area" is the vma following prev; "next" is the merge candidate
         * after the request, which is one vma further on when "area" ends
         * exactly at "end".
         */
        next = vma_next(mm, prev);
        area = next;
        if (area && area->vm_end == end)                /* cases 6, 7, 8 */
                next = next->vm_next;

        /* verify some invariant that must be enforced by the caller */
        VM_WARN_ON(prev && addr <= prev->vm_start);
        VM_WARN_ON(area && end > area->vm_end);
        VM_WARN_ON(addr >= end);

        /*
         * Can it merge with the predecessor?
         */
        if (prev && prev->vm_end == addr &&
                        mpol_equal(vma_policy(prev), policy) &&
                        can_vma_merge_after(prev, vm_flags,
                                            anon_vma, file, pgoff,
                                            vm_userfaultfd_ctx)) {
                /*
                 * OK, it can.  Can we now merge in the successor as well?
                 */
                if (next && end == next->vm_start &&
                                mpol_equal(policy, vma_policy(next)) &&
                                can_vma_merge_before(next, vm_flags,
                                                     anon_vma, file,
                                                     pgoff+pglen,
                                                     vm_userfaultfd_ctx) &&
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
                        err = __vma_adjust(prev, prev->vm_start,
                                         next->vm_end, prev->vm_pgoff, NULL,
                                         prev);
                } else                                        /* cases 2, 5, 7 */
                        err = __vma_adjust(prev, prev->vm_start,
                                         end, prev->vm_pgoff, NULL, prev);
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(prev, vm_flags);
                return prev;
        }

        /*
         * Can this new request be merged in front of next?
         */
        if (next && end == next->vm_start &&
                        mpol_equal(policy, vma_policy(next)) &&
                        can_vma_merge_before(next, vm_flags,
                                             anon_vma, file, pgoff+pglen,
                                             vm_userfaultfd_ctx)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
                        err = __vma_adjust(prev, prev->vm_start,
                                         addr, prev->vm_pgoff, NULL, next);
                else {                                        /* cases 3, 8 */
                        err = __vma_adjust(area, addr, next->vm_end,
                                         next->vm_pgoff - pglen, NULL, next);
                        /*
                         * In case 3 area is already equal to next and
                         * this is a noop, but in case 8 "area" has
                         * been removed and next was expanded over it.
                         */
                        area = next;
                }
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(area, vm_flags);
                return area;
        }

        return NULL;
}

/*
 * Cheap pre-check: is it even worth considering sharing an anon_vma between
 * @a and @b (with @a immediately below @b)?
 *
 * The two must be adjacent, use equal memory policies and the same vm_file,
 * differ in flags only by bits mprotect may change (plus VM_SOFTDIRTY), and
 * have contiguous page offsets.
 *
 * NOTE! Sharing an anon_vma does not imply the two vmas can be merged.  A
 * vma with a vm_ops->close() function is never merged, since that suggests
 * the driver does some kind of reference counting, but that is irrelevant
 * for anon_vma sharing.
 */
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
        if (a->vm_end != b->vm_start)
                return 0;
        if (!mpol_equal(vma_policy(a), vma_policy(b)))
                return 0;
        if (a->vm_file != b->vm_file)
                return 0;
        if ((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY))
                return 0;

        /* b must continue a's file offset without a gap. */
        return b->vm_pgoff ==
                a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}

/*
 * Sanity-check whether the anon_vma of 'old' can be reused.  'a' and 'b'
 * are given in VM order: one of them is 'old', the other is the new vma
 * that wants to share its anon_vma.  Returns the anon_vma, or NULL.
 *
 * NOTE! This runs with mm_sem held only for reading, so the anon_vma of
 * 'old' may be in the middle of being set up by another page fault that is
 * trying to merge _that_.  This is fine: an anon_vma still being set up is
 * necessarily a singleton and so acceptable for merging, which lets all of
 * this be done optimistically.  The READ_ONCE() guarantees the pointer is
 * never re-loaded.
 *
 * IOW: the list_is_singular() test on the anon_vma_chain only matters for
 * the 'stable anon_vma' case, where the aim is to avoid handing back an
 * anon_vma that has become "complex" by going through a fork.
 *
 * The compatibility check (adjacency, same memory policies) relies only on
 * state that is stable under a read lock on the mm_sem.
 */
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
                                          struct vm_area_struct *a,
                                          struct vm_area_struct *b)
{
        struct anon_vma *candidate;

        if (!anon_vma_compatible(a, b))
                return NULL;

        candidate = READ_ONCE(old->anon_vma);
        if (!candidate || !list_is_singular(&old->anon_vma_chain))
                return NULL;

        return candidate;
}

/*
 * find_mergeable_anon_vma is called by anon_vma_prepare to look for a
 * suitable anon_vma on the neighbouring vmas before a new one is
 * allocated.  Without this check, a repetitive sequence of mprotects and
 * faults could allocate distinct anon_vmas and so prevent vma merging in
 * a later mprotect.
 *
 * Returns a reusable anon_vma, or NULL when neither neighbour offers one.
 *
 * Only the touching neighbours are examined.  Searching further afield for
 * "compatible" anon_vmas would be possible, but would probably waste time
 * or hang too many vmas off one anon_vma.  The aim is to allow mprotect
 * remerging later on, not to minimize the memory used for anon_vmas.
 */
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
        struct vm_area_struct *next = vma->vm_next;
        struct vm_area_struct *prev;
        struct anon_vma *found;

        /* Try next first. */
        if (next) {
                found = reusable_anon_vma(next, vma, next);
                if (found)
                        return found;
        }

        /* Then try prev. */
        prev = vma->vm_prev;
        if (!prev)
                return NULL;

        return reusable_anon_vma(prev, prev, vma);
}

/*
 * Page-align a hint address downwards.  A non-zero aligned hint that lies
 * below mmap_min_addr is replaced by mmap_min_addr rounded up to a page
 * boundary; a zero hint is returned as zero.
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
        hint &= PAGE_MASK;

        if (hint && hint < mmap_min_addr)
                return PAGE_ALIGN(mmap_min_addr);

        return hint;
}

/*
 * For a mapping that will carry VM_LOCKED, check that adding @len bytes to
 * mm->locked_vm stays within RLIMIT_MEMLOCK.  Returns 0 when allowed (or
 * when VM_LOCKED is not set in @flags), -EAGAIN when the limit would be
 * exceeded and the caller lacks CAP_IPC_LOCK.
 */
static inline int mlock_future_check(struct mm_struct *mm,
                                     unsigned long flags,
                                     unsigned long len)
{
        unsigned long locked, lock_limit;

        /*  mlock MCL_FUTURE? */
        if (!(flags & VM_LOCKED))
                return 0;

        /* Both sides of the comparison are in pages. */
        locked = (len >> PAGE_SHIFT) + mm->locked_vm;
        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                return -EAGAIN;

        return 0;
}

/*
 * Upper bound used by file_mmap_ok() for a mapping of @file.  A return of 0
 * marks the FMODE_UNSIGNED_OFFSET special case.
 */
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
        /* Regular files, block devices and sockets share the LFS limit. */
        if (S_ISREG(inode->i_mode) ||
            S_ISBLK(inode->i_mode) ||
            S_ISSOCK(inode->i_mode))
                return MAX_LFS_FILESIZE;

        /* Special "we do even unsigned file positions" case */
        if (file->f_mode & FMODE_UNSIGNED_OFFSET)
                return 0;

        /* Yes, random drivers might want more. But I'm tired of buggy drivers */
        return ULONG_MAX;
}

/*
 * Check that a mapping of @len bytes at page offset @pgoff fits within the
 * limit reported by file_mmap_size_max().  Returns true if it does.
 */
static inline bool file_mmap_ok(struct file *file, struct inode *inode,
                                unsigned long pgoff, unsigned long len)
{
        u64 limit = file_mmap_size_max(file, inode);

        /* A zero limit skips the length check. */
        if (limit && len > limit)
                return false;

        limit -= len;
        return pgoff <= limit >> PAGE_SHIFT;
}

/*
 * Validate an mmap request, translate prot/flags into vm_flags, pick the
 * address via get_unmapped_area() and hand off to mmap_region().
 *
 * @file:     file to map, or NULL for an anonymous mapping
 * @addr:     requested address (a hint unless MAP_FIXED is set)
 * @len:      length in bytes; rounded up to a page multiple here
 * @prot:     PROT_* protection bits
 * @flags:    MAP_* flags
 * @pgoff:    offset into @file in pages; overwritten for anonymous mappings
 * @populate: out parameter; zeroed on entry and set to the page-aligned
 *            length when mmap_region() succeeded and the mapping is
 *            VM_LOCKED or MAP_POPULATE was requested without MAP_NONBLOCK
 * @uf:       passed through to mmap_region()
 *
 * Returns the mapped address, or a negative errno value cast to
 * unsigned long (test with IS_ERR_VALUE()).
 *
 * The caller must write-lock current->mm->mmap_lock.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, unsigned long pgoff,
                        unsigned long *populate, struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        vm_flags_t vm_flags;
        int pkey = 0;

        *populate = 0;

        if (!len)
                return -EINVAL;

        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
         * (the exception is when the underlying filesystem is noexec
         *  mounted, in which case we dont add PROT_EXEC.)
         */
        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
                if (!(file && path_noexec(&file->f_path)))
                        prot |= PROT_EXEC;

        /* force arch specific MAP_FIXED handling in get_unmapped_area */
        if (flags & MAP_FIXED_NOREPLACE)
                flags |= MAP_FIXED;

        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);

        /* Careful about overflows.. */
        len = PAGE_ALIGN(len);
        if (!len)
                return -ENOMEM;

        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
                return -EOVERFLOW;

        /* Too many mappings? */
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
        addr = get_unmapped_area(file, addr, len, pgoff, flags);
        if (IS_ERR_VALUE(addr))
                return addr;

        /* MAP_FIXED_NOREPLACE: refuse if an existing vma overlaps the range. */
        if (flags & MAP_FIXED_NOREPLACE) {
                struct vm_area_struct *vma = find_vma(mm, addr);

                if (vma && vma->vm_start < addr + len)
                        return -EEXIST;
        }

        /* Exactly PROT_EXEC: ask for an execute-only pkey, fall back to 0. */
        if (prot == PROT_EXEC) {
                pkey = execute_only_pkey(mm);
                if (pkey < 0)
                        pkey = 0;
        }

        /* Do simple checking here so the lower-level routines won't have
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
        vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

        if (flags & MAP_LOCKED)
                if (!can_do_mlock())
                        return -EPERM;

        if (mlock_future_check(mm, vm_flags, len))
                return -EAGAIN;

        if (file) {
                struct inode *inode = file_inode(file);
                unsigned long flags_mask;

                if (!file_mmap_ok(file, inode, pgoff, len))
                        return -EOVERFLOW;

                /* Flags accepted for MAP_SHARED_VALIDATE on this file. */
                flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;

                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        /*
                         * Force use of MAP_SHARED_VALIDATE with non-legacy
                         * flags. E.g. MAP_SYNC is dangerous to use with
                         * MAP_SHARED as you don't know which consistency model
                         * you will get. We silently ignore unsupported flags
                         * with MAP_SHARED to preserve backward compatibility.
                         */
                        flags &= LEGACY_MAP_MASK;
                        fallthrough;
                case MAP_SHARED_VALIDATE:
                        if (flags & ~flags_mask)
                                return -EOPNOTSUPP;
                        if (prot & PROT_WRITE) {
                                if (!(file->f_mode & FMODE_WRITE))
                                        return -EACCES;
                                if (IS_SWAPFILE(file->f_mapping->host))
                                        return -ETXTBSY;
                        }

                        /*
                         * Make sure we don't allow writing to an append-only
                         * file..
                         */
                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                                return -EACCES;

                        /*
                         * Make sure there are no mandatory locks on the file.
                         */
                        if (locks_verify_locked(file))
                                return -EAGAIN;

                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        /* File not opened for writing: drop write sharing. */
                        if (!(file->f_mode & FMODE_WRITE))
                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
                        fallthrough;
                case MAP_PRIVATE:
                        /* These checks apply to shared mappings as well. */
                        if (!(file->f_mode & FMODE_READ))
                                return -EACCES;
                        if (path_noexec(&file->f_path)) {
                                if (vm_flags & VM_EXEC)
                                        return -EPERM;
                                vm_flags &= ~VM_MAYEXEC;
                        }

                        if (!file->f_op->mmap)
                                return -ENODEV;
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        break;

                default:
                        return -EINVAL;
                }
        } else {
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        /*
                         * Ignore pgoff.
                         */
                        pgoff = 0;
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        break;
                case MAP_PRIVATE:
                        /*
                         * Set pgoff according to addr for anon_vma.
                         */
                        pgoff = addr >> PAGE_SHIFT;
                        break;
                default:
                        return -EINVAL;
                }
        }

        /*
         * Set 'VM_NORESERVE' if we should not account for the
         * memory use of this mapping.
         */
        if (flags & MAP_NORESERVE) {
                /* We honor MAP_NORESERVE if allowed to overcommit */
                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        vm_flags |= VM_NORESERVE;

                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
                if (file && is_file_hugepages(file))
                        vm_flags |= VM_NORESERVE;
        }

        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
        /* Report the length to populate only if the mapping succeeded. */
        if (!IS_ERR_VALUE(addr) &&
            ((vm_flags & VM_LOCKED) ||
             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
                *populate = len;
        return addr;
}

/*
 * ksys_mmap_pgoff() - pick the backing file for an mmap request and map it.
 * @addr:  requested address, passed through to vm_mmap_pgoff()
 * @len:   requested length; rounded up to the huge page size for hugetlb
 * @prot:  protection bits, passed through to vm_mmap_pgoff()
 * @flags: MAP_* flags; MAP_EXECUTABLE and MAP_DENYWRITE are stripped
 * @fd:    file descriptor, only looked up when MAP_ANONYMOUS is clear
 * @pgoff: page offset, passed through to vm_mmap_pgoff()
 *
 * Return: whatever vm_mmap_pgoff() returns, or -EBADF / -EINVAL / the
 * hugetlb_file_setup() error when no usable backing file can be obtained.
 */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                              unsigned long prot, unsigned long flags,
                              unsigned long fd, unsigned long pgoff)
{
        struct file *backing = NULL;
        unsigned long ret;

        if (flags & MAP_ANONYMOUS) {
                if (flags & MAP_HUGETLB) {
                        const unsigned long size_log =
                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
                        struct user_struct *dummy_user = NULL;
                        struct hstate *h = hstate_sizelog(size_log);

                        if (!h)
                                return -EINVAL;

                        len = ALIGN(len, huge_page_size(h));
                        /*
                         * VM_NORESERVE: the reservation is taken later, when
                         * vm_ops->mmap() runs. The user pointer is a dummy;
                         * nothing is locked here so no accounting is needed.
                         */
                        backing = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                                     VM_NORESERVE, &dummy_user,
                                                     HUGETLB_ANONHUGE_INODE,
                                                     size_log);
                        if (IS_ERR(backing))
                                return PTR_ERR(backing);
                }
        } else {
                audit_mmap_fd(fd, flags);
                backing = fget(fd);
                if (!backing)
                        return -EBADF;

                if (is_file_hugepages(backing)) {
                        len = ALIGN(len,
                                    huge_page_size(hstate_file(backing)));
                } else if (unlikely(flags & MAP_HUGETLB)) {
                        /* MAP_HUGETLB on a file that is not hugetlbfs. */
                        fput(backing);
                        return -EINVAL;
                }
        }

        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

        ret = vm_mmap_pgoff(backing, addr, len, prot, flags, pgoff);
        if (backing)
                fput(backing);
        return ret;
}

/* mmap_pgoff syscall entry: forwards every argument to ksys_mmap_pgoff(). */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, pgoff)
{
        unsigned long mapped;

        mapped = ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
        return mapped;
}

#ifdef __ARCH_WANT_SYS_OLD_MMAP
/* Argument block read from user space by the old_mmap syscall. */
struct mmap_arg_struct {
        unsigned long addr;
        unsigned long len;
        unsigned long prot;
        unsigned long flags;
        unsigned long fd;
        unsigned long offset;
};

/*
 * old_mmap syscall entry: the arguments arrive packed in a user-space
 * struct and the offset is given in bytes, so it must be page aligned
 * before being converted to a page offset.
 */
SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
        struct mmap_arg_struct args;
        unsigned long pgoff;

        if (copy_from_user(&args, arg, sizeof(args)))
                return -EFAULT;

        if (offset_in_page(args.offset))
                return -EINVAL;

        pgoff = args.offset >> PAGE_SHIFT;
        return ksys_mmap_pgoff(args.addr, args.len, args.prot, args.flags,
                               args.fd, pgoff);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */

/*
 * Decide whether a shared writable mapping should have its pages mapped
 * read-only so that the first write to each page can be noticed. When the
 * answer is yes, the caller downgrades vm_page_prot to the private version
 * (protection_map[] without the VM_SHARED bit).
 *
 * Returns 1 when write notification is wanted, 0 otherwise.
 */
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
        const vm_flags_t shared_writable = VM_WRITE | VM_SHARED;
        const vm_flags_t flags = vma->vm_flags;
        const struct vm_operations_struct *ops = vma->vm_ops;
        struct file *backing;

        /* Private or non-writable: the write bit is already clear. */
        if ((flags & shared_writable) != shared_writable)
                return 0;

        /* The backer asked to be told about first writes. */
        if (ops) {
                if (ops->page_mkwrite || ops->pfn_mkwrite)
                        return 1;
        }

        /*
         * The open routine changed the protections in a way that
         * pgprot_modify would not preserve.
         */
        if (pgprot_val(vm_page_prot) !=
            pgprot_val(vm_pgprot_modify(vm_page_prot, flags)))
                return 0;

        /* Soft-dirty tracking wants the notification; hugetlb has none yet. */
        if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(flags & VM_SOFTDIRTY) &&
            !is_vm_hugetlb_page(vma))
                return 1;

        /* Specialty (PFN) mappings are never tracked. */
        if (flags & VM_PFNMAP)
                return 0;

        /* Otherwise it depends on whether the mapping can write back. */
        backing = vma->vm_file;
        if (!backing || !backing->f_mapping)
                return 0;

        return mapping_can_writeback(backing->f_mapping) ? 1 : 0;
}

/*
 * A mapping is charged against the memory accounting only when it is
 * private and writable, was not created with VM_NORESERVE, and is not
 * backed by hugetlbfs.
 */
static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
        const vm_flags_t relevant = VM_NORESERVE | VM_SHARED | VM_WRITE;

        /*
         * hugetlb does its own accounting. The file is tested rather than
         * VM_HUGETLB because that flag may not have been set yet.
         */
        if (file != NULL) {
                if (is_file_hugepages(file))
                        return 0;
        }

        /* Of the three relevant bits, only VM_WRITE may be set. */
        return (vm_flags & relevant) == VM_WRITE;
}

/*
 * mmap_region() - install a mapping of @len bytes at @addr in current->mm.
 * @file:     file to map, or NULL when there is no backing file
 * @addr:     start address of the new mapping
 * @len:      length of the new mapping in bytes
 * @vm_flags: VM_* flags for the new mapping
 * @pgoff:    value stored in vma->vm_pgoff (and passed to vma_merge())
 * @uf:       list handed through to munmap_vma_range()
 *
 * Existing mappings in [addr, addr + len) are cleared first. The range is
 * then either absorbed into a neighbouring vma by vma_merge(), or a new vma
 * is allocated and set up through call_mmap() (file given),
 * shmem_zero_setup() (no file, VM_SHARED) or vma_set_anonymous().
 *
 * Return: the start address of the mapping on success, otherwise a negative
 * errno value converted to unsigned long.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
                unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev, *merge;
        int error;
        struct rb_node **rb_link, *rb_parent;
        /* Pages charged by security_vm_enough_memory_mm(); 0 when none. */
        unsigned long charged = 0;

        /* Check against address space limit. */
        if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
                unsigned long nr_pages;

                /*
                 * MAP_FIXED may remove pages of mappings that intersects with
                 * requested mapping. Account for the pages it would unmap.
                 */
                nr_pages = count_vma_pages_range(mm, addr, addr + len);

                if (!may_expand_vm(mm, vm_flags,
                                        (len >> PAGE_SHIFT) - nr_pages))
                        return -ENOMEM;
        }

        /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
        if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
                return -ENOMEM;
        /*
         * Private writable mapping: check memory availability
         */
        if (accountable_mapping(file, vm_flags)) {
                charged = len >> PAGE_SHIFT;
                if (security_vm_enough_memory_mm(mm, charged))
                        return -ENOMEM;
                vm_flags |= VM_ACCOUNT;
        }

        /*
         * Can we just expand an old mapping?
         */
        vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
                        NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
        if (vma)
                goto out;

        /*
         * Determine the object being mapped and call the appropriate
         * specific mapper. the address has already been validated, but
         * not unmapped, but the maps are removed from the list.
         */
        vma = vm_area_alloc(mm);
        if (!vma) {
                error = -ENOMEM;
                goto unacct_error;
        }

        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_flags = vm_flags;
        vma->vm_page_prot = vm_get_page_prot(vm_flags);
        vma->vm_pgoff = pgoff;

        if (file) {
                /*
                 * Both of these are undone again further down: after
                 * vma_link() on success, or on the error paths.
                 */
                if (vm_flags & VM_DENYWRITE) {
                        error = deny_write_access(file);
                        if (error)
                                goto free_vma;
                }
                if (vm_flags & VM_SHARED) {
                        error = mapping_map_writable(file->f_mapping);
                        if (error)
                                goto allow_write_and_free_vma;
                }

                /* ->mmap() can change vma->vm_file, but must guarantee that
                 * vma_link() below can deny write-access if VM_DENYWRITE is set
                 * and map writably if VM_SHARED is set. This usually means the
                 * new file must not have been exposed to user-space, yet.
                 */
                vma->vm_file = get_file(file);
                error = call_mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;

                /* Can addr have changed??
                 *
                 * Answer: Yes, several device drivers can do it in their
                 *         f_op->mmap method. -DaveM
                 * Bug: If addr is changed, prev, rb_link, rb_parent should
                 *      be updated for vma_link()
                 */
                WARN_ON_ONCE(addr != vma->vm_start);

                addr = vma->vm_start;

                /* If vm_flags changed after call_mmap(), we should try merge vma again
                 * as we may succeed this time.
                 */
                if (unlikely(vm_flags != vma->vm_flags && prev)) {
                        merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
                                NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
                        if (merge) {
                                /* ->mmap() can change vma->vm_file and fput the original file. So
                                 * fput the vma->vm_file here or we would add an extra fput for file
                                 * and cause general protection fault ultimately.
                                 */
                                fput(vma->vm_file);
                                vm_area_free(vma);
                                vma = merge;
                                /* Update vm_flags to pick up the change. */
                                vm_flags = vma->vm_flags;
                                goto unmap_writable;
                        }
                }

                /* Pick up any flag changes made by the ->mmap() handler. */
                vm_flags = vma->vm_flags;
        } else if (vm_flags & VM_SHARED) {
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_vma;
        } else {
                vma_set_anonymous(vma);
        }

        /* Allow architectures to sanity-check the vm_flags */
        if (!arch_validate_flags(vma->vm_flags)) {
                error = -EINVAL;
                /* With a file, ->mmap() has already run and must be undone. */
                if (file)
                        goto close_and_free_vma;
                else
                        goto free_vma;
        }

        vma_link(mm, vma, prev, rb_link, rb_parent);
        /* Once vma denies write, undo our temporary denial count */
        if (file) {
unmap_writable:
                if (vm_flags & VM_SHARED)
                        mapping_unmap_writable(file->f_mapping);
                if (vm_flags & VM_DENYWRITE)
                        allow_write_access(file);
        }
        /* From here on use the file the vma actually refers to. */
        file = vma->vm_file;
out:
        /* Also reached directly when the first vma_merge() succeeded. */
        perf_event_mmap(vma);

        vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                /* These kinds of vma drop the locked flag instead of counting. */
                if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
                                        is_vm_hugetlb_page(vma) ||
                                        vma == get_gate_vma(current->mm))
                        vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
                else
                        mm->locked_vm += (len >> PAGE_SHIFT);
        }

        if (file)
                uprobe_mmap(vma);

        /*
         * New (or expanded) vma always get soft dirty status.
         * Otherwise user-space soft-dirty page tracker won't
         * be able to distinguish situation when vma area unmapped,
         * then new mapped in-place (which must be aimed as
         * a completely new data area).
         */
        vma->vm_flags |= VM_SOFTDIRTY;

        vma_set_page_prot(vma);

        return addr;

        /*
         * Error unwinding. Each label undoes one more step than the label
         * below it, so a failure jumps to the label matching how far setup got.
         */
close_and_free_vma:
        /* ->mmap() succeeded: give the vma's ->close() handler a chance. */
        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);
unmap_and_free_vma:
        /* Detach the file from the vma and drop a reference on @file. */
        vma->vm_file = NULL;
        fput(file);

        /* Undo any partial mapping done by a device driver. */
        unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
        if (vm_flags & VM_SHARED)
                mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
        if (vm_flags & VM_DENYWRITE)
                allow_write_access(file);
free_vma:
        vm_area_free(vma);
unacct_error:
        /* Return the pages charged by security_vm_enough_memory_mm(). */
        if (charged)
                vm_unacct_memory(charged);
        return error;
}

/*
 * unmapped_area() - bottom-up search for an unmapped address range.
 * @info: search constraints; length, low_limit, high_limit, align_mask and
 *        align_offset are read
 *
 * Walks current->mm's vma rbtree, using each node's rb_subtree_gap to skip
 * subtrees that cannot contain a large enough gap and visiting left subtrees
 * before the node itself and its right subtree. If no gap in front of a vma
 * qualifies, the space starting at mm->highest_vm_end is tried last.
 *
 * Return: start address of the chosen gap, adjusted for alignment, or
 * -ENOMEM when no suitable gap exists.
 */
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
        /*
         * We implement the search by looking for an rbtree node that
         * immediately follows a suitable gap. That is,
         * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
         * - gap_end   = vma->vm_start        >= info->low_limit  + length;
         * - gap_end - gap_start >= length
         */

        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long length, low_limit, high_limit, gap_start, gap_end;

        /* Adjust search length to account for worst case alignment overhead */
        length = info->length + info->align_mask;
        /* The addition wrapped around. */
        if (length < info->length)
                return -ENOMEM;

        /* Adjust search limits by the desired length */
        if (info->high_limit < length)
                return -ENOMEM;
        high_limit = info->high_limit - length;

        if (info->low_limit > high_limit)
                return -ENOMEM;
        low_limit = info->low_limit + length;

        /* Check if rbtree root looks promising */
        if (RB_EMPTY_ROOT(&mm->mm_rb))
                goto check_highest;
        vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
        if (vma->rb_subtree_gap < length)
                goto check_highest;

        while (true) {
                /* Visit left subtree if it looks promising */
                gap_end = vm_start_gap(vma);
                if (gap_end >= low_limit && vma->vm_rb.rb_left) {
                        struct vm_area_struct *left =
                                rb_entry(vma->vm_rb.rb_left,
                                         struct vm_area_struct, vm_rb);
                        if (left->rb_subtree_gap >= length) {
                                vma = left;
                                continue;
                        }
                }

                /* The gap in front of the first vma starts at address 0. */
                gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
                /* Check if current node has a suitable gap */
                if (gap_start > high_limit)
                        return -ENOMEM;
                if (gap_end >= low_limit &&
                    gap_end > gap_start && gap_end - gap_start >= length)
                        goto found;

                /* Visit right subtree if it looks promising */
                if (vma->vm_rb.rb_right) {
                        struct vm_area_struct *right =
                                rb_entry(vma->vm_rb.rb_right,
                                         struct vm_area_struct, vm_rb);
                        if (right->rb_subtree_gap >= length) {
                                vma = right;
                                continue;
                        }
                }

                /* Go back up the rbtree to find next candidate node */
                while (true) {
                        struct rb_node *prev = &vma->vm_rb;
                        if (!rb_parent(prev))
                                goto check_highest;
                        vma = rb_entry(rb_parent(prev),
                                       struct vm_area_struct, vm_rb);
                        /*
                         * Coming up from a left child: this parent has not
                         * been checked yet, so check its own gap next.
                         */
                        if (prev == vma->vm_rb.rb_left) {
                                gap_start = vm_end_gap(vma->vm_prev);
                                gap_end = vm_start_gap(vma);
                                goto check_current;
                        }
                }
        }

check_highest:
        /* Check highest gap, which does not precede any rbtree node */
        gap_start = mm->highest_vm_end;
        gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */
        if (gap_start > high_limit)
                return -ENOMEM;

found:
        /* We found a suitable gap. Clip it with the original low_limit. */
        if (gap_start < info->low_limit)
                gap_start = info->low_limit;

        /* Adjust gap address to the desired alignment */
        gap_start += (info->align_offset - gap_start) & info->align_mask;

        VM_BUG_ON(gap_start + info->length > info->high_limit);
        VM_BUG_ON(gap_start + info->length > gap_end);
        return gap_start;
}

/*
 * unmapped_area_topdown() - top-down search for an unmapped address range.
 * @info: search constraints; length, low_limit, high_limit, align_mask and
 *        align_offset are read
 *
 * First tries the space starting at mm->highest_vm_end. If that does not
 * qualify, walks current->mm's vma rbtree, using rb_subtree_gap to skip
 * hopeless subtrees and visiting right subtrees before the node itself and
 * its left subtree.
 *
 * Return: start address of the mapping placed at the top of the chosen gap,
 * adjusted for alignment, or -ENOMEM when no suitable gap exists.
 */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long length, low_limit, high_limit, gap_start, gap_end;

        /* Adjust search length to account for worst case alignment overhead */
        length = info->length + info->align_mask;
        /* The addition wrapped around. */
        if (length < info->length)
                return -ENOMEM;

        /*
         * Adjust search limits by the desired length.
         * See implementation comment at top of unmapped_area().
         */
        gap_end = info->high_limit;
        if (gap_end < length)
                return -ENOMEM;
        high_limit = gap_end - length;

        if (info->low_limit > high_limit)
                return -ENOMEM;
        low_limit = info->low_limit + length;

        /* Check highest gap, which does not precede any rbtree node */
        gap_start = mm->highest_vm_end;
        if (gap_start <= high_limit)
                goto found_highest;

        /* Check if rbtree root looks promising */
        if (RB_EMPTY_ROOT(&mm->mm_rb))
                return -ENOMEM;
        vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
        if (vma->rb_subtree_gap < length)
                return -ENOMEM;

        while (true) {
                /* Visit right subtree if it looks promising */
                gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
                if (gap_start <= high_limit && vma->vm_rb.rb_right) {
                        struct vm_area_struct *right =
                                rb_entry(vma->vm_rb.rb_right,
                                         struct vm_area_struct, vm_rb);
                        if (right->rb_subtree_gap >= length) {
                                vma = right;
                                continue;
                        }
                }

check_current:
                /* Check if current node has a suitable gap */
                gap_end = vm_start_gap(vma);
                if (gap_end < low_limit)
                        return -ENOMEM;
                if (gap_start <= high_limit &&
                    gap_end > gap_start && gap_end - gap_start >= length)
                        goto found;

                /* Visit left subtree if it looks promising */
                if (vma->vm_rb.rb_left) {
                        struct vm_area_struct *left =
                                rb_entry(vma->vm_rb.rb_left,
                                         struct vm_area_struct, vm_rb);
                        if (left->rb_subtree_gap >= length) {
                                vma = left;
                                continue;
                        }
                }

                /* Go back up the rbtree to find next candidate node */
                while (true) {
                        struct rb_node *prev = &vma->vm_rb;
                        if (!rb_parent(prev))
                                return -ENOMEM;
                        vma = rb_entry(rb_parent(prev),
                                       struct vm_area_struct, vm_rb);
                        /*
                         * Coming up from a right child: this parent has not
                         * been checked yet, so check its own gap next.
                         */
                        if (prev == vma->vm_rb.rb_right) {
                                gap_start = vma->vm_prev ?
                                        vm_end_gap(vma->vm_prev) : 0;
                                goto check_current;
                        }
                }
        }

found:
        /* We found a suitable gap. Clip it with the original high_limit. */
        if (gap_end > info->high_limit)
                gap_end = info->high_limit;

found_highest:
        /* Compute highest gap address at the desired alignment */
        gap_end -= info->length;
        gap_end -= (gap_end - info->align_offset) & info->align_mask;

        VM_BUG_ON(gap_end < info->low_limit);
        VM_BUG_ON(gap_end < gap_start);
        return gap_end;
}

/*
 * Search for an unmapped address range.
 *
 * We are looking for a range that:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size.
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
 */
unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
        unsigned long addr;

        if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
                addr = unmapped_area_topdown(info);
        else
                addr = unmapped_area(info);

        trace_vm_unmapped_area(addr, info);
        return addr;
}

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *        if (ret & ~PAGE_MASK)
 *                error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 */
#ifndef HAVE_ARCH_UNMAPPED_AREA
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct vm_unmapped_area_info info;
        const unsigned long mmap_end = arch_get_mmap_end(addr);

        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma_prev(mm, addr, &prev);
                if (mmap_end - len >= addr && addr >= mmap_min_addr &&
                    (!vma || addr + len <= vm_start_gap(vma)) &&
                    (!prev || addr >= vm_end_gap(prev)))
                        return addr;
        }

        info.flags = 0;
        info.length = len;
        info.low_limit = mm->mmap_base;
        info.high_limit = mmap_end;
        info.align_mask = 0;
        info.align_offset = 0;
        return vm_unmapped_area(&info);
}
#endif

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 */
#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags)
{
        struct vm_area_struct *vma, *prev;
        struct mm_struct *mm = current->mm;
        struct vm_unmapped_area_info info;
        const unsigned long mmap_end = arch_get_mmap_end(addr);

        /* requested length too big for entire address space */
        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        /* requesting a specific address */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma_prev(mm, addr, &prev);
                if (mmap_end - len >= addr && addr >= mmap_min_addr &&
                                (!vma || addr + len <= vm_start_gap(vma)) &&
                                (!prev || addr >= vm_end_gap(prev)))
                        return addr;
        }

        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        info.low_limit = max(PAGE_SIZE, mmap_min_addr);
        info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
        info.align_mask = 0;
        info.align_offset = 0;
        addr = vm_unmapped_area(&info);

        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        if (offset_in_page(addr)) {
                VM_BUG_ON(addr != -ENOMEM);
                info.flags = 0;
                info.low_limit = TASK_UNMAPPED_BASE;
                info.high_limit = mmap_end;
                addr = vm_unmapped_area(&info);
        }

        return addr;
}
#endif

/*
 * get_unmapped_area() - choose an address for a new mapping.
 *
 * Uses the file's own ->get_unmapped_area() when it has one, shmem's for
 * shared mappings without a file, and the mm's default otherwise. The
 * chosen address is then range-checked and passed to security_mmap_addr().
 *
 * Returns the address on success, or an error value.
 */
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        unsigned long (*finder)(struct file *, unsigned long,
                                unsigned long, unsigned long, unsigned long);
        unsigned long rc;

        rc = arch_mmap_check(addr, len, flags);
        if (rc)
                return rc;

        /* Guard the TASK_SIZE - len subtraction below against wrapping. */
        if (len > TASK_SIZE)
                return -ENOMEM;

        finder = current->mm->get_unmapped_area;
        if (!file) {
                if (flags & MAP_SHARED) {
                        /*
                         * mmap_region() will create a file through
                         * shmem_zero_setup(), so let shmem pick the address
                         * in case the mapping can be huge. do_mmap() clears
                         * pgoff, so do the same here to match alignment.
                         */
                        pgoff = 0;
                        finder = shmem_get_unmapped_area;
                }
        } else if (file->f_op->get_unmapped_area) {
                finder = file->f_op->get_unmapped_area;
        }

        addr = finder(file, addr, len, pgoff, flags);
        if (IS_ERR_VALUE(addr))
                return addr;

        if (addr > TASK_SIZE - len)
                return -ENOMEM;
        if (offset_in_page(addr))
                return -EINVAL;

        rc = security_mmap_addr(addr);
        if (rc)
                return rc;
        return addr;
}

EXPORT_SYMBOL(get_unmapped_area);

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
        struct rb_node *rb_node;
        struct vm_area_struct *vma;

        /* Check the cache first. */
        vma = vmacache_find(mm, addr);
        if (likely(vma))
                return vma;

        rb_node = mm->mm_rb.rb_node;

        while (rb_node) {
                struct vm_area_struct *tmp;

                tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);

                if (tmp->vm_end > addr) {
                        vma = tmp;
                        if (tmp->vm_start <= addr)
                                break;
                        rb_node = rb_node->rb_left;
                } else
                        rb_node = rb_node->rb_right;
        }

        if (vma)
                vmacache_update(addr, vma);
        return vma;
}

EXPORT_SYMBOL(find_vma);

/*
 * Like find_vma(), but additionally stores the preceding VMA in *pprev.
 * When no VMA is found, *pprev is the last VMA in the rbtree, or NULL if
 * the tree is empty.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
                        struct vm_area_struct **pprev)
{
        struct vm_area_struct *hit = find_vma(mm, addr);
        struct rb_node *last;

        if (hit) {
                *pprev = hit->vm_prev;
                return hit;
        }

        last = rb_last(&mm->mm_rb);
        if (last)
                *pprev = rb_entry(last, struct vm_area_struct, vm_rb);
        else
                *pprev = NULL;

        return NULL;
}

/*
 * Check that growing a stack vma to @size bytes, i.e. by @grow pages, is
 * permitted, and charge the growth. Used by both the grow-up and the
 * grow-down paths.
 *
 * Returns 0 if the growth is allowed, -ENOMEM or -EFAULT otherwise.
 */
static int acct_stack_growth(struct vm_area_struct *vma,
                             unsigned long size, unsigned long grow)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long new_start;

        /* Address space limit. */
        if (!may_expand_vm(mm, vma->vm_flags, grow))
                return -ENOMEM;

        /* Stack size limit. */
        if (size > rlimit(RLIMIT_STACK))
                return -ENOMEM;

        /* mlock limit, only relevant for locked vmas. */
        if (vma->vm_flags & VM_LOCKED) {
                unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
                unsigned long locked = mm->locked_vm + grow;

                if (locked > limit && !capable(CAP_IPC_LOCK))
                        return -ENOMEM;
        }

        /* The grown stack must not reach into a hugetlb-only region. */
        if (vma->vm_flags & VM_GROWSUP)
                new_start = vma->vm_start;
        else
                new_start = vma->vm_end - size;
        if (is_hugepage_only_range(vma->vm_mm, new_start, size))
                return -EFAULT;

        /*
         * Overcommit check. Keep this last: it updates the security
         * statistics as a side effect.
         */
        return security_vm_enough_memory_mm(mm, grow) ? -ENOMEM : 0;
}

#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
/*
 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
 * vma is the last one with address > vma->vm_end.  Have to extend vma.
 *
 * @vma:     the VM_GROWSUP vma to extend
 * @address: address that must end up inside the vma; vm_end is moved to the
 *           end of the page containing it
 *
 * Return: 0 on success (including when the vma already covers @address),
 * -EFAULT if @vma is not VM_GROWSUP, -ENOMEM if the address or guard gap
 * checks fail or the anon_vma cannot be prepared, otherwise the error from
 * acct_stack_growth().
 */
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next;
        unsigned long gap_addr;
        int error = 0;

        if (!(vma->vm_flags & VM_GROWSUP))
                return -EFAULT;

        /* Guard against exceeding limits of the address space. */
        address &= PAGE_MASK;
        if (address >= (TASK_SIZE & PAGE_MASK))
                return -ENOMEM;
        /* From here on, address is the new (exclusive) end of the vma. */
        address += PAGE_SIZE;

        /* Enforce stack_guard_gap */
        gap_addr = address + stack_guard_gap;

        /* Guard against overflow */
        if (gap_addr < address || gap_addr > TASK_SIZE)
                gap_addr = TASK_SIZE;

        /* An accessible, non-growsup vma inside the guard gap blocks growth. */
        next = vma->vm_next;
        if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
                if (!(next->vm_flags & VM_GROWSUP))
                        return -ENOMEM;
                /* Check that both stack segments have the same anon_vma? */
        }

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma)))
                return -ENOMEM;

        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_lock in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
         */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address > vma->vm_end) {
                unsigned long size, grow;

                /* size: new total size in bytes; grow: added pages. */
                size = address - vma->vm_start;
                grow = (address - vma->vm_end) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* Reject growth that would make vm_pgoff + pages wrap. */
                if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                /*
                                 * vma_gap_update() doesn't support concurrent
                                 * updates, but we only hold a shared mmap_lock
                                 * lock here, so we need to protect against
                                 * concurrent vma expansions.
                                 * anon_vma_lock_write() doesn't help here, as
                                 * we don't guarantee that all growable vmas
                                 * in a mm share the same root anon vma.
                                 * So, we reuse mm->page_table_lock to guard
                                 * against concurrent vma expansions.
                                 */
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
                                anon_vma_interval_tree_post_update_vma(vma);
                                /*
                                 * Moving vm_end changes the gap in front of
                                 * the next vma, or the highest end if this is
                                 * the last vma.
                                 */
                                if (vma->vm_next)
                                        vma_gap_update(vma->vm_next);
                                else
                                        mm->highest_vm_end = vm_end_gap(vma);
                                spin_unlock(&mm->page_table_lock);

                                perf_event_mmap(vma);
                        }
                }
        }
        anon_vma_unlock_write(vma->anon_vma);
        khugepaged_enter_vma_merge(vma, vma->vm_flags);
        validate_mm(mm);
        return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */

/*
 * vma is the first one with address < vma->vm_start.  Have to extend vma.
 *
 * @address is rounded down to a page boundary before anything else.
 *
 * Returns 0 on success (this includes the case where, once the anon_vma
 * lock is held, @address is no longer below vma->vm_start), -EPERM if the
 * aligned address is below mmap_min_addr, -ENOMEM if the guard gap check,
 * anon_vma_prepare() or the vm_pgoff check fails, or whatever error
 * acct_stack_growth() reports.
 */
int expand_downwards(struct vm_area_struct *vma,
                                   unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *prev;
        int error = 0;

        address &= PAGE_MASK;
        if (address < mmap_min_addr)
                return -EPERM;

        /* Enforce stack_guard_gap */
        prev = vma->vm_prev;
        /* Check that both stack segments have the same anon_vma? */
        if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
                        vma_is_accessible(prev)) {
                if (address - prev->vm_end < stack_guard_gap)
                        return -ENOMEM;
        }

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma)))
                return -ENOMEM;

        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_lock in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
         */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address < vma->vm_start) {
                unsigned long size, grow;

                /* size: length of the vma once grown; grow: pages being added */
                size = vma->vm_end - address;
                grow = (vma->vm_start - address) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* vm_pgoff is reduced by grow below, so it must not underflow */
                if (grow <= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                /*
                                 * vma_gap_update() doesn't support concurrent
                                 * updates, but we only hold a shared mmap_lock
                                 * lock here, so we need to protect against
                                 * concurrent vma expansions.
                                 * anon_vma_lock_write() doesn't help here, as
                                 * we don't guarantee that all growable vmas
                                 * in a mm share the same root anon vma.
                                 * So, we reuse mm->page_table_lock to guard
                                 * against concurrent vma expansions.
                                 */
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
                                anon_vma_interval_tree_post_update_vma(vma);
                                vma_gap_update(vma);
                                spin_unlock(&mm->page_table_lock);

                                perf_event_mmap(vma);
                        }
                }
        }
        /* The steps below run whether or not the vma was actually grown. */
        anon_vma_unlock_write(vma->anon_vma);
        khugepaged_enter_vma_merge(vma, vma->vm_flags);
        validate_mm(mm);
        return error;
}

/* Gap kept between an expanding stack and its neighbouring mappings. */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;

/*
 * Handler for the "stack_guard_gap=" boot parameter.  The argument is a
 * decimal page count; it is applied only when the whole string parses as a
 * number, otherwise the current gap is left untouched.  Always returns 1.
 */
static int __init cmdline_parse_stack_guard_gap(char *p)
{
        char *rest;
        unsigned long pages = simple_strtoul(p, &rest, 10);

        if (*rest == '\0')
                stack_guard_gap = pages << PAGE_SHIFT;

        return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);

#ifdef CONFIG_STACK_GROWSUP
/* With CONFIG_STACK_GROWSUP, expanding the stack means growing it upwards. */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
        int err = expand_upwards(vma, address);

        return err;
}

/*
 * Look up the vma covering @addr (rounded down to a page boundary).  When no
 * vma covers it, try to expand the preceding vma up to @addr instead, and
 * populate the new range if that vma is VM_LOCKED.
 *
 * Returns the covering (or expanded) vma, or NULL if there is no preceding
 * vma or expand_stack() fails.
 */
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *found, *below;

        addr &= PAGE_MASK;
        found = find_vma_prev(mm, addr, &below);
        if (found && found->vm_start <= addr)
                return found;

        /* Nothing covers addr: the only option is growing the vma below it. */
        if (!below)
                return NULL;
        if (expand_stack(below, addr))
                return NULL;

        if (below->vm_flags & VM_LOCKED)
                populate_vma_page_range(below, addr, below->vm_end, NULL);
        return below;
}
#else
/* Without CONFIG_STACK_GROWSUP, expanding the stack means growing it downwards. */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
        int err = expand_downwards(vma, address);

        return err;
}

/*
 * Look up the vma covering @addr (rounded down to a page boundary).  When the
 * first vma ending above @addr starts above it and is VM_GROWSDOWN, expand it
 * down to @addr, populating the newly added range if the vma is VM_LOCKED.
 *
 * Returns the covering (or expanded) vma, or NULL if there is none, it is
 * not VM_GROWSDOWN, or expand_stack() fails.
 */
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
        unsigned long old_start;
        struct vm_area_struct *stack;

        addr &= PAGE_MASK;
        stack = find_vma(mm, addr);
        /* Either nothing was found (NULL) or the vma already covers addr. */
        if (!stack || stack->vm_start <= addr)
                return stack;
        if ((stack->vm_flags & VM_GROWSDOWN) == 0)
                return NULL;

        /* Remember the old start: only [addr, old_start) is new. */
        old_start = stack->vm_start;
        if (expand_stack(stack, addr) != 0)
                return NULL;
        if (stack->vm_flags & VM_LOCKED)
                populate_vma_page_range(stack, addr, old_start, NULL);
        return stack;
}
#endif

EXPORT_SYMBOL_GPL(find_extend_vma);

/*
 * Ok - we have the memory areas we should free on the vma list,
 * so release them, and do the vma updates.
 *
 * Called with the mm semaphore held.
 */
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
        unsigned long nr_accounted = 0;

        /* Update high watermark before we lower total_vm */
        update_hiwater_vm(mm);
        do {
                long nrpages = vma_pages(vma);

                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += nrpages;
                vm_stat_account(mm, vma->vm_flags, -nrpages);
                vma = remove_vma(vma);
        } while (vma);
        vm_unacct_memory(nr_accounted);
        validate_mm(mm);
}

/*
 * Tear down the page tables backing [start, end), covered by the vma list
 * that begins at @vma.  @prev is the vma preceding the region (or NULL).
 *
 * Called with the mm semaphore held.
 */
static void unmap_region(struct mm_struct *mm,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                unsigned long start, unsigned long end)
{
        struct vm_area_struct *next = vma_next(mm, prev);
        struct vm_area_struct *it;
        struct mmu_gather tlb;
        unsigned long floor, ceiling;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, start, end);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, vma, start, end);

        /*
         * No stale TLB entries may survive past the point where this
         * mapping leaves the rmap, so flush now if any vma on the list is
         * VM_PFNMAP or VM_MIXEDMAP.  Nested flushes are not a concern: the
         * mm semaphore is held for the removal, so a concurrent flush in
         * this region has to come through the rmap, and the rmap lock
         * serializes us against that.
         */
        it = vma;
        while (it) {
                if (it->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) {
                        tlb_flush_mmu(&tlb);
                        break;
                }
                it = it->vm_next;
        }

        /* Page tables may be freed between the two neighbouring vmas. */
        floor = prev ? prev->vm_end : FIRST_USER_ADDRESS;
        ceiling = next ? next->vm_start : USER_PGTABLES_CEILING;
        free_pgtables(&tlb, vma, floor, ceiling);
        tlb_finish_mmu(&tlb, start, end);
}

/*
 * Create a list of vma's touched by the unmap, removing them from the mm's
 * vma list as we go..
 *
 * Starting at @vma, every vma whose vm_start is below @end is erased from
 * mm->mm_rb and unlinked from the mm's list; the detached vmas stay chained
 * through vm_next with the last one NULL-terminated.  @prev is the vma
 * preceding @vma, or NULL if @vma was first in the mm.
 *
 * Returns true if the caller may downgrade mmap_lock, false if a neighbour
 * of the detached range is a growable stack vma.
 */
static bool
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
        struct vm_area_struct *prev, unsigned long end)
{
        struct vm_area_struct **insertion_point;
        struct vm_area_struct *tail_vma = NULL;

        /* The link that currently points at @vma and must be re-aimed. */
        insertion_point = (prev ? &prev->vm_next : &mm->mmap);
        vma->vm_prev = NULL;
        do {
                vma_rb_erase(vma, &mm->mm_rb);
                mm->map_count--;
                tail_vma = vma;
                vma = vma->vm_next;
        } while (vma && vma->vm_start < end);
        /* vma is now the first survivor after the range (or NULL). */
        *insertion_point = vma;
        if (vma) {
                vma->vm_prev = prev;
                vma_gap_update(vma);
        } else
                mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
        /* Terminate the detached list. */
        tail_vma->vm_next = NULL;

        /* Kill the cache */
        vmacache_invalidate(mm);

        /*
         * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
         * VM_GROWSUP VMA. Such VMAs can change their size under
         * down_read(mmap_lock) and collide with the VMA we are about to unmap.
         */
        if (vma && (vma->vm_flags & VM_GROWSDOWN))
                return false;
        if (prev && (prev->vm_flags & VM_GROWSUP))
                return false;
        return true;
}

/*
 * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
 * has already been checked or doesn't make sense to fail.
 *
 * Splits @vma at @addr by duplicating it.  With @new_below non-zero the
 * duplicate takes the part below @addr (its vm_end becomes @addr) and @vma
 * is adjusted to start at @addr; otherwise the duplicate takes the part from
 * @addr upwards and @vma is adjusted to end at @addr.
 *
 * Returns 0 on success, -ENOMEM if the duplicate cannot be allocated, or the
 * error returned by vm_ops->split(), vma_dup_policy(), anon_vma_clone() or
 * vma_adjust().
 */
int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long addr, int new_below)
{
        struct vm_area_struct *new;
        int err;

        /* Give the mapping's own split hook the chance to refuse. */
        if (vma->vm_ops && vma->vm_ops->split) {
                err = vma->vm_ops->split(vma, addr);
                if (err)
                        return err;
        }

        new = vm_area_dup(vma);
        if (!new)
                return -ENOMEM;

        if (new_below)
                new->vm_end = addr;
        else {
                new->vm_start = addr;
                /* Keep the file offset in step with the moved start. */
                new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
        }

        err = vma_dup_policy(vma, new);
        if (err)
                goto out_free_vma;

        err = anon_vma_clone(new, vma);
        if (err)
                goto out_free_mpol;

        /* The duplicate holds its own file reference and open() call. */
        if (new->vm_file)
                get_file(new->vm_file);

        if (new->vm_ops && new->vm_ops->open)
                new->vm_ops->open(new);

        if (new_below)
                err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
                        ((addr - new->vm_start) >> PAGE_SHIFT), new);
        else
                err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);

        /* Success. */
        if (!err)
                return 0;

        /* Clean everything up if vma_adjust failed. */
        if (new->vm_ops && new->vm_ops->close)
                new->vm_ops->close(new);
        if (new->vm_file)
                fput(new->vm_file);
        unlink_anon_vmas(new);
 out_free_mpol:
        mpol_put(vma_policy(new));
 out_free_vma:
        vm_area_free(new);
        return err;
}

/*
 * Split a vma into two pieces at address 'addr', a new vma is allocated
 * either for the first part or the tail.
 *
 * Unlike __split_vma(), this refuses with -ENOMEM once the mm already holds
 * sysctl_max_map_count mappings.
 */
int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
              unsigned long addr, int new_below)
{
        if (mm->map_count < sysctl_max_map_count)
                return __split_vma(mm, vma, addr, new_below);

        return -ENOMEM;
}

/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardinge <jeremy@goop.org>
 *
 * Unmaps [start, start + PAGE_ALIGN(len)) from @mm.  @uf, when non-NULL, is
 * passed to userfaultfd_unmap_prep().  @downgrade asks for mmap_lock to be
 * downgraded from write to read mode before the page tables are torn down.
 *
 * Returns 1 if the lock was downgraded, 0 on success without a downgrade
 * (including when nothing overlaps the range), -EINVAL for a misaligned,
 * out-of-range or zero-length request, -ENOMEM if the split would exceed
 * sysctl_max_map_count, or the error from __split_vma() or
 * userfaultfd_unmap_prep().
 */
int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
                struct list_head *uf, bool downgrade)
{
        unsigned long end;
        struct vm_area_struct *vma, *prev, *last;

        if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
                return -EINVAL;

        len = PAGE_ALIGN(len);
        end = start + len;
        if (len == 0)
                return -EINVAL;

        /*
         * arch_unmap() might do unmaps itself.  It must be called
         * and finish any rbtree manipulation before this code
         * runs and also starts to manipulate the rbtree.
         */
        arch_unmap(mm, start, end);

        /* Find the first overlapping VMA */
        vma = find_vma(mm, start);
        if (!vma)
                return 0;
        prev = vma->vm_prev;
        /* we have  start < vma->vm_end  */

        /* if it doesn't overlap, we have nothing.. */
        if (vma->vm_start >= end)
                return 0;

        /*
         * If we need to split any vma, do it now to save pain later.
         *
         * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
         * unmapped vm_area_struct will remain in use: so lower split_vma
         * places tmp vma above, and higher split_vma places tmp vma below.
         */
        if (start > vma->vm_start) {
                int error;

                /*
                 * Make sure that map_count on return from munmap() will
                 * not exceed its limit; but let map_count go just above
                 * its limit temporarily, to help free resources as expected.
                 */
                if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
                        return -ENOMEM;

                error = __split_vma(mm, vma, start, 0);
                if (error)
                        return error;
                /* The lower piece stays mapped and becomes the predecessor. */
                prev = vma;
        }

        /* Does it split the last one? */
        last = find_vma(mm, end);
        if (last && end > last->vm_start) {
                int error = __split_vma(mm, last, end, 1);
                if (error)
                        return error;
        }
        /* First vma that lies entirely inside the range to unmap. */
        vma = vma_next(mm, prev);

        if (unlikely(uf)) {
                /*
                 * If userfaultfd_unmap_prep returns an error the vmas
                 * will remain splitted, but userland will get a
                 * highly unexpected error anyway. This is no
                 * different than the case where the first of the two
                 * __split_vma fails, but we don't undo the first
                 * split, despite we could. This is unlikely enough
                 * failure that it's not worth optimizing it for.
                 */
                int error = userfaultfd_unmap_prep(vma, start, end, uf);
                if (error)
                        return error;
        }

        /*
         * unlock any mlock()ed ranges before detaching vmas
         */
        if (mm->locked_vm) {
                struct vm_area_struct *tmp = vma;
                while (tmp && tmp->vm_start < end) {
                        if (tmp->vm_flags & VM_LOCKED) {
                                mm->locked_vm -= vma_pages(tmp);
                                munlock_vma_pages_all(tmp);
                        }

                        tmp = tmp->vm_next;
                }
        }

        /* Detach vmas from rbtree */
        if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
                downgrade = false;

        if (downgrade)
                mmap_write_downgrade(mm);

        unmap_region(mm, vma, prev, start, end);

        /* Fix up all other VM information */
        remove_vma_list(mm, vma);

        /* 1 tells the caller it now holds mmap_lock in read mode only. */
        return downgrade ? 1 : 0;
}

/*
 * Unmap [start, start + len) from @mm without ever downgrading mmap_lock;
 * the return value is whatever __do_munmap() reports.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
              struct list_head *uf)
{
        const bool downgrade = false;

        return __do_munmap(mm, start, len, uf, downgrade);
}

/*
 * Unmap [start, start + len) from the current process, taking and releasing
 * mmap_lock around __do_munmap().  Returns -EINTR if the lock wait is
 * killed, otherwise the __do_munmap() result with the internal "lock was
 * downgraded" value of 1 folded into 0.
 */
static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
{
        struct mm_struct *mm = current->mm;
        LIST_HEAD(uf);
        int rc;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        rc = __do_munmap(mm, start, len, &uf, downgrade);
        if (rc != 1) {
                mmap_write_unlock(mm);
        } else {
                /*
                 * 1 means mmap_lock is now held for read only.  It is not a
                 * legal return value of vm_munmap() or munmap(), so report
                 * plain success instead.
                 */
                mmap_read_unlock(mm);
                rc = 0;
        }

        userfaultfd_unmap_complete(mm, &uf);
        return rc;
}

/* Unmap a range of the current process; mmap_lock is never downgraded. */
int vm_munmap(unsigned long start, size_t len)
{
        const bool downgrade = false;

        return __vm_munmap(start, len, downgrade);
}
EXPORT_SYMBOL(vm_munmap);

/*
 * munmap(2): strip any address tag, notify the profiler, then unmap with
 * mmap_lock downgrade allowed.
 */
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
        unsigned long untagged = untagged_addr(addr);

        profile_munmap(untagged);
        return __vm_munmap(untagged, len, true);
}


/*
 * Emulation of deprecated remap_file_pages() syscall.
 *
 * The request is translated into a MAP_SHARED | MAP_FIXED | MAP_POPULATE
 * do_mmap() of the file backing the vma at @start, over
 * [start, start + size) at page offset @pgoff.  Protections are derived from
 * that vma's vm_flags; of the caller's @flags only MAP_NONBLOCK is honoured.
 * @start and @size are rounded down to page boundaries.
 *
 * The range must begin inside a VM_SHARED vma and, if it spans several vmas,
 * they must be contiguous and share the first vma's file and flags.
 *
 * Returns 0 on success, -EINVAL for a non-zero @prot, an empty or wrapping
 * range, a wrapping @pgoff or an unsuitable vma layout, -EINTR if the wait
 * for mmap_lock is killed, or the error value returned by do_mmap().
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{

        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long populate = 0;
        unsigned long ret = -EINVAL;
        struct file *file;

        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
                     current->comm, current->pid);

        if (prot)
                return ret;
        start = start & PAGE_MASK;
        size = size & PAGE_MASK;

        /* Rejects both size == 0 and a range that wraps the address space. */
        if (start + size <= start)
                return ret;

        /* Does pgoff wrap? */
        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                return ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        vma = find_vma(mm, start);

        if (!vma || !(vma->vm_flags & VM_SHARED))
                goto out;

        if (start < vma->vm_start)
                goto out;

        if (start + size > vma->vm_end) {
                struct vm_area_struct *next;

                for (next = vma->vm_next; next; next = next->vm_next) {
                        /* hole between vmas ? */
                        if (next->vm_start != next->vm_prev->vm_end)
                                goto out;

                        if (next->vm_file != vma->vm_file)
                                goto out;

                        if (next->vm_flags != vma->vm_flags)
                                goto out;

                        if (start + size <= next->vm_end)
                                break;
                }

                /* Ran off the end of the list before covering the range. */
                if (!next)
                        goto out;
        }

        prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
        prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
        prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;

        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
        /*
         * A locked source vma yields a locked replacement mapping.
         *
         * No separate munlock pass over the over-mapped range is done here.
         * A loop meant for that used to sit at this point, guarded by
         * "tmp->vm_start >= start + size" with tmp starting at vma; since
         * vma->vm_start <= start < start + size was established above, the
         * guard was false on entry and the body never executed.  Dropping
         * it therefore changes nothing at run time.
         * NOTE(review): presumably the MAP_FIXED do_mmap() below munlocks
         * the old range as part of unmapping it - confirm before adding
         * any explicit munlock back.
         */
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;

        /* Hold a file reference across the do_mmap() call. */
        file = get_file(vma->vm_file);
        ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, pgoff, &populate, NULL);
        fput(file);
out:
        mmap_write_unlock(mm);
        if (populate)
                mm_populate(ret, populate);
        /* On success do_mmap() returned an address; the syscall reports 0. */
        if (!IS_ERR_VALUE(ret))
                ret = 0;
        return ret;
}

/*
 *  this is really a simplified "do_mmap".  it only handles
 *  anonymous maps.  eventually we may be able to do some
 *  brk-specific accounting here.
 *
 *  Maps @len bytes of anonymous memory at @addr in current->mm, either by
 *  merging into an adjacent vma or by linking a new one.  @flags may carry
 *  VM_EXEC only; @uf is handed to munmap_vma_range().
 *
 *  Returns 0 on success, -EINVAL for any flag other than VM_EXEC, -ENOMEM
 *  when a limit, accounting or allocation check fails, or the error from
 *  get_unmapped_area() / mlock_future_check().
 */
static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct rb_node **rb_link, *rb_parent;
        pgoff_t pgoff = addr >> PAGE_SHIFT;
        int error;
        unsigned long mapped_addr;

        /* Until we need other flags, refuse anything except VM_EXEC. */
        if ((flags & (~VM_EXEC)) != 0)
                return -EINVAL;
        flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

        /* Only the error case of the result is used; the mapping goes at addr. */
        mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
        if (IS_ERR_VALUE(mapped_addr))
                return mapped_addr;

        error = mlock_future_check(mm, mm->def_flags, len);
        if (error)
                return error;

        /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
        if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
                return -ENOMEM;

        /* Check against address space limits *after* clearing old maps... */
        if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
                return -ENOMEM;

        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
                return -ENOMEM;

        /* Can we just expand an old private anonymous mapping? */
        vma = vma_merge(mm, prev, addr, addr + len, flags,
                        NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
        if (vma)
                goto out;

        /*
         * create a vma struct for an anonymous mapping
         */
        vma = vm_area_alloc(mm);
        if (!vma) {
                /* Undo the charge taken by security_vm_enough_memory_mm(). */
                vm_unacct_memory(len >> PAGE_SHIFT);
                return -ENOMEM;
        }

        vma_set_anonymous(vma);
        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_pgoff = pgoff;
        vma->vm_flags = flags;
        vma->vm_page_prot = vm_get_page_prot(flags);
        vma_link(mm, vma, prev, rb_link, rb_parent);
out:
        /* Reached for both the merged and the freshly linked vma. */
        perf_event_mmap(vma);
        mm->total_vm += len >> PAGE_SHIFT;
        mm->data_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED)
                mm->locked_vm += (len >> PAGE_SHIFT);
        vma->vm_flags |= VM_SOFTDIRTY;
        return 0;
}

/*
 * Map @request bytes (rounded up to whole pages) of anonymous memory at
 * @addr in the current process via do_brk_flags(), populating the range
 * afterwards when the mm's default flags include VM_LOCKED.
 *
 * Returns 0 on success or for a zero-length request, -ENOMEM if page
 * alignment overflows, -EINTR if the wait for mmap_lock is killed, or the
 * error from do_brk_flags().
 */
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        unsigned long len = PAGE_ALIGN(request);
        bool want_populate;
        int rc;
        LIST_HEAD(uf);

        /* PAGE_ALIGN() wrapped around. */
        if (len < request)
                return -ENOMEM;
        if (len == 0)
                return 0;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        rc = do_brk_flags(addr, len, flags, &uf);
        /* Sample def_flags while mmap_lock is still held. */
        want_populate = (mm->def_flags & VM_LOCKED) != 0;
        mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);

        if (rc == 0 && want_populate)
                mm_populate(addr, len);
        return rc;
}
EXPORT_SYMBOL(vm_brk_flags);

/* vm_brk_flags() with no extra flags. */
int vm_brk(unsigned long addr, unsigned long len)
{
        const unsigned long no_flags = 0;

        return vm_brk_flags(addr, len, no_flags);
}
EXPORT_SYMBOL(vm_brk);

/*
 * Release all mmaps.
 *
 * Munlocks VM_LOCKED vmas, unmaps everything, frees the page tables and
 * then removes every vma on mm->mmap, unaccounting the pages of VM_ACCOUNT
 * vmas at the end.  Returns early, after arch_exit_mmap(), if the mm has no
 * vmas.
 */
void exit_mmap(struct mm_struct *mm)
{
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;

        /* mm's last user has gone, and its about to be pulled down */
        mmu_notifier_release(mm);

        if (unlikely(mm_is_oom_victim(mm))) {
                /*
                 * Manually reap the mm to free as much memory as possible.
                 * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
                 * this mm from further consideration.  Taking mm->mmap_lock for
                 * write after setting MMF_OOM_SKIP will guarantee that the oom
                 * reaper will not run on this mm again after mmap_lock is
                 * dropped.
                 *
                 * Nothing can be holding mm->mmap_lock here and the above call
                 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
                 * __oom_reap_task_mm() will not block.
                 *
                 * This needs to be done before calling munlock_vma_pages_all(),
                 * which clears VM_LOCKED, otherwise the oom reaper cannot
                 * reliably test it.
                 */
                (void)__oom_reap_task_mm(mm);

                set_bit(MMF_OOM_SKIP, &mm->flags);
                /* Lock/unlock pair used purely as a barrier, see above. */
                mmap_write_lock(mm);
                mmap_write_unlock(mm);
        }

        /* Only walk the list when something is actually mlocked. */
        if (mm->locked_vm) {
                vma = mm->mmap;
                while (vma) {
                        if (vma->vm_flags & VM_LOCKED)
                                munlock_vma_pages_all(vma);
                        vma = vma->vm_next;
                }
        }

        arch_exit_mmap(mm);

        vma = mm->mmap;
        if (!vma)        /* Can happen if dup_mmap() received an OOM */
                return;

        lru_add_drain();
        flush_cache_mm(mm);
        tlb_gather_mmu(&tlb, mm, 0, -1);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use -1 here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, vma, 0, -1);
        free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
        tlb_finish_mmu(&tlb, 0, -1);

        /*
         * Walk the list again, actually closing and freeing it,
         * with preemption enabled, without holding any MM locks.
         */
        while (vma) {
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += vma_pages(vma);
                /* remove_vma() returns the next vma on the list. */
                vma = remove_vma(vma);
                cond_resched();
        }
        vm_unacct_memory(nr_accounted);
}

/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree.  If vm_file is non-NULL
 * then i_mmap_rwsem is taken here.
 *
 * Returns 0 on success, or -ENOMEM if find_vma_links() cannot place the
 * range or a VM_ACCOUNT vma fails the memory accounting check.
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
        struct rb_node **link, *parent;
        struct vm_area_struct *pred;

        if (find_vma_links(mm, vma->vm_start, vma->vm_end,
                           &pred, &link, &parent))
                return -ENOMEM;

        if (vma->vm_flags & VM_ACCOUNT) {
                if (security_vm_enough_memory_mm(mm, vma_pages(vma)))
                        return -ENOMEM;
        }

        /*
         * A purely anonymous vma's vm_pgoff should not matter until its
         * first write fault, when the page's anon_vma and index get set.
         * Set it now to the value it will almost certainly end up with
         * (unless mremap moves the vma before that first fault), so that
         * /proc/pid/maps tells a consistent story.
         *
         * Deriving it from the virtual start address lets merges and
         * splits work seamlessly through the ordinary file pgoff checks
         * and manipulations.  do_mmap and do_brk_flags do the same.
         */
        if (vma_is_anonymous(vma)) {
                BUG_ON(vma->anon_vma);
                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
        }

        vma_link(mm, vma, pred, link, parent);
        return 0;
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 *
 * @vmap points at the source vma; it is updated if the source ends up
 * merged into the destination.  The destination covers [addr, addr + len)
 * at page offset @pgoff.  *@need_rmap_locks is written on success: false for
 * a freshly duplicated vma, otherwise whether the destination's vm_pgoff is
 * at or below the source's.
 *
 * Returns the destination vma, or NULL on failure.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks)
{
        struct vm_area_struct *vma = *vmap;
        unsigned long vma_start = vma->vm_start;
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma, *prev;
        struct rb_node **rb_link, *rb_parent;
        bool faulted_in_anon_vma = true;

        /*
         * If anonymous vma has not yet been faulted, update new pgoff
         * to match new location, to increase its chance of merging.
         */
        if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
                pgoff = addr >> PAGE_SHIFT;
                faulted_in_anon_vma = false;
        }

        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
                return NULL;        /* should never get here */
        /* First try to absorb the destination range into a neighbour. */
        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
                            vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
                            vma->vm_userfaultfd_ctx);
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
                 */
                if (unlikely(vma_start >= new_vma->vm_start &&
                             vma_start < new_vma->vm_end)) {
                        /*
                         * The only way we can get a vma_merge with
                         * self during an mremap is if the vma hasn't
                         * been faulted in yet and we were allowed to
                         * reset the dst vma->vm_pgoff to the
                         * destination address of the mremap to allow
                         * the merge to happen. mremap must change the
                         * vm_pgoff linearity between src and dst vmas
                         * (in turn preventing a vma_merge) to be
                         * safe. It is only safe to keep the vm_pgoff
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
                        *vmap = vma = new_vma;
                }
                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                /* No merge possible: duplicate the source and relocate it. */
                new_vma = vm_area_dup(vma);
                if (!new_vma)
                        goto out;
                new_vma->vm_start = addr;
                new_vma->vm_end = addr + len;
                new_vma->vm_pgoff = pgoff;
                if (vma_dup_policy(vma, new_vma))
                        goto out_free_vma;
                if (anon_vma_clone(new_vma, vma))
                        goto out_free_mempol;
                if (new_vma->vm_file)
                        get_file(new_vma->vm_file);
                if (new_vma->vm_ops && new_vma->vm_ops->open)
                        new_vma->vm_ops->open(new_vma);
                vma_link(mm, new_vma, prev, rb_link, rb_parent);
                *need_rmap_locks = false;
        }
        return new_vma;

        /* Unwind in reverse order of setup for the duplicate path. */
out_free_mempol:
        mpol_put(vma_policy(new_vma));
out_free_vma:
        vm_area_free(new_vma);
out:
        return NULL;
}

/*
 * Return true if the calling process may expand its vm space by the passed
 * number of pages
 *
 * RLIMIT_AS is always enforced.  RLIMIT_DATA is checked for data mappings
 * only; exceeding it triggers a one-time warning and is fatal unless
 * ignore_rlimit_data is set.
 */
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
        unsigned long new_data_vm;

        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                return false;

        if (!is_data_mapping(flags))
                return true;

        new_data_vm = mm->data_vm + npages;
        if (new_data_vm <= rlimit(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        /* Workaround for Valgrind */
        if (rlimit(RLIMIT_DATA) == 0 &&
            new_data_vm <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
                     current->comm, current->pid,
                     new_data_vm << PAGE_SHIFT,
                     rlimit(RLIMIT_DATA),
                     ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");

        /* Over the limit: allowed only when the limit is being ignored. */
        return ignore_rlimit_data ? true : false;
}

/*
 * Add @npages (negative to subtract) to the mm's total_vm and to at most one
 * of the exec, stack or data counters, chosen from @flags in that order of
 * precedence.
 */
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
        mm->total_vm += npages;

        if (is_exec_mapping(flags)) {
                mm->exec_vm += npages;
                return;
        }

        if (is_stack_mapping(flags)) {
                mm->stack_vm += npages;
                return;
        }

        if (is_data_mapping(flags))
                mm->data_vm += npages;
}

static vm_fault_t special_mapping_fault(struct vm_fault *vmf);

/*
 * Having a close hook prevents vma merging regardless of flags.
 *
 * The hook itself is intentionally empty: it is installed as .close in both
 * special-mapping vm_operations tables only for that effect.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
}

static const char *special_mapping_name(struct vm_area_struct *vma)
{
        return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}

/*
 * ->mremap hook for special mappings.
 *
 * Refuses (with a one-time warning and -EFAULT) when @new_vma does not belong
 * to the current task's mm.  Otherwise the decision is delegated to the
 * mapping's own ->mremap callback; a mapping without one returns 0.
 */
static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
        struct vm_special_mapping *sm;

        if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
                return -EFAULT;

        sm = new_vma->vm_private_data;
        if (!sm->mremap)
                return 0;

        return sm->mremap(sm, new_vma);
}

/*
 * Operations for mappings installed through _install_special_mapping(),
 * whose vm_private_data is a struct vm_special_mapping.
 */
static const struct vm_operations_struct special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
        .mremap = special_mapping_mremap,
        .name = special_mapping_name,
        /* vDSO code relies that VVAR can't be accessed remotely */
        .access = NULL,
};

/*
 * Operations for mappings installed through install_special_mapping(),
 * whose vm_private_data is a bare struct page * array.  No ->name or
 * ->mremap hook is provided for these.
 */
static const struct vm_operations_struct legacy_special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
};

/*
 * Fault handler shared by both special-mapping vm_operations tables.
 *
 * For a legacy mapping, vm_private_data is the page array itself.  Otherwise
 * it is a struct vm_special_mapping: its ->fault callback, when set, handles
 * the fault entirely; if not, its ->pages array is used.
 *
 * The array is walked for up to vmf->pgoff entries, stopping early at a NULL
 * entry.  A non-NULL entry at the final position gets an extra reference and
 * is handed back through vmf->page; a NULL entry yields VM_FAULT_SIGBUS.
 */
static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page **slot;
        pgoff_t left;

        if (vma->vm_ops != &legacy_special_mapping_vmops) {
                struct vm_special_mapping *sm = vma->vm_private_data;

                if (sm->fault)
                        return sm->fault(sm, vmf->vma, vmf);

                slot = sm->pages;
        } else {
                slot = vma->vm_private_data;
        }

        /* Advance to the requested offset, but never past the NULL terminator. */
        left = vmf->pgoff;
        while (left && *slot) {
                left--;
                slot++;
        }

        if (!*slot)
                return VM_FAULT_SIGBUS;

        get_page(*slot);
        vmf->page = *slot;
        return 0;
}

/*
 * Common worker for the special-mapping installers.
 *
 * Allocates a vma covering [@addr, @addr + @len), gives it @vm_flags plus
 * mm->def_flags, VM_DONTEXPAND and VM_SOFTDIRTY, attaches @ops and @priv, and
 * inserts it into @mm.  On success the mm's page counters are updated and a
 * perf mmap event is emitted.
 *
 * Return: the new vma, ERR_PTR(-ENOMEM) if allocation fails, or the
 * ERR_PTR-encoded error from insert_vm_struct() (the vma is freed first).
 */
static struct vm_area_struct *__install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, void *priv,
        const struct vm_operations_struct *ops)
{
        struct vm_area_struct *vma = vm_area_alloc(mm);
        int err;

        if (unlikely(!vma))
                return ERR_PTR(-ENOMEM);

        vma->vm_start = addr;
        vma->vm_end = addr + len;

        /* vm_page_prot is derived from the final flag set, so set flags first. */
        vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        vma->vm_ops = ops;
        vma->vm_private_data = priv;

        err = insert_vm_struct(mm, vma);
        if (err) {
                vm_area_free(vma);
                return ERR_PTR(err);
        }

        vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
        perf_event_mmap(vma);

        return vma;
}

bool vma_is_special_mapping(const struct vm_area_struct *vma,
        const struct vm_special_mapping *sm)
{
        return vma->vm_private_data == sm &&
                (vma->vm_ops == &special_mapping_vmops ||
                 vma->vm_ops == &legacy_special_mapping_vmops);
}

/*
 * Called with mm->mmap_lock held for writing.
 * Insert a new vma covering the given region, with the given flags.
 *
 * The vma is described by @spec, which is stored in vm_private_data and
 * consulted by special_mapping_fault(): if @spec has a ->fault callback it
 * handles faults, otherwise pages come from the @spec->pages array.
 *
 * When pages are supplied by an array of struct page *:
 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
 * The region past the last page supplied will always produce SIGBUS.
 * The array pointer and the pages it points to are assumed to stay alive
 * for as long as this mapping might exist.
 *
 * Returns whatever __install_special_mapping() returns: the new vma, or an
 * ERR_PTR-encoded error.
 */
struct vm_area_struct *_install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, const struct vm_special_mapping *spec)
{
        /* The const is cast away only to fit the void * private-data slot. */
        return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
                                        &special_mapping_vmops);
}

/*
 * Legacy variant of _install_special_mapping(): the mapping is backed
 * directly by the @pages array (stored as vm_private_data) and uses
 * legacy_special_mapping_vmops.
 *
 * Returns the result of PTR_ERR_OR_ZERO() applied to the vma produced by
 * __install_special_mapping() - presumably 0 on success or a negative errno.
 */
int install_special_mapping(struct mm_struct *mm,
                            unsigned long addr, unsigned long len,
                            unsigned long vm_flags, struct page **pages)
{
        struct vm_area_struct *vma = __install_special_mapping(
                mm, addr, len, vm_flags, (void *)pages,
                &legacy_special_mapping_vmops);

        return PTR_ERR_OR_ZERO(vma);
}

/* Taken by mm_take_all_locks() and released by mm_drop_all_locks(). */
static DEFINE_MUTEX(mm_all_locks_mutex);

/*
 * Write-lock the root rwsem of @anon_vma on behalf of mm_take_all_locks(),
 * unless it has already been taken in this pass.
 *
 * Bit 0 of anon_vma->root->rb_root.rb_root.rb_node is used as the "already
 * locked" marker; finding it set after acquiring the rwsem is a BUG.
 */
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of root->rb_root.rb_root.rb_node can't change
                 * from under us because we hold the mm_all_locks_mutex.
                 */
                down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
                /*
                 * We can safely modify the marker bit after taking the
                 * anon_vma->root->rwsem. If some other vma in this mm shares
                 * the same anon_vma we won't take it again.
                 *
                 * No need of atomic instructions here, the rb_node pointer
                 * can't change from under us thanks to the
                 * anon_vma->root->rwsem.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
        }
}

/*
 * Write-lock @mapping->i_mmap_rwsem on behalf of mm_take_all_locks(), unless
 * AS_MM_ALL_LOCKS shows it has already been taken in this pass.
 *
 * The flag is set first and the rwsem is acquired afterwards; finding the
 * flag already set at that point is a BUG.
 */
static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change from under us because
                 * we hold the mm_all_locks_mutex.
                 *
                 * Operations on ->flags have to be atomic because
                 * even if AS_MM_ALL_LOCKS is stable thanks to the
                 * mm_all_locks_mutex, there may be other cpus
                 * changing other bitflags in parallel to us.
                 */
                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                        BUG();
                down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
        }
}

/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm. This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_lock in write mode before calling
 * mm_take_all_locks(). The caller isn't allowed to release the
 * mmap_lock until mm_drop_all_locks() returns.
 *
 * mmap_lock in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout. It's also needed in write mode to avoid new
 * anon_vmas being associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->root->rb_root.rb_root.rb_node and the
 * AS_MM_ALL_LOCKS bitflag in mapping->flags avoid taking the same lock
 * twice, if more than one vma in this mm is backed by the same anon_vma
 * or address_space.
 *
 * We take locks in the following order, according to the comment at the
 * beginning of mm/rmap.c:
 *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
 *     hugetlb mapping);
 *   - all i_mmap_rwsem locks;
 *   - all anon_vma->rwsem locks.
 *
 * We can take all locks within these types randomly because the VM code
 * doesn't nest them and we are protected from parallel mm_take_all_locks()
 * by mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
 * that may have to take thousands of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals: it then
 * drops everything it took (including mm_all_locks_mutex, through
 * mm_drop_all_locks()) and returns -EINTR. It returns 0 on success, with
 * mm_all_locks_mutex still held.
 */
int mm_take_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;

        /* BUG if mmap_read_trylock() returns non-zero. */
        BUG_ON(mmap_read_trylock(mm));

        mutex_lock(&mm_all_locks_mutex);

        /* Pass 1: file mappings of hugetlb vmas. */
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Pass 2: file mappings of all non-hugetlb vmas. */
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                !is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Pass 3: every anon_vma on each vma's anon_vma_chain. */
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_lock_anon_vma(mm, avc->anon_vma);
        }

        return 0;

out_unlock:
        /* Undo whatever was locked so far; this also releases the mutex. */
        mm_drop_all_locks(mm);
        return -EINTR;
}

/*
 * Undo vm_lock_anon_vma(): if the marker bit (bit 0 of
 * anon_vma->root->rb_root.rb_root.rb_node) is set, clear it and then call
 * anon_vma_unlock_write().  An anon_vma whose marker is clear is left alone.
 */
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
        if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of root->rb_root.rb_root.rb_node can't change to 0
                 * from under us because we hold the mm_all_locks_mutex.
                 *
                 * We must however clear the bitflag before unlocking
                 * the anon_vma so the users using the anon_vma->rb_root will
                 * never see our bitflag.
                 *
                 * No need of atomic instructions here, the rb_node pointer
                 * can't change from under us until we release the
                 * anon_vma->root->rwsem.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
                                          &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
                anon_vma_unlock_write(anon_vma);
        }
}

/*
 * Undo vm_lock_mapping(): if AS_MM_ALL_LOCKS is set on @mapping, release
 * its i_mmap lock and then atomically clear the flag.  A mapping without
 * the flag is left alone.
 */
static void vm_unlock_mapping(struct address_space *mapping)
{
        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change to 0 from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                i_mmap_unlock_write(mapping);
                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
                                        &mapping->flags))
                        BUG();
        }
}

/*
 * Release everything taken by mm_take_all_locks(), then drop
 * mm_all_locks_mutex.
 *
 * A single walk over the vma list unlocks each vma's anon_vmas and its file
 * mapping; the unlock helpers skip objects that were never locked, so this
 * is also used to unwind a partially completed mm_take_all_locks().
 *
 * The mmap_lock cannot be released by the caller until
 * mm_drop_all_locks() returns.
 */
void mm_drop_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;

        /* BUG if mmap_read_trylock() returns non-zero or the mutex is not held. */
        BUG_ON(mmap_read_trylock(mm));
        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));

        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_unlock_anon_vma(avc->anon_vma);
                if (vma->vm_file && vma->vm_file->f_mapping)
                        vm_unlock_mapping(vma->vm_file->f_mapping);
        }

        mutex_unlock(&mm_all_locks_mutex);
}

/*
 * Set up the vm_committed_as percpu counter with an initial value of 0.
 * A failure trips VM_BUG_ON().
 */
void __init mmap_init(void)
{
        int err = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);

        VM_BUG_ON(err);
}

/*
 * Compute the default for sysctl_user_reserve_kbytes.
 *
 * The reserve exists so that a single memory-hogging process cannot leave
 * its owner unable to recover (kill the hog) in OVERCOMMIT_NEVER mode.
 *
 * Default: min(free memory / 32, i.e. roughly 3%, 128MB).
 * 128MB is enough to recover with sshd/login, bash, and top/kill.
 *
 * Always returns 0.
 */
static int init_user_reserve(void)
{
        unsigned long avail_kb, cap_kb = 1UL << 17;

        /* Free pages converted to kilobytes. */
        avail_kb = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);

        sysctl_user_reserve_kbytes = min(avail_kb / 32, cap_kb);

        return 0;
}
subsys_initcall(init_user_reserve);

/*
 * Compute the default for sysctl_admin_reserve_kbytes.
 *
 * The reserve lets the sys admin log in and kill a memory-hogging process.
 *
 * Default: min(free memory / 32, i.e. roughly 3%, 8MB).  Systems with more
 * than 256MB free therefore reserve 8MB, enough to recover with sshd, bash,
 * and top in OVERCOMMIT_GUESS; smaller systems reserve about 3% of free
 * pages.
 *
 * Always returns 0.
 */
static int init_admin_reserve(void)
{
        unsigned long avail_kb, cap_kb = 1UL << 13;

        /* Free pages converted to kilobytes. */
        avail_kb = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);

        sysctl_admin_reserve_kbytes = min(avail_kb / 32, cap_kb);

        return 0;
}
subsys_initcall(init_admin_reserve);

/*
 * Reinitialise the user and admin reserves when memory is added or removed.
 *
 * The default user reserve max is 128MB and the default admin reserve max
 * is 8MB.  These are usually, but not always, enough to recover from a
 * memory hog using login/sshd, a shell, and tools like top.  The admin may
 * have raised or disabled them depending on swap or on the recovery tools.
 *
 * MEM_ONLINE:  a reserve is recomputed only while it is non-zero and below
 *              its default max; a reserve that was eliminated or raised to
 *              or above the max is trusted as the admin's choice.
 * MEM_OFFLINE: a reserve is recomputed (and the reset logged) only if it
 *              now exceeds the free memory.
 * Any other action leaves both reserves untouched.
 *
 * Always returns NOTIFY_OK.
 */
static int reserve_mem_notifier(struct notifier_block *nb,
                             unsigned long action, void *data)
{
        unsigned long cur, free_kbytes;

        if (action == MEM_ONLINE) {
                /* Default max is 128MB. Leave alone if modified by operator. */
                cur = sysctl_user_reserve_kbytes;
                if (cur > 0 && cur < (1UL << 17))
                        init_user_reserve();

                /* Default max is 8MB.  Leave alone if modified by operator. */
                cur = sysctl_admin_reserve_kbytes;
                if (cur > 0 && cur < (1UL << 13))
                        init_admin_reserve();
        } else if (action == MEM_OFFLINE) {
                free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);

                if (sysctl_user_reserve_kbytes > free_kbytes) {
                        init_user_reserve();
                        pr_info("vm.user_reserve_kbytes reset to %lu\n",
                                sysctl_user_reserve_kbytes);
                }

                if (sysctl_admin_reserve_kbytes > free_kbytes) {
                        init_admin_reserve();
                        pr_info("vm.admin_reserve_kbytes reset to %lu\n",
                                sysctl_admin_reserve_kbytes);
                }
        }

        return NOTIFY_OK;
}

/* Notifier block registered by init_reserve_notifier(); dispatches to reserve_mem_notifier(). */
static struct notifier_block reserve_mem_nb = {
        .notifier_call = reserve_mem_notifier,
};

/*
 * Register reserve_mem_nb so the reserves are revisited on memory hot
 * add/remove.  A registration failure is only logged; the initcall still
 * returns 0.
 */
static int __meminit init_reserve_notifier(void)
{
        int err = register_hotmemory_notifier(&reserve_mem_nb);

        if (err)
                pr_err("Failed registering memory add/remove notifier for admin reserve\n");

        return 0;
}
subsys_initcall(init_reserve_notifier);












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 

    1 













































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the diskquota system for the LINUX operating system. QUOTA
 * is implemented using the BSD system call interface as the means of
 * communication with the user level. This file contains the generic routines
 * called by the different filesystems on allocation of an inode or block.
 * These routines take care of the administration needed to have a consistent
 * diskquota tracking system. The ideas of both user and group quotas are based
 * on the Melbourne quota system as used on BSD derived systems. The internal
 * implementation is based on one of the several variants of the LINUX
 * inode-subsystem with added complexity of the diskquota system.
 *
 * Author:        Marco van Wieringen <mvw@planets.elm.net>
 *
 * Fixes:   Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96
 *
 *                Revised list management to avoid races
 *                -- Bill Hawes, <whawes@star.net>, 9/98
 *
 *                Fixed races in dquot_transfer(), dqget() and dquot_alloc_...().
 *                As the consequence the locking was moved from dquot_decr_...(),
 *                dquot_incr_...() to calling functions.
 *                invalidate_dquots() now writes modified dquots.
 *                Serialized quota_off() and quota_on() for mount point.
 *                Fixed a few bugs in grow_dquots().
 *                Fixed deadlock in write_dquot() - we no longer account quotas on
 *                quota files
 *                remove_dquot_ref() moved to inode.c - it now traverses through inodes
 *                add_dquot_ref() restarts after blocking
 *                Added check for bogus uid and fixed check for group in quotactl.
 *                Jan Kara, <jack@suse.cz>, sponsored by SuSE CR, 10-11/99
 *
 *                Used struct list_head instead of own list struct
 *                Invalidation of referenced dquots is no longer possible
 *                Improved free_dquots list management
 *                Quota and i_blocks are now updated in one place to avoid races
 *                Warnings are now delayed so we won't block in critical section
 *                Write updated not to require dquot lock
 *                Jan Kara, <jack@suse.cz>, 9/2000
 *
 *                Added dynamic quota structure allocation
 *                Jan Kara <jack@suse.cz> 12/2000
 *
 *                Rewritten quota interface. Implemented new quota format and
 *                formats registering.
 *                Jan Kara, <jack@suse.cz>, 2001,2002
 *
 *                New SMP locking.
 *                Jan Kara, <jack@suse.cz>, 10/2002
 *
 *                Added journalled quota support, fix lock inversion problems
 *                Jan Kara, <jack@suse.cz>, 2003,2004
 *
 * (C) Copyright 1994 - 1997 Marco van Wieringen
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/tty.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/kmod.h>
#include <linux/namei.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include "../internal.h" /* ugh */

#include <linux/uaccess.h>

/*
 * There are five quota SMP locks:
 * * dq_list_lock protects all lists with quotas and quota formats.
 * * dquot->dq_dqb_lock protects data from dq_dqb
 * * inode->i_lock protects inode->i_blocks, i_bytes and also guards
 *   consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that
 *   dquot_transfer() can stabilize amount it transfers
 * * dq_data_lock protects mem_dqinfo structures and modifications of dquot
 *   pointers in the inode
 * * dq_state_lock protects modifications of quota state (on quotaon and
 *   quotaoff) and readers who care about latest values take it as well.
 *
 * The spinlock ordering is hence:
 *   dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock,
 *   dq_list_lock > dq_state_lock
 *
 * Note that some things (e.g. sb pointer, type, id) don't change during
 * the life of the dquot structure and so needn't be protected by a lock.
 *
 * Operations accessing dquots via inode pointers are protected by dquot_srcu.
 * Reading such a pointer requires srcu_read_lock(&dquot_srcu), and
 * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
 * inode and before dropping dquot references to avoid use of dquots after
 * they are freed. dq_data_lock is used to serialize the pointer setting and
 * clearing operations.
 * Special care needs to be taken about S_NOQUOTA inode flag (marking that
 * inode is a quota file). Functions adding pointers from inode to dquots have
 * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
 * have to do all pointer modifications before dropping dq_data_lock. This makes
 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
 * then drops all pointers to dquots from an inode.
 *
 * Each dquot has its dq_lock mutex.  Dquot is locked when it is being read to
 * memory (or space for it is being allocated) on the first dqget(), when it is
 * being written out, and when it is being released on the last dqput(). The
 * allocation and release operations are serialized by the dq_lock and by
 * checking the use count in dquot_release().
 *
 * Lock ordering (including related VFS locks) is the following:
 *   s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem
 */

/* Protects all lists with quotas and quota formats */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
/* Protects modifications of quota state (quotaon / quotaoff) */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
/* Protects mem_dqinfo structures and dquot pointer updates in inodes */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
EXPORT_SYMBOL(dq_data_lock);
/* SRCU domain covering accesses to dquots through inode pointers */
DEFINE_STATIC_SRCU(dquot_srcu);

/* invalidate_dquots() sleeps here until dqput() drops the other references */
static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq);

/*
 * Print a quota error message for the filesystem @sb.
 *
 * @sb:   superblock whose s_id is reported as the device
 * @func: name of the reporting function
 * @fmt:  printf-style format for the remaining arguments
 *
 * Nothing is printed when printk_ratelimit() declines the message.
 */
void __quota_error(struct super_block *sb, const char *func,
                   const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        if (!printk_ratelimit())
                return;

        va_start(args, fmt);
        /* %pV expands the caller's format and arguments in place */
        vaf.fmt = fmt;
        vaf.va = &args;
        printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
               sb->s_id, func, &vaf);
        va_end(args);
}
EXPORT_SYMBOL(__quota_error);

#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
/* Quota type names, indexed by quota type; used in diagnostic messages */
static char *quotatypes[] = INITQFNAMES;
#endif
static struct quota_format_type *quota_formats;        /* List of registered formats */
/*
 * Format id -> module name table, terminated by an entry whose qm_fmt_id
 * is 0. find_quota_format() uses it to request a missing format module.
 */
static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;

/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;

/*
 * Register a quota format: link @fmt at the head of the quota_formats list
 * under dq_list_lock. Always returns 0.
 */
int register_quota_format(struct quota_format_type *fmt)
{
        spin_lock(&dq_list_lock);
        fmt->qf_next = quota_formats;
        quota_formats = fmt;
        spin_unlock(&dq_list_lock);
        return 0;
}
EXPORT_SYMBOL(register_quota_format);

/*
 * Unregister a quota format: unlink @fmt from the quota_formats list.
 * Nothing happens if @fmt is not on the list.
 */
void unregister_quota_format(struct quota_format_type *fmt)
{
        struct quota_format_type **link = &quota_formats;

        spin_lock(&dq_list_lock);
        /* Find the link pointing at @fmt (or the terminating NULL) */
        while (*link && *link != fmt)
                link = &(*link)->qf_next;
        if (*link)
                *link = (*link)->qf_next;
        spin_unlock(&dq_list_lock);
}
EXPORT_SYMBOL(unregister_quota_format);

/* Walk quota_formats looking for format @id; caller holds dq_list_lock */
static struct quota_format_type *lookup_quota_format_locked(int id)
{
        struct quota_format_type *fmt = quota_formats;

        while (fmt && fmt->qf_fmt_id != id)
                fmt = fmt->qf_next;
        return fmt;
}

/*
 * Look up the registered quota format with the given id and take a
 * reference on its owning module.
 *
 * If the format is not registered (or its module reference cannot be taken),
 * the module listed for @id in module_names[] is requested and the lookup is
 * retried once. Returns the format with a module reference held, or NULL.
 */
static struct quota_format_type *find_quota_format(int id)
{
        struct quota_format_type *fmt;
        int idx = 0;

        spin_lock(&dq_list_lock);
        fmt = lookup_quota_format_locked(id);
        if (fmt && try_module_get(fmt->qf_owner))
                goto out_unlock;
        /* request_module() is called without dq_list_lock held */
        spin_unlock(&dq_list_lock);

        while (module_names[idx].qm_fmt_id &&
               module_names[idx].qm_fmt_id != id)
                idx++;
        if (!module_names[idx].qm_fmt_id)
                return NULL;
        if (request_module(module_names[idx].qm_mod_name))
                return NULL;

        spin_lock(&dq_list_lock);
        fmt = lookup_quota_format_locked(id);
        if (fmt && !try_module_get(fmt->qf_owner))
                fmt = NULL;
out_unlock:
        spin_unlock(&dq_list_lock);
        return fmt;
}

/* Drop the reference on the module owning @fmt (see find_quota_format()) */
static void put_quota_format(struct quota_format_type *fmt)
{
        module_put(fmt->qf_owner);
}

/*
 * Dquot List Management:
 * The quota code uses five lists for dquot management: the inuse_list,
 * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array.
 * A single dquot structure may be on some of those lists, depending on
 * its current state.
 *
 * All dquots are placed to the end of inuse_list when first created, and this
 * list is used for invalidate operation, which must look at every dquot.
 *
 * When the last reference of a dquot is dropped, the dquot is added to
 * releasing_dquots. We'll then queue work item which will call
 * synchronize_srcu() and after that perform the final cleanup of all the
 * dquots on the list. Each cleaned up dquot is moved to free_dquots list.
 * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot
 * struct.
 *
 * Unused and cleaned up dquots are in the free_dquots list and this list is
 * searched whenever we need an available dquot. Dquots are removed from the
 * list as soon as they are used again and dqstats.free_dquots gives the number
 * of dquots on the list. When dquot is invalidated it's completely released
 * from memory.
 *
 * Dirty dquots are added to the dqi_dirty_list of quota_info when they are
 * marked dirty, and this list is searched when writing dirty dquots back to
 * the quota file. Note that some filesystems do dirty dquot tracking on their
 * own (e.g. in a journal) and thus don't use dqi_dirty_list.
 *
 * Dquots with a specific identity (device, type and id) are placed on
 * one of the dquot_hash[] hash chains. This provides an efficient search
 * mechanism to locate a specific dquot.
 */

/* Every allocated dquot, linked through dq_inuse */
static LIST_HEAD(inuse_list);
/* Unused, cleaned up dquots available for reuse, linked through dq_free */
static LIST_HEAD(free_dquots);
/* Dquots whose last reference was dropped, awaiting quota_release_workfn() */
static LIST_HEAD(releasing_dquots);
/* Parameters of the dquot hash used by hashfn() */
static unsigned int dq_hash_bits, dq_hash_mask;
/* Array of hash chains holding dquots by (sb, id) */
static struct hlist_head *dquot_hash;

/* Quota statistics counters (see the dqstats_inc() / dqstats_dec() users) */
struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);

/* Forward declarations of helpers defined later in this file */
static qsize_t inode_get_rsv_space(struct inode *inode);
static qsize_t __inode_get_rsv_space(struct inode *inode);
static int __dquot_initialize(struct inode *inode, int type);

/* Deferred cleanup of dquots queued on releasing_dquots by dqput() */
static void quota_release_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn);

/*
 * Compute the dquot_hash[] bucket index for the dquot identified by
 * superblock @sb and quota id @qid.
 */
static inline unsigned int
hashfn(const struct super_block *sb, struct kqid qid)
{
        const unsigned int id = from_kqid(&init_user_ns, qid);
        const int type = qid.type;
        unsigned long hash = (unsigned long)sb >> L1_CACHE_SHIFT;

        /* Mix in the numeric id and the quota type */
        hash ^= id;
        hash *= MAXQUOTAS - type;
        /* Fold the upper bits down before masking to the table size */
        hash += hash >> dq_hash_bits;
        return hash & dq_hash_mask;
}

/*
 * Following list functions expect dq_list_lock to be held
 */
/* Add @dquot to the head of the hash chain selected by its sb and id */
static inline void insert_dquot_hash(struct dquot *dquot)
{
        unsigned int bucket = hashfn(dquot->dq_sb, dquot->dq_id);

        hlist_add_head(&dquot->dq_hash, dquot_hash + bucket);
}

/* Unlink @dquot from its hash chain and reinitialise its hash node */
static inline void remove_dquot_hash(struct dquot *dquot)
{
        hlist_del_init(&dquot->dq_hash);
}

/*
 * Search hash chain @hashent for the dquot belonging to superblock @sb with
 * quota id @qid. Returns the dquot, or NULL when the chain has no match.
 * No reference is taken here.
 */
static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
                                struct kqid qid)
{
        struct hlist_node *pos;

        hlist_for_each (pos, dquot_hash + hashent) {
                struct dquot *candidate;

                candidate = hlist_entry(pos, struct dquot, dq_hash);
                if (candidate->dq_sb != sb)
                        continue;
                if (!qid_eq(candidate->dq_id, qid))
                        continue;
                return candidate;
        }
        return NULL;
}

/*
 * Add a dquot to the tail of the free list and account for it in the
 * DQST_FREE_DQUOTS statistic.
 */
static inline void put_dquot_last(struct dquot *dquot)
{
        list_add_tail(&dquot->dq_free, &free_dquots);
        dqstats_inc(DQST_FREE_DQUOTS);
}

/*
 * Queue a dquot at the tail of releasing_dquots (via dq_free) and flag it
 * with DQ_RELEASING_B so other code can tell it is pending cleanup.
 */
static inline void put_releasing_dquots(struct dquot *dquot)
{
        list_add_tail(&dquot->dq_free, &releasing_dquots);
        set_bit(DQ_RELEASING_B, &dquot->dq_flags);
}

/*
 * Take a dquot off whichever list its dq_free node is on. A dquot that was
 * on releasing_dquots only loses its DQ_RELEASING_B flag; one that was on
 * free_dquots is subtracted from the DQST_FREE_DQUOTS statistic. Does
 * nothing when the dquot is on neither list.
 */
static inline void remove_free_dquot(struct dquot *dquot)
{
        if (list_empty(&dquot->dq_free))
                return;

        list_del_init(&dquot->dq_free);
        if (test_bit(DQ_RELEASING_B, &dquot->dq_flags))
                clear_bit(DQ_RELEASING_B, &dquot->dq_flags);
        else
                dqstats_dec(DQST_FREE_DQUOTS);
}

/* Link a dquot into inuse_list and count it in DQST_ALLOC_DQUOTS */
static inline void put_inuse(struct dquot *dquot)
{
        /* We add to the back of inuse list so we don't have to restart
         * when traversing this list and we block */
        list_add_tail(&dquot->dq_inuse, &inuse_list);
        dqstats_inc(DQST_ALLOC_DQUOTS);
}

/* Unlink a dquot from inuse_list and drop it from DQST_ALLOC_DQUOTS */
static inline void remove_inuse(struct dquot *dquot)
{
        dqstats_dec(DQST_ALLOC_DQUOTS);
        list_del(&dquot->dq_inuse);
}
/*
 * End of list functions needing dq_list_lock
 */

/*
 * Wait until whoever currently holds dquot->dq_lock has released it: the
 * mutex is acquired and dropped again straight away.
 */
static void wait_on_dquot(struct dquot *dquot)
{
        mutex_lock(&dquot->dq_lock);
        mutex_unlock(&dquot->dq_lock);
}

/* Non-zero when DQ_ACTIVE_B is set in the dquot's flags */
static inline int dquot_active(struct dquot *dquot)
{
        return test_bit(DQ_ACTIVE_B, &dquot->dq_flags);
}

/* Non-zero when DQ_MOD_B (the dirty flag) is set in the dquot's flags */
static inline int dquot_dirty(struct dquot *dquot)
{
        return test_bit(DQ_MOD_B, &dquot->dq_flags);
}

/*
 * Mark a dquot dirty through the filesystem's ->mark_dirty() operation and
 * pass its return value back to the caller.
 */
static inline int mark_dquot_dirty(struct dquot *dquot)
{
        return dquot->dq_sb->dq_op->mark_dirty(dquot);
}

/* Mark dquot dirty in atomic manner, and return it's old dirty flag state */
int dquot_mark_dquot_dirty(struct dquot *dquot)
{
        int ret = 1;

        if (!dquot_active(dquot))
                return 0;

        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
                return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags);

        /* If quota is dirty already, we don't have to acquire dq_list_lock */
        if (dquot_dirty(dquot))
                return 1;

        spin_lock(&dq_list_lock);
        if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
                list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
                                info[dquot->dq_id.type].dqi_dirty_list);
                ret = 0;
        }
        spin_unlock(&dq_list_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_mark_dquot_dirty);

/*
 * Dirtify all the dquots - this can block when journalling.
 *
 * @dquots: array of MAXQUOTAS SRCU-protected dquot pointers; NULL entries
 *          are skipped.
 *
 * Every entry is processed even if an earlier one failed. Returns the first
 * negative value reported by mark_dquot_dirty(), or 0 if there was none.
 * A positive return (such as dquot_mark_dquot_dirty() reporting that the
 * dquot was already dirty) is not an error: it is neither returned nor
 * allowed to hide a failure on a later entry.
 */
static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
{
        int ret, err, cnt;
        struct dquot *dquot;

        err = 0;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;
                /* Even in case of error we have to continue */
                ret = mark_dquot_dirty(dquot);
                if (!err && ret < 0)
                        err = ret;
        }
        return err;
}

/*
 * Drop one reference on each of the MAXQUOTAS dquots in the array.
 * NULL entries are fine: dqput() ignores them.
 */
static inline void dqput_all(struct dquot **dquot)
{
        unsigned int cnt;

        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                dqput(dquot[cnt]);
}

/*
 * Clear the dirty flag of a dquot and, unless DQUOT_NOLIST_DIRTY is set,
 * take the dquot off the dirty list.
 *
 * Returns non-zero if the dquot was dirty, 0 otherwise.
 */
static inline int clear_dquot_dirty(struct dquot *dquot)
{
        int was_dirty = 0;

        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
                return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags);

        spin_lock(&dq_list_lock);
        if (test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) {
                list_del_init(&dquot->dq_dirty);
                was_dirty = 1;
        }
        spin_unlock(&dq_list_lock);
        return was_dirty;
}

/*
 * Set DQF_INFO_DIRTY on the quota info of the given type for @sb.
 * The flag is updated under dq_data_lock.
 */
void mark_info_dirty(struct super_block *sb, int type)
{
        spin_lock(&dq_data_lock);
        sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY;
        spin_unlock(&dq_data_lock);
}
EXPORT_SYMBOL(mark_info_dirty);

/*
 *        Read dquot from disk and alloc space for it
 *
 *        Runs under dquot->dq_lock. The dquot is read with ->read_dqblk()
 *        unless DQ_READ_B is already set. If it is not active and has no
 *        dq_off yet, it is instantiated with ->commit_dqblk(), and the file
 *        info is written when dirty. On success DQ_ACTIVE_B is set.
 *
 *        Returns a negative value if reading, committing or writing the info
 *        failed (DQ_ACTIVE_B is then left untouched); otherwise the result of
 *        the last read/commit call, or 0 when neither was needed.
 */

int dquot_acquire(struct dquot *dquot)
{
        int ret = 0, ret2 = 0;
        unsigned int memalloc;
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);

        mutex_lock(&dquot->dq_lock);
        /* Saved state is restored at out_iolock */
        memalloc = memalloc_nofs_save();
        if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
                ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
                if (ret < 0)
                        goto out_iolock;
        }
        /* Make sure flags update is visible after dquot has been filled */
        smp_mb__before_atomic();
        set_bit(DQ_READ_B, &dquot->dq_flags);
        /* Instantiate dquot if needed */
        if (!dquot_active(dquot) && !dquot->dq_off) {
                ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
                /* Write the info if needed */
                if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
                        ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
                                        dquot->dq_sb, dquot->dq_id.type);
                }
                /* A commit failure takes precedence over an info write failure */
                if (ret < 0)
                        goto out_iolock;
                if (ret2 < 0) {
                        ret = ret2;
                        goto out_iolock;
                }
        }
        /*
         * Make sure flags update is visible after on-disk struct has been
         * allocated. Paired with smp_rmb() in dqget().
         */
        smp_mb__before_atomic();
        set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
        memalloc_nofs_restore(memalloc);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_acquire);

/*
 *        Write dquot to disk
 *
 *        Runs under dquot->dq_lock. Nothing is written (and 0 is returned)
 *        when the dquot was not dirty. A dirty but inactive dquot yields
 *        -EIO; otherwise the result of ->commit_dqblk() is returned.
 */
int dquot_commit(struct dquot *dquot)
{
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
        unsigned int nofs_flags;
        int err = 0;

        mutex_lock(&dquot->dq_lock);
        nofs_flags = memalloc_nofs_save();
        if (clear_dquot_dirty(dquot)) {
                /*
                 * A dquot can be inactive only if there was an error during
                 * read/init, so it is better not to write it.
                 */
                if (!dquot_active(dquot))
                        err = -EIO;
                else
                        err = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
        }
        memalloc_nofs_restore(nofs_flags);
        mutex_unlock(&dquot->dq_lock);
        return err;
}
EXPORT_SYMBOL(dquot_commit);

/*
 *        Release dquot
 *
 *        Runs under dquot->dq_lock. If the dquot is busy nothing is done and
 *        0 is returned. Otherwise the format's optional ->release_dqblk() is
 *        called, the file info is written when dirty, and DQ_ACTIVE_B is
 *        cleared. Returns the ->release_dqblk() error if negative, else the
 *        result of the info write (0 when none was needed).
 */
int dquot_release(struct dquot *dquot)
{
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
        unsigned int nofs_flags;
        int ret = 0;

        mutex_lock(&dquot->dq_lock);
        nofs_flags = memalloc_nofs_save();
        /* Only proceed when we are not racing with some other dqget() */
        if (!dquot_is_busy(dquot)) {
                if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
                        int info_ret = 0;

                        ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
                        /* Write the info */
                        if (info_dirty(&dqopt->info[dquot->dq_id.type]))
                                info_ret = dqopt->ops[dquot->dq_id.type]->write_file_info(
                                                dquot->dq_sb, dquot->dq_id.type);
                        if (ret >= 0)
                                ret = info_ret;
                }
                clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
        }
        memalloc_nofs_restore(nofs_flags);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_release);

/* Return a dquot structure to the dquot_cachep slab cache */
void dquot_destroy(struct dquot *dquot)
{
        kmem_cache_free(dquot_cachep, dquot);
}
EXPORT_SYMBOL(dquot_destroy);

/* Free a dquot through the filesystem's ->destroy_dquot() operation */
static inline void do_destroy_dquot(struct dquot *dquot)
{
        dquot->dq_sb->dq_op->destroy_dquot(dquot);
}

/* Invalidate all dquots on the list. Note that this function is called after
 * quota is disabled and pointers from inodes removed so there cannot be new
 * quota users. There can still be some users of quotas due to inodes being
 * just deleted or pruned by prune_icache() (those are not attached to any
 * list) or parallel quotactl call. We have to wait for such users.
 *
 * Only dquots of superblock @sb and quota type @type are touched. Each of
 * them is removed from the hash, the free list and inuse_list and then
 * destroyed. The scan restarts from scratch whenever dq_list_lock had to be
 * dropped.
 */
static void invalidate_dquots(struct super_block *sb, int type)
{
        struct dquot *dquot, *tmp;

restart:
        /* Let pending release work finish before (re)scanning the list */
        flush_delayed_work(&quota_release_work);

        spin_lock(&dq_list_lock);
        list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) {
                if (dquot->dq_sb != sb)
                        continue;
                if (dquot->dq_id.type != type)
                        continue;
                /* Wait for dquot users */
                if (atomic_read(&dquot->dq_count)) {
                        /* Pin the dquot so it stays valid while we sleep */
                        atomic_inc(&dquot->dq_count);
                        spin_unlock(&dq_list_lock);
                        /*
                         * Once dqput() wakes us up, we know it's time to free
                         * the dquot.
                         * IMPORTANT: we rely on the fact that there is always
                         * at most one process waiting for dquot to free.
                         * Otherwise dq_count would be > 1 and we would never
                         * wake up.
                         */
                        wait_event(dquot_ref_wq,
                                   atomic_read(&dquot->dq_count) == 1);
                        dqput(dquot);
                        /* At this moment the dquot need not exist (it could be
                         * reclaimed by prune_dqcache()). Hence we must
                         * restart. */
                        goto restart;
                }
                /*
                 * The last user already dropped its reference but dquot didn't
                 * get fully cleaned up yet. Restart the scan which flushes the
                 * work cleaning up released dquots.
                 */
                if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
                        spin_unlock(&dq_list_lock);
                        goto restart;
                }
                /*
                 * Quota now has no users and it has been written on last
                 * dqput()
                 */
                remove_dquot_hash(dquot);
                remove_free_dquot(dquot);
                remove_inuse(dquot);
                do_destroy_dquot(dquot);
        }
        spin_unlock(&dq_list_lock);
}

/*
 * Call callback for every active dquot on given filesystem
 *
 * @sb:   filesystem whose dquots are scanned; a warning is issued once if
 *        sb->s_umount is not locked
 * @fn:   callback invoked with each active dquot and @priv
 * @priv: opaque value passed through to @fn
 *
 * The callback runs without dq_list_lock held; a reference on the dquot
 * keeps it on inuse_list meanwhile. The scan stops at the first negative
 * return from @fn, which is then returned. Otherwise the return value of the
 * last @fn call is returned (0 if it was never called).
 */
int dquot_scan_active(struct super_block *sb,
                      int (*fn)(struct dquot *dquot, unsigned long priv),
                      unsigned long priv)
{
        struct dquot *dquot, *old_dquot = NULL;
        int ret = 0;

        WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));

        spin_lock(&dq_list_lock);
        list_for_each_entry(dquot, &inuse_list, dq_inuse) {
                if (!dquot_active(dquot))
                        continue;
                if (dquot->dq_sb != sb)
                        continue;
                /* Now we have active dquot so we can just increase use count */
                atomic_inc(&dquot->dq_count);
                spin_unlock(&dq_list_lock);
                /*
                 * Drop the previous dquot only now, outside dq_list_lock and
                 * after the current one has been pinned.
                 */
                dqput(old_dquot);
                old_dquot = dquot;
                /*
                 * ->release_dquot() can be racing with us. Our reference
                 * protects us from new calls to it so just wait for any
                 * outstanding call and recheck the DQ_ACTIVE_B after that.
                 */
                wait_on_dquot(dquot);
                if (dquot_active(dquot)) {
                        ret = fn(dquot, priv);
                        if (ret < 0)
                                goto out;
                }
                spin_lock(&dq_list_lock);
                /* We are safe to continue now because our dquot could not
                 * be moved out of the inuse list while we hold the reference */
        }
        spin_unlock(&dq_list_lock);
out:
        /* Release the reference on the last dquot visited (NULL is fine) */
        dqput(old_dquot);
        return ret;
}
EXPORT_SYMBOL(dquot_scan_active);

/*
 * Write a dquot through the filesystem's ->write_dquot() operation.
 *
 * On failure an error is logged and the dirty flag is cleared regardless, so
 * that callers looping over dirty dquots make progress. Returns the value
 * returned by ->write_dquot().
 */
static inline int dquot_write_dquot(struct dquot *dquot)
{
        int err = dquot->dq_sb->dq_op->write_dquot(dquot);

        if (err >= 0)
                return err;

        quota_error(dquot->dq_sb, "Can't write quota structure "
                    "(error %d). Quota may get out of sync!", err);
        /* Clear dirty bit anyway to avoid infinite loop. */
        clear_dquot_dirty(dquot);
        return err;
}

/*
 * Write all dquot structures to quota files
 *
 * @sb:   filesystem to write back; a warning is issued once if sb->s_umount
 *        is not locked
 * @type: quota type to write, or -1 for all types
 *
 * Dirty dquots of every selected, active quota type are written one by one,
 * followed by the dirty quota info of those types. Returns the first
 * non-zero result of writing a dquot, or 0. The result of ->write_info() is
 * not reflected in the return value.
 */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
        struct list_head dirty;
        struct dquot *dquot;
        struct quota_info *dqopt = sb_dqopt(sb);
        int cnt;
        int err, ret = 0;

        WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_active(sb, cnt))
                        continue;
                spin_lock(&dq_list_lock);
                /* Move list away to avoid livelock. */
                list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty);
                while (!list_empty(&dirty)) {
                        dquot = list_first_entry(&dirty, struct dquot,
                                                 dq_dirty);

                        WARN_ON(!dquot_active(dquot));
                        /* If the dquot is releasing we should not touch it */
                        if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
                                /*
                                 * Let the release work deal with it, then
                                 * look at the head of the list again.
                                 */
                                spin_unlock(&dq_list_lock);
                                flush_delayed_work(&quota_release_work);
                                spin_lock(&dq_list_lock);
                                continue;
                        }

                        /* Now we have active dquot from which someone is
                         * holding reference so we can safely just increase
                         * use count */
                        dqgrab(dquot);
                        spin_unlock(&dq_list_lock);
                        err = dquot_write_dquot(dquot);
                        /* Remember only the first failure */
                        if (err && !ret)
                                ret = err;
                        dqput(dquot);
                        spin_lock(&dq_list_lock);
                }
                spin_unlock(&dq_list_lock);
        }

        /* Second pass: write dirty quota info of the selected types */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
                    && info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
        dqstats_inc(DQST_SYNCS);

        return ret;
}
EXPORT_SYMBOL(dquot_writeback_dquots);

/*
 * Write all dquot structures to disk and make them visible from userspace
 *
 * @sb:   filesystem to sync
 * @type: quota type to sync, or -1 for all types
 *
 * After writing the dquots back, and unless DQUOT_QUOTA_SYS_FILE is set,
 * the filesystem and its block device are synced and the page cache of each
 * selected, active quota file is truncated. Returns 0 on success or the
 * first error encountered.
 */
int dquot_quota_sync(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int err;
        int qtype;

        err = dquot_writeback_dquots(sb, type);
        if (err)
                return err;
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
                return 0;

        /*
         * Not very clever (or fast), but there is currently no other simple
         * way of getting quota data to disk, and it must be there for
         * userspace to see it.
         */
        if (sb->s_op->sync_fs) {
                err = sb->s_op->sync_fs(sb, 1);
                if (err)
                        return err;
        }
        err = sync_blockdev(sb->s_bdev);
        if (err)
                return err;

        /*
         * Everything is written now, so the pagecache can be discarded to
         * let userspace see the changes.
         */
        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
                if ((type == -1 || qtype == type) &&
                    sb_has_quota_active(sb, qtype)) {
                        inode_lock(dqopt->files[qtype]);
                        truncate_inode_pages(&dqopt->files[qtype]->i_data, 0);
                        inode_unlock(dqopt->files[qtype]);
                }
        }

        return 0;
}
EXPORT_SYMBOL(dquot_quota_sync);

/*
 * Shrinker scan callback: destroy dquots from the head of free_dquots until
 * the list is empty or sc->nr_to_scan reaches zero. sc->nr_to_scan is
 * decremented for each dquot destroyed. Returns the number destroyed.
 */
static unsigned long
dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long nr_freed = 0;
        struct dquot *victim;

        spin_lock(&dq_list_lock);
        for (;;) {
                if (list_empty(&free_dquots) || !sc->nr_to_scan)
                        break;
                victim = list_first_entry(&free_dquots, struct dquot, dq_free);
                /* Detach from every list before handing it to the fs */
                remove_dquot_hash(victim);
                remove_free_dquot(victim);
                remove_inuse(victim);
                do_destroy_dquot(victim);
                sc->nr_to_scan--;
                nr_freed++;
        }
        spin_unlock(&dq_list_lock);
        return nr_freed;
}

/*
 * Shrinker count callback: report the DQST_FREE_DQUOTS counter (read as a
 * non-negative value) scaled by vfs_pressure_ratio().
 */
static unsigned long
dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        return vfs_pressure_ratio(
        percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
}

/* Shrinker that reclaims unused dquots from the free_dquots list */
static struct shrinker dqcache_shrinker = {
        .count_objects = dqcache_shrink_count,
        .scan_objects = dqcache_shrink_scan,
        .seeks = DEFAULT_SEEKS,
};

/*
 * Safely release dquot and put reference to dquot.
 *
 * Work function behind quota_release_work. It takes over the current
 * contents of releasing_dquots, waits for an SRCU grace period on
 * dquot_srcu, and then processes each dquot: dirty ones are written, active
 * ones are released through ->release_dquot(), and dquots that are clean and
 * inactive are moved to the free_dquots list. dq_list_lock is dropped around
 * the write and release calls, after which the head of the private list is
 * examined again.
 */
static void quota_release_workfn(struct work_struct *work)
{
        struct dquot *dquot;
        struct list_head rls_head;

        spin_lock(&dq_list_lock);
        /* Exchange the list head to avoid livelock. */
        list_replace_init(&releasing_dquots, &rls_head);
        spin_unlock(&dq_list_lock);
        synchronize_srcu(&dquot_srcu);

restart:
        spin_lock(&dq_list_lock);
        while (!list_empty(&rls_head)) {
                dquot = list_first_entry(&rls_head, struct dquot, dq_free);
                /* Dquots on this list are expected to have no references */
                WARN_ON_ONCE(atomic_read(&dquot->dq_count));
                /*
                 * Note that DQ_RELEASING_B protects us from racing with
                 * invalidate_dquots() calls so we are safe to work with the
                 * dquot even after we drop dq_list_lock.
                 */
                if (dquot_dirty(dquot)) {
                        spin_unlock(&dq_list_lock);
                        /* Commit dquot before releasing */
                        dquot_write_dquot(dquot);
                        goto restart;
                }
                if (dquot_active(dquot)) {
                        spin_unlock(&dq_list_lock);
                        dquot->dq_sb->dq_op->release_dquot(dquot);
                        goto restart;
                }
                /* Dquot is inactive and clean, now move it to free list */
                remove_free_dquot(dquot);
                put_dquot_last(dquot);
        }
        spin_unlock(&dq_list_lock);
}

/*
 * Put reference to dquot
 *
 * A NULL @dquot is ignored. If other references remain, the count is simply
 * decremented; when quota of that type is no longer active and exactly one
 * reference is left, waiters on dquot_ref_wq are woken. When the last
 * reference is dropped, the dquot is queued on releasing_dquots and
 * quota_release_work is scheduled to finish the cleanup.
 */
void dqput(struct dquot *dquot)
{
        if (!dquot)
                return;
#ifdef CONFIG_QUOTA_DEBUG
        if (!atomic_read(&dquot->dq_count)) {
                quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
                            quotatypes[dquot->dq_id.type],
                            from_kqid(&init_user_ns, dquot->dq_id));
                BUG();
        }
#endif
        dqstats_inc(DQST_DROPS);

        spin_lock(&dq_list_lock);
        if (atomic_read(&dquot->dq_count) > 1) {
                /* We have more than one user... nothing to do */
                atomic_dec(&dquot->dq_count);
                /* Releasing dquot during quotaoff phase? */
                if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) &&
                    atomic_read(&dquot->dq_count) == 1)
                        wake_up(&dquot_ref_wq);
                spin_unlock(&dq_list_lock);
                return;
        }

        /* Need to release dquot? */
#ifdef CONFIG_QUOTA_DEBUG
        /* sanity check */
        BUG_ON(!list_empty(&dquot->dq_free));
#endif
        /* Queue for deferred cleanup, then drop the final reference */
        put_releasing_dquots(dquot);
        atomic_dec(&dquot->dq_count);
        spin_unlock(&dq_list_lock);
        queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);

/*
 * Allocate a zeroed dquot from dquot_cachep with GFP_NOFS.
 * @sb and @type are not used by this implementation.
 * Returns whatever kmem_cache_zalloc() returns.
 */
struct dquot *dquot_alloc(struct super_block *sb, int type)
{
        struct dquot *dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);

        return dquot;
}
EXPORT_SYMBOL(dquot_alloc);

/*
 * Allocate a dquot through the filesystem's ->alloc_dquot() hook and set up
 * its locks, list heads and hash node. The new dquot points at @sb, carries
 * an invalid id of quota type @type and starts with dq_count == 1.
 * Returns NULL when the allocation hook returns NULL.
 */
static struct dquot *get_empty_dquot(struct super_block *sb, int type)
{
        struct dquot *dquot = sb->dq_op->alloc_dquot(sb, type);

        if (dquot == NULL)
                return NULL;

        /* Owner, identity and the caller's initial reference */
        dquot->dq_sb = sb;
        dquot->dq_id = make_kqid_invalid(type);
        atomic_set(&dquot->dq_count, 1);

        /* Locks */
        mutex_init(&dquot->dq_lock);
        spin_lock_init(&dquot->dq_dqb_lock);

        /* List and hash linkage, all initially unlinked */
        INIT_LIST_HEAD(&dquot->dq_free);
        INIT_LIST_HEAD(&dquot->dq_inuse);
        INIT_LIST_HEAD(&dquot->dq_dirty);
        INIT_HLIST_NODE(&dquot->dq_hash);

        return dquot;
}

/*
 * Get reference to dquot
 *
 * Looks up the dquot for @qid on @sb in the dquot hash. On a miss, a dquot
 * from get_empty_dquot() is given the id, put on the inuse list and hashed;
 * on a hit, dq_count is incremented. If the dquot is not active afterwards,
 * it is passed to the filesystem's ->acquire_dquot() hook.
 *
 * Returns the dquot on success, ERR_PTR(-EINVAL) when @qid has no mapping in
 * sb->s_user_ns, ERR_PTR(-ESRCH) when quota of type qid.type is not active
 * on @sb, or ERR_PTR() of the negative value returned by ->acquire_dquot().
 *
 * Locking is slightly tricky here. We are guarded from parallel quotaoff()
 * destroying our dquot by:
 *   a) checking for quota flags under dq_list_lock and
 *   b) getting a reference to dquot before we release dq_list_lock
 */
struct dquot *dqget(struct super_block *sb, struct kqid qid)
{
        unsigned int hashent = hashfn(sb, qid);
        struct dquot *dquot, *empty = NULL;

        if (!qid_has_mapping(sb->s_user_ns, qid))
                return ERR_PTR(-EINVAL);

        /* Unlocked early exit; the check is repeated under the locks below */
        if (!sb_has_quota_active(sb, qid.type))
                return ERR_PTR(-ESRCH);
we_slept:
        spin_lock(&dq_list_lock);
        spin_lock(&dq_state_lock);
        if (!sb_has_quota_active(sb, qid.type)) {
                spin_unlock(&dq_state_lock);
                spin_unlock(&dq_list_lock);
                dquot = ERR_PTR(-ESRCH);
                goto out;
        }
        spin_unlock(&dq_state_lock);

        dquot = find_dquot(hashent, sb, qid);
        if (!dquot) {
                if (!empty) {
                        /*
                         * Allocate outside dq_list_lock, then redo the
                         * lookup from the top since the lock was dropped.
                         */
                        spin_unlock(&dq_list_lock);
                        empty = get_empty_dquot(sb, qid.type);
                        if (!empty)
                                schedule();        /* Try to wait for a moment... */
                        goto we_slept;
                }
                /* Still not hashed: publish the preallocated dquot */
                dquot = empty;
                empty = NULL;
                dquot->dq_id = qid;
                /* all dquots go on the inuse_list */
                put_inuse(dquot);
                /* hash it first so it can be found */
                insert_dquot_hash(dquot);
                spin_unlock(&dq_list_lock);
                dqstats_inc(DQST_LOOKUPS);
        } else {
                /* A dquot with no users is taken off the free list first */
                if (!atomic_read(&dquot->dq_count))
                        remove_free_dquot(dquot);
                atomic_inc(&dquot->dq_count);
                spin_unlock(&dq_list_lock);
                dqstats_inc(DQST_CACHE_HITS);
                dqstats_inc(DQST_LOOKUPS);
        }
        /* Wait for dq_lock - after this we know that either dquot_release() is
         * already finished or it will be canceled due to dq_count > 0 test */
        wait_on_dquot(dquot);
        /* Read the dquot / allocate space in quota file */
        if (!dquot_active(dquot)) {
                int err;

                err = sb->dq_op->acquire_dquot(dquot);
                if (err < 0) {
                        /* Give back the reference taken above */
                        dqput(dquot);
                        dquot = ERR_PTR(err);
                        goto out;
                }
        }
        /*
         * Make sure following reads see filled structure - paired with
         * smp_mb__before_atomic() in dquot_acquire().
         */
        smp_rmb();
#ifdef CONFIG_QUOTA_DEBUG
        BUG_ON(!dquot->dq_sb);        /* Has somebody invalidated entry under us? */
#endif
out:
        /* A preallocated dquot that ended up unused is destroyed here */
        if (empty)
                do_destroy_dquot(empty);

        return dquot;
}
EXPORT_SYMBOL(dqget);

/*
 * Return the inode's dquot pointer array as provided by the superblock's
 * ->get_dquots() operation, cast to carry the __rcu annotation.
 */
static inline struct dquot __rcu **i_dquot(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        /* Force __rcu for now until filesystems are fixed */
        return (struct dquot __rcu **)sb->s_op->get_dquots(inode);
}

/*
 * Report whether @inode still lacks a dquot pointer.
 *
 * With @type == -1 every quota type is examined, otherwise only @type.
 * Returns 1 when an examined slot is NULL, 0 otherwise. Inodes flagged
 * IS_NOQUOTA always yield 0.
 */
static int dqinit_needed(struct inode *inode, int type)
{
        struct dquot __rcu * const *dquots;
        int first = 0, last = MAXQUOTAS - 1;
        int cnt;

        if (IS_NOQUOTA(inode))
                return 0;

        /* Narrow the scan to a single slot when a type was requested */
        if (type != -1)
                first = last = type;

        dquots = i_dquot(inode);
        for (cnt = first; cnt <= last; cnt++) {
                if (!dquots[cnt])
                        return 1;
        }
        return 0;
}

/*
 * Walk all inodes of @sb and call __dquot_initialize() for those that are
 * open for writing and still miss a dquot pointer for @type.
 *
 * Returns 0, or the first error returned by __dquot_initialize(), in which
 * case the walk stops.
 *
 * This routine is guarded by s_umount semaphore
 */
static int add_dquot_ref(struct super_block *sb, int type)
{
        struct inode *inode, *old_inode = NULL;
#ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
#endif
        int err = 0;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                spin_lock(&inode->i_lock);
                /*
                 * Skip inodes that are being freed or set up, that have no
                 * writers, or that need no initialization for @type.
                 */
                if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
                    !atomic_read(&inode->i_writecount) ||
                    !dqinit_needed(inode, type)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* Take a reference before both locks are dropped */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&sb->s_inode_list_lock);

#ifdef CONFIG_QUOTA_DEBUG
                /* Remember that some inode already had reserved space */
                if (unlikely(inode_get_rsv_space(inode) > 0))
                        reserved = 1;
#endif
                /* Drop the reference kept from the previous iteration */
                iput(old_inode);
                err = __dquot_initialize(inode, type);
                if (err) {
                        /*
                         * s_inode_list_lock is not held at this point and
                         * old_inode was already put above, so jump straight
                         * past the unlock/iput that follow the loop.
                         */
                        iput(inode);
                        goto out;
                }

                /*
                 * We hold a reference to 'inode' so it couldn't have been
                 * removed from s_inodes list while we dropped the
                 * s_inode_list_lock. We cannot iput the inode now as we can be
                 * holding the last reference and we cannot iput it under
                 * s_inode_list_lock. So we keep the reference and iput it
                 * later.
                 */
                old_inode = inode;
                cond_resched();
                spin_lock(&sb->s_inode_list_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);
        iput(old_inode);
out:
#ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
                quota_error(sb, "Writes happened before quota was turned on "
                        "thus quota information is probably inconsistent. "
                        "Please run quotacheck(8)");
        }
#endif
        return err;
}

/*
 * Clear the dquot pointer of quota type @type in every inode of @sb and drop
 * the reference each of those pointers held.
 */
static void remove_dquot_ref(struct super_block *sb, int type)
{
        struct inode *inode;
#ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
#endif

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                struct dquot __rcu **dquots;
                struct dquot *dquot;

                /*
                 *  I_NEW inodes have to be scanned as well since their
                 *  quota pointers may already be initialized. Only the
                 *  quota pointers are touched here, and those have their
                 *  own locking (dq_data_lock).
                 */
                spin_lock(&dq_data_lock);
                if (IS_NOQUOTA(inode)) {
                        spin_unlock(&dq_data_lock);
                        continue;
                }

                dquots = i_dquot(inode);
                dquot = srcu_dereference_check(dquots[type], &dquot_srcu,
                                        lockdep_is_held(&dq_data_lock));
#ifdef CONFIG_QUOTA_DEBUG
                if (unlikely(inode_get_rsv_space(inode) > 0))
                        reserved = 1;
#endif
                rcu_assign_pointer(dquots[type], NULL);
                if (dquot)
                        dqput(dquot);
                spin_unlock(&dq_data_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
                printk(KERN_WARNING "VFS (%s): Writes happened after quota"
                        " was disabled thus quota information is probably "
                        "inconsistent. Please run quotacheck(8).\n", sb->s_id);
        }
#endif
}

/*
 * Gather all references of quota type @type from the inodes of @sb and drop
 * them. Nothing is done when the superblock has no dq_op set.
 */
static void drop_dquot_ref(struct super_block *sb, int type)
{
        if (!sb->dq_op)
                return;

        remove_dquot_ref(sb, type);
}

/*
 * Subtract @number from the dquot's reserved space.
 *
 * Freeing more than is reserved triggers a one-time warning and clamps the
 * reservation to zero. The block grace time is reset once current plus
 * reserved space is at or below the block soft limit, and DQ_BLKS_B is
 * cleared.
 */
static inline
void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;

        if (dqb->dqb_rsvspace < number) {
                WARN_ON_ONCE(1);
                dqb->dqb_rsvspace = 0;
        } else {
                dqb->dqb_rsvspace -= number;
        }

        if (dqb->dqb_curspace + dqb->dqb_rsvspace <= dqb->dqb_bsoftlimit)
                dqb->dqb_btime = (time64_t) 0;
        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}

/*
 * Decrease the dquot's inode usage by @number.
 *
 * Usage is clamped at zero unless the superblock's quota options carry
 * DQUOT_NEGATIVE_USAGE. The inode grace time is reset once usage is at or
 * below the inode soft limit, and DQ_INODES_B is cleared.
 */
static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;

        if (!(sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE) &&
            dqb->dqb_curinodes < number)
                dqb->dqb_curinodes = 0;
        else
                dqb->dqb_curinodes -= number;

        if (dqb->dqb_curinodes <= dqb->dqb_isoftlimit)
                dqb->dqb_itime = (time64_t) 0;
        clear_bit(DQ_INODES_B, &dquot->dq_flags);
}

/*
 * Decrease the dquot's current space usage by @number.
 *
 * Usage is clamped at zero unless the superblock's quota options carry
 * DQUOT_NEGATIVE_USAGE. The block grace time is reset once current plus
 * reserved space is at or below the block soft limit, and DQ_BLKS_B is
 * cleared.
 */
static void dquot_decr_space(struct dquot *dquot, qsize_t number)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;

        if (!(sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE) &&
            dqb->dqb_curspace < number)
                dqb->dqb_curspace = 0;
        else
                dqb->dqb_curspace -= number;

        if (dqb->dqb_curspace + dqb->dqb_rsvspace <= dqb->dqb_bsoftlimit)
                dqb->dqb_btime = (time64_t) 0;
        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}

/*
 * One pending quota warning. Entries are filled in by prepare_warning() and
 * later reported by flush_warnings(), which skips entries whose w_type is
 * QUOTA_NL_NOWARN.
 */
struct dquot_warn {
        /* Superblock of the dquot the warning is about (dquot->dq_sb) */
        struct super_block *w_sb;
        /* Id of that dquot (dquot->dq_id) */
        struct kqid w_dq_id;
        /* QUOTA_NL_* warning type */
        short w_type;
};

/*
 * Check and mark whether a warning of @warntype was already issued.
 *
 * Block hard-limit and "soft limit exceeded too long" warnings share
 * DQ_BLKS_B; their inode counterparts share DQ_INODES_B. The flag is set
 * atomically and its previous value is returned. All other warning types
 * have no flag and always return 0.
 */
static int warning_issued(struct dquot *dquot, const int warntype)
{
        int flag;

        switch (warntype) {
        case QUOTA_NL_BHARDWARN:
        case QUOTA_NL_BSOFTLONGWARN:
                flag = DQ_BLKS_B;
                break;
        case QUOTA_NL_IHARDWARN:
        case QUOTA_NL_ISOFTLONGWARN:
                flag = DQ_INODES_B;
                break;
        default:
                flag = 0;
                break;
        }

        if (!flag)
                return 0;
        return test_and_set_bit(flag, &dquot->dq_flags);
}

#ifdef CONFIG_PRINT_QUOTA_WARNING
static int flag_print_warnings = 1;

/*
 * Decide whether @warn should be printed to the current task's terminal.
 *
 * Nothing is printed while flag_print_warnings is zero. User quota warnings
 * are printed only when the current fsuid matches the quota owner, group
 * quota warnings only when in_group_p() accepts the quota's group, and
 * project quota warnings always. Any other quota type yields 0.
 */
static int need_print_warning(struct dquot_warn *warn)
{
        const int qtype = warn->w_dq_id.type;

        if (!flag_print_warnings)
                return 0;

        if (qtype == USRQUOTA)
                return uid_eq(current_fsuid(), warn->w_dq_id.uid);
        if (qtype == GRPQUOTA)
                return in_group_p(warn->w_dq_id.gid);
        if (qtype == PRJQUOTA)
                return 1;
        return 0;
}

/*
 * Print a quota warning on the current task's tty.
 *
 * The "*BELOW" notification types are never printed, and neither is anything
 * that need_print_warning() rejects. The message is written as four pieces:
 * the filesystem id, a severity prefix, the quota type name and the text for
 * the specific warning.
 */
static void print_warning(struct dquot_warn *warn)
{
        int warntype = warn->w_type;
        struct tty_struct *tty;
        char *prefix;
        char *msg = NULL;

        switch (warntype) {
        case QUOTA_NL_IHARDBELOW:
        case QUOTA_NL_ISOFTBELOW:
        case QUOTA_NL_BHARDBELOW:
        case QUOTA_NL_BSOFTBELOW:
                return;
        default:
                break;
        }
        if (!need_print_warning(warn))
                return;

        tty = get_current_tty();
        if (!tty)
                return;

        /* Plain soft-limit hits are warnings; everything else failed a write */
        if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
                prefix = ": warning, ";
        else
                prefix = ": write failed, ";

        if (warntype == QUOTA_NL_IHARDWARN)
                msg = " file limit reached.\r\n";
        else if (warntype == QUOTA_NL_ISOFTLONGWARN)
                msg = " file quota exceeded too long.\r\n";
        else if (warntype == QUOTA_NL_ISOFTWARN)
                msg = " file quota exceeded.\r\n";
        else if (warntype == QUOTA_NL_BHARDWARN)
                msg = " block limit reached.\r\n";
        else if (warntype == QUOTA_NL_BSOFTLONGWARN)
                msg = " block quota exceeded too long.\r\n";
        else if (warntype == QUOTA_NL_BSOFTWARN)
                msg = " block quota exceeded.\r\n";

        tty_write_message(tty, warn->w_sb->s_id);
        tty_write_message(tty, prefix);
        tty_write_message(tty, quotatypes[warn->w_dq_id.type]);
        tty_write_message(tty, msg);
        tty_kref_put(tty);
}
#endif

/*
 * Record a warning of @warntype about @dquot in @warn, unless
 * warning_issued() reports that such a warning was already flagged on the
 * dquot, in which case @warn is left untouched.
 */
static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot,
                            int warntype)
{
        if (!warning_issued(dquot, warntype)) {
                warn->w_type = warntype;
                warn->w_sb = dquot->dq_sb;
                warn->w_dq_id = dquot->dq_id;
        }
}

/*
 * Write warnings to the console and send warning messages over netlink.
 *
 * Note that this function can call into tty and networking code.
 */
static void flush_warnings(struct dquot_warn *warn)
{
        int i;

        for (i = 0; i < MAXQUOTAS; i++) {
                if (warn[i].w_type == QUOTA_NL_NOWARN)
                        continue;
#ifdef CONFIG_PRINT_QUOTA_WARNING
                print_warning(&warn[i]);
#endif
                quota_send_warning(warn[i].w_dq_id,
                                   warn[i].w_sb->s_dev, warn[i].w_type);
        }
}

/*
 * Tell whether the current task may exceed the limits of @dquot.
 *
 * That requires CAP_SYS_RESOURCE. For the QFMT_VFS_OLD format it additionally
 * requires that DQF_ROOT_SQUASH is not set in the quota info flags.
 * Returns 1 when limits may be ignored, 0 otherwise.
 */
static int ignore_hardlimit(struct dquot *dquot)
{
        struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];

        if (!capable(CAP_SYS_RESOURCE))
                return 0;
        if (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD)
                return 1;
        return !(info->dqi_flags & DQF_ROOT_SQUASH);
}

/*
 * Charge @inodes more inodes to @dquot, enforcing limits.
 *
 * Runs under dq_dqb_lock. When limits are not enabled for this quota type or
 * the dquot is marked DQ_FAKE_B, the usage is simply updated. Otherwise:
 *  - going over the hard limit fails with -EDQUOT (QUOTA_NL_IHARDWARN);
 *  - being over the soft limit with an expired grace time fails with
 *    -EDQUOT (QUOTA_NL_ISOFTLONGWARN);
 *  - crossing the soft limit with no grace time running starts the grace
 *    period and prepares QUOTA_NL_ISOFTWARN, but still succeeds.
 * The two failing cases are skipped when ignore_hardlimit() allows it.
 *
 * Returns 0 on success (usage updated) or -EDQUOT (usage unchanged).
 */
static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes,
                            struct dquot_warn *warn)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;
        qsize_t newinodes;
        int ret = 0;

        spin_lock(&dquot->dq_dqb_lock);
        newinodes = dqb->dqb_curinodes + inodes;
        if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) ||
            test_bit(DQ_FAKE_B, &dquot->dq_flags))
                goto add;

        if (dqb->dqb_ihardlimit && newinodes > dqb->dqb_ihardlimit &&
            !ignore_hardlimit(dquot)) {
                prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);
                ret = -EDQUOT;
                goto out;
        }

        if (dqb->dqb_isoftlimit && newinodes > dqb->dqb_isoftlimit) {
                if (dqb->dqb_itime == 0) {
                        /* First time over the soft limit: start grace period */
                        prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
                        dqb->dqb_itime = ktime_get_real_seconds() +
                            sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
                } else if (ktime_get_real_seconds() >= dqb->dqb_itime &&
                           !ignore_hardlimit(dquot)) {
                        /* Grace period has run out */
                        prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
                        ret = -EDQUOT;
                        goto out;
                }
        }
add:
        dqb->dqb_curinodes = newinodes;

out:
        spin_unlock(&dquot->dq_dqb_lock);
        return ret;
}

/*
 * Charge @space bytes of used space and @rsv_space bytes of reserved space
 * to @dquot, enforcing block limits.
 *
 * Runs under dq_dqb_lock. Limits are skipped when they are not enabled for
 * this quota type or the dquot is marked DQ_FAKE_B. Otherwise the total of
 * current, reserved and newly requested space is checked against the hard
 * limit and against the soft limit with its grace time. Warnings are only
 * prepared when @flags has DQUOT_SPACE_WARN; without that flag, crossing the
 * soft limit fails instead of starting a grace period.
 *
 * DQUOT_SPACE_NOFAIL in @flags turns any failure into success after the
 * checks have run. Returns 0 with the usage updated, or -EDQUOT with the
 * usage unchanged.
 */
static int dquot_add_space(struct dquot *dquot, qsize_t space,
                           qsize_t rsv_space, unsigned int flags,
                           struct dquot_warn *warn)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;
        struct super_block *sb = dquot->dq_sb;
        qsize_t tspace;
        int ret = 0;

        spin_lock(&dquot->dq_dqb_lock);
        if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) ||
            test_bit(DQ_FAKE_B, &dquot->dq_flags))
                goto finish;

        tspace = dqb->dqb_curspace + dqb->dqb_rsvspace + space + rsv_space;

        if (dqb->dqb_bhardlimit && tspace > dqb->dqb_bhardlimit &&
            !ignore_hardlimit(dquot)) {
                if (flags & DQUOT_SPACE_WARN)
                        prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);
                ret = -EDQUOT;
                goto finish;
        }

        if (dqb->dqb_bsoftlimit && tspace > dqb->dqb_bsoftlimit) {
                if (dqb->dqb_btime == 0) {
                        if (!(flags & DQUOT_SPACE_WARN)) {
                                /*
                                 * We don't allow preallocation to exceed
                                 * softlimit so exceeding will be always
                                 * printed
                                 */
                                ret = -EDQUOT;
                                goto finish;
                        }
                        /* First time over the soft limit: start grace period */
                        prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
                        dqb->dqb_btime = ktime_get_real_seconds() +
                            sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
                } else if (ktime_get_real_seconds() >= dqb->dqb_btime &&
                           !ignore_hardlimit(dquot)) {
                        /* Grace period has run out */
                        if (flags & DQUOT_SPACE_WARN)
                                prepare_warning(warn, dquot,
                                                QUOTA_NL_BSOFTLONGWARN);
                        ret = -EDQUOT;
                        goto finish;
                }
        }
finish:
        /*
         * Warning generation and grace time setting must happen even when
         * DQUOT_SPACE_NOFAIL is set, which is why the flag is only looked at
         * here.
         */
        if (flags & DQUOT_SPACE_NOFAIL)
                ret = 0;
        if (!ret) {
                dqb->dqb_rsvspace += rsv_space;
                dqb->dqb_curspace += space;
        }
        spin_unlock(&dquot->dq_dqb_lock);
        return ret;
}

/*
 * Work out which "back below limit" notification, if any, freeing @inodes
 * inodes from @dquot would cause.
 *
 * Returns QUOTA_NL_ISOFTBELOW when usage drops to or below the soft limit,
 * QUOTA_NL_IHARDBELOW when it drops from at-or-above to below the hard
 * limit, and QUOTA_NL_NOWARN otherwise (including fake dquots, usage already
 * within the soft limit, or limits not being enabled).
 */
static int info_idq_free(struct dquot *dquot, qsize_t inodes)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;
        int warntype = QUOTA_NL_NOWARN;
        qsize_t newinodes;

        if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
            dqb->dqb_curinodes <= dqb->dqb_isoftlimit ||
            !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type))
                return warntype;

        newinodes = dqb->dqb_curinodes - inodes;
        if (newinodes <= dqb->dqb_isoftlimit)
                warntype = QUOTA_NL_ISOFTBELOW;
        else if (dqb->dqb_curinodes >= dqb->dqb_ihardlimit &&
                 newinodes < dqb->dqb_ihardlimit)
                warntype = QUOTA_NL_IHARDBELOW;

        return warntype;
}

/*
 * Work out which "back below limit" notification, if any, freeing @space
 * bytes from @dquot would cause. Current and reserved space are counted
 * together.
 *
 * Returns QUOTA_NL_BSOFTBELOW when the total drops to or below the soft
 * limit, QUOTA_NL_BHARDBELOW when it drops from at-or-above to below the
 * hard limit, and QUOTA_NL_NOWARN otherwise (including fake dquots and a
 * total already within the soft limit).
 */
static int info_bdq_free(struct dquot *dquot, qsize_t space)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;
        qsize_t tspace = dqb->dqb_curspace + dqb->dqb_rsvspace;
        int warntype = QUOTA_NL_NOWARN;

        if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
            tspace <= dqb->dqb_bsoftlimit)
                return warntype;

        if (tspace - space <= dqb->dqb_bsoftlimit)
                warntype = QUOTA_NL_BSOFTBELOW;
        else if (tspace >= dqb->dqb_bhardlimit &&
                 tspace - space < dqb->dqb_bhardlimit)
                warntype = QUOTA_NL_BHARDBELOW;

        return warntype;
}

/*
 * Return nonzero when quota accounting applies to @inode: the inode is not
 * IS_NOQUOTA and its superblock has some quota type loaded that is not
 * suspended.
 */
static int inode_quota_active(const struct inode *inode)
{
        struct super_block *sb;

        if (IS_NOQUOTA(inode))
                return 0;

        sb = inode->i_sb;
        return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
}

/*
 * Initialize quota pointers in inode
 *
 * @type selects a single quota type, or -1 for all of them. Works in two
 * phases: first dquot references are collected with dqget() without holding
 * dq_data_lock, then they are installed into the inode's dquot array under
 * dq_data_lock. References that end up not being installed are dropped
 * again before returning.
 *
 * Returns 0 on success or when there is nothing to do, otherwise the error
 * from dqget() (other than -ESRCH, which is tolerated).
 *
 * It is better to call this function outside of any transaction as it
 * might need a lot of space in journal for dquot structure allocation.
 */
static int __dquot_initialize(struct inode *inode, int type)
{
        int cnt, init_needed = 0;
        struct dquot __rcu **dquots;
        struct dquot *got[MAXQUOTAS] = {};
        struct super_block *sb = inode->i_sb;
        qsize_t rsv;
        int ret = 0;

        if (!inode_quota_active(inode))
                return 0;

        dquots = i_dquot(inode);

        /* First get references to structures we might need. */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                struct kqid qid;
                kprojid_t projid;
                int rc;
                struct dquot *dquot;

                if (type != -1 && cnt != type)
                        continue;
                /*
                 * The i_dquot should have been initialized in most cases,
                 * we check it without locking here to avoid unnecessary
                 * dqget()/dqput() calls.
                 */
                if (dquots[cnt])
                        continue;

                if (!sb_has_quota_active(sb, cnt))
                        continue;

                init_needed = 1;

                /* Build the quota id this inode is charged to for this type */
                switch (cnt) {
                case USRQUOTA:
                        qid = make_kqid_uid(inode->i_uid);
                        break;
                case GRPQUOTA:
                        qid = make_kqid_gid(inode->i_gid);
                        break;
                case PRJQUOTA:
                        /* A failing ->get_projid() just skips project quota */
                        rc = inode->i_sb->dq_op->get_projid(inode, &projid);
                        if (rc)
                                continue;
                        qid = make_kqid_projid(projid);
                        break;
                }
                dquot = dqget(sb, qid);
                if (IS_ERR(dquot)) {
                        /* We raced with somebody turning quotas off... */
                        if (PTR_ERR(dquot) != -ESRCH) {
                                ret = PTR_ERR(dquot);
                                goto out_put;
                        }
                        dquot = NULL;
                }
                got[cnt] = dquot;
        }

        /* All required i_dquot has been initialized */
        if (!init_needed)
                return 0;

        /* Second phase: install the references under dq_data_lock */
        spin_lock(&dq_data_lock);
        if (IS_NOQUOTA(inode))
                goto out_lock;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(sb, cnt))
                        continue;
                /* We could race with quotaon or dqget() could have failed */
                if (!got[cnt])
                        continue;
                /*
                 * Install only into a still-empty slot. If the slot was
                 * filled in the meantime, got[cnt] stays set and is dropped
                 * at out_put.
                 */
                if (!dquots[cnt]) {
                        rcu_assign_pointer(dquots[cnt], got[cnt]);
                        /* The reference now belongs to the inode */
                        got[cnt] = NULL;
                        /*
                         * Make quota reservation system happy if someone
                         * did a write before quota was turned on
                         */
                        rsv = inode_get_rsv_space(inode);
                        if (unlikely(rsv)) {
                                struct dquot *dquot = srcu_dereference_check(
                                        dquots[cnt], &dquot_srcu,
                                        lockdep_is_held(&dq_data_lock));

                                spin_lock(&inode->i_lock);
                                /* Get reservation again under proper lock */
                                rsv = __inode_get_rsv_space(inode);
                                spin_lock(&dquot->dq_dqb_lock);
                                dquot->dq_dqb.dqb_rsvspace += rsv;
                                spin_unlock(&dquot->dq_dqb_lock);
                                spin_unlock(&inode->i_lock);
                        }
                }
        }
out_lock:
        spin_unlock(&dq_data_lock);
out_put:
        /* Drop unused references */
        dqput_all(got);

        return ret;
}

/*
 * Initialize the quota pointers of @inode for all quota types.
 * Returns the result of __dquot_initialize().
 */
int dquot_initialize(struct inode *inode)
{
        int ret = __dquot_initialize(inode, -1);

        return ret;
}
EXPORT_SYMBOL(dquot_initialize);

/*
 * Report whether @inode has an unset dquot pointer for some quota type that
 * is active on its superblock. Always false when quota does not apply to the
 * inode at all.
 */
bool dquot_initialize_needed(struct inode *inode)
{
        struct dquot __rcu **dquots;
        int type;

        if (!inode_quota_active(inode))
                return false;

        dquots = i_dquot(inode);
        for (type = 0; type < MAXQUOTAS; type++) {
                if (dquots[type])
                        continue;
                if (sb_has_quota_active(inode->i_sb, type))
                        return true;
        }
        return false;
}
EXPORT_SYMBOL(dquot_initialize_needed);

/*
 * Release all quotas referenced by inode.
 *
 * This function only be called on inode free or converting
 * a file to quota file, no other users for the i_dquot in
 * both cases, so we needn't call synchronize_srcu() after
 * clearing i_dquot.
 */
static void __dquot_drop(struct inode *inode)
{
        int cnt;
        struct dquot __rcu **dquots = i_dquot(inode);
        struct dquot *put[MAXQUOTAS];

        spin_lock(&dq_data_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu,
                                        lockdep_is_held(&dq_data_lock));
                rcu_assign_pointer(dquots[cnt], NULL);
        }
        spin_unlock(&dq_data_lock);
        dqput_all(put);
}

/*
 * Release the dquots referenced by @inode, if it has any. Inodes flagged
 * IS_NOQUOTA and inodes with no dquot pointer set are left alone.
 */
void dquot_drop(struct inode *inode)
{
        struct dquot __rcu * const *dquots;
        int cnt;

        if (IS_NOQUOTA(inode))
                return;

        /*
         * Look for a set pointer before doing the real work, to rule out
         * calls from proc and such where we are not allowed to block. The
         * test is reliable even without the lock: the caller must make sure
         * nobody can add quota pointers back after the DQUOT_DROP anyway.
         */
        dquots = i_dquot(inode);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (dquots[cnt]) {
                        __dquot_drop(inode);
                        return;
                }
        }
}
EXPORT_SYMBOL(dquot_drop);

/*
 * inode_reserved_space is managed internally by quota, and protected by
 * i_lock similar to i_blocks+i_bytes.
 *
 * Returns the pointer handed out by the filesystem's ->get_reserved_space()
 * operation.
 */
static qsize_t *inode_reserved_space(struct inode * inode)
{
        qsize_t *rsv;

        /* Filesystem must explicitly define its own method in order to use
         * quota reservation interface */
        BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
        rsv = inode->i_sb->dq_op->get_reserved_space(inode);
        return rsv;
}

/*
 * Read the inode's reserved space without taking any lock here.
 * Returns 0 when the filesystem has no ->get_reserved_space() operation.
 */
static qsize_t __inode_get_rsv_space(struct inode *inode)
{
        if (inode->i_sb->dq_op->get_reserved_space)
                return *inode_reserved_space(inode);
        return 0;
}

/*
 * Read the inode's reserved space under i_lock.
 * Returns 0 (without locking) when the filesystem has no
 * ->get_reserved_space() operation.
 */
static qsize_t inode_get_rsv_space(struct inode *inode)
{
        qsize_t ret = 0;

        if (inode->i_sb->dq_op->get_reserved_space) {
                spin_lock(&inode->i_lock);
                ret = __inode_get_rsv_space(inode);
                spin_unlock(&inode->i_lock);
        }
        return ret;
}

/*
 * This functions updates i_blocks+i_bytes fields and quota information
 * (together with appropriate checks).
 *
 * NOTE: We absolutely rely on the fact that caller dirties the inode
 * (usually helpers in quotaops.h care about this) and holds a handle for
 * the current transaction so that dquot write and inode write go into the
 * same transaction.
 */

/*
 * Charge @number bytes to @inode and to every dquot attached to it.
 *
 * With DQUOT_SPACE_RESERVE in @flags the bytes are accounted as reserved
 * space, otherwise as used space. If charging one of the dquots fails, the
 * dquots charged before it are rolled back and the inode is left unchanged.
 *
 * Returns 0 on success or the error from dquot_add_space().
 *
 * This operation can block, but only after everything is updated
 */
int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
        int cnt, ret = 0, index;
        struct dquot_warn warn[MAXQUOTAS];
        int reserve = flags & DQUOT_SPACE_RESERVE;
        struct dquot __rcu **dquots;
        struct dquot *dquot;

        /* Quota not in effect for this inode: only update the inode itself */
        if (!inode_quota_active(inode)) {
                if (reserve) {
                        spin_lock(&inode->i_lock);
                        *inode_reserved_space(inode) += number;
                        spin_unlock(&inode->i_lock);
                } else {
                        inode_add_bytes(inode, number);
                }
                goto out;
        }

        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;
                if (reserve) {
                        ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]);
                } else {
                        ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]);
                }
                if (ret) {
                        /* Back out changes we already did */
                        for (cnt--; cnt >= 0; cnt--) {
                                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                                if (!dquot)
                                        continue;
                                spin_lock(&dquot->dq_dqb_lock);
                                if (reserve)
                                        dquot_free_reserved_space(dquot, number);
                                else
                                        dquot_decr_space(dquot, number);
                                spin_unlock(&dquot->dq_dqb_lock);
                        }
                        spin_unlock(&inode->i_lock);
                        goto out_flush_warn;
                }
        }
        /* All dquots accepted the charge; now account it on the inode */
        if (reserve)
                *inode_reserved_space(inode) += number;
        else
                __inode_add_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        /* A pure reservation does not mark the dquots dirty */
        if (reserve)
                goto out_flush_warn;
        mark_all_dquot_dirty(dquots);
out_flush_warn:
        srcu_read_unlock(&dquot_srcu, index);
        /* Warnings are emitted only after all locks have been dropped */
        flush_warnings(warn);
out:
        return ret;
}
EXPORT_SYMBOL(__dquot_alloc_space);

/*
 * Charge one inode to every dquot attached to @inode.
 *
 * If charging one of the dquots fails, the dquots charged before it are
 * rolled back. Returns 0 on success (also when quota is not in effect for
 * the inode) or the error from dquot_add_inodes().
 *
 * This operation can block, but only after everything is updated
 */
int dquot_alloc_inode(struct inode *inode)
{
        int cnt, ret = 0, index;
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu * const *dquots;
        struct dquot *dquot;

        if (!inode_quota_active(inode))
                return 0;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;
                ret = dquot_add_inodes(dquot, 1, &warn[cnt]);
                if (ret) {
                        /* Walk back over the quota types handled so far */
                        for (cnt--; cnt >= 0; cnt--) {
                                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                                if (!dquot)
                                        continue;
                                /* Back out changes we already did */
                                spin_lock(&dquot->dq_dqb_lock);
                                dquot_decr_inodes(dquot, 1);
                                spin_unlock(&dquot->dq_dqb_lock);
                        }
                        goto warn_put_all;
                }
        }

warn_put_all:
        spin_unlock(&inode->i_lock);
        /* Dquots are marked dirty only when the charge succeeded */
        if (ret == 0)
                mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
        /* Warnings are emitted only after all locks have been dropped */
        flush_warnings(warn);
        return ret;
}
EXPORT_SYMBOL(dquot_alloc_inode);

/*
 * Convert in-memory reserved quotas to real consumed quotas
 *
 * Moves @number bytes from reserved to used space, both in every dquot
 * attached to @inode and in the inode itself. If a dquot has less than
 * @number bytes reserved, a one-time warning is raised and @number is
 * lowered to that dquot's reservation for the rest of the operation.
 * Always returns 0.
 */
int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        int cnt, index;

        /* Quota not in effect for this inode: only update the inode itself */
        if (!inode_quota_active(inode)) {
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) -= number;
                __inode_add_bytes(inode, number);
                spin_unlock(&inode->i_lock);
                return 0;
        }

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);

        /* Claim reserved quotas to allocated quotas */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number))
                        number = dquot->dq_dqb.dqb_rsvspace;
                dquot->dq_dqb.dqb_curspace += number;
                dquot->dq_dqb.dqb_rsvspace -= number;
                spin_unlock(&dquot->dq_dqb_lock);
        }

        /* Update inode bytes */
        *inode_reserved_space(inode) -= number;
        __inode_add_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
        return 0;
}
EXPORT_SYMBOL(dquot_claim_space_nodirty);

/*
 * Convert allocated space back to in-memory reserved quotas
 *
 * Moves @number bytes from the allocated counters to the reserved counters,
 * both in each dquot attached to @inode and in the inode itself. If a dquot
 * has less allocated space than @number, a one-time warning is issued and
 * @number is clamped to what that dquot has; the clamped value is then used
 * for the remaining dquots and for the inode.
 */
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
{
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        int type, index;

        if (!inode_quota_active(inode)) {
                /* No dquots to update, only the inode's own counters */
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) += number;
                __inode_sub_bytes(inode, number);
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);

        /* Reclaim allocated quotas back to reserved quotas */
        for (type = 0; type < MAXQUOTAS; type++) {
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
                        number = dquot->dq_dqb.dqb_curspace;
                dquot->dq_dqb.dqb_curspace -= number;
                dquot->dq_dqb.dqb_rsvspace += number;
                spin_unlock(&dquot->dq_dqb_lock);
        }

        /* Update inode bytes */
        *inode_reserved_space(inode) += number;
        __inode_sub_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
}
EXPORT_SYMBOL(dquot_reclaim_space_nodirty);

/*
 * Release @number bytes of quota usage for @inode.
 *
 * When DQUOT_SPACE_RESERVE is set in @flags the bytes are taken from the
 * reserved counters, otherwise from the allocated counters. Dquots are
 * marked dirty only in the latter case.
 *
 * This operation can block, but only after everything is updated
 */
void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        unsigned int type;
        int index;
        int reserve = flags & DQUOT_SPACE_RESERVE;

        if (!inode_quota_active(inode)) {
                /* No dquots attached: adjust the inode counters only */
                if (!reserve) {
                        inode_sub_bytes(inode, number);
                        return;
                }
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) -= number;
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);

        for (type = 0; type < MAXQUOTAS; type++) {
                int wtype;

                warn[type].w_type = QUOTA_NL_NOWARN;
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                /* Work out the warning before the counters change */
                wtype = info_bdq_free(dquot, number);
                if (wtype != QUOTA_NL_NOWARN)
                        prepare_warning(&warn[type], dquot, wtype);
                if (!reserve)
                        dquot_decr_space(dquot, number);
                else
                        dquot_free_reserved_space(dquot, number);
                spin_unlock(&dquot->dq_dqb_lock);
        }

        if (!reserve)
                __inode_sub_bytes(inode, number);
        else
                *inode_reserved_space(inode) -= number;
        spin_unlock(&inode->i_lock);

        /* Dropping a reservation does not dirty the dquots */
        if (!reserve)
                mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
}
EXPORT_SYMBOL(__dquot_free_space);

/*
 * Uncharge one inode from every dquot attached to @inode.
 *
 * Does nothing when quota is not active for the inode.
 *
 * This operation can block, but only after everything is updated
 */
void dquot_free_inode(struct inode *inode)
{
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu * const *dquots;
        struct dquot *dquot;
        unsigned int type;
        int index;

        if (!inode_quota_active(inode))
                return;

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);

        for (type = 0; type < MAXQUOTAS; type++) {
                int wtype;

                warn[type].w_type = QUOTA_NL_NOWARN;
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                /* Work out the warning before the counter changes */
                wtype = info_idq_free(dquot, 1);
                if (wtype != QUOTA_NL_NOWARN)
                        prepare_warning(&warn[type], dquot, wtype);
                dquot_decr_inodes(dquot, 1);
                spin_unlock(&dquot->dq_dqb_lock);
        }

        spin_unlock(&inode->i_lock);
        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
}
EXPORT_SYMBOL(dquot_free_inode);

/*
 * Transfer the number of inodes and blocks from one diskquota to another.
 * On success, dquot references in transfer_to are consumed and references
 * to original dquots that need to be released are placed there. On failure,
 * references are kept untouched.
 *
 * @inode:       inode whose usage is being moved
 * @transfer_to: array of MAXQUOTAS target dquots; a NULL entry, or an entry
 *               whose quota type is not active on the superblock, is skipped
 *
 * Returns 0 on success or when the inode has no quota accounting
 * (IS_NOQUOTA), the error from ->get_inode_usage() if that hook fails, or
 * the non-zero result of dquot_add_inodes() / dquot_add_space() when
 * charging a target dquot fails.
 *
 * This operation can block, but only after everything is updated
 * A transaction must be started when entering this function.
 *
 * We are holding reference on transfer_from & transfer_to, no need to
 * protect them by srcu_read_lock().
 */
int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
{
        qsize_t cur_space;
        qsize_t rsv_space = 0;
        qsize_t inode_usage = 1;
        struct dquot __rcu **dquots;
        struct dquot *transfer_from[MAXQUOTAS] = {};
        int cnt, index, ret = 0;
        /* is_valid[cnt] is set for each quota type actually processed */
        char is_valid[MAXQUOTAS] = {};
        struct dquot_warn warn_to[MAXQUOTAS];
        struct dquot_warn warn_from_inodes[MAXQUOTAS];
        struct dquot_warn warn_from_space[MAXQUOTAS];

        if (IS_NOQUOTA(inode))
                return 0;

        /* Let the filesystem override the default inode usage of 1 */
        if (inode->i_sb->dq_op->get_inode_usage) {
                ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
                if (ret)
                        return ret;
        }

        /* Initialize the arrays */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                warn_to[cnt].w_type = QUOTA_NL_NOWARN;
                warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
                warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
        }

        /* Lock order here: dq_data_lock first, then the inode's i_lock */
        spin_lock(&dq_data_lock);
        spin_lock(&inode->i_lock);
        /* Re-check under the locks; the first check above was unlocked */
        if (IS_NOQUOTA(inode)) {        /* File without quota accounting? */
                spin_unlock(&inode->i_lock);
                spin_unlock(&dq_data_lock);
                return 0;
        }
        cur_space = __inode_get_bytes(inode);
        rsv_space = __inode_get_rsv_space(inode);
        dquots = i_dquot(inode);
        /*
         * Build the transfer_from list, check limits, and update usage in
         * the target structures.
         */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                /*
                 * Skip changes for same uid or gid or for turned off quota-type.
                 */
                if (!transfer_to[cnt])
                        continue;
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(inode->i_sb, cnt))
                        continue;
                is_valid[cnt] = 1;
                transfer_from[cnt] = srcu_dereference_check(dquots[cnt],
                                &dquot_srcu, lockdep_is_held(&dq_data_lock));
                ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
                                       &warn_to[cnt]);
                if (ret)
                        goto over_quota;
                ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space,
                                      DQUOT_SPACE_WARN, &warn_to[cnt]);
                if (ret) {
                        /*
                         * The inode charge for this type already succeeded;
                         * undo it here because the over_quota loop only
                         * covers types below cnt.
                         */
                        spin_lock(&transfer_to[cnt]->dq_dqb_lock);
                        dquot_decr_inodes(transfer_to[cnt], inode_usage);
                        spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
                        goto over_quota;
                }
        }

        /* Decrease usage for source structures and update quota pointers */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (!is_valid[cnt])
                        continue;
                /* Due to IO error we might not have transfer_from[] structure */
                if (transfer_from[cnt]) {
                        int wtype;

                        spin_lock(&transfer_from[cnt]->dq_dqb_lock);
                        /* Compute both warnings before the counters change */
                        wtype = info_idq_free(transfer_from[cnt], inode_usage);
                        if (wtype != QUOTA_NL_NOWARN)
                                prepare_warning(&warn_from_inodes[cnt],
                                                transfer_from[cnt], wtype);
                        wtype = info_bdq_free(transfer_from[cnt],
                                              cur_space + rsv_space);
                        if (wtype != QUOTA_NL_NOWARN)
                                prepare_warning(&warn_from_space[cnt],
                                                transfer_from[cnt], wtype);
                        dquot_decr_inodes(transfer_from[cnt], inode_usage);
                        dquot_decr_space(transfer_from[cnt], cur_space);
                        dquot_free_reserved_space(transfer_from[cnt],
                                                  rsv_space);
                        spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
                }
                /* The inode now points at the new dquot for this type */
                rcu_assign_pointer(dquots[cnt], transfer_to[cnt]);
        }
        spin_unlock(&inode->i_lock);
        spin_unlock(&dq_data_lock);

        /*
         * These arrays are local and we hold dquot references so we don't need
         * the srcu protection but still take dquot_srcu to avoid warning in
         * mark_all_dquot_dirty().
         */
        index = srcu_read_lock(&dquot_srcu);
        mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
        mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
        srcu_read_unlock(&dquot_srcu, index);

        /* Warnings are flushed only after all locks have been dropped */
        flush_warnings(warn_to);
        flush_warnings(warn_from_inodes);
        flush_warnings(warn_from_space);
        /* Pass back references to put */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (is_valid[cnt])
                        transfer_to[cnt] = transfer_from[cnt];
        return 0;
over_quota:
        /* Back out changes we already did */
        for (cnt--; cnt >= 0; cnt--) {
                if (!is_valid[cnt])
                        continue;
                spin_lock(&transfer_to[cnt]->dq_dqb_lock);
                dquot_decr_inodes(transfer_to[cnt], inode_usage);
                dquot_decr_space(transfer_to[cnt], cur_space);
                dquot_free_reserved_space(transfer_to[cnt], rsv_space);
                spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
        }
        spin_unlock(&inode->i_lock);
        spin_unlock(&dq_data_lock);
        /* transfer_to[] is left unmodified on this path */
        flush_warnings(warn_to);
        return ret;
}
EXPORT_SYMBOL(__dquot_transfer);

/*
 * Wrapper for transferring ownership of an inode for uid/gid only
 * Called from FSXXX_setattr()
 *
 * Looks up the dquot of the new owner and/or group named in @iattr (when it
 * differs from the inode's current one) and hands them to
 * __dquot_transfer(). A dqget() failure of -ESRCH is tolerated and leaves
 * that quota type out of the transfer; any other dqget() error is returned.
 * Whatever references remain in the local array are dropped before return.
 */
int dquot_transfer(struct inode *inode, struct iattr *iattr)
{
        struct super_block *sb = inode->i_sb;
        struct dquot *transfer_to[MAXQUOTAS] = {};
        struct dquot *dquot;
        int ret;

        if (!inode_quota_active(inode))
                return 0;

        if ((iattr->ia_valid & ATTR_UID) &&
            !uid_eq(iattr->ia_uid, inode->i_uid)) {
                dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
                if (!IS_ERR(dquot)) {
                        transfer_to[USRQUOTA] = dquot;
                } else if (PTR_ERR(dquot) != -ESRCH) {
                        ret = PTR_ERR(dquot);
                        goto out_put;
                }
                /* On -ESRCH the slot simply stays NULL */
        }

        if ((iattr->ia_valid & ATTR_GID) &&
            !gid_eq(iattr->ia_gid, inode->i_gid)) {
                dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
                if (!IS_ERR(dquot)) {
                        transfer_to[GRPQUOTA] = dquot;
                } else if (PTR_ERR(dquot) != -ESRCH) {
                        ret = PTR_ERR(dquot);
                        goto out_put;
                }
                /* On -ESRCH the slot simply stays NULL */
        }

        ret = __dquot_transfer(inode, transfer_to);
out_put:
        dqput_all(transfer_to);
        return ret;
}
EXPORT_SYMBOL(dquot_transfer);

/*
 * Write info of quota file to disk
 */
int dquot_commit_info(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);

        return dqopt->ops[type]->write_file_info(sb, type);
}
EXPORT_SYMBOL(dquot_commit_info);

int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
{
        struct quota_info *dqopt = sb_dqopt(sb);

        if (!sb_has_quota_active(sb, qid->type))
                return -ESRCH;
        if (!dqopt->ops[qid->type]->get_next_id)
                return -ENOSYS;
        return dqopt->ops[qid->type]->get_next_id(sb, qid);
}
EXPORT_SYMBOL(dquot_get_next_id);

/*
 * Definitions of diskquota operations.
 *
 * Operations table that wires each hook to the generic dquot_* helper.
 */
const struct dquot_operations dquot_operations = {
        /* dquot lifetime */
        .alloc_dquot = dquot_alloc,
        .destroy_dquot = dquot_destroy,
        .acquire_dquot = dquot_acquire,
        .release_dquot = dquot_release,
        /* writeback */
        .mark_dirty = dquot_mark_dquot_dirty,
        .write_dquot = dquot_commit,
        .write_info = dquot_commit_info,
        /* iteration */
        .get_next_id = dquot_get_next_id,
};
EXPORT_SYMBOL(dquot_operations);

/*
 * Generic helper for ->open on filesystems supporting disk quotas.
 */
int dquot_file_open(struct inode *inode, struct file *file)
{
        int error;

        error = generic_file_open(inode, file);
        if (!error && (file->f_mode & FMODE_WRITE))
                error = dquot_initialize(inode);
        return error;
}
EXPORT_SYMBOL(dquot_file_open);

/*
 * Detach the quota file inode recorded for @type on @sb, if there is one:
 * clear S_NOQUOTA on it (unless quota files are system files), forget the
 * pointer and drop the inode reference.
 */
static void vfs_cleanup_quota_inode(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        struct inode *qf_inode = dqopt->files[type];

        if (qf_inode == NULL)
                return;

        if ((dqopt->flags & DQUOT_QUOTA_SYS_FILE) == 0) {
                inode_lock(qf_inode);
                qf_inode->i_flags &= ~S_NOQUOTA;
                inode_unlock(qf_inode);
        }

        dqopt->files[type] = NULL;
        iput(qf_inode);
}

/*
 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
 *
 * @sb:    superblock to operate on
 * @type:  quota type to act on, or -1 for every type
 * @flags: DQUOT_USAGE_ENABLED / DQUOT_LIMITS_ENABLED select which state to
 *         turn off; DQUOT_SUSPENDED requests suspending instead
 *
 * Returns -EINVAL for an unsupported flag combination, 0 otherwise. The
 * results of ->write_info() and ->sync_fs() are not propagated.
 */
int dquot_disable(struct super_block *sb, int type, unsigned int flags)
{
        int cnt;
        struct quota_info *dqopt = sb_dqopt(sb);

        /* s_umount should be held in exclusive mode */
        /* A successful read-trylock means it is not, hence the warning */
        if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
                up_read(&sb->s_umount);

        /* Cannot turn off usage accounting without turning off limits, or
         * suspend quotas and simultaneously turn quotas off. */
        if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
            || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
            DQUOT_USAGE_ENABLED)))
                return -EINVAL;

        /*
         * Skip everything if there's nothing to do. We have to do this because
         * sometimes we are called when fill_super() failed and calling
         * sync_fs() in such cases does no good.
         */
        if (!sb_any_quota_loaded(sb))
                return 0;

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_loaded(sb, cnt))
                        continue;

                if (flags & DQUOT_SUSPENDED) {
                        /* Suspending: only record the suspended state bit */
                        spin_lock(&dq_state_lock);
                        dqopt->flags |=
                                dquot_state_flag(DQUOT_SUSPENDED, cnt);
                        spin_unlock(&dq_state_lock);
                } else {
                        /* Turning off: clear the requested state bits */
                        spin_lock(&dq_state_lock);
                        dqopt->flags &= ~dquot_state_flag(flags, cnt);
                        /* Turning off suspended quotas? */
                        if (!sb_has_quota_loaded(sb, cnt) &&
                            sb_has_quota_suspended(sb, cnt)) {
                                dqopt->flags &=        ~dquot_state_flag(
                                                        DQUOT_SUSPENDED, cnt);
                                spin_unlock(&dq_state_lock);
                                vfs_cleanup_quota_inode(sb, cnt);
                                continue;
                        }
                        spin_unlock(&dq_state_lock);
                }

                /* We still have to keep quota loaded? */
                if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
                        continue;

                /* Note: these are blocking operations */
                drop_dquot_ref(sb, cnt);
                invalidate_dquots(sb, cnt);
                /*
                 * Now all dquots should be invalidated, all writes done so we
                 * should be only users of the info. No locks needed.
                 */
                if (info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
                if (dqopt->ops[cnt]->free_file_info)
                        dqopt->ops[cnt]->free_file_info(sb, cnt);
                put_quota_format(dqopt->info[cnt].dqi_format);
                /* Reset the per-type info so no stale settings remain */
                dqopt->info[cnt].dqi_flags = 0;
                dqopt->info[cnt].dqi_igrace = 0;
                dqopt->info[cnt].dqi_bgrace = 0;
                dqopt->ops[cnt] = NULL;
        }

        /* Skip syncing and setting flags if quota files are hidden */
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
                goto put_inodes;

        /* Sync the superblock so that buffers with quota data are written to
         * disk (and so userspace sees correct data afterwards). */
        if (sb->s_op->sync_fs)
                sb->s_op->sync_fs(sb, 1);
        sync_blockdev(sb->s_bdev);
        /* Now the quota files are just ordinary files and we can set the
         * inode flags back. Moreover we discard the pagecache so that
         * userspace sees the writes we did bypassing the pagecache. We
         * must also discard the blockdev buffers so that we see the
         * changes done by userspace on the next quotaon() */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) {
                        inode_lock(dqopt->files[cnt]);
                        truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
                        inode_unlock(dqopt->files[cnt]);
                }
        if (sb->s_bdev)
                invalidate_bdev(sb->s_bdev);
put_inodes:
        /* We are done when suspending quotas */
        if (flags & DQUOT_SUSPENDED)
                return 0;

        /* Release the quota file inode of every type no longer loaded */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (!sb_has_quota_loaded(sb, cnt))
                        vfs_cleanup_quota_inode(sb, cnt);
        return 0;
}
EXPORT_SYMBOL(dquot_disable);

/*
 * Turn off both usage accounting and limit enforcement for quota @type on
 * @sb. Returns the result of dquot_disable().
 */
int dquot_quota_off(struct super_block *sb, int type)
{
        unsigned int flags = DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED;

        return dquot_disable(sb, type, flags);
}
EXPORT_SYMBOL(dquot_quota_off);

/*
 *        Turn quotas on on a device
 */

static int vfs_setup_quota_inode(struct inode *inode, int type)
{
        struct super_block *sb = inode->i_sb;
        struct quota_info *dqopt = sb_dqopt(sb);

        if (is_bad_inode(inode))
                return -EUCLEAN;
        if (!S_ISREG(inode->i_mode))
                return -EACCES;
        if (IS_RDONLY(inode))
                return -EROFS;
        if (sb_has_quota_loaded(sb, type))
                return -EBUSY;

        /*
         * Quota files should never be encrypted.  They should be thought of as
         * filesystem metadata, not user data.  New-style internal quota files
         * cannot be encrypted by users anyway, but old-style external quota
         * files could potentially be incorrectly created in an encrypted
         * directory, hence this explicit check.  Some reasons why encrypted
         * quota files don't work include: (1) some filesystems that support
         * encryption don't handle it in their quota_read and quota_write, and
         * (2) cleaning up encrypted quota files at unmount would need special
         * consideration, as quota files are cleaned up later than user files.
         */
        if (IS_ENCRYPTED(inode))
                return -EINVAL;

        dqopt->files[type] = igrab(inode);
        if (!dqopt->files[type])
                return -EIO;
        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
                /* We don't want quota and atime on quota files (deadlocks
                 * possible) Also nobody should write to the file - we use
                 * special IO operations which ignore the immutable bit. */
                inode_lock(inode);
                inode->i_flags |= S_NOQUOTA;
                inode_unlock(inode);
                /*
                 * When S_NOQUOTA is set, remove dquot references as no more
                 * references can be added
                 */
                __dquot_drop(inode);
        }
        return 0;
}

/*
 * Load quota of the given @type on @sb using quota format @format_id and
 * enable the state bits given in @flags.
 *
 * @flags must include DQUOT_USAGE_ENABLED and must not include
 * DQUOT_SUSPENDED (the latter triggers BUG_ON).
 *
 * Returns 0 on success, -ESRCH if the format is not found, -EBUSY if quota
 * of this type is already loaded, -EINVAL for the unsupported cases checked
 * below, a negative error from ->read_file_info(), or the error from
 * add_dquot_ref() (in which case quota is disabled again before returning).
 */
int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
        unsigned int flags)
{
        struct quota_format_type *fmt = find_quota_format(format_id);
        struct quota_info *dqopt = sb_dqopt(sb);
        int error;

        /* Just unsuspend quotas? */
        BUG_ON(flags & DQUOT_SUSPENDED);
        /* s_umount should be held in exclusive mode */
        /* A successful read-trylock means it is not, hence the warning */
        if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
                up_read(&sb->s_umount);

        if (!fmt)
                return -ESRCH;
        /* From here on every failure must go through out_fmt to put fmt */
        if (!sb->s_op->quota_write || !sb->s_op->quota_read ||
            (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
                error = -EINVAL;
                goto out_fmt;
        }
        /* Filesystems outside of init_user_ns not yet supported */
        if (sb->s_user_ns != &init_user_ns) {
                error = -EINVAL;
                goto out_fmt;
        }
        /* Usage always has to be set... */
        if (!(flags & DQUOT_USAGE_ENABLED)) {
                error = -EINVAL;
                goto out_fmt;
        }
        if (sb_has_quota_loaded(sb, type)) {
                error = -EBUSY;
                goto out_fmt;
        }

        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
                /* As we bypass the pagecache we must now flush all the
                 * dirty data and invalidate caches so that kernel sees
                 * changes from userspace. It is not enough to just flush
                 * the quota file since if blocksize < pagesize, invalidation
                 * of the cache could fail because of other unrelated dirty
                 * data */
                sync_filesystem(sb);
                /* NOTE(review): s_bdev is not NULL-checked here, unlike in
                 * dquot_disable() - confirm it is always set on this path */
                invalidate_bdev(sb->s_bdev);
        }

        error = -EINVAL;
        if (!fmt->qf_ops->check_quota_file(sb, type))
                goto out_fmt;

        dqopt->ops[type] = fmt->qf_ops;
        dqopt->info[type].dqi_format = fmt;
        dqopt->info[type].dqi_fmt_id = format_id;
        INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
        error = dqopt->ops[type]->read_file_info(sb, type);
        if (error < 0)
                goto out_fmt;
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) {
                spin_lock(&dq_data_lock);
                dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
                spin_unlock(&dq_data_lock);
        }
        /* Publish the enabled state for this type */
        spin_lock(&dq_state_lock);
        dqopt->flags |= dquot_state_flag(flags, type);
        spin_unlock(&dq_state_lock);

        error = add_dquot_ref(sb, type);
        /* On failure undo everything through the regular disable path */
        if (error)
                dquot_disable(sb, type,
                              DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);

        return error;
out_fmt:
        put_quota_format(fmt);

        return error;
}
EXPORT_SYMBOL(dquot_load_quota_sb);

/*
 * More powerful function for turning on quotas on given quota inode allowing
 * setting of individual quota flags
 *
 * Registers @inode as the quota file for @type and then loads quota on its
 * superblock. If loading fails the quota inode registration is undone.
 * Returns 0 on success or a negative error.
 */
int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
        unsigned int flags)
{
        int err = vfs_setup_quota_inode(inode, type);

        if (err < 0)
                return err;

        err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags);
        if (err >= 0)
                return err;

        vfs_cleanup_quota_inode(inode->i_sb, type);
        return err;
}
EXPORT_SYMBOL(dquot_load_quota_inode);

/*
 * Reenable quotas on remount RW
 *
 * For each suspended quota type selected by @type (-1 selects all types),
 * clears its state bits and reloads quota with the usage/limits bits that
 * were set before. Returns the result of the last dquot_load_quota_sb()
 * call made, or 0 if no type was resumed.
 */
int dquot_resume(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        unsigned int flags;
        int cnt;
        int ret = 0;

        /* s_umount should be held in exclusive mode */
        if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
                up_read(&sb->s_umount);

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if ((type != -1 && cnt != type) ||
                    !sb_has_quota_suspended(sb, cnt))
                        continue;

                /* Remember what was enabled, then wipe this type's state */
                spin_lock(&dq_state_lock);
                flags = dquot_state_flag(DQUOT_USAGE_ENABLED |
                                         DQUOT_LIMITS_ENABLED, cnt);
                flags &= dqopt->flags;
                dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
                spin_unlock(&dq_state_lock);

                /* Convert the per-type bits back to generic flag values */
                flags = dquot_generic_flag(flags, cnt);
                ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id,
                                          flags);
                if (ret < 0)
                        vfs_cleanup_quota_inode(sb, cnt);
        }

        return ret;
}
EXPORT_SYMBOL(dquot_resume);

int dquot_quota_on(struct super_block *sb, int type, int format_id,
                   const struct path *path)
{
        int error = security_quota_on(path->dentry);
        if (error)
                return error;
        /* Quota file not on the same filesystem? */
        if (path->dentry->d_sb != sb)
                error = -EXDEV;
        else
                error = dquot_load_quota_inode(d_inode(path->dentry), type,
                                             format_id, DQUOT_USAGE_ENABLED |
                                             DQUOT_LIMITS_ENABLED);
        return error;
}
EXPORT_SYMBOL(dquot_quota_on);

/*
 * This function is used when filesystem needs to initialize quotas
 * during mount time.
 *
 * Looks up @qf_name under the root of @sb and turns quota of @type on with
 * that file, enabling both usage accounting and limits. Returns the lookup
 * error, the security_quota_on() error, or the result of
 * dquot_load_quota_inode().
 */
int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
                int format_id, int type)
{
        struct dentry *dentry;
        int error;

        dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name));
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);

        error = security_quota_on(dentry);
        if (error)
                goto out_dput;

        error = dquot_load_quota_inode(d_inode(dentry), type, format_id,
                                       DQUOT_USAGE_ENABLED |
                                       DQUOT_LIMITS_ENABLED);
out_dput:
        dput(dentry);
        return error;
}
EXPORT_SYMBOL(dquot_quota_on_mount);

/*
 * Enable limit enforcement for the quota types selected in @flags.
 *
 * Returns -ENOSYS unless quota files are system files, -EINVAL if no
 * enforcement bit is requested or a requested type has no usage accounting,
 * and -EEXIST if a requested type already enforces limits. On failure,
 * enforcement already enabled by this call is turned off again.
 */
static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int type;
        int ret = 0;

        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
                return -ENOSYS;

        /* Accounting cannot be turned on while fs is mounted */
        flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
        if (!flags)
                return -EINVAL;

        for (type = 0; type < MAXQUOTAS; type++) {
                if (!(flags & qtype_enforce_flag(type)))
                        continue;
                /* Can't enforce without accounting */
                if (!sb_has_quota_usage_enabled(sb, type)) {
                        ret = -EINVAL;
                        break;
                }
                if (sb_has_quota_limits_enabled(sb, type)) {
                        ret = -EBUSY;
                        break;
                }
                spin_lock(&dq_state_lock);
                dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
                spin_unlock(&dq_state_lock);
        }
        if (!ret)
                return 0;

        /* Backout enforcement enablement we already did */
        while (--type >= 0) {
                if (flags & qtype_enforce_flag(type))
                        dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
        }

        /* Error code translation for better compatibility with XFS */
        return ret == -EBUSY ? -EEXIST : ret;
}

/*
 * Disable limit enforcement for the quota types selected in @flags.
 *
 * Returns -ENOSYS unless quota files are system files, -EOPNOTSUPP if any
 * accounting bit is requested, -EEXIST if none of the requested types
 * currently enforces limits, or the negative error from dquot_disable().
 * On failure, enforcement already disabled by this call is re-enabled.
 */
static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int type;
        int ret;

        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
                return -ENOSYS;
        /*
         * We don't support turning off accounting via quotactl. In principle
         * quota infrastructure can do this but filesystems don't expect
         * userspace to be able to do it.
         */
        if (flags &
            (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
                return -EOPNOTSUPP;

        /* Filter out limits not enabled */
        for (type = 0; type < MAXQUOTAS; type++) {
                if (sb_has_quota_limits_enabled(sb, type))
                        continue;
                flags &= ~qtype_enforce_flag(type);
        }
        /* Nothing left? */
        if (!flags)
                return -EEXIST;

        for (type = 0; type < MAXQUOTAS; type++) {
                if (!(flags & qtype_enforce_flag(type)))
                        continue;
                ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
                if (ret < 0)
                        goto out_err;
        }
        return 0;

out_err:
        /* Backout enforcement disabling we already did */
        while (--type >= 0) {
                if (!(flags & qtype_enforce_flag(type)))
                        continue;
                spin_lock(&dq_state_lock);
                dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
                spin_unlock(&dq_state_lock);
        }
        return ret;
}

/*
 * Generic routine for getting common part of quota structure
 *
 * Zeroes @di and then copies limits, usage and timers out of @dquot while
 * holding its dq_dqb_lock. The reported space is the sum of the current
 * and the reserved space.
 */
static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
        struct mem_dqblk *dm = &dquot->dq_dqb;

        memset(di, 0, sizeof(*di));

        spin_lock(&dquot->dq_dqb_lock);
        /* Space: limits, usage, timer */
        di->d_spc_hardlimit = dm->dqb_bhardlimit;
        di->d_spc_softlimit = dm->dqb_bsoftlimit;
        di->d_space = dm->dqb_curspace + dm->dqb_rsvspace;
        di->d_spc_timer = dm->dqb_btime;
        /* Inodes: limits, usage, timer */
        di->d_ino_hardlimit = dm->dqb_ihardlimit;
        di->d_ino_softlimit = dm->dqb_isoftlimit;
        di->d_ino_count = dm->dqb_curinodes;
        di->d_ino_timer = dm->dqb_itime;
        spin_unlock(&dquot->dq_dqb_lock);
}

/*
 * Look up the dquot for @qid on @sb and report its contents in @di.
 * Returns 0 on success or the error encoded in the dqget() result.
 */
int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
                    struct qc_dqblk *di)
{
        struct dquot *dq = dqget(sb, qid);

        if (IS_ERR(dq))
                return PTR_ERR(dq);

        do_get_dqblk(dq, di);
        dqput(dq);
        return 0;
}
EXPORT_SYMBOL(dquot_get_dqblk);

/*
 * Let the filesystem's ->get_next_id() update *@qid, then report the
 * dquot for the resulting id in @di.
 *
 * Returns -ENOSYS when the filesystem provides no ->get_next_id(), a
 * negative error from ->get_next_id() or dqget(), or 0 on success.
 */
int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid,
                         struct qc_dqblk *di)
{
        struct dquot *dq;
        int ret;

        if (!sb->dq_op->get_next_id)
                return -ENOSYS;

        ret = sb->dq_op->get_next_id(sb, qid);
        if (ret < 0)
                return ret;

        dq = dqget(sb, *qid);
        if (IS_ERR(dq))
                return PTR_ERR(dq);

        do_get_dqblk(dq, di);
        dqput(dq);
        return 0;
}
EXPORT_SYMBOL(dquot_get_next_dqblk);

/* d_fieldmask bits accepted by do_set_dqblk(); anything else is -EINVAL */
#define VFS_QC_MASK \
        (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | QC_SPC_TIMER | \
         QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | QC_INO_TIMER)

/*
 * Generic routine for setting common part of quota structure
 *
 * Applies the fields of @di selected by di->d_fieldmask to the in-memory
 * block of @dquot and marks the dquot dirty.
 *
 * Returns -EINVAL when d_fieldmask contains bits outside VFS_QC_MASK,
 * -ERANGE when a requested soft/hard limit exceeds the per-type maximum
 * (dqi_max_spc_limit / dqi_max_ino_limit), and 0 otherwise.
 */
static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
        struct mem_dqblk *dm = &dquot->dq_dqb;
        /* Set when space / inode state changed and grace time needs a recheck */
        int check_blim = 0, check_ilim = 0;
        struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];

        if (di->d_fieldmask & ~VFS_QC_MASK)
                return -EINVAL;

        /* Validate all requested limits before anything is modified */
        if (((di->d_fieldmask & QC_SPC_SOFT) &&
             di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
            ((di->d_fieldmask & QC_SPC_HARD) &&
             di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
            ((di->d_fieldmask & QC_INO_SOFT) &&
             (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
            ((di->d_fieldmask & QC_INO_HARD) &&
             (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
                return -ERANGE;

        spin_lock(&dquot->dq_dqb_lock);
        if (di->d_fieldmask & QC_SPACE) {
                /* d_space includes reserved space; store only the current part */
                dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_SPC_SOFT)
                dm->dqb_bsoftlimit = di->d_spc_softlimit;
        if (di->d_fieldmask & QC_SPC_HARD)
                dm->dqb_bhardlimit = di->d_spc_hardlimit;
        if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) {
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_COUNT) {
                dm->dqb_curinodes = di->d_ino_count;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_SOFT)
                dm->dqb_isoftlimit = di->d_ino_softlimit;
        if (di->d_fieldmask & QC_INO_HARD)
                dm->dqb_ihardlimit = di->d_ino_hardlimit;
        if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) {
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_SPC_TIMER) {
                dm->dqb_btime = di->d_spc_timer;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_TIMER) {
                dm->dqb_itime = di->d_ino_timer;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
        }

        /*
         * Re-evaluate the space grace time: clear it when there is no soft
         * limit or usage (current + reserved) is within it; otherwise start
         * a grace period unless the caller supplied a timer explicitly.
         */
        if (check_blim) {
                if (!dm->dqb_bsoftlimit ||
                    dm->dqb_curspace + dm->dqb_rsvspace <= dm->dqb_bsoftlimit) {
                        dm->dqb_btime = 0;
                        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
                } else if (!(di->d_fieldmask & QC_SPC_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace;
        }
        /* Same re-evaluation for the inode grace time */
        if (check_ilim) {
                if (!dm->dqb_isoftlimit ||
                    dm->dqb_curinodes <= dm->dqb_isoftlimit) {
                        dm->dqb_itime = 0;
                        clear_bit(DQ_INODES_B, &dquot->dq_flags);
                } else if (!(di->d_fieldmask & QC_INO_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace;
        }
        /* DQ_FAKE_B is set exactly when no limit of any kind is configured */
        if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
            dm->dqb_isoftlimit)
                clear_bit(DQ_FAKE_B, &dquot->dq_flags);
        else
                set_bit(DQ_FAKE_B, &dquot->dq_flags);
        spin_unlock(&dquot->dq_dqb_lock);
        /* Called after dq_dqb_lock is dropped; its result is not checked here */
        mark_dquot_dirty(dquot);

        return 0;
}

/*
 * Look up the dquot for @qid on @sb and apply the settings in @di to it.
 * Returns the error encoded in the dqget() result, or whatever
 * do_set_dqblk() returns.
 */
int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
                  struct qc_dqblk *di)
{
        struct dquot *dq = dqget(sb, qid);
        int ret;

        if (IS_ERR(dq))
                return PTR_ERR(dq);

        ret = do_set_dqblk(dq, di);
        dqput(dq);
        return ret;
}
EXPORT_SYMBOL(dquot_set_dqblk);

/*
 * Generic routine for getting the common part of quota file information.
 *
 * Zeroes @state and fills one s_state[] slot for every quota type that is
 * active on @sb; slots of inactive types stay zeroed. Always returns 0.
 */
int dquot_get_state(struct super_block *sb, struct qc_state *state)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int type;

        memset(state, 0, sizeof(*state));
        for (type = 0; type < MAXQUOTAS; type++) {
                struct qc_type_state *ts;
                struct mem_dqinfo *info;

                if (!sb_has_quota_active(sb, type))
                        continue;

                ts = &state->s_state[type];
                info = &dqopt->info[type];
                ts->flags = QCI_ACCT_ENABLED;

                spin_lock(&dq_data_lock);
                if (info->dqi_flags & DQF_SYS_FILE)
                        ts->flags |= QCI_SYSFILE;
                if (info->dqi_flags & DQF_ROOT_SQUASH)
                        ts->flags |= QCI_ROOT_SQUASH;
                if (sb_has_quota_limits_enabled(sb, type))
                        ts->flags |= QCI_LIMITS_ENFORCED;

                ts->spc_timelimit = info->dqi_bgrace;
                ts->ino_timelimit = info->dqi_igrace;

                /* Inode number and block count only when a quota file is set */
                if (dqopt->files[type]) {
                        ts->ino = dqopt->files[type]->i_ino;
                        ts->blocks = dqopt->files[type]->i_blocks;
                }
                ts->nextents = 1;        /* We don't know... */
                spin_unlock(&dq_data_lock);
        }
        return 0;
}
EXPORT_SYMBOL(dquot_get_state);

/*
 * Generic routine for setting common part of quota file information
 *
 * Updates the grace times and the root-squash flag of quota type @type on
 * @sb according to the fields selected in ii->i_fieldmask, marks the info
 * dirty and forces it to disk through ->write_info().
 *
 * Returns -EINVAL when warning limits or the realtime space timer are
 * requested, or when root squash is requested for a format other than
 * QFMT_VFS_OLD; -ESRCH when quota of @type is not active; otherwise the
 * result of ->write_info(), so that a failed write of the quota info is
 * reported to the caller instead of being silently ignored.
 */
int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
{
        struct mem_dqinfo *mi;

        if ((ii->i_fieldmask & QC_WARNS_MASK) ||
            (ii->i_fieldmask & QC_RT_SPC_TIMER))
                return -EINVAL;
        if (!sb_has_quota_active(sb, type))
                return -ESRCH;
        mi = sb_dqopt(sb)->info + type;
        if (ii->i_fieldmask & QC_FLAGS) {
                /* Root squash is only accepted for the old VFS format */
                if ((ii->i_flags & QCI_ROOT_SQUASH &&
                     mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD))
                        return -EINVAL;
        }
        spin_lock(&dq_data_lock);
        if (ii->i_fieldmask & QC_SPC_TIMER)
                mi->dqi_bgrace = ii->i_spc_timelimit;
        if (ii->i_fieldmask & QC_INO_TIMER)
                mi->dqi_igrace = ii->i_ino_timelimit;
        if (ii->i_fieldmask & QC_FLAGS) {
                if (ii->i_flags & QCI_ROOT_SQUASH)
                        mi->dqi_flags |= DQF_ROOT_SQUASH;
                else
                        mi->dqi_flags &= ~DQF_ROOT_SQUASH;
        }
        spin_unlock(&dq_data_lock);
        mark_info_dirty(sb, type);
        /* Force write to disk and propagate any failure */
        return sb->dq_op->write_info(sb, type);
}
EXPORT_SYMBOL(dquot_set_dqinfo);

/*
 * quotactl operations built from the generic dquot helpers, including the
 * enable/disable handlers that require DQUOT_QUOTA_SYS_FILE.
 */
const struct quotactl_ops dquot_quotactl_sysfile_ops = {
        /* Switching enforcement and syncing */
        .quota_enable = dquot_quota_enable,
        .quota_disable = dquot_quota_disable,
        .quota_sync = dquot_quota_sync,

        /* Per-type state and info */
        .get_state = dquot_get_state,
        .set_info = dquot_set_dqinfo,

        /* Per-id usage and limits */
        .get_dqblk = dquot_get_dqblk,
        .get_nextdqblk = dquot_get_next_dqblk,
        .set_dqblk = dquot_set_dqblk,
};
EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);

/*
 * sysctl handler for the dqstats entries.
 *
 * The index of the statistic is recovered from the position of
 * table->data inside dqstats.stat[]. The matching per-cpu counter is
 * summed, stored into dqstats.stat[] and then handed to
 * proc_doulongvec_minmax().
 */
static int do_proc_dqstats(struct ctl_table *table, int write,
                     void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned long *slot = (unsigned long *)table->data;
        unsigned int idx = slot - dqstats.stat;
        s64 sum = percpu_counter_sum(&dqstats.counter[idx]);

        /* Filter negative values for non-monotonic counters */
        if (idx == DQST_ALLOC_DQUOTS || idx == DQST_FREE_DQUOTS) {
                if (sum < 0)
                        sum = 0;
        }

        /* Update global table */
        dqstats.stat[idx] = sum;

        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/*
 * Entries of the "quota" sysctl directory: one read-only entry per
 * dqstats.stat[] slot, all served by do_proc_dqstats(), plus the writable
 * "warnings" switch when CONFIG_PRINT_QUOTA_WARNING is set.
 */
static struct ctl_table fs_dqstats_table[] = {
        {
                .procname = "lookups",
                .mode = 0444,
                .maxlen = sizeof(unsigned long),
                .data = &dqstats.stat[DQST_LOOKUPS],
                .proc_handler = do_proc_dqstats,
        },
        {
                .procname = "drops",
                .mode = 0444,
                .maxlen = sizeof(unsigned long),
                .data = &dqstats.stat[DQST_DROPS],
                .proc_handler = do_proc_dqstats,
        },
        {
                .procname = "reads",
                .mode = 0444,
                .maxlen = sizeof(unsigned long),
                .data = &dqstats.stat[DQST_READS],
                .proc_handler = do_proc_dqstats,
        },
        {
                .procname = "writes",
                .mode = 0444,
                .maxlen = sizeof(unsigned long),
                .data = &dqstats.stat[DQST_WRITES],
                .proc_handler = do_proc_dqstats,
        },
        {
                .procname = "cache_hits",
                .mode = 0444,
                .maxlen = sizeof(unsigned long),
                .data = &dqstats.stat[DQST_CACHE_HITS],
                .proc_handler = do_proc_dqstats,
        },
        {
                .procname = "allocated_dquots",
                .mode = 0444,
                .maxlen = sizeof(unsigned long),
                .data = &dqstats.stat[DQST_ALLOC_DQUOTS],
                .proc_handler = do_proc_dqstats,
        },
        {
                .procname = "free_dquots",
                .mode = 0444,
                .maxlen = sizeof(unsigned long),
                .data = &dqstats.stat[DQST_FREE_DQUOTS],
                .proc_handler = do_proc_dqstats,
        },
        {
                .procname = "syncs",
                .mode = 0444,
                .maxlen = sizeof(unsigned long),
                .data = &dqstats.stat[DQST_SYNCS],
                .proc_handler = do_proc_dqstats,
        },
#ifdef CONFIG_PRINT_QUOTA_WARNING
        {
                .procname = "warnings",
                .mode = 0644,
                .maxlen = sizeof(int),
                .data = &flag_print_warnings,
                .proc_handler = proc_dointvec,
        },
#endif
        { },        /* terminating empty entry */
};

/* "quota" directory entry whose children are the dqstats entries */
static struct ctl_table fs_table[] = {
        { .procname = "quota", .mode = 0555, .child = fs_dqstats_table, },
        { },        /* terminating empty entry */
};

/* Top-level "fs" directory entry; this is the table dquot_init() registers */
static struct ctl_table sys_table[] = {
        { .procname = "fs", .mode = 0555, .child = fs_table, },
        { },        /* terminating empty entry */
};

/*
 * Boot-time initialisation of the quota subsystem (run as an fs_initcall).
 *
 * Registers the quota sysctl tables, creates the dquot slab cache,
 * allocates and initialises the dquot hash table, sets up the per-cpu
 * statistics counters and registers the dquot cache shrinker. Failure to
 * allocate the hash table, to set up a counter or to register the
 * shrinker panics. Always returns 0.
 */
static int __init dquot_init(void)
{
        int i, ret;
        unsigned long nr_hash, order;

        printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);

        register_sysctl_table(sys_table);

        /* SLAB_PANIC is passed, so the result is not checked here */
        dquot_cachep = kmem_cache_create("dquot",
                        sizeof(struct dquot), sizeof(unsigned long) * 4,
                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                                SLAB_MEM_SPREAD|SLAB_PANIC),
                        NULL);

        /* The hash table occupies 2^order pages; order is fixed at 0 */
        order = 0;
        dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
        if (!dquot_hash)
                panic("Cannot create dquot hash table");

        for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
                ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
                if (ret)
                        panic("Cannot create dquot stat counters");
        }

        /* Find power-of-two hlist_heads which can fit into allocation */
        nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
        dq_hash_bits = ilog2(nr_hash);

        /* Round down to the power of two actually used and derive the mask */
        nr_hash = 1UL << dq_hash_bits;
        dq_hash_mask = nr_hash - 1;
        for (i = 0; i < nr_hash; i++)
                INIT_HLIST_HEAD(dquot_hash + i);

        /* All three values are unsigned long, hence %lu rather than %ld */
        pr_info("VFS: Dquot-cache hash table entries: %lu (order %lu,"
                " %lu bytes)\n", nr_hash, order, (PAGE_SIZE << order));

        if (register_shrinker(&dqcache_shrinker))
                panic("Cannot register dquot shrinker");

        return 0;
}
fs_initcall(dquot_init);











































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM block

#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BLOCK_H

#include <linux/blktrace_api.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/tracepoint.h>

#define RWBS_LEN        8

/*
 * Event class shared by the buffer_head tracepoints.
 *
 * Records the device number of the buffer's block device, the buffer's
 * block number (stored in the field named "sector") and its size, and
 * prints them as "major,minor sector=... size=...".
 */
DECLARE_EVENT_CLASS(block_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh),

        TP_STRUCT__entry (
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  size_t,        size                        )
        ),

        TP_fast_assign(
                __entry->dev                = bh->b_bdev->bd_dev;
                __entry->sector                = bh->b_blocknr;
                __entry->size                = bh->b_size;
        ),

        TP_printk("%d,%d sector=%llu size=%zu",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                (unsigned long long)__entry->sector, __entry->size
        )
);

/**
 * block_touch_buffer - mark a buffer accessed
 * @bh: buffer_head being touched
 *
 * Called from touch_buffer().  An instance of the block_buffer event
 * class.
 */
DEFINE_EVENT(block_buffer, block_touch_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh)
);

/**
 * block_dirty_buffer - mark a buffer dirty
 * @bh: buffer_head being dirtied
 *
 * Called from mark_buffer_dirty().  An instance of the block_buffer
 * event class.
 */
DEFINE_EVENT(block_buffer, block_dirty_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh)
);

/**
 * block_rq_requeue - place block IO request back on a queue
 * @rq: block IO operation request
 *
 * The block operation request @rq is being placed back into its
 * request queue.  For some reason the request was not completed and
 * needs to be put back in the queue.
 *
 * The "cmd" string is always recorded as empty and the trailing "[%d]"
 * of the output is always printed as 0.  The device is recorded as 0
 * when @rq has no rq_disk.
 */
TRACE_EVENT(block_rq_requeue,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __array(  char,                rwbs,        RWBS_LEN        )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_trace_sector(rq);
                __entry->nr_sector = blk_rq_trace_nr_sectors(rq);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
                __get_str(cmd)[0] = '\0';
        ),

        TP_printk("%d,%d %s (%s) %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, 0)
);

/**
 * block_rq_complete - block IO operation completed by device driver
 * @rq: block operations request
 * @error: status code
 * @nr_bytes: number of completed bytes
 *
 * The block_rq_complete tracepoint event indicates that some portion
 * of operation request has been completed by the device driver.  If
 * the @rq->bio is %NULL, then there is absolutely no additional work to
 * do for the request. If @rq->bio is non-NULL then there is
 * additional work required to complete the request.
 *
 * The recorded sector is blk_rq_pos(@rq) and the recorded sector count
 * is @nr_bytes >> 9.  The "cmd" string is always recorded as empty.
 */
TRACE_EVENT(block_rq_complete,

        TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),

        TP_ARGS(rq, error, nr_bytes),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __field(  int,                error                        )
                __array(  char,                rwbs,        RWBS_LEN        )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_pos(rq);
                __entry->nr_sector = nr_bytes >> 9;
                __entry->error     = error;

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
                __get_str(cmd)[0] = '\0';
        ),

        TP_printk("%d,%d %s (%s) %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->error)
);

/*
 * Event class shared by the request tracepoints block_rq_insert,
 * block_rq_issue and block_rq_merge.
 *
 * Records device (0 when the request has no rq_disk), trace sector and
 * sector count, the request's byte count, the rwbs string and the comm
 * of the current task.  The "cmd" string is always recorded as empty.
 */
DECLARE_EVENT_CLASS(block_rq,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __field(  unsigned int,        bytes                        )
                __array(  char,                rwbs,        RWBS_LEN        )
                __array(  char,         comm,   TASK_COMM_LEN   )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_trace_sector(rq);
                __entry->nr_sector = blk_rq_trace_nr_sectors(rq);
                __entry->bytes     = blk_rq_bytes(rq);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
                __get_str(cmd)[0] = '\0';
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __entry->bytes, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_rq_insert - insert block operation request into queue
 * @rq: block IO operation request
 *
 * Called immediately before block operation request @rq is inserted
 * into its request queue.  The fields in the operation request @rq
 * struct can be examined to determine which device and sectors the
 * pending operation would access.
 */
DEFINE_EVENT(block_rq, block_rq_insert,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_rq_issue - issue pending block IO request operation to device driver
 * @rq: block IO operation request
 *
 * Called when block operation request @rq from its request queue is
 * sent to a device driver for processing.
 */
DEFINE_EVENT(block_rq, block_rq_issue,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_rq_merge - merge request with another one in the elevator
 * @rq: block IO operation request
 *
 * Called when block operation request @rq from its request queue is
 * merged to another request queued in the elevator.
 */
DEFINE_EVENT(block_rq, block_rq_merge,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_bio_bounce - used bounce buffer when processing block operation
 * @q: queue holding the block operation
 * @bio: block operation
 *
 * A bounce buffer was used to handle the block operation @bio in @q.
 * This occurs when hardware limitations prevent a direct transfer of
 * data between the @bio data memory area and the IO device.  Use of a
 * bounce buffer requires extra copying of data and decreases
 * performance.
 *
 * Only @bio and the current task's comm are recorded; @q itself is not
 * stored in the trace entry.
 */
TRACE_EVENT(block_bio_bounce,

        TP_PROTO(struct request_queue *q, struct bio *bio),

        TP_ARGS(q, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_bio_complete - completed all work on the block operation
 * @q: queue holding the block operation
 * @bio: block operation completed
 *
 * This tracepoint indicates there is no further work to do on this
 * block IO operation @bio.
 *
 * The recorded error is @bio->bi_status converted with
 * blk_status_to_errno().  @q itself is not stored in the trace entry.
 */
TRACE_EVENT(block_bio_complete,

        TP_PROTO(struct request_queue *q, struct bio *bio),

        TP_ARGS(q, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned,        nr_sector        )
                __field( int,                error                )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                __entry->error                = blk_status_to_errno(bio->bi_status);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
        ),

        TP_printk("%d,%d %s %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->error)
);

/*
 * Event class shared by block_bio_backmerge and block_bio_frontmerge.
 *
 * Records device, start sector, sector count and rwbs string of @bio plus
 * the comm of the current task.  Neither @q nor @rq is stored in the
 * trace entry.
 */
DECLARE_EVENT_CLASS(block_bio_merge,

        TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),

        TP_ARGS(q, rq, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_bio_backmerge - merging block operation to the end of an existing operation
 * @q: queue holding operation
 * @rq: request bio is being merged into
 * @bio: new block operation to merge
 *
 * Merging block request @bio to the end of an existing block request
 * in queue @q.  An instance of the block_bio_merge event class.
 */
DEFINE_EVENT(block_bio_merge, block_bio_backmerge,

        TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),

        TP_ARGS(q, rq, bio)
);

/**
 * block_bio_frontmerge - merging block operation to the beginning of an existing operation
 * @q: queue holding operation
 * @rq: request bio is being merged into
 * @bio: new block operation to merge
 *
 * Merging block IO operation @bio to the beginning of an existing block
 * operation in queue @q.  An instance of the block_bio_merge event class.
 */
DEFINE_EVENT(block_bio_merge, block_bio_frontmerge,

        TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),

        TP_ARGS(q, rq, bio)
);

/**
 * block_bio_queue - putting new block IO operation in queue
 * @q: queue holding operation
 * @bio: new block operation
 *
 * About to place the block IO operation @bio into queue @q.
 *
 * Only @bio and the current task's comm are recorded; @q itself is not
 * stored in the trace entry.
 */
TRACE_EVENT(block_bio_queue,

        TP_PROTO(struct request_queue *q, struct bio *bio),

        TP_ARGS(q, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/*
 * Event class shared by block_getrq and block_sleeprq.
 *
 * @bio may be NULL, in which case device, sector, sector count and the
 * flags passed to blk_fill_rwbs() are all recorded as 0.  Neither @q nor
 * @rw is stored in the trace entry.
 *
 * NOTE(review): the last argument to blk_fill_rwbs() here is the sector
 * count, while the other events in this file pass a byte count - confirm
 * against blk_fill_rwbs() that this is intended.
 */
DECLARE_EVENT_CLASS(block_get_rq,

        TP_PROTO(struct request_queue *q, struct bio *bio, int rw),

        TP_ARGS(q, bio, rw),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio ? bio_dev(bio) : 0;
                __entry->sector                = bio ? bio->bi_iter.bi_sector : 0;
                __entry->nr_sector        = bio ? bio_sectors(bio) : 0;
                blk_fill_rwbs(__entry->rwbs,
                              bio ? bio->bi_opf : 0, __entry->nr_sector);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_getrq - get a free request entry in queue for block IO operations
 * @q: queue for operations
 * @bio: pending block IO operation (can be %NULL)
 * @rw: low bit indicates a read (%0) or a write (%1)
 *
 * A request struct for queue @q has been allocated to handle the
 * block IO operation @bio.
 *
 * This event is an instance of the block_get_rq event class.
 */
DEFINE_EVENT(block_get_rq, block_getrq,

        TP_PROTO(struct request_queue *q, struct bio *bio, int rw),

        TP_ARGS(q, bio, rw)
);

/**
 * block_sleeprq - waiting to get a free request entry in queue for block IO operation
 * @q: queue for operation
 * @bio: pending block IO operation (can be %NULL)
 * @rw: low bit indicates a read (%0) or a write (%1)
 *
 * In the case where a request struct cannot be provided for queue @q
 * the process needs to wait for a request struct to become
 * available.  This tracepoint event is generated each time the
 * process goes to sleep waiting for a request struct to become available.
 *
 * This event is an instance of the block_get_rq event class.
 */
DEFINE_EVENT(block_get_rq, block_sleeprq,

        TP_PROTO(struct request_queue *q, struct bio *bio, int rw),

        TP_ARGS(q, bio, rw)
);

/**
 * block_plug - keep operations requests in request queue
 * @q: request queue to plug
 *
 * Plug the request queue @q.  Do not allow block operation requests
 * to be sent to the device driver. Instead, accumulate requests in
 * the queue to improve throughput performance of the block device.
 *
 * Only the comm of the current task is recorded; @q is not stored in
 * the entry.
 */
TRACE_EVENT(block_plug,

        TP_PROTO(struct request_queue *q),

        TP_ARGS(q),

        TP_STRUCT__entry(
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        /* Output: "[comm]" */
        TP_printk("[%s]", __entry->comm)
);

/*
 * Event class for unplug events.
 *
 * Records @depth in the nr_rq field (declared as int) together with the
 * comm of the current task.  @q and @explicit are not stored in the entry.
 */
DECLARE_EVENT_CLASS(block_unplug,

        TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),

        TP_ARGS(q, depth, explicit),

        TP_STRUCT__entry(
                __field( int,                nr_rq                        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->nr_rq = depth;
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        /* Output: "[comm] nr_rq" */
        TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
);

/**
 * block_unplug - release of operations requests in request queue
 * @q: request queue to unplug
 * @depth: number of requests just added to the queue
 * @explicit: whether this was an explicit unplug, or one from schedule()
 *
 * Unplug request queue @q because device driver is scheduled to work
 * on elements in the request queue.
 *
 * This event is an instance of the block_unplug event class.
 */
DEFINE_EVENT(block_unplug, block_unplug,

        TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),

        TP_ARGS(q, depth, explicit)
);

/**
 * block_split - split a single bio struct into two bio structs
 * @q: queue containing the bio
 * @bio: block operation being split
 * @new_sector: The starting sector for the new bio
 *
 * The bio request @bio in request queue @q needs to be split into two
 * bio requests. The newly created @bio request starts at
 * @new_sector. This split may be required due to hardware limitation
 * such as operation crossing device boundaries in a RAID system.
 *
 * NOTE(review): @new_sector is an unsigned int in the prototype but is
 * stored in a sector_t field; if callers pass a wider sector value it is
 * narrowed by the prototype before it is recorded - confirm against the
 * callers and registered probes before changing the type.
 */
TRACE_EVENT(block_split,

        TP_PROTO(struct request_queue *q, struct bio *bio,
                 unsigned int new_sector),

        TP_ARGS(q, bio, new_sector),

        TP_STRUCT__entry(
                __field( dev_t,                dev                                )
                __field( sector_t,        sector                                )
                __field( sector_t,        new_sector                        )
                __array( char,                rwbs,                RWBS_LEN        )
                __array( char,                comm,                TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->new_sector        = new_sector;
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        /* Output: "major,minor rwbs sector / new_sector [comm]" */
        TP_printk("%d,%d %s %llu / %llu [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  (unsigned long long)__entry->new_sector,
                  __entry->comm)
);

/**
 * block_bio_remap - map request for a logical device to the raw device
 * @q: queue holding the operation
 * @bio: revised operation
 * @dev: device for the operation
 * @from: original sector for the operation
 *
 * An operation for a logical device has been mapped to the
 * raw block device.
 *
 * The current device, sector and sector count are taken from @bio, while
 * @dev and @from are recorded as old_dev and old_sector.  @q is not
 * stored in the entry.
 */
TRACE_EVENT(block_bio_remap,

        TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
                 sector_t from),

        TP_ARGS(q, bio, dev, from),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned int,        nr_sector        )
                __field( dev_t,                old_dev                )
                __field( sector_t,        old_sector        )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                __entry->old_dev        = dev;
                __entry->old_sector        = from;
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
        ),

        /* Output: "major,minor rwbs sector + nr_sector <- (old major,old minor) old_sector" */
        TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector,
                  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
                  (unsigned long long)__entry->old_sector)
);

/**
 * block_rq_remap - map request for a block operation request
 * @rq: block IO operation request
 * @dev: device for the operation
 * @from: original sector for the operation
 *
 * The block operation request @rq has been remapped.  The block
 * operation request @rq holds the current information, while @dev and
 * @from are recorded as the old device and the original sector.
 *
 * The entry also records the value returned by blk_rq_count_bios() for
 * @rq in nr_bios.
 */
TRACE_EVENT(block_rq_remap,

        TP_PROTO(struct request *rq, dev_t dev, sector_t from),

        TP_ARGS(rq, dev, from),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned int,        nr_sector        )
                __field( dev_t,                old_dev                )
                __field( sector_t,        old_sector        )
                __field( unsigned int,        nr_bios                )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = disk_devt(rq->rq_disk);
                __entry->sector                = blk_rq_pos(rq);
                __entry->nr_sector        = blk_rq_sectors(rq);
                __entry->old_dev        = dev;
                __entry->old_sector        = from;
                __entry->nr_bios        = blk_rq_count_bios(rq);
                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
        ),

        /* Output: "major,minor rwbs sector + nr_sector <- (old major,old minor) old_sector nr_bios" */
        TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector,
                  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
                  (unsigned long long)__entry->old_sector, __entry->nr_bios)
);

#endif /* _TRACE_BLOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>














































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Operations on the network namespace
 */
#ifndef __NET_NET_NAMESPACE_H
#define __NET_NET_NAMESPACE_H

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/sysctl.h>
#include <linux/uidgid.h>

#include <net/flow.h>
#include <net/netns/core.h>
#include <net/netns/mib.h>
#include <net/netns/unix.h>
#include <net/netns/packet.h>
#include <net/netns/ipv4.h>
#include <net/netns/ipv6.h>
#include <net/netns/nexthop.h>
#include <net/netns/ieee802154_6lowpan.h>
#include <net/netns/sctp.h>
#include <net/netns/dccp.h>
#include <net/netns/netfilter.h>
#include <net/netns/x_tables.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netns/conntrack.h>
#endif
#include <net/netns/nftables.h>
#include <net/netns/xfrm.h>
#include <net/netns/mpls.h>
#include <net/netns/can.h>
#include <net/netns/xdp.h>
#include <net/netns/bpf.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
#include <linux/skbuff.h>
#include <linux/notifier.h>

struct user_namespace;
struct proc_dir_entry;
struct net_device;
struct sock;
struct ctl_table_header;
struct net_generic;
struct uevent_sock;
struct netns_ipvs;
struct bpf_prog;


/*
 * NETDEV_HASHENTRIES is 2^NETDEV_HASHBITS (256).
 * NOTE(review): presumably the bucket count of the per-namespace
 * dev_name_head / dev_index_head hash tables - confirm against the users.
 */
#define NETDEV_HASHBITS    8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)

/*
 * struct net - state of one network namespace.
 *
 * Holds the namespace's reference counts, list linkage, owning user
 * namespace, and one embedded netns_* sub-structure per protocol or
 * subsystem; most of the latter are only present when the matching
 * CONFIG_* option is enabled.  The layout is randomized
 * (__randomize_layout), so do not rely on member order.
 */
struct net {
        /* First cache line can be often dirtied.
         * Do not place here read-mostly fields.
         */
        refcount_t                passive;        /* To decide when the network
                                                 * namespace should be freed.
                                                 */
        refcount_t                count;                /* To decide when the network
                                                 * namespace should be shut down.
                                                 */
        spinlock_t                rules_mod_lock;

        unsigned int                dev_unreg_count;

        unsigned int                dev_base_seq;        /* protected by rtnl_mutex */
        int                        ifindex;

        spinlock_t                nsid_lock;
        atomic_t                fnhe_genid;

        struct list_head        list;                /* list of network namespaces */
        struct list_head        exit_list;        /* Linked in order to call pernet
                                                 * exit methods on a dead net
                                                 * (pernet_ops_rwsem read locked),
                                                 * or to unregister pernet ops
                                                 * (pernet_ops_rwsem write locked).
                                                 */
        struct llist_node        cleanup_list;        /* namespaces on death row */

#ifdef CONFIG_KEYS
        struct key_tag                *key_domain;        /* Key domain of operation tag */
#endif
        struct user_namespace   *user_ns;        /* Owning user namespace */
        struct ucounts                *ucounts;
        struct idr                netns_ids;

        struct ns_common        ns;

        struct list_head         dev_base_head;
        struct proc_dir_entry         *proc_net;
        struct proc_dir_entry         *proc_net_stat;

#ifdef CONFIG_SYSCTL
        struct ctl_table_set        sysctls;
#endif

        struct sock                 *rtnl;                        /* rtnetlink socket */
        struct sock                *genl_sock;

        struct uevent_sock        *uevent_sock;                /* uevent socket */

        struct hlist_head         *dev_name_head;
        struct hlist_head        *dev_index_head;
        struct raw_notifier_head        netdev_chain;

        /* Note that @hash_mix can be read millions of times per second,
         * it is critical that it is on a read_mostly cache line.
         */
        u32                        hash_mix;

        struct net_device       *loopback_dev;          /* The loopback */

        /* core fib_rules */
        struct list_head        rules_ops;

        struct netns_core        core;
        struct netns_mib        mib;
        struct netns_packet        packet;
        struct netns_unix        unx;
        struct netns_nexthop        nexthop;
        struct netns_ipv4        ipv4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netns_ipv6        ipv6;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
        struct netns_ieee802154_lowpan        ieee802154_lowpan;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
        struct netns_sctp        sctp;
#endif
#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
        struct netns_dccp        dccp;
#endif
#ifdef CONFIG_NETFILTER
        struct netns_nf                nf;
        struct netns_xt                xt;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        struct netns_ct                ct;
#endif
#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
        struct netns_nftables        nft;
#endif
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
        struct netns_nf_frag        nf_frag;
        struct ctl_table_header *nf_frag_frags_hdr;
#endif
        struct sock                *nfnl;
        struct sock                *nfnl_stash;
#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT)
        struct list_head        nfnl_acct_list;
#endif
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
        struct list_head        nfct_timeout_list;
#endif
#endif
#ifdef CONFIG_WEXT_CORE
        struct sk_buff_head        wext_nlevents;
#endif
        struct net_generic __rcu        *gen;

        /* Used to store attached BPF programs */
        struct netns_bpf        bpf;

        /* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
        struct netns_xfrm        xfrm;
#endif

        atomic64_t                net_cookie; /* written once */

#if IS_ENABLED(CONFIG_IP_VS)
        struct netns_ipvs        *ipvs;
#endif
#if IS_ENABLED(CONFIG_MPLS)
        struct netns_mpls        mpls;
#endif
#if IS_ENABLED(CONFIG_CAN)
        struct netns_can        can;
#endif
#ifdef CONFIG_XDP_SOCKETS
        struct netns_xdp        xdp;
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        struct sock                *crypto_nlsk;
#endif
        struct sock                *diag_nlsk;
} __randomize_layout;

#include <linux/seq_file_net.h>

/* Init's network namespace */
extern struct net init_net;

#ifdef CONFIG_NET_NS
struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns,
                        struct net *old_net);

void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);

void net_ns_barrier(void);

struct ns_common *get_net_ns(struct ns_common *ns);
#else /* CONFIG_NET_NS */
#include <linux/sched.h>
#include <linux/nsproxy.h>
/*
 * !CONFIG_NET_NS variant: no new namespace can be created, so a request
 * carrying CLONE_NEWNET fails with -EINVAL; any other request simply gets
 * @old_net back.  @user_ns is not used.
 */
static inline struct net *copy_net_ns(unsigned long flags,
                                      struct user_namespace *user_ns,
                                      struct net *old_net)
{
        if (!(flags & CLONE_NEWNET))
                return old_net;

        return ERR_PTR(-EINVAL);
}

/*
 * !CONFIG_NET_NS variant: always reports GLOBAL_ROOT_UID and
 * GLOBAL_ROOT_GID through @uid and @gid; @net is not consulted.
 */
static inline void net_ns_get_ownership(const struct net *net,
                                        kuid_t *uid, kgid_t *gid)
{
        *gid = GLOBAL_ROOT_GID;
        *uid = GLOBAL_ROOT_UID;
}

/* !CONFIG_NET_NS variant: nothing to do. */
static inline void net_ns_barrier(void)
{
}

/* !CONFIG_NET_NS variant: always fails with -EINVAL; @ns is ignored. */
static inline struct ns_common *get_net_ns(struct ns_common *ns)
{
        return ERR_PTR(-EINVAL);
}
#endif /* CONFIG_NET_NS */


extern struct list_head net_namespace_list;

struct net *get_net_ns_by_pid(pid_t pid);
struct net *get_net_ns_by_fd(int fd);

u64 __net_gen_cookie(struct net *net);

/*
 * IPX sysctl hooks: declared as real functions when CONFIG_SYSCTL is set,
 * otherwise both names are macros that expand to nothing.
 */
#ifdef CONFIG_SYSCTL
void ipx_register_sysctl(void);
void ipx_unregister_sysctl(void);
#else
#define ipx_register_sysctl()
#define ipx_unregister_sysctl()
#endif

#ifdef CONFIG_NET_NS
void __put_net(struct net *net);

/*
 * Increment @net's ->count reference counter and hand @net back so the
 * call can be used inline in an expression.
 */
static inline struct net *get_net(struct net *net)
{
        refcount_inc(&net->count);

        return net;
}

/*
 * For callers that know the struct net exists but cannot be sure a
 * reference is still held on it: the ->count counter is only incremented
 * if it is not already zero.  Returns @net on success and NULL when the
 * counter was zero.
 */
static inline struct net *maybe_get_net(struct net *net)
{
        return refcount_inc_not_zero(&net->count) ? net : NULL;
}

/*
 * Decrement @net's ->count reference counter; when the decrement brings
 * it to zero, pass @net to __put_net().
 */
static inline void put_net(struct net *net)
{
        if (!refcount_dec_and_test(&net->count))
                return;

        __put_net(net);
}

/* Returns 1 when @net1 and @net2 are the same pointer, 0 otherwise. */
static inline int net_eq(const struct net *net1, const struct net *net2)
{
        if (net1 != net2)
                return 0;

        return 1;
}

/* Returns 1 while @net's ->count reference counter is non-zero, else 0. */
static inline int check_net(const struct net *net)
{
        return refcount_read(&net->count) ? 1 : 0;
}

void net_drop_ns(void *);

#else

/* !CONFIG_NET_NS variant: no reference counting, @net is returned as is. */
static inline struct net *get_net(struct net *net)
{
        struct net *same = net;

        return same;
}

/* !CONFIG_NET_NS variant: nothing to release. */
static inline void put_net(struct net *net) { }

/* !CONFIG_NET_NS variant: never fails, @net is returned unchanged. */
static inline struct net *maybe_get_net(struct net *net)
{
        struct net *same = net;

        return same;
}

/* !CONFIG_NET_NS variant: always returns 1, the arguments are not compared. */
static inline int net_eq(const struct net *net1, const struct net *net2)
{
        return 1;
}

/* !CONFIG_NET_NS variant: always returns 1; @net is not inspected. */
static inline int check_net(const struct net *net) { return 1; }

#define net_drop_ns NULL
#endif


/*
 * Holder for a struct net pointer that only has a member when
 * CONFIG_NET_NS is enabled; without it the struct is empty.
 * Access it through write_pnet() / read_pnet().
 */
typedef struct {
#ifdef CONFIG_NET_NS
        struct net *net;
#endif
} possible_net_t;

/*
 * Store @net in @pnet.  Without CONFIG_NET_NS the holder has no member
 * and this is a no-op.
 */
static inline void write_pnet(possible_net_t *pnet, struct net *net)
{
#ifdef CONFIG_NET_NS
        (*pnet).net = net;
#endif
}

/*
 * Fetch the namespace held in @pnet.  Without CONFIG_NET_NS the answer
 * is always &init_net.
 */
static inline struct net *read_pnet(const possible_net_t *pnet)
{
        struct net *held;

#ifdef CONFIG_NET_NS
        held = pnet->net;
#else
        held = &init_net;
#endif
        return held;
}

/*
 * Iterators over net_namespace_list, walking the ->list member of each
 * struct net; VAR is the struct net cursor.
 *
 * Protected by net_rwsem
 */
#define for_each_net(VAR)                                \
        list_for_each_entry(VAR, &net_namespace_list, list)
/* Walks backwards, continuing from the entry before the current VAR. */
#define for_each_net_continue_reverse(VAR)                \
        list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list)
/* Variant built on list_for_each_entry_rcu(). */
#define for_each_net_rcu(VAR)                                \
        list_for_each_entry_rcu(VAR, &net_namespace_list, list)

/*
 * Section annotations for per-namespace init/exit code and data.
 * With CONFIG_NET_NS they expand to nothing; without it they map to
 * __init, __ref, __initdata and __initconst respectively.
 */
#ifdef CONFIG_NET_NS
#define __net_init
#define __net_exit
#define __net_initdata
#define __net_initconst
#else
#define __net_init        __init
#define __net_exit        __ref
#define __net_initdata        __initdata
#define __net_initconst        __initconst
#endif

int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp);
int peernet2id(const struct net *net, struct net *peer);
bool peernet_has_id(const struct net *net, struct net *peer);
struct net *get_net_ns_by_id(const struct net *net, int id);

/*
 * struct pernet_operations - per network namespace init/exit callbacks.
 *
 * Instances are handed to register_pernet_subsys() or
 * register_pernet_device() and removed with the matching unregister call.
 */
struct pernet_operations {
        struct list_head list;
        /*
         * Below methods are called without any exclusive locks.
         * More than one net may be constructed and destructed
         * in parallel on several cpus. Every pernet_operations
         * have to keep in mind all other pernet_operations and
         * to introduce a locking, if they share common resources.
         *
         * The only time they are called with exclusive lock is
         * from register_pernet_subsys(), unregister_pernet_subsys()
         * register_pernet_device() and unregister_pernet_device().
         *
         * Exit methods using blocking RCU primitives, such as
         * synchronize_rcu(), should be implemented via exit_batch.
         * Then, destruction of a group of net requires single
         * synchronize_rcu() related to these pernet_operations,
         * instead of separate synchronize_rcu() for every net.
         * Please, avoid synchronize_rcu() at all, where it's possible.
         *
         * Note that a combination of pre_exit() and exit() can
         * be used, since a synchronize_rcu() is guaranteed between
         * the calls.
         */
        int (*init)(struct net *net);
        void (*pre_exit)(struct net *net);
        void (*exit)(struct net *net);
        void (*exit_batch)(struct list_head *net_exit_list);
        /*
         * NOTE(review): @id and @size presumably describe per-namespace
         * private storage allocated for these operations - confirm against
         * the registration code.
         */
        unsigned int *id;
        size_t size;
};

/*
 * Use these carefully.  If you implement a network device and it
 * needs per network namespace operations use device pernet operations,
 * otherwise use pernet subsys operations.
 *
 * Network interfaces need to be removed from a dying netns _before_
 * subsys notifiers can be called, as most of the network code cleanup
 * (which is done from subsys notifiers) runs with the assumption that
 * dev_remove_pack has been called so no new packets will arrive during
 * and after the cleanup functions have been called.  dev_remove_pack
 * is not per namespace so instead the guarantee of no more packets
 * arriving in a network namespace is provided by ensuring that all
 * network devices and all sockets have left the network namespace
 * before the cleanup methods are called.
 *
 * For the longest time the ipv4 icmp code was registered as a pernet
 * device which caused kernel oops, and panics during network
 * namespace cleanup.   So please don't get this wrong.
 */
int register_pernet_subsys(struct pernet_operations *);
void unregister_pernet_subsys(struct pernet_operations *);
int register_pernet_device(struct pernet_operations *);
void unregister_pernet_device(struct pernet_operations *);

struct ctl_table;
struct ctl_table_header;

#ifdef CONFIG_SYSCTL
int net_sysctl_init(void);
struct ctl_table_header *register_net_sysctl(struct net *net, const char *path,
                                             struct ctl_table *table);
void unregister_net_sysctl_table(struct ctl_table_header *header);
#else
/* !CONFIG_SYSCTL variant: nothing to set up, always reports success (0). */
static inline int net_sysctl_init(void)
{
        return 0;
}
/* !CONFIG_SYSCTL variant: registers nothing and always returns NULL. */
static inline struct ctl_table_header *
register_net_sysctl(struct net *net, const char *path, struct ctl_table *table)
{
        return NULL;
}
/* !CONFIG_SYSCTL variant: nothing was registered, so nothing to undo. */
static inline void unregister_net_sysctl_table(struct ctl_table_header *header) { }
#endif

/* Returns the current value of the namespace's ipv4.rt_genid counter. */
static inline int rt_genid_ipv4(const struct net *net)
{
        int genid = atomic_read(&net->ipv4.rt_genid);

        return genid;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Returns the current value of the namespace's ipv6.fib6_sernum counter. */
static inline int rt_genid_ipv6(const struct net *net)
{
        int sernum = atomic_read(&net->ipv6.fib6_sernum);

        return sernum;
}
#endif

/* Atomically increments the namespace's ipv4.rt_genid counter. */
static inline void rt_genid_bump_ipv4(struct net *net)
{
        atomic_inc(&(net->ipv4.rt_genid));
}

/* Hook pointer; may be NULL, in which case the bump below does nothing. */
extern void (*__fib6_flush_trees)(struct net *net);

/* Invokes the __fib6_flush_trees hook for @net if one is installed. */
static inline void rt_genid_bump_ipv6(struct net *net)
{
        if (!__fib6_flush_trees)
                return;

        __fib6_flush_trees(net);
}

#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
/* Returns a pointer to the ieee802154_lowpan member embedded in @net. */
static inline struct netns_ieee802154_lowpan *
net_ieee802154_lowpan(struct net *net)
{
        struct netns_ieee802154_lowpan *lowpan = &net->ieee802154_lowpan;

        return lowpan;
}
#endif

/*
 * Convenience wrapper for callers that do not distinguish between the
 * address families: bumps IPv4 first, then IPv6.
 */
static inline void rt_genid_bump_all(struct net *net)
{
        rt_genid_bump_ipv4(net);

        rt_genid_bump_ipv6(net);
}

/* Returns the current value of the namespace's fnhe_genid counter. */
static inline int fnhe_genid(const struct net *net)
{
        int genid = atomic_read(&net->fnhe_genid);

        return genid;
}

/* Atomically increments the namespace's fnhe_genid counter. */
static inline void fnhe_genid_bump(struct net *net)
{
        atomic_inc(&(net->fnhe_genid));
}

#endif /* __NET_NET_NAMESPACE_H */

















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  include/linux/eventpoll.h ( Efficient event polling implementation )
 *  Copyright (C) 2001,...,2006         Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */
#ifndef _LINUX_EVENTPOLL_H
#define _LINUX_EVENTPOLL_H

#include <uapi/linux/eventpoll.h>
#include <uapi/linux/kcmp.h>


/* Forward declarations to avoid compiler errors */
struct file;


#ifdef CONFIG_EPOLL

#ifdef CONFIG_KCMP
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
#endif

/*
 * Set up the epoll-related list heads embedded in @file
 * (f_ep_links and f_tfile_llink) as empty lists.
 */
static inline void eventpoll_init_file(struct file *file)
{
        INIT_LIST_HEAD(&(file->f_ep_links));
        INIT_LIST_HEAD(&(file->f_tfile_llink));
}


/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);

/*
 * Called from fs/file_table.c:__fput() so that a file which is closed
 * without having been removed from the eventpoll interface still gets
 * unlinked from it.
 */
static inline void eventpoll_release(struct file *file)
{
        /*
         * Lockless fast path that avoids taking and releasing the
         * semaphore.  Because the check runs outside the semaphore it
         * might return false negatives, but we don't care: it avoids the
         * lock in 99.99% of cases.  False positives simply cannot happen
         * because the file is on the way to be removed and nobody (but
         * eventpoll) still has a reference to it.
         *
         * If the list is not empty, the file is being closed while still
         * linked to an epoll descriptor, so unlink it from its containers.
         */
        if (!likely(list_empty(&file->f_ep_links)))
                eventpoll_release_file(file);
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 bool nonblock);

/*
 * Reports whether the epoll_ctl(2) operation @op needs an event copied
 * from userspace: every operation except EPOLL_CTL_DEL does.
 */
static inline int ep_op_has_event(int op)
{
        return !(op == EPOLL_CTL_DEL);
}

#else

/* !CONFIG_EPOLL variant: there is no epoll state in the file to set up. */
static inline void eventpoll_init_file(struct file *file)
{
}
/* !CONFIG_EPOLL variant: there is no epoll state in the file to release. */
static inline void eventpoll_release(struct file *file)
{
}

#endif

#endif /* #ifndef _LINUX_EVENTPOLL_H */
































































































































































































































































































































































































































































































































































































































































































































































    1 










    1 










    1 










    1 


















































































































































































































































































































































































































































































    1 










































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_H
#define _LINUX_PGTABLE_H

#include <linux/pfn.h>
#include <asm/pgtable.h>

#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU

#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <asm-generic/pgtable_uffd.h>

#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
        defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
#endif

/*
 * On almost all architectures and configurations, 0 can be used as the
 * upper ceiling to free_pgtables(): on many architectures it has the same
 * effect as using TASK_SIZE.  However, there is one configuration which
 * must impose a more careful limit, to avoid freeing kernel pgtables.
 */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING        0UL
#endif

/*
 * A page table page can be thought of as an array like this: pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address.
 *
 * As these functions may be used by the same code for different levels of
 * the page table folding, they are always available, regardless of
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
 */

/*
 * pte_index - index of the PTE-table entry that controls @address.
 * Shifts out the PAGE_SHIFT low bits and masks with PTRS_PER_PTE - 1.
 */
static inline unsigned long pte_index(unsigned long address)
{
        unsigned long page_nr = address >> PAGE_SHIFT;

        return page_nr & (PTRS_PER_PTE - 1);
}
#define pte_index pte_index

#ifndef pmd_index
/*
 * pmd_index - index of the PMD-table entry that controls @address.
 * Shifts out the PMD_SHIFT low bits and masks with PTRS_PER_PMD - 1.
 */
static inline unsigned long pmd_index(unsigned long address)
{
        unsigned long pmd_nr = address >> PMD_SHIFT;

        return pmd_nr & (PTRS_PER_PMD - 1);
}
#define pmd_index pmd_index
#endif

#ifndef pud_index
/*
 * pud_index - index of the PUD-table entry that controls @address.
 * Shifts out the PUD_SHIFT low bits and masks with PTRS_PER_PUD - 1.
 */
static inline unsigned long pud_index(unsigned long address)
{
        unsigned long pud_nr = address >> PUD_SHIFT;

        return pud_nr & (PTRS_PER_PUD - 1);
}
#define pud_index pud_index
#endif

#ifndef pgd_index
/*
 * pgd_index - index of the PGD entry that controls address @a.
 * Must be a compile-time constant, so implement it as a macro.
 */
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif

#ifndef pte_offset_kernel
/*
 * pte_offset_kernel - pointer to the PTE for @address inside the PTE
 * table referenced by *@pmd (table address taken from pmd_page_vaddr()).
 */
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
        pte_t *pte_table = (pte_t *)pmd_page_vaddr(*pmd);

        return &pte_table[pte_index(address)];
}
#define pte_offset_kernel pte_offset_kernel
#endif

/*
 * pte_offset_map()/pte_unmap(): with CONFIG_HIGHPTE the PTE page is mapped
 * through kmap_atomic() and released with kunmap_atomic(); otherwise
 * pte_offset_map() is pte_offset_kernel() and pte_unmap() only evaluates
 * its argument.
 */
#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address)                                \
        ((pte_t *)kmap_atomic(pmd_page(*(dir))) +                \
         pte_index((address)))
#define pte_unmap(pte) kunmap_atomic((pte))
#else
#define pte_offset_map(dir, address)        pte_offset_kernel((dir), (address))
#define pte_unmap(pte) ((void)(pte))        /* NOP */
#endif

/* Find an entry in the second-level page table.. */
#ifndef pmd_offset
/*
 * pmd_offset - pointer to the PMD entry for @address inside the table
 * that pud_pgtable() returns for *@pud.
 */
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
        pmd_t *pmd_table = pud_pgtable(*pud);

        return &pmd_table[pmd_index(address)];
}
#define pmd_offset pmd_offset
#endif

#ifndef pud_offset
/*
 * pud_offset - pointer to the PUD entry for @address inside the table
 * that p4d_pgtable() returns for *@p4d.
 */
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
        pud_t *pud_table = p4d_pgtable(*p4d);

        return &pud_table[pud_index(address)];
}
#define pud_offset pud_offset
#endif

/*
 * pgd_offset_pgd - pointer to the entry for @address within the
 * top-level table that starts at @pgd.
 */
static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
        return pgd + pgd_index(address);
}

/*
 * A shortcut to get the pgd_t entry pointer for @address in a given mm:
 * indexes the table at (mm)->pgd via pgd_offset_pgd().
 */
#ifndef pgd_offset
#define pgd_offset(mm, address)                pgd_offset_pgd((mm)->pgd, (address))
#endif

/*
 * A shortcut which implies the use of the kernel's pgd (init_mm), instead
 * of a process's.
 */
#ifndef pgd_offset_k
#define pgd_offset_k(address)                pgd_offset(&init_mm, (address))
#endif

/*
 * In many cases it is known that a virtual address is mapped at PMD or PTE
 * level, so instead of traversing all the page table levels, we can get a
 * pointer to the PMD entry in user or kernel page table or translate a virtual
 * address to the pointer in the PTE in the kernel page tables with simple
 * helpers.
 */
static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
}

static inline pmd_t *pmd_off_k(unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
}

/*
 * virt_to_kpte - kernel PTE pointer for @vaddr.
 * Returns NULL when the PMD entry covering @vaddr is none.
 */
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
        pmd_t *pmd = pmd_off_k(vaddr);

        if (pmd_none(*pmd))
                return NULL;

        return pte_offset_kernel(pmd, vaddr);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);
#endif

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub: callers are expected to be compiled
 * out; BUILD_BUG() is there to catch any that are not. The return value
 * is only a placeholder.
 */
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmdp,
                                        pmd_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
/* PUD-level counterpart of the stub above; same BUILD_BUG() placeholder. */
static inline int pudp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pud_t *pudp,
                                        pud_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
/*
 * ptep_test_and_clear_young - generic test-and-clear of the young bit.
 * Reads *@ptep once; if the entry is young, writes back an old copy via
 * set_pte_at() and returns 1, otherwise leaves it alone and returns 0.
 */
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pte_t *ptep)
{
        pte_t entry = *ptep;

        if (!pte_young(entry))
                return 0;

        set_pte_at(vma->vm_mm, address, ptep, pte_mkold(entry));
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * pmdp_test_and_clear_young - generic test-and-clear of the young bit on
 * a PMD entry. Reads *@pmdp once; if it is young, writes back an old copy
 * via set_pmd_at() and returns 1, otherwise returns 0 without writing.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        pmd_t entry = *pmdp;

        if (!pmd_young(entry))
                return 0;

        set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(entry));
        return 1;
}
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub: callers are expected to be compiled
 * out; BUILD_BUG() is there to catch any that are not.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
#else
/*
 * Although relevant to THP only, this API is called from generic rmap code
 * under PageTransHuge(), hence it needs a dummy implementation for !THP.
 * The call is expected to be compiled out; BUILD_BUG() catches it otherwise.
 */
static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * ptep_get_and_clear - generic fallback: return the current value of
 * *@ptep and clear the entry with pte_clear().
 *
 * The read and the clear are two separate steps here.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long address,
                                       pte_t *ptep)
{
        /* Snapshot the entry before it is cleared. */
        pte_t pte = *ptep;
        pte_clear(mm, address, ptep);
        return pte;
}
#endif

#ifndef __HAVE_ARCH_PTEP_GET
/* ptep_get - generic fallback: read the entry at @ptep with READ_ONCE(). */
static inline pte_t ptep_get(pte_t *ptep)
{
        return READ_ONCE(*ptep);
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * pmdp_huge_get_and_clear - generic fallback: return the current value of
 * *@pmdp and clear the entry with pmd_clear(). @mm and @address are not
 * used by this version.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        /* Snapshot the entry before it is cleared. */
        pmd_t pmd = *pmdp;
        pmd_clear(pmdp);
        return pmd;
}
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * pudp_huge_get_and_clear - generic fallback: return the current value of
 * *@pudp and clear the entry with pud_clear(). @mm and @address are not
 * used by this version.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pud_t *pudp)
{
        /* Snapshot the entry before it is cleared. */
        pud_t pud = *pudp;

        pud_clear(pudp);
        return pud;
}
#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
/*
 * pmdp_huge_get_and_clear_full - generic fallback that ignores @full and
 * forwards to pmdp_huge_get_and_clear() on @vma's mm.
 */
static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pmd_t *pmdp,
                                            int full)
{
        return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
}
#endif

#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
/*
 * pudp_huge_get_and_clear_full - generic fallback that ignores @full and
 * forwards to pudp_huge_get_and_clear(). Note that, unlike the PMD
 * variant, this one takes the mm directly rather than a vma.
 */
static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long address, pud_t *pudp,
                                            int full)
{
        return pudp_huge_get_and_clear(mm, address, pudp);
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/*
 * ptep_get_and_clear_full - generic fallback that ignores @full and
 * simply forwards to ptep_get_and_clear().
 */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long address, pte_t *ptep,
                                            int full)
{
        return ptep_get_and_clear(mm, address, ptep);
}
#endif


/*
 * If two threads concurrently fault on the same page, the thread that
 * wins the race updates the PTE and its local TLB/cache. The other thread
 * gives up, does nothing, and continues; on architectures where software
 * can update the TLB, the local TLB can be updated here to avoid another
 * page fault. This function updates the TLB only and does nothing with the
 * cache or anything else; that is the difference from update_mmu_cache().
 */
#ifndef __HAVE_ARCH_UPDATE_MMU_TLB
static inline void update_mmu_tlb(struct vm_area_struct *vma,
                                unsigned long address, pte_t *ptep)
{
        /* Generic fallback: nothing to do. */
}
#define __HAVE_ARCH_UPDATE_MMU_TLB
#endif

/*
 * Some architectures may be able to avoid expensive synchronization
 * primitives when modifications are made to PTE's which are already
 * not present, or in the process of an address space destruction.
 */
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
/*
 * pte_clear_not_present_full - generic fallback: clears the entry with
 * plain pte_clear(); @full is ignored.
 */
static inline void pte_clear_not_present_full(struct mm_struct *mm,
                                              unsigned long address,
                                              pte_t *ptep,
                                              int full)
{
        pte_clear(mm, address, ptep);
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pmd_t *pmdp);
extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pud_t *pudp);
#endif

#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
struct mm_struct;
/*
 * ptep_set_wrprotect - generic fallback: read the current entry and write
 * back a pte_wrprotect()'ed copy of it through set_pte_at().
 */
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
        pte_t old_pte = *ptep;
        set_pte_at(mm, address, ptep, pte_wrprotect(old_pte));
}
#endif

/*
 * On some architectures the hardware does not set the page access bit when
 * a memory page is accessed; it is the responsibility of software to set
 * this bit. Tracking the access bit this way costs an extra page fault. As
 * an optimization, the access bit can be set in all page fault paths on
 * these architectures.
 * To differentiate it from pte_mkyoung(), this helper is used on platforms
 * where software maintains the page access bit.
 */
#ifndef pte_sw_mkyoung
/* Generic fallback: returns @pte unchanged. */
static inline pte_t pte_sw_mkyoung(pte_t pte)
{
        return pte;
}
#define pte_sw_mkyoung        pte_sw_mkyoung
#endif

/*
 * Fallback aliases: any *_savedwrite helper that is not already defined
 * maps onto the corresponding plain write-bit helper
 * (p??_write / p??_mkwrite / p??_wrprotect).
 */
#ifndef pte_savedwrite
#define pte_savedwrite pte_write
#endif

#ifndef pte_mk_savedwrite
#define pte_mk_savedwrite pte_mkwrite
#endif

#ifndef pte_clear_savedwrite
#define pte_clear_savedwrite pte_wrprotect
#endif

#ifndef pmd_savedwrite
#define pmd_savedwrite pmd_write
#endif

#ifndef pmd_mk_savedwrite
#define pmd_mk_savedwrite pmd_mkwrite
#endif

#ifndef pmd_clear_savedwrite
#define pmd_clear_savedwrite pmd_wrprotect
#endif

#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * pmdp_set_wrprotect - generic fallback: read the current PMD entry and
 * write back a pmd_wrprotect()'ed copy of it through set_pmd_at().
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        pmd_t old_pmd = *pmdp;
        set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
}
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub: callers are expected to be compiled
 * out; BUILD_BUG() is there to catch any that are not.
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * pudp_set_wrprotect - generic fallback: read the current PUD entry and
 * write back a pud_wrprotect()'ed copy of it through set_pud_at().
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        pud_t old_pud = *pudp;

        set_pud_at(mm, address, pudp, pud_wrprotect(old_pud));
}
#else
/*
 * !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD stub: callers are expected to
 * be compiled out; BUILD_BUG() is there to catch any that are not.
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        BUILD_BUG();
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif

#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub: callers are expected to be compiled
 * out; BUILD_BUG() is there to catch any that are not. The returned *@pmdp
 * is only a placeholder value of the right type.
 */
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pmd_t *pmdp)
{
        BUILD_BUG();
        return *pmdp;
}
#define pmdp_collapse_flush pmdp_collapse_flush
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                       pgtable_t pgtable);
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * This is an implementation of pmdp_establish() that is only suitable for an
 * architecture that doesn't have hardware dirty/accessed bits. In this case we
 * can't race with a CPU which sets these bits, and a non-atomic approach is
 * fine.
 *
 * Installs @pmd at @pmdp with set_pmd_at() and returns the value that was
 * there before.
 */
static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        pmd_t old_pmd = *pmdp;
        set_pmd_at(vma->vm_mm, address, pmdp, pmd);
        return old_pmd;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PTE_SAME
/* pte_same - generic fallback: nonzero iff the raw pte_val() values match. */
static inline int pte_same(pte_t pte_a, pte_t pte_b)
{
        return pte_val(pte_a) == pte_val(pte_b);
}
#endif

#ifndef __HAVE_ARCH_PTE_UNUSED
/*
 * Some architectures provide facilities to virtualization guests
 * so that they can flag allocated pages as unused. This allows the
 * host to transparently reclaim unused pages. This function returns
 * whether the pte's page is unused.
 *
 * The generic fallback below always returns 0.
 */
static inline int pte_unused(pte_t pte)
{
        return 0;
}
#endif

/*
 * Generic p??_access_permitted() fallbacks, one per page-table level:
 * the entry must be present and, when @write is nonzero, also writable.
 */
#ifndef pte_access_permitted
#define pte_access_permitted(pte, write) \
        (pte_present(pte) && (!(write) || pte_write(pte)))
#endif

#ifndef pmd_access_permitted
#define pmd_access_permitted(pmd, write) \
        (pmd_present(pmd) && (!(write) || pmd_write(pmd)))
#endif

#ifndef pud_access_permitted
#define pud_access_permitted(pud, write) \
        (pud_present(pud) && (!(write) || pud_write(pud)))
#endif

#ifndef p4d_access_permitted
#define p4d_access_permitted(p4d, write) \
        (p4d_present(p4d) && (!(write) || p4d_write(p4d)))
#endif

#ifndef pgd_access_permitted
#define pgd_access_permitted(pgd, write) \
        (pgd_present(pgd) && (!(write) || pgd_write(pgd)))
#endif

#ifndef __HAVE_ARCH_PMD_SAME
/* pmd_same - generic fallback: nonzero iff the raw pmd_val() values match. */
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
        return pmd_val(pmd_a) == pmd_val(pmd_b);
}

/*
 * pud_same - nonzero iff the raw pud_val() values match. Note that this
 * sits under the same __HAVE_ARCH_PMD_SAME guard as pmd_same().
 */
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
        return pud_val(pud_a) == pud_val(pud_b);
}
#endif

#ifndef __HAVE_ARCH_P4D_SAME
/* p4d_same - generic fallback: nonzero iff the raw p4d_val() values match. */
static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
{
        return p4d_val(p4d_a) == p4d_val(p4d_b);
}
#endif

#ifndef __HAVE_ARCH_PGD_SAME
/* pgd_same - generic fallback: nonzero iff the raw pgd_val() values match. */
static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
{
        return pgd_val(pgd_a) == pgd_val(pgd_b);
}
#endif

/*
 * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
 * TLB flush will be required as a result of the "set". For example, use
 * in scenarios where it is known ahead of time that the routine is
 * setting non-present entries, or re-setting an existing entry to the
 * same value. Otherwise, use the typical "set" helpers and flush the
 * TLB.
 */
/*
 * set_pte_safe - set_pte() with a sanity check: warns (once) if the slot
 * already holds a present entry that differs from the new value.
 * Arguments are parenthesized so that an expression such as "tbl + i" is
 * dereferenced as a whole. Both arguments are evaluated more than once,
 * so they must be free of side effects.
 */
#define set_pte_safe(ptep, pte) \
({ \
        WARN_ON_ONCE(pte_present(*(ptep)) && !pte_same(*(ptep), (pte))); \
        set_pte((ptep), (pte)); \
})

/*
 * set_pmd_safe - set_pmd() with a sanity check: warns (once) if the slot
 * already holds a present entry that differs from the new value.
 * Arguments are parenthesized so that an expression such as "tbl + i" is
 * dereferenced as a whole. Both arguments are evaluated more than once,
 * so they must be free of side effects.
 */
#define set_pmd_safe(pmdp, pmd) \
({ \
        WARN_ON_ONCE(pmd_present(*(pmdp)) && !pmd_same(*(pmdp), (pmd))); \
        set_pmd((pmdp), (pmd)); \
})

/*
 * set_pud_safe - set_pud() with a sanity check: warns (once) if the slot
 * already holds a present entry that differs from the new value.
 * Arguments are parenthesized so that an expression such as "tbl + i" is
 * dereferenced as a whole. Both arguments are evaluated more than once,
 * so they must be free of side effects.
 */
#define set_pud_safe(pudp, pud) \
({ \
        WARN_ON_ONCE(pud_present(*(pudp)) && !pud_same(*(pudp), (pud))); \
        set_pud((pudp), (pud)); \
})

/*
 * set_p4d_safe - set_p4d() with a sanity check: warns (once) if the slot
 * already holds a present entry that differs from the new value.
 * Arguments are parenthesized so that an expression such as "tbl + i" is
 * dereferenced as a whole. Both arguments are evaluated more than once,
 * so they must be free of side effects.
 */
#define set_p4d_safe(p4dp, p4d) \
({ \
        WARN_ON_ONCE(p4d_present(*(p4dp)) && !p4d_same(*(p4dp), (p4d))); \
        set_p4d((p4dp), (p4d)); \
})

/*
 * set_pgd_safe - set_pgd() with a sanity check: warns (once) if the slot
 * already holds a present entry that differs from the new value.
 * Arguments are parenthesized so that an expression such as "tbl + i" is
 * dereferenced as a whole. Both arguments are evaluated more than once,
 * so they must be free of side effects.
 */
#define set_pgd_safe(pgdp, pgd) \
({ \
        WARN_ON_ONCE(pgd_present(*(pgdp)) && !pgd_same(*(pgdp), (pgd))); \
        set_pgd((pgdp), (pgd)); \
})

#ifndef __HAVE_ARCH_DO_SWAP_PAGE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_do_swap_page() can restore this
 * metadata when a page is swapped back in.
 */
static inline void arch_do_swap_page(struct mm_struct *mm,
                                     struct vm_area_struct *vma,
                                     unsigned long addr,
                                     pte_t pte, pte_t oldpte)
{
        /* Generic fallback: no per-page metadata to restore. */
}
#endif

#ifndef __HAVE_ARCH_UNMAP_ONE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_unmap_one() can save this
 * metadata on a swap-out of a page.
 */
/* Generic fallback: nothing to save; always returns 0. */
static inline int arch_unmap_one(struct mm_struct *mm,
                                  struct vm_area_struct *vma,
                                  unsigned long addr,
                                  pte_t orig_pte)
{
        return 0;
}
#endif

/*
 * Allow architectures to preserve additional metadata associated with
 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
 * prototypes must be defined in the arch-specific asm/pgtable.h file.
 */
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
/* Generic fallback: nothing to prepare; always returns 0. */
static inline int arch_prepare_to_swap(struct page *page)
{
        return 0;
}
#endif

#ifndef __HAVE_ARCH_SWAP_INVALIDATE
/* Generic fallback: no-op for a single swap slot (@type, @offset). */
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
}

/* Generic fallback: no-op for a whole swap area (@type). */
static inline void arch_swap_invalidate_area(int type)
{
}
#endif

#ifndef __HAVE_ARCH_SWAP_RESTORE
/* Generic fallback: no-op; no extra metadata to restore for @page. */
static inline void arch_swap_restore(swp_entry_t entry, struct page *page)
{
}
#endif

/* Generic fallback: same lookup as pgd_offset(). */
#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
#define pgd_offset_gate(mm, addr)        pgd_offset(mm, addr)
#endif

/* Generic fallback: the pte is returned unchanged; other arguments unused. */
#ifndef __HAVE_ARCH_MOVE_PTE
#define move_pte(pte, prot, old_addr, new_addr)        (pte)
#endif

/* Generic fallback: evaluates @pte and always yields 1; @mm is unused. */
#ifndef pte_accessible
# define pte_accessible(mm, pte)        ((void)(pte), 1)
#endif

/* Generic fallback: flush the single page with flush_tlb_page(). */
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif

/*
 * When walking page tables, get the address of the next boundary,
 * or the end address of the range if that comes earlier.  Although no
 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
 */

/*
 * pgd_addr_end - next PGDIR_SIZE-aligned boundary after @addr, or @end if
 * that comes first. Both sides of the comparison are decremented so that
 * a value of 0 (wrapped boundary or end) compares as the largest address.
 * @end is evaluated twice.
 */
#define pgd_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;  \
                                                                        \
        (__boundary - 1 >= (end) - 1) ? (end) : __boundary;             \
})

#ifndef p4d_addr_end
/*
 * p4d_addr_end - next P4D_SIZE-aligned boundary after @addr, or @end if
 * that comes first. Both sides of the comparison are decremented so that
 * a value of 0 (wrapped boundary or end) compares as the largest address.
 * @end is evaluated twice.
 */
#define p4d_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;      \
                                                                        \
        (__boundary - 1 >= (end) - 1) ? (end) : __boundary;             \
})
#endif

#ifndef pud_addr_end
/*
 * pud_addr_end - next PUD_SIZE-aligned boundary after @addr, or @end if
 * that comes first. Both sides of the comparison are decremented so that
 * a value of 0 (wrapped boundary or end) compares as the largest address.
 * @end is evaluated twice.
 */
#define pud_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;      \
                                                                        \
        (__boundary - 1 >= (end) - 1) ? (end) : __boundary;             \
})
#endif

#ifndef pmd_addr_end
/*
 * pmd_addr_end - next PMD_SIZE-aligned boundary after @addr, or @end if
 * that comes first. Both sides of the comparison are decremented so that
 * a value of 0 (wrapped boundary or end) compares as the largest address.
 * @end is evaluated twice.
 */
#define pmd_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;      \
                                                                        \
        (__boundary - 1 >= (end) - 1) ? (end) : __boundary;             \
})
#endif

/*
 * When walking page tables, we usually want to skip any p?d_none entries;
 * and any p?d_bad entries - reporting the error before resetting to none.
 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
 */
void pgd_clear_bad(pgd_t *);

#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *);
#else
#define p4d_clear_bad(p4d)        do { } while (0)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *);
#else
/* PUD level is folded: there is no separate PUD table to clear. */
#define pud_clear_bad(pud)        do { } while (0)
#endif

void pmd_clear_bad(pmd_t *);

/*
 * pgd_none_or_clear_bad - tell a page-table walker whether to skip @pgd.
 * Returns 1 if the entry is none, or if it is bad (after handing it to
 * pgd_clear_bad()); returns 0 for an entry that is neither.
 */
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
{
        if (!pgd_none(*pgd)) {
                if (unlikely(pgd_bad(*pgd)))
                        pgd_clear_bad(pgd);
                else
                        return 0;
        }
        return 1;
}

/*
 * p4d_none_or_clear_bad - tell a page-table walker whether to skip @p4d.
 * Returns 1 if the entry is none, or if it is bad (after handing it to
 * p4d_clear_bad()); returns 0 for an entry that is neither.
 */
static inline int p4d_none_or_clear_bad(p4d_t *p4d)
{
        if (!p4d_none(*p4d)) {
                if (unlikely(p4d_bad(*p4d)))
                        p4d_clear_bad(p4d);
                else
                        return 0;
        }
        return 1;
}

/*
 * pud_none_or_clear_bad - tell a page-table walker whether to skip @pud.
 * Returns 1 if the entry is none, or if it is bad (after handing it to
 * pud_clear_bad()); returns 0 for an entry that is neither.
 */
static inline int pud_none_or_clear_bad(pud_t *pud)
{
        if (!pud_none(*pud)) {
                if (unlikely(pud_bad(*pud)))
                        pud_clear_bad(pud);
                else
                        return 0;
        }
        return 1;
}

/*
 * pmd_none_or_clear_bad - tell a page-table walker whether to skip @pmd.
 * Returns 1 if the entry is none, or if it is bad (after handing it to
 * pmd_clear_bad()); returns 0 for an entry that is neither.
 */
static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
        if (!pmd_none(*pmd)) {
                if (unlikely(pmd_bad(*pmd)))
                        pmd_clear_bad(pmd);
                else
                        return 0;
        }
        return 1;
}

/*
 * __ptep_modify_prot_start - first half of a pte protection update:
 * returns the current entry and leaves the slot cleared, using
 * ptep_get_and_clear() on @vma's mm.
 */
static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep)
{
        /*
         * Get the current pte state, but zero it out to make it
         * non-present, preventing the hardware from asynchronously
         * updating it.
         */
        return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}

/*
 * __ptep_modify_prot_commit - second half of a pte protection update:
 * installs @pte at @ptep with set_pte_at() on @vma's mm.
 */
static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep, pte_t pte)
{
        /*
         * The pte is non-present, so there's no hardware state to
         * preserve.
         */
        set_pte_at(vma->vm_mm, addr, ptep, pte);
}

#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Start a pte protection read-modify-write transaction, which
 * protects against asynchronous hardware modifications to the pte.
 * The intention is not to prevent the hardware from making pte
 * updates, but to prevent any updates it may make from being lost.
 *
 * This does not protect against other software modifications of the
 * pte; the appropriate pte lock must be held over the transaction.
 *
 * Note that this interface is intended to be batchable, meaning that
 * ptep_modify_prot_commit may not actually update the pte, but merely
 * queue the update to be done at some later time.  The update must be
 * actually committed before the pte lock is released, however.
 */
/* Generic version: forwards directly to __ptep_modify_prot_start(). */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep)
{
        return __ptep_modify_prot_start(vma, addr, ptep);
}

/*
 * Commit an update to a pte, leaving any hardware-controlled bits in
 * the PTE unmodified.
 */
/*
 * Generic version: forwards to __ptep_modify_prot_commit(); @old_pte is
 * accepted but not used here.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep, pte_t old_pte, pte_t pte)
{
        __ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
#endif /* CONFIG_MMU */

/*
 * No-op macros that just return the current protection value. Defined here
 * because these macros can be used even if CONFIG_MMU is not defined.
 */

/* Fallback: returns @prot unchanged. */
#ifndef pgprot_nx
#define pgprot_nx(prot)        (prot)
#endif

/* Fallback: returns @prot unchanged. */
#ifndef pgprot_noncached
#define pgprot_noncached(prot)        (prot)
#endif

/* Fallback: alias of pgprot_noncached. */
#ifndef pgprot_writecombine
#define pgprot_writecombine pgprot_noncached
#endif

/* Fallback: alias of pgprot_noncached. */
#ifndef pgprot_writethrough
#define pgprot_writethrough pgprot_noncached
#endif

/* Fallback: alias of pgprot_noncached. */
#ifndef pgprot_device
#define pgprot_device pgprot_noncached
#endif

/* Fallback: returns @prot unchanged. */
#ifndef pgprot_mhp
#define pgprot_mhp(prot)        (prot)
#endif

#ifdef CONFIG_MMU
#ifndef pgprot_modify
#define pgprot_modify pgprot_modify
/*
 * pgprot_modify - carry the noncached/writecombine/device attributes of
 * @oldprot over to @newprot.
 *
 * For each of the three attributes in turn: if applying it to @oldprot
 * leaves @oldprot's value unchanged (i.e. @oldprot already has it), the
 * same attribute is applied to @newprot. The checks are independent, so
 * more than one may fire. Returns the adjusted @newprot.
 */
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot)))
                newprot = pgprot_noncached(newprot);
        if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot)))
                newprot = pgprot_writecombine(newprot);
        if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot)))
                newprot = pgprot_device(newprot);
        return newprot;
}
#endif
#endif /* CONFIG_MMU */

/* Fallback: returns @prot unchanged. */
#ifndef pgprot_encrypted
#define pgprot_encrypted(prot)        (prot)
#endif

/* Fallback: returns @prot unchanged. */
#ifndef pgprot_decrypted
#define pgprot_decrypted(prot)        (prot)
#endif

/*
 * A facility to provide lazy MMU batching.  This allows PTE updates and
 * page invalidations to be delayed until a call to leave lazy MMU mode
 * is issued.  Some architectures may benefit from doing this, and it is
 * beneficial for both shadow and direct mode hypervisors, which may batch
 * the PTE updates which happen during this window.  Note that using this
 * interface requires that read hazards be removed from the code.  A read
 * hazard could result in the direct mode hypervisor case, since the actual
 * write to the page tables may not yet have taken place, so reads through
 * a raw PTE pointer after it has been modified are not guaranteed to be
 * up to date.  This mode can only be entered and left under the protection of
 * the page table locks for all page tables which may be modified.  In the UP
 * case, this is required so that preemption is disabled, and in the SMP case,
 * it must synchronize the delayed page table writes properly on other CPUs.
 */
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
/* Defaults: each of the three hooks expands to an empty statement. */
#define arch_enter_lazy_mmu_mode()        do {} while (0)
#define arch_leave_lazy_mmu_mode()        do {} while (0)
#define arch_flush_lazy_mmu_mode()        do {} while (0)
#endif /* !__HAVE_ARCH_ENTER_LAZY_MMU_MODE */

/*
 * A facility to provide batching of the reload of page tables and
 * other process state with the actual context switch code for
 * paravirtualized guests.  By convention, only one of the batched
 * update (lazy) modes (CPU, MMU) should be active at any given time,
 * entry should never be nested, and entry and exits should always be
 * paired.  This is for sanity of maintaining and reasoning about the
 * kernel code.  In this case, the exit (end of the context switch) is
 * in architecture-specific code, and so doesn't need a generic
 * definition.
 */
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
/* Default: an empty statement; @prev is not evaluated. */
#define arch_start_context_switch(prev)        do {} while (0)
#endif /* !__HAVE_ARCH_START_CONTEXT_SWITCH */

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
/*
 * Soft-dirty is configured but THP migration is not: only the pmd swap-entry
 * helpers are stubbed here. The mk/clear variants return the pmd unchanged
 * and the test always returns 0.
 */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
/*
 * Soft-dirty is not configured: every helper below is a stub. The tests
 * (*_soft_dirty) always return 0; the mk/clear variants return their
 * argument unchanged.
 */

/* Present-entry tests. */
static inline int pte_soft_dirty(pte_t pte)
{
        return 0;
}

static inline int pmd_soft_dirty(pmd_t pmd)
{
        return 0;
}

/* Present-entry set/clear. */
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}

/* pte swap-entry variants. */
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline int pte_swp_soft_dirty(pte_t pte)
{
        return 0;
}

static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        return pte;
}

/* pmd swap-entry variants. */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */

#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * memory type of pfn mappings specified by the remap_pfn_range,
 * vmf_insert_pfn.
 *
 * Without __HAVE_PFNMAP_TRACKING all of them are inline stubs: the int
 * variants always return 0 and the void variants do nothing.
 */

/*
 * track_pfn_remap is called when a _new_ pfn mapping is being established
 * by remap_pfn_range() for physical range indicated by pfn and size.
 */
static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                                  unsigned long pfn, unsigned long addr,
                                  unsigned long size)
{
        return 0;
}

/*
 * track_pfn_insert is called when a _new_ single pfn is established
 * by vmf_insert_pfn().
 */
static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                                    pfn_t pfn)
{
}

/*
 * track_pfn_copy is called when vma that is covering the pfnmap gets
 * copied through copy_page_range().
 */
static inline int track_pfn_copy(struct vm_area_struct *vma)
{
        return 0;
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
static inline void untrack_pfn(struct vm_area_struct *vma,
                               unsigned long pfn, unsigned long size)
{
}

/*
 * untrack_pfn_moved is called while mremapping a pfnmap for a new region.
 */
static inline void untrack_pfn_moved(struct vm_area_struct *vma)
{
}
#else /* __HAVE_PFNMAP_TRACKING */
/* Out-of-line implementations with the same signatures as the stubs above. */
extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                           unsigned long pfn, unsigned long addr,
                           unsigned long size);
extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                             pfn_t pfn);
extern int track_pfn_copy(struct vm_area_struct *vma);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                        unsigned long size);
extern void untrack_pfn_moved(struct vm_area_struct *vma);
#endif /* !__HAVE_PFNMAP_TRACKING */

#ifdef __HAVE_COLOR_ZERO_PAGE
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;
        /*
         * Unsigned subtraction: a pfn below zero_pfn wraps around to a huge
         * value and therefore fails the range test as well.
         */
        const unsigned long delta = pfn - zero_pfn;

        /* In range when within (zero_page_mask >> PAGE_SHIFT) pages of zero_pfn. */
        return delta <= (zero_page_mask >> PAGE_SHIFT);
}

#define my_zero_pfn(addr)        page_to_pfn(ZERO_PAGE(addr))

#else
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;

        /* Without zero-page colouring only zero_pfn itself matches. */
        if (pfn != zero_pfn)
                return 0;
        return 1;
}

static inline unsigned long my_zero_pfn(unsigned long addr)
{
        extern unsigned long zero_pfn;
        /* @addr is ignored: this variant always reports zero_pfn. */
        const unsigned long pfn = zero_pfn;

        return pfn;
}
#endif

#ifdef CONFIG_MMU

#ifndef CONFIG_TRANSPARENT_HUGEPAGE
/* Without THP no pmd is ever reported as transparent-huge. */
static inline int pmd_trans_huge(pmd_t pmd)
{
        return 0;
}
#ifndef pmd_write
/*
 * Fallback used when THP is off and the architecture has no pmd_write():
 * calling it hits BUG(). The return statement only satisfies the signature.
 */
static inline int pmd_write(pmd_t pmd)
{
        BUG();
        return 0;
}
#endif /* pmd_write */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef pud_write
/*
 * Fallback when the architecture has no pud_write(): calling it hits BUG().
 * The return statement only satisfies the signature.
 */
static inline int pud_write(pud_t pud)
{
        BUG();
        return 0;
}
#endif /* pud_write */

#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
/* Devmap tests are constant 0 unless both PTE_DEVMAP and THP are configured. */
static inline int pmd_devmap(pmd_t pmd)
{
        return 0;
}
static inline int pud_devmap(pud_t pud)
{
        return 0;
}
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif

/*
 * The condition below is logically the same as
 * "!THP || !HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD": in either case no pud is
 * ever reported as transparent-huge.
 */
#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
        (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
         !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
static inline int pud_trans_huge(pud_t pud)
{
        return 0;
}
#endif

/* See pmd_none_or_trans_huge_or_clear_bad for discussion. */
static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud)
{
        /* Take one snapshot so that every test below sees the same value. */
        pud_t entry = READ_ONCE(*pud);

        /* None, transparent-huge or devmap: report 1 without touching *pud. */
        if (pud_none(entry))
                return 1;
        if (pud_trans_huge(entry))
                return 1;
        if (pud_devmap(entry))
                return 1;

        /* Expected case: a sane entry. */
        if (likely(!pud_bad(entry)))
                return 0;

        /* Bad entry: have it cleared, then report 1 as well. */
        pud_clear_bad(pud);
        return 1;
}

/* See pmd_trans_unstable for discussion. */
static inline int pud_trans_unstable(pud_t *pud)
{
        /* Constant 0 unless both THP and arch PUD-level THP are configured. */
        int unstable = 0;

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        unstable = pud_none_or_trans_huge_or_dev_or_clear_bad(pud);
#endif
        return unstable;
}

#ifndef pmd_read_atomic
/*
 * Fallback pmd reader: a plain dereference of @pmdp. An architecture can
 * replace it by defining pmd_read_atomic itself.
 */
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
        /*
         * Depend on compiler for an atomic pmd read. NOTE: this is
         * only going to work, if the pmdval_t isn't larger than
         * an unsigned long.
         */
        return *pmdp;
}
#endif

/* Default: evaluates to false unless the architecture overrides it. */
#ifndef arch_needs_pgtable_deposit
#define arch_needs_pgtable_deposit() (false)
#endif
/*
 * This function is meant to be used by sites walking pagetables with
 * the mmap_lock held in read mode to protect against MADV_DONTNEED and
 * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
 * into a null pmd and the transhuge page fault can convert a null pmd
 * into an hugepmd or into a regular pmd (if the hugepage allocation
 * fails). While holding the mmap_lock in read mode the pmd becomes
 * stable and stops changing under us only if it's not null and not a
 * transhuge pmd. When those races occur and this function makes a
 * difference vs the standard pmd_none_or_clear_bad, the result is
 * undefined so behaving like if the pmd was none is safe (because it
 * can return none anyway). The compiler level barrier() is critically
 * important to compute the two checks atomically on the same pmdval.
 *
 * For 32bit kernels with a 64bit large pmd_t this automatically takes
 * care of reading the pmd atomically to avoid SMP race conditions
 * against pmd_populate() when the mmap_lock is held for reading by the
 * caller (a special atomic read not done by "gcc" as in the generic
 * version above, is also needed when THP is disabled because the page
 * fault can populate the pmd from under us).
 */
static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
{
        /*
         * Returns 1 when the pmd is none, transhuge, not present (only
         * checked with CONFIG_ARCH_ENABLE_THP_MIGRATION) or bad; a bad pmd
         * is additionally passed to pmd_clear_bad(). Returns 0 otherwise.
         */
        pmd_t pmdval = pmd_read_atomic(pmd);
        /*
         * The barrier will stabilize the pmdval in a register or on
         * the stack so that it will stop changing under the code.
         *
         * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE,
         * pmd_read_atomic is allowed to return a not atomic pmdval
         * (for example pointing to an hugepage that has never been
         * mapped in the pmd). The below checks will only care about
         * the low part of the pmd with 32bit PAE x86 anyway, with the
         * exception of pmd_none(). So the important thing is that if
         * the low part of the pmd is found null, the high part will
         * be also null or the pmd_none() check below would be
         * confused.
         */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        barrier();
#endif
        /*
         * !pmd_present() checks for pmd migration entries
         *
         * The complete check uses is_pmd_migration_entry() in linux/swapops.h
         * But using that requires moving current function and pmd_trans_unstable()
         * to linux/swapops.h to resolve dependency, which is too much code move.
         *
         * !pmd_present() is equivalent to is_pmd_migration_entry() currently,
         * because !pmd_present() pages can only be under migration not swapped
         * out.
         *
         * pmd_none() is preserved for future condition checks on pmd migration
         * entries and not confusing with this function name, although it is
         * redundant with !pmd_present().
         */
        if (pmd_none(pmdval) || pmd_trans_huge(pmdval) ||
                (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval)))
                return 1;
        /* All tests use the pmdval snapshot; only the clear touches *pmd. */
        if (unlikely(pmd_bad(pmdval))) {
                pmd_clear_bad(pmd);
                return 1;
        }
        return 0;
}

/*
 * This is a noop if Transparent Hugepage Support is not built into
 * the kernel. Otherwise it is equivalent to
 * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
 * places that already verified the pmd is not none and they want to
 * walk ptes while holding the mmap sem in read mode (write mode doesn't
 * need this). If THP is not enabled, the pmd can't go away under the
 * code even if MADV_DONTNEED runs, but if THP is enabled we need to
 * run a pmd_trans_unstable before walking the ptes after
 * split_huge_pmd returns (because it may have run when the pmd became
 * null, but then a page fault can map in a THP and not a regular page).
 */
static inline int pmd_trans_unstable(pmd_t *pmd)
{
        /* Constant 0 when THP is not built in. */
        int unstable = 0;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        unstable = pmd_none_or_trans_huge_or_clear_bad(pmd);
#endif
        return unstable;
}

#ifndef CONFIG_NUMA_BALANCING
/*
 * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
 * the only case the kernel cares is for NUMA balancing and is only ever set
 * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked
 * _PAGE_PROTNONE so by default, implement the helper as "always no". It
 * is the responsibility of the caller to distinguish between PROT_NONE
 * protections and NUMA hinting fault protections.
 */
static inline int pte_protnone(pte_t pte)
{
        return 0;
}

/* pmd counterpart of pte_protnone(): likewise always 0. */
static inline int pmd_protnone(pmd_t pmd)
{
        return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

#endif /* CONFIG_MMU */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

#ifndef __PAGETABLE_P4D_FOLDED
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
int p4d_clear_huge(p4d_t *p4d);
#else
/* P4D level folded: the p4d helpers are inline stubs that return 0. */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int p4d_clear_huge(p4d_t *p4d)
{
        return 0;
}
#endif /* !__PAGETABLE_P4D_FOLDED */

/* Out-of-line pud/pmd helpers and page-table freeing helpers. */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else        /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
/*
 * No architecture huge-vmap support: every helper is an inline stub that
 * ignores its arguments and returns 0.
 */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int p4d_clear_huge(p4d_t *p4d)
{
        return 0;
}
static inline int pud_clear_huge(pud_t *pud)
{
        return 0;
}
static inline int pmd_clear_huge(pmd_t *pmd)
{
        return 0;
}
static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
        return 0;
}
static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        return 0;
}
static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return 0;
}
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * ARCHes with special requirements for evicting THP backing TLB entries can
 * implement this. Otherwise also, it can help optimize normal TLB flush in
 * THP regime. Stock flush_tlb_range() typically has optimization to nuke the
 * entire TLB if flush span is greater than a threshold, which will
 * likely be true for a single huge page. Thus a single THP flush will
 * invalidate the entire TLB which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 */
#define flush_pmd_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#define flush_pud_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#else
/*
 * Without THP these expand to BUILD_BUG(), so a use that is not compiled
 * out is caught at build time rather than at run time.
 */
#define flush_pmd_tlb_range(vma, addr, end)        BUILD_BUG()
#define flush_pud_tlb_range(vma, addr, end)        BUILD_BUG()
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* !__HAVE_ARCH_FLUSH_PMD_TLB_RANGE */

/* Forward declaration: only a pointer to struct file is used below. */
struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                        unsigned long size, pgprot_t *vma_prot);

/* Empty stub when CONFIG_X86_ESPFIX64 is not set. */
#ifndef CONFIG_X86_ESPFIX64
static inline void init_espfix_bsp(void) { }
#endif

extern void __init pgtable_cache_init(void);

#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
/* Default: every pfn/prot combination is allowed. */
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
        return true;
}

/* Default: reports that there is no architecture pfn-modify check. */
static inline bool arch_has_pfn_modify_check(void)
{
        return false;
}
#endif /* !__HAVE_ARCH_PFN_MODIFY_ALLOWED */

/*
 * Architecture PAGE_KERNEL_* fallbacks
 *
 * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
 * because they really don't support them, or the port needs to be updated to
 * reflect the required functionality. Below are a set of relatively safe
 * fallbacks, as best effort, which we can count on in lieu of the architectures
 * not defining them on their own yet.
 */

/* Fallback: read-only kernel mappings use plain PAGE_KERNEL. */
#ifndef PAGE_KERNEL_RO
# define PAGE_KERNEL_RO PAGE_KERNEL
#endif

/* Fallback: executable kernel mappings use plain PAGE_KERNEL. */
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/*
 * Page Table Modification bits for pgtbl_mod_mask.
 *
 * These are used by the p?d_alloc_track*() set of functions and in the generic
 * vmalloc/ioremap code to track at which page-table levels entries have been
 * modified. Based on that the code can better decide when vmalloc and ioremap
 * mapping changes need to be synchronized to other page-tables in the system.
 */
/* Bit numbers, one per page-table level. */
#define                __PGTBL_PGD_MODIFIED        0
#define                __PGTBL_P4D_MODIFIED        1
#define                __PGTBL_PUD_MODIFIED        2
#define                __PGTBL_PMD_MODIFIED        3
#define                __PGTBL_PTE_MODIFIED        4

/* The matching single-bit masks, built from the bit numbers above. */
#define                PGTBL_PGD_MODIFIED        BIT(__PGTBL_PGD_MODIFIED)
#define                PGTBL_P4D_MODIFIED        BIT(__PGTBL_P4D_MODIFIED)
#define                PGTBL_PUD_MODIFIED        BIT(__PGTBL_PUD_MODIFIED)
#define                PGTBL_PMD_MODIFIED        BIT(__PGTBL_PMD_MODIFIED)
#define                PGTBL_PTE_MODIFIED        BIT(__PGTBL_PTE_MODIFIED)

/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;

#endif /* !__ASSEMBLY__ */

/*
 * Only 32-bit builds without their own definition get here: with a 64-bit
 * phys_addr_t the definition is mandatory (build error), otherwise the
 * value defaults to 32.
 */
#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
#ifdef CONFIG_PHYS_ADDR_T_64BIT
/*
 * ZSMALLOC needs to know the highest PFN on 32-bit architectures
 * with physical address space extension, but falls back to
 * BITS_PER_LONG otherwise.
 */
#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
#else
#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
#endif

/* Default: a build-time constant that simply mirrors the THP config option. */
#ifndef has_transparent_hugepage
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define has_transparent_hugepage() 1
#else
#define has_transparent_hugepage() 0
#endif
#endif

/*
 * On some architectures it depends on the mm if the p4d/pud or pmd
 * layer of the page table hierarchy is folded or not.
 */
/*
 * Defaults: folding is decided at build time from the matching
 * __PAGETABLE_*_FOLDED macro; the @mm argument is not used.
 */
#ifndef mm_p4d_folded
#define mm_p4d_folded(mm)        __is_defined(__PAGETABLE_P4D_FOLDED)
#endif

#ifndef mm_pud_folded
#define mm_pud_folded(mm)        __is_defined(__PAGETABLE_PUD_FOLDED)
#endif

#ifndef mm_pmd_folded
#define mm_pmd_folded(mm)        __is_defined(__PAGETABLE_PMD_FOLDED)
#endif

/*
 * Defaults: the pointer argument (@pgdp/@p4dp/@pudp) is ignored and the
 * regular p?d_offset() is applied to the address of the entry value that
 * was passed in (@pgd/@p4d/@pud).
 */
#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif

/*
 * p?d_leaf() - true if this entry is a final mapping to a physical address.
 * This differs from p?d_huge() by the fact that they are always available (if
 * the architecture supports large pages at the appropriate level) even
 * if CONFIG_HUGETLB_PAGE is not defined.
 * Only meaningful when called on a valid entry.
 */
/* Defaults: a level never reports a leaf unless the arch defines p?d_leaf(). */
#ifndef pgd_leaf
#define pgd_leaf(x)        0
#endif
#ifndef p4d_leaf
#define p4d_leaf(x)        0
#endif
#ifndef pud_leaf
#define pud_leaf(x)        0
#endif
#ifndef pmd_leaf
#define pmd_leaf(x)        0
#endif

#endif /* _LINUX_PGTABLE_H */
























































    1 


























    1 

















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_COUNTER_H
#define _LINUX_PERCPU_COUNTER_H
/*
 * A simple "approximate counter" for use in ext2 and ext3 superblocks.
 *
 * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
 */

#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/gfp.h>

#ifdef CONFIG_SMP

/* SMP layout: one shared 64-bit count plus a per-CPU array of 32-bit counters. */
struct percpu_counter {
        /* NOTE(review): presumably serialises updates of count; confirm in lib/percpu_counter.c */
        raw_spinlock_t lock;
        /* Shared value; this is what percpu_counter_read() returns. */
        s64 count;
#ifdef CONFIG_HOTPLUG_CPU
        struct list_head list;        /* All percpu_counters are on a list */
#endif
        /* Per-CPU s32 counters; NULL means "not initialized" (see percpu_counter_initialized()). */
        s32 __percpu *counters;
};

/* Batch value passed by percpu_counter_add() and percpu_counter_compare(). */
extern int percpu_counter_batch;

int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
                          struct lock_class_key *key);

/*
 * Wrapper around __percpu_counter_init() that supplies a static
 * lock_class_key; every expansion site gets its own key. Evaluates to the
 * int returned by __percpu_counter_init().
 */
#define percpu_counter_init(fbc, value, gfp)                                \
        ({                                                                \
                static struct lock_class_key __key;                        \
                                                                        \
                __percpu_counter_init(fbc, value, gfp, &__key);                \
        })

/* Out-of-line SMP implementation; the UP build provides inline versions. */
void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
                              s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);

static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        /* Compare against @rhs using the global percpu_counter_batch. */
        const s32 batch = percpu_counter_batch;

        return __percpu_counter_compare(fbc, rhs, batch);
}

static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        /* Add @amount using the global percpu_counter_batch. */
        const s32 batch = percpu_counter_batch;

        percpu_counter_add_batch(fbc, amount, batch);
}

static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        s64 total = __percpu_counter_sum(fbc);

        /* Clamp: a negative sum is reported as zero. */
        if (total < 0)
                total = 0;

        return total;
}

static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        /* Unclamped variant: whatever __percpu_counter_sum() reports. */
        s64 total = __percpu_counter_sum(fbc);

        return total;
}

static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        /* Plain read of the shared count; the per-CPU counters are not consulted. */
        const s64 snapshot = fbc->count;

        return snapshot;
}

/*
 * It is possible for the percpu_counter_read() to return a small negative
 * number for some counter which should never be negative.
 *
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        /*
         * Read fbc->count exactly once so that the sign test and the
         * returned value refer to the same snapshot.
         */
        const s64 snapshot = READ_ONCE(fbc->count);

        /* Negative readings are reported as zero. */
        return snapshot < 0 ? 0 : snapshot;
}

static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        return (fbc->counters != NULL);
}

#else /* !CONFIG_SMP */

/* UP layout: a single 64-bit count, with no lock and no per-CPU storage. */
struct percpu_counter {
        s64 count;
};

/* UP: store @amount in the count and return 0; @gfp is unused. */
static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
                                      gfp_t gfp)
{
        fbc->count = amount;
        return 0;
}

/* UP: nothing to release. */
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
}

/* UP: overwrite the count with @amount. */
static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        fbc->count = amount;
}

static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        /* Three-way compare of the count with @rhs: 1, 0 or -1. */
        if (fbc->count == rhs)
                return 0;

        return fbc->count > rhs ? 1 : -1;
}

static inline int
__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        /* @batch is unused here: forward to percpu_counter_compare(). */
        int cmp = percpu_counter_compare(fbc, rhs);

        return cmp;
}

/*
 * UP variant of percpu_counter_add(): add @amount to the single count.
 *
 * The read-modify-write of the 64-bit count must not be interleaved with an
 * interrupt handler updating the same counter. Disabling preemption alone
 * does not keep interrupt handlers out, so the update runs with local
 * interrupts disabled and the previous interrupt state is restored after.
 */
static inline void
percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        unsigned long flags;

        local_irq_save(flags);
        fbc->count += amount;
        local_irq_restore(flags);
}

static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        /* @batch is unused here: the add goes through percpu_counter_add(). */
        percpu_counter_add(fbc, amount);
}

/* UP: the count is the value. */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        return fbc->count;
}

/*
 * percpu_counter is intended to track positive numbers. In the UP case the
 * number should never be negative.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        /* Note: unlike the SMP variant, no clamping to zero is done here. */
        return fbc->count;
}

/* UP: same as percpu_counter_read_positive(). */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        return percpu_counter_read_positive(fbc);
}

/* UP: same as percpu_counter_read(). */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        return percpu_counter_read(fbc);
}

/* UP: always reports true. */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        return true;
}

/* UP: nothing to do. */
static inline void percpu_counter_sync(struct percpu_counter *fbc)
{
}
#endif        /* CONFIG_SMP */

/* Add 1 via percpu_counter_add(); shared by the SMP and UP builds. */
static inline void percpu_counter_inc(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, 1);
}

/* Add -1 via percpu_counter_add(). */
static inline void percpu_counter_dec(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, -1);
}

/* Subtract @amount by adding its negation via percpu_counter_add(). */
static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
{
        percpu_counter_add(fbc, -amount);
}

#endif /* _LINUX_PERCPU_COUNTER_H */

























































































































































































    1 


    1 




















































































































































    1 
    1 



    1 














    1 












































    1 








    1 

    1 


    1 


























































































































































    1 
















































































































































































































    1 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Macros for manipulating and testing page->flags
 */

#ifndef PAGE_FLAGS_H
#define PAGE_FLAGS_H

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mmdebug.h>
#ifndef __GENERATING_BOUNDS_H
#include <linux/mm_types.h>
#include <generated/bounds.h>
#endif /* !__GENERATING_BOUNDS_H */

/*
 * Various page->flags bits:
 *
 * PG_reserved is set for special pages. The "struct page" of such a page
 * should in general not be touched (e.g. set dirty) except by its owner.
 * Pages marked as PG_reserved include:
 * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
 *   initrd, HW tables)
 * - Pages reserved or allocated early during boot (before the page allocator
 *   was initialized). This includes (depending on the architecture) the
 *   initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
 *   much more. Once (if ever) freed, PG_reserved is cleared and they will
 *   be given to the page allocator.
 * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
 *   to read/write these pages might end badly. Don't touch!
 * - The zero page(s)
 * - Pages not added to the page allocator when onlining a section because
 *   they were excluded via the online_page_callback() or because they are
 *   PG_hwpoison.
 * - Pages allocated in the context of kexec/kdump (loaded kernel image,
 *   control pages, vmcoreinfo)
 * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
 *   not marked PG_reserved (as they might be in use by somebody else who does
 *   not respect the caching strategy).
 * - Pages part of an offline section (struct pages of offline sections should
 *   not be trusted as they will be initialized when first onlined).
 * - MCA pages on ia64
 * - Pages holding CPU notes for POWER Firmware Assisted Dump
 * - Device memory (e.g. PMEM, DAX, HMM)
 * Some PG_reserved pages will be excluded from the hibernation image.
 * PG_reserved does in general not hinder anybody from dumping or swapping
 * and is no longer required for remap_pfn_range(). ioremap might require it.
 * Consequently, PG_reserved for a page mapped into user space can indicate
 * the zero page, the vDSO, MMIO pages or device memory.
 *
 * The PG_private bitflag is set on pagecache pages if they contain filesystem
 * specific data (which is normally at page->private). It can be used by
 * private allocations for its own usage.
 *
 * During initiation of disk I/O, PG_locked is set. This bit is set before I/O
 * and cleared when writeback _starts_ or when read _completes_. PG_writeback
 * is set before writeback starts and cleared when it finishes.
 *
 * PG_locked also pins a page in pagecache, and blocks truncation of the file
 * while it is held.
 *
 * page_waitqueue(page) is a wait queue of all tasks waiting for the page
 * to become unlocked.
 *
 * PG_swapbacked is set when a page uses swap as a backing storage.  These are
 * usually PageAnon or shmem pages but please note that even anonymous pages
 * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
 * a result of MADV_FREE).
 *
 * PG_uptodate tells whether the page's contents are valid.  When a read
 * completes, the page becomes uptodate, unless a disk I/O error happened.
 *
 * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
 * file-backed pagecache (see mm/vmscan.c).
 *
 * PG_error is set to indicate that an I/O error occurred on this page.
 *
 * PG_arch_1 is an architecture specific page state bit.  The generic code
 * guarantees that this bit is cleared for a page when it first is entered into
 * the page cache.
 *
 * PG_hwpoison indicates that a page got corrupted in hardware and contains
 * data with incorrect ECC bits that triggered a machine check. Accessing is
 * not safe since it may cause another machine check. Don't touch!
 */

/*
 * Don't use the *_dontuse flags.  Use the macros.  Otherwise you'll break
 * locked- and dirty-page accounting.
 *
 * The page flags field is split into two parts, the main flags area
 * which extends from the low bits upwards, and the fields area which
 * extends from the high bits downwards.
 *
 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *
 * The fields area is reserved for fields mapping zone, node (for NUMA) and
 * SPARSEMEM section (for variants of SPARSEMEM that require section ids like
 * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
 */
enum pageflags {
        PG_locked,                /* Page is locked. Don't touch. */
        PG_referenced,
        PG_uptodate,
        PG_dirty,
        PG_lru,
        PG_active,
        PG_workingset,
        PG_waiters,                /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
        PG_error,
        PG_slab,
        PG_owner_priv_1,        /* Owner use. If pagecache, fs may use*/
        PG_arch_1,
        PG_reserved,
        PG_private,                /* If pagecache, has fs-private data */
        PG_private_2,                /* If pagecache, has fs aux data */
        PG_writeback,                /* Page is under writeback */
        PG_head,                /* A head page */
        PG_mappedtodisk,        /* Has blocks allocated on-disk */
        PG_reclaim,                /* To be reclaimed asap */
        PG_swapbacked,                /* Page is backed by RAM/swap */
        PG_unevictable,                /* Page is "unevictable"  */
#ifdef CONFIG_MMU
        PG_mlocked,                /* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
        PG_uncached,                /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
        PG_hwpoison,                /* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
        PG_young,
        PG_idle,
#endif
#ifdef CONFIG_64BIT
        PG_arch_2,
#endif
        __NR_PAGEFLAGS,

        /* Filesystems */
        PG_checked = PG_owner_priv_1,

        /* SwapBacked */
        PG_swapcache = PG_owner_priv_1,        /* Swap page: swp_entry_t in private */

        /* Two page bits are conscripted by FS-Cache to maintain local caching
         * state.  These bits are set on pages belonging to the netfs's inodes
         * when those inodes are being locally cached.
         */
        PG_fscache = PG_private_2,        /* page backed by cache */

        /* XEN */
        /* Pinned in Xen as a read-only pagetable page. */
        PG_pinned = PG_owner_priv_1,
        /* Pinned as part of domain save (see xen_mm_pin_all()). */
        PG_savepinned = PG_dirty,
        /* Has a grant mapping of another (foreign) domain's page. */
        PG_foreign = PG_owner_priv_1,
        /* Remapped by swiotlb-xen. */
        PG_xen_remapped = PG_owner_priv_1,

        /* SLOB */
        PG_slob_free = PG_private,

        /* Compound pages. Stored in first tail page's flags */
        PG_double_map = PG_workingset,

        /* non-lru isolated movable page */
        PG_isolated = PG_reclaim,

        /* Only valid for buddy pages. Used to track pages that are reported */
        PG_reported = PG_uptodate,
};

#ifndef __GENERATING_BOUNDS_H

struct page;        /* forward declaration */

/*
 * Resolve @page to the page that represents its compound page.
 *
 * Bit 0 of page->compound_head marks a tail page; the remaining bits then
 * hold the head page's address.  Any other page is returned unchanged.
 */
static inline struct page *compound_head(struct page *page)
{
        unsigned long word = READ_ONCE(page->compound_head);

        return unlikely(word & 1) ? (struct page *)(word - 1) : page;
}

/* Return 1 if bit 0 of page->compound_head is set (tail page), else 0. */
static __always_inline int PageTail(struct page *page)
{
        unsigned long word = READ_ONCE(page->compound_head);

        return word & 1;
}

/*
 * Return 1 if @page is part of a compound page: either PG_head is set in
 * its own flags or it is a tail page.  Returns 0 otherwise.
 */
static __always_inline int PageCompound(struct page *page)
{
        if (test_bit(PG_head, &page->flags))
                return 1;

        return PageTail(page);
}

/*
 * Value of page->flags that PagePoisoned() treats as "poisoned": all bits
 * set.  The literal is parenthesized so that it stays a single operand in
 * whatever expression the macro is expanded into.
 */
#define        PAGE_POISON_PATTERN        (-1l)

/* Return non-zero if @page's flags word equals PAGE_POISON_PATTERN. */
static inline int PagePoisoned(const struct page *page)
{
        return page->flags == PAGE_POISON_PATTERN;
}

/*
 * page_init_poison() is only a real, out-of-line function when
 * CONFIG_DEBUG_VM is enabled; otherwise it compiles to an empty stub so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_DEBUG_VM
void page_init_poison(struct page *page, size_t size);
#else
static inline void page_init_poison(struct page *page, size_t size)
{
        /* Intentionally empty without CONFIG_DEBUG_VM. */
}
#endif

/*
 * Page flags policies wrt compound pages
 *
 * PF_POISONED_CHECK
 *     check if this struct page poisoned/uninitialized
 *
 * PF_ANY:
 *     the page flag is relevant for small, head and tail pages.
 *
 * PF_HEAD:
 *     for compound page all operations related to the page flag applied to
 *     head page.
 *
 * PF_ONLY_HEAD:
 *     for compound page, callers only ever operate on the head page.
 *
 * PF_NO_TAIL:
 *     modifications of the page flag must be done on small or head pages,
 *     checks can be done on tail pages too.
 *
 * PF_NO_COMPOUND:
 *     the page flag is not relevant for compound pages.
 *
 * PF_SECOND:
 *     the page flag is stored in the first tail page.
 */
/*
 * Each policy macro is a statement expression that evaluates to the
 * struct page whose ->flags word the accessor should operate on.
 * @enforce is the second argument supplied by the generated accessor
 * (0 from the test accessor, 1 from the modifying ones); policies that
 * honour it only run their sanity check when it is non-zero.
 *
 * Macro arguments are parenthesized wherever they are used as operands so
 * that an argument which is itself an expression expands as intended.
 */

/* Complain if @page looks poisoned, then yield @page itself. */
#define PF_POISONED_CHECK(page) ({                                        \
                VM_BUG_ON_PGFLAGS(PagePoisoned(page), page);                \
                (page); })
/* Flag lives on exactly the page that was passed in. */
#define PF_ANY(page, enforce)        PF_POISONED_CHECK(page)
/* Flag lives on the head page; tail pages are redirected to it. */
#define PF_HEAD(page, enforce)        PF_POISONED_CHECK(compound_head(page))
/* Callers must never pass a tail page. */
#define PF_ONLY_HEAD(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(PageTail(page), page);                \
                PF_POISONED_CHECK(page); })
/* Tail pages are rejected only when @enforce is set; result is the head. */
#define PF_NO_TAIL(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS((enforce) && PageTail(page), page);        \
                PF_POISONED_CHECK(compound_head(page)); })
/* Compound pages are rejected only when @enforce is set. */
#define PF_NO_COMPOUND(page, enforce) ({                                \
                VM_BUG_ON_PGFLAGS((enforce) && PageCompound(page), page);        \
                PF_POISONED_CHECK(page); })
/* @page must be a head page; the flag is kept in the page that follows it. */
#define PF_SECOND(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(!PageHead(page), page);                \
                PF_POISONED_CHECK(&(page)[1]); })

/*
 * Macros to create function definitions for page flags
 */
/*
 * Naming convention for all generators below:
 *   @uname  - CamelCase suffix of the generated function (Page<uname>, ...)
 *   @lname  - suffix of the PG_<lname> bit being accessed
 *   @policy - one of the PF_* macros; selects which struct page's flags
 *             word is used.  The test accessor passes 0 as the policy's
 *             second argument, every modifying accessor passes 1.
 */

/* Page<uname>(): test the bit with test_bit(). */
#define TESTPAGEFLAG(uname, lname, policy)                                \
static __always_inline int Page##uname(struct page *page)                \
        { return test_bit(PG_##lname, &policy(page, 0)->flags); }

/* SetPage<uname>(): set the bit with set_bit(). */
#define SETPAGEFLAG(uname, lname, policy)                                \
static __always_inline void SetPage##uname(struct page *page)                \
        { set_bit(PG_##lname, &policy(page, 1)->flags); }

/* ClearPage<uname>(): clear the bit with clear_bit(). */
#define CLEARPAGEFLAG(uname, lname, policy)                                \
static __always_inline void ClearPage##uname(struct page *page)                \
        { clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* __SetPage<uname>(): set the bit with the double-underscore __set_bit(). */
#define __SETPAGEFLAG(uname, lname, policy)                                \
static __always_inline void __SetPage##uname(struct page *page)                \
        { __set_bit(PG_##lname, &policy(page, 1)->flags); }

/* __ClearPage<uname>(): clear the bit with __clear_bit(). */
#define __CLEARPAGEFLAG(uname, lname, policy)                                \
static __always_inline void __ClearPage##uname(struct page *page)        \
        { __clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* TestSetPage<uname>(): test_and_set_bit(); returns its result. */
#define TESTSETFLAG(uname, lname, policy)                                \
static __always_inline int TestSetPage##uname(struct page *page)        \
        { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }

/* TestClearPage<uname>(): test_and_clear_bit(); returns its result. */
#define TESTCLEARFLAG(uname, lname, policy)                                \
static __always_inline int TestClearPage##uname(struct page *page)        \
        { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* Bundle: Page<uname>(), SetPage<uname>(), ClearPage<uname>(). */
#define PAGEFLAG(uname, lname, policy)                                        \
        TESTPAGEFLAG(uname, lname, policy)                                \
        SETPAGEFLAG(uname, lname, policy)                                \
        CLEARPAGEFLAG(uname, lname, policy)

/* Bundle: Page<uname>(), __SetPage<uname>(), __ClearPage<uname>(). */
#define __PAGEFLAG(uname, lname, policy)                                \
        TESTPAGEFLAG(uname, lname, policy)                                \
        __SETPAGEFLAG(uname, lname, policy)                                \
        __CLEARPAGEFLAG(uname, lname, policy)

/* Bundle: TestSetPage<uname>() and TestClearPage<uname>(). */
#define TESTSCFLAG(uname, lname, policy)                                \
        TESTSETFLAG(uname, lname, policy)                                \
        TESTCLEARFLAG(uname, lname, policy)

/*
 * Stub generators for flags that are compiled out: the test variants
 * always return 0 and the set/clear variants do nothing.
 */
#define TESTPAGEFLAG_FALSE(uname)                                        \
static inline int Page##uname(const struct page *page) { return 0; }

#define SETPAGEFLAG_NOOP(uname)                                                \
static inline void SetPage##uname(struct page *page) {  }

#define CLEARPAGEFLAG_NOOP(uname)                                        \
static inline void ClearPage##uname(struct page *page) {  }

#define __CLEARPAGEFLAG_NOOP(uname)                                        \
static inline void __ClearPage##uname(struct page *page) {  }

#define TESTSETFLAG_FALSE(uname)                                        \
static inline int TestSetPage##uname(struct page *page) { return 0; }

#define TESTCLEARFLAG_FALSE(uname)                                        \
static inline int TestClearPage##uname(struct page *page) { return 0; }

/* Stub bundle matching PAGEFLAG(). */
#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname)                        \
        SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)

/* Stub bundle matching TESTSCFLAG(). */
#define TESTSCFLAG_FALSE(uname)                                                \
        TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)

/*
 * Accessor instantiations.  Each line expands to one or more complete
 * static inline function definitions, so no trailing ';' is written after
 * an invocation (it would be a stray empty declaration at file scope).
 */
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
PAGEFLAG(Referenced, referenced, PF_HEAD)
        TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
        __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
        __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
        TESTCLEARFLAG(Active, active, PF_HEAD)
PAGEFLAG(Workingset, workingset, PF_HEAD)
        TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND)           /* Used by some filesystems */

/* Xen */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
        TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND)
PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND)
PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
        TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)

PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
        __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
        __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)

/*
 * Private page markings that may be used by the filesystem that owns the page
 * for its own purposes.
 * - PG_private and PG_private_2 cause releasepage() and co to be invoked
 */
/* PF_ANY: these bits are accessed on exactly the page that was passed in. */
PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
        __CLEARPAGEFLAG(Private, private, PF_ANY)
PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
        TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)

/*
 * Only the test and test-and-set/clear accessors exist for PG_writeback.
 * The unconditional set/clear operators are deliberately not generated:
 * they would bypass page accounting.
 */
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
        TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)

/*
 * PG_readahead is only used for reads; PG_reclaim is only for writes.
 * Both sets of accessors below operate on the same bit, PG_reclaim.
 */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
        TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
        TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)

#ifdef CONFIG_HIGHMEM
/*
 * Must use a macro here due to header dependency issues. page_zone() is not
 * available at this point.
 *
 * The result is whatever is_highmem_idx() reports for the page's zone
 * number; no page flag bit is consulted.
 */
#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
#else
/* Without CONFIG_HIGHMEM the test is constant 0 and set/clear are no-ops. */
PAGEFLAG_FALSE(HighMem)
#endif

#ifdef CONFIG_SWAP
/*
 * PG_swapcache aliases PG_owner_priv_1, so the bit only means "in swap
 * cache" when the page is also PageSwapBacked().  With CONFIG_THP_SWAP the
 * check is made against the head page.
 *
 * Returns 1 if both conditions hold, 0 otherwise.
 */
static __always_inline int PageSwapCache(struct page *page)
{
#ifdef CONFIG_THP_SWAP
        page = compound_head(page);
#endif
        if (!PageSwapBacked(page))
                return 0;

        return test_bit(PG_swapcache, &page->flags) ? 1 : 0;
}
SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(SwapCache)
#endif

PAGEFLAG(Unevictable, unevictable, PF_HEAD)
        __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
        TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)

/*
 * The flags below exist only under the same config options that gate the
 * corresponding PG_* bits.  Where an #else branch is present, the stub
 * generators provide constant-0 / no-op versions of the same functions.
 */
#ifdef CONFIG_MMU
PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
        TESTSCFLAG_FALSE(Mlocked)
#endif

#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif

#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
/* Mask form of the bit; 0 when the bit does not exist. */
#define __PG_HWPOISON (1UL << PG_hwpoison)
extern bool take_page_off_buddy(struct page *page);
#else
PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif

/* No #else branch: these accessors are simply absent when compiled out. */
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
TESTPAGEFLAG(Young, young, PF_ANY)
SETPAGEFLAG(Young, young, PF_ANY)
TESTCLEARFLAG(Young, young, PF_ANY)
PAGEFLAG(Idle, idle, PF_ANY)
#endif

/*
 * PageReported() is used to track reported free pages within the Buddy
 * allocator. We can use the non-atomic version of the test and set
 * operations as both should be shielded with the zone lock to prevent
 * any possible races on the setting or clearing of the bit.
 */
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)

/*
 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *
 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON
 * bit; and then page->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged page.  See ksm.h.
 *
 * PAGE_MAPPING_MOVABLE without PAGE_MAPPING_ANON is used for non-lru movable
 * pages, and then page->mapping points to a struct address_space.
 *
 * Please note that, confusingly, "page_mapping" refers to the inode
 * address_space which maps the page from disk; whereas "page_mapped"
 * refers to user virtual address space into which the page is mapped.
 */
/* Tag bits kept in the low two bits of page->mapping. */
#define PAGE_MAPPING_ANON        0x1
#define PAGE_MAPPING_MOVABLE        0x2
/* KSM is encoded as both tag bits set at once. */
#define PAGE_MAPPING_KSM        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
/* Mask covering every tag bit; numerically equal to PAGE_MAPPING_KSM. */
#define PAGE_MAPPING_FLAGS        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)

/*
 * Return 1 if any PAGE_MAPPING_FLAGS tag bit is set in page->mapping,
 * 0 otherwise.  @page is inspected as given (no compound_head() lookup).
 */
static __always_inline int PageMappingFlags(struct page *page)
{
        unsigned long tag = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

        return !!tag;
}

/*
 * Return 1 if the PAGE_MAPPING_ANON bit is set in the mapping of @page's
 * head page, 0 otherwise.
 */
static __always_inline int PageAnon(struct page *page)
{
        struct page *head = compound_head(page);
        unsigned long tag = (unsigned long)head->mapping & PAGE_MAPPING_ANON;

        return tag != 0;
}

/*
 * Return 1 if page->mapping carries PAGE_MAPPING_MOVABLE and not
 * PAGE_MAPPING_ANON, 0 otherwise.  @page is inspected as given.
 */
static __always_inline int __PageMovable(struct page *page)
{
        unsigned long tag = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

        return tag == PAGE_MAPPING_MOVABLE;
}

#ifdef CONFIG_KSM
/*
 * A KSM page is one of those write-protected "shared pages" or "merged pages"
 * which KSM maps into multiple mms, wherever identical anonymous page content
 * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
 * anon_vma, but to that page's node of the stable tree.
 *
 * The test is made on the head page and succeeds only when both mapping
 * tag bits (PAGE_MAPPING_KSM) are set.
 */
static __always_inline int PageKsm(struct page *page)
{
        unsigned long tag;

        page = compound_head(page);
        tag = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

        return tag == PAGE_MAPPING_KSM;
}
#else
TESTPAGEFLAG_FALSE(Ksm)
#endif

/* Implemented out of line; see its definition for what the u64 encodes. */
u64 stable_page_flags(struct page *page);

static inline int PageUptodate(struct page *page)
{
        int ret;
        page = compound_head(page);
        ret = test_bit(PG_uptodate, &(page)->flags);
        /*
         * Must ensure that the data we read out of the page is loaded
         * _after_ we've loaded page->flags to check for PageUptodate.
         * We can skip the barrier if the page is not uptodate, because
         * we wouldn't be reading anything from it.
         *
         * See SetPageUptodate() for the other side of the story.
         */
        if (ret)
                smp_rmb();

        return ret;
}

/*
 * Variant of SetPageUptodate() that sets the bit with __set_bit().
 * Must not be called on a tail page.
 */
static __always_inline void __SetPageUptodate(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        /* Order earlier stores to the page before the flag becomes visible. */
        smp_wmb();
        __set_bit(PG_uptodate, &page->flags);
}

/*
 * Mark @page uptodate using set_bit().  Must not be called on a tail page.
 * The write barrier here pairs with the read barrier in PageUptodate().
 */
static __always_inline void SetPageUptodate(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
         * so that all previous stores issued in order to bring the page
         * uptodate are actually visible before PageUptodate becomes true.
         */
        smp_wmb();
        set_bit(PG_uptodate, &page->flags);
}

/* Only the clear accessor is generated; test and set are hand-written above. */
CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)

/* Out-of-line writeback helpers; implemented elsewhere. */
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);

/* Convenience wrappers that fix the keep_write argument. */
#define test_set_page_writeback(page)                        \
        __test_set_page_writeback(page, false)
#define test_set_page_writeback_keepwrite(page)        \
        __test_set_page_writeback(page, true)

/*
 * Start writeback on @page with keep_write == false, discarding the
 * previous state that the test-and-set helper returns.
 */
static inline void set_page_writeback(struct page *page)
{
        __test_set_page_writeback(page, false);
}

/*
 * Start writeback on @page with keep_write == true, discarding the
 * previous state that the test-and-set helper returns.
 */
static inline void set_page_writeback_keepwrite(struct page *page)
{
        __test_set_page_writeback(page, true);
}

/* Generates PageHead(), __SetPageHead(), __ClearPageHead() and ClearPageHead(). */
__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)

/*
 * Record @head as the head of tail page @page.  The stored word is the
 * head's address plus one, i.e. with bit 0 set, which is the encoding
 * compound_head() and PageTail() decode.
 */
static __always_inline void set_compound_head(struct page *page, struct page *head)
{
        unsigned long tagged = (unsigned long)head + 1;

        WRITE_ONCE(page->compound_head, tagged);
}

/*
 * Reset page->compound_head to 0, so that bit 0 is clear and PageTail()
 * no longer reports @page as a tail page.
 */
static __always_inline void clear_compound_head(struct page *page)
{
        WRITE_ONCE(page->compound_head, 0);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Clear PG_head on @page.  Only built with CONFIG_TRANSPARENT_HUGEPAGE.
 * Calling this on a page that is not a head page is a BUG.
 */
static inline void ClearPageCompound(struct page *page)
{
        BUG_ON(!PageHead(page));
        ClearPageHead(page);
}
#endif

/* Bit mask (not bit number) for PG_head within page->flags. */
#define PG_head_mask (1UL << PG_head)

/*
 * hugetlbfs page predicates.  With CONFIG_HUGETLB_PAGE they are real
 * out-of-line functions; without it every predicate is a constant
 * "false" stub.
 */
#ifdef CONFIG_HUGETLB_PAGE
int PageHuge(struct page *page);
int PageHeadHuge(struct page *page);
bool page_huge_active(struct page *page);
#else
TESTPAGEFLAG_FALSE(Huge)
TESTPAGEFLAG_FALSE(HeadHuge)

static inline bool page_huge_active(struct page *page)
{
        return false;
}
#endif


#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PageHuge() only returns true for hugetlbfs pages, but not for
 * normal or transparent huge pages.
 *
 * PageTransHuge() returns true for both transparent huge and
 * hugetlbfs pages, but not normal pages. PageTransHuge() can only be
 * called in the core VM paths where hugetlbfs pages can't exist.
 */
static inline int PageTransHuge(struct page *page)
{
        /* Callers must pass a head or non-compound page, never a tail page. */
        VM_BUG_ON_PAGE(PageTail(page), page);
        /* The answer is simply whether PG_head is set on @page. */
        return PageHead(page);
}

/*
 * PageTransCompound returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 */
static inline int PageTransCompound(struct page *page)
{
        /* Plain alias: true for head and tail pages of any compound page. */
        return PageCompound(page);
}

/*
 * PageTransCompoundMap is the same as PageTransCompound, but it also
 * guarantees the primary MMU has the entire compound page mapped
 * through pmd_trans_huge, which in turn guarantees the secondary MMUs
 * can also map the entire compound page. This allows the secondary
 * MMUs to call get_user_pages() only once for each compound page and
 * to immediately map the entire compound page with a single secondary
 * MMU fault. If there will be a pmd split later, the secondary MMUs
 * will get an update through the MMU notifier invalidation through
 * split_huge_pmd().
 *
 * Unlike PageTransCompound, this is safe to be called only while
 * split_huge_pmd() cannot run from under us, like if protected by the
 * MMU notifier, otherwise it may result in page->_mapcount check false
 * positives.
 *
 * We have to treat page cache THP differently since every subpage of it
 * would get _mapcount inc'ed once it is PMD mapped.  But, it may be PTE
 * mapped in the current process so comparing subpage's _mapcount to
 * compound_mapcount to filter out PTE mapped case.
 */
static inline int PageTransCompoundMap(struct page *page)
{
        struct page *head;
        int subpage_maps;
        int compound_maps;

        /* Not part of a compound page at all: nothing is PMD mapped. */
        if (!PageTransCompound(page))
                return 0;

        /* Anonymous THP: decided by the sign of the subpage's _mapcount. */
        if (PageAnon(page))
                return atomic_read(&page->_mapcount) < 0;

        /* File THP is PMD mapped and not PTE mapped */
        head = compound_head(page);
        subpage_maps = atomic_read(&page->_mapcount);
        compound_maps = atomic_read(compound_mapcount_ptr(head));

        return subpage_maps == compound_maps;
}

/*
 * PageTransTail returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 */
static inline int PageTransTail(struct page *page)
{
        /* Plain alias for PageTail(); no THP-specific check is made. */
        return PageTail(page);
}

/*
 * PageDoubleMap indicates that the compound page is mapped with PTEs as well
 * as PMDs.
 *
 * This is required for optimization of rmap operations for THP: we can postpone
 * per small page mapcount accounting (and its overhead from atomic operations)
 * until the first PMD split.
 *
 * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up
 * by one. This reference will go away with last compound_mapcount.
 *
 * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
 */
/* PF_SECOND: the bit is kept in the page following the head page. */
PAGEFLAG(DoubleMap, double_map, PF_SECOND)
        TESTSCFLAG(DoubleMap, double_map, PF_SECOND)
#else
/* !CONFIG_TRANSPARENT_HUGEPAGE: every THP predicate is a constant-0 stub. */
TESTPAGEFLAG_FALSE(TransHuge)
TESTPAGEFLAG_FALSE(TransCompound)
TESTPAGEFLAG_FALSE(TransCompoundMap)
TESTPAGEFLAG_FALSE(TransTail)
PAGEFLAG_FALSE(DoubleMap)
        TESTSCFLAG_FALSE(DoubleMap)
#endif

/*
 * For pages that are never mapped to userspace (and aren't PageSlab),
 * page_type may be used.  Because it is initialised to -1, we invert the
 * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
 * __ClearPageFoo *sets* the bit used for PageFoo.  We reserve a few high and
 * low bits so that an underflow or overflow of page_mapcount() won't be
 * mistaken for a page type value.
 */

/* Bits that must all be set in page_type for it to denote a page type. */
#define PAGE_TYPE_BASE        0xf0000000
/* Reserve                0x0000007f to catch underflows of page_mapcount */
#define PAGE_MAPCOUNT_RESERVE        (-128)
/* One bit per page type; the sense is inverted (bit clear == type set). */
#define PG_buddy        0x00000080
#define PG_offline        0x00000100
#define PG_kmemcg        0x00000200
#define PG_table        0x00000400
#define PG_guard        0x00000800

/*
 * True when @page has the type(s) in @flag: all PAGE_TYPE_BASE bits are
 * set and every bit in @flag is clear.  With @flag == 0 this only checks
 * the PAGE_TYPE_BASE bits.
 *
 * Both arguments are parenthesized so that expressions such as
 * PageType(base + i, PG_buddy) expand correctly.
 */
#define PageType(page, flag)                                                \
        (((page)->page_type & (PAGE_TYPE_BASE | (flag))) == PAGE_TYPE_BASE)

/*
 * Return 1 if page_type, read as a signed int, lies below
 * PAGE_MAPCOUNT_RESERVE, 0 otherwise.
 */
static inline int page_has_type(struct page *page)
{
        int type = (int)page->page_type;

        return type < PAGE_MAPCOUNT_RESERVE;
}

/*
 * Generate Page<uname>(), __SetPage<uname>() and __ClearPage<uname>() for
 * the page_type bit PG_<lname>.
 *
 * The sense is inverted: __SetPage<uname>() clears the bit and
 * __ClearPage<uname>() sets it.  Both modify page_type with plain
 * read-modify-write assignments.
 */
#define PAGE_TYPE_OPS(uname, lname)                                        \
static __always_inline int Page##uname(struct page *page)                \
{                                                                        \
        return PageType(page, PG_##lname);                                \
}                                                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{                                                                        \
        VM_BUG_ON_PAGE(!PageType(page, 0), page);                        \
        page->page_type &= ~PG_##lname;                                        \
}                                                                        \
static __always_inline void __ClearPage##uname(struct page *page)        \
{                                                                        \
        VM_BUG_ON_PAGE(!Page##uname(page), page);                        \
        page->page_type |= PG_##lname;                                        \
}

/*
 * PageBuddy() indicates that the page is free and in the buddy system
 * (see mm/page_alloc.c).
 */
PAGE_TYPE_OPS(Buddy, buddy)        /* PageBuddy(), __SetPageBuddy(), __ClearPageBuddy() */

/*
 * PageOffline() indicates that the page is logically offline although the
 * containing section is online. (e.g. inflated in a balloon driver or
 * not onlined when onlining the section).
 * The content of these pages is effectively stale. Such pages should not
 * be touched (read/write/dump/save) except by their owner.
 *
 * If a driver wants to allow to offline unmovable PageOffline() pages without
 * putting them back to the buddy, it can do so via the memory notifier by
 * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the
 * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline()
 * pages (now with a reference count of zero) are treated like free pages,
 * allowing the containing memory block to get offlined. A driver that
 * relies on this feature is aware that re-onlining the memory block will
 * require to re-set the pages PageOffline() and not giving them to the
 * buddy via online_page_callback_t.
 */
PAGE_TYPE_OPS(Offline, offline)        /* PageOffline(), __SetPageOffline(), __ClearPageOffline() */

/*
 * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
 * pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
 */
PAGE_TYPE_OPS(Kmemcg, kmemcg)        /* PageKmemcg(), __SetPageKmemcg(), __ClearPageKmemcg() */

/*
 * Marks pages in use as page tables.
 */
PAGE_TYPE_OPS(Table, table)        /* PageTable(), __SetPageTable(), __ClearPageTable() */

/*
 * Marks guardpages used with debug_pagealloc.
 */
PAGE_TYPE_OPS(Guard, guard)        /* PageGuard(), __SetPageGuard(), __ClearPageGuard() */

extern bool is_free_buddy_page(struct page *page);

__PAGEFLAG(Isolated, isolated, PF_ANY);

/*
 * If network-based swap is enabled, sl*b must keep track of whether pages
 * were allocated from pfmemalloc reserves.
 */
/*
 * For slab pages the "pfmemalloc" state is stored in the PG_active bit.
 * Returns the PageActive() result; @page must be a slab page.
 */
static inline int PageSlabPfmemalloc(struct page *page)
{
        VM_BUG_ON_PAGE(!PageSlab(page), page);
        return PageActive(page);
}

/* Mark slab page @page as pfmemalloc by setting PG_active via SetPageActive(). */
static inline void SetPageSlabPfmemalloc(struct page *page)
{
        VM_BUG_ON_PAGE(!PageSlab(page), page);
        SetPageActive(page);
}

/* Clear the pfmemalloc mark on slab page @page via __ClearPageActive(). */
static inline void __ClearPageSlabPfmemalloc(struct page *page)
{
        VM_BUG_ON_PAGE(!PageSlab(page), page);
        __ClearPageActive(page);
}

/* Clear the pfmemalloc mark on slab page @page via ClearPageActive(). */
static inline void ClearPageSlabPfmemalloc(struct page *page)
{
        VM_BUG_ON_PAGE(!PageSlab(page), page);
        ClearPageActive(page);
}

/*
 * Mask form of PG_mlocked.  The bit only exists with CONFIG_MMU, so the
 * mask is 0 otherwise and can be OR-ed into flag sets unconditionally.
 */
#ifdef CONFIG_MMU
#define __PG_MLOCKED                (1UL << PG_mlocked)
#else
#define __PG_MLOCKED                0
#endif

/*
 * Flags checked when a page is freed.  Pages being freed should not have
 * these flags set.  If they are, there is a problem.
 */
/* Mask of the individual bits listed below; __PG_MLOCKED may be 0. */
#define PAGE_FLAGS_CHECK_AT_FREE                                \
        (1UL << PG_lru                | 1UL << PG_locked        |        \
         1UL << PG_private        | 1UL << PG_private_2        |        \
         1UL << PG_writeback        | 1UL << PG_reserved        |        \
         1UL << PG_slab                | 1UL << PG_active         |        \
         1UL << PG_unevictable        | __PG_MLOCKED)

/*
 * Flags checked when a page is prepped for return by the page allocator.
 * Pages being prepped should not have these flags set.  If they are set,
 * there has been a kernel bug or struct page corruption.
 *
 * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
 * alloc-free cycle to prevent from reusing the page.
 */
/* Every bit below NR_PAGEFLAGS, with __PG_HWPOISON masked out. */
#define PAGE_FLAGS_CHECK_AT_PREP        \
        (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)

/* Mask covering both PG_private and PG_private_2. */
#define PAGE_FLAGS_PRIVATE                                \
        (1UL << PG_private | 1UL << PG_private_2)
/**
 * page_has_private - Determine if page has private stuff
 * @page: The page to be checked
 *
 * Determine if a page has private stuff, indicating that release routines
 * should be invoked upon it.
 *
 * Return: 1 if PG_private or PG_private_2 is set in @page's own flags
 * word, 0 otherwise.
 */
static inline int page_has_private(struct page *page)
{
        return (page->flags & PAGE_FLAGS_PRIVATE) != 0;
}

/*
 * The policy macros are only needed while generating the accessors above,
 * so drop them here.  PF_POISONED_CHECK is not in this list and therefore
 * stays defined for includers of this header.
 */
#undef PF_ANY
#undef PF_HEAD
#undef PF_ONLY_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
#undef PF_SECOND
#endif /* !__GENERATING_BOUNDS_H */

#endif        /* PAGE_FLAGS_H */















































































    1 

























































































    1 































    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 








    1 































    1 











    1 




































































































































































    1 
























    1 









































































    1 















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_XARRAY_H
#define _LINUX_XARRAY_H
/*
 * eXtensible Arrays
 * Copyright (c) 2017 Microsoft Corporation
 * Author: Matthew Wilcox <willy@infradead.org>
 *
 * See Documentation/core-api/xarray.rst for how to use the XArray.
 */

#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/kconfig.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/*
 * The bottom two bits of the entry determine how the XArray interprets
 * the contents:
 *
 * 00: Pointer entry
 * 10: Internal entry
 * x1: Value entry or tagged pointer
 *
 * Attempting to store internal entries in the XArray is a bug.
 *
 * Most internal entries are pointers to the next node in the tree.
 * The following internal entries have a special meaning:
 *
 * 0-62: Sibling entries
 * 256: Retry entry
 * 257: Zero entry
 *
 * Errors are also represented as internal entries, but use the negative
 * space (-4094 to -2).  They're never stored in the slots array; only
 * returned by the normal API.
 */

/*
 * Number of bits usable by an integer stored as a value entry: one bit
 * of the long is consumed by the tag that xa_mk_value() sets in bit 0.
 */
#define BITS_PER_XA_VALUE        (BITS_PER_LONG - 1)

/**
 * xa_mk_value() - Encode an integer as an XArray value entry.
 * @v: Integer to encode.
 *
 * The integer is shifted up by one bit and bit 0 is set, which is the
 * pattern xa_is_value() tests for.  A warning is raised when @v is
 * negative if viewed as a signed long, because its top bit does not
 * survive the shift.
 *
 * Context: Any context.
 * Return: An entry suitable for storing in the XArray.
 */
static inline void *xa_mk_value(unsigned long v)
{
        unsigned long encoded = (v << 1) | 1;

        WARN_ON((long)v < 0);

        return (void *)encoded;
}

/**
 * xa_to_value() - Decode the integer held in an XArray value entry.
 * @entry: XArray entry.
 *
 * Drops the tag bit by shifting the entry right by one; this is the
 * inverse of xa_mk_value().
 *
 * Context: Any context.
 * Return: The value stored in the XArray entry.
 */
static inline unsigned long xa_to_value(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return raw >> 1;
}

/**
 * xa_is_value() - Test whether an entry is a value entry.
 * @entry: XArray entry.
 *
 * Only bit 0 of the entry is inspected.
 *
 * Context: Any context.
 * Return: True if the entry is a value, false if it is a pointer.
 */
static inline bool xa_is_value(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return (raw & 1) != 0;
}

/**
 * xa_tag_pointer() - Combine a pointer and a tag into an XArray entry.
 * @p: Plain pointer.
 * @tag: Tag value (0, 1 or 3).
 *
 * Users who would rather tag their pointers than store value entries
 * may do so with one of three tags (0, 1 and 3).  Tag 2 is not offered:
 * low bits of 10 denote an internal entry.  Unlike an xa_mark_t, a tag
 * is not replicated up through the array and cannot be searched for.
 *
 * The tag is ORed into the pointer without any range check.
 *
 * Context: Any context.
 * Return: An XArray entry.
 */
static inline void *xa_tag_pointer(void *p, unsigned long tag)
{
        unsigned long bits = (unsigned long)p;

        return (void *)(bits | tag);
}

/**
 * xa_untag_pointer() - Strip the tag from a tagged-pointer entry.
 * @entry: XArray entry.
 *
 * For entries created with xa_tag_pointer(): clears the two low bits
 * to recover the plain pointer.
 *
 * Context: Any context.
 * Return: A pointer.
 */
static inline void *xa_untag_pointer(void *entry)
{
        unsigned long bits = (unsigned long)entry;

        return (void *)(bits & ~3UL);
}

/**
 * xa_pointer_tag() - Extract the tag from a tagged-pointer entry.
 * @entry: XArray entry.
 *
 * For entries created with xa_tag_pointer(): returns the two low bits
 * of the entry.
 *
 * Context: Any context.
 * Return: A tag.
 */
static inline unsigned int xa_pointer_tag(void *entry)
{
        unsigned long bits = (unsigned long)entry;

        return bits & 3UL;
}

/*
 * xa_mk_internal() - Build an internal entry from a number.
 * @v: Value to turn into an internal entry.
 *
 * The value is shifted up by two bits and the low bits are set to 10,
 * the pattern recognised by xa_is_internal().
 *
 * Internal entries serve several purposes.  0-255 are sibling entries
 * (the current code only uses 0-62), 256 is the retry entry and 257 is
 * the reserved / zero entry.  Negative values represent errnos.  Node
 * pointers are also tagged as internal entries in some situations.
 *
 * Context: Any context.
 * Return: An XArray internal entry corresponding to this value.
 */
static inline void *xa_mk_internal(unsigned long v)
{
        unsigned long shifted = v << 2;

        return (void *)(shifted | 2);
}

/*
 * xa_to_internal() - Recover the number held in an internal entry.
 * @entry: XArray entry.
 *
 * Performs an unsigned right shift by two, so no sign extension takes
 * place.
 *
 * Context: Any context.
 * Return: The value which was stored in the internal entry.
 */
static inline unsigned long xa_to_internal(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return raw >> 2;
}

/*
 * xa_is_internal() - Test whether an entry is an internal entry.
 * @entry: XArray entry.
 *
 * An internal entry has its two low bits equal to 10.
 *
 * Context: Any context.
 * Return: %true if the entry is an internal entry.
 */
static inline bool xa_is_internal(const void *entry)
{
        unsigned long low_bits = (unsigned long)entry & 3;

        return low_bits == 2;
}

/* The zero (reserved) entry: internal entry number 257. */
#define XA_ZERO_ENTRY                xa_mk_internal(257)

/**
 * xa_is_zero() - Test whether an entry is the zero entry.
 * @entry: Entry retrieved from the XArray
 *
 * A slot holding a zero entry reads back as NULL through the normal
 * API; zero entries are only visible when using the advanced API.
 *
 * Return: %true if the entry is a zero entry.
 */
static inline bool xa_is_zero(const void *entry)
{
        const void *zero = XA_ZERO_ENTRY;

        return unlikely(entry == zero);
}

/**
 * xa_is_err() - Test whether an XArray result encodes an error.
 * @entry: Result from calling an XArray function
 *
 * An XArray operation that cannot complete returns a special value
 * indicating an error.  This function reports whether that happened;
 * use xa_err() to find out which error it was.
 *
 * An entry is an error when it is an internal entry and is not below
 * the internal entry for -MAX_ERRNO.
 *
 * Context: Any context.
 * Return: %true if the entry indicates an error.
 */
static inline bool xa_is_err(const void *entry)
{
        const void *lowest_err = xa_mk_internal(-MAX_ERRNO);

        return unlikely(xa_is_internal(entry) && entry >= lowest_err);
}

/**
 * xa_err() - Convert an XArray result into an errno.
 * @entry: Result from calling an XArray function.
 *
 * An XArray operation that cannot complete returns a special pointer
 * value which encodes an errno.  This function decodes that errno, or
 * yields 0 when the pointer does not represent one.
 *
 * Context: Any context.
 * Return: A negative errno or 0.
 */
static inline int xa_err(void *entry)
{
        if (!xa_is_err(entry))
                return 0;

        /* Shift as signed: xa_to_internal() would not do sign extension. */
        return (long)entry >> 2;
}

/**
 * struct xa_limit - Represents a range of IDs.
 * @min: The lowest ID to allocate (inclusive).
 * @max: The maximum ID to allocate (inclusive).
 *
 * This structure is used either directly or via the XA_LIMIT() macro
 * to communicate the range of IDs that are valid for allocation.
 * Two common ranges are predefined for you:
 * * xa_limit_32b        - [0 - UINT_MAX]
 * * xa_limit_31b        - [0 - INT_MAX]
 *
 * Note that @max is declared before @min, so a positional initialiser
 * would be read in (max, min) order.  XA_LIMIT() uses designated
 * initialisers and is not affected by the member order.
 */
struct xa_limit {
        u32 max;
        u32 min;
};

/*
 * XA_LIMIT() - Build a struct xa_limit compound literal from a lower
 * bound @_min and an upper bound @_max.
 */
#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max }

/* Predefined ranges: [0, UINT_MAX] and [0, INT_MAX] respectively. */
#define xa_limit_32b        XA_LIMIT(0, UINT_MAX)
#define xa_limit_31b        XA_LIMIT(0, INT_MAX)

/*
 * xa_mark_t - Type of a mark.  It is a __bitwise type, so the constants
 * below are created with __force casts.
 */
typedef unsigned __bitwise xa_mark_t;
#define XA_MARK_0                ((__force xa_mark_t)0U)
#define XA_MARK_1                ((__force xa_mark_t)1U)
#define XA_MARK_2                ((__force xa_mark_t)2U)
/* Filter value passed to xa_find() by the xa_for_each*() iterators. */
#define XA_PRESENT                ((__force xa_mark_t)8U)
/* Highest-numbered mark. */
#define XA_MARK_MAX                XA_MARK_2
/* Mark whose flag bit is part of XA_FLAGS_ALLOC. */
#define XA_FREE_MARK                XA_MARK_0

/*
 * Lock type values.  They are cast to gfp_t to form XA_FLAGS_LOCK_IRQ
 * and XA_FLAGS_LOCK_BH.
 */
enum xa_lock_type {
        XA_LOCK_IRQ = 1,
        XA_LOCK_BH = 2,
};

/*
 * Values for xa_flags.  The radix tree stores its GFP flags in the xa_flags,
 * and we remain compatible with that.
 */
#define XA_FLAGS_LOCK_IRQ        ((__force gfp_t)XA_LOCK_IRQ)
#define XA_FLAGS_LOCK_BH        ((__force gfp_t)XA_LOCK_BH)
#define XA_FLAGS_TRACK_FREE        ((__force gfp_t)4U)
#define XA_FLAGS_ZERO_BUSY        ((__force gfp_t)8U)
#define XA_FLAGS_ALLOC_WRAPPED        ((__force gfp_t)16U)
#define XA_FLAGS_ACCOUNT        ((__force gfp_t)32U)
/*
 * Per-mark flag bits sit above the GFP bits: mark N corresponds to bit
 * (__GFP_BITS_SHIFT + N).  xa_marked() tests these bits.
 */
#define XA_FLAGS_MARK(mark)        ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
                                                (__force unsigned)(mark)))

/* ALLOC is for a normal 0-based alloc.  ALLOC1 is for a 1-based alloc. */
#define XA_FLAGS_ALLOC        (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
#define XA_FLAGS_ALLOC1        (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY)

/**
 * struct xarray - The anchor of the XArray.
 * @xa_lock: Lock that protects the contents of the XArray.
 *
 * To use the xarray, define it statically or embed it in your data structure.
 * It is a very small data structure, so it does not usually make sense to
 * allocate it separately and keep a pointer to it in your data structure.
 *
 * You may use the xa_lock to protect your own data structures as well.
 */
/*
 * Private members:
 *
 * @xa_flags holds the XA_FLAGS_* values, including the per-mark bits
 * produced by XA_FLAGS_MARK().
 *
 * If all of the entries in the array are NULL, @xa_head is a NULL pointer.
 * If the only non-NULL entry in the array is at index 0, @xa_head is that
 * entry.  If any other entry in the array is non-NULL, @xa_head points
 * to an @xa_node.
 */
struct xarray {
        spinlock_t        xa_lock;
/* private: The rest of the data structure is not to be used directly. */
        gfp_t                xa_flags;
        void __rcu *        xa_head;
};

/*
 * XARRAY_INIT() - Static initialiser for a struct xarray.
 * @name: Identifier of the XArray variable being initialised; it is used
 *        to name the embedded lock.
 * @flags: XA_FLAG values.
 *
 * Produces an empty array: unlocked lock, the given flags, NULL head.
 */
#define XARRAY_INIT(name, flags) {                                \
        .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock),                \
        .xa_flags = flags,                                        \
        .xa_head = NULL,                                        \
}

/**
 * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags.
 * @name: Identifier of the XArray variable to define.
 * @flags: XA_FLAG values.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name and flags.  It is
 * equivalent to calling xa_init_flags() on the array, but it does the
 * initialisation at compile time instead of at run time.
 *
 * The expansion has no trailing semicolon and no storage class; the
 * caller supplies both.
 */
#define DEFINE_XARRAY_FLAGS(name, flags)                                \
        struct xarray name = XARRAY_INIT(name, flags)

/**
 * DEFINE_XARRAY() - Define an XArray.
 * @name: Identifier of the XArray variable to define.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name.  It is equivalent
 * to calling xa_init() on the array, but it does the initialisation at
 * compile time instead of at run time.
 *
 * Shorthand for DEFINE_XARRAY_FLAGS() with @flags set to 0.
 */
#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)

/**
 * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0.
 * @name: Identifier of the XArray variable to define.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It is DEFINE_XARRAY_FLAGS() with @flags set to XA_FLAGS_ALLOC.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)

/**
 * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1.
 * @name: Identifier of the XArray variable to define.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It is DEFINE_XARRAY_FLAGS() with @flags set to XA_FLAGS_ALLOC1.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1)

/*
 * The normal API.  These functions are only declared here; their
 * definitions are not part of this header.  xa_find() and
 * xa_find_after() require a non-NULL @index pointer (nonnull(2)); the
 * xa_for_each*() iterators are built on top of them.
 */
void *xa_load(struct xarray *, unsigned long index);
void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *xa_erase(struct xarray *, unsigned long index);
void *xa_store_range(struct xarray *, unsigned long first, unsigned long last,
                        void *entry, gfp_t);
bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
void *xa_find(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
void *xa_find_after(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
                unsigned long max, unsigned int n, xa_mark_t);
void xa_destroy(struct xarray *);

/**
 * xa_init_flags() - Set up an empty XArray with the given flags.
 * @xa: XArray.
 * @flags: XA_FLAG values.
 *
 * Use this instead of xa_init() when the array needs special flags,
 * for example because its lock is taken from interrupt context.
 *
 * Initialises the lock, records @flags and clears the head pointer.
 *
 * Context: Any context.
 */
static inline void xa_init_flags(struct xarray *xa, gfp_t flags)
{
        spin_lock_init(&xa->xa_lock);

        /* No entries yet: the head is NULL and only the caller's flags are set. */
        xa->xa_flags = flags;
        xa->xa_head = NULL;
}

/**
 * xa_init() - Set up an empty XArray with no flags.
 * @xa: XArray.
 *
 * Equivalent to xa_init_flags() with @flags of 0.  An empty XArray is
 * full of NULL entries.
 *
 * Context: Any context.
 */
static inline void xa_init(struct xarray *xa)
{
        /* Plain array: no XA_FLAGS_* bits requested. */
        xa_init_flags(xa, 0);
}

/**
 * xa_empty() - Report whether an array holds no present entries.
 * @xa: XArray.
 *
 * Only the head pointer is examined: it is NULL exactly when every
 * entry in the array is NULL.
 *
 * Context: Any context.
 * Return: %true if the array contains only NULL pointers.
 */
static inline bool xa_empty(const struct xarray *xa)
{
        if (xa->xa_head != NULL)
                return false;

        return true;
}

/**
 * xa_marked() - Report whether any entry in this array has a mark set
 * @xa: Array
 * @mark: Mark value
 *
 * Tests the flag bit for @mark in the array's xa_flags; no entries
 * are visited.
 *
 * Context: Any context.
 * Return: %true if any entry has this mark set.
 */
static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark)
{
        return (xa->xa_flags & XA_FLAGS_MARK(mark)) != 0;
}

/**
 * xa_for_each_range() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 * @last: Last index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to max.
 *
 * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_range() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_range().
 *
 * This macro expands to a for statement which calls xa_find() once and
 * xa_find_after() for every later step, always filtering on XA_PRESENT.
 * @xa and @last are evaluated again on each step, so they should not
 * have side effects.  @index and @entry must be lvalues.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_range(xa, index, entry, start, last)                \
        for (index = start,                                                \
             entry = xa_find(xa, &index, last, XA_PRESENT);                \
             entry;                                                        \
             entry = xa_find_after(xa, &index, last, XA_PRESENT))

/**
 * xa_for_each_start() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to max.
 *
 * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_start() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_start().
 *
 * This is xa_for_each_range() with the last index fixed at ULONG_MAX.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_start(xa, index, entry, start) \
        xa_for_each_range(xa, index, entry, start, ULONG_MAX)

/**
 * xa_for_each() - Iterate over present entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you want
 * to skip or reprocess indices.  It is safe to modify the array during the
 * iteration.  At the end of the iteration, @entry will be set to NULL and
 * @index will have a value less than or equal to max.
 *
 * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).  xa_for_each()
 * will spin if it hits a retry entry; if you intend to see retry entries,
 * you should use the xas_for_each() iterator instead.  The xas_for_each()
 * iterator will expand into more inline code than xa_for_each().
 *
 * This is xa_for_each_start() with the starting index fixed at 0.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each(xa, index, entry) \
        xa_for_each_start(xa, index, entry, 0)

/**
 * xa_for_each_marked() - Iterate over marked entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @filter: Selection criterion.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  The iteration will skip all entries in the array
 * which do not match @filter.  You may modify @index during the iteration
 * if you want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set to
 * NULL and @index will have a value less than or equal to max.
 *
 * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n).
 * You have to handle your own locking with xas_for_each(), and if you have
 * to unlock after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_marked() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each_marked() iterator
 * instead.  The xas_for_each_marked() iterator will expand into more inline
 * code than xa_for_each_marked().
 *
 * This macro expands to a for statement covering indices 0 to ULONG_MAX.
 * It calls xa_find() once and xa_find_after() for every later step.
 * @xa and @filter are evaluated again on each step, so they should not
 * have side effects.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_marked(xa, index, entry, filter) \
        for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \
             entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))

/*
 * Locking helpers.  Each one applies the spinlock operation of the same
 * name (spin_lock(), spin_lock_bh(), spin_lock_irq(), ...) to the
 * xa_lock embedded in @xa.  @xa is a pointer to the struct xarray.
 */
#define xa_trylock(xa)                spin_trylock(&(xa)->xa_lock)
#define xa_lock(xa)                spin_lock(&(xa)->xa_lock)
#define xa_unlock(xa)                spin_unlock(&(xa)->xa_lock)
#define xa_lock_bh(xa)                spin_lock_bh(&(xa)->xa_lock)
#define xa_unlock_bh(xa)        spin_unlock_bh(&(xa)->xa_lock)
#define xa_lock_irq(xa)                spin_lock_irq(&(xa)->xa_lock)
#define xa_unlock_irq(xa)        spin_unlock_irq(&(xa)->xa_lock)
#define xa_lock_irqsave(xa, flags) \
                                spin_lock_irqsave(&(xa)->xa_lock, flags)
#define xa_unlock_irqrestore(xa, flags) \
                                spin_unlock_irqrestore(&(xa)->xa_lock, flags)
/* Variants taking a subclass argument, forwarded to the *_nested() calls. */
#define xa_lock_nested(xa, subclass) \
                                spin_lock_nested(&(xa)->xa_lock, subclass)
#define xa_lock_bh_nested(xa, subclass) \
                                spin_lock_bh_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irq_nested(xa, subclass) \
                                spin_lock_irq_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irqsave_nested(xa, flags, subclass) \
                spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass)

/*
 * Versions of the normal API which require the caller to hold the
 * xa_lock.  If the GFP flags allow it, they will drop the lock to
 * allocate memory, then reacquire it afterwards.  These functions
 * may also re-enable interrupts if the XArray flags indicate the
 * locking should be interrupt safe.
 *
 * The inline wrappers in this header (xa_store_bh(), xa_erase_irq(),
 * xa_cmpxchg(), xa_insert() and friends) take the xa_lock in the
 * appropriate mode and then call these functions.  The functions
 * returning int are marked __must_check.
 */
void *__xa_erase(struct xarray *, unsigned long index);
void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
                void *entry, gfp_t);
int __must_check __xa_insert(struct xarray *, unsigned long index,
                void *entry, gfp_t);
int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
                struct xa_limit, gfp_t);
int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry,
                struct xa_limit, u32 *next, gfp_t);
void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);

/**
 * xa_store_bh() - Store an entry in the XArray with softirqs disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Softirq-safe counterpart of xa_store(): the store is carried out by
 * __xa_store() between xa_lock_bh() and xa_unlock_bh().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_bh(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *old;

        xa_lock_bh(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        return old;
}

/**
 * xa_store_irq() - Store an entry in the XArray with interrupts disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Interrupt-safe counterpart of xa_store(): the store is carried out by
 * __xa_store() between xa_lock_irq() and xa_unlock_irq().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_irq(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *old;

        xa_lock_irq(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        return old;
}

/**
 * xa_erase_bh() - Remove an entry from the XArray with softirqs disabled.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Once this function has returned, a load from @index yields %NULL.
 * If @index belongs to a multi-index entry, every index it covers is
 * erased and none of them remains part of a multi-index entry.
 *
 * The erase is carried out by __xa_erase() between xa_lock_bh() and
 * xa_unlock_bh().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
{
        void *removed;

        xa_lock_bh(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_bh(xa);

        return removed;
}

/**
 * xa_erase_irq() - Remove an entry from the XArray with interrupts disabled.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Once this function has returned, a load from @index yields %NULL.
 * If @index belongs to a multi-index entry, every index it covers is
 * erased and none of them remains part of a multi-index entry.
 *
 * The erase is carried out by __xa_erase() between xa_lock_irq() and
 * xa_unlock_irq().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_irq(struct xarray *xa, unsigned long index)
{
        void *removed;

        xa_lock_irq(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_irq(xa);

        return removed;
}

/**
 * xa_cmpxchg() - Replace an entry in the XArray if it has the expected value.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * The entry at @index is replaced with @entry only when it currently
 * equals @old.  A return value equal to @old means the exchange took
 * place.
 *
 * The exchange is carried out by __xa_cmpxchg() between xa_lock() and
 * xa_unlock().
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *found;

        xa_lock(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock(xa);

        return found;
}

/**
 * xa_cmpxchg_bh() - Conditionally replace an entry with softirqs disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * Softirq-safe counterpart of xa_cmpxchg(): the exchange is carried out
 * by __xa_cmpxchg() between xa_lock_bh() and xa_unlock_bh().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *found;

        xa_lock_bh(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_bh(xa);

        return found;
}

/**
 * xa_cmpxchg_irq() - Conditionally replace an entry with interrupts disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * Interrupt-safe counterpart of xa_cmpxchg(): the exchange is carried
 * out by __xa_cmpxchg() between xa_lock_irq() and xa_unlock_irq().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *found;

        xa_lock_irq(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_irq(xa);

        return found;
}

/**
 * xa_insert() - Store an entry in the XArray only if the slot is unoccupied.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry stores a reserved entry (like xa_reserve())
 * when no entry is present.  Insertion fails when a reserved entry is
 * already present, even though a load from that index returns NULL.
 *
 * The insertion is carried out by __xa_insert() between xa_lock() and
 * xa_unlock().
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        xa_lock(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_insert_bh() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * A NULL @entry is stored as a reserved entry (like xa_reserve()) when
 * the slot is empty.  A slot that already holds a reserved entry counts
 * as occupied, so the insert fails even though a load from @index
 * returns NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_bh(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        xa_lock_bh(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_insert_irq() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * A NULL @entry is stored as a reserved entry (like xa_reserve()) when
 * the slot is empty.  A slot that already holds a reserved entry counts
 * as occupied, so the insert fails even though a load from @index
 * returns NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_irq(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        xa_lock_irq(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Picks an empty index in the range @limit.min to @limit.max, writes
 * that index through @id and only then stores @entry at it, so a
 * concurrent lookup will not see an uninitialised @id.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        xa_lock(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_alloc_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Picks an empty index in the range @limit.min to @limit.max, writes
 * that index through @id and only then stores @entry at it, so a
 * concurrent lookup will not see an uninitialised @id.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        xa_lock_bh(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_alloc_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Picks an empty index in the range @limit.min to @limit.max, writes
 * that index through @id and only then stores @entry at it, so a
 * concurrent lookup will not see an uninitialised @id.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        xa_lock_irq(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Picks an empty index in the range @limit.min to @limit.max, writes
 * that index through @id and only then stores @entry at it, so a
 * concurrent lookup will not see an uninitialised @id.  The search
 * begins at @next and wraps around if necessary.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        xa_lock(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Picks an empty index in the range @limit.min to @limit.max, writes
 * that index through @id and only then stores @entry at it, so a
 * concurrent lookup will not see an uninitialised @id.  The search
 * begins at @next and wraps around if necessary.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        xa_lock_bh(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Picks an empty index in the range @limit.min to @limit.max, writes
 * that index through @id and only then stores @entry at it, so a
 * concurrent lookup will not see an uninitialised @id.  The search
 * begins at @next and wraps around if necessary.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        xa_lock_irq(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_reserve() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * Makes sure there is somewhere to store an entry at @index.  Nothing
 * changes if @index already holds something; an empty slot is marked
 * as reserved instead.  Loading from a reserved entry returns a %NULL
 * pointer.
 *
 * A reservation that ends up unused should be dropped with xa_release()
 * or xa_erase() to free any unnecessary memory.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Swap in the zero entry only where the slot currently reads NULL. */
        void *prev = xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(prev);
}

/**
 * xa_reserve_bh() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * Same as xa_reserve(), but built on xa_cmpxchg_bh() so that softirqs
 * are disabled while the lock is held.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        void *prev = xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(prev);
}

/**
 * xa_reserve_irq() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * Same as xa_reserve(), but built on xa_cmpxchg_irq() so that
 * interrupts are disabled while the lock is held.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        void *prev = xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(prev);
}

/**
 * xa_release() - Release a reserved entry.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Undoes an earlier xa_reserve() of @index.  The slot is only changed
 * while it still holds the reservation; if a real entry has been
 * stored at @index in the meantime, nothing happens.
 */
static inline void xa_release(struct xarray *xa, unsigned long index)
{
        /*
         * Compare-and-exchange the zero entry for NULL; the result is
         * deliberately ignored and no allocation flags are passed.
         */
        xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0);
}

/* Everything below here is the Advanced API.  Proceed with caution. */

/*
 * The xarray is constructed out of a set of 'chunks' of pointers.  Choosing
 * the best chunk size requires some tradeoffs.  A power of two recommends
 * itself so that we can walk the tree based purely on shifts and masks.
 * Generally, the larger the better; as the number of slots per level of the
 * tree increases, the less tall the tree needs to be.  But that needs to be
 * balanced against the memory consumption of each node.  On a 64-bit system,
 * xa_node is currently 576 bytes, and we get 7 of them per 4kB page.  If we
 * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
 */
/* log2 of the slots per node; honoured if already defined by the includer. */
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT                (CONFIG_BASE_SMALL ? 4 : 6)
#endif
/* Number of entries in each node's ->slots array. */
#define XA_CHUNK_SIZE                (1UL << XA_CHUNK_SHIFT)
/* Mask that reduces a (shifted) index to an offset within one node. */
#define XA_CHUNK_MASK                (XA_CHUNK_SIZE - 1)
/* Number of per-node mark bitmaps. */
#define XA_MAX_MARKS                3
/* Longs needed to hold one bitmap of XA_CHUNK_SIZE bits. */
#define XA_MARK_LONGS                DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)

/*
 * One node of the XArray tree: XA_CHUNK_SIZE slots plus bookkeeping.
 *
 * @count is the count of every non-NULL element in the ->slots array
 * whether that is a value entry, a retry entry, a user pointer,
 * a sibling entry or a pointer to the next level of the tree.
 * @nr_values is the count of every element in ->slots which is
 * either a value entry or a sibling of a value entry.
 */
struct xa_node {
        unsigned char        shift;                /* Bits remaining in each slot */
        unsigned char        offset;                /* Slot offset in parent */
        unsigned char        count;                /* Total entry count */
        unsigned char        nr_values;        /* Value entry count */
        struct xa_node __rcu *parent;        /* NULL at top of tree */
        struct xarray        *array;                /* The array we belong to */
        /* These two members share storage; only one is meaningful at a time. */
        union {
                struct list_head private_list;        /* For tree user */
                struct rcu_head        rcu_head;        /* Used when freeing node */
        };
        void __rcu        *slots[XA_CHUNK_SIZE];
        /*
         * Per-mark bitmaps covering XA_CHUNK_SIZE bits each.  ->tags and
         * ->marks are two names for the same storage.
         */
        union {
                unsigned long        tags[XA_MAX_MARKS][XA_MARK_LONGS];
                unsigned long        marks[XA_MAX_MARKS][XA_MARK_LONGS];
        };
};

/*
 * Dump helpers, declared here and defined out of line.  They are called
 * by XA_BUG_ON() and XA_NODE_BUG_ON() just before BUG() when XA_DEBUG is
 * defined.
 */
void xa_dump(const struct xarray *);
void xa_dump_node(const struct xa_node *);

/*
 * Debug assertions.  With XA_DEBUG defined, a true condition @x dumps the
 * array (or the node, when it is non-NULL) and then calls BUG().  Without
 * XA_DEBUG both macros expand to an empty statement and @x is not
 * evaluated at all.
 */
#ifdef XA_DEBUG
#define XA_BUG_ON(xa, x) do {                                        \
                if (x) {                                        \
                        xa_dump(xa);                                \
                        BUG();                                        \
                }                                                \
        } while (0)
#define XA_NODE_BUG_ON(node, x) do {                                \
                if (x) {                                        \
                        if (node) xa_dump_node(node);                \
                        BUG();                                        \
                }                                                \
        } while (0)
#else
#define XA_BUG_ON(xa, x)        do { } while (0)
#define XA_NODE_BUG_ON(node, x)        do { } while (0)
#endif

/*
 * Private: fetch the head pointer of @xa with rcu_dereference_check(),
 * passing "xa_lock is held" as the lockdep condition.
 */
static inline void *xa_head(const struct xarray *xa)
{
        void *head;

        head = rcu_dereference_check(xa->xa_head,
                                     lockdep_is_held(&xa->xa_lock));
        return head;
}

/*
 * Private: fetch the head pointer of @xa with
 * rcu_dereference_protected(), i.e. for callers that hold the xa_lock.
 */
static inline void *xa_head_locked(const struct xarray *xa)
{
        void *head;

        head = rcu_dereference_protected(xa->xa_head,
                                         lockdep_is_held(&xa->xa_lock));
        return head;
}

/*
 * Private: read slot @offset of @node with rcu_dereference_check(),
 * passing "xa_lock is held" as the lockdep condition.  Debug builds
 * assert that @offset lies inside the node.
 */
static inline void *xa_entry(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
        slot = rcu_dereference_check(node->slots[offset],
                                     lockdep_is_held(&xa->xa_lock));
        return slot;
}

/*
 * Private: read slot @offset of @node with rcu_dereference_protected(),
 * i.e. for callers that hold the xa_lock.  Debug builds assert that
 * @offset lies inside the node.
 */
static inline void *xa_entry_locked(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
        slot = rcu_dereference_protected(node->slots[offset],
                                         lockdep_is_held(&xa->xa_lock));
        return slot;
}

/*
 * Private: fetch the parent of @node with rcu_dereference_check(),
 * passing "xa_lock is held" as the lockdep condition.
 */
static inline struct xa_node *xa_parent(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *up;

        up = rcu_dereference_check(node->parent,
                                   lockdep_is_held(&xa->xa_lock));
        return up;
}

/*
 * Private: fetch the parent of @node with rcu_dereference_protected(),
 * i.e. for callers that hold the xa_lock.
 */
static inline struct xa_node *xa_parent_locked(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *up;

        up = rcu_dereference_protected(node->parent,
                                       lockdep_is_held(&xa->xa_lock));
        return up;
}

/* Private: turn a node pointer into a slot entry by setting bit 1. */
static inline void *xa_mk_node(const struct xa_node *node)
{
        unsigned long tagged = (unsigned long)node | 2;

        return (void *)tagged;
}

/* Private: recover the node pointer from an entry made by xa_mk_node(). */
static inline struct xa_node *xa_to_node(const void *entry)
{
        unsigned long addr = (unsigned long)entry - 2;

        return (struct xa_node *)addr;
}

/*
 * Private: an entry refers to a node when it is an internal entry whose
 * value is above 4096.
 */
static inline bool xa_is_node(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;
        return (unsigned long)entry > 4096;
}

/* Private: a sibling entry is the internal entry that encodes @offset. */
static inline void *xa_mk_sibling(unsigned int offset)
{
        void *sibling = xa_mk_internal(offset);

        return sibling;
}

/* Private: decode the slot offset carried by a sibling entry. */
static inline unsigned long xa_to_sibling(const void *entry)
{
        unsigned long offset = xa_to_internal(entry);

        return offset;
}

/**
 * xa_is_sibling() - Is the entry a sibling entry?
 * @entry: Entry retrieved from the XArray
 *
 * Always %false when CONFIG_XARRAY_MULTI is disabled.  Otherwise the
 * entry must be internal and compare below the sibling entry for
 * offset XA_CHUNK_SIZE - 1.
 *
 * Return: %true if the entry is a sibling entry.
 */
static inline bool xa_is_sibling(const void *entry)
{
        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!xa_is_internal(entry))
                return false;
        return entry < xa_mk_sibling(XA_CHUNK_SIZE - 1);
}

#define XA_RETRY_ENTRY                xa_mk_internal(256)

/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * Compares @entry with %XA_RETRY_ENTRY; a match is hinted as unlikely.
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
        const void *retry = XA_RETRY_ENTRY;

        return unlikely(entry == retry);
}

/**
 * xa_is_advanced() - Is the entry only permitted for the advanced API?
 * @entry: Entry to be stored in the XArray.
 *
 * Covers every internal entry up to and including %XA_RETRY_ENTRY.
 *
 * Return: %true if the entry cannot be stored by the normal API.
 */
static inline bool xa_is_advanced(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;
        return entry <= XA_RETRY_ENTRY;
}

/**
 * typedef xa_update_node_t - A callback function from the XArray.
 * @node: The node which is being processed
 *
 * This function is called every time the XArray updates the count of
 * present and value entries in a node.  It allows advanced users to
 * maintain the private_list in the node.
 *
 * A callback is attached to an operation state with xas_set_update().
 *
 * Context: The xa_lock is held and interrupts may be disabled.
 *            Implementations should not drop the xa_lock, nor re-enable
 *            interrupts.
 */
typedef void (*xa_update_node_t)(struct xa_node *node);

/* Defined out of line; takes the node and an update callback. */
void xa_delete_node(struct xa_node *, xa_update_node_t);

/*
 * The xa_state is opaque to its users.  It contains various different pieces
 * of state involved in the current operation on the XArray.  It should be
 * declared on the stack and passed between the various internal routines.
 * The various elements in it should not be accessed directly, but only
 * through the provided accessor functions.  The below documentation is for
 * the benefit of those working on the code, not for users of the XArray.
 *
 * @xa_node usually points to the xa_node containing the slot we're operating
 * on (and @xa_offset is the offset in the slots array).  If there is a
 * single entry in the array at index 0, there are no allocated xa_nodes to
 * point to, and so we store %NULL in @xa_node.  @xa_node is set to
 * the value %XAS_RESTART if the xa_state is not walked to the correct
 * position in the tree of nodes for this operation.  If an error occurs
 * during an operation, it is set to an %XAS_ERROR value.  If we run off the
 * end of the allocated nodes, it is set to %XAS_BOUNDS.
 *
 * @xa_shift and @xa_sibs describe the order of the entry being operated
 * on; see XA_STATE_ORDER() and xas_set_order() for how they are derived.
 */
struct xa_state {
        struct xarray *xa;                /* Array being operated on */
        unsigned long xa_index;                /* Index of interest */
        unsigned char xa_shift;                /* Order rounded down to a multiple of XA_CHUNK_SHIFT */
        unsigned char xa_sibs;                /* (1 << (order % XA_CHUNK_SHIFT)) - 1 */
        unsigned char xa_offset;        /* Offset in xa_node's slots array */
        unsigned char xa_pad;                /* Helps gcc generate better code */
        struct xa_node *xa_node;        /* Node, NULL, XAS_RESTART, XAS_BOUNDS or an error */
        struct xa_node *xa_alloc;        /* NOTE(review): presumably a node preallocated by xas_nomem() - confirm */
        xa_update_node_t xa_update;        /* Callback set by xas_set_update(), may be NULL */
};

/*
 * We encode errnos in the xas->xa_node.  If an error has happened, we need to
 * drop the lock to fix it, and once we've done so the xa_state is invalid.
 *
 * XA_ERROR() shifts the errno left by two bits and sets bit 1.  The
 * argument is parenthesised so that a compound expression (for example a
 * conditional) is cast and shifted as a whole.
 * XAS_BOUNDS and XAS_RESTART are the small sentinel values 1 and 3.
 */
#define XA_ERROR(err) ((struct xa_node *)(((unsigned long)(err) << 2) | 2UL))
#define XAS_BOUNDS        ((struct xa_node *)1UL)
#define XAS_RESTART        ((struct xa_node *)3UL)

/*
 * Initialiser for a struct xa_state.  The caller supplies the array, index,
 * shift and sibling count; the state starts out as XAS_RESTART with a zero
 * offset and with xa_alloc and xa_update cleared.
 */
#define __XA_STATE(array, index, shift, sibs)  {        \
        .xa = array,                                        \
        .xa_index = index,                                \
        .xa_shift = shift,                                \
        .xa_sibs = sibs,                                \
        .xa_offset = 0,                                        \
        .xa_pad = 0,                                        \
        .xa_node = XAS_RESTART,                                \
        .xa_alloc = NULL,                                \
        .xa_update = NULL                                \
}

/**
 * XA_STATE() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 *
 * Declare and initialise an xa_state on the stack.  The shift and
 * sibling count are both zero; use XA_STATE_ORDER() to specify an order.
 */
#define XA_STATE(name, array, index)                                \
        struct xa_state name = __XA_STATE(array, index, 0, 0)

/**
 * XA_STATE_ORDER() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 * @order: Order of entry.
 *
 * Declare and initialise an xa_state on the stack.  This variant of
 * XA_STATE() allows you to specify the 'order' of the element you
 * want to operate on.
 *
 * @index is rounded down to a multiple of 2^@order.  @index and @order
 * are each expanded more than once, so they must be free of side
 * effects.
 */
#define XA_STATE_ORDER(name, array, index, order)                \
        struct xa_state name = __XA_STATE(array,                \
                        ((index) >> (order)) << (order),        \
                        (order) - ((order) % XA_CHUNK_SHIFT),        \
                        (1U << ((order) % XA_CHUNK_SHIFT)) - 1)

/*
 * Convenience wrappers: each applies the xa_*() helper of the same name
 * to the array referenced by the given xa_state.
 */
#define xas_marked(xas, mark)        xa_marked((xas)->xa, (mark))
#define xas_trylock(xas)        xa_trylock((xas)->xa)
#define xas_lock(xas)                xa_lock((xas)->xa)
#define xas_unlock(xas)                xa_unlock((xas)->xa)
#define xas_lock_bh(xas)        xa_lock_bh((xas)->xa)
#define xas_unlock_bh(xas)        xa_unlock_bh((xas)->xa)
#define xas_lock_irq(xas)        xa_lock_irq((xas)->xa)
#define xas_unlock_irq(xas)        xa_unlock_irq((xas)->xa)
#define xas_lock_irqsave(xas, flags) \
                                xa_lock_irqsave((xas)->xa, flags)
#define xas_unlock_irqrestore(xas, flags) \
                                xa_unlock_irqrestore((xas)->xa, flags)

/**
 * xas_error() - Return an errno stored in the xa_state.
 * @xas: XArray operation state.
 *
 * Decodes the state's node pointer with xa_err().
 *
 * Return: 0 if no error has been noted.  A negative errno if one has.
 */
static inline int xas_error(const struct xa_state *xas)
{
        const struct xa_node *node = xas->xa_node;

        return xa_err(node);
}

/**
 * xas_set_err() - Note an error in the xa_state.
 * @xas: XArray operation state.
 * @err: Negative error number.
 *
 * The error is encoded into the state's node pointer with XA_ERROR().
 * Only pass a negative @err; zero or positive values will probably not
 * behave the way you think they should.  To clear the error from an
 * xa_state, use xas_reset().
 */
static inline void xas_set_err(struct xa_state *xas, long err)
{
        struct xa_node *encoded = XA_ERROR(err);

        xas->xa_node = encoded;
}

/**
 * xas_invalid() - Is the xas in a retry or error state?
 * @xas: XArray operation state.
 *
 * Tests the two low bits of the state's node pointer.
 *
 * Return: %true if the xas cannot be used for operations.
 */
static inline bool xas_invalid(const struct xa_state *xas)
{
        unsigned long low_bits = (unsigned long)xas->xa_node & 3;

        return low_bits != 0;
}

/**
 * xas_valid() - Is the xas a valid cursor into the array?
 * @xas: XArray operation state.
 *
 * The exact opposite of xas_invalid().
 *
 * Return: %true if the xas can be used for operations.
 */
static inline bool xas_valid(const struct xa_state *xas)
{
        if (xas_invalid(xas))
                return false;
        return true;
}

/**
 * xas_is_node() - Does the xas point to a node?
 * @xas: XArray operation state.
 *
 * The state must be valid and its node pointer must be non-NULL.
 *
 * Return: %true if the xas currently references a node.
 */
static inline bool xas_is_node(const struct xa_state *xas)
{
        if (!xas_valid(xas))
                return false;
        return xas->xa_node != NULL;
}

/* True for NULL and for any pointer with one of its two low bits set */
static inline bool xas_not_node(struct xa_node *node)
{
        if (!node)
                return true;
        return ((unsigned long)node & 3) != 0;
}

/* True if bit 1 of the pointer is set, as it is for RESTART and errors */
static inline bool xas_frozen(struct xa_node *node)
{
        return ((unsigned long)node & 2) != 0;
}

/*
 * True if the node represents head-of-tree, RESTART or BOUNDS, i.e. the
 * pointer does not compare above XAS_RESTART.
 */
static inline bool xas_top(struct xa_node *node)
{
        struct xa_node *limit = XAS_RESTART;

        return node <= limit;
}

/**
 * xas_reset() - Reset an XArray operation state.
 * @xas: XArray operation state.
 *
 * Puts @xas back into the restart state, discarding any recorded error
 * or walk position, so that the next walk begins at the root.  Call
 * this after dropping the xarray lock if the xa_state is to be reused.
 *
 * Context: Any context.
 */
static inline void xas_reset(struct xa_state *xas)
{
        struct xa_node *restart = XAS_RESTART;

        xas->xa_node = restart;
}

/**
 * xas_retry() - Retry the operation if appropriate.
 * @xas: XArray operation state.
 * @entry: Entry from xarray.
 *
 * The advanced functions may hand back an internal entry, such as a
 * retry entry or a zero entry.  A zero entry only asks the caller to
 * retry; a retry entry additionally resets @xas so that the walk
 * restarts from the head of the array.
 *
 * Context: Any context.
 * Return: true if the operation needs to be retried.
 */
static inline bool xas_retry(struct xa_state *xas, const void *entry)
{
        if (xa_is_zero(entry))
                return true;

        if (xa_is_retry(entry)) {
                xas_reset(xas);
                return true;
        }

        return false;
}

/*
 * Out-of-line operations of the advanced API.  Only the prototypes live
 * here; the inline iterators in this header fall back to xas_find() and
 * xas_find_marked() for the cases they do not handle themselves.
 */
void *xas_load(struct xa_state *);
void *xas_store(struct xa_state *, void *entry);
void *xas_find(struct xa_state *, unsigned long max);
void *xas_find_conflict(struct xa_state *);

/* Mark manipulation on the entry the xa_state refers to. */
bool xas_get_mark(const struct xa_state *, xa_mark_t);
void xas_set_mark(const struct xa_state *, xa_mark_t);
void xas_clear_mark(const struct xa_state *, xa_mark_t);
void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
void xas_init_marks(const struct xa_state *);

bool xas_nomem(struct xa_state *, gfp_t);
void xas_pause(struct xa_state *);

void xas_create_range(struct xa_state *);

/*
 * Multi-index support.  With CONFIG_XARRAY_MULTI the real implementations
 * are declared here; otherwise the inline stubs below are used.
 */
#ifdef CONFIG_XARRAY_MULTI
int xa_get_order(struct xarray *, unsigned long index);
void xas_split(struct xa_state *, void *entry, unsigned int order);
void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
#else
/* Stub: always reports order 0. */
static inline int xa_get_order(struct xarray *xa, unsigned long index)
{
        return 0;
}

/* Stub: ignores @order and simply stores @entry. */
static inline void xas_split(struct xa_state *xas, void *entry,
                unsigned int order)
{
        xas_store(xas, entry);
}

/* Stub: does nothing. */
static inline void xas_split_alloc(struct xa_state *xas, void *entry,
                unsigned int order, gfp_t gfp)
{
}
#endif

/**
 * xas_reload() - Refetch an entry from the xarray.
 * @xas: XArray operation state.
 *
 * Use this function to check that a previously loaded entry still has
 * the same value.  This is useful for the lockless pagecache lookup where
 * we walk the array with only the RCU lock to protect us, lock the page,
 * then check that the page hasn't moved since we looked it up.
 *
 * The caller guarantees that @xas is still valid.  If it may be in an
 * error or restart state, call xas_load() instead.
 *
 * Return: The entry at this location in the xarray.
 */
static inline void *xas_reload(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        void *entry;
        /*
         * Slot offsets are unsigned: xa_entry() takes an unsigned int and
         * xa_to_sibling() returns an unsigned long, so do not narrow the
         * value through plain char, whose signedness varies by platform.
         */
        unsigned int offset;

        /* No node: the entry lives directly in the array head. */
        if (!node)
                return xa_head(xas->xa);
        if (IS_ENABLED(CONFIG_XARRAY_MULTI)) {
                /* Recompute the offset from the index rather than trust xa_offset. */
                offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK;
                entry = xa_entry(xas->xa, node, offset);
                if (!xa_is_sibling(entry))
                        return entry;
                /* A sibling entry names the slot that holds the real entry. */
                offset = xa_to_sibling(entry);
        } else {
                offset = xas->xa_offset;
        }
        return xa_entry(xas->xa, node, offset);
}

/**
 * xas_set() - Set up XArray operation state for a different index.
 * @xas: XArray operation state.
 * @index: New index into the XArray.
 *
 * Point the operation state at @index and put it into the restart
 * state, so the next walk starts from the top.  See xas_next() for
 * moving to an adjacent index instead.
 */
static inline void xas_set(struct xa_state *xas, unsigned long index)
{
        xas->xa_node = XAS_RESTART;
        xas->xa_index = index;
}

/**
 * xas_set_order() - Set up XArray operation state for a multislot entry.
 * @xas: XArray operation state.
 * @index: Target of the operation.
 * @order: Entry occupies 2^@order indices.
 *
 * With CONFIG_XARRAY_MULTI, @index is rounded down to a multiple of
 * 2^@order (or set to 0 when @order is at least BITS_PER_LONG) and the
 * shift and sibling count are derived from @order.  Without it, any
 * non-zero @order hits BUG_ON() and the call behaves like xas_set().
 */
static inline void xas_set_order(struct xa_state *xas, unsigned long index,
                                        unsigned int order)
{
#ifdef CONFIG_XARRAY_MULTI
        /* Part of the order not covered by whole tree levels. */
        unsigned int sib_bits = order % XA_CHUNK_SHIFT;

        if (order < BITS_PER_LONG)
                xas->xa_index = (index >> order) << order;
        else
                xas->xa_index = 0;
        xas->xa_shift = order - sib_bits;
        xas->xa_sibs = (1 << sib_bits) - 1;
        xas->xa_node = XAS_RESTART;
#else
        BUG_ON(order > 0);
        xas_set(xas, index);
#endif
}

/**
 * xas_set_update() - Set up XArray operation state for a callback.
 * @xas: XArray operation state.
 * @update: Function to call when updating a node.
 *
 * Registers @update in @xas so that the XArray can notify the caller
 * after it has updated an xa_node.  This is advanced functionality and
 * is only needed by the page cache.
 */
static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
{
        xa_update_node_t callback = update;

        xas->xa_update = callback;
}

/**
 * xas_next_entry() - Advance iterator to next present entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * xas_next_entry() is an inline function to optimise xarray traversal for
 * speed.  It is equivalent to calling xas_find(), and will call xas_find()
 * for all the hard cases.
 *
 * The fast path only steps through the slots of the current node; it
 * applies when @xas references a node with shift 0 and its offset agrees
 * with its index.
 *
 * Return: The next present entry after the one currently referred to by @xas.
 */
static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
{
        struct xa_node *node = xas->xa_node;
        void *entry;

        /* Not positioned in a shift-0 node, or offset out of step with index. */
        if (unlikely(xas_not_node(node) || node->shift ||
                        xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
                return xas_find(xas, max);

        do {
                /* Reached @max: hand over to xas_find(). */
                if (unlikely(xas->xa_index >= max))
                        return xas_find(xas, max);
                /* Already at the last slot of this node. */
                if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
                        return xas_find(xas, max);
                entry = xa_entry(xas->xa, node, xas->xa_offset + 1);
                /* Internal entries are handled by xas_find(), before advancing. */
                if (unlikely(xa_is_internal(entry)))
                        return xas_find(xas, max);
                xas->xa_offset++;
                xas->xa_index++;
        } while (!entry);

        return entry;
}

/*
 * Private: find the first slot in the current node, at or after the
 * current offset (strictly after it when @advance is set), whose @mark
 * bit is set.  Returns XA_CHUNK_SIZE when there is none.
 */
static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
                xa_mark_t mark)
{
        unsigned long *bitmap = xas->xa_node->marks[(__force unsigned)mark];
        unsigned int start = xas->xa_offset;
        unsigned long pending;

        if (advance)
                start++;

        /* General case: the bitmap is not exactly one long wide. */
        if (XA_CHUNK_SIZE != BITS_PER_LONG)
                return find_next_bit(bitmap, XA_CHUNK_SIZE, start);

        /* Single-word bitmap: mask off the bits below @start by hand. */
        if (start >= XA_CHUNK_SIZE)
                return XA_CHUNK_SIZE;
        pending = *bitmap & (~0UL << start);
        if (!pending)
                return XA_CHUNK_SIZE;
        return __ffs(pending);
}

/**
 * xas_next_marked() - Advance iterator to next marked entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark to search for.
 *
 * xas_next_marked() is an inline function to optimise xarray traversal for
 * speed.  It is equivalent to calling xas_find_marked(), and will call
 * xas_find_marked() for all the hard cases.
 *
 * The fast path only searches the mark bitmap of the current node; it
 * applies when @xas references a node with shift 0.
 *
 * Return: The next marked entry after the one currently referred to by @xas.
 */
static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
                                                                xa_mark_t mark)
{
        struct xa_node *node = xas->xa_node;
        void *entry;
        unsigned int offset;

        /* Not positioned in a shift-0 node: take the slow path. */
        if (unlikely(xas_not_node(node) || node->shift))
                return xas_find_marked(xas, max, mark);
        /* Next marked slot strictly after the current one, or XA_CHUNK_SIZE. */
        offset = xas_find_chunk(xas, true, mark);
        /* The state is updated even when no entry is returned below. */
        xas->xa_offset = offset;
        xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset;
        if (xas->xa_index > max)
                return NULL;
        /* No more marks in this node. */
        if (offset == XA_CHUNK_SIZE)
                return xas_find_marked(xas, max, mark);
        entry = xa_entry(xas->xa, node, offset);
        /* Mark bit set but slot reads empty: let the slow path decide. */
        if (!entry)
                return xas_find_marked(xas, max, mark);
        return entry;
}

/*
 * If iterating while holding a lock, drop the lock and reschedule
 * every %XA_CHECK_SCHED loops.
 *
 * This is only a constant for callers to use; nothing in this part of
 * the header enforces it.
 */
enum {
        XA_CHECK_SCHED = 4096,
};

/**
 * xas_for_each() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 *
 * The loop body will be executed for each entry present in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * This expands to a for loop: the first entry comes from xas_find() and
 * each later one from xas_next_entry().  The loop ends when a lookup
 * returns %NULL.  @xas and @max appear in the step expression, so they
 * are evaluated again on every iteration.
 */
#define xas_for_each(xas, entry, max) \
        for (entry = xas_find(xas, max); entry; \
             entry = xas_next_entry(xas, max))

/**
 * xas_for_each_marked() - Iterate over marked entries in a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 * @mark: Mark to search for.
 *
 * The loop body will be executed for each marked entry in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * This expands to a for loop: the first entry comes from
 * xas_find_marked() and each later one from xas_next_marked().  The loop
 * ends when a lookup returns %NULL.  @xas, @max and @mark appear in the
 * step expression, so they are evaluated again on every iteration.
 */
#define xas_for_each_marked(xas, entry, max, mark) \
        for (entry = xas_find_marked(xas, max, mark); entry; \
             entry = xas_next_marked(xas, max, mark))

/**
 * xas_for_each_conflict() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 *
 * The loop body will be executed for each entry in the XArray that
 * lies within the range specified by @xas.  If the loop terminates
 * normally, @entry will be %NULL.  The user may break out of the loop,
 * which will leave @entry set to the conflicting entry.  The caller
 * may also call xa_set_err() to exit the loop while setting an error
 * to record the reason.
 *
 * NOTE(review): "xa_set_err()" above presumably means the xa_state
 * error setter (xas_set_err()) - confirm against the rest of this header.
 *
 * This expands to a while loop that assigns the result of
 * xas_find_conflict() to @entry and stops when that result is %NULL.
 */
#define xas_for_each_conflict(xas, entry) \
        while ((entry = xas_find_conflict(xas)))

/*
 * Out-of-line helpers: xas_next() and xas_prev() below call these whenever
 * their inline fast path does not apply.
 */
void *__xas_next(struct xa_state *);
void *__xas_prev(struct xa_state *);

/**
 * xas_prev() - Move iterator to previous index.
 * @xas: XArray operation state.
 *
 * An @xas in an error state stays in that state and %NULL is returned.
 * An @xas that has never been walked behaves as if xas_load() had been
 * called.  In every other case the index is decremented by one and the
 * state is walked to the matching location in the array, ready for the
 * next operation.
 *
 * Stepping back from index 0 wraps around to %ULONG_MAX.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_prev(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        /* Not a real node, or a node with a non-zero shift. */
        if (unlikely(xas_not_node(node) || node->shift))
                return __xas_prev(xas);

        /* Already at the first slot: the step leaves this node. */
        if (unlikely(xas->xa_offset == 0))
                return __xas_prev(xas);

        /* Fast path: move one slot back inside the same node. */
        xas->xa_index -= 1;
        xas->xa_offset -= 1;
        return xa_entry(xas->xa, node, xas->xa_offset);
}

/**
 * xas_next() - Move state to next index.
 * @xas: XArray operation state.
 *
 * An @xas in an error state stays in that state and %NULL is returned.
 * An @xas that has never been walked behaves as if xas_load() had been
 * called.  In every other case the index is incremented by one and the
 * state is walked to the matching location in the array, ready for the
 * next operation.
 *
 * Stepping forward from index %ULONG_MAX wraps around to 0.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_next(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        /* Not a real node, or a node with a non-zero shift. */
        if (unlikely(xas_not_node(node) || node->shift))
                return __xas_next(xas);

        /* Already at the last slot: the step leaves this node. */
        if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
                return __xas_next(xas);

        /* Fast path: move one slot forward inside the same node. */
        xas->xa_index += 1;
        xas->xa_offset += 1;
        return xa_entry(xas->xa, node, xas->xa_offset);
}

#endif /* _LINUX_XARRAY_H */































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/buffer_head.h
 *
 * Everything to do with buffer_heads.
 */

#ifndef _LINUX_BUFFER_HEAD_H
#define _LINUX_BUFFER_HEAD_H

#include <linux/types.h>
#include <linux/fs.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/wait.h>
#include <linux/atomic.h>

#ifdef CONFIG_BLOCK

/*
 * Bit numbers within buffer_head.b_state.  The accessor functions
 * generated by BUFFER_FNS()/TAS_BUFFER_FNS() pass these values to the
 * bit operations on &bh->b_state.
 */
enum bh_state_bits {
        BH_Uptodate,        /* Contains valid data */
        BH_Dirty,        /* Is dirty */
        BH_Lock,        /* Is locked */
        BH_Req,                /* Has been submitted for I/O */

        BH_Mapped,        /* Has a disk mapping */
        BH_New,                /* Disk mapping was newly created by get_block */
        BH_Async_Read,        /* Is under end_buffer_async_read I/O */
        BH_Async_Write,        /* Is under end_buffer_async_write I/O */
        BH_Delay,        /* Buffer is not yet allocated on disk */
        BH_Boundary,        /* Block is followed by a discontiguity */
        BH_Write_EIO,        /* I/O error on write */
        BH_Unwritten,        /* Buffer is allocated on disk but not written */
        BH_Quiet,        /* Buffer error printks are to be quiet */
        BH_Meta,        /* Buffer contains metadata */
        BH_Prio,        /* Buffer should be submitted with REQ_PRIO */
        BH_Defer_Completion, /* Defer AIO completion to workqueue */

        BH_PrivateStart,/* not a state bit, but the first bit available
                         * for private allocation by other entities
                         */
};

/*
 * PAGE_SIZE divided by 512.
 * NOTE(review): presumably 512 bytes is the smallest buffer size, making
 * this the largest number of buffers one page can hold - confirm.
 */
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)

/* Forward declarations: only pointers to these types are needed below. */
struct page;
struct buffer_head;
struct address_space;
/* Function type of the I/O completion callback stored in bh->b_end_io. */
typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);

/*
 * Historically, a buffer_head was used to map a single block
 * within a page, and of course as the unit of I/O through the
 * filesystem and block layers.  Nowadays the basic I/O unit
 * is the bio, and buffer_heads are used for extracting block
 * mappings (via a get_block_t call), for tracking state within
 * a page (via a page_mapping) and for wrapping bio submission
 * for backward compatibility reasons (e.g. submit_bh).
 *
 * The bits of b_state are numbered by enum bh_state_bits.
 */
struct buffer_head {
        unsigned long b_state;                /* buffer state bitmap (see above) */
        struct buffer_head *b_this_page;/* circular list of page's buffers */
        struct page *b_page;                /* the page this bh is mapped to */

        sector_t b_blocknr;                /* start block number */
        size_t b_size;                        /* size of mapping */
        char *b_data;                        /* pointer to data within the page */

        /* block device; map_bh() fills this in from sb->s_bdev */
        struct block_device *b_bdev;
        bh_end_io_t *b_end_io;                /* I/O completion */
         void *b_private;                /* reserved for b_end_io */
        struct list_head b_assoc_buffers; /* associated with another mapping */
        struct address_space *b_assoc_map;        /* mapping this buffer is
                                                   associated with */
        atomic_t b_count;                /* users using this buffer_head */
        spinlock_t b_uptodate_lock;        /* Used by the first bh in a page, to
                                         * serialise IO completion of other
                                         * buffers in the page */
};

/*
 * Macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
 * and buffer_foo() functions for the state bit BH_<bit>.
 *
 * set_buffer_foo() tests the flag first and only calls set_bit() when it
 * is not already set, because setting a flag that is already set causes
 * a costly cache line transition.
 */
#define BUFFER_FNS(bit, name)                                                \
static __always_inline void set_buffer_##name(struct buffer_head *bh)        \
{                                                                        \
        if (!test_bit(BH_##bit, &(bh)->b_state))                        \
                set_bit(BH_##bit, &(bh)->b_state);                        \
}                                                                        \
static __always_inline void clear_buffer_##name(struct buffer_head *bh)        \
{                                                                        \
        clear_bit(BH_##bit, &(bh)->b_state);                                \
}                                                                        \
static __always_inline int buffer_##name(const struct buffer_head *bh)        \
{                                                                        \
        return test_bit(BH_##bit, &(bh)->b_state);                        \
}

/*
 * test_set_buffer_foo() and test_clear_buffer_foo()
 *
 * Each generated function returns the result of test_and_set_bit() or
 * test_and_clear_bit() on the BH_<bit> bit of bh->b_state.
 *
 * The last line of the macro ends in a line continuation, so the line
 * that follows the macro is still part of it and must stay blank.
 */
#define TAS_BUFFER_FNS(bit, name)                                        \
static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \
{                                                                        \
        return test_and_set_bit(BH_##bit, &(bh)->b_state);                \
}                                                                        \
static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \
{                                                                        \
        return test_and_clear_bit(BH_##bit, &(bh)->b_state);                \
}                                                                        \

/*
 * Emit the buffer bitops functions.   Note that there are also functions
 * of the form "mark_buffer_foo()".  These are higher-level functions which
 * do something in addition to setting a b_state bit.
 *
 * BH_Uptodate is not emitted here: its three accessors are written out by
 * hand right after this list.  BH_Quiet has no accessors in this list.
 */
BUFFER_FNS(Dirty, dirty)
TAS_BUFFER_FNS(Dirty, dirty)
BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req)
TAS_BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped)
BUFFER_FNS(New, new)
BUFFER_FNS(Async_Read, async_read)
BUFFER_FNS(Async_Write, async_write)
BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
BUFFER_FNS(Defer_Completion, defer_completion)

/*
 * Set BH_Uptodate in bh->b_state, issuing a memory barrier before the
 * bit is set.  Does nothing if the bit is already set.
 */
static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
{
        /*
         * If the bit is already set, whoever set it has done the memory
         * barrier, and a reader will thus see *some* valid buffer state.
         *
         * Any other serialization (with IO errors or whatever that
         * might clear the bit) has to come from other state (eg BH_Lock).
         */
        if (!test_bit(BH_Uptodate, &bh->b_state)) {
                /*
                 * Kept consistent with folio_mark_uptodate; this barrier
                 * pairs with the smp_load_acquire in buffer_uptodate.
                 */
                smp_mb__before_atomic();
                set_bit(BH_Uptodate, &bh->b_state);
        }
}

/* Clear BH_Uptodate in bh->b_state with clear_bit(). */
static __always_inline void
clear_buffer_uptodate(struct buffer_head *bh)
{
        clear_bit(BH_Uptodate, &bh->b_state);
}

/*
 * Test BH_Uptodate.  Returns 1 when the bit is set in bh->b_state and 0
 * otherwise.
 */
static __always_inline int buffer_uptodate(const struct buffer_head *bh)
{
        /*
         * Kept consistent with folio_test_uptodate; this acquire load
         * pairs with smp_mb__before_atomic in set_buffer_uptodate.
         */
        unsigned long state = smp_load_acquire(&bh->b_state);

        return !!(state & (1UL << BH_Uptodate));
}

/*
 * The address in bh->b_data with the PAGE_MASK bits cleared, i.e. the
 * offset of the buffer's data from the start of its page.
 */
#define bh_offset(bh)                ((unsigned long)(bh)->b_data & ~PAGE_MASK)

/*
 * If we *know* page->private refers to buffer_heads
 *
 * page_buffers() hits BUG_ON() when PagePrivate(page) is false, then
 * yields page_private(page) cast to a buffer_head pointer.  Its argument
 * is expanded twice.  page_has_buffers() is simply PagePrivate(page).
 */
#define page_buffers(page)                                        \
        ({                                                        \
                BUG_ON(!PagePrivate(page));                        \
                ((struct buffer_head *)page_private(page));        \
        })
#define page_has_buffers(page)        PagePrivate(page)

void buffer_check_dirty_writeback(struct page *page,
                                     bool *dirty, bool *writeback);

/*
 * Declarations
 *
 * Prototypes only; none of these functions is defined in this header.
 */

void mark_buffer_dirty(struct buffer_head *bh);
void mark_buffer_write_io_error(struct buffer_head *bh);
void touch_buffer(struct buffer_head *bh);
void set_bh_page(struct buffer_head *bh,
                struct page *page, unsigned long offset);
int try_to_free_buffers(struct page *);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                bool retry);
void create_empty_buffers(struct page *, unsigned long,
                        unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
void end_buffer_async_write(struct buffer_head *bh, int uptodate);

/* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
int inode_has_buffers(struct inode *);
void invalidate_inode_buffers(struct inode *);
int remove_inode_buffers(struct inode *inode);
int sync_mapping_buffers(struct address_space *mapping);
/* clean_bdev_bh_alias() calls this with len == 1 for a single buffer. */
void clean_bdev_aliases(struct block_device *bdev, sector_t block,
                        sector_t len);
/*
 * Single-buffer wrapper: calls clean_bdev_aliases() on bh->b_bdev for the
 * one block starting at bh->b_blocknr.
 */
static inline void clean_bdev_bh_alias(struct buffer_head *bh)
{
        struct block_device *bdev = bh->b_bdev;

        clean_bdev_aliases(bdev, bh->b_blocknr, 1);
}

/*
 * More prototypes.  Several of the double-underscore functions are what
 * the inline helpers further down in this header call:
 *   __wait_on_buffer()  - from wait_on_buffer() when the buffer is locked
 *   __lock_buffer()     - from lock_buffer() when trylock_buffer() fails
 *   __brelse()/__bforget() - from brelse()/bforget() for a non-NULL bh
 *   __find_get_block(), __getblk_gfp(), __bread_gfp(), __breadahead(),
 *   __breadahead_gfp()  - from the sb_*() and getblk/bread wrappers
 */
void mark_buffer_async_write(struct buffer_head *bh);
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
                        unsigned size);
struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
                                  unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
                  gfp_t gfp);
struct buffer_head *__bread_gfp(struct block_device *,
                                sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
void __lock_buffer(struct buffer_head *bh);
void ll_rw_block(int, int, int, struct buffer_head * bh[]);
int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, int op_flags);
void write_dirty_buffer(struct buffer_head *bh, int op_flags);
int submit_bh(int, int, struct buffer_head *);
void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
int bh_submit_read(struct buffer_head *bh);

/* Replaced by a macro expanding to 0 when CONFIG_BLOCK is not set. */
extern int buffer_heads_over_limit;

/*
 * Generic address_space_operations implementations for buffer_head-backed
 * address_spaces.
 *
 * Prototypes only.  Most of these take the filesystem's get_block_t
 * callback as a parameter.
 */
void block_invalidatepage(struct page *page, unsigned int offset,
                          unsigned int length);
int block_write_full_page(struct page *page, get_block_t *get_block,
                                struct writeback_control *wbc);
int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler);
int block_read_full_page(struct page*, get_block_t*);
int block_is_partially_uptodate(struct page *page, unsigned long from,
                                unsigned long count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                unsigned flags, struct page **pagep, get_block_t *get_block);
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block);
int block_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
int generic_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
void clean_page_buffers(struct page *page);
int cont_write_begin(struct file *, struct address_space *, loff_t,
                        unsigned, unsigned, struct page **, void **,
                        get_block_t *, loff_t *);
int generic_cont_expand_simple(struct inode *inode, loff_t size);
int block_commit_write(struct page *page, unsigned from, unsigned to);
/*
 * Unlike the other prototypes here, the callback parameter is written as
 * "get_block_t get_block" rather than "get_block_t *get_block".
 * NOTE(review): block_page_mkwrite_return() presumably converts this
 * function's errno result into a vm_fault_t - confirm against callers.
 */
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                                get_block_t get_block);
/*
 * Convert errno to return value from ->page_mkwrite() call
 *
 *   0                 -> VM_FAULT_LOCKED
 *   -EFAULT, -EAGAIN  -> VM_FAULT_NOPAGE
 *   -ENOMEM           -> VM_FAULT_OOM
 *   anything else     -> VM_FAULT_SIGBUS
 */
static inline vm_fault_t block_page_mkwrite_return(int err)
{
        switch (err) {
        case 0:
                return VM_FAULT_LOCKED;
        case -EFAULT:
        case -EAGAIN:
                return VM_FAULT_NOPAGE;
        case -ENOMEM:
                return VM_FAULT_OOM;
        default:
                /* -ENOSPC, -EDQUOT, -EIO ... */
                return VM_FAULT_SIGBUS;
        }
}
/* Further prototypes; none of these is defined in this header. */
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);
int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned,
                                struct page **, void **, get_block_t*);
int nobh_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
int nobh_truncate_page(struct address_space *, loff_t, get_block_t *);
int nobh_writepage(struct page *page, get_block_t *get_block,
                        struct writeback_control *wbc);

/* An empty inline stub takes this function's place without CONFIG_BLOCK. */
void buffer_init(void);

/*
 * inline definitions
 */

/* Take a reference on @bh: atomically increments bh->b_count. */
static inline void
get_bh(struct buffer_head *bh)
{
        atomic_inc(&bh->b_count);
}

/*
 * Drop a reference on @bh: atomically decrements bh->b_count, with a
 * memory barrier issued ahead of the decrement.
 */
static inline void
put_bh(struct buffer_head *bh)
{
        smp_mb__before_atomic();
        atomic_dec(&bh->b_count);
}

/* NULL-tolerant wrapper around __brelse(): a NULL @bh is ignored. */
static inline void brelse(struct buffer_head *bh)
{
        if (!bh)
                return;
        __brelse(bh);
}

/* NULL-tolerant wrapper around __bforget(): a NULL @bh is ignored. */
static inline void bforget(struct buffer_head *bh)
{
        if (!bh)
                return;
        __bforget(bh);
}

/*
 * __bread_gfp() on the superblock's device (sb->s_bdev) with the
 * superblock's block size (sb->s_blocksize), passing __GFP_MOVABLE.
 */
static inline struct buffer_head *
sb_bread(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        return __bread_gfp(bdev, block, sb->s_blocksize, __GFP_MOVABLE);
}

/*
 * Same call as sb_bread(), except that 0 is passed as the gfp argument
 * instead of __GFP_MOVABLE.
 */
static inline struct buffer_head *
sb_bread_unmovable(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        return __bread_gfp(bdev, block, sb->s_blocksize, 0);
}

/*
 * __breadahead() on the superblock's device (sb->s_bdev) with the
 * superblock's block size (sb->s_blocksize).
 */
static inline void
sb_breadahead(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        __breadahead(bdev, block, sb->s_blocksize);
}

/*
 * __breadahead_gfp() on the superblock's device (sb->s_bdev) with the
 * superblock's block size (sb->s_blocksize), passing 0 as gfp.
 */
static inline void
sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        __breadahead_gfp(bdev, block, sb->s_blocksize, 0);
}

/*
 * __getblk_gfp() on the superblock's device (sb->s_bdev) with the
 * superblock's block size (sb->s_blocksize), passing __GFP_MOVABLE.
 */
static inline struct buffer_head *
sb_getblk(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        return __getblk_gfp(bdev, block, sb->s_blocksize, __GFP_MOVABLE);
}


/*
 * __getblk_gfp() on the superblock's device (sb->s_bdev) with the
 * superblock's block size (sb->s_blocksize) and the caller's @gfp.
 */
static inline struct buffer_head *
sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
{
        struct block_device *bdev = sb->s_bdev;

        return __getblk_gfp(bdev, block, sb->s_blocksize, gfp);
}

/*
 * __find_get_block() on the superblock's device (sb->s_bdev) with the
 * superblock's block size (sb->s_blocksize).
 */
static inline struct buffer_head *
sb_find_get_block(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        return __find_get_block(bdev, block, sb->s_blocksize);
}

/*
 * Record a disk mapping in @bh: set the BH_Mapped state bit, then fill in
 * the device and block size from @sb and the block number from @block.
 */
static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
        set_buffer_mapped(bh);
        bh->b_bdev = sb->s_bdev;
        bh->b_blocknr = block;
        bh->b_size = sb->s_blocksize;
}

/*
 * Calls __wait_on_buffer() if BH_Lock is set on @bh, otherwise returns at
 * once.  might_sleep() is invoked in both cases.
 */
static inline void wait_on_buffer(struct buffer_head *bh)
{
        might_sleep();
        if (!buffer_locked(bh))
                return;
        __wait_on_buffer(bh);
}

/*
 * Attempt to set BH_Lock on @bh with test_and_set_bit_lock().
 * Returns non-zero if the bit was clear before the call (lock taken),
 * 0 if it was already set.
 */
static inline int trylock_buffer(struct buffer_head *bh)
{
        int was_locked = test_and_set_bit_lock(BH_Lock, &bh->b_state);

        return likely(!was_locked);
}

/*
 * Lock @bh: try trylock_buffer() first and fall back to __lock_buffer()
 * when that fails.  might_sleep() is invoked in both cases.
 */
static inline void lock_buffer(struct buffer_head *bh)
{
        might_sleep();
        if (trylock_buffer(bh))
                return;
        __lock_buffer(bh);
}

/* __getblk_gfp() with 0 passed as the gfp argument. */
static inline struct buffer_head *
getblk_unmovable(struct block_device *bdev, sector_t block, unsigned size)
{
        return __getblk_gfp(bdev, block, size, 0);
}

/* __getblk_gfp() with __GFP_MOVABLE passed as the gfp argument. */
static inline struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
{
        return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
}

/**
 *  __bread() - reads a specified block and returns the bh
 *  @bdev: the block_device to read from
 *  @block: number of block
 *  @size: size (in bytes) to read
 *
 *  Thin wrapper that forwards to __bread_gfp() with __GFP_MOVABLE, so the
 *  page cache is allocated from movable area and can be migrated.
 *  Returns the buffer head containing the block, or NULL if the block
 *  was unreadable.
 */
static inline struct buffer_head *__bread(struct block_device *bdev,
                                          sector_t block,
                                          unsigned size)
{
        return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
}

extern int __set_page_dirty_buffers(struct page *page);

#else /* CONFIG_BLOCK */

/*
 * Stubs used when CONFIG_BLOCK is not set: empty bodies or constant
 * return values, and buffer_heads_over_limit becomes the constant 0.
 */
static inline void buffer_init(void) {}
static inline int try_to_free_buffers(struct page *page) { return 1; }
static inline int inode_has_buffers(struct inode *inode) { return 0; }
static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
#define buffer_heads_over_limit 0

#endif /* CONFIG_BLOCK */
#endif /* _LINUX_BUFFER_HEAD_H */




































































    1 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/pagevec.h
 *
 * In many places it is efficient to batch an operation up against multiple
 * pages.  A pagevec is a multipage container which is used for that.
 */

#ifndef _LINUX_PAGEVEC_H
#define _LINUX_PAGEVEC_H

#include <linux/xarray.h>

/*
 * 15 pointers + header align the pagevec structure to a power of two
 * (e.g. with 8-byte pointers: 15 * 8 bytes of pages[] plus the two
 * one-byte header fields padded to 8 bytes gives 128 bytes).
 */
#define PAGEVEC_SIZE        15

struct page;
struct address_space;

/* Fixed-capacity batch of up to PAGEVEC_SIZE page pointers. */
struct pagevec {
        /* entries of pages[] in use; pagevec_add() stores at pages[nr] */
        unsigned char nr;
        /* set to false by pagevec_init(); not otherwise used in this header */
        bool percpu_pvec_drained;
        struct page *pages[PAGEVEC_SIZE];
};

/*
 * Out-of-line functions; prototypes only.  pagevec_release() calls
 * __pagevec_release() for a non-empty pagevec, and pagevec_lookup() calls
 * pagevec_lookup_range() with (pgoff_t)-1 as @end.
 */
void __pagevec_release(struct pagevec *pvec);
void __pagevec_lru_add(struct pagevec *pvec);
unsigned pagevec_lookup_entries(struct pagevec *pvec,
                                struct address_space *mapping,
                                pgoff_t start, unsigned nr_entries,
                                pgoff_t *indices);
void pagevec_remove_exceptionals(struct pagevec *pvec);
unsigned pagevec_lookup_range(struct pagevec *pvec,
                              struct address_space *mapping,
                              pgoff_t *start, pgoff_t end);
/*
 * pagevec_lookup_range() with no upper bound: (pgoff_t)-1 is passed as
 * the end index.
 */
static inline unsigned pagevec_lookup(struct pagevec *pvec,
                                      struct address_space *mapping,
                                      pgoff_t *start)
{
        const pgoff_t last = (pgoff_t)-1;

        return pagevec_lookup_range(pvec, mapping, start, last);
}

/*
 * Tagged lookups; prototypes only.  pagevec_lookup_tag() calls
 * pagevec_lookup_range_tag() with (pgoff_t)-1 as @end.
 */
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
                xa_mark_t tag);
unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, pgoff_t end,
                xa_mark_t tag, unsigned max_pages);
/*
 * pagevec_lookup_range_tag() with no upper bound: (pgoff_t)-1 is passed
 * as the end index.
 */
static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
                struct address_space *mapping, pgoff_t *index, xa_mark_t tag)
{
        const pgoff_t last = (pgoff_t)-1;

        return pagevec_lookup_range_tag(pvec, mapping, index, last, tag);
}

/* Reset @pvec to empty and clear its percpu_pvec_drained flag. */
static inline void pagevec_init(struct pagevec *pvec)
{
        pvec->percpu_pvec_drained = false;
        pvec->nr = 0;
}

/*
 * Mark @pvec empty by zeroing its count.  Unlike pagevec_init(), the
 * percpu_pvec_drained flag is left as it is.
 */
static inline void
pagevec_reinit(struct pagevec *pvec)
{
        pvec->nr = 0;
}

/* Number of entries currently held in @pvec (the value of pvec->nr). */
static inline unsigned pagevec_count(struct pagevec *pvec)
{
        unsigned used = pvec->nr;

        return used;
}

/* Number of free slots left in @pvec: PAGEVEC_SIZE minus pvec->nr. */
static inline unsigned pagevec_space(struct pagevec *pvec)
{
        unsigned used = pvec->nr;

        return PAGEVEC_SIZE - used;
}

/*
 * Add a page to a pagevec.  Returns the number of slots still available.
 *
 * The page is stored at pages[nr] and nr is then incremented; no check
 * against PAGEVEC_SIZE is made here.
 */
static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page)
{
        unsigned char slot = pvec->nr;

        pvec->nr = slot + 1;
        pvec->pages[slot] = page;
        return pagevec_space(pvec);
}

/* Call __pagevec_release() on @pvec unless it is empty. */
static inline void pagevec_release(struct pagevec *pvec)
{
        if (!pagevec_count(pvec))
                return;
        __pagevec_release(pvec);
}

#endif /* _LINUX_PAGEVEC_H */



















































































































































































































    1 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Tracing hooks
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.  All rights reserved.
 *
 * This file defines hook entry points called by core code where
 * user tracing/debugging support might need to do something.  These
 * entry points are called tracehook_*().  Each hook declared below
 * has a detailed kerneldoc comment giving the context (locking et
 * al) from which it is called, and the meaning of its return value.
 *
 * Each function here typically has only one call site, so it is ok
 * to have some nontrivial tracehook_*() inlines.  In all cases, the
 * fast path when no tracing is enabled should be very short.
 *
 * The purpose of this file and the tracehook_* layer is to consolidate
 * the interface that the kernel core and arch code uses to enable any
 * user debugging or tracing facility (such as ptrace).  The interfaces
 * here are carefully documented so that maintainers of core and arch
 * code do not need to think about the implementation details of the
 * tracing facilities.  Likewise, maintainers of the tracing code do not
 * need to understand all the calling core or arch code in detail, just
 * documented circumstances of each call, such as locking conditions.
 *
 * If the calling core code changes so that locking is different, then
 * it is ok to change the interface documented here.  The maintainer of
 * core code changing should notify the maintainers of the tracing code
 * that they need to work out the change.
 *
 * Some tracehook_*() inlines take arguments that the current tracing
 * implementations might not necessarily use.  These function signatures
 * are chosen to pass in all the information that is on hand in the
 * caller and might conceivably be relevant to a tracer, so that the
 * core code won't have to be updated when tracing adds more features.
 * If a call site changes so that some of those parameters are no longer
 * already on hand without extra work, then the tracehook_* interface
 * can change so there is no make-work burden on the core code.  The
 * maintainer of core code changing should notify the maintainers of the
 * tracing code that they need to work out the change.
 */

#ifndef _LINUX_TRACEHOOK_H
#define _LINUX_TRACEHOOK_H        1

#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/task_work.h>
#include <linux/memcontrol.h>
#include <linux/blk-cgroup.h>
struct linux_binprm;

/*
 * ptrace report for syscall entry and exit looks identical.
 *
 * Returns 0 straight away unless PT_PTRACED is set in current->ptrace.
 * Otherwise @message is held in current->ptrace_message for the duration
 * of the report, and the result is fatal_signal_pending(current).
 * @regs is not used by this function.
 */
static inline int ptrace_report_syscall(struct pt_regs *regs,
                                        unsigned long message)
{
        int ptrace = current->ptrace;
        int signr = SIGTRAP;

        if (!(ptrace & PT_PTRACED))
                return 0;

        /* 0x80 is or-ed into the signal number under PT_TRACESYSGOOD. */
        if (ptrace & PT_TRACESYSGOOD)
                signr |= 0x80;

        current->ptrace_message = message;
        ptrace_notify(signr);

        /*
         * this isn't the same as continuing with a signal, but it will do
         * for normal use.  strace only continues with a signal if the
         * stopping signal is not SIGTRAP.  -brl
         */
        if (current->exit_code) {
                send_sig(current->exit_code, current, 1);
                current->exit_code = 0;
        }

        current->ptrace_message = 0;

        return fatal_signal_pending(current);
}

/**
 * tracehook_report_syscall_entry - task is about to attempt a system call
 * @regs:                user register state of current task
 *
 * This will be called if %TIF_SYSCALL_TRACE or %TIF_SYSCALL_EMU have been set,
 * when the current task has just entered the kernel for a system call.
 * Full user register state is available here.  Changing the values
 * in @regs can affect the system call number and arguments to be tried.
 * It is safe to block here, preventing the system call from beginning.
 *
 * Returns zero normally, or nonzero if the calling arch code should abort
 * the system call.  That must prevent normal entry so no system call is
 * made.  If @task ever returns to user mode after this, its register state
 * is unspecified, but should be something harmless like an %ENOSYS error
 * return.  It should preserve enough information so that syscall_rollback()
 * can work (see asm-generic/syscall.h).
 *
 * Called without locks, just after entering kernel mode.
 */
static inline __must_check int
tracehook_report_syscall_entry(struct pt_regs *regs)
{
        /* Entry and exit share one reporter; only the event message differs. */
        const int abort_syscall =
                ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY);

        return abort_syscall;
}

/**
 * tracehook_report_syscall_exit - task has just finished a system call
 * @regs:                user register state of current task
 * @step:                nonzero if simulating single-step or block-step
 *
 * This will be called if %TIF_SYSCALL_TRACE has been set, when the
 * current task has just finished an attempted system call.  Full
 * user register state is available here.  It is safe to block here,
 * preventing signals from being processed.
 *
 * If @step is nonzero, this report is also in lieu of the normal
 * trap that would follow the system call instruction because
 * user_enable_block_step() or user_enable_single_step() was used.
 * In this case, %TIF_SYSCALL_TRACE might not be set.
 *
 * Called without locks, just before checking for pending signals.
 */
static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
{
        /* Ordinary case: report the syscall exit; the result is not used. */
        if (!step) {
                ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT);
                return;
        }

        /* Stepping: the single-step report replaces the syscall-exit report. */
        user_single_step_report(regs);
}

/**
 * tracehook_signal_handler - signal handler setup is complete
 * @stepping:                nonzero if debugger single-step or block-step in use
 *
 * Called by the arch code after a signal handler has been set up.
 * Register and stack state reflects the user handler about to run.
 * Signal mask changes have already been made.
 *
 * Called without locks, shortly before returning to user mode
 * (or handling more signals).
 */
static inline void tracehook_signal_handler(int stepping)
{
        /* Only a stepping task has anything to report here. */
        if (!stepping)
                return;

        ptrace_notify(SIGTRAP);
}

/**
 * set_notify_resume - cause tracehook_notify_resume() to be called
 * @task:                task that will call tracehook_notify_resume()
 *
 * Calling this arranges that @task will call tracehook_notify_resume()
 * before returning to user mode.  If it's already running in user mode,
 * it will enter the kernel and call tracehook_notify_resume() soon.
 * If it's blocked, it will not be woken.
 */
static inline void set_notify_resume(struct task_struct *task)
{
#ifdef TIF_NOTIFY_RESUME
        /* Flag was already set: no need to kick the task a second time. */
        if (test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
                return;

        kick_process(task);
#endif
}

/**
 * tracehook_notify_resume - report when about to return to user mode
 * @regs:                user-mode registers of @current task
 *
 * This is called when %TIF_NOTIFY_RESUME has been set.  Now we are
 * about to return to user mode, and the user state in @regs can be
 * inspected or adjusted.  This function clears %TIF_NOTIFY_RESUME
 * itself before running any work.  If the flag gets set again
 * asynchronously, this will be called again before we return to
 * user mode.
 *
 * Called without locks.
 */
static inline void tracehook_notify_resume(struct pt_regs *regs)
{
        /*
         * Clear the flag before looking at task_works; the barrier below
         * keeps the two in that order.  @regs is not used in this body.
         */
        clear_thread_flag(TIF_NOTIFY_RESUME);
        /*
         * This barrier pairs with task_work_add()->set_notify_resume() after
         * hlist_add_head(task->task_works);
         */
        smp_mb__after_atomic();
        if (unlikely(current->task_works))
                task_work_run();

#ifdef CONFIG_KEYS_REQUEST_CACHE
        /* Drop the cached key reference, if any, and forget the pointer. */
        if (unlikely(current->cached_requested_key)) {
                key_put(current->cached_requested_key);
                current->cached_requested_key = NULL;
        }
#endif

        /* Remaining hooks are called unconditionally on every invocation. */
        mem_cgroup_handle_over_high();
        blkcg_maybe_throttle_current();
}

/*
 * called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This
 * is currently used by TWA_SIGNAL based task_work, which requires breaking
 * wait loops to ensure that task_work is noticed and run.
 */
static inline void tracehook_notify_signal(void)
{
        /*
         * Clear the flag before sampling task_works; the barrier keeps the
         * clear ordered ahead of the read.
         */
        clear_thread_flag(TIF_NOTIFY_SIGNAL);
        smp_mb__after_atomic();
        /* Only call task_work_run() when the current task has work queued. */
        if (current->task_works)
                task_work_run();
}

/*
 * Called when we have work to process from exit_to_user_mode_loop()
 */
static inline void set_notify_signal(struct task_struct *task)
{
        /* Flag already set: nothing further to do. */
        if (test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL))
                return;

        /* A successful wakeup of an interruptible sleeper is sufficient. */
        if (wake_up_state(task, TASK_INTERRUPTIBLE))
                return;

        /* Otherwise kick the task directly. */
        kick_process(task);
}

#endif        /* <linux/tracehook.h> */

















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright (C) IBM Corporation, 2006
 * Copyright (C) Fujitsu, 2012
 *
 * Author: Paul McKenney <paulmck@linux.ibm.com>
 *           Lai Jiangshan <laijs@cn.fujitsu.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                Documentation/RCU/ *.txt
 *
 */

#ifndef _LINUX_SRCU_H
#define _LINUX_SRCU_H

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/rcu_segcblist.h>

struct srcu_struct;

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Lockdep-enabled initialisation entry point.  It takes the lockdep name
 * and lock class key explicitly; its definition is not in this header.
 */
int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
                       struct lock_class_key *key);

/*
 * init_srcu_struct - initialise @ssp with a per-call-site lockdep key
 *
 * Each expansion declares its own static lock_class_key and passes the
 * stringified argument expression as the name.  Being a statement
 * expression, the macro evaluates to the value returned by
 * __init_srcu_struct().
 */
#define init_srcu_struct(ssp) \
({ \
        static struct lock_class_key __srcu_key; \
        \
        __init_srcu_struct((ssp), #ssp, &__srcu_key); \
})

#define __SRCU_DEP_MAP_INIT(srcu_name)        .dep_map = { .name = #srcu_name },
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

int init_srcu_struct(struct srcu_struct *ssp);

#define __SRCU_DEP_MAP_INIT(srcu_name)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

#ifdef CONFIG_TINY_SRCU
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
#elif defined(CONFIG_SRCU)
#error "Unknown SRCU implementation specified to kernel configuration"
#else
/* Dummy definition for things like notifiers.  Actual use gets link error. */
struct srcu_struct { };
#endif

void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
                void (*func)(struct rcu_head *head));
void cleanup_srcu_struct(struct srcu_struct *ssp);
int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
void synchronize_srcu(struct srcu_struct *ssp);
unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/**
 * srcu_read_lock_held - might we be in SRCU read-side critical section?
 * @ssp: The srcu_struct structure to check
 *
 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU
 * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
 * this assumes we are in an SRCU read-side critical section unless it can
 * prove otherwise.
 *
 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
 * and while lockdep is disabled.
 *
 * Note that SRCU is based on its own state machine and does not rely on
 * normal RCU, so this can be called from a CPU that is in the idle loop
 * from an RCU point of view, or that is offline.
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        /*
         * While RCU lockdep checking is not enabled, report "held" so that
         * callers do not see false positives; otherwise ask lockdep.
         */
        return debug_lockdep_rcu_enabled() ? lock_is_held(&ssp->dep_map) : 1;
}

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

static inline int
srcu_read_lock_held(const struct srcu_struct *ssp)
{
        /* No lockdep state to consult in this configuration: assume held. */
        return 1;
}

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/**
 * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 * @c: condition to check for update-side use
 *
 * If PROVE_RCU is enabled, invoking this outside of an RCU read-side
 * critical section will result in an RCU-lockdep splat, unless @c evaluates
 * to 1.  The @c argument will normally be a logical expression containing
 * lockdep_is_held() calls.
 */
#define srcu_dereference_check(p, ssp, c) \
        __rcu_dereference_check((p), (c) || srcu_read_lock_held(ssp), __rcu)

/**
 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Makes rcu_dereference_check() do the dirty work.  If PROVE_RCU
 * is enabled, invoking this outside of an RCU read-side critical
 * section will result in an RCU-lockdep splat.
 */
#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0)

/**
 * srcu_dereference_notrace - no tracing and no lockdep calls from here
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct.  Because the check condition passed
 *        down is the constant 1, no lockdep check against @ssp is made here.
 */
#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1)

/**
 * srcu_read_lock - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section.  Note that SRCU read-side
 * critical sections may be nested.  However, it is illegal to
 * call anything that waits on an SRCU grace period for the same
 * srcu_struct, whether directly or indirectly.  Please note that
 * one way to indirectly wait on an SRCU grace period is to acquire
 * a mutex that is held elsewhere while calling synchronize_srcu() or
 * synchronize_srcu_expedited().
 *
 * Note that srcu_read_lock() and the matching srcu_read_unlock() must
 * occur in the same context, for example, it is illegal to invoke
 * srcu_read_unlock() in an irq handler if the matching srcu_read_lock()
 * was invoked in process context.
 */
static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
{
        /* Register the reader first, then tell lockdep about it. */
        const int idx = __srcu_read_lock(ssp);

        rcu_lock_acquire(&ssp->dep_map);

        /* The caller hands this value back to srcu_read_unlock(). */
        return idx;
}

/* Used by tracing, cannot be traced and cannot invoke lockdep. */
static inline notrace int
srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
{
        /* Same as srcu_read_lock() minus the lockdep acquire. */
        return __srcu_read_lock(ssp);
}

/**
 * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock().
 *
 * Exit an SRCU read-side critical section.
 */
static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn (once) about any @idx other than 0 or 1. */
        WARN_ON_ONCE(idx != 0 && idx != 1);

        /* Tell lockdep first, then unregister the reader. */
        rcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock(ssp, idx);
}

/* Used by tracing, cannot be traced and cannot call lockdep. */
static inline notrace void
srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
{
        /*
         * Forwards straight to __srcu_read_unlock(), without the lockdep
         * release or the @idx sanity check that srcu_read_unlock() performs.
         */
        __srcu_read_unlock(ssp, idx);
}

/**
 * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
 *
 * Converts the preceding srcu_read_unlock into a two-way memory barrier.
 *
 * Call this after srcu_read_unlock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_unlock will appear to happen after
 * the preceding srcu_read_unlock.
 */
static inline void smp_mb__after_srcu_read_unlock(void)
{
        /*
         * Intentionally empty: __srcu_read_unlock() already has smp_mb()
         * internally, so there is nothing left to do here.
         */
}

#endif










































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_GFP_H
#define __LINUX_GFP_H

#include <linux/mmdebug.h>
#include <linux/mmzone.h>
#include <linux/stddef.h>
#include <linux/linkage.h>
#include <linux/topology.h>

struct vm_area_struct;

/*
 * In case of changes, please don't forget to update
 * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
 */

/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA                0x01u
#define ___GFP_HIGHMEM                0x02u
#define ___GFP_DMA32                0x04u
#define ___GFP_MOVABLE                0x08u
#define ___GFP_RECLAIMABLE        0x10u
#define ___GFP_HIGH                0x20u
#define ___GFP_IO                0x40u
#define ___GFP_FS                0x80u
#define ___GFP_ZERO                0x100u
#define ___GFP_ATOMIC                0x200u
#define ___GFP_DIRECT_RECLAIM        0x400u
#define ___GFP_KSWAPD_RECLAIM        0x800u
#define ___GFP_WRITE                0x1000u
#define ___GFP_NOWARN                0x2000u
#define ___GFP_RETRY_MAYFAIL        0x4000u
#define ___GFP_NOFAIL                0x8000u
#define ___GFP_NORETRY                0x10000u
#define ___GFP_MEMALLOC                0x20000u
#define ___GFP_COMP                0x40000u
#define ___GFP_NOMEMALLOC        0x80000u
#define ___GFP_HARDWALL                0x100000u
#define ___GFP_THISNODE                0x200000u
#define ___GFP_ACCOUNT                0x400000u
#ifdef CONFIG_LOCKDEP
#define ___GFP_NOLOCKDEP        0x800000u
#else
#define ___GFP_NOLOCKDEP        0
#endif
/* If the above are modified, __GFP_BITS_SHIFT may need updating */

/*
 * Physical address zone modifiers (see linux/mmzone.h - low four bits)
 *
 * Do not put any conditional on these. If necessary modify the definitions
 * without the underscores and use them consistently. The definitions here may
 * be used in bit comparisons.
 */
#define __GFP_DMA        ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM        ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32        ((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE        ((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
#define GFP_ZONEMASK        (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)

/**
 * DOC: Page mobility and placement hints
 *
 * Page mobility and placement hints
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * These flags provide hints about how mobile the page is. Pages with similar
 * mobility are placed within the same pageblocks to minimise problems due
 * to external fragmentation.
 *
 * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be
 * moved by page migration during memory compaction or can be reclaimed.
 *
 * %__GFP_RECLAIMABLE is used for slab allocations that specify
 * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
 *
 * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible,
 * these pages will be spread between local zones to avoid all the dirty
 * pages being in one zone (fair zone allocation policy).
 *
 * %__GFP_HARDWALL enforces the cpuset memory allocation policy.
 *
 * %__GFP_THISNODE forces the allocation to be satisfied from the requested
 * node with no fallbacks or placement policy enforcements.
 *
 * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
 */
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
#define __GFP_WRITE        ((__force gfp_t)___GFP_WRITE)
#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
#define __GFP_THISNODE        ((__force gfp_t)___GFP_THISNODE)
#define __GFP_ACCOUNT        ((__force gfp_t)___GFP_ACCOUNT)

/**
 * DOC: Watermark modifiers
 *
 * Watermark modifiers -- controls access to emergency reserves
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * %__GFP_HIGH indicates that the caller is high-priority and that granting
 * the request is necessary before the system can make forward progress.
 * For example, creating an IO context to clean pages.
 *
 * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
 * high priority. Users are typically interrupt handlers. This may be
 * used in conjunction with %__GFP_HIGH
 *
 * %__GFP_MEMALLOC allows access to all memory. This should only be used when
 * the caller guarantees the allocation will allow more memory to be freed
 * very shortly e.g. process exiting or swapping. Users either should
 * be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
 * Users of this flag have to be extremely careful to not deplete the reserve
 * completely and implement a throttling mechanism which controls the
 * consumption of the reserve based on the amount of freed memory.
 * Usage of a pre-allocated pool (e.g. mempool) should be always considered
 * before using this flag.
 *
 * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
 * This takes precedence over the %__GFP_MEMALLOC flag if both are set.
 */
#define __GFP_ATOMIC        ((__force gfp_t)___GFP_ATOMIC)
#define __GFP_HIGH        ((__force gfp_t)___GFP_HIGH)
#define __GFP_MEMALLOC        ((__force gfp_t)___GFP_MEMALLOC)
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)

/**
 * DOC: Reclaim modifiers
 *
 * Reclaim modifiers
 * ~~~~~~~~~~~~~~~~~
 * Please note that all the following flags are only applicable to sleepable
 * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
 *
 * %__GFP_IO can start physical IO.
 *
 * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the
 * allocator recursing into the filesystem which might already be holding
 * locks.
 *
 * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
 * This flag can be cleared to avoid unnecessary delays when a fallback
 * option is available.
 *
 * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
 * the low watermark is reached and have it reclaim pages until the high
 * watermark is reached. A caller may wish to clear this flag when fallback
 * options are available and the reclaim is likely to disrupt the system. The
 * canonical example is THP allocation where a fallback is cheap but
 * reclaim/compaction may cause indirect stalls.
 *
 * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
 *
 * The default allocator behavior depends on the request size. We have a concept
 * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
 * !costly allocations are too essential to fail so they are implicitly
 * non-failing by default (with some exceptions like OOM victims might fail so
 * the caller still has to check for failures) while costly requests try to be
 * not disruptive and back off even without invoking the OOM killer.
 * The following three modifiers might be used to override some of these
 * implicit rules
 *
 * %__GFP_NORETRY: The VM implementation will try only very lightweight
 * memory direct reclaim to get some memory under memory pressure (thus
 * it can sleep). It will avoid disruptive actions like OOM killer. The
 * caller must handle the failure which is quite likely to happen under
 * heavy memory pressure. The flag is suitable when failure can easily be
 * handled at small cost, such as reduced throughput
 *
 * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
 * procedures that have previously failed if there is some indication
 * that progress has been made elsewhere.  It can wait for other
 * tasks to attempt high level approaches to freeing memory such as
 * compaction (which removes fragmentation) and page-out.
 * There is still a definite limit to the number of retries, but it is
 * a larger limit than with %__GFP_NORETRY.
 * Allocations with this flag may fail, but only when there is
 * genuinely little unused memory. While these allocations do not
 * directly trigger the OOM killer, their failure indicates that
 * the system is likely to need to use the OOM killer soon.  The
 * caller must handle failure, but can reasonably do so by failing
 * a higher-level request, or completing it only in a much less
 * efficient manner.
 * If the allocation does fail, and the caller is in a position to
 * free some non-essential memory, doing so could benefit the system
 * as a whole.
 *
 * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
 * cannot handle allocation failures. The allocation could block
 * indefinitely but will never return with failure. Testing for
 * failure is pointless.
 * New users should be evaluated carefully (and the flag should be
 * used only when there is no reasonable failure policy) but it is
 * definitely preferable to use the flag rather than opencode endless
 * loop around allocator.
 * Using this flag for costly allocations is _highly_ discouraged.
 */
#define __GFP_IO        ((__force gfp_t)___GFP_IO)
#define __GFP_FS        ((__force gfp_t)___GFP_FS)
#define __GFP_DIRECT_RECLAIM        ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
#define __GFP_KSWAPD_RECLAIM        ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
#define __GFP_RETRY_MAYFAIL        ((__force gfp_t)___GFP_RETRY_MAYFAIL)
#define __GFP_NOFAIL        ((__force gfp_t)___GFP_NOFAIL)
#define __GFP_NORETRY        ((__force gfp_t)___GFP_NORETRY)

/**
 * DOC: Action modifiers
 *
 * Action modifiers
 * ~~~~~~~~~~~~~~~~
 *
 * %__GFP_NOWARN suppresses allocation failure reports.
 *
 * %__GFP_COMP addresses compound page metadata.
 *
 * %__GFP_ZERO returns a zeroed page on success.
 */
#define __GFP_NOWARN        ((__force gfp_t)___GFP_NOWARN)
#define __GFP_COMP        ((__force gfp_t)___GFP_COMP)
#define __GFP_ZERO        ((__force gfp_t)___GFP_ZERO)

/* Disable lockdep for GFP context tracking */
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)

/* Room for N __GFP_FOO bits */
#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))

/**
 * DOC: Useful GFP flag combinations
 *
 * Useful GFP flag combinations
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Useful GFP flag combinations that are commonly used. It is recommended
 * that subsystems start with one of these combinations and then set/clear
 * %__GFP_FOO flags as necessary.
 *
 * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
 * watermark is applied to allow access to "atomic reserves".
 * The current implementation doesn't support NMI and few other strict
 * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
 *
 * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
 * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
 *
 * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
 * accounted to kmemcg.
 *
 * %GFP_NOWAIT is for kernel allocations that should not stall for direct
 * reclaim, start physical IO or use any filesystem callback.
 *
 * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
 * that do not require the starting of any physical IO.
 * Please try to avoid using this flag directly and instead use
 * memalloc_noio_{save,restore} to mark the whole scope which cannot
 * perform any IO with a short explanation why. All allocation requests
 * will inherit GFP_NOIO implicitly.
 *
 * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
 * Please try to avoid using this flag directly and instead use
 * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
 * recurse into the FS layer with a short explanation why. All allocation
 * requests will inherit GFP_NOFS implicitly.
 *
 * %GFP_USER is for userspace allocations that also need to be directly
 * accessible by the kernel or hardware. It is typically used by hardware
 * for buffers that are mapped to userspace (e.g. graphics) that hardware
 * still must DMA to. cpuset limits are enforced for these allocations.
 *
 * %GFP_DMA exists for historical reasons and should be avoided where possible.
 * The flag indicates that the caller requires that the lowest zone be
 * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
 * it would require careful auditing as some users really require it and
 * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the
 * lowest zone as a type of emergency reserve.
 *
 * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
 * address.
 *
 * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
 * do not need to be directly accessible by the kernel but that cannot
 * move once in use. An example may be a hardware allocation that maps
 * data directly into userspace but has no addressing limitations.
 *
 * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
 * need direct access to but can use kmap() when access is required. They
 * are expected to be movable via page reclaim or page migration. Typically,
 * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE.
 *
 * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They
 * are compound allocations that will generally fail quickly if memory is not
 * available and will not wake kswapd/kcompactd on failure. The _LIGHT
 * version does not attempt reclaim/compaction at all and is by default used
 * in page fault path, while the non-light is used by khugepaged.
 */
#define GFP_ATOMIC        (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL        (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
#define GFP_NOWAIT        (__GFP_KSWAPD_RECLAIM)
#define GFP_NOIO        (__GFP_RECLAIM)
#define GFP_NOFS        (__GFP_RECLAIM | __GFP_IO)
#define GFP_USER        (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_DMA                __GFP_DMA
#define GFP_DMA32        __GFP_DMA32
#define GFP_HIGHUSER        (GFP_USER | __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE        (GFP_HIGHUSER | __GFP_MOVABLE)
#define GFP_TRANSHUGE_LIGHT        ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
                         __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
#define GFP_TRANSHUGE        (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)

/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3

static inline int gfp_migratetype(const gfp_t gfp_flags)
{
        /* Only the reclaimable/movable bits take part in the mapping. */
        const gfp_t mobility = gfp_flags & GFP_MOVABLE_MASK;

        /* Both mobility bits set at once is flagged as a caller error. */
        VM_WARN_ON(mobility == GFP_MOVABLE_MASK);

        /* The shift below relies on the bit layout and enum values lining up. */
        BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
        BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);

        /* With mobility grouping disabled, everything is unmovable. */
        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;

        /* Group based on mobility: shift the bits down to the migrate type. */
        return mobility >> GFP_MOVABLE_SHIFT;
}
#undef GFP_MOVABLE_MASK
#undef GFP_MOVABLE_SHIFT

static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
        /* True exactly when the direct-reclaim bit is set in @gfp_flags. */
        return (gfp_flags & __GFP_DIRECT_RECLAIM) != 0;
}

/**
 * gfpflags_normal_context - is gfp_flags a normal sleepable context?
 * @gfp_flags: gfp_flags to test
 *
 * Test whether @gfp_flags indicates that the allocation is from the
 * %current context and allowed to sleep.
 *
 * An allocation being allowed to block doesn't mean it owns the %current
 * context.  When direct reclaim path tries to allocate memory, the
 * allocation context is nested inside whatever %current was doing at the
 * time of the original allocation.  The nested allocation may be allowed
 * to block but modifying anything %current owns can corrupt the outer
 * context's expectations.
 *
 * %true result from this function indicates that the allocation context
 * can sleep and use anything that's associated with %current.
 */
static inline bool gfpflags_normal_context(const gfp_t gfp_flags)
{
        /* Of these two bits, direct reclaim must be set and MEMALLOC clear. */
        const gfp_t relevant = __GFP_DIRECT_RECLAIM | __GFP_MEMALLOC;

        return (gfp_flags & relevant) == __GFP_DIRECT_RECLAIM;
}

/*
 * OPT_ZONE_<name> is ZONE_<name> when that zone is configured and falls
 * back to ZONE_NORMAL when it is not.
 */
#if defined(CONFIG_HIGHMEM)
# define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
# define OPT_ZONE_HIGHMEM ZONE_NORMAL
#endif /* CONFIG_HIGHMEM */

#if defined(CONFIG_ZONE_DMA)
# define OPT_ZONE_DMA ZONE_DMA
#else
# define OPT_ZONE_DMA ZONE_NORMAL
#endif /* CONFIG_ZONE_DMA */

#if defined(CONFIG_ZONE_DMA32)
# define OPT_ZONE_DMA32 ZONE_DMA32
#else
# define OPT_ZONE_DMA32 ZONE_NORMAL
#endif /* CONFIG_ZONE_DMA32 */

/*
 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
 * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
 * bits long and there are 16 of them to cover all possible combinations of
 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
 *
 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
 * But __GFP_MOVABLE is not only a zone specifier but also an allocation
 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
 * At most one of the lowest 3 bits (DMA, DMA32, HIGHMEM) may be set to "1".
 *
 *       bit       result
 *       =================
 *       0x0    => NORMAL
 *       0x1    => DMA or NORMAL
 *       0x2    => HIGHMEM or NORMAL
 *       0x3    => BAD (DMA+HIGHMEM)
 *       0x4    => DMA32 or NORMAL
 *       0x5    => BAD (DMA+DMA32)
 *       0x6    => BAD (HIGHMEM+DMA32)
 *       0x7    => BAD (HIGHMEM+DMA32+DMA)
 *       0x8    => NORMAL (MOVABLE+0)
 *       0x9    => DMA or NORMAL (MOVABLE+DMA)
 *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
 *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
 *       0xc    => DMA32 or NORMAL (MOVABLE+DMA32)
 *       0xd    => BAD (MOVABLE+DMA32+DMA)
 *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
 *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
 *
 * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
 */

/*
 * GFP_ZONES_SHIFT is the width in bits of one GFP_ZONE_TABLE entry.  It is
 * normally ZONES_SHIFT; when ZONE_DEVICE is configured and no more than
 * four other zones exist, two bits are used instead.
 */
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif

/* All 16 table entries have to fit into a single long. */
#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif

/*
 * One GFP_ZONES_SHIFT-bit entry for every permitted zone-selector
 * combination.  An entry sits at bit offset (selector bits *
 * GFP_ZONES_SHIFT) and holds the zone to use for that combination.
 */
#define GFP_ZONE_TABLE (                                                      \
        (ZONE_NORMAL << (0 * GFP_ZONES_SHIFT))                                \
        | (OPT_ZONE_DMA << (___GFP_DMA * GFP_ZONES_SHIFT))                    \
        | (OPT_ZONE_HIGHMEM << (___GFP_HIGHMEM * GFP_ZONES_SHIFT))            \
        | (OPT_ZONE_DMA32 << (___GFP_DMA32 * GFP_ZONES_SHIFT))                \
        | (ZONE_NORMAL << (___GFP_MOVABLE * GFP_ZONES_SHIFT))                 \
        | (OPT_ZONE_DMA                                                       \
                << ((___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT))         \
        | (ZONE_MOVABLE                                                       \
                << ((___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT))     \
        | (OPT_ZONE_DMA32                                                     \
                << ((___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT))       \
)

/*
 * GFP_ZONE_BAD is a bitmap over every combination of __GFP_DMA,
 * __GFP_DMA32, __GFP_HIGHMEM and __GFP_MOVABLE.  Bit N, counted from
 * bit 0, is set when the combination whose selector bits equal N is not
 * permitted.
 */
#define GFP_ZONE_BAD (                                                        \
        (1 << (___GFP_DMA | ___GFP_HIGHMEM))                                  \
        | (1 << (___GFP_DMA | ___GFP_DMA32))                                  \
        | (1 << (___GFP_DMA32 | ___GFP_HIGHMEM))                              \
        | (1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM))                 \
        | (1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA))               \
        | (1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA))                 \
        | (1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM))             \
        | (1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM))\
)

/*
 * Look up the zone for @flags: the GFP_ZONEMASK bits of @flags select one
 * GFP_ZONES_SHIFT-bit entry of GFP_ZONE_TABLE.  Combinations marked in
 * GFP_ZONE_BAD trip VM_BUG_ON().
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
        const int bit = (__force int) (flags & GFP_ZONEMASK);
        const int entry_mask = (1 << GFP_ZONES_SHIFT) - 1;
        enum zone_type z;

        z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & entry_mask;
        VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
        return z;
}

/*
 * There is only one page-allocator function, and two main namespaces to
 * it. The alloc_page*() variants return 'struct page *' and as such
 * can allocate highmem pages, the *get*page*() variants return
 * virtual kernel addresses to the allocated page(s).
 */

/*
 * Pick the zonelist index for @flags: ZONELIST_NOFALLBACK when NUMA is
 * configured and __GFP_THISNODE is set, ZONELIST_FALLBACK in every other
 * case.
 */
static inline int gfp_zonelist(gfp_t flags)
{
        int idx = ZONELIST_FALLBACK;

#ifdef CONFIG_NUMA
        if (unlikely(flags & __GFP_THISNODE))
                idx = ZONELIST_NOFALLBACK;
#endif
        return idx;
}

/*
 * The zonelist is chosen from the given node and the gfp mask.
 * A zonelist holds at most MAXNODES*MAX_NR_ZONES zones.
 * Every node carries two zonelists: one covering all zones with memory,
 * the other restricted to the zones of the node that owns the zonelist.
 *
 * In the normal case of non-DISCONTIGMEM systems NODE_DATA() is reduced to
 * &contig_page_data at compile time.
 */
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
        const int idx = gfp_zonelist(flags);

        return &NODE_DATA(nid)->node_zonelists[idx];
}

/*
 * Default hooks, compiled in only when the matching HAVE_ARCH_* macro is
 * not defined (presumably by architecture code that supplies its own
 * version).  The defaults do nothing.
 */
#ifndef HAVE_ARCH_FREE_PAGE
static inline void arch_free_page(struct page *page, int order) { }
#endif
#ifndef HAVE_ARCH_ALLOC_PAGE
static inline void arch_alloc_page(struct page *page, int order) { }
#endif
#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
/* Default: no work needed, always returns 0. */
static inline int arch_make_page_accessible(struct page *page)
{
        return 0;
}
#endif

/*
 * Allocation entry point taking an explicit nodemask.  Only declared in
 * this header; the definition lives elsewhere.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
                                                        nodemask_t *nodemask);

/*
 * Convenience wrapper: forwards to __alloc_pages_nodemask() with a NULL
 * nodemask.
 * NOTE(review): a NULL nodemask presumably means "no node restriction" -
 * confirm against the definition.
 */
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
        return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
}

/*
 * Allocate pages, preferring the node given as nid. The node must be valid and
 * online. For more general interface, see alloc_pages_node().
 *
 * Unlike alloc_pages_node(), NUMA_NO_NODE is not translated here: any nid
 * outside [0, MAX_NUMNODES) trips VM_BUG_ON().
 */
static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
        /* nid must be a valid node index. */
        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
        /* Warn when __GFP_THISNODE is requested for a node that is offline. */
        VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));

        return __alloc_pages(gfp_mask, order, nid);
}

/*
 * Allocate pages with a preference for node @nid.  Passing NUMA_NO_NODE
 * selects numa_mem_id() instead, i.e. the current CPU's closest node.
 * Any other @nid must name a valid, online node.
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                                unsigned int order)
{
        const int node = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

        return __alloc_pages_node(node, gfp_mask, order);
}

#ifdef CONFIG_NUMA
/* NUMA build: both allocators below are defined outside this header. */
struct page *alloc_pages_current(gfp_t gfp_mask, unsigned int order);
struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
                             struct vm_area_struct *vma, unsigned long addr,
                             int node, bool hugepage);

/* alloc_pages() is a direct call to alloc_pages_current(). */
static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order)
{
        return alloc_pages_current(gfp_mask, order);
}

/* Huge page flavour: current node, hugepage argument set to true. */
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
        alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else /* !CONFIG_NUMA */
/* Non-NUMA build: allocate through alloc_pages_node() on numa_node_id(). */
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
        const int nid = numa_node_id();

        return alloc_pages_node(nid, gfp_mask, order);
}

/*
 * The vma, addr, node and hugepage arguments are accepted for source
 * compatibility with the NUMA variants and then dropped without being
 * evaluated.
 */
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, hugepage) \
        alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
        alloc_pages(gfp_mask, order)
#endif /* CONFIG_NUMA */
/* Single-page (order 0) shorthand for alloc_pages(). */
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
/*
 * Single-page (order 0) shorthand for alloc_pages_vma(), using
 * numa_node_id() as the node and false as the final (hugepage) argument.
 */
#define alloc_page_vma(gfp_mask, vma, addr)                        \
        alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)

/*
 * The *get*page*() variants return an address as unsigned long rather than
 * a struct page pointer.  Declarations only; defined outside this header.
 */
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);

/*
 * Size-based (bytes, not order) allocation and release.
 * NOTE(review): presumably memory from alloc_pages_exact*() is returned
 * with free_pages_exact() using the same size - confirm at the definitions.
 */
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
void free_pages_exact(void *virt, size_t size);
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);

/* Order-0 shorthand for __get_free_pages(). */
#define __get_free_page(gfp_mask) \
                __get_free_pages((gfp_mask), 0)

/* __get_free_pages() with GFP_DMA OR-ed into the caller's mask. */
#define __get_dma_pages(gfp_mask, order) \
                __get_free_pages((gfp_mask) | GFP_DMA, (order))

/*
 * Release interfaces, declared here and defined elsewhere.  __free_pages()
 * takes a struct page pointer, free_pages() takes an address as unsigned
 * long.
 */
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
extern void free_unref_page(struct page *page);
extern void free_unref_page_list(struct list_head *list);

/*
 * Page fragment interface.  struct page_frag_cache is only forward-declared
 * because this header uses it by pointer alone.
 */
struct page_frag_cache;
extern void __page_frag_cache_drain(struct page *page, unsigned int count);
extern void *page_frag_alloc(struct page_frag_cache *nc,
                             unsigned int fragsz, gfp_t gfp_mask);
extern void page_frag_free(void *addr);

/* Order-0 shorthands for __free_pages() and free_pages(). */
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)

/*
 * Page allocator initialisation and drain entry points.  Declarations only;
 * the definitions are outside this header.
 * NOTE(review): the drain_*() helpers presumably return cached per-cpu pages
 * to the zone - confirm against the definitions.
 */
void page_alloc_init(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);

void page_alloc_init_late(void);

/*
 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
 * GFP flags are used before interrupts are enabled. Once interrupts are
 * enabled, it is set to __GFP_BITS_MASK while the system is running. During
 * hibernation, it is used by PM to avoid I/O during memory allocation while
 * devices are suspended.
 */
extern gfp_t gfp_allowed_mask;

/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);

/*
 * NOTE(review): presumably the PM hooks that narrow and later restore
 * gfp_allowed_mask around suspend/hibernation - confirm at the definitions.
 */
extern void pm_restrict_gfp_mask(void);
extern void pm_restore_gfp_mask(void);

/*
 * pm_suspended_storage() is a real function only with CONFIG_PM_SLEEP;
 * without it the inline stub below always reports false.
 */
#ifdef CONFIG_PM_SLEEP
extern bool pm_suspended_storage(void);
#else
static inline bool pm_suspended_storage(void)
{
        return false;
}
#endif /* CONFIG_PM_SLEEP */

/*
 * Decide whether @gfp_mask permits compaction.  GFP_NOIO is a really tricky
 * context here because migration might need to do IO, so compaction is
 * allowed only when CONFIG_COMPACTION is enabled and __GFP_IO is set.
 */
static inline bool gfp_compaction_allowed(gfp_t gfp_mask)
{
        if (!IS_ENABLED(CONFIG_COMPACTION))
                return false;

        return (gfp_mask & __GFP_IO) != 0;
}

/* Contiguous range allocation, declared only with CONFIG_CONTIG_ALLOC. */
#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
                              unsigned migratetype, gfp_t gfp_mask);
extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
                                       int nid, nodemask_t *nodemask);
#endif
/* Declared unconditionally, unlike the allocation side above. */
void free_contig_range(unsigned long pfn, unsigned int nr_pages);

#ifdef CONFIG_CMA
/* CMA stuff */
extern void init_cma_reserved_pageblock(struct page *page);
#endif

#endif /* __LINUX_GFP_H */












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Interfaces handler.
 *
 * Version:        @(#)dev.h        1.0.10        08/12/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Donald J. Becker, <becker@cesdis.gsfc.nasa.gov>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Bjorn Ekwall. <bj0rn@blox.se>
 *              Pekka Riikonen <priikone@poseidon.pspt.fi>
 *
 *                Moved to /usr/include/linux for NET3
 */
#ifndef _LINUX_NETDEVICE_H
#define _LINUX_NETDEVICE_H

#include <linux/timer.h>
#include <linux/bug.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
#include <asm/cache.h>
#include <asm/byteorder.h>

#include <linux/percpu.h>
#include <linux/rculist.h>
#include <linux/workqueue.h>
#include <linux/dynamic_queue_limits.h>

#include <linux/ethtool.h>
#include <net/net_namespace.h>
#ifdef CONFIG_DCB
#include <net/dcbnl.h>
#endif
#include <net/netprio_cgroup.h>
#include <net/xdp.h>

#include <linux/netdev_features.h>
#include <linux/neighbour.h>
#include <uapi/linux/netdevice.h>
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
#include <linux/hashtable.h>

struct netpoll_info;
struct device;
struct phy_device;
struct dsa_port;
struct ip_tunnel_parm;
struct macsec_context;
struct macsec_ops;

struct sfp_bus;
/* 802.11 specific */
struct wireless_dev;
/* 802.15.4 specific */
struct wpan_dev;
struct mpls_dev;
/* UDP Tunnel offloads */
struct udp_tunnel_info;
struct udp_tunnel_nic_info;
struct udp_tunnel_nic;
struct bpf_prog;
struct xdp_buff;

void synchronize_net(void);
void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops);

/* Backlog congestion levels */
#define NET_RX_SUCCESS                0        /* keep 'em coming, baby */
#define NET_RX_DROP                1        /* packet dropped */

#define MAX_NEST_DEV 8

/*
 * Transmit return codes: transmit return codes originate from three different
 * namespaces:
 *
 * - qdisc return codes
 * - driver transmit return codes
 * - errno values
 *
 * Drivers are allowed to return any one of those in their hard_start_xmit()
 * function. Real network devices commonly used with qdiscs should only return
 * the driver transmit return codes though - when qdiscs are used, the actual
 * transmission happens asynchronously, so the value is not propagated to
 * higher layers. Virtual network devices transmit synchronously; in this case
 * the driver transmit return codes are consumed by dev_queue_xmit(), and all
 * others are propagated to higher layers.
 */

/* qdisc ->enqueue() return codes. */
#define NET_XMIT_SUCCESS        0x00
#define NET_XMIT_DROP                0x01        /* skb dropped                        */
#define NET_XMIT_CN                0x02        /* congestion notification        */
#define NET_XMIT_MASK                0x0f        /* qdisc flags in net/sch_generic.h */

/* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It
 * indicates that the device will soon be dropping packets, or already drops
 * some packets of the same priority; prompting us to send less aggressively.
 *
 * net_xmit_eval(e):  evaluates to 0 when e is NET_XMIT_CN, otherwise to e
 *                    unchanged.
 * net_xmit_errno(e): evaluates to 0 when e is NET_XMIT_CN and to -ENOBUFS for
 *                    every other value (including NET_XMIT_SUCCESS), so it is
 *                    presumably meant to be applied to non-success codes only
 *                    -- confirm against callers.
 */
#define net_xmit_eval(e)        ((e) == NET_XMIT_CN ? 0 : (e))
#define net_xmit_errno(e)        ((e) != NET_XMIT_CN ? -ENOBUFS : 0)

/* Driver transmit return codes */
#define NETDEV_TX_MASK                0xf0

/*
 * Return codes of a driver's transmit routine. NETDEV_TX_BUSY (0x10) lies
 * inside NETDEV_TX_MASK (0xf0), i.e. above the NET_XMIT_MASK (0x0f) range
 * used by the qdisc codes. The INT_MIN member only exists to force a signed
 * underlying type.
 */
enum netdev_tx {
        __NETDEV_TX_MIN         = INT_MIN,        /* make sure enum is signed */
        NETDEV_TX_OK         = 0x00,        /* driver took care of packet */
        NETDEV_TX_BUSY         = 0x10,        /* driver tx path was busy*/
};
typedef enum netdev_tx netdev_tx_t;

/*
 * Current order: NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant;
 * hard_start_xmit() return < NET_XMIT_MASK means skb was consumed.
 */
static inline bool dev_xmit_complete(int rc)
{
        /*
         * Any rc below NET_XMIT_MASK means a driver has consumed the skb:
         * - rc == NETDEV_TX_OK: the packet was transmitted
         * - rc < 0: an error occurred while transmitting
         * - rc & NET_XMIT_MASK: an error occurred while queueing to a
         *   different device
         * This is expected to be the common outcome, hence the hint.
         */
        return likely(rc < NET_XMIT_MASK);
}

/*
 *        Compute the worst-case header length according to the protocols
 *        used.
 */

/*
 * LL_MAX_HEADER is chosen at compile time from the kernel configuration:
 *   CONFIG_HYPERV_NET defined                        -> 128
 *   CONFIG_WLAN defined or CONFIG_AX25 enabled       -> 128 if
 *       CONFIG_MAC80211_MESH is defined, 96 otherwise
 *   none of the above                                -> 32
 */
#if defined(CONFIG_HYPERV_NET)
# define LL_MAX_HEADER 128
#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25)
# if defined(CONFIG_MAC80211_MESH)
#  define LL_MAX_HEADER 128
# else
#  define LL_MAX_HEADER 96
# endif
#else
# define LL_MAX_HEADER 32
#endif

/*
 * MAX_HEADER equals LL_MAX_HEADER unless at least one of the IPIP, IPGRE,
 * SIT or IPv6 tunnel options is enabled, in which case 48 more bytes are
 * added (presumably room for the encapsulating header -- confirm).
 */
#if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \
    !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL)
#define MAX_HEADER LL_MAX_HEADER
#else
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif

/*
 *        Old network device statistics. Fields are native words
 *        (unsigned long) so they can be read and written atomically.
 */

/*
 * NET_DEV_STAT(FIELD) declares one counter as an anonymous union: the plain
 * 'unsigned long FIELD' and an 'atomic_long_t __FIELD' share the same storage,
 * so the same counter can be accessed either directly by name or through the
 * atomic_long_t view. The helper macro is undefined again right after the
 * structure.
 */
#define NET_DEV_STAT(FIELD)                        \
        union {                                        \
                unsigned long FIELD;                \
                atomic_long_t __##FIELD;        \
        }

struct net_device_stats {
        NET_DEV_STAT(rx_packets);
        NET_DEV_STAT(tx_packets);
        NET_DEV_STAT(rx_bytes);
        NET_DEV_STAT(tx_bytes);
        NET_DEV_STAT(rx_errors);
        NET_DEV_STAT(tx_errors);
        NET_DEV_STAT(rx_dropped);
        NET_DEV_STAT(tx_dropped);
        NET_DEV_STAT(multicast);
        NET_DEV_STAT(collisions);
        NET_DEV_STAT(rx_length_errors);
        NET_DEV_STAT(rx_over_errors);
        NET_DEV_STAT(rx_crc_errors);
        NET_DEV_STAT(rx_frame_errors);
        NET_DEV_STAT(rx_fifo_errors);
        NET_DEV_STAT(rx_missed_errors);
        NET_DEV_STAT(tx_aborted_errors);
        NET_DEV_STAT(tx_carrier_errors);
        NET_DEV_STAT(tx_fifo_errors);
        NET_DEV_STAT(tx_heartbeat_errors);
        NET_DEV_STAT(tx_window_errors);
        NET_DEV_STAT(rx_compressed);
        NET_DEV_STAT(tx_compressed);
};
#undef NET_DEV_STAT


#include <linux/cache.h>
#include <linux/skbuff.h>

#ifdef CONFIG_RPS
#include <linux/static_key.h>
extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;
#endif

struct neighbour;
struct neigh_parms;
struct sk_buff;

/*
 * One hardware address entry. Entries are chained through @list (see
 * struct netdev_hw_addr_list and netdev_hw_addr_list_for_each()).
 */
struct netdev_hw_addr {
        struct list_head        list;
        unsigned char                addr[MAX_ADDR_LEN];
        unsigned char                type;        /* one of NETDEV_HW_ADDR_T_* below */
#define NETDEV_HW_ADDR_T_LAN                1
#define NETDEV_HW_ADDR_T_SAN                2
#define NETDEV_HW_ADDR_T_UNICAST        3
#define NETDEV_HW_ADDR_T_MULTICAST        4
        bool                        global_use;
        int                        sync_cnt;
        int                        refcount;
        int                        synced;
        struct rcu_head                rcu_head;        /* presumably for RCU-deferred freeing -- confirm */
};

/* Head of a list of struct netdev_hw_addr entries plus an entry counter. */
struct netdev_hw_addr_list {
        struct list_head        list;
        int                        count;        /* read by netdev_hw_addr_list_count() */
};

/*
 * Generic helpers for a struct netdev_hw_addr_list pointer @l:
 * read the stored count, test it for zero, and iterate over the entries
 * with @ha as the struct netdev_hw_addr cursor.
 */
#define netdev_hw_addr_list_count(l) ((l)->count)
#define netdev_hw_addr_list_empty(l) (netdev_hw_addr_list_count(l) == 0)
#define netdev_hw_addr_list_for_each(ha, l) \
        list_for_each_entry(ha, &(l)->list, list)

/* The same helpers applied to the 'uc' list embedded in @dev. */
#define netdev_uc_count(dev) netdev_hw_addr_list_count(&(dev)->uc)
#define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc)
#define netdev_for_each_uc_addr(ha, dev) \
        netdev_hw_addr_list_for_each(ha, &(dev)->uc)

/* The same helpers applied to the 'mc' list embedded in @dev. */
#define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc)
#define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc)
#define netdev_for_each_mc_addr(ha, dev) \
        netdev_hw_addr_list_for_each(ha, &(dev)->mc)

/*
 * Cached hardware header: its length, a seqlock, and the header bytes kept
 * in an array of longs sized for LL_MAX_HEADER rounded up to HH_DATA_MOD.
 */
struct hh_cache {
        unsigned int        hh_len;
        seqlock_t        hh_lock;

        /* cached hardware header; allow for machine alignment needs.        */
#define HH_DATA_MOD        16
        /*
         * HH_DATA_OFF(len): padding needed to bring len up to the next
         * multiple of HH_DATA_MOD (0 when len is already a non-zero multiple).
         * The argument is parenthesized so that expression arguments expand
         * correctly, matching HH_DATA_ALIGN() below.
         */
#define HH_DATA_OFF(__len) \
        (HH_DATA_MOD - ((((__len) - 1) & (HH_DATA_MOD - 1)) + 1))
        /* HH_DATA_ALIGN(len): len rounded up to a multiple of HH_DATA_MOD. */
#define HH_DATA_ALIGN(__len) \
        (((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1))
        unsigned long        hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
};

/* Reserve HH_DATA_MOD byte-aligned hard_header_len, but at least that much.
 * Alternative is:
 *   dev->hard_header_len ? (dev->hard_header_len +
 *                           (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0
 *
 * We could use other alignment values, but we must maintain the
 * relationship HH alignment <= LL alignment.
 *
 * As written, both macros add hard_header_len and needed_headroom (plus
 * @extra for the _EXTRA variant), round that sum DOWN to a multiple of
 * HH_DATA_MOD and then add one full HH_DATA_MOD. The result is therefore
 * always a multiple of HH_DATA_MOD, strictly larger than the sum and never
 * smaller than HH_DATA_MOD. needed_headroom is read with READ_ONCE().
 */
#define LL_RESERVED_SPACE(dev) \
        ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \
          & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
#define LL_RESERVED_SPACE_EXTRA(dev,extra) \
        ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \
          & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)

/*
 * Table of callbacks operating on link-layer headers.
 * NOTE(review): the exact contract of each hook (return conventions, which
 * hooks may be NULL) is defined by the callers and implementers, which are
 * not visible here -- confirm before relying on it.
 */
struct header_ops {
        int        (*create) (struct sk_buff *skb, struct net_device *dev,
                           unsigned short type, const void *daddr,
                           const void *saddr, unsigned int len);
        int        (*parse)(const struct sk_buff *skb, unsigned char *haddr);
        /* takes a neighbour and a struct hh_cache (the cached hardware header) */
        int        (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type);
        void        (*cache_update)(struct hh_cache *hh,
                                const struct net_device *dev,
                                const unsigned char *haddr);
        bool        (*validate)(const char *ll_header, unsigned int len);
        __be16        (*parse_protocol)(const struct sk_buff *skb);
};

/* These flag bits are private to the generic network queueing
 * layer; they may not be explicitly referenced by any other
 * code.
 *
 * The enumerators are consecutive values starting at 0; per the sentence
 * above they are used as flag bit positions, not as masks.
 */

enum netdev_state_t {
        __LINK_STATE_START,
        __LINK_STATE_PRESENT,
        __LINK_STATE_NOCARRIER,
        __LINK_STATE_LINKWATCH_PENDING,
        __LINK_STATE_DORMANT,
        __LINK_STATE_TESTING,
};


/*
 * This structure holds boot-time configured netdevice settings. They
 * are then used in the device probing.
 *
 * @name: interface name, at most IFNAMSIZ bytes
 * @map:  the struct ifmap settings stored for that name
 */
struct netdev_boot_setup {
        char name[IFNAMSIZ];
        struct ifmap map;
};
#define NETDEV_BOOT_SETUP_MAX 8

int __init netdev_boot_setup(char *str);

/*
 * A list head paired with a counter; used as the element type of
 * napi_struct::gro_hash. Presumably @count tracks the number of entries on
 * @list -- confirm against the GRO code.
 */
struct gro_list {
        struct list_head        list;
        int                        count;
};

/*
 * Number of GRO hash buckets; must be less than the number of bits in
 * napi_struct::gro_bitmask
 */
#define GRO_HASH_BUCKETS        8

/*
 * Structure for NAPI scheduling similar to tasklet but with weighting
 */
struct napi_struct {
        /* The poll_list must only be managed by the entity which
         * changes the state of the NAPI_STATE_SCHED bit.  This means
         * whoever atomically sets that bit can add this napi_struct
         * to the per-CPU poll_list, and whoever clears that bit
         * can remove from the list right before clearing the bit.
         */
        struct list_head        poll_list;

        /* bit field indexed by the NAPI_STATE_* bit numbers */
        unsigned long                state;
        int                        weight;
        int                        defer_hard_irqs_count;
        /* presumably one bit per gro_hash bucket -- confirm */
        unsigned long                gro_bitmask;
        /* poll callback; takes this napi_struct and an int argument */
        int                        (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
        int                        poll_owner;
#endif
        struct net_device        *dev;
        struct gro_list                gro_hash[GRO_HASH_BUCKETS];
        struct sk_buff                *skb;
        struct list_head        rx_list; /* Pending GRO_NORMAL skbs */
        int                        rx_count; /* length of rx_list */
        struct hrtimer                timer;
        struct list_head        dev_list;
        struct hlist_node        napi_hash_node;
        unsigned int                napi_id;
};

/*
 * Bit numbers within napi_struct::state (used with test_bit()/clear_bit()).
 * The matching single-bit masks are the NAPIF_STATE_* constants.
 */
enum {
        NAPI_STATE_SCHED,        /* Poll is scheduled */
        NAPI_STATE_MISSED,        /* reschedule a napi */
        NAPI_STATE_DISABLE,        /* Disable pending */
        NAPI_STATE_NPSVC,        /* Netpoll - don't dequeue from poll_list */
        NAPI_STATE_LISTED,        /* NAPI added to system lists */
        NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
        NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
};

/*
 * Mask form of the NAPI_STATE_* bit numbers: each NAPIF_STATE_x is
 * BIT(NAPI_STATE_x), for code that tests or builds a whole state word.
 */
enum {
        NAPIF_STATE_SCHED         = BIT(NAPI_STATE_SCHED),
        NAPIF_STATE_MISSED         = BIT(NAPI_STATE_MISSED),
        NAPIF_STATE_DISABLE         = BIT(NAPI_STATE_DISABLE),
        NAPIF_STATE_NPSVC         = BIT(NAPI_STATE_NPSVC),
        NAPIF_STATE_LISTED         = BIT(NAPI_STATE_LISTED),
        NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
        NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
};

/*
 * Result codes of type gro_result_t.
 * NOTE(review): the precise meaning of each value is defined by the GRO
 * receive code, which is not visible here -- confirm before documenting
 * individual enumerators.
 */
enum gro_result {
        GRO_MERGED,
        GRO_MERGED_FREE,
        GRO_HELD,
        GRO_NORMAL,
        GRO_DROP,
        GRO_CONSUMED,
};
typedef enum gro_result gro_result_t;

/*
 * enum rx_handler_result - Possible return values for rx_handlers.
 * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it
 * further.
 * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in
 * case skb->dev was changed by rx_handler.
 * @RX_HANDLER_EXACT: Force exact delivery, no wildcard.
 * @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called.
 *
 * rx_handlers are functions called from inside __netif_receive_skb(), to do
 * special processing of the skb, prior to delivery to protocol handlers.
 *
 * Currently, a net_device can only have a single rx_handler registered. Trying
 * to register a second rx_handler will return -EBUSY.
 *
 * To register a rx_handler on a net_device, use netdev_rx_handler_register().
 * To unregister a rx_handler on a net_device, use
 * netdev_rx_handler_unregister().
 *
 * Upon return, rx_handler is expected to tell __netif_receive_skb() what to
 * do with the skb.
 *
 * If the rx_handler consumed the skb in some way, it should return
 * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for
 * the skb to be delivered in some other way.
 *
 * If the rx_handler changed skb->dev, to divert the skb to another
 * net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the
 * new device will be called if it exists.
 *
 * If the rx_handler decides the skb should be ignored, it should return
 * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that
 * are registered on exact device (ptype->dev == skb->dev).
 *
 * If the rx_handler didn't change skb->dev, but wants the skb to be normally
 * delivered, it should return RX_HANDLER_PASS.
 *
 * A device without a registered rx_handler will behave as if rx_handler
 * returned RX_HANDLER_PASS.
 */

enum rx_handler_result {
        RX_HANDLER_CONSUMED,        /* skb consumed by the handler; do not process further */
        RX_HANDLER_ANOTHER,        /* skb->dev changed; do another round in the receive path */
        RX_HANDLER_EXACT,        /* force exact delivery, no wildcard */
        RX_HANDLER_PASS,        /* do nothing; continue as if no handler was called */
};
typedef enum rx_handler_result rx_handler_result_t;
/* Signature of an rx_handler: receives a pointer to the skb pointer. */
typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);

void __napi_schedule(struct napi_struct *n);
void __napi_schedule_irqoff(struct napi_struct *n);

/* Report whether the NAPI_STATE_DISABLE bit is currently set in @n->state. */
static inline bool napi_disable_pending(struct napi_struct *n)
{
        const bool disable_requested = test_bit(NAPI_STATE_DISABLE, &n->state);

        return disable_requested;
}

bool napi_schedule_prep(struct napi_struct *n);

/**
 *        napi_schedule - schedule NAPI poll
 *        @n: NAPI context
 *
 * Ask for the NAPI poll routine to run, unless it is already running.
 * __napi_schedule() is only invoked when napi_schedule_prep() agrees.
 */
static inline void napi_schedule(struct napi_struct *n)
{
        if (!napi_schedule_prep(n))
                return;

        __napi_schedule(n);
}

/**
 *        napi_schedule_irqoff - schedule NAPI poll
 *        @n: NAPI context
 *
 * Same as napi_schedule(), for callers that already have hard irqs masked;
 * uses __napi_schedule_irqoff() once napi_schedule_prep() agrees.
 */
static inline void napi_schedule_irqoff(struct napi_struct *n)
{
        if (!napi_schedule_prep(n))
                return;

        __napi_schedule_irqoff(n);
}

/*
 * Try to reschedule poll. Called by dev->poll() after napi_complete().
 * Returns true when the poll was scheduled again, false when
 * napi_schedule_prep() declined.
 */
static inline bool napi_reschedule(struct napi_struct *napi)
{
        const bool scheduled = napi_schedule_prep(napi);

        if (scheduled)
                __napi_schedule(napi);

        return scheduled;
}

bool napi_complete_done(struct napi_struct *n, int work_done);
/**
 *        napi_complete - NAPI processing complete
 *        @n: NAPI context
 *
 * Mark NAPI processing as complete by calling napi_complete_done() with a
 * work count of zero; prefer calling napi_complete_done() directly.
 * Return false if device should avoid rearming interrupts.
 */
static inline bool napi_complete(struct napi_struct *n)
{
        const int no_work_reported = 0;

        return napi_complete_done(n, no_work_reported);
}

/**
 *        napi_disable - prevent NAPI from scheduling
 *        @n: NAPI context
 *
 * Stop NAPI from being scheduled on this context.
 * Waits till any outstanding processing completes.
 */
void napi_disable(struct napi_struct *n);

/**
 *        napi_enable - enable NAPI scheduling
 *        @n: NAPI context
 *
 * Resume NAPI from being scheduled on this context.
 * Must be paired with napi_disable.
 *
 * The context must have NAPI_STATE_SCHED set on entry; anything else
 * triggers BUG_ON().
 */
static inline void napi_enable(struct napi_struct *n)
{
        /* enabling a context whose SCHED bit is clear is treated as a bug */
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
        /* order earlier memory accesses before the atomic bit clears below */
        smp_mb__before_atomic();
        /* dropping SCHED first, then NPSVC; keep this order */
        clear_bit(NAPI_STATE_SCHED, &n->state);
        clear_bit(NAPI_STATE_NPSVC, &n->state);
}

/**
 *        napi_synchronize - wait until NAPI is not running
 *        @n: NAPI context
 *
 * Block until NAPI_STATE_SCHED is observed clear on this context, i.e.
 * until outstanding processing has finished. Future activations are not
 * prevented.
 */
static inline void napi_synchronize(const struct napi_struct *n)
{
        if (!IS_ENABLED(CONFIG_SMP)) {
                /* non-SMP build: only a barrier() is issued */
                barrier();
                return;
        }

        /* sleep in 1 ms steps for as long as the SCHED bit stays set */
        while (test_bit(NAPI_STATE_SCHED, &n->state))
                msleep(1);
}

/**
 *        napi_if_scheduled_mark_missed - if napi is running, set the
 *        NAPIF_STATE_MISSED
 *        @n: NAPI context
 *
 * Returns true without touching the state when NAPIF_STATE_DISABLE is set.
 * Returns false when NAPI is not scheduled. Otherwise atomically adds
 * NAPIF_STATE_MISSED to the state word and returns true.
 **/
static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
{
        for (;;) {
                unsigned long seen = READ_ONCE(n->state);

                if (seen & NAPIF_STATE_DISABLE)
                        return true;

                if (!(seen & NAPIF_STATE_SCHED))
                        return false;

                /* retry from a fresh read if the word changed under us */
                if (cmpxchg(&n->state, seen, seen | NAPIF_STATE_MISSED) == seen)
                        return true;
        }
}

/* Bit numbers within netdev_queue::state. */
enum netdev_queue_state_t {
        __QUEUE_STATE_DRV_XOFF,
        __QUEUE_STATE_STACK_XOFF,
        __QUEUE_STATE_FROZEN,
};

/* Single-bit masks derived from the bit numbers above. */
#define QUEUE_STATE_DRV_XOFF        (1 << __QUEUE_STATE_DRV_XOFF)
#define QUEUE_STATE_STACK_XOFF        (1 << __QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_FROZEN        (1 << __QUEUE_STATE_FROZEN)

/* Combined masks: either XOFF bit, either XOFF bit or FROZEN, DRV_XOFF or FROZEN. */
#define QUEUE_STATE_ANY_XOFF        (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \
                                        QUEUE_STATE_FROZEN)
#define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \
                                        QUEUE_STATE_FROZEN)

/*
 * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue.  The
 * netif_tx_* functions below are used to manipulate this flag.  The
 * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit
 * queue independently.  The netif_xmit_*stopped functions below are called
 * to check if the queue has been stopped by the driver or stack (either
 * of the XOFF bits are set in the state).  Drivers should not need to call
 * netif_xmit*stopped functions, they should only be using netif_tx_*.
 */

/*
 * One transmit queue of a net_device. The layout is split into a
 * read-mostly part and a write-mostly part; the latter starts on its own
 * cache line on SMP builds (see the alignment annotation on _xmit_lock).
 */
struct netdev_queue {
/*
 * read-mostly part
 */
        struct net_device        *dev;
        struct Qdisc __rcu        *qdisc;
        struct Qdisc                *qdisc_sleeping;
#ifdef CONFIG_SYSFS
        struct kobject                kobj;
#endif
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        /* accessed via netdev_queue_numa_node_read()/_write() */
        int                        numa_node;
#endif
        unsigned long                tx_maxrate;
        /*
         * Number of TX timeouts for this queue
         * (/sys/class/net/DEV/Q/trans_timeout)
         */
        unsigned long                trans_timeout;

        /* Subordinate device that the queue has been assigned to */
        struct net_device        *sb_dev;
#ifdef CONFIG_XDP_SOCKETS
        struct xsk_buff_pool    *pool;
#endif
/*
 * write-mostly part
 */
        spinlock_t                _xmit_lock ____cacheline_aligned_in_smp;
        /* presumably the CPU currently holding _xmit_lock -- confirm */
        int                        xmit_lock_owner;
        /*
         * Time (in jiffies) of last Tx
         */
        unsigned long                trans_start;

        /* bit field indexed by the __QUEUE_STATE_* bit numbers */
        unsigned long                state;

#ifdef CONFIG_BQL
        struct dql                dql;
#endif
} ____cacheline_aligned_in_smp;

extern int sysctl_fb_tunnels_only_for_init_net;
extern int sysctl_devconf_inherit_init_net;

/*
 * Decide whether @net gets fallback tunnels, based on
 * sysctl_fb_tunnels_only_for_init_net:
 *   0 : every netns
 *   1 : init_net only
 *   2 : none
 * Without CONFIG_SYSCTL the answer is always true.
 */
static inline bool net_has_fallback_tunnels(const struct net *net)
{
#if IS_ENABLED(CONFIG_SYSCTL)
        /* read the sysctl once so both tests see the same value */
        int mode = READ_ONCE(sysctl_fb_tunnels_only_for_init_net);

        if (!mode)
                return true;

        return net_eq(net, &init_net) && mode == 1;
#else
        return true;
#endif
}

/*
 * Return the current value of sysctl_devconf_inherit_init_net, or 0 when
 * the kernel is built without CONFIG_SYSCTL.
 */
static inline int net_inherit_devconf(void)
{
        int inherit = 0;

#if IS_ENABLED(CONFIG_SYSCTL)
        inherit = READ_ONCE(sysctl_devconf_inherit_init_net);
#endif
        return inherit;
}

/*
 * Return the NUMA node stored in @q, or NUMA_NO_NODE when the field does
 * not exist (it is only present with both CONFIG_XPS and CONFIG_NUMA).
 */
static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
{
        int node = NUMA_NO_NODE;

#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        node = q->numa_node;
#endif
        return node;
}

/*
 * Store @node as the NUMA node of @q. Does nothing unless both CONFIG_NUMA
 * and CONFIG_XPS are set, since the field only exists in that case.
 */
static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node)
{
#if defined(CONFIG_NUMA) && defined(CONFIG_XPS)
        q->numa_node = node;
#endif
}

#ifdef CONFIG_RPS
/*
 * This structure holds an RPS map which can be of variable length.  The
 * map is an array of CPUs.
 *
 * @len:  presumably the number of valid entries in @cpus -- confirm
 * @rcu:  RCU callback head
 * @cpus: flexible array of 16-bit CPU numbers
 *
 * RPS_MAP_SIZE(_num) is the allocation size in bytes for a map with _num
 * entries.
 */
struct rps_map {
        unsigned int len;
        struct rcu_head rcu;
        u16 cpus[];
};
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))

/*
 * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
 * tail pointer for that CPU's input queue at the time of last enqueue, and
 * a hardware filter index.
 *
 * RPS_NO_FILTER (0xffff) is the largest value @filter can hold; presumably
 * it marks "no hardware filter assigned" -- confirm.
 */
struct rps_dev_flow {
        u16 cpu;
        u16 filter;
        unsigned int last_qtail;
};
#define RPS_NO_FILTER 0xffff

/*
 * The rps_dev_flow_table structure contains a table of flow mappings.
 *
 * @mask:  presumably the index mask for @flows (table size - 1) -- confirm
 * @rcu:   RCU callback head
 * @flows: flexible array of struct rps_dev_flow entries
 *
 * RPS_DEV_FLOW_TABLE_SIZE(_num) is the allocation size in bytes for a table
 * with _num flow entries.
 */
struct rps_dev_flow_table {
        unsigned int mask;
        struct rcu_head rcu;
        struct rps_dev_flow flows[];
};
#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
    ((_num) * sizeof(struct rps_dev_flow)))

/*
 * The rps_sock_flow_table contains mappings of flows to the last CPU
 * on which they were processed by the application (set in recvmsg).
 * Each entry is a 32bit value. Upper part is the high-order bits
 * of flow hash, lower part is CPU number.
 * rps_cpu_mask is used to partition the space, depending on number of
 * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
 * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
 * meaning we use 32-6=26 bits for the hash.
 */
struct rps_sock_flow_table {
        /* ANDed with the flow hash to pick an index into ents[]
         * (see rps_record_sock_flow()).
         */
        u32        mask;

        /* One 32-bit entry per slot: flow-hash bits outside rps_cpu_mask,
         * ORed with a CPU number in the low bits. The array is
         * cacheline-aligned on SMP builds.
         */
        u32        ents[] ____cacheline_aligned_in_smp;
};
/* Bytes needed for a table with _num entries: the offset of ents[_num]. */
#define        RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))

/* NOTE(review): presumably a sentinel CPU value meaning "no CPU" - confirm */
#define RPS_NO_CPU 0xffff

/* Mask covering the CPU-number bits of an rps_sock_flow_table entry;
 * rps_record_sock_flow() keeps only the hash bits outside this mask.
 */
extern u32 rps_cpu_mask;
/* RCU-annotated pointer to the socket flow table; defined elsewhere. */
extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;

/*
 * Record the current CPU as the hint for flow @hash in @table.
 * Does nothing when @table is NULL or @hash is zero.
 */
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
                                        u32 hash)
{
        unsigned int slot;
        u32 entry;

        if (!table || !hash)
                return;

        slot = hash & table->mask;

        /* Hash bits outside rps_cpu_mask plus the current CPU number.
         * We only give a hint, preemption can change CPU under us.
         */
        entry = (hash & ~rps_cpu_mask) | raw_smp_processor_id();

        /* Skip the store when the slot already holds this value.
         * The WRITE_ONCE() below is paired with the READ_ONCE() here,
         * and another one in get_rps_cpu().
         */
        if (READ_ONCE(table->ents[slot]) == entry)
                return;

        WRITE_ONCE(table->ents[slot], entry);
}

#ifdef CONFIG_RFS_ACCEL
/*
 * Declaration only (CONFIG_RFS_ACCEL); implemented elsewhere.
 * @flow_id is the flow ID previously passed to ndo_rx_flow_steer().
 * NOTE(review): @filter_id is presumably the filter ID that callback
 * returned, and the result presumably says whether the flow may be
 * expired - confirm against the definition.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
                         u16 filter_id);
#endif
#endif /* CONFIG_RPS */

/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
#ifdef CONFIG_RPS
        /* RCU-annotated RPS state; present only with CONFIG_RPS */
        struct rps_map __rcu                *rps_map;
        struct rps_dev_flow_table __rcu        *rps_flow_table;
#endif
        /* Embedded kobject; rx_queue_attribute show/store take this queue */
        struct kobject                        kobj;
        /* NOTE(review): presumably the device owning this queue - confirm */
        struct net_device                *dev;
        /* XDP RX-queue info embedded in the queue */
        struct xdp_rxq_info                xdp_rxq;
#ifdef CONFIG_XDP_SOCKETS
        /* Buffer pool pointer; present only with CONFIG_XDP_SOCKETS */
        struct xsk_buff_pool            *pool;
#endif
} ____cacheline_aligned_in_smp;

/*
 * RX queue sysfs structures and functions.
 */
struct rx_queue_attribute {
        /* Generic attribute this RX-queue attribute wraps */
        struct attribute attr;
        /* Read callback: receives the queue and an output buffer @buf */
        ssize_t (*show)(struct netdev_rx_queue *queue, char *buf);
        /* Write callback: receives the queue and @len bytes of input in @buf */
        ssize_t (*store)(struct netdev_rx_queue *queue,
                         const char *buf, size_t len);
};

#ifdef CONFIG_XPS
/*
 * This structure holds an XPS map which can be of variable length.  The
 * map is an array of queues.
 */
struct xps_map {
        /* NOTE(review): presumably the number of queues[] entries in use - confirm */
        unsigned int len;
        /* NOTE(review): presumably the number of queues[] entries allocated - confirm */
        unsigned int alloc_len;
        /* RCU head; NOTE(review): presumably used to defer freeing - confirm */
        struct rcu_head rcu;
        /* Flexible array of queue numbers */
        u16 queues[];
};
/* Bytes needed for a struct xps_map whose queues[] holds _num entries. */
#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16)))
/*
 * Number of u16 slots left after sizeof(struct xps_map) once a map with one
 * queue entry is rounded up with L1_CACHE_ALIGN().
 */
#define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \
       - sizeof(struct xps_map)) / sizeof(u16))

/*
 * This structure holds all XPS maps for device.  Maps are indexed by CPU.
 */
struct xps_dev_maps {
        /* RCU head; NOTE(review): presumably used to defer freeing - confirm */
        struct rcu_head rcu;
        /* Flexible array of RCU-annotated map pointers; the size macros for
         * this struct allocate either nr_cpu_ids * tcs or rxqs * tcs slots.
         */
        struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
};

/* Bytes needed when attr_map[] holds nr_cpu_ids * _tcs map pointers. */
#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +        \
        (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))

/*
 * Bytes needed for a struct xps_dev_maps whose attr_map[] holds
 * _rxqs * _tcs map pointers. Both arguments are parenthesized so that
 * expression arguments (e.g. "a + b") expand with the intended precedence.
 */
#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
        ((_rxqs) * (_tcs) * sizeof(struct xps_map *)))

#endif /* CONFIG_XPS */

/* NOTE(review): presumably the maximum number of traffic-class queues - confirm */
#define TC_MAX_QUEUE        16
/* Equals TC_MAX_QUEUE - 1; NOTE(review): presumably a mask for a TC index - confirm */
#define TC_BITMASK        15
/* HW offloaded queuing disciplines txq count and offset maps */
struct netdev_tc_txq {
        /* Number of TX queues in this range */
        u16 count;
        /* Offset of the first TX queue in this range */
        u16 offset;
};

#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/*
 * This structure is to hold information about the device
 * configured to run FCoE protocol stack.
 */
struct netdev_fcoe_hbainfo {
        /* Fixed-size text fields filled in through ndo_fcoe_get_hbainfo().
         * NOTE(review): whether they must be NUL-terminated is not visible
         * here - confirm against the users.
         */
        char        manufacturer[64];
        char        serial_number[64];
        char        hardware_version[64];
        char        driver_version[64];
        char        optionrom_version[64];
        char        firmware_version[64];
        char        model[256];
        char        model_description[256];
};
#endif

/* Capacity, in bytes, of the id[] buffer in struct netdev_phys_item_id. */
#define MAX_PHYS_ITEM_ID_LEN 32

/* This structure holds a unique identifier to identify some
 * physical item (port for example) used by a netdevice.
 */
struct netdev_phys_item_id {
        /* Identifier bytes; only the first id_len bytes are compared */
        unsigned char id[MAX_PHYS_ITEM_ID_LEN];
        /* Number of meaningful bytes in id[] */
        unsigned char id_len;
};

/*
 * Compare two physical item identifiers.
 *
 * Returns true when both have the same id_len and their first id_len bytes
 * are equal; bytes of id[] beyond id_len are ignored. Neither argument is
 * modified, so both are taken as pointers to const.
 */
static inline bool netdev_phys_item_id_same(const struct netdev_phys_item_id *a,
                                            const struct netdev_phys_item_id *b)
{
        return a->id_len == b->id_len &&
               memcmp(a->id, b->id, a->id_len) == 0;
}

/*
 * Function-pointer type taking (dev, skb, sb_dev) and returning a u16 -
 * the same signature as the ndo_select_queue() device operation.
 */
typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct net_device *sb_dev);

/*
 * Kind of 'tc' object being set up; passed as the @type argument of
 * ndo_setup_tc() together with an opaque type_data pointer.
 * Enumerators take implicit sequential values starting at 0, so their
 * order determines their numeric value.
 */
enum tc_setup_type {
        TC_SETUP_QDISC_MQPRIO,
        TC_SETUP_CLSU32,
        TC_SETUP_CLSFLOWER,
        TC_SETUP_CLSMATCHALL,
        TC_SETUP_CLSBPF,
        TC_SETUP_BLOCK,
        TC_SETUP_QDISC_CBS,
        TC_SETUP_QDISC_RED,
        TC_SETUP_QDISC_PRIO,
        TC_SETUP_QDISC_MQ,
        TC_SETUP_QDISC_ETF,
        TC_SETUP_ROOT_QDISC,
        TC_SETUP_QDISC_GRED,
        TC_SETUP_QDISC_TAPRIO,
        TC_SETUP_FT,
        TC_SETUP_QDISC_ETS,
        TC_SETUP_QDISC_TBF,
        TC_SETUP_QDISC_FIFO,
};

/* These structures hold the attributes of bpf state that are being passed
 * to the netdevice through the bpf op.
 */
/* Command selector stored in struct netdev_bpf.command. */
enum bpf_netdev_command {
        /* Set or clear a bpf program used in the earliest stages of packet
         * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee
         * is responsible for calling bpf_prog_put on any old progs that are
         * stored. In case of error, the callee need not release the new prog
         * reference, but on success it takes ownership and must bpf_prog_put
         * when it is no longer used.
         */
        XDP_SETUP_PROG,
        XDP_SETUP_PROG_HW,
        /* BPF program for offload callbacks, invoked at program load time. */
        /* The two map commands use the offmap member of struct netdev_bpf. */
        BPF_OFFLOAD_MAP_ALLOC,
        BPF_OFFLOAD_MAP_FREE,
        /* Uses the xsk member (pool, queue_id) of struct netdev_bpf. */
        XDP_SETUP_XSK_POOL,
};

struct bpf_prog_offload_ops;
struct netlink_ext_ack;
struct xdp_umem;
struct xdp_dev_bulk_queue;
struct bpf_xdp_link;

/*
 * XDP attachment modes, with explicit values 0..2.
 * __MAX_XDP_MODE follows the last mode and therefore equals the number
 * of modes (3).
 */
enum bpf_xdp_mode {
        XDP_MODE_SKB = 0,
        XDP_MODE_DRV = 1,
        XDP_MODE_HW = 2,
        __MAX_XDP_MODE
};

/*
 * Pairs a BPF program pointer with a bpf_xdp_link pointer.
 * NOTE(review): presumably one instance is kept per enum bpf_xdp_mode -
 * confirm against the users.
 */
struct bpf_xdp_entity {
        struct bpf_prog *prog;
        struct bpf_xdp_link *link;
};

/*
 * Argument block handed to the ndo_bpf() device operation.
 * @command selects which member of the anonymous union is meaningful,
 * as marked by the per-member comments below.
 */
struct netdev_bpf {
        enum bpf_netdev_command command;
        union {
                /* XDP_SETUP_PROG */
                struct {
                        u32 flags;
                        struct bpf_prog *prog;
                        struct netlink_ext_ack *extack;
                };
                /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
                struct {
                        struct bpf_offloaded_map *offmap;
                };
                /* XDP_SETUP_XSK_POOL */
                struct {
                        struct xsk_buff_pool *pool;
                        u16 queue_id;
                } xsk;
        };
};

/* Flags for ndo_xsk_wakeup: request wakeup of RX processing, TX processing,
 * or both (the two bits may be ORed together).
 */
#define XDP_WAKEUP_RX (1 << 0)
#define XDP_WAKEUP_TX (1 << 1)

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Callback table compiled in only with CONFIG_XFRM_OFFLOAD. Every hook
 * receives the xfrm_state it applies to; xdo_dev_offload_ok() additionally
 * receives the skb in question and returns a bool.
 * NOTE(review): the exact contract of each hook is not visible here -
 * confirm against the XFRM offload code.
 */
struct xfrmdev_ops {
        int        (*xdo_dev_state_add) (struct xfrm_state *x);
        void        (*xdo_dev_state_delete) (struct xfrm_state *x);
        void        (*xdo_dev_state_free) (struct xfrm_state *x);
        bool        (*xdo_dev_offload_ok) (struct sk_buff *skb,
                                       struct xfrm_state *x);
        void        (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
};
#endif

/*
 * Interface alias storage: an RCU head followed by a flexible char array.
 * NOTE(review): presumably freed through @rcuhead after a grace period -
 * confirm against the users.
 */
struct dev_ifalias {
        struct rcu_head rcuhead;
        /* Alias text; storage is sized by the allocator */
        char ifalias[];
};

struct devlink;
struct tlsdev_ops;

/*
 * Node tying a name string (@name) to a net_device (@dev).
 * NOTE(review): @hlist and @list are presumably linkage into a name hash
 * table and a per-device list of names - confirm against the users.
 */
struct netdev_name_node {
        struct hlist_node hlist;
        struct list_head list;
        struct net_device *dev;
        const char *name;
};

/*
 * Declarations only; implemented elsewhere.
 * NOTE(review): presumably these add and remove an alternative name @name
 * for @dev and return 0 or a negative errno - confirm against the
 * definitions.
 */
int netdev_name_node_alt_create(struct net_device *dev, const char *name);
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);

/*
 * List node carrying a pointer to a notifier_block.
 * NOTE(review): presumably tracks notifiers registered per network
 * namespace - confirm against the users.
 */
struct netdev_net_notifier {
        struct list_head list;
        struct notifier_block *nb;
};

/*
 * This structure defines the management hooks for network devices.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * int (*ndo_init)(struct net_device *dev);
 *     This function is called once when a network device is registered.
 *     The network device can use this for any late stage initialization
 *     or semantic validation. It can fail with an error code which will
 *     be propagated back to register_netdev.
 *
 * void (*ndo_uninit)(struct net_device *dev);
 *     This function is called when device is unregistered or when registration
 *     fails. It is not called if init fails.
 *
 * int (*ndo_open)(struct net_device *dev);
 *     This function is called when a network device transitions to the up
 *     state.
 *
 * int (*ndo_stop)(struct net_device *dev);
 *     This function is called when a network device transitions to the down
 *     state.
 *
 * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
 *                               struct net_device *dev);
 *        Called when a packet needs to be transmitted.
 *        Returns NETDEV_TX_OK.  Can return NETDEV_TX_BUSY, but you should stop
 *        the queue before that can happen; it's for obsolete devices and weird
 *        corner cases, but the stack really does a non-trivial amount
 *        of useless work if you return NETDEV_TX_BUSY.
 *        Required; cannot be NULL.
 *
 * netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
 *                                           struct net_device *dev,
 *                                           netdev_features_t features);
 *        Called by core transmit path to determine if device is capable of
 *        performing offload operations on a given packet. This is to give
 *        the device an opportunity to implement any restrictions that cannot
 *        be otherwise expressed by feature flags. The check is called with
 *        the set of features that the stack has calculated and it returns
 *        those the driver believes to be appropriate.
 *
 * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
 *                         struct net_device *sb_dev);
 *        Called to decide which queue to use when device supports multiple
 *        transmit queues.
 *
 * void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
 *        This function is called to allow device receiver to make
 *        changes to configuration when multicast or promiscuous is enabled.
 *
 * void (*ndo_set_rx_mode)(struct net_device *dev);
 *        This function is called when the device changes address list filtering.
 *        If driver handles unicast address filtering, it should set
 *        IFF_UNICAST_FLT in its priv_flags.
 *
 * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
 *        This function is called when the Media Access Control address
 *        needs to be changed. If this interface is not defined, the
 *        MAC address can not be changed.
 *
 * int (*ndo_validate_addr)(struct net_device *dev);
 *        Test if Media Access Control address is valid for the device.
 *
 * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Called when a user requests an ioctl which can't be handled by
 *        the generic interface code. If not defined ioctls return
 *        not supported error code.
 *
 * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
 *        Used to set network devices bus interface parameters. This interface
 *        is retained for legacy reasons; new devices should use the bus
 *        interface (PCI) for low level management.
 *
 * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
 *        Called when a user wants to change the Maximum Transfer Unit
 *        of a device.
 *
 * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue);
 *        Callback used when the transmitter has not made any progress
 *        for dev->watchdog ticks.
 *
 * void (*ndo_get_stats64)(struct net_device *dev,
 *                         struct rtnl_link_stats64 *storage);
 * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 *        Called when a user wants to get the network device usage
 *        statistics. Drivers must do one of the following:
 *        1. Define @ndo_get_stats64 to fill in a zero-initialised
 *           rtnl_link_stats64 structure passed by the caller.
 *        2. Define @ndo_get_stats to update a net_device_stats structure
 *           (which should normally be dev->stats) and return a pointer to
 *           it. The structure may be changed asynchronously only if each
 *           field is written atomically.
 *        3. Update dev->stats asynchronously and atomically, and define
 *           neither operation.
 *
 * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
 *        Return true if this device supports offload stats of this attr_id.
 *
 * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
 *        void *attr_data)
 *        Get statistics for offload operations by attr_id. Write it into the
 *        attr_data pointer.
 *
 * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is registered.
 *
 * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is unregistered.
 *
 * void (*ndo_poll_controller)(struct net_device *dev);
 *
 *        SR-IOV management functions.
 * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
 * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
 *                          u8 qos, __be16 proto);
 * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
 *                          int max_tx_rate);
 * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_config)(struct net_device *dev,
 *                            int vf, struct ifla_vf_info *ivf);
 * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state);
 * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
 *                          struct nlattr *port[]);
 *
 *      Enable or disable the VF ability to query its RSS Redirection Table and
 *      Hash Key. This is needed since on some devices VF share this information
 *      with PF and querying it may introduce a theoretical security risk.
 * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
 * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
 *                       void *type_data);
 *        Called to setup any 'tc' scheduler, classifier or action on @dev.
 *        This is always called from the stack with the rtnl lock held and netif
 *        tx queues stopped. This allows the netdevice to perform queue
 *        management safely.
 *
 *        Fiber Channel over Ethernet (FCoE) offload functions.
 * int (*ndo_fcoe_enable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to start using LLD for FCoE
 *        so the underlying device can perform whatever needed configuration or
 *        initialization to support acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_disable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to stop using LLD for FCoE
 *        so the underlying device can perform whatever needed clean-ups to
 *        stop supporting acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid,
 *                             struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Initiator wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_ddp_done)(struct net_device *dev,  u16 xid);
 *        Called when the FCoE Initiator/Target is done with the DDPed I/O as
 *        indicated by the FC exchange id 'xid', so the underlying device can
 *        clean up and reuse resources for later DDP requests.
 *
 * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid,
 *                              struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Target wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
 *                               struct netdev_fcoe_hbainfo *hbainfo);
 *        Called when the FCoE Protocol stack wants information on the underlying
 *        device. This information is utilized by the FCoE protocol stack to
 *        register attributes with Fiber Channel management service as per the
 *        FC-GS Fabric Device Management Information(FDMI) specification.
 *
 * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type);
 *        Called when the underlying device wants to override default World Wide
 *        Name (WWN) generation mechanism in FCoE protocol stack to pass its own
 *        World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE
 *        protocol stack to use.
 *
 *        RFS acceleration.
 * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
 *                            u16 rxq_index, u32 flow_id);
 *        Set hardware filter for RFS.  rxq_index is the target queue index;
 *        flow_id is a flow ID to be passed to rps_may_expire_flow() later.
 *        Return the filter ID on success, or a negative error code.
 *
 *        Slave management functions (for bridge, bonding, etc).
 * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev,
 *                        struct netlink_ext_ack *extack);
 *        Called to make another netdev an underling.
 *
 * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
 *        Called to release previously enslaved netdev.
 *
 * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
 *                                            struct sk_buff *skb,
 *                                            bool all_slaves);
 *        Get the xmit slave of master device. If all_slaves is true, the
 *        function assumes all the slaves can transmit.
 *
 *      Feature/offload setting functions.
 * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
 *                netdev_features_t features);
 *        Adjusts the requested feature flags according to device-specific
 *        constraints, and returns the resulting flags. Must not modify
 *        the device state.
 *
 * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features);
 *        Called to update device configuration to new features. Passed
 *        feature set might be less than what was returned by ndo_fix_features().
 *        Must return >0 or -errno if it changed dev->features itself.
 *
 * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid, u16 flags,
 *                      struct netlink_ext_ack *extack);
 *        Adds an FDB entry to dev for addr.
 * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid)
 *        Deletes the FDB entry from dev corresponding to addr.
 * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
 *                       struct net_device *dev, struct net_device *filter_dev,
 *                       int *idx)
 *        Used to add FDB entries to dump requests. Implementers should add
 *        entries to skb and update idx with the number of entries.
 *
 * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags, struct netlink_ext_ack *extack)
 * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
 *                             struct net_device *dev, u32 filter_mask,
 *                             int nlflags)
 * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags);
 *
 * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
 *        Called to change device carrier. Soft-devices (like dummy, team, etc)
 *        which do not represent real hardware may define this to allow their
 *        userspace components to manage their virtual carrier state. Devices
 *        that determine carrier state from physical hardware properties (eg
 *        network cables) or protocol-dependent mechanisms (eg
 *        USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
 *
 * int (*ndo_get_phys_port_id)(struct net_device *dev,
 *                               struct netdev_phys_item_id *ppid);
 *        Called to get ID of physical port of this device. If driver does
 *        not implement this, it is assumed that the hw is not able to have
 *        multiple net devices on single physical port.
 *
 * int (*ndo_get_port_parent_id)(struct net_device *dev,
 *                                 struct netdev_phys_item_id *ppid)
 *        Called to get the parent ID of the physical port of this device.
 *
 * void (*ndo_udp_tunnel_add)(struct net_device *dev,
 *                              struct udp_tunnel_info *ti);
 *        Called by UDP tunnel to notify a driver about the UDP port and socket
 *        address family that a UDP tunnel is listening to. It is called only
 *        when a new port starts listening. The operation is protected by the
 *        RTNL.
 *
 * void (*ndo_udp_tunnel_del)(struct net_device *dev,
 *                              struct udp_tunnel_info *ti);
 *        Called by UDP tunnel to notify the driver about a UDP port and socket
 *        address family that the UDP tunnel is not listening to anymore. The
 *        operation is protected by the RTNL.
 *
 * void* (*ndo_dfwd_add_station)(struct net_device *pdev,
 *                                 struct net_device *dev)
 *        Called by upper layer devices to accelerate switching or other
 *        station functionality into hardware. 'pdev' is the lowerdev
 *        to use for the offload and 'dev' is the net device that will
 *        back the offload. Returns a pointer to the private structure
 *        the upper layer will maintain.
 * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
 *        Called by upper layer device to delete the station created
 *        by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
 *        the station and priv is the structure returned by the add
 *        operation.
 * int (*ndo_set_tx_maxrate)(struct net_device *dev,
 *                             int queue_index, u32 maxrate);
 *        Called when a user wants to set a max-rate limitation of specific
 *        TX queue.
 * int (*ndo_get_iflink)(const struct net_device *dev);
 *        Called to get the iflink value of this device.
 * void (*ndo_change_proto_down)(struct net_device *dev,
 *                                 bool proto_down);
 *        This function is used to pass protocol port error state information
 *        to the switch driver. The switch driver can react to the proto_down
 *      by doing a phys down on the associated switch port.
 * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
 *        This function is used to get egress tunnel information for given skb.
 *        This is useful for retrieving outer tunnel header parameters while
 *        sampling packet.
 * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
 *        This function is used to specify the headroom that the skb must
 *        consider when allocating an skb during packet reception. Setting
 *        appropriate rx headroom value allows avoiding skb head copy on
 *        forward. Setting a negative value resets the rx headroom to the
 *        default value.
 * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
 *        This function is used to set or query state related to XDP on the
 *        netdevice and manage BPF offload. See definition of
 *        enum bpf_netdev_command for details.
 * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
 *                        u32 flags);
 *        This function is used to submit @n XDP packets for transmit on a
 *        netdevice. Returns number of frames successfully transmitted, frames
 *        that got dropped are freed/returned via xdp_return_frame().
 *        A negative return value indicates a general error invoking the ndo,
 *        meaning no frames were xmit'ed and the core caller will free all frames.
 * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
 *      This function is used to wake up the softirq, ksoftirqd or kthread
 *        responsible for sending and/or receiving packets on a specific
 *        queue id bound to an AF_XDP socket. The flags field specifies if
 *        only RX, only Tx, or both should be woken up using the flags
 *        XDP_WAKEUP_RX and XDP_WAKEUP_TX.
 * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev);
 *        Get devlink port instance associated with a given netdev.
 *        Called with a reference on the netdevice and devlink locks only,
 *        rtnl_lock is not held.
 * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
 *                         int cmd);
 *        Add, change, delete or get information on an IPv4 tunnel.
 * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
 *        If a device is paired with a peer device, return the peer instance.
 *        The caller must be under RCU read context.
 */
struct net_device_ops {
        int                        (*ndo_init)(struct net_device *dev);
        void                        (*ndo_uninit)(struct net_device *dev);
        int                        (*ndo_open)(struct net_device *dev);
        int                        (*ndo_stop)(struct net_device *dev);
        netdev_tx_t                (*ndo_start_xmit)(struct sk_buff *skb,
                                                  struct net_device *dev);
        netdev_features_t        (*ndo_features_check)(struct sk_buff *skb,
                                                      struct net_device *dev,
                                                      netdev_features_t features);
        u16                        (*ndo_select_queue)(struct net_device *dev,
                                                    struct sk_buff *skb,
                                                    struct net_device *sb_dev);
        void                        (*ndo_change_rx_flags)(struct net_device *dev,
                                                       int flags);
        void                        (*ndo_set_rx_mode)(struct net_device *dev);
        int                        (*ndo_set_mac_address)(struct net_device *dev,
                                                       void *addr);
        int                        (*ndo_validate_addr)(struct net_device *dev);
        int                        (*ndo_do_ioctl)(struct net_device *dev,
                                                struct ifreq *ifr, int cmd);
        int                        (*ndo_set_config)(struct net_device *dev,
                                                  struct ifmap *map);
        int                        (*ndo_change_mtu)(struct net_device *dev,
                                                  int new_mtu);
        int                        (*ndo_neigh_setup)(struct net_device *dev,
                                                   struct neigh_parms *);
        void                        (*ndo_tx_timeout) (struct net_device *dev,
                                                   unsigned int txqueue);

        void                        (*ndo_get_stats64)(struct net_device *dev,
                                                   struct rtnl_link_stats64 *storage);
        bool                        (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id);
        int                        (*ndo_get_offload_stats)(int attr_id,
                                                         const struct net_device *dev,
                                                         void *attr_data);
        struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);

        int                        (*ndo_vlan_rx_add_vid)(struct net_device *dev,
                                                       __be16 proto, u16 vid);
        int                        (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
                                                        __be16 proto, u16 vid);
#ifdef CONFIG_NET_POLL_CONTROLLER
        void                    (*ndo_poll_controller)(struct net_device *dev);
        int                        (*ndo_netpoll_setup)(struct net_device *dev,
                                                     struct netpoll_info *info);
        void                        (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
        int                        (*ndo_set_vf_mac)(struct net_device *dev,
                                                  int queue, u8 *mac);
        int                        (*ndo_set_vf_vlan)(struct net_device *dev,
                                                   int queue, u16 vlan,
                                                   u8 qos, __be16 proto);
        int                        (*ndo_set_vf_rate)(struct net_device *dev,
                                                   int vf, int min_tx_rate,
                                                   int max_tx_rate);
        int                        (*ndo_set_vf_spoofchk)(struct net_device *dev,
                                                       int vf, bool setting);
        int                        (*ndo_set_vf_trust)(struct net_device *dev,
                                                    int vf, bool setting);
        int                        (*ndo_get_vf_config)(struct net_device *dev,
                                                     int vf,
                                                     struct ifla_vf_info *ivf);
        int                        (*ndo_set_vf_link_state)(struct net_device *dev,
                                                         int vf, int link_state);
        int                        (*ndo_get_vf_stats)(struct net_device *dev,
                                                    int vf,
                                                    struct ifla_vf_stats
                                                    *vf_stats);
        int                        (*ndo_set_vf_port)(struct net_device *dev,
                                                   int vf,
                                                   struct nlattr *port[]);
        int                        (*ndo_get_vf_port)(struct net_device *dev,
                                                   int vf, struct sk_buff *skb);
        int                        (*ndo_get_vf_guid)(struct net_device *dev,
                                                   int vf,
                                                   struct ifla_vf_guid *node_guid,
                                                   struct ifla_vf_guid *port_guid);
        int                        (*ndo_set_vf_guid)(struct net_device *dev,
                                                   int vf, u64 guid,
                                                   int guid_type);
        int                        (*ndo_set_vf_rss_query_en)(
                                                   struct net_device *dev,
                                                   int vf, bool setting);
        int                        (*ndo_setup_tc)(struct net_device *dev,
                                                enum tc_setup_type type,
                                                void *type_data);
#if IS_ENABLED(CONFIG_FCOE)
        int                        (*ndo_fcoe_enable)(struct net_device *dev);
        int                        (*ndo_fcoe_disable)(struct net_device *dev);
        int                        (*ndo_fcoe_ddp_setup)(struct net_device *dev,
                                                      u16 xid,
                                                      struct scatterlist *sgl,
                                                      unsigned int sgc);
        int                        (*ndo_fcoe_ddp_done)(struct net_device *dev,
                                                     u16 xid);
        int                        (*ndo_fcoe_ddp_target)(struct net_device *dev,
                                                       u16 xid,
                                                       struct scatterlist *sgl,
                                                       unsigned int sgc);
        int                        (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
                                                        struct netdev_fcoe_hbainfo *hbainfo);
#endif

#if IS_ENABLED(CONFIG_LIBFCOE)
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
        int                        (*ndo_fcoe_get_wwn)(struct net_device *dev,
                                                    u64 *wwn, int type);
#endif

#ifdef CONFIG_RFS_ACCEL
        int                        (*ndo_rx_flow_steer)(struct net_device *dev,
                                                     const struct sk_buff *skb,
                                                     u16 rxq_index,
                                                     u32 flow_id);
#endif
        int                        (*ndo_add_slave)(struct net_device *dev,
                                                 struct net_device *slave_dev,
                                                 struct netlink_ext_ack *extack);
        int                        (*ndo_del_slave)(struct net_device *dev,
                                                 struct net_device *slave_dev);
        struct net_device*        (*ndo_get_xmit_slave)(struct net_device *dev,
                                                      struct sk_buff *skb,
                                                      bool all_slaves);
        netdev_features_t        (*ndo_fix_features)(struct net_device *dev,
                                                    netdev_features_t features);
        int                        (*ndo_set_features)(struct net_device *dev,
                                                    netdev_features_t features);
        int                        (*ndo_neigh_construct)(struct net_device *dev,
                                                       struct neighbour *n);
        void                        (*ndo_neigh_destroy)(struct net_device *dev,
                                                     struct neighbour *n);

        int                        (*ndo_fdb_add)(struct ndmsg *ndm,
                                               struct nlattr *tb[],
                                               struct net_device *dev,
                                               const unsigned char *addr,
                                               u16 vid,
                                               u16 flags,
                                               struct netlink_ext_ack *extack);
        int                        (*ndo_fdb_del)(struct ndmsg *ndm,
                                               struct nlattr *tb[],
                                               struct net_device *dev,
                                               const unsigned char *addr,
                                               u16 vid);
        int                        (*ndo_fdb_dump)(struct sk_buff *skb,
                                                struct netlink_callback *cb,
                                                struct net_device *dev,
                                                struct net_device *filter_dev,
                                                int *idx);
        int                        (*ndo_fdb_get)(struct sk_buff *skb,
                                               struct nlattr *tb[],
                                               struct net_device *dev,
                                               const unsigned char *addr,
                                               u16 vid, u32 portid, u32 seq,
                                               struct netlink_ext_ack *extack);
        int                        (*ndo_bridge_setlink)(struct net_device *dev,
                                                      struct nlmsghdr *nlh,
                                                      u16 flags,
                                                      struct netlink_ext_ack *extack);
        int                        (*ndo_bridge_getlink)(struct sk_buff *skb,
                                                      u32 pid, u32 seq,
                                                      struct net_device *dev,
                                                      u32 filter_mask,
                                                      int nlflags);
        int                        (*ndo_bridge_dellink)(struct net_device *dev,
                                                      struct nlmsghdr *nlh,
                                                      u16 flags);
        int                        (*ndo_change_carrier)(struct net_device *dev,
                                                      bool new_carrier);
        int                        (*ndo_get_phys_port_id)(struct net_device *dev,
                                                        struct netdev_phys_item_id *ppid);
        int                        (*ndo_get_port_parent_id)(struct net_device *dev,
                                                          struct netdev_phys_item_id *ppid);
        int                        (*ndo_get_phys_port_name)(struct net_device *dev,
                                                          char *name, size_t len);
        void                        (*ndo_udp_tunnel_add)(struct net_device *dev,
                                                      struct udp_tunnel_info *ti);
        void                        (*ndo_udp_tunnel_del)(struct net_device *dev,
                                                      struct udp_tunnel_info *ti);
        void*                        (*ndo_dfwd_add_station)(struct net_device *pdev,
                                                        struct net_device *dev);
        void                        (*ndo_dfwd_del_station)(struct net_device *pdev,
                                                        void *priv);

        int                        (*ndo_set_tx_maxrate)(struct net_device *dev,
                                                      int queue_index,
                                                      u32 maxrate);
        int                        (*ndo_get_iflink)(const struct net_device *dev);
        int                        (*ndo_change_proto_down)(struct net_device *dev,
                                                         bool proto_down);
        int                        (*ndo_fill_metadata_dst)(struct net_device *dev,
                                                       struct sk_buff *skb);
        void                        (*ndo_set_rx_headroom)(struct net_device *dev,
                                                       int needed_headroom);
        int                        (*ndo_bpf)(struct net_device *dev,
                                           struct netdev_bpf *bpf);
        int                        (*ndo_xdp_xmit)(struct net_device *dev, int n,
                                                struct xdp_frame **xdp,
                                                u32 flags);
        int                        (*ndo_xsk_wakeup)(struct net_device *dev,
                                                  u32 queue_id, u32 flags);
        struct devlink_port *        (*ndo_get_devlink_port)(struct net_device *dev);
        int                        (*ndo_tunnel_ctl)(struct net_device *dev,
                                                  struct ip_tunnel_parm *p, int cmd);
        struct net_device *        (*ndo_get_peer_dev)(struct net_device *dev);
};

/**
 * enum netdev_priv_flags - &struct net_device priv_flags
 *
 * These are the &struct net_device priv_flags; they are only set internally
 * by drivers and used in the kernel. These flags are invisible to
 * userspace; this means that the order of these flags can change
 * during any kernel release.
 *
 * You should have a pretty good reason to be extending these flags.
 *
 * @IFF_802_1Q_VLAN: 802.1Q VLAN device
 * @IFF_EBRIDGE: Ethernet bridging device
 * @IFF_BONDING: bonding master or slave
 * @IFF_ISATAP: ISATAP interface (RFC4214)
 * @IFF_WAN_HDLC: WAN HDLC device
 * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
 *        release skb->dst
 * @IFF_DONT_BRIDGE: disallow bridging this ether dev
 * @IFF_DISABLE_NETPOLL: disable netpoll at run-time
 * @IFF_MACVLAN_PORT: device used as macvlan port
 * @IFF_BRIDGE_PORT: device used as bridge port
 * @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port
 * @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit
 * @IFF_UNICAST_FLT: Supports unicast filtering
 * @IFF_TEAM_PORT: device used as team port
 * @IFF_SUPP_NOFCS: device supports sending custom FCS
 * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
 *        change when it's running
 * @IFF_MACVLAN: Macvlan device
 * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account
 *        underlying stacked devices
 * @IFF_L3MDEV_MASTER: device is an L3 master device
 * @IFF_NO_QUEUE: device can run without qdisc attached
 * @IFF_OPENVSWITCH: device is an Open vSwitch master
 * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
 * @IFF_TEAM: device is a team device
 * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
 * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
 *        entity (i.e. the master device for bridged veth)
 * @IFF_MACSEC: device is a MACsec device
 * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
 * @IFF_FAILOVER: device is a failover master device
 * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
 * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
 * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
 */
enum netdev_priv_flags {
        /* One distinct bit per flag; bits 0 through 30 are assigned. */
        IFF_802_1Q_VLAN             = 1 << 0,
        IFF_EBRIDGE                 = 1 << 1,
        IFF_BONDING                 = 1 << 2,
        IFF_ISATAP                  = 1 << 3,
        IFF_WAN_HDLC                = 1 << 4,
        IFF_XMIT_DST_RELEASE        = 1 << 5,
        IFF_DONT_BRIDGE             = 1 << 6,
        IFF_DISABLE_NETPOLL         = 1 << 7,
        IFF_MACVLAN_PORT            = 1 << 8,
        IFF_BRIDGE_PORT             = 1 << 9,
        IFF_OVS_DATAPATH            = 1 << 10,
        IFF_TX_SKB_SHARING          = 1 << 11,
        IFF_UNICAST_FLT             = 1 << 12,
        IFF_TEAM_PORT               = 1 << 13,
        IFF_SUPP_NOFCS              = 1 << 14,
        IFF_LIVE_ADDR_CHANGE        = 1 << 15,
        IFF_MACVLAN                 = 1 << 16,
        IFF_XMIT_DST_RELEASE_PERM   = 1 << 17,
        IFF_L3MDEV_MASTER           = 1 << 18,
        IFF_NO_QUEUE                = 1 << 19,
        IFF_OPENVSWITCH             = 1 << 20,
        IFF_L3MDEV_SLAVE            = 1 << 21,
        IFF_TEAM                    = 1 << 22,
        IFF_RXFH_CONFIGURED         = 1 << 23,
        IFF_PHONY_HEADROOM          = 1 << 24,
        IFF_MACSEC                  = 1 << 25,
        IFF_NO_RX_HANDLER           = 1 << 26,
        IFF_FAILOVER                = 1 << 27,
        IFF_FAILOVER_SLAVE          = 1 << 28,
        IFF_L3MDEV_RX_HANDLER       = 1 << 29,
        IFF_LIVE_RENAME_OK          = 1 << 30,
};

/*
 * Each netdev_priv_flags enumerator is also defined as a macro that expands
 * to itself, which makes the flag names visible to #ifdef / defined() checks.
 * Keep this list in step with the enum: one entry per enumerator.
 */
#define IFF_802_1Q_VLAN                 IFF_802_1Q_VLAN
#define IFF_EBRIDGE                     IFF_EBRIDGE
#define IFF_BONDING                     IFF_BONDING
#define IFF_ISATAP                      IFF_ISATAP
#define IFF_WAN_HDLC                    IFF_WAN_HDLC
#define IFF_XMIT_DST_RELEASE            IFF_XMIT_DST_RELEASE
#define IFF_DONT_BRIDGE                 IFF_DONT_BRIDGE
#define IFF_DISABLE_NETPOLL             IFF_DISABLE_NETPOLL
#define IFF_MACVLAN_PORT                IFF_MACVLAN_PORT
#define IFF_BRIDGE_PORT                 IFF_BRIDGE_PORT
#define IFF_OVS_DATAPATH                IFF_OVS_DATAPATH
#define IFF_TX_SKB_SHARING              IFF_TX_SKB_SHARING
#define IFF_UNICAST_FLT                 IFF_UNICAST_FLT
#define IFF_TEAM_PORT                   IFF_TEAM_PORT
#define IFF_SUPP_NOFCS                  IFF_SUPP_NOFCS
#define IFF_LIVE_ADDR_CHANGE            IFF_LIVE_ADDR_CHANGE
#define IFF_MACVLAN                     IFF_MACVLAN
#define IFF_XMIT_DST_RELEASE_PERM       IFF_XMIT_DST_RELEASE_PERM
#define IFF_L3MDEV_MASTER               IFF_L3MDEV_MASTER
#define IFF_NO_QUEUE                    IFF_NO_QUEUE
#define IFF_OPENVSWITCH                 IFF_OPENVSWITCH
#define IFF_L3MDEV_SLAVE                IFF_L3MDEV_SLAVE
#define IFF_TEAM                        IFF_TEAM
#define IFF_RXFH_CONFIGURED             IFF_RXFH_CONFIGURED
/* Previously missing: the enumerator existed but had no matching macro. */
#define IFF_PHONY_HEADROOM              IFF_PHONY_HEADROOM
#define IFF_MACSEC                      IFF_MACSEC
#define IFF_NO_RX_HANDLER               IFF_NO_RX_HANDLER
#define IFF_FAILOVER                    IFF_FAILOVER
#define IFF_FAILOVER_SLAVE              IFF_FAILOVER_SLAVE
#define IFF_L3MDEV_RX_HANDLER           IFF_L3MDEV_RX_HANDLER
#define IFF_LIVE_RENAME_OK              IFF_LIVE_RENAME_OK

/* Tag recording what kind of data struct net_device::ml_priv points to */
enum netdev_ml_priv_type {
        ML_PRIV_NONE = 0,
        ML_PRIV_CAN  = 1,
};

/**
 *        struct net_device - The DEVICE structure.
 *
 *        Actually, this whole structure is a big mistake.  It mixes I/O
 *        data with strictly "high-level" data, and it has to know about
 *        almost every data structure used in the INET module.
 *
 *        @name:        This is the first field of the "visible" part of this structure
 *                (i.e. as seen by users in the "Space.c" file).  It is the name
 *                of the interface.
 *
 *        @name_node:        Name hashlist node
 *        @ifalias:        SNMP alias
 *        @mem_end:        Shared memory end
 *        @mem_start:        Shared memory start
 *        @base_addr:        Device I/O address
 *        @irq:                Device IRQ number
 *
 *        @state:                Generic network queuing layer state, see netdev_state_t
 *        @dev_list:        The global list of network devices
 *        @napi_list:        List entry used for polling NAPI devices
 *        @unreg_list:        List entry when we are unregistering the
 *                        device; see the function unregister_netdev
 *        @close_list:        List entry used when we are closing the device
 *        @ptype_all:     Device-specific packet handlers for all protocols
 *        @ptype_specific: Device-specific, protocol-specific packet handlers
 *
 *        @adj_list:        Directly linked devices, like slaves for bonding
 *        @features:        Currently active device features
 *        @hw_features:        User-changeable features
 *
 *        @wanted_features:        User-requested features
 *        @vlan_features:                Mask of features inheritable by VLAN devices
 *
 *        @hw_enc_features:        Mask of features inherited by encapsulating devices
 *                                This field indicates what encapsulation
 *                                offloads the hardware is capable of doing,
 *                                and drivers will need to set them appropriately.
 *
 *        @mpls_features:        Mask of features inheritable by MPLS
 *        @gso_partial_features: value(s) from NETIF_F_GSO\*
 *
 *        @ifindex:        interface index
 *        @group:                The group the device belongs to
 *
 *        @stats:                Statistics struct, which was left as a legacy, use
 *                        rtnl_link_stats64 instead
 *
 *        @rx_dropped:        Dropped packets by core network,
 *                        do not use this in drivers
 *        @tx_dropped:        Dropped packets by core network,
 *                        do not use this in drivers
 *        @rx_nohandler:        nohandler dropped packets by core network on
 *                        inactive devices, do not use this in drivers
 *        @carrier_up_count:        Number of times the carrier has been up
 *        @carrier_down_count:        Number of times the carrier has been down
 *
 *        @wireless_handlers:        List of functions to handle Wireless Extensions,
 *                                instead of ioctl,
 *                                see <net/iw_handler.h> for details.
 *        @wireless_data:        Instance data managed by the core of wireless extensions
 *
 *        @netdev_ops:        Includes several pointers to callbacks,
 *                        if one wants to override the ndo_*() functions
 *        @ethtool_ops:        Management operations
 *        @l3mdev_ops:        Layer 3 master device operations
 *        @ndisc_ops:        Includes callbacks for different IPv6 neighbour
 *                        discovery handling. Necessary for e.g. 6LoWPAN.
 *        @xfrmdev_ops:        Transformation offload operations
 *        @tlsdev_ops:        Transport Layer Security offload operations
 *        @header_ops:        Includes callbacks for creating, parsing, caching, etc.
 *                        of Layer 2 headers.
 *
 *        @flags:                Interface flags (a la BSD)
 *        @priv_flags:        Like 'flags' but invisible to userspace,
 *                        see enum netdev_priv_flags for the definitions
 *        @gflags:        Global flags ( kept as legacy )
 *        @padded:        How much padding added by alloc_netdev()
 *        @operstate:        RFC2863 operstate
 *        @link_mode:        Mapping policy to operstate
 *        @if_port:        Selectable AUI, TP, ...
 *        @dma:                DMA channel
 *        @mtu:                Interface MTU value
 *        @min_mtu:        Interface Minimum MTU value
 *        @max_mtu:        Interface Maximum MTU value
 *        @type:                Interface hardware type
 *        @hard_header_len: Maximum hardware header length.
 *        @min_header_len:  Minimum hardware header length
 *
 *        @needed_headroom: Extra headroom the hardware may need, but not in all
 *                          cases can this be guaranteed
 *        @needed_tailroom: Extra tailroom the hardware may need, but not in all
 *                          cases can this be guaranteed. Some cases also use
 *                          LL_MAX_HEADER instead to allocate the skb
 *
 *        interface address info:
 *
 *         @perm_addr:                Permanent hw address
 *         @addr_assign_type:        Hw address assignment type
 *         @addr_len:                Hardware address length
 *        @upper_level:                Maximum depth level of upper devices.
 *        @lower_level:                Maximum depth level of lower devices.
 *        @neigh_priv_len:        Used in neigh_alloc()
 *         @dev_id:                Used to differentiate devices that share
 *                                 the same link layer address
 *         @dev_port:                Used to differentiate devices that share
 *                                 the same function
 *        @addr_list_lock:        XXX: need comments on this one
 *        @name_assign_type:        network interface name assignment type
 *        @uc_promisc:                Counter that indicates promiscuous mode
 *                                has been enabled due to the need to listen to
 *                                additional unicast addresses in a device that
 *                                does not implement ndo_set_rx_mode()
 *        @uc:                        unicast mac addresses
 *        @mc:                        multicast mac addresses
 *        @dev_addrs:                list of device hw addresses
 *        @queues_kset:                Group of all Kobjects in the Tx and RX queues
 *        @promiscuity:                Number of times the NIC is told to work in
 *                                promiscuous mode; if it becomes 0 the NIC will
 *                                exit promiscuous mode
 *        @allmulti:                Counter, enables or disables allmulticast mode
 *
 *        @vlan_info:        VLAN info
 *        @dsa_ptr:        dsa specific data
 *        @tipc_ptr:        TIPC specific data
 *        @atalk_ptr:        AppleTalk link
 *        @ip_ptr:        IPv4 specific data
 *        @ip6_ptr:        IPv6 specific data
 *        @ax25_ptr:        AX.25 specific data
 *        @ieee80211_ptr:        IEEE 802.11 specific data, assign before registering
 *        @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network
 *                         device struct
 *        @mpls_ptr:        mpls_dev struct pointer
 *
 *        @dev_addr:        Hw address (before bcast,
 *                        because most packets are unicast)
 *
 *        @_rx:                        Array of RX queues
 *        @num_rx_queues:                Number of RX queues
 *                                allocated at register_netdev() time
 *        @real_num_rx_queues:         Number of RX queues currently active in device
 *        @xdp_prog:                XDP sockets filter program pointer
 *        @gro_flush_timeout:        timeout for GRO layer in NAPI
 *        @napi_defer_hard_irqs:        If not zero, provides a counter that would
 *                                allow to avoid NIC hard IRQ, on busy queues.
 *
 *        @rx_handler:                handler for received packets
 *        @rx_handler_data:         XXX: need comments on this one
 *        @miniq_ingress:                ingress/clsact qdisc specific data for
 *                                ingress processing
 *        @ingress_queue:                XXX: need comments on this one
 *        @nf_hooks_ingress:        netfilter hooks executed for ingress packets
 *        @broadcast:                hw bcast address
 *
 *        @rx_cpu_rmap:        CPU reverse-mapping for RX completion interrupts,
 *                        indexed by RX queue number. Assigned by driver.
 *                        This must only be set if the ndo_rx_flow_steer
 *                        operation is defined
 *        @index_hlist:                Device index hash chain
 *
 *        @_tx:                        Array of TX queues
 *        @num_tx_queues:                Number of TX queues allocated at alloc_netdev_mq() time
 *        @real_num_tx_queues:         Number of TX queues currently active in device
 *        @qdisc:                        Root qdisc from userspace point of view
 *        @tx_queue_len:                Max frames per queue allowed
 *        @tx_global_lock:         XXX: need comments on this one
 *        @xdp_bulkq:                XDP device bulk queue
 *        @xps_cpus_map:                all CPUs map for XPS device
 *        @xps_rxqs_map:                all RXQs map for XPS device
 *
 *        @xps_maps:        XXX: need comments on this one
 *        @miniq_egress:                clsact qdisc specific data for
 *                                egress processing
 *        @qdisc_hash:                qdisc hash table
 *        @watchdog_timeo:        Represents the timeout that is used by
 *                                the watchdog (see dev_watchdog())
 *        @watchdog_timer:        List of timers
 *
 *        @proto_down_reason:        reason a netdev interface is held down
 *        @pcpu_refcnt:                Number of references to this device
 *        @todo_list:                Delayed register/unregister
 *        @link_watch_list:        XXX: need comments on this one
 *
 *        @reg_state:                Register/unregister state machine
 *        @dismantle:                Device is going to be freed
 *        @rtnl_link_state:        This enum represents the phases of creating
 *                                a new link
 *
 *        @needs_free_netdev:        Should unregister perform free_netdev?
 *        @priv_destructor:        Called from unregister
 *        @npinfo:                XXX: need comments on this one
 *         @nd_net:                Network namespace this network device is inside
 *
 *         @ml_priv:        Mid-layer private
 *        @ml_priv_type:  Mid-layer private type
 *         @lstats:        Loopback statistics
 *         @tstats:        Tunnel statistics
 *         @dstats:        Dummy statistics
 *         @vstats:        Virtual ethernet statistics
 *
 *        @garp_port:        GARP
 *        @mrp_port:        MRP
 *
 *        @dev:                Class/net/name entry
 *        @sysfs_groups:        Space for optional device, statistics and wireless
 *                        sysfs groups
 *
 *        @sysfs_rx_queue_group:        Space for optional per-rx queue attributes
 *        @rtnl_link_ops:        Rtnl_link_ops
 *
 *        @gso_max_size:        Maximum size of generic segmentation offload
 *        @gso_max_segs:        Maximum number of segments that can be passed to the
 *                        NIC for GSO
 *
 *        @dcbnl_ops:        Data Center Bridging netlink ops
 *        @num_tc:        Number of traffic classes in the net device
 *        @tc_to_txq:        XXX: need comments on this one
 *        @prio_tc_map:        XXX: need comments on this one
 *
 *        @fcoe_ddp_xid:        Max exchange id for FCoE LRO by ddp
 *
 *        @priomap:        XXX: need comments on this one
 *        @phydev:        Physical device may attach itself
 *                        for hardware timestamping
 *        @sfp_bus:        attached &struct sfp_bus structure.
 *
 *        @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
 *        @qdisc_running_key: lockdep class annotating Qdisc->running seqcount
 *
 *        @proto_down:        protocol port state information can be sent to the
 *                        switch driver and used to set the phys state of the
 *                        switch port.
 *
 *        @wol_enabled:        Wake-on-LAN is enabled
 *
 *        @net_notifier_list:        List of per-net netdev notifier block
 *                                that follow this device when it is moved
 *                                to another network namespace.
 *
 *        @macsec_ops:    MACsec offloading ops
 *
 *        @udp_tunnel_nic_info:        static structure describing the UDP tunnel
 *                                offload capabilities of the device
 *        @udp_tunnel_nic:        UDP tunnel offload state
 *        @xdp_state:                stores info on attached XDP BPF programs
 *
 *        @nested_level:        Used as a parameter of spin_lock_nested() of
 *                        dev->addr_list_lock.
 *        @unlink_list:        As netif_addr_lock() can be called recursively,
 *                        keep a list of interfaces to be deleted.
 *
 *        FIXME: cleanup struct net_device such that network protocol info
 *        moves out.
 */

struct net_device {
        char                        name[IFNAMSIZ];
        struct netdev_name_node        *name_node;
        struct dev_ifalias        __rcu *ifalias;
        /*
         *        I/O specific fields
         *        FIXME: Merge these and struct ifmap into one
         */
        unsigned long                mem_end;
        unsigned long                mem_start;
        unsigned long                base_addr;
        int                        irq;

        /*
         *        Some hardware also needs these fields (state,dev_list,
         *        napi_list,unreg_list,close_list) but they are not
         *        part of the usual set specified in Space.c.
         */

        unsigned long                state;

        struct list_head        dev_list;
        struct list_head        napi_list;
        struct list_head        unreg_list;
        struct list_head        close_list;
        struct list_head        ptype_all;
        struct list_head        ptype_specific;

        struct {
                struct list_head upper;
                struct list_head lower;
        } adj_list;

        netdev_features_t        features;
        netdev_features_t        hw_features;
        netdev_features_t        wanted_features;
        netdev_features_t        vlan_features;
        netdev_features_t        hw_enc_features;
        netdev_features_t        mpls_features;
        netdev_features_t        gso_partial_features;

        int                        ifindex;
        int                        group;

        struct net_device_stats        stats;

        atomic_long_t                rx_dropped;
        atomic_long_t                tx_dropped;
        atomic_long_t                rx_nohandler;

        /* Stats to monitor link on/off, flapping */
        atomic_t                carrier_up_count;
        atomic_t                carrier_down_count;

#ifdef CONFIG_WIRELESS_EXT
        const struct iw_handler_def *wireless_handlers;
        struct iw_public_data        *wireless_data;
#endif
        const struct net_device_ops *netdev_ops;
        const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_L3_MASTER_DEV
        const struct l3mdev_ops        *l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        const struct ndisc_ops *ndisc_ops;
#endif

#ifdef CONFIG_XFRM_OFFLOAD
        const struct xfrmdev_ops *xfrmdev_ops;
#endif

#if IS_ENABLED(CONFIG_TLS_DEVICE)
        const struct tlsdev_ops *tlsdev_ops;
#endif

        const struct header_ops *header_ops;

        unsigned int                flags;
        unsigned int                priv_flags;

        unsigned short                gflags;
        unsigned short                padded;

        unsigned char                operstate;
        unsigned char                link_mode;

        unsigned char                if_port;
        unsigned char                dma;

        /* Note : dev->mtu is often read without holding a lock.
         * Writers usually hold RTNL.
         * It is recommended to use READ_ONCE() to annotate the reads,
         * and to use WRITE_ONCE() to annotate the writes.
         */
        unsigned int                mtu;
        unsigned int                min_mtu;
        unsigned int                max_mtu;
        unsigned short                type;
        unsigned short                hard_header_len;
        unsigned char                min_header_len;
        unsigned char                name_assign_type;

        unsigned short                needed_headroom;
        unsigned short                needed_tailroom;

        /* Interface address info. */
        unsigned char                perm_addr[MAX_ADDR_LEN];
        unsigned char                addr_assign_type;
        unsigned char                addr_len;
        unsigned char                upper_level;
        unsigned char                lower_level;

        unsigned short                neigh_priv_len;
        unsigned short          dev_id;
        unsigned short          dev_port;
        spinlock_t                addr_list_lock;

        struct netdev_hw_addr_list        uc;
        struct netdev_hw_addr_list        mc;
        struct netdev_hw_addr_list        dev_addrs;

#ifdef CONFIG_SYSFS
        struct kset                *queues_kset;
#endif
#ifdef CONFIG_LOCKDEP
        struct list_head        unlink_list;
#endif
        unsigned int                promiscuity;
        unsigned int                allmulti;
        bool                        uc_promisc;
#ifdef CONFIG_LOCKDEP
        unsigned char                nested_level;
#endif


        /* Protocol-specific pointers */

#if IS_ENABLED(CONFIG_VLAN_8021Q)
        struct vlan_info __rcu        *vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
        struct dsa_port                *dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
        struct tipc_bearer __rcu *tipc_ptr;
#endif
#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK)
        void                         *atalk_ptr;
#endif
        struct in_device __rcu        *ip_ptr;
        struct inet6_dev __rcu        *ip6_ptr;
#if IS_ENABLED(CONFIG_AX25)
        void                        *ax25_ptr;
#endif
        struct wireless_dev        *ieee80211_ptr;
        struct wpan_dev                *ieee802154_ptr;
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
        struct mpls_dev __rcu        *mpls_ptr;
#endif

/*
 * Cache lines mostly used on receive path (including eth_type_trans())
 */
        /* Interface address info used in eth_type_trans() */
        unsigned char                *dev_addr;

        struct netdev_rx_queue        *_rx;
        unsigned int                num_rx_queues;
        unsigned int                real_num_rx_queues;

        struct bpf_prog __rcu        *xdp_prog;
        unsigned long                gro_flush_timeout;
        int                        napi_defer_hard_irqs;
        rx_handler_func_t __rcu        *rx_handler;
        void __rcu                *rx_handler_data;

#ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc __rcu        *miniq_ingress;
#endif
        struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
        struct nf_hook_entries __rcu *nf_hooks_ingress;
#endif

        unsigned char                broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
        struct cpu_rmap                *rx_cpu_rmap;
#endif
        struct hlist_node        index_hlist;

/*
 * Cache lines mostly used on transmit path
 */
        struct netdev_queue        *_tx ____cacheline_aligned_in_smp;
        unsigned int                num_tx_queues;
        unsigned int                real_num_tx_queues;
        struct Qdisc __rcu        *qdisc;
        unsigned int                tx_queue_len;
        spinlock_t                tx_global_lock;

        struct xdp_dev_bulk_queue __percpu *xdp_bulkq;

#ifdef CONFIG_XPS
        struct xps_dev_maps __rcu *xps_cpus_map;
        struct xps_dev_maps __rcu *xps_rxqs_map;
#endif
#ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc __rcu        *miniq_egress;
#endif

#ifdef CONFIG_NET_SCHED
        DECLARE_HASHTABLE        (qdisc_hash, 4);
#endif
        /* These may be needed for future network-power-down code. */
        struct timer_list        watchdog_timer;
        int                        watchdog_timeo;

        u32                     proto_down_reason;

        struct list_head        todo_list;
        int __percpu                *pcpu_refcnt;

        struct list_head        link_watch_list;

        enum { NETREG_UNINITIALIZED=0,
               NETREG_REGISTERED,        /* completed register_netdevice */
               NETREG_UNREGISTERING,        /* called unregister_netdevice */
               NETREG_UNREGISTERED,        /* completed unregister todo */
               NETREG_RELEASED,                /* called free_netdev */
               NETREG_DUMMY,                /* dummy device for NAPI poll */
        } reg_state:8;

        bool dismantle;

        enum {
                RTNL_LINK_INITIALIZED,
                RTNL_LINK_INITIALIZING,
        } rtnl_link_state:16;

        bool needs_free_netdev;
        void (*priv_destructor)(struct net_device *dev);

#ifdef CONFIG_NETPOLL
        struct netpoll_info __rcu        *npinfo;
#endif

        possible_net_t                        nd_net;

        /* mid-layer private */
        void                                *ml_priv;
        enum netdev_ml_priv_type        ml_priv_type;

        union {
                struct pcpu_lstats __percpu                *lstats;
                struct pcpu_sw_netstats __percpu        *tstats;
                struct pcpu_dstats __percpu                *dstats;
        };

#if IS_ENABLED(CONFIG_GARP)
        struct garp_port __rcu        *garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
        struct mrp_port __rcu        *mrp_port;
#endif

        struct device                dev;
        const struct attribute_group *sysfs_groups[4];
        const struct attribute_group *sysfs_rx_queue_group;

        const struct rtnl_link_ops *rtnl_link_ops;

        /* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE                65536
        unsigned int                gso_max_size;
#define GSO_MAX_SEGS                65535
        u16                        gso_max_segs;

#ifdef CONFIG_DCB
        const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
        s16                        num_tc;
        struct netdev_tc_txq        tc_to_txq[TC_MAX_QUEUE];
        u8                        prio_tc_map[TC_BITMASK + 1];

#if IS_ENABLED(CONFIG_FCOE)
        unsigned int                fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
        struct netprio_map __rcu *priomap;
#endif
        struct phy_device        *phydev;
        struct sfp_bus                *sfp_bus;
        struct lock_class_key        *qdisc_tx_busylock;
        struct lock_class_key        *qdisc_running_key;
        bool                        proto_down;
        unsigned                wol_enabled:1;

        struct list_head        net_notifier_list;

#if IS_ENABLED(CONFIG_MACSEC)
        /* MACsec management functions */
        const struct macsec_ops *macsec_ops;
#endif
        const struct udp_tunnel_nic_info        *udp_tunnel_nic_info;
        struct udp_tunnel_nic        *udp_tunnel_nic;

        /* protected by rtnl_lock */
        struct bpf_xdp_entity        xdp_state[__MAX_XDP_MODE];
};
/* Convert a pointer to the struct device embedded in a net_device (its
 * "dev" member) back into a pointer to the containing net_device.
 */
#define to_net_dev(d) container_of(d, struct net_device, dev)

/* Tell whether GRO should be skipped for @dev: true when NETIF_F_GRO is not
 * set in dev->features, or when dev->xdp_prog is non-NULL.
 */
static inline bool netif_elide_gro(const struct net_device *dev)
{
        return !(dev->features & NETIF_F_GRO) || dev->xdp_prog;
}

/* Alignment that netdev_priv() applies to sizeof(struct net_device). */
#define        NETDEV_ALIGN                32

/* Return the traffic class stored for @prio in dev->prio_tc_map.
 * @prio is masked with TC_BITMASK before it is used as an index.
 */
static inline
int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
{
        const u32 slot = prio & TC_BITMASK;

        return dev->prio_tc_map[slot];
}

/* Record @tc as the traffic class for @prio.
 * Returns 0 on success, or -EINVAL when @tc is not below dev->num_tc.
 * Both the index and the stored value are masked with TC_BITMASK.
 */
static inline
int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
{
        if (tc < dev->num_tc) {
                dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK;
                return 0;
        }

        return -EINVAL;
}

/* Traffic-class (tc) configuration helpers. Only declared here; the
 * definitions live outside this header.
 */
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq);
void netdev_reset_tc(struct net_device *dev);
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
int netdev_set_num_tc(struct net_device *dev, u8 num_tc);

/* Return the device's num_tc field. */
static inline
int netdev_get_num_tc(struct net_device *dev)
{
        const int num_tc = dev->num_tc;

        return num_tc;
}

static inline void net_prefetch(void *p)
{
        prefetch(p);
#if L1_CACHE_BYTES < 128
        prefetch((u8 *)p + L1_CACHE_BYTES);
#endif
}

static inline void net_prefetchw(void *p)
{
        prefetchw(p);
#if L1_CACHE_BYTES < 128
        prefetchw((u8 *)p + L1_CACHE_BYTES);
#endif
}

/* sb_channel helpers operating on a device and an associated sb_dev.
 * Only declared here; the definitions live outside this header.
 */
void netdev_unbind_sb_channel(struct net_device *dev,
                              struct net_device *sb_dev);
int netdev_bind_sb_channel_queue(struct net_device *dev,
                                 struct net_device *sb_dev,
                                 u8 tc, u16 count, u16 offset);
int netdev_set_sb_channel(struct net_device *dev, u16 channel);
static inline int netdev_get_sb_channel(struct net_device *dev)
{
        return max_t(int, -dev->num_tc, 0);
}

/* Return a pointer to entry @index of the device's _tx array.
 * No bounds check is performed on @index.
 */
static inline
struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
                                         unsigned int index)
{
        return dev->_tx + index;
}

/* Return the TX queue of @dev selected by the queue mapping of @skb. */
static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev,
                                                    const struct sk_buff *skb)
{
        unsigned int queue_index = skb_get_queue_mapping(skb);

        return netdev_get_tx_queue(dev, queue_index);
}

/* Invoke @f(dev, queue, arg) once for each of the dev->num_tx_queues entries
 * of dev->_tx, in ascending index order.
 */
static inline void netdev_for_each_tx_queue(struct net_device *dev,
                                            void (*f)(struct net_device *,
                                                      struct netdev_queue *,
                                                      void *),
                                            void *arg)
{
        unsigned int qidx = 0;

        while (qidx < dev->num_tx_queues) {
                f(dev, dev->_tx + qidx, arg);
                qidx++;
        }
}

/*
 * netdev_lockdep_set_classes - assign lockdep classes to a device's locks
 * @dev: device to annotate (the argument is evaluated more than once)
 *
 * Expands to a brace-enclosed block that declares four static
 * lock_class_key objects, so every expansion site gets its own set of keys.
 * It stores two key addresses in dev->qdisc_tx_busylock and
 * dev->qdisc_running_key, sets the class of dev->addr_list_lock, and sets
 * the class of the _xmit_lock of each of the dev->num_tx_queues entries of
 * dev->_tx.
 *
 * NOTE(review): the expansion is a bare { } block, not do { } while (0), and
 * it declares a local named "i"; take care when using it as the unbraced
 * body of an if that has an else branch.
 */
#define netdev_lockdep_set_classes(dev)                                \
{                                                                \
        static struct lock_class_key qdisc_tx_busylock_key;        \
        static struct lock_class_key qdisc_running_key;                \
        static struct lock_class_key qdisc_xmit_lock_key;        \
        static struct lock_class_key dev_addr_list_lock_key;        \
        unsigned int i;                                                \
                                                                \
        (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key;        \
        (dev)->qdisc_running_key = &qdisc_running_key;                \
        lockdep_set_class(&(dev)->addr_list_lock,                \
                          &dev_addr_list_lock_key);                \
        for (i = 0; i < (dev)->num_tx_queues; i++)                \
                lockdep_set_class(&(dev)->_tx[i]._xmit_lock,        \
                                  &qdisc_xmit_lock_key);        \
}

/* TX queue selection entry points. Only declared here; the definitions live
 * outside this header.
 */
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb,
                                         struct net_device *sb_dev);

/* Return the headroom that the master device needs to take into account
 * when forwarding to this dev: 0 if IFF_PHONY_HEADROOM is set in
 * dev->priv_flags, otherwise dev->needed_headroom.
 */
static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
{
        if (dev->priv_flags & IFF_PHONY_HEADROOM)
                return 0;

        return dev->needed_headroom;
}

static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
{
        if (dev->netdev_ops->ndo_set_rx_headroom)
                dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
}

/* Set the device rx headroom back to the dev's default by passing -1 to
 * netdev_set_rx_headroom().
 */
static inline void netdev_reset_rx_headroom(struct net_device *dev)
{
        const int default_hr = -1;

        netdev_set_rx_headroom(dev, default_hr);
}

/* Return dev->ml_priv if the device's ml_priv_type equals @type,
 * otherwise NULL.
 */
static inline void *netdev_get_ml_priv(struct net_device *dev,
                                       enum netdev_ml_priv_type type)
{
        return dev->ml_priv_type == type ? dev->ml_priv : NULL;
}

static inline void netdev_set_ml_priv(struct net_device *dev,
                                      void *ml_priv,
                                      enum netdev_ml_priv_type type)
{
        WARN(dev->ml_priv_type && dev->ml_priv_type != type,
             "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n",
             dev->ml_priv_type, type);
        WARN(!dev->ml_priv_type && dev->ml_priv,
             "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n");

        dev->ml_priv = ml_priv;
        dev->ml_priv_type = type;
}

/*
 * Net namespace inlines
 */
static inline
struct net *dev_net(const struct net_device *dev)
{
        return read_pnet(&dev->nd_net);
}

static inline
void dev_net_set(struct net_device *dev, struct net *net)
{
        write_pnet(&dev->nd_net, net);
}

/**
 *        netdev_priv - access network device private data
 *        @dev: network device
 *
 * Get network device private data
 */
static inline void *netdev_priv(const struct net_device *dev)
{
        return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
}

/* Set the sysfs physical device reference for the network logical device.
 * If this is set prior to registration, it will cause a symlink to be
 * created during initialization.
 * The macro assigns @pdev to the net_device's dev.parent.
 */
#define SET_NETDEV_DEV(net, pdev)        ((net)->dev.parent = (pdev))

/* Set the sysfs device type for the network logical device to allow
 * fine-grained identification of different network device types, for
 * example Ethernet, Wireless LAN, Bluetooth, WiMAX, etc.
 * The macro assigns @devtype to the net_device's dev.type.
 */
#define SET_NETDEV_DEVTYPE(net, devtype)        ((net)->dev.type = (devtype))

/* Default NAPI poll() weight.
 * Device drivers are strongly advised not to use a bigger value.
 */
#define NAPI_POLL_WEIGHT 64

/**
 *        netif_napi_add - initialize a NAPI context
 *        @dev:  network device
 *        @napi: NAPI context
 *        @poll: polling function
 *        @weight: default weight
 *
 * netif_napi_add() must be used to initialize a NAPI context prior to calling
 * *any* of the other NAPI-related functions.
 * Only declared here; the definition lives outside this header.
 */
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight);

/**
 *        netif_tx_napi_add - initialize a NAPI context
 *        @dev:  network device
 *        @napi: NAPI context
 *        @poll: polling function
 *        @weight: default weight
 *
 * This variant of netif_napi_add() should be used by drivers that use NAPI
 * exclusively to poll a TX queue.
 * It sets NAPI_STATE_NO_BUSY_POLL in @napi->state before calling
 * netif_napi_add(), which avoids adding the context to napi_hash[] and
 * thereby polluting that hash table.
 */
static inline void netif_tx_napi_add(struct net_device *dev,
                                     struct napi_struct *napi,
                                     int (*poll)(struct napi_struct *, int),
                                     int weight)
{
        /* Must be set before netif_napi_add() runs. */
        set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state);
        netif_napi_add(dev, napi, poll, weight);
}

/**
 *  __netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 * Warning: the caller must observe an RCU grace period before freeing the
 * memory containing @napi. Drivers might want to call this helper to combine
 * all the needed RCU grace periods into a single one.
 */
void __netif_napi_del(struct napi_struct *napi);

/**
 *  netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 *  netif_napi_del() removes a NAPI context from the network device NAPI list.
 *  It calls __netif_napi_del() and then synchronize_net().
 */
static inline void netif_napi_del(struct napi_struct *napi)
{
        __netif_napi_del(napi);
        /* Wait after the removal, before the caller may free @napi. */
        synchronize_net();
}

/* GRO state kept per skb. It is accessed through NAPI_GRO_CB(), which casts
 * the skb's cb area to this type.
 * NOTE(review): the struct must therefore fit within skb->cb; the size check
 * is not visible in this part of the file.
 */
struct napi_gro_cb {
        /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */
        void        *frag0;

        /* Length of frag0. */
        unsigned int frag0_len;

        /* This indicates where we are processing relative to skb->data. */
        int        data_offset;

        /* This is non-zero if the packet cannot be merged with the new skb. */
        u16        flush;

        /* Save the IP ID here and check when we get to the transport layer */
        u16        flush_id;

        /* Number of segments aggregated. */
        u16        count;

        /* Start offset for remote checksum offload */
        u16        gro_remcsum_start;

        /* jiffies when first packet was created/queued */
        unsigned long age;

        /* Used in ipv6_gro_receive() and foo-over-udp */
        u16        proto;

        /* This is non-zero if the packet may be of the same flow. */
        u8        same_flow:1;

        /* Used in tunnel GRO receive */
        u8        encap_mark:1;

        /* GRO checksum is valid */
        u8        csum_valid:1;

        /* Number of checksums via CHECKSUM_UNNECESSARY */
        u8        csum_cnt:3;

        /* Free the skb? */
        u8        free:2;
#define NAPI_GRO_FREE                  1
#define NAPI_GRO_FREE_STOLEN_HEAD 2

        /* Used in foo-over-udp, set in udp[46]_gro_receive */
        u8        is_ipv6:1;

        /* Used in GRE, set in fou/gue_gro_receive */
        u8        is_fou:1;

        /* Used to determine if flush_id can be ignored */
        u8        is_atomic:1;

        /* Number of gro_receive callbacks this packet already went through.
         * Four bits wide; compared against GRO_RECURSION_LIMIT.
         */
        u8 recursion_counter:4;

        /* GRO is done by frag_list pointer chaining. */
        u8        is_flist:1;

        /* used to support CHECKSUM_COMPLETE for tunneling protocols */
        __wsum        csum;

        /* used in skb_gro_receive() slow path */
        struct sk_buff *last;
};

/* View the cb area of @skb as a struct napi_gro_cb. */
#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)

#define GRO_RECURSION_LIMIT 15
static inline int gro_recursion_inc_test(struct sk_buff *skb)
{
        return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT;
}

typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *);
/* Invoke @cb(head, skb) unless gro_recursion_inc_test() reports that the
 * recursion limit was reached; in that case set bit 0 of the skb's GRO flush
 * field and return NULL without calling @cb.
 */
static inline struct sk_buff *call_gro_receive(gro_receive_t cb,
                                               struct list_head *head,
                                               struct sk_buff *skb)
{
        if (likely(!gro_recursion_inc_test(skb)))
                return cb(head, skb);

        NAPI_GRO_CB(skb)->flush |= 1;
        return NULL;
}

typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *,
                                            struct sk_buff *);
/* Socket-aware counterpart of call_gro_receive(): invoke @cb(sk, head, skb)
 * unless gro_recursion_inc_test() reports that the recursion limit was
 * reached; in that case set bit 0 of the skb's GRO flush field and return
 * NULL without calling @cb.
 */
static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb,
                                                  struct sock *sk,
                                                  struct list_head *head,
                                                  struct sk_buff *skb)
{
        if (likely(!gro_recursion_inc_test(skb)))
                return cb(sk, head, skb);

        NAPI_GRO_CB(skb)->flush |= 1;
        return NULL;
}

/* Packet handler descriptor. Instances are passed to dev_add_pack(),
 * dev_remove_pack() and __dev_remove_pack(), which are declared in this
 * header.
 */
struct packet_type {
        __be16                        type;        /* This is really htons(ether_type). */
        bool                        ignore_outgoing;
        struct net_device        *dev;        /* NULL is wildcarded here             */
        /* Per-skb handler callback. */
        int                        (*func) (struct sk_buff *,
                                         struct net_device *,
                                         struct packet_type *,
                                         struct net_device *);
        /* Handler callback taking a list_head instead of a single skb. */
        void                        (*list_func) (struct list_head *,
                                              struct packet_type *,
                                              struct net_device *);
        bool                        (*id_match)(struct packet_type *ptype,
                                            struct sock *sk);
        struct net                *af_packet_net;
        void                        *af_packet_priv;
        struct list_head        list;
};

/* Callback set embedded in struct packet_offload: one GSO segmentation hook
 * and two GRO hooks (receive and complete).
 */
struct offload_callbacks {
        struct sk_buff                *(*gso_segment)(struct sk_buff *skb,
                                                netdev_features_t features);
        struct sk_buff                *(*gro_receive)(struct list_head *head,
                                                struct sk_buff *skb);
        int                        (*gro_complete)(struct sk_buff *skb, int nhoff);
};

/* Offload descriptor. Instances are passed to dev_add_offload() and
 * dev_remove_offload(), which are declared in this header.
 */
struct packet_offload {
        __be16                         type;        /* This is really htons(ether_type). */
        u16                         priority;
        struct offload_callbacks callbacks;
        struct list_head         list;
};

/* Often-modified stats are per-CPU; others are shared (netdev->stats).
 * The struct is aligned to 4 * sizeof(u64) bytes; updates to the counters
 * are bracketed by u64_stats_update_begin()/u64_stats_update_end() on syncp.
 */
struct pcpu_sw_netstats {
        u64     rx_packets;
        u64     rx_bytes;
        u64     tx_packets;
        u64     tx_bytes;
        struct u64_stats_sync   syncp;
} __aligned(4 * sizeof(u64));

/* Per-CPU packet and byte counters, updated by dev_lstats_add() under syncp.
 * The struct is aligned to 2 * sizeof(u64) bytes.
 */
struct pcpu_lstats {
        u64_stats_t packets;
        u64_stats_t bytes;
        struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

/* Declared here, defined elsewhere.
 * NOTE(review): presumably reports the device's lstats totals through
 * @packets and @bytes; confirm against the definition.
 */
void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);

/* Account one received packet of @len bytes in this CPU's dev->tstats:
 * rx_packets is incremented and rx_bytes grows by @len, inside a
 * u64_stats update section.
 */
static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_sw_netstats *cpu_stats = this_cpu_ptr(dev->tstats);

        u64_stats_update_begin(&cpu_stats->syncp);
        cpu_stats->rx_packets += 1;
        cpu_stats->rx_bytes += len;
        u64_stats_update_end(&cpu_stats->syncp);
}

/* Account one packet of @len bytes in this CPU's dev->lstats: packets is
 * incremented and bytes grows by @len, inside a u64_stats update section.
 */
static inline void dev_lstats_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_lstats *cpu_stats = this_cpu_ptr(dev->lstats);

        u64_stats_update_begin(&cpu_stats->syncp);
        u64_stats_inc(&cpu_stats->packets);
        u64_stats_add(&cpu_stats->bytes, len);
        u64_stats_update_end(&cpu_stats->syncp);
}

/*
 * __netdev_alloc_pcpu_stats - allocate per-CPU stats of the given type
 * @type: stats structure type; it must have a "syncp" member
 * @gfp:  allocation flags passed to alloc_percpu_gfp()
 *
 * Statement expression that evaluates to the __percpu pointer returned by
 * alloc_percpu_gfp(). When the allocation succeeds, u64_stats_init() is
 * called on the syncp member of every possible CPU's instance. When it
 * fails, the (NULL) allocation result is returned as is.
 */
#define __netdev_alloc_pcpu_stats(type, gfp)                                \
({                                                                        \
        typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
        if (pcpu_stats)        {                                                \
                int __cpu;                                                \
                for_each_possible_cpu(__cpu) {                                \
                        typeof(type) *stat;                                \
                        stat = per_cpu_ptr(pcpu_stats, __cpu);                \
                        u64_stats_init(&stat->syncp);                        \
                }                                                        \
        }                                                                \
        pcpu_stats;                                                        \
})

/* Same as __netdev_alloc_pcpu_stats() with GFP_KERNEL as the gfp argument. */
#define netdev_alloc_pcpu_stats(type)                                        \
        __netdev_alloc_pcpu_stats(type, GFP_KERNEL)

/* LAG transmit type; carried in struct netdev_lag_upper_info.tx_type.
 * NETDEV_LAG_TX_TYPE_UNKNOWN is the zero value.
 */
enum netdev_lag_tx_type {
        NETDEV_LAG_TX_TYPE_UNKNOWN,
        NETDEV_LAG_TX_TYPE_RANDOM,
        NETDEV_LAG_TX_TYPE_BROADCAST,
        NETDEV_LAG_TX_TYPE_ROUNDROBIN,
        NETDEV_LAG_TX_TYPE_ACTIVEBACKUP,
        NETDEV_LAG_TX_TYPE_HASH,
};

/* LAG hash type; carried in struct netdev_lag_upper_info.hash_type.
 * NETDEV_LAG_HASH_NONE is the zero value; NETDEV_LAG_HASH_UNKNOWN is the
 * last enumerator.
 */
enum netdev_lag_hash {
        NETDEV_LAG_HASH_NONE,
        NETDEV_LAG_HASH_L2,
        NETDEV_LAG_HASH_L34,
        NETDEV_LAG_HASH_L23,
        NETDEV_LAG_HASH_E23,
        NETDEV_LAG_HASH_E34,
        NETDEV_LAG_HASH_UNKNOWN,
};

/* Pairs a LAG transmit type with a LAG hash type.
 * NOTE(review): presumably passed as upper_info in
 * struct netdev_notifier_changeupper_info; confirm against callers.
 */
struct netdev_lag_upper_info {
        enum netdev_lag_tx_type tx_type;
        enum netdev_lag_hash hash_type;
};

/* Two one-bit flags packed into a single u8: link_up and tx_enabled.
 * NOTE(review): presumably passed as lower_state_info in
 * struct netdev_notifier_changelowerstate_info; confirm against callers.
 */
struct netdev_lag_lower_state_info {
        u8 link_up : 1,
           tx_enabled : 1;
};

#include <linux/notifier.h>

/* netdevice notifier chain events. Please remember to update
 * netdev_cmd_to_name() and the rtnetlink notification exclusion list in
 * rtnetlink_event() when adding new types.
 * Numbering starts at 1 (NETDEV_UP); each later enumerator takes the next
 * value.
 */
enum netdev_cmd {
        NETDEV_UP        = 1,        /* For now you can't veto a device up/down */
        NETDEV_DOWN,
        NETDEV_REBOOT,                /* Tell a protocol stack that a network
                                   interface detected a hardware crash and
                                   restarted - we can use this e.g. to kick
                                   tcp sessions once done */
        NETDEV_CHANGE,                /* Notify device state change */
        NETDEV_REGISTER,
        NETDEV_UNREGISTER,
        NETDEV_CHANGEMTU,        /* notify after mtu change happened */
        NETDEV_CHANGEADDR,        /* notify after the address change */
        NETDEV_PRE_CHANGEADDR,        /* notify before the address change */
        NETDEV_GOING_DOWN,
        NETDEV_CHANGENAME,
        NETDEV_FEAT_CHANGE,
        NETDEV_BONDING_FAILOVER,
        NETDEV_PRE_UP,
        NETDEV_PRE_TYPE_CHANGE,
        NETDEV_POST_TYPE_CHANGE,
        NETDEV_POST_INIT,
        NETDEV_RELEASE,
        NETDEV_NOTIFY_PEERS,
        NETDEV_JOIN,
        NETDEV_CHANGEUPPER,
        NETDEV_RESEND_IGMP,
        NETDEV_PRECHANGEMTU,        /* notify before mtu change happened */
        NETDEV_CHANGEINFODATA,
        NETDEV_BONDING_INFO,
        NETDEV_PRECHANGEUPPER,
        NETDEV_CHANGELOWERSTATE,
        NETDEV_UDP_TUNNEL_PUSH_INFO,
        NETDEV_UDP_TUNNEL_DROP_INFO,
        NETDEV_CHANGE_TX_QUEUE_LEN,
        NETDEV_CVLAN_FILTER_PUSH_INFO,
        NETDEV_CVLAN_FILTER_DROP_INFO,
        NETDEV_SVLAN_FILTER_PUSH_INFO,
        NETDEV_SVLAN_FILTER_DROP_INFO,
};
/* Name lookup for enum netdev_cmd values. Declared here, defined elsewhere. */
const char *netdev_cmd_to_name(enum netdev_cmd cmd);

/* Notifier registration and unregistration, in three flavours: without a
 * scope argument, with a struct net, and with a net_device plus a
 * struct netdev_net_notifier. Declared here, defined elsewhere.
 */
int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb);
int unregister_netdevice_notifier_net(struct net *net,
                                      struct notifier_block *nb);
int register_netdevice_notifier_dev_net(struct net_device *dev,
                                        struct notifier_block *nb,
                                        struct netdev_net_notifier *nn);
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
                                          struct notifier_block *nb,
                                          struct netdev_net_notifier *nn);

/* Base notifier payload: the device concerned and an optional extack.
 * The other netdev_notifier_*_info structs in this header embed it as
 * their first member.
 */
struct netdev_notifier_info {
        struct net_device        *dev;
        struct netlink_ext_ack        *extack;
};

/* Base notifier payload followed by a union of extra data; the union
 * currently holds only an mtu value.
 */
struct netdev_notifier_info_ext {
        struct netdev_notifier_info info; /* must be first */
        union {
                u32 mtu;
        } ext;
};

/* Base notifier payload plus a mask of the flags that changed. */
struct netdev_notifier_change_info {
        struct netdev_notifier_info info; /* must be first */
        unsigned int flags_changed;
};

/* Base notifier payload plus a description of an upper-device link or
 * unlink operation.
 */
struct netdev_notifier_changeupper_info {
        struct netdev_notifier_info info; /* must be first */
        struct net_device *upper_dev; /* new upper dev */
        bool master; /* is upper dev master */
        bool linking; /* is the notification for link or unlink */
        void *upper_info; /* upper dev info */
};

/* Base notifier payload plus an untyped pointer to lower-device state. */
struct netdev_notifier_changelowerstate_info {
        struct netdev_notifier_info info; /* must be first */
        void *lower_state_info; /* lower dev state info */
};

/* Base notifier payload plus a read-only pointer to a device address.
 * NOTE(review): presumably the address about to be set, delivered with
 * NETDEV_PRE_CHANGEADDR; confirm against the caller.
 */
struct netdev_notifier_pre_changeaddr_info {
        struct netdev_notifier_info info; /* must be first */
        const unsigned char *dev_addr;
};

/* Initialise @info: point it at @dev and clear its extack pointer. */
static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
                                             struct net_device *dev)
{
        info->extack = NULL;
        info->dev = dev;
}

/* Return the net_device stored in @info. */
static inline struct net_device *
netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
{
        struct net_device *dev = info->dev;

        return dev;
}

/* Return the extack pointer stored in @info (may be NULL). */
static inline struct netlink_ext_ack *
netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
{
        struct netlink_ext_ack *extack = info->extack;

        return extack;
}

/* Declared here, defined elsewhere.
 * NOTE(review): @val is presumably an enum netdev_cmd value; confirm
 * against the definition.
 */
int call_netdevice_notifiers(unsigned long val, struct net_device *dev);


/* Reader/writer lock for the device list; defined outside this header. */
extern rwlock_t                                dev_base_lock;                /* Device list lock */

/* Iterators over the net_device entries linked on @net's dev_base_head
 * through their dev_list member. Each one wraps the list_for_each_entry*()
 * helper of the same flavour (plain, reverse, rcu, safe, continue, ...).
 */
#define for_each_netdev(net, d)                \
                list_for_each_entry(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_reverse(net, d)        \
                list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_rcu(net, d)                \
                list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_safe(net, d, n)        \
                list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue(net, d)                \
                list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue_reverse(net, d)                \
                list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \
                                                     dev_list)
#define for_each_netdev_continue_rcu(net, d)                \
        list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
/* Walks init_net only and runs the body for devices whose master upper
 * device is @bond. The expansion ends in an unbraced "if", so an "else"
 * written directly after the loop body would bind to that hidden "if".
 */
#define for_each_netdev_in_bond_rcu(bond, slave)        \
                for_each_netdev_rcu(&init_net, slave)        \
                        if (netdev_master_upper_dev_get_rcu(slave) == (bond))
/* Convert a dev_list list_head pointer into its containing net_device. */
#define net_device_entry(lh)        list_entry(lh, struct net_device, dev_list)

/* Return the device that follows @dev on its namespace's device list, or
 * NULL when @dev is the last entry.
 */
static inline struct net_device *next_net_device(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct list_head *next = dev->dev_list.next;

        if (next == &net->dev_base_head)
                return NULL;

        return net_device_entry(next);
}

/* RCU flavour of next_net_device(): the next pointer is fetched with
 * rcu_dereference(). Returns NULL when @dev is the last entry.
 */
static inline struct net_device *next_net_device_rcu(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct list_head *next = rcu_dereference(list_next_rcu(&dev->dev_list));

        if (next == &net->dev_base_head)
                return NULL;

        return net_device_entry(next);
}

/* Return the first device on @net's device list, or NULL when the list is
 * empty.
 */
static inline struct net_device *first_net_device(struct net *net)
{
        if (list_empty(&net->dev_base_head))
                return NULL;

        return net_device_entry(net->dev_base_head.next);
}

/* RCU flavour of first_net_device(): the head's next pointer is fetched with
 * rcu_dereference(). Returns NULL when the list is empty.
 */
static inline struct net_device *first_net_device_rcu(struct net *net)
{
        struct list_head *first;

        first = rcu_dereference(list_next_rcu(&net->dev_base_head));
        if (first == &net->dev_base_head)
                return NULL;

        return net_device_entry(first);
}

/* Boot-setup helpers, device lookup by hardware address or hardware type,
 * and the add/remove entry points for struct packet_type and
 * struct packet_offload. Declared here, defined elsewhere.
 */
int netdev_boot_setup_check(struct net_device *dev);
unsigned long netdev_boot_base(const char *prefix, int unit);
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *hwaddr);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type);
void dev_add_pack(struct packet_type *pt);
void dev_remove_pack(struct packet_type *pt);
void __dev_remove_pack(struct packet_type *pt);
void dev_add_offload(struct packet_offload *po);
void dev_remove_offload(struct packet_offload *po);

/* Device lookup by flags or name, name allocation, open/close, and simple
 * TX queue pickers. Declared here, defined elsewhere.
 */
int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
                                      unsigned short mask);
struct net_device *dev_get_by_name(struct net *net, const char *name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
struct net_device *__dev_get_by_name(struct net *net, const char *name);
int dev_alloc_name(struct net_device *dev, const char *name);
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
void dev_close(struct net_device *dev);
void dev_close_many(struct list_head *head, bool unlink);
void dev_disable_lro(struct net_device *dev);
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);
u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
                       struct net_device *sb_dev);

/* Transmit entry points. Declared here, defined elsewhere.
 * __dev_direct_xmit() is wrapped by dev_direct_xmit() in this header.
 */
int dev_queue_xmit(struct sk_buff *skb);
int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id);

/* Call __dev_direct_xmit() and return its result. When dev_xmit_complete()
 * reports false for that result, the skb is freed with kfree_skb() first.
 */
static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
        int rc = __dev_direct_xmit(skb, queue_id);

        if (dev_xmit_complete(rc))
                return rc;

        kfree_skb(skb);
        return rc;
}

/* Device registration and unregistration. Declared here, defined elsewhere. */
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
/* Call unregister_netdevice_queue() for @dev with a NULL list head. */
static inline void unregister_netdevice(struct net_device *dev)
{
        struct list_head *no_head = NULL;

        unregister_netdevice_queue(dev, no_head);
}

/* Reference count read-out, device freeing, and dummy device setup.
 * Declared here, defined elsewhere.
 */
int netdev_refcnt_read(const struct net_device *dev);
void free_netdev(struct net_device *dev);
void netdev_freemem(struct net_device *dev);
int init_dummy_netdev(struct net_device *dev);

/* Device lookup by ifindex or NAPI id, name retrieval, restart, and the GRO
 * receive helpers. Declared here, defined elsewhere.
 */
struct net_device *netdev_get_xmit_slave(struct net_device *dev,
                                         struct sk_buff *skb,
                                         bool all_slaves);
struct net_device *dev_get_by_index(struct net *net, int ifindex);
struct net_device *__dev_get_by_index(struct net *net, int ifindex);
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);
int netdev_get_name(struct net *net, char *name, int ifindex);
int dev_restart(struct net_device *dev);
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);

/* Return the GRO data offset stored in the skb's GRO control block. */
static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
{
        const unsigned int gro_off = NAPI_GRO_CB(skb)->data_offset;

        return gro_off;
}

/* Return skb->len minus the GRO data offset. */
static inline unsigned int skb_gro_len(const struct sk_buff *skb)
{
        const unsigned int consumed = NAPI_GRO_CB(skb)->data_offset;

        return skb->len - consumed;
}

/* Advance the GRO data offset by @len; skb->data and skb->len are not
 * modified here.
 */
static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
{
        NAPI_GRO_CB(skb)->data_offset += len;
}

/* Return a pointer @offset bytes into the cached frag0 area.  No bounds
 * check is done here; callers pair this with skb_gro_header_hard().
 */
static inline void *skb_gro_header_fast(struct sk_buff *skb,
                                        unsigned int offset)
{
        void *frag_base = NAPI_GRO_CB(skb)->frag0;

        return frag_base + offset;
}

/* Return non-zero when @hlen bytes are not available in frag0, i.e. when
 * frag0_len is smaller than @hlen.
 */
static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen)
{
        return hlen > NAPI_GRO_CB(skb)->frag0_len;
}

/* Forget the cached frag0 window: zero its length and clear the pointer. */
static inline void skb_gro_frag0_invalidate(struct sk_buff *skb)
{
        NAPI_GRO_CB(skb)->frag0_len = 0;
        NAPI_GRO_CB(skb)->frag0 = NULL;
}

/* Slow path header access: make @hlen bytes available via pskb_may_pull(),
 * invalidate the frag0 cache and return skb->data + @offset.
 * Returns NULL when pskb_may_pull() fails.
 */
static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
                                        unsigned int offset)
{
        if (pskb_may_pull(skb, hlen)) {
                skb_gro_frag0_invalidate(skb);
                /* skb->data is read only after the pull has succeeded */
                return skb->data + offset;
        }

        return NULL;
}

/* Return the network header location: based on frag0 when it is set,
 * otherwise on skb->data, plus skb_network_offset().
 */
static inline void *skb_gro_network_header(struct sk_buff *skb)
{
        void *base = NAPI_GRO_CB(skb)->frag0;

        if (!base)
                base = skb->data;

        return base + skb_network_offset(skb);
}

/* When the GRO checksum is marked valid, subtract the checksum of the
 * @len bytes at @start from it; otherwise do nothing.
 */
static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
                                        const void *start, unsigned int len)
{
        if (!NAPI_GRO_CB(skb)->csum_valid)
                return;

        NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum,
                                          csum_partial(start, len, 0));
}

/* GRO checksum functions. These are logical equivalents of the normal
 * checksum functions (in skbuff.h) except that they operate on the GRO
 * offsets and fields in sk_buff.
 */

/* Defined out of line; used below as the fallback once the quick check
 * in __skb_gro_checksum_validate_complete() did not succeed.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb);

/* True when the current GRO offset equals the recorded gro_remcsum_start. */
static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
{
        return skb_gro_offset(skb) == NAPI_GRO_CB(skb)->gro_remcsum_start;
}

/* Decide whether the GRO checksum still has to be validated.
 *
 * Validation is skipped when any of the following holds:
 *  - the skb is CHECKSUM_PARTIAL and its checksum start is not before the
 *    current GRO offset,
 *  - the GRO offset sits at the remote checksum start,
 *  - csum_cnt is non-zero,
 *  - @zero_okay is set and @check is zero.
 */
static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
                                                      bool zero_okay,
                                                      __sum16 check)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            skb_checksum_start_offset(skb) >= skb_gro_offset(skb))
                return false;

        if (skb_at_gro_remcsum_start(skb))
                return false;

        if (NAPI_GRO_CB(skb)->csum_cnt != 0)
                return false;

        return !zero_okay || check;
}

/* Validate the checksum against pseudo-header sum @psum.
 *
 * Quick path: when the GRO checksum is valid and folding (@psum + csum)
 * yields zero, return 0.  Otherwise store @psum as the GRO checksum and
 * return the result of __skb_gro_checksum_complete().
 */
static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb,
                                                           __wsum psum)
{
        if (NAPI_GRO_CB(skb)->csum_valid) {
                __wsum total = csum_add(psum, NAPI_GRO_CB(skb)->csum);

                if (!csum_fold(total))
                        return 0;
        }

        NAPI_GRO_CB(skb)->csum = psum;

        return __skb_gro_checksum_complete(skb);
}

/* Account for one verified checksum: consume a pending csum_cnt credit
 * when there is one, else bump the skb's CHECKSUM_UNNECESSARY state.
 */
static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
{
        if (NAPI_GRO_CB(skb)->csum_cnt > 0) {
                /* Consume a checksum from CHECKSUM_UNNECESSARY */
                NAPI_GRO_CB(skb)->csum_cnt--;
                return;
        }

        /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we
         * verified a new top level checksum or an encapsulated one
         * during GRO. This saves work if we fallback to normal path.
         */
        __skb_incr_checksum_unnecessary(skb);
}

/* Statement-expression helper: run the full validation only when
 * __skb_gro_checksum_validate_needed() says so, using
 * compute_pseudo(skb, proto) as the pseudo-header sum.  When the result
 * is zero (validation passed or was skipped) the checksum is accounted
 * via skb_gro_incr_csum_unnecessary().  Evaluates to the __sum16 result.
 * Note: @skb is expanded several times, so it must be free of side effects.
 */
#define __skb_gro_checksum_validate(skb, proto, zero_okay, check,        \
                                    compute_pseudo)                        \
({                                                                        \
        __sum16 __ret = 0;                                                \
        if (__skb_gro_checksum_validate_needed(skb, zero_okay, check))        \
                __ret = __skb_gro_checksum_validate_complete(skb,        \
                                compute_pseudo(skb, proto));                \
        if (!__ret)                                                        \
                skb_gro_incr_csum_unnecessary(skb);                        \
        __ret;                                                                \
})

/* Validate without the zero-checksum shortcut (zero_okay = false). */
#define skb_gro_checksum_validate(skb, proto, compute_pseudo)                \
        __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo)

/* Validate with zero_okay = true: a zero @check skips validation. */
#define skb_gro_checksum_validate_zero_check(skb, proto, check,                \
                                             compute_pseudo)                \
        __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo)

/* Validate with proto 0 and null_compute_pseudo as the pseudo-header sum. */
#define skb_gro_checksum_simple_validate(skb)                                \
        __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo)

/* True when a checksum conversion may be done: csum_cnt is zero and the
 * GRO checksum is not already marked valid.
 */
static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb)
{
        if (NAPI_GRO_CB(skb)->csum_cnt != 0)
                return false;

        return !NAPI_GRO_CB(skb)->csum_valid;
}

/* Mark the GRO checksum valid and set it to the complement of @pseudo. */
static inline void __skb_gro_checksum_convert(struct sk_buff *skb,
                                              __wsum pseudo)
{
        NAPI_GRO_CB(skb)->csum_valid = 1;
        NAPI_GRO_CB(skb)->csum = ~pseudo;
}

/* Convert the GRO checksum using compute_pseudo(skb, proto), but only when
 * __skb_gro_checksum_convert_check() allows it.  compute_pseudo is not
 * evaluated when the check fails.
 */
#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo)        \
do {                                                                        \
        if (__skb_gro_checksum_convert_check(skb))                        \
                __skb_gro_checksum_convert(skb,                         \
                                           compute_pseudo(skb, proto));        \
} while (0)

/* State carried between skb_gro_remcsum_process() and
 * skb_gro_remcsum_cleanup().
 */
struct gro_remcsum {
        int offset;     /* off + hdrlen + offset of the adjusted checksum field */
        __wsum delta;   /* value returned by remcsum_adjust(); 0 means nothing to undo */
};

/* Reset @grc so that skb_gro_remcsum_cleanup() is a no-op (delta == 0). */
static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
{
        grc->delta = 0;
        grc->offset = 0;
}

/* Apply remote checksum offload during GRO.
 *
 * Requires a valid GRO checksum (BUG otherwise).  Without @nopartial only
 * gro_remcsum_start is recorded and @ptr is returned untouched.  With
 * @nopartial the header is (re)fetched so that the checksum field is
 * accessible, remcsum_adjust() is applied, the GRO checksum is updated by
 * the returned delta, and offset/delta are saved in @grc for
 * skb_gro_remcsum_cleanup().
 *
 * Returns the header pointer at @off, or NULL when the header cannot be
 * pulled.
 */
static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
                                            unsigned int off, size_t hdrlen,
                                            int start, int offset,
                                            struct gro_remcsum *grc,
                                            bool nopartial)
{
        size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
        __wsum delta;

        BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);

        if (!nopartial) {
                NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start;
                return ptr;
        }

        if (skb_gro_header_hard(skb, off + plen)) {
                ptr = skb_gro_header_slow(skb, off + plen, off);
                if (!ptr)
                        return NULL;
        } else {
                ptr = skb_gro_header_fast(skb, off);
        }

        delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum,
                               start, offset);

        /* Adjust skb->csum since we changed the packet */
        NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);

        grc->delta = delta;
        grc->offset = off + hdrlen + offset;

        return ptr;
}

/* Undo the adjustment recorded in @grc by skb_gro_remcsum_process().
 * Does nothing when grc->delta is zero or when the checksum field cannot
 * be made accessible.
 */
static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
                                           struct gro_remcsum *grc)
{
        size_t plen = grc->offset + sizeof(u16);
        void *ptr;

        if (!grc->delta)
                return;

        if (skb_gro_header_hard(skb, plen)) {
                ptr = skb_gro_header_slow(skb, plen, grc->offset);
                if (!ptr)
                        return;
        } else {
                ptr = skb_gro_header_fast(skb, grc->offset);
        }

        remcsum_unadjust((__sum16 *)ptr, grc->delta);
}

#ifdef CONFIG_XFRM_OFFLOAD
/* OR @flush into the GRO flush flag, unless @pp encodes -EINPROGRESS. */
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
        if (PTR_ERR(pp) == -EINPROGRESS)
                return;

        NAPI_GRO_CB(skb)->flush |= flush;
}

/* As skb_gro_flush_final(), and additionally undo the remote checksum
 * adjustment and clear skb->remcsum_offload.  Nothing is done when @pp
 * encodes -EINPROGRESS.
 */
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
                                               struct sk_buff *pp,
                                               int flush,
                                               struct gro_remcsum *grc)
{
        if (PTR_ERR(pp) == -EINPROGRESS)
                return;

        NAPI_GRO_CB(skb)->flush |= flush;
        skb_gro_remcsum_cleanup(skb, grc);
        skb->remcsum_offload = 0;
}
#else
/* Without CONFIG_XFRM_OFFLOAD @pp is ignored: always OR in @flush. */
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
        NAPI_GRO_CB(skb)->flush |= flush;
}

/* Without CONFIG_XFRM_OFFLOAD @pp is ignored: always OR in @flush, undo
 * the remote checksum adjustment and clear skb->remcsum_offload.
 */
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
                                               struct sk_buff *pp,
                                               int flush,
                                               struct gro_remcsum *grc)
{
        NAPI_GRO_CB(skb)->flush |= flush;
        skb_gro_remcsum_cleanup(skb, grc);
        skb->remcsum_offload = 0;
}
#endif

/* Invoke the device's header_ops->create() callback for @skb.
 * Returns the callback's result, or 0 when the device has no header_ops
 * or no create callback.
 */
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                                  unsigned short type,
                                  const void *daddr, const void *saddr,
                                  unsigned int len)
{
        if (dev->header_ops && dev->header_ops->create)
                return dev->header_ops->create(skb, dev, type, daddr,
                                               saddr, len);

        return 0;
}

/* Invoke header_ops->parse() of skb->dev with @haddr as output buffer.
 * Returns the callback's result, or 0 when no parse callback exists.
 * skb->dev is dereferenced unconditionally.
 */
static inline int dev_parse_header(const struct sk_buff *skb,
                                   unsigned char *haddr)
{
        const struct net_device *dev = skb->dev;

        if (dev->header_ops && dev->header_ops->parse)
                return dev->header_ops->parse(skb, haddr);

        return 0;
}

/* Invoke header_ops->parse_protocol() of skb->dev.
 * Returns the callback's result, or 0 when no such callback exists.
 * skb->dev is dereferenced unconditionally.
 */
static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb)
{
        const struct net_device *dev = skb->dev;

        if (dev->header_ops && dev->header_ops->parse_protocol)
                return dev->header_ops->parse_protocol(skb);

        return 0;
}

/* Check a caller-supplied link-layer header of @len bytes.
 *
 * ll_header must have at least hard_header_len allocated.
 *
 * - @len >= hard_header_len: accepted.
 * - @len <  min_header_len:  rejected.
 * - in between: a CAP_SYS_RAWIO caller gets the missing tail zero-filled
 *   and accepted; otherwise the verdict comes from header_ops->validate(),
 *   and the header is rejected when no such callback exists.
 */
static inline bool dev_validate_header(const struct net_device *dev,
                                       char *ll_header, int len)
{
        if (likely(len >= dev->hard_header_len))
                return true;

        if (len < dev->min_header_len)
                return false;

        if (capable(CAP_SYS_RAWIO)) {
                memset(ll_header + len, 0, dev->hard_header_len - len);
                return true;
        }

        if (!dev->header_ops || !dev->header_ops->validate)
                return false;

        return dev->header_ops->validate(ll_header, len);
}

/* True when the device provides a header_ops->create() callback. */
static inline bool dev_has_header(const struct net_device *dev)
{
        return dev->header_ops && dev->header_ops->create;
}

#ifdef CONFIG_NET_FLOW_LIMIT
/* Must be a power of two, and presumably small enough that a per-bucket
 * count cannot overflow the u8 buckets[] entries -- confirm against the
 * code that updates them.
 */
#define FLOW_LIMIT_HISTORY        (1 << 7)  /* must be ^2 and !overflow buckets */
/* Flow-limit state, referenced from softnet_data::flow_limit. */
struct sd_flow_limit {
        u64                        count;
        unsigned int                num_buckets;    /* presumably the length of buckets[] */
        unsigned int                history_head;
        u16                        history[FLOW_LIMIT_HISTORY];
        u8                        buckets[];      /* flexible array, sized at allocation */
};

extern int netdev_flow_limit_table_len;
#endif /* CONFIG_NET_FLOW_LIMIT */

/*
 * Incoming packets are placed on per-CPU queues
 *
 * One instance exists per CPU (see the DECLARE_PER_CPU_ALIGNED() that
 * follows the struct).  Several members only exist with CONFIG_RPS,
 * CONFIG_NET_FLOW_LIMIT or CONFIG_XFRM_OFFLOAD.
 */
struct softnet_data {
        struct list_head        poll_list;
        struct sk_buff_head        process_queue;

        /* stats */
        unsigned int                processed;
        unsigned int                time_squeeze;
        unsigned int                received_rps;
#ifdef CONFIG_RPS
        struct softnet_data        *rps_ipi_list;
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
        struct sd_flow_limit __rcu *flow_limit;
#endif
        struct Qdisc                *output_queue;
        struct Qdisc                **output_queue_tailp;
        struct sk_buff                *completion_queue;
#ifdef CONFIG_XFRM_OFFLOAD
        struct sk_buff_head        xfrm_backlog;
#endif
        /* written and read only by owning cpu: */
        struct {
                u16 recursion;  /* accessed via the dev_xmit_recursion*() helpers */
                u8  more;
        } xmit;
#ifdef CONFIG_RPS
        /* input_queue_head should be written by cpu owning this struct,
         * and only read by other cpus. Worth using a cache line.
         */
        unsigned int                input_queue_head ____cacheline_aligned_in_smp;

        /* Elements below can be accessed between CPUs for RPS/RFS */
        call_single_data_t        csd ____cacheline_aligned_in_smp;
        struct softnet_data        *rps_ipi_next;
        unsigned int                cpu;
        unsigned int                input_queue_tail;
#endif
        unsigned int                dropped;
        struct sk_buff_head        input_pkt_queue;
        struct napi_struct        backlog;

};

/* Increment sd->input_queue_head; compiles to nothing without CONFIG_RPS. */
static inline void input_queue_head_incr(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        sd->input_queue_head++;
#endif
}

/* Increment sd->input_queue_tail and store the new value in *@qtail.
 * Without CONFIG_RPS this is a no-op and *@qtail is left untouched.
 */
static inline void input_queue_tail_incr_save(struct softnet_data *sd,
                                              unsigned int *qtail)
{
#ifdef CONFIG_RPS
        sd->input_queue_tail += 1;
        *qtail = sd->input_queue_tail;
#endif
}

/* Per-CPU instance of struct softnet_data (declaration only). */
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);

/* Return this CPU's xmit recursion counter (read with this_cpu_read()). */
static inline int dev_recursion_level(void)
{
        return this_cpu_read(softnet_data.xmit.recursion);
}

/* Recursion depth above which dev_xmit_recursion() reports true. */
#define XMIT_RECURSION_LIMIT        8
/* True when this CPU's xmit recursion counter is strictly greater than
 * XMIT_RECURSION_LIMIT; hinted as the unlikely case.
 */
static inline bool dev_xmit_recursion(void)
{
        return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
                        XMIT_RECURSION_LIMIT);
}

/* Increment this CPU's xmit recursion counter. */
static inline void dev_xmit_recursion_inc(void)
{
        __this_cpu_inc(softnet_data.xmit.recursion);
}

/* Decrement this CPU's xmit recursion counter. */
static inline void dev_xmit_recursion_dec(void)
{
        __this_cpu_dec(softnet_data.xmit.recursion);
}

/* Scheduling entry points (defined out of line). */
void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);

/* Call netif_schedule_queue() on each of the device's num_tx_queues
 * transmit queues.
 */
static inline void netif_tx_schedule_all(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, qidx);

                netif_schedule_queue(txq);
        }
}

/* Clear the driver XOFF bit (__QUEUE_STATE_DRV_XOFF) of @dev_queue. */
static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
        clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_start_queue - allow transmit
 *        @dev: network device
 *
 *        Allow upper layers to call the device hard_start_xmit routine.
 *        Operates on transmit queue 0 of @dev.
 */
static inline void netif_start_queue(struct net_device *dev)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netif_tx_start_queue(txq);
}

/* Start each of the device's num_tx_queues transmit queues. */
static inline void netif_tx_start_all_queues(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++)
                netif_tx_start_queue(netdev_get_tx_queue(dev, qidx));
}

/* Defined out of line; the netif_wake_*() inline helpers build on it. */
void netif_tx_wake_queue(struct netdev_queue *dev_queue);

/**
 *        netif_wake_queue - restart transmit
 *        @dev: network device
 *
 *        Allow upper layers to call the device hard_start_xmit routine.
 *        Used for flow control when transmit resources are available.
 *        Operates on transmit queue 0 of @dev.
 */
static inline void netif_wake_queue(struct net_device *dev)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netif_tx_wake_queue(txq);
}

/* Wake each of the device's num_tx_queues transmit queues. */
static inline void netif_tx_wake_all_queues(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++)
                netif_tx_wake_queue(netdev_get_tx_queue(dev, qidx));
}

/* Set the driver XOFF bit (__QUEUE_STATE_DRV_XOFF) of @dev_queue. */
static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
{
        set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_stop_queue - stop transmitted packets
 *        @dev: network device
 *
 *        Stop upper layers calling the device hard_start_xmit routine.
 *        Used for flow control when transmit resources are unavailable.
 *        Operates on transmit queue 0 of @dev.
 */
static inline void netif_stop_queue(struct net_device *dev)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netif_tx_stop_queue(txq);
}

void netif_tx_stop_all_queues(struct net_device *dev);

/* True when the driver XOFF bit (__QUEUE_STATE_DRV_XOFF) is set. */
static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
{
        return test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_queue_stopped - test if transmit queue is flowblocked
 *        @dev: network device
 *
 *        Test if transmit queue on device is currently unable to send.
 *        Only transmit queue 0 of @dev is examined.
 */
static inline bool netif_queue_stopped(const struct net_device *dev)
{
        const struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        return netif_tx_queue_stopped(txq);
}

/* True when any bit of QUEUE_STATE_ANY_XOFF is set in the queue state. */
static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue)
{
        return dev_queue->state & QUEUE_STATE_ANY_XOFF;
}

/* True when any bit of QUEUE_STATE_ANY_XOFF_OR_FROZEN is set in the
 * queue state.
 */
static inline bool
netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue)
{
        return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN;
}

/* True when any bit of QUEUE_STATE_DRV_XOFF_OR_FROZEN is set in the
 * queue state.
 */
static inline bool
netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue)
{
        return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN;
}

/**
 *        netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
 *        @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their ndo_start_xmit(),
 * to give appropriate hint to the CPU.
 *
 * Prefetches dql.num_queued for write; a no-op without CONFIG_BQL.
 */
static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        prefetchw(&dev_queue->dql.num_queued);
#endif
}

/**
 *        netdev_txq_bql_complete_prefetchw - prefetch bql data for write
 *        @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their TX completion path,
 * to give appropriate hint to the CPU.
 *
 * Prefetches dql.limit for write; a no-op without CONFIG_BQL.
 */
static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        prefetchw(&dev_queue->dql.limit);
#endif
}

/* Account @bytes as queued on @dev_queue and set the stack XOFF bit when
 * the byte queue limit has no room left.  A no-op without CONFIG_BQL.
 */
static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                        unsigned int bytes)
{
#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);

        /* Common case: still room available, nothing else to do */
        if (likely(dql_avail(&dev_queue->dql) >= 0))
                return;

        set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);

        /*
         * The XOFF flag must be set before checking the dql_avail below,
         * because in netdev_tx_completed_queue we update the dql_completed
         * before checking the XOFF flag.
         */
        smp_mb();

        /* check again in case another CPU has just made room avail */
        if (unlikely(dql_avail(&dev_queue->dql) >= 0))
                clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);
#endif
}

/* Variant of netdev_tx_sent_queue() for drivers that are aware
 * that they should not test BQL status themselves.
 * We do want to change __QUEUE_STATE_STACK_XOFF only for the last
 * skb of a batch.
 *
 * With @xmit_more set, only the bytes are accounted (CONFIG_BQL) and the
 * return value is whether the queue is driver-stopped.  Without it the
 * full netdev_tx_sent_queue() runs and true is returned.
 *
 * Returns true if the doorbell must be used to kick the NIC.
 */
static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                          unsigned int bytes,
                                          bool xmit_more)
{
        if (!xmit_more) {
                netdev_tx_sent_queue(dev_queue, bytes);
                return true;
        }

#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);
#endif
        return netif_tx_queue_stopped(dev_queue);
}

/**
 *         netdev_sent_queue - report the number of bytes queued to hardware
 *         @dev: network device
 *         @bytes: number of bytes queued to the hardware device queue
 *
 *         Report the number of bytes queued for sending/completion to the network
 *         device hardware queue. @bytes should be a good approximation and should
 *         exactly match netdev_completed_queue() @bytes.
 *         Operates on transmit queue 0 of @dev.
 */
static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netdev_tx_sent_queue(txq, bytes);
}

/* Single-queue form of __netdev_tx_sent_queue(): applies it to transmit
 * queue 0 of @dev and returns its result.
 */
static inline bool __netdev_sent_queue(struct net_device *dev,
                                       unsigned int bytes,
                                       bool xmit_more)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        return __netdev_tx_sent_queue(txq, bytes, xmit_more);
}

/* Account @bytes as completed on @dev_queue and, once room is available
 * again, clear the stack XOFF bit and reschedule the queue if that bit was
 * set.  @pkts is not used by this implementation.  A no-op without
 * CONFIG_BQL or when @bytes is zero.
 */
static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
                                             unsigned int pkts, unsigned int bytes)
{
#ifdef CONFIG_BQL
        if (unlikely(!bytes))
                return;

        dql_completed(&dev_queue->dql, bytes);

        /*
         * Without the memory barrier there is a small possibility that
         * netdev_tx_sent_queue will miss the update and cause the queue to
         * be stopped forever
         */
        smp_mb();

        /* Still over the limit: leave the XOFF bit alone */
        if (unlikely(dql_avail(&dev_queue->dql) < 0))
                return;

        if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state))
                netif_schedule_queue(dev_queue);
#endif
}

/**
 *         netdev_completed_queue - report bytes and packets completed by device
 *         @dev: network device
 *         @pkts: actual number of packets sent over the medium
 *         @bytes: actual number of bytes sent over the medium
 *
 *         Report the number of bytes and packets transmitted by the network device
 *         hardware queue over the physical medium, @bytes must exactly match the
 *         @bytes amount passed to netdev_sent_queue().
 *         Operates on transmit queue 0 of @dev.
 */
static inline void netdev_completed_queue(struct net_device *dev,
                                          unsigned int pkts, unsigned int bytes)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netdev_tx_completed_queue(txq, pkts, bytes);
}

/* Clear the stack XOFF bit of @q and reset its dql state.
 * A no-op without CONFIG_BQL.
 */
static inline void netdev_tx_reset_queue(struct netdev_queue *q)
{
#ifdef CONFIG_BQL
        clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state);
        dql_reset(&q->dql);
#endif
}

/**
 *         netdev_reset_queue - reset the packets and bytes count of a network device
 *         @dev_queue: network device
 *
 *         Reset the bytes and packet count of a network device and clear the
 *         software flow control OFF bit for this network device.
 *         Operates on transmit queue 0 of the device.
 */
static inline void netdev_reset_queue(struct net_device *dev_queue)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev_queue, 0);

        netdev_tx_reset_queue(txq);
}

/**
 *         netdev_cap_txqueue - check if selected tx queue exceeds device queues
 *         @dev: network device
 *         @queue_index: given tx queue index
 *
 *         Returns 0 if given tx queue index >= number of device tx queues,
 *         otherwise returns the originally passed tx queue index.
 *         A rate-limited warning is logged in the out-of-range case.
 */
static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
        if (likely(queue_index < dev->real_num_tx_queues))
                return queue_index;

        net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
                             dev->name, queue_index,
                             dev->real_num_tx_queues);
        return 0;
}

/**
 *        netif_running - test if up
 *        @dev: network device
 *
 *        Test if the device has been brought up.
 *
 *        Returns true when __LINK_STATE_START is set in @dev->state.
 */
static inline bool netif_running(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_START, &dev->state);
}

/*
 * Routines to manage the subqueues on a device.  We only need start,
 * stop, and a check if it's stopped.  All other device management is
 * done at the overall netdevice level.
 * Also test the device if we're multiqueue.
 */

/**
 *        netif_start_subqueue - allow sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Start individual transmit queue of a device with multiple transmit queues.
 */
static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_start_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        netif_stop_subqueue - stop sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Stop individual transmit queue of a device with multiple transmit queues.
 */
static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_stop_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        __netif_subqueue_stopped - test status of subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Check individual transmit queue of a device with multiple transmit queues.
 *
 * Returns the result of netif_tx_queue_stopped() for that queue.
 */
static inline bool __netif_subqueue_stopped(const struct net_device *dev,
                                            u16 queue_index)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);

        return netif_tx_queue_stopped(txq);
}

static inline bool netif_subqueue_stopped(const struct net_device *dev,
                                          struct sk_buff *skb)
{
        return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
}

/**
 *        netif_wake_subqueue - allow sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Resume individual transmit queue of a device with multiple transmit queues.
 */
static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_wake_queue(netdev_get_tx_queue(dev, queue_index));
}

#ifdef CONFIG_XPS
/* XPS configuration entry points (defined out of line); inline stubs that
 * return 0 are provided in the #else branch when CONFIG_XPS is disabled.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index);
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
                          u16 index, bool is_rxqs_map);

/**
 *        netif_attr_test_mask - Test a CPU or Rx queue set in a mask
 *        @j: CPU/Rx queue index
 *        @mask: bitmask of all cpus/rx queues
 *        @nr_bits: number of bits in the bitmask
 *
 * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
 * @mask is dereferenced unconditionally, so it must not be NULL.
 */
static inline bool netif_attr_test_mask(unsigned long j,
                                        const unsigned long *mask,
                                        unsigned int nr_bits)
{
        cpu_max_bits_warn(j, nr_bits);
        return test_bit(j, mask);
}

/**
 *        netif_attr_test_online - Test for online CPU/Rx queue
 *        @j: CPU/Rx queue index
 *        @online_mask: bitmask for CPUs/Rx queues that are online
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns true if a CPU/Rx queue is online.  With a NULL @online_mask every
 * index below @nr_bits counts as online.
 */
static inline bool netif_attr_test_online(unsigned long j,
                                          const unsigned long *online_mask,
                                          unsigned int nr_bits)
{
        cpu_max_bits_warn(j, nr_bits);

        if (!online_mask)
                return (j < nr_bits);

        return test_bit(j, online_mask);
}

/**
 *        netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask
 *        @n: CPU/Rx queue index
 *        @srcp: the cpumask/Rx queue mask pointer
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns >= nr_bits if no further CPUs/Rx queues set.  With a NULL @srcp
 * the result is simply @n + 1.
 */
static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp,
                                               unsigned int nr_bits)
{
        /* -1 is a legal arg here. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        if (!srcp)
                return n + 1;

        return find_next_bit(srcp, nr_bits, n + 1);
}

/**
 *        netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p
 *        @n: CPU/Rx queue index
 *        @src1p: the first CPUs/Rx queues mask pointer
 *        @src2p: the second CPUs/Rx queues mask pointer
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns >= nr_bits if no further CPUs/Rx queues set in both.
 * A NULL mask is ignored: with one mask NULL the search uses the other
 * mask alone, and with both NULL the result is @n + 1.
 */
static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
                                          const unsigned long *src2p,
                                          unsigned int nr_bits)
{
        /* -1 is a legal arg here. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        if (!src1p && !src2p)
                return n + 1;

        if (!src1p)
                return find_next_bit(src2p, nr_bits, n + 1);

        if (!src2p)
                return find_next_bit(src1p, nr_bits, n + 1);

        return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
}
#else
/* Stub used when CONFIG_XPS is disabled: ignores its arguments, returns 0. */
static inline int netif_set_xps_queue(struct net_device *dev,
                                      const struct cpumask *mask,
                                      u16 index)
{
        return 0;
}

/* Stub used when CONFIG_XPS is disabled: ignores its arguments, returns 0. */
static inline int __netif_set_xps_queue(struct net_device *dev,
                                        const unsigned long *mask,
                                        u16 index, bool is_rxqs_map)
{
        return 0;
}
#endif

/**
 *        netif_is_multiqueue - test if device has multiple transmit queues
 *        @dev: network device
 *
 * Check if device has multiple transmit queues
 *
 * The test uses num_tx_queues, not real_num_tx_queues.
 */
static inline bool netif_is_multiqueue(const struct net_device *dev)
{
        return dev->num_tx_queues > 1;
}

/* Defined out of line. */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);

#ifdef CONFIG_SYSFS
/* Defined out of line when CONFIG_SYSFS is enabled. */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
#else
/* Without CONFIG_SYSFS: store @rxqs in dev->real_num_rx_queues without any
 * validation and return 0.
 */
static inline int netif_set_real_num_rx_queues(struct net_device *dev,
                                                unsigned int rxqs)
{
        dev->real_num_rx_queues = rxqs;
        return 0;
}
#endif

/* Return the address of entry @rxq in dev->_rx; @rxq is not range-checked. */
static inline struct netdev_rx_queue *
__netif_get_rx_queue(struct net_device *dev, unsigned int rxq)
{
        return &dev->_rx[rxq];
}

#ifdef CONFIG_SYSFS
/* Return the index of @queue within its device's _rx array, computed by
 * pointer subtraction.  BUGs when the index is not below num_rx_queues.
 */
static inline unsigned int get_netdev_rx_queue_index(
                struct netdev_rx_queue *queue)
{
        struct net_device *dev = queue->dev;
        int index = queue - dev->_rx;

        BUG_ON(index >= dev->num_rx_queues);
        return index;
}
#endif

/* Presumably the upper bound applied by netif_get_num_default_rss_queues(),
 * which is defined out of line -- confirm against its definition.
 */
#define DEFAULT_MAX_NUM_RSS_QUEUES        (8)
int netif_get_num_default_rss_queues(void);

/* Reason passed to __dev_kfree_skb_irq() / __dev_kfree_skb_any(). */
enum skb_free_reason {
        SKB_REASON_CONSUMED,    /* used by the dev_consume_skb_*() helpers */
        SKB_REASON_DROPPED,     /* used by the dev_kfree_skb_*() helpers */
};

/* Backends of the four inline helpers below (defined out of line). */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason);
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason);

/*
 * It is not allowed to call kfree_skb() or consume_skb() from hardware
 * interrupt context or with hardware interrupts being disabled.
 * (in_irq() || irqs_disabled())
 *
 * We provide four helpers that can be used in following contexts :
 *
 * dev_kfree_skb_irq(skb) when caller drops a packet from irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_irq(skb) when caller consumes a packet from irq context.
 *  Typically used in place of consume_skb(skb) in TX completion path
 *
 * dev_kfree_skb_any(skb) when caller doesn't know its current irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_any(skb) when caller doesn't know its current irq context,
 *  and consumed a packet. Used in place of consume_skb(skb)
 */
/* IRQ-context free of a dropped packet (SKB_REASON_DROPPED). */
static inline void dev_kfree_skb_irq(struct sk_buff *skb)
{
        __dev_kfree_skb_irq(skb, SKB_REASON_DROPPED);
}

/* IRQ-context free of a consumed packet (SKB_REASON_CONSUMED). */
static inline void dev_consume_skb_irq(struct sk_buff *skb)
{
        __dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED);
}

/* Any-context free of a dropped packet (SKB_REASON_DROPPED). */
static inline void dev_kfree_skb_any(struct sk_buff *skb)
{
        __dev_kfree_skb_any(skb, SKB_REASON_DROPPED);
}

/* Any-context free of a consumed packet (SKB_REASON_CONSUMED). */
static inline void dev_consume_skb_any(struct sk_buff *skb)
{
        __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
}

/* Receive-path, generic XDP and GRO entry points (defined out of line). */
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_rx_any_context(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
int netif_receive_skb_core(struct sk_buff *skb);
void netif_receive_skb_list(struct list_head *head);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
struct sk_buff *napi_get_frags(struct napi_struct *napi);
gro_result_t napi_gro_frags(struct napi_struct *napi);
struct packet_offload *gro_find_receive_by_type(__be16 type);
struct packet_offload *gro_find_complete_by_type(__be16 type);

/*
 * Release the skb currently stored in @napi->skb with kfree_skb() and
 * clear the pointer so it is not left referring to the freed buffer.
 */
static inline void napi_free_frags(struct napi_struct *napi)
{
        kfree_skb(napi->skb);
        napi->skb = NULL;
}

bool netdev_is_rx_handler_busy(struct net_device *dev);
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data);
void netdev_rx_handler_unregister(struct net_device *dev);

bool dev_valid_name(const char *name);
/* Return true when the type field of ioctl number @cmd is SOCK_IOC_TYPE. */
static inline bool is_socket_ioctl_cmd(unsigned int cmd)
{
        return _IOC_TYPE(cmd) == SOCK_IOC_TYPE;
}
int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
                bool *need_copyout);
int dev_ifconf(struct net *net, struct ifconf *, int);
int dev_ethtool(struct net *net, struct ifreq *);
unsigned int dev_get_flags(const struct net_device *);
int __dev_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack);
int dev_change_flags(struct net_device *dev, unsigned int flags,
                     struct netlink_ext_ack *extack);
void __dev_notify_flags(struct net_device *, unsigned int old_flags,
                        unsigned int gchanges);
int dev_change_name(struct net_device *, const char *);
int dev_set_alias(struct net_device *, const char *, size_t);
int dev_get_alias(const struct net_device *, char *, size_t);
int dev_change_net_namespace(struct net_device *, struct net *, const char *);
int __dev_set_mtu(struct net_device *, int);
int dev_validate_mtu(struct net_device *dev, int mtu,
                     struct netlink_ext_ack *extack);
int dev_set_mtu_ext(struct net_device *dev, int mtu,
                    struct netlink_ext_ack *extack);
int dev_set_mtu(struct net_device *, int);
int dev_change_tx_queue_len(struct net_device *, unsigned long);
void dev_set_group(struct net_device *, int);
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
                              struct netlink_ext_ack *extack);
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                        struct netlink_ext_ack *extack);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
                             struct netlink_ext_ack *extack);
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
int dev_change_carrier(struct net_device *, bool new_carrier);
int dev_get_phys_port_id(struct net_device *dev,
                         struct netdev_phys_item_id *ppid);
int dev_get_phys_port_name(struct net_device *dev,
                           char *name, size_t len);
int dev_get_port_parent_id(struct net_device *dev,
                           struct netdev_phys_item_id *ppid, bool recurse);
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);
int dev_change_proto_down(struct net_device *dev, bool proto_down);
int dev_change_proto_down_generic(struct net_device *dev, bool proto_down);
void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
                                  u32 value);
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret);

typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                      int fd, int expected_fd, u32 flags);
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);

int xdp_umem_query(struct net_device *dev, u16 queue_id);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
                        const struct sk_buff *skb);

/*
 * Prepare @skb for being forwarded to @dev.
 *
 * Returns 0 after scrubbing the packet and resetting skb->priority.
 * If skb_orphan_frags() fails or is_skb_forwardable() rejects the skb,
 * @dev->rx_dropped is incremented, the skb is freed and NET_RX_DROP is
 * returned instead.
 */
static __always_inline int ____dev_forward_skb(struct net_device *dev,
                                               struct sk_buff *skb)
{
        if (skb_orphan_frags(skb, GFP_ATOMIC))
                goto drop;
        if (unlikely(!is_skb_forwardable(dev, skb)))
                goto drop;

        /* second argument is true when the skb crosses network namespaces */
        skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev)));
        skb->priority = 0;
        return 0;

drop:
        atomic_long_inc(&dev->rx_dropped);
        kfree_skb(skb);
        return NET_RX_DROP;
}

bool dev_nit_active(struct net_device *dev);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);

extern int                netdev_budget;
extern unsigned int        netdev_budget_usecs;

/* Called by rtnetlink.c:rtnl_unlock() */
void netdev_run_todo(void);

/**
 *        dev_put - release reference to device
 *        @dev: network device, may be NULL
 *
 * Drops one reference by decrementing @dev->pcpu_refcnt with
 * this_cpu_dec().  A NULL @dev is silently ignored.
 */
static inline void dev_put(struct net_device *dev)
{
        if (!dev)
                return;

        this_cpu_dec(*dev->pcpu_refcnt);
}

/**
 *        dev_hold - get reference to device
 *        @dev: network device, may be NULL
 *
 * Takes one reference by incrementing @dev->pcpu_refcnt with
 * this_cpu_inc().  A NULL @dev is silently ignored.
 */
static inline void dev_hold(struct net_device *dev)
{
        if (!dev)
                return;

        this_cpu_inc(*dev->pcpu_refcnt);
}

/* Carrier loss detection, dial on demand. The functions netif_carrier_on
 * and _off may be called from IRQ context, but it is caller
 * who is responsible for serialization of these calls.
 *
 * The name carrier is inappropriate, these functions should really be
 * called netif_lowerlayer_*() because they represent the state of any
 * kind of lower layer not just hardware media.
 */

void linkwatch_init_dev(struct net_device *dev);
void linkwatch_fire_event(struct net_device *dev);
void linkwatch_forget_dev(struct net_device *dev);

/**
 *        netif_carrier_ok - test if carrier present
 *        @dev: network device
 *
 * Check if carrier is present on device
 *
 * Return: true when __LINK_STATE_NOCARRIER is clear in @dev->state.
 */
static inline bool netif_carrier_ok(const struct net_device *dev)
{
        return !test_bit(__LINK_STATE_NOCARRIER, &dev->state);
}

unsigned long dev_trans_start(struct net_device *dev);

void __netdev_watchdog_up(struct net_device *dev);

void netif_carrier_on(struct net_device *dev);

void netif_carrier_off(struct net_device *dev);

/**
 *        netif_dormant_on - mark device as dormant.
 *        @dev: network device
 *
 * Mark device as dormant (as per RFC2863).
 *
 * The dormant state indicates that the relevant interface is not
 * actually in a condition to pass packets (i.e., it is not 'up') but is
 * in a "pending" state, waiting for some external event.  For "on-
 * demand" interfaces, this new state identifies the situation where the
 * interface is waiting for events to place it in the up state.
 *
 * A linkwatch event is fired only when the bit was not already set.
 */
static inline void netif_dormant_on(struct net_device *dev)
{
        /* bit was already set: state unchanged, no event needed */
        if (test_and_set_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_dormant_off - set device as not dormant.
 *        @dev: network device
 *
 * Clears __LINK_STATE_DORMANT in @dev->state.  A linkwatch event is
 * fired only when the bit was actually set before the call.
 */
static inline void netif_dormant_off(struct net_device *dev)
{
        /* bit was already clear: state unchanged, no event needed */
        if (!test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_dormant - test if device is dormant
 *        @dev: network device
 *
 * Check if device is dormant.
 *
 * Return: true when __LINK_STATE_DORMANT is set in @dev->state.
 */
static inline bool netif_dormant(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_DORMANT, &dev->state);
}


/**
 *        netif_testing_on - mark device as under test.
 *        @dev: network device
 *
 * Mark device as under test (as per RFC2863).
 *
 * The testing state indicates that some test(s) must be performed on
 * the interface. After completion of the test(s), the interface state
 * will change to up, dormant, or down, as appropriate.
 *
 * A linkwatch event is fired only when the bit was not already set.
 */
static inline void netif_testing_on(struct net_device *dev)
{
        /* bit was already set: state unchanged, no event needed */
        if (test_and_set_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_testing_off - set device as not under test.
 *        @dev: network device
 *
 * Clears __LINK_STATE_TESTING in @dev->state.  A linkwatch event is
 * fired only when the bit was actually set before the call.
 */
static inline void netif_testing_off(struct net_device *dev)
{
        /* bit was already clear: state unchanged, no event needed */
        if (!test_and_clear_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_testing - test if device is under test
 *        @dev: network device
 *
 * Check if device is under test
 *
 * Return: true when __LINK_STATE_TESTING is set in @dev->state.
 */
static inline bool netif_testing(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_TESTING, &dev->state);
}


/**
 *        netif_oper_up - test if device is operational
 *        @dev: network device
 *
 * Return: true when @dev->operstate is IF_OPER_UP, or IF_OPER_UNKNOWN
 * (accepted for backward compatibility); false for any other state.
 */
static inline bool netif_oper_up(const struct net_device *dev)
{
        switch (dev->operstate) {
        case IF_OPER_UP:
        case IF_OPER_UNKNOWN:        /* backward compat */
                return true;
        default:
                return false;
        }
}

/**
 *        netif_device_present - is device available or removed
 *        @dev: network device
 *
 * Check if device has not been removed from system.
 */
static inline bool netif_device_present(struct net_device *dev)
{
        return test_bit(__LINK_STATE_PRESENT, &dev->state);
}

void netif_device_detach(struct net_device *dev);

void netif_device_attach(struct net_device *dev);

/*
 * Network interface message level settings
 */

/*
 * Bit positions of the network interface message classes.  The
 * NETIF_MSG_* masks are built from these names by __NETIF_MSG().
 */
enum {
        NETIF_MSG_DRV_BIT = 0,          /* bit  0 */
        NETIF_MSG_PROBE_BIT,            /* bit  1 */
        NETIF_MSG_LINK_BIT,             /* bit  2 */
        NETIF_MSG_TIMER_BIT,            /* bit  3 */
        NETIF_MSG_IFDOWN_BIT,           /* bit  4 */
        NETIF_MSG_IFUP_BIT,             /* bit  5 */
        NETIF_MSG_RX_ERR_BIT,           /* bit  6 */
        NETIF_MSG_TX_ERR_BIT,           /* bit  7 */
        NETIF_MSG_TX_QUEUED_BIT,        /* bit  8 */
        NETIF_MSG_INTR_BIT,             /* bit  9 */
        NETIF_MSG_TX_DONE_BIT,          /* bit 10 */
        NETIF_MSG_RX_STATUS_BIT,        /* bit 11 */
        NETIF_MSG_PKTDATA_BIT,          /* bit 12 */
        NETIF_MSG_HW_BIT,               /* bit 13 */
        NETIF_MSG_WOL_BIT,              /* bit 14 */

        /* When you add a new bit above, update netif_msg_class_names array
         * in net/ethtool/common.c
         */
        NETIF_MSG_CLASS_COUNT,          /* number of classes, not a bit */
};
/* Both ethtool_ops interface and internal driver implementation use u32 */
static_assert(NETIF_MSG_CLASS_COUNT <= 32);

/* Turn a bit index into a u32 mask with only that bit set. */
#define __NETIF_MSG_BIT(bit)        ((u32)1 << (bit))
/* Mask for class @name, found by token-pasting NETIF_MSG_<name>_BIT. */
#define __NETIF_MSG(name)        __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT)

/* One mask per message class. */
#define NETIF_MSG_DRV                __NETIF_MSG(DRV)
#define NETIF_MSG_PROBE                __NETIF_MSG(PROBE)
#define NETIF_MSG_LINK                __NETIF_MSG(LINK)
#define NETIF_MSG_TIMER                __NETIF_MSG(TIMER)
#define NETIF_MSG_IFDOWN        __NETIF_MSG(IFDOWN)
#define NETIF_MSG_IFUP                __NETIF_MSG(IFUP)
#define NETIF_MSG_RX_ERR        __NETIF_MSG(RX_ERR)
#define NETIF_MSG_TX_ERR        __NETIF_MSG(TX_ERR)
#define NETIF_MSG_TX_QUEUED        __NETIF_MSG(TX_QUEUED)
#define NETIF_MSG_INTR                __NETIF_MSG(INTR)
#define NETIF_MSG_TX_DONE        __NETIF_MSG(TX_DONE)
#define NETIF_MSG_RX_STATUS        __NETIF_MSG(RX_STATUS)
#define NETIF_MSG_PKTDATA        __NETIF_MSG(PKTDATA)
#define NETIF_MSG_HW                __NETIF_MSG(HW)
#define NETIF_MSG_WOL                __NETIF_MSG(WOL)

/*
 * Predicates: each evaluates to non-zero when the matching class bit is
 * set in (p)->msg_enable.  @p is any pointer to an object that has a
 * msg_enable member.
 */
#define netif_msg_drv(p)        ((p)->msg_enable & NETIF_MSG_DRV)
#define netif_msg_probe(p)        ((p)->msg_enable & NETIF_MSG_PROBE)
#define netif_msg_link(p)        ((p)->msg_enable & NETIF_MSG_LINK)
#define netif_msg_timer(p)        ((p)->msg_enable & NETIF_MSG_TIMER)
#define netif_msg_ifdown(p)        ((p)->msg_enable & NETIF_MSG_IFDOWN)
#define netif_msg_ifup(p)        ((p)->msg_enable & NETIF_MSG_IFUP)
#define netif_msg_rx_err(p)        ((p)->msg_enable & NETIF_MSG_RX_ERR)
#define netif_msg_tx_err(p)        ((p)->msg_enable & NETIF_MSG_TX_ERR)
#define netif_msg_tx_queued(p)        ((p)->msg_enable & NETIF_MSG_TX_QUEUED)
#define netif_msg_intr(p)        ((p)->msg_enable & NETIF_MSG_INTR)
#define netif_msg_tx_done(p)        ((p)->msg_enable & NETIF_MSG_TX_DONE)
#define netif_msg_rx_status(p)        ((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p)        ((p)->msg_enable & NETIF_MSG_PKTDATA)
#define netif_msg_hw(p)                ((p)->msg_enable & NETIF_MSG_HW)
#define netif_msg_wol(p)        ((p)->msg_enable & NETIF_MSG_WOL)

/*
 * Translate a "debug level" into a message-enable bitmap.
 *
 * @debug_value:             requested level
 * @default_msg_enable_bits: bitmap returned when the level is out of range
 *
 * Returns 0 for level 0, a mask with the low @debug_value bits set for
 * levels 1 .. 31, and @default_msg_enable_bits for a negative level or a
 * level of 32 (the width of u32) or more.
 */
static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
{
        const int width = sizeof(u32) * 8;

        if (debug_value == 0)        /* no output */
                return 0;

        /* in range: set low N bits */
        if (debug_value > 0 && debug_value < width)
                return (1U << debug_value) - 1;

        /* out of range: use default */
        return default_msg_enable_bits;
}

/*
 * Take @txq->_xmit_lock with spin_lock() and then record @cpu as the
 * lock owner.  The owner field is written only once the lock is held.
 */
static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
{
        spin_lock(&txq->_xmit_lock);
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, cpu);
}

/*
 * Counterpart of the lock/trylock helpers used by HARD_TX_LOCK() and
 * HARD_TX_TRYLOCK() when the device has NETIF_F_LLTX set: it only invokes
 * __acquire() on @txq->_xmit_lock and always returns true.
 * NOTE(review): __acquire() is presumably a static-checker annotation that
 * takes no real lock -- confirm against its definition.
 */
static inline bool __netif_tx_acquire(struct netdev_queue *txq)
{
        __acquire(&txq->_xmit_lock);
        return true;
}

/*
 * Counterpart of __netif_tx_acquire(), used by HARD_TX_UNLOCK() when the
 * device has NETIF_F_LLTX set: it only invokes __release() on
 * @txq->_xmit_lock.
 * NOTE(review): __release() is presumably a static-checker annotation that
 * drops no real lock -- confirm against its definition.
 */
static inline void __netif_tx_release(struct netdev_queue *txq)
{
        __release(&txq->_xmit_lock);
}

/*
 * Take @txq->_xmit_lock with spin_lock_bh() and then record the current
 * CPU (smp_processor_id()) as the lock owner.
 */
static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
{
        spin_lock_bh(&txq->_xmit_lock);
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
}

/*
 * Try to take @txq->_xmit_lock without waiting.
 *
 * Returns true and records the current CPU as lock owner on success;
 * returns false, leaving the owner untouched, when the lock is busy.
 */
static inline bool __netif_tx_trylock(struct netdev_queue *txq)
{
        if (unlikely(!spin_trylock(&txq->_xmit_lock)))
                return false;

        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
        return true;
}

/*
 * Reset the recorded owner of @txq to -1 (no owner) and then release
 * @txq->_xmit_lock with spin_unlock().  The owner is cleared while the
 * lock is still held.
 */
static inline void __netif_tx_unlock(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);
        spin_unlock(&txq->_xmit_lock);
}

/*
 * Reset the recorded owner of @txq to -1 (no owner) and then release
 * @txq->_xmit_lock with spin_unlock_bh().
 */
static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);
        spin_unlock_bh(&txq->_xmit_lock);
}

/*
 * Stamp @txq->trans_start with the current jiffies, but only while an
 * owner is recorded for the queue's xmit lock (xmit_lock_owner != -1).
 */
static inline void txq_trans_update(struct netdev_queue *txq)
{
        /* -1 means no owner is recorded: leave the timestamp alone */
        if (txq->xmit_lock_owner == -1)
                return;

        txq->trans_start = jiffies;
}

/*
 * legacy drivers only, netdev_start_xmit() sets txq->trans_start
 *
 * Refreshes trans_start of TX queue 0 of @dev; the store is skipped when
 * the field already equals jiffies.
 */
static inline void netif_trans_update(struct net_device *dev)
{
        struct netdev_queue *first_txq = netdev_get_tx_queue(dev, 0);

        if (first_txq->trans_start == jiffies)
                return;

        first_txq->trans_start = jiffies;
}

/**
 *        netif_tx_lock - grab network device transmit lock
 *        @dev: network device
 *
 * Takes @dev->tx_global_lock and sets __QUEUE_STATE_FROZEN on every TX
 * queue of @dev.  Each queue's _xmit_lock is taken only briefly while the
 * bit is set; tx_global_lock is still held on return and is dropped by
 * netif_tx_unlock().
 */
static inline void netif_tx_lock(struct net_device *dev)
{
        unsigned int qidx = 0;
        int this_cpu;

        spin_lock(&dev->tx_global_lock);
        this_cpu = smp_processor_id();

        while (qidx < dev->num_tx_queues) {
                struct netdev_queue *queue = netdev_get_tx_queue(dev, qidx);

                /* We are the only thread of execution doing a
                 * freeze, but we have to grab the _xmit_lock in
                 * order to synchronize with threads which are in
                 * the ->hard_start_xmit() handler and already
                 * checked the frozen bit.
                 */
                __netif_tx_lock(queue, this_cpu);
                set_bit(__QUEUE_STATE_FROZEN, &queue->state);
                __netif_tx_unlock(queue);

                qidx++;
        }
}

/*
 * Same as netif_tx_lock(), but calls local_bh_disable() first.
 * Undone by netif_tx_unlock_bh().
 */
static inline void netif_tx_lock_bh(struct net_device *dev)
{
        local_bh_disable();
        netif_tx_lock(dev);
}

/*
 * Undo netif_tx_lock(): clear __QUEUE_STATE_FROZEN on every TX queue of
 * @dev, call netif_schedule_queue() on each of them, then release
 * @dev->tx_global_lock.
 */
static inline void netif_tx_unlock(struct net_device *dev)
{
        unsigned int qidx = 0;

        while (qidx < dev->num_tx_queues) {
                struct netdev_queue *queue = netdev_get_tx_queue(dev, qidx);

                /* No need to grab the _xmit_lock here.  If the
                 * queue is not stopped for another reason, we
                 * force a schedule.
                 */
                clear_bit(__QUEUE_STATE_FROZEN, &queue->state);
                netif_schedule_queue(queue);

                qidx++;
        }

        spin_unlock(&dev->tx_global_lock);
}

/*
 * Undo netif_tx_lock_bh(): netif_tx_unlock() followed by
 * local_bh_enable(), i.e. the reverse of the order used when locking.
 */
static inline void netif_tx_unlock_bh(struct net_device *dev)
{
        netif_tx_unlock(dev);
        local_bh_enable();
}

/*
 * Lock @txq for transmit on behalf of @cpu.  Devices without
 * NETIF_F_LLTX get the real per-queue lock via __netif_tx_lock(); for
 * NETIF_F_LLTX devices only __netif_tx_acquire() is invoked.
 * @dev is parenthesized so that any pointer expression may be passed.
 */
#define HARD_TX_LOCK(dev, txq, cpu) {                        \
        if (((dev)->features & NETIF_F_LLTX) == 0) {        \
                __netif_tx_lock(txq, cpu);                \
        } else {                                        \
                __netif_tx_acquire(txq);                \
        }                                                \
}

/*
 * Try-lock variant of HARD_TX_LOCK().  Evaluates to the result of
 * __netif_tx_trylock() for devices without NETIF_F_LLTX, and to the
 * result of __netif_tx_acquire() otherwise.
 * @dev is parenthesized so that any pointer expression may be passed.
 */
#define HARD_TX_TRYLOCK(dev, txq)                        \
        ((((dev)->features & NETIF_F_LLTX) == 0) ?        \
                __netif_tx_trylock(txq) :                \
                __netif_tx_acquire(txq))

/*
 * Undo HARD_TX_LOCK() / a successful HARD_TX_TRYLOCK(): calls
 * __netif_tx_unlock() for devices without NETIF_F_LLTX and
 * __netif_tx_release() otherwise.
 * @dev is parenthesized so that any pointer expression may be passed.
 */
#define HARD_TX_UNLOCK(dev, txq) {                        \
        if (((dev)->features & NETIF_F_LLTX) == 0) {        \
                __netif_tx_unlock(txq);                        \
        } else {                                        \
                __netif_tx_release(txq);                \
        }                                                \
}

/*
 * Stop every TX queue of @dev.
 *
 * Runs with bottom halves disabled and @dev->tx_global_lock held; each
 * queue is stopped with netif_tx_stop_queue() while its own _xmit_lock
 * is held.  Both the global lock and the BH-disable are undone before
 * returning.
 */
static inline void netif_tx_disable(struct net_device *dev)
{
        unsigned int qidx = 0;
        int this_cpu;

        local_bh_disable();
        this_cpu = smp_processor_id();
        spin_lock(&dev->tx_global_lock);

        while (qidx < dev->num_tx_queues) {
                struct netdev_queue *queue = netdev_get_tx_queue(dev, qidx);

                __netif_tx_lock(queue, this_cpu);
                netif_tx_stop_queue(queue);
                __netif_tx_unlock(queue);

                qidx++;
        }

        spin_unlock(&dev->tx_global_lock);
        local_bh_enable();
}

/*
 * Take @dev->addr_list_lock with spin_lock_nested().  The subclass passed
 * is @dev->nested_level when CONFIG_LOCKDEP is set and 0 otherwise.
 */
static inline void netif_addr_lock(struct net_device *dev)
{
#ifdef CONFIG_LOCKDEP
        const unsigned char subclass = dev->nested_level;
#else
        const unsigned char subclass = 0;
#endif

        spin_lock_nested(&dev->addr_list_lock, subclass);
}

/*
 * Like netif_addr_lock(), but calls local_bh_disable() before taking
 * @dev->addr_list_lock.  Released with netif_addr_unlock_bh().
 */
static inline void netif_addr_lock_bh(struct net_device *dev)
{
#ifdef CONFIG_LOCKDEP
        const unsigned char subclass = dev->nested_level;
#else
        const unsigned char subclass = 0;
#endif

        local_bh_disable();
        spin_lock_nested(&dev->addr_list_lock, subclass);
}

/* Release @dev->addr_list_lock taken by netif_addr_lock(). */
static inline void netif_addr_unlock(struct net_device *dev)
{
        spin_unlock(&dev->addr_list_lock);
}

/*
 * Release @dev->addr_list_lock with spin_unlock_bh(); this is the
 * counterpart of netif_addr_lock_bh(), which does local_bh_disable()
 * followed by spin_lock_nested().
 */
static inline void netif_addr_unlock_bh(struct net_device *dev)
{
        spin_unlock_bh(&dev->addr_list_lock);
}

/*
 * dev_addrs walker. Should be used only for read access. Call with
 * rcu_read_lock held.
 *
 * @dev: device whose dev_addrs.list is walked (any pointer expression;
 *       it is parenthesized in the expansion)
 * @ha:  loop cursor handed to list_for_each_entry_rcu()
 */
#define for_each_dev_addr(dev, ha) \
                list_for_each_entry_rcu(ha, &(dev)->dev_addrs.list, list)

/* These functions live elsewhere (drivers/net/net_init.c, but related) */

void ether_setup(struct net_device *dev);

/* Support for loadable net-drivers */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                                    unsigned char name_assign_type,
                                    void (*setup)(struct net_device *),
                                    unsigned int txqs, unsigned int rxqs);
/* alloc_netdev_mqs() with txqs = 1 and rxqs = 1. */
#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)

/* alloc_netdev_mqs() with @count used for both txqs and rxqs. */
#define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \
                         count)

int register_netdev(struct net_device *dev);
void unregister_netdev(struct net_device *dev);

int devm_register_netdev(struct device *dev, struct net_device *ndev);

/* General hardware address lists handling functions */
int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
                   struct netdev_hw_addr_list *from_list, int addr_len);
void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
                      struct netdev_hw_addr_list *from_list, int addr_len);
int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
                       struct net_device *dev,
                       int (*sync)(struct net_device *, const unsigned char *),
                       int (*unsync)(struct net_device *,
                                     const unsigned char *));
int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
                           struct net_device *dev,
                           int (*sync)(struct net_device *,
                                       const unsigned char *, int),
                           int (*unsync)(struct net_device *,
                                         const unsigned char *, int));
void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
                              struct net_device *dev,
                              int (*unsync)(struct net_device *,
                                            const unsigned char *, int));
void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
                          struct net_device *dev,
                          int (*unsync)(struct net_device *,
                                        const unsigned char *));
void __hw_addr_init(struct netdev_hw_addr_list *list);

/* Functions used for device addresses handling */
/*
 * Copy @len bytes from @addr into @dev->dev_addr with memcpy().
 * No bounds check is done here; the caller chooses @len.
 */
static inline void
__dev_addr_set(struct net_device *dev, const u8 *addr, size_t len)
{
        memcpy(dev->dev_addr, addr, len);
}

/* Set @dev's address from @addr, copying exactly @dev->addr_len bytes. */
static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
{
        __dev_addr_set(dev, addr, dev->addr_len);
}

/*
 * Overwrite part of @dev->dev_addr: copy @len bytes from @addr to the
 * position @offset bytes into the address.  No bounds check is done here.
 */
static inline void
dev_addr_mod(struct net_device *dev, unsigned int offset,
             const u8 *addr, size_t len)
{
        memcpy(&dev->dev_addr[offset], addr, len);
}

int dev_addr_add(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);
int dev_addr_del(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);
void dev_addr_flush(struct net_device *dev);
int dev_addr_init(struct net_device *dev);

/* Functions used for unicast addresses handling */
int dev_uc_add(struct net_device *dev, const unsigned char *addr);
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_uc_del(struct net_device *dev, const unsigned char *addr);
int dev_uc_sync(struct net_device *to, struct net_device *from);
int dev_uc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_uc_unsync(struct net_device *to, struct net_device *from);
void dev_uc_flush(struct net_device *dev);
void dev_uc_init(struct net_device *dev);

/**
 *  __dev_uc_sync - Synchronize device's unicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 *
 *  Return: the result of __hw_addr_sync_dev() on @dev's uc list.
 */
static inline int __dev_uc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        return __hw_addr_sync_dev(&dev->uc, dev, sync, unsync);
}

/**
 *  __dev_uc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_uc_sync().
 *  Implemented as __hw_addr_unsync_dev() on @dev's uc list.
 */
static inline void __dev_uc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        __hw_addr_unsync_dev(&dev->uc, dev, unsync);
}

/* Functions used for multicast addresses handling */
int dev_mc_add(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_mc_del(struct net_device *dev, const unsigned char *addr);
int dev_mc_del_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_sync(struct net_device *to, struct net_device *from);
int dev_mc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_mc_unsync(struct net_device *to, struct net_device *from);
void dev_mc_flush(struct net_device *dev);
void dev_mc_init(struct net_device *dev);

/**
 *  __dev_mc_sync - Synchronize device's multicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 *
 *  Return: the result of __hw_addr_sync_dev() on @dev's mc list.
 */
static inline int __dev_mc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        return __hw_addr_sync_dev(&dev->mc, dev, sync, unsync);
}

/**
 *  __dev_mc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_mc_sync().
 *  Implemented as __hw_addr_unsync_dev() on @dev's mc list.
 */
static inline void __dev_mc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        __hw_addr_unsync_dev(&dev->mc, dev, unsync);
}

/* Functions used for secondary unicast and multicast support */
void dev_set_rx_mode(struct net_device *dev);
void __dev_set_rx_mode(struct net_device *dev);
int dev_set_promiscuity(struct net_device *dev, int inc);
int dev_set_allmulti(struct net_device *dev, int inc);
void netdev_state_change(struct net_device *dev);
void netdev_notify_peers(struct net_device *dev);
void netdev_features_change(struct net_device *dev);
/* Load a device via the kmod */
void dev_load(struct net *net, const char *name);
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                                        struct rtnl_link_stats64 *storage);
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
                             const struct net_device_stats *netdev_stats);
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
                           const struct pcpu_sw_netstats __percpu *netstats);

extern int                netdev_max_backlog;
extern int                netdev_tstamp_prequeue;
extern int                weight_p;
extern int                dev_weight_rx_bias;
extern int                dev_weight_tx_bias;
extern int                dev_rx_weight;
extern int                dev_tx_weight;
extern int                gro_normal_batch;

/* Bit positions behind the NESTED_SYNC_* masks built by __NESTED_SYNC(). */
enum {
        NESTED_SYNC_IMM_BIT  = 0,
        NESTED_SYNC_TODO_BIT = 1,
};

/* Turn a bit index into a u32 mask with only that bit set. */
#define __NESTED_SYNC_BIT(bit)        ((u32)1 << (bit))
/* Mask for @name, found by token-pasting NESTED_SYNC_<name>_BIT. */
#define __NESTED_SYNC(name)        __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT)

#define NESTED_SYNC_IMM                __NESTED_SYNC(IMM)
#define NESTED_SYNC_TODO        __NESTED_SYNC(TODO)

/*
 * Argument bundle handed to the callbacks of the netdev_walk_all_*_dev*()
 * walkers declared in this file.
 */
struct netdev_nested_priv {
        unsigned char flags;        /* presumably NESTED_SYNC_* bits -- confirm against users */
        void *data;                /* opaque pointer; meaning is defined by the walker's caller */
};

bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
                                                     struct list_head **iter);
struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
                                                     struct list_head **iter);

#ifdef CONFIG_LOCKDEP
/*
 * List that net_unlink_todo() queues devices on (CONFIG_LOCKDEP only).
 * NOTE(review): this object is `static`; if this text is a header that is
 * included by more than one .c file, every translation unit gets its own
 * private copy of the list -- confirm that producers and consumers of the
 * list live in the same translation unit.
 */
static LIST_HEAD(net_unlink_list);

/*
 * Append @dev->unlink_list to net_unlink_list, but only when
 * list_empty(&dev->unlink_list) reports that the node is not already
 * linked somewhere.
 */
static inline void net_unlink_todo(struct net_device *dev)
{
        if (list_empty(&dev->unlink_list))
                list_add_tail(&dev->unlink_list, &net_unlink_list);
}
#endif

/*
 * iterate through upper list, must be called under RCU read lock
 *
 * @dev:   device whose adj_list.upper is walked
 * @updev: cursor; receives each value returned by
 *         netdev_upper_get_next_dev_rcu(); the loop stops when it is NULL
 * @iter:  struct list_head * cursor, seeded with &(dev)->adj_list.upper
 */
#define netdev_for_each_upper_dev_rcu(dev, updev, iter) \
        for (iter = &(dev)->adj_list.upper, \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \
             updev; \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))

int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *upper_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev);

bool netdev_has_any_upper_dev(struct net_device *dev);

void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter);
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
                                        struct list_head **iter);

/*
 * Walk the values returned by netdev_lower_get_next_private() for @dev.
 *
 * @priv: cursor; the loop stops when the getter returns NULL
 * @iter: struct list_head * cursor, seeded with the first entry
 *        ((dev)->adj_list.lower.next) rather than with the list head
 */
#define netdev_for_each_lower_private(dev, priv, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             priv = netdev_lower_get_next_private(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private(dev, &(iter)))

/*
 * Walk the values returned by netdev_lower_get_next_private_rcu() for @dev.
 *
 * @priv: cursor; the loop stops when the getter returns NULL
 * @iter: struct list_head * cursor, seeded with the list head
 *        (&(dev)->adj_list.lower), unlike the non-RCU variant
 */
#define netdev_for_each_lower_private_rcu(dev, priv, iter) \
        for (iter = &(dev)->adj_list.lower, \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)))

void *netdev_lower_get_next(struct net_device *dev,
                                struct list_head **iter);

/*
 * Walk the values returned by netdev_lower_get_next() for @dev.
 *
 * @ldev: cursor; the loop stops when the getter returns NULL
 * @iter: struct list_head * cursor, seeded with the first entry
 *        ((dev)->adj_list.lower.next)
 */
#define netdev_for_each_lower_dev(dev, ldev, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             ldev = netdev_lower_get_next(dev, &(iter)); \
             ldev; \
             ldev = netdev_lower_get_next(dev, &(iter)))

struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
                                             struct list_head **iter);
/*
 * Lower-device walkers. @fn is handed a lower device together with the
 * caller-supplied @priv. Traversal order and the meaning of @fn's return
 * value are decided by the definitions, which are not in this header; the
 * _rcu suffix presumably marks the variant for RCU read-side callers
 * (confirm against the implementation).
 */
int netdev_walk_all_lower_dev(struct net_device *dev,
                              int (*fn)(struct net_device *lower_dev,
                                        struct netdev_nested_priv *priv),
                              struct netdev_nested_priv *priv);
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *lower_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

/*
 * Upper/lower adjacency API (declarations only; bodies live elsewhere).
 * The link and change_prepare calls return an int status and accept a
 * netlink extended-ack for error reporting; unlink, commit and abort
 * return nothing.
 */
void *netdev_adjacent_get_private(struct list_head *adj_list);
void *netdev_lower_get_first_private_rcu(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev,
                          struct netlink_ext_ack *extack);
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev,
                                 void *upper_priv, void *upper_info,
                                 struct netlink_ext_ack *extack);
void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev);
int netdev_adjacent_change_prepare(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev,
                                   struct netlink_ext_ack *extack);
void netdev_adjacent_change_commit(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev);
void netdev_adjacent_change_abort(struct net_device *old_dev,
                                  struct net_device *new_dev,
                                  struct net_device *dev);
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
void *netdev_lower_dev_get_private(struct net_device *dev,
                                   struct net_device *lower_dev);
void netdev_lower_state_changed(struct net_device *lower_dev,
                                void *lower_state_info);

/*
 * RSS keys are 40 or 52 bytes long; NETDEV_RSS_KEY_LEN is the larger of
 * the two and sizes the global netdev_rss_key buffer.
 * netdev_rss_key_fill() takes a destination buffer and its length
 * (presumably copying up to @len bytes of the key -- confirm against the
 * definition).
 */
#define NETDEV_RSS_KEY_LEN 52
extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
void netdev_rss_key_fill(void *buffer, size_t len);

/*
 * Software checksum fallbacks (declarations only). Each returns an int
 * status; presumably 0 on success and a negative errno otherwise --
 * confirm against the definitions.
 */
int skb_checksum_help(struct sk_buff *skb);
int skb_crc32c_csum_help(struct sk_buff *skb);
int skb_csum_hwoffload_help(struct sk_buff *skb,
                            const netdev_features_t features);

/*
 * GSO segmentation entry points (declarations only). Both take the
 * feature set to segment against and return an sk_buff pointer.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                                  netdev_features_t features, bool tx_path);
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
                                    netdev_features_t features);

/* Pairs one ifslave record with the ifbond record of its master. */
struct netdev_bonding_info {
        ifslave        slave;
        ifbond        master;
};

/*
 * Notifier payload carrying bonding information. The generic
 * netdev_notifier_info header has to stay the first member.
 */
struct netdev_notifier_bonding_info {
        struct netdev_notifier_info info; /* must be first */
        struct netdev_bonding_info  bonding_info;
};

/*
 * Declared here, defined elsewhere. Takes the device and the bonding info
 * describing it; presumably raises a notifier carrying
 * struct netdev_notifier_bonding_info -- confirm against the definition.
 */
void netdev_bonding_info_change(struct net_device *dev,
                                struct netdev_bonding_info *bonding_info);

/*
 * ethtool_notify() is a real out-of-line function only when
 * CONFIG_ETHTOOL_NETLINK is enabled. Otherwise it is an empty inline stub,
 * so callers do not need an #ifdef of their own.
 */
#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data);
#else
static inline void ethtool_notify(struct net_device *dev, unsigned int cmd,
                                  const void *data)
{
}
#endif

/*
 * skb_gso_segment - segment @skb against @features on the transmit path
 *
 * Thin wrapper around __skb_gso_segment() with tx_path fixed to true;
 * returns whatever that call returns.
 */
static inline struct sk_buff *
skb_gso_segment(struct sk_buff *skb, netdev_features_t features)
{
        const bool tx_path = true;

        return __skb_gso_segment(skb, features, tx_path);
}
/*
 * Declared here, defined elsewhere. Returns a big-endian protocol value for
 * @skb; @depth is an int out-parameter (presumably the header depth that
 * was skipped -- confirm against the definition).
 */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth);

/*
 * can_checksum_protocol - can a device with @features checksum @protocol?
 * @features: feature bits of the device
 * @protocol: network-order ethertype
 *
 * FCoE is decided solely by NETIF_F_FCOE_CRC. For everything else
 * NETIF_F_HW_CSUM is sufficient on its own; failing that, IPv4 needs
 * NETIF_F_IP_CSUM, IPv6 needs NETIF_F_IPV6_CSUM and any other protocol
 * yields false.
 */
static inline bool can_checksum_protocol(netdev_features_t features,
                                         __be16 protocol)
{
        netdev_features_t needed;

        if (protocol == htons(ETH_P_FCOE))
                return (features & NETIF_F_FCOE_CRC) != 0;

        /* Assume this is an IP checksum (not SCTP CRC) */

        /* Generic hardware checksumming covers every remaining protocol. */
        if (features & NETIF_F_HW_CSUM)
                return true;

        if (protocol == htons(ETH_P_IP))
                needed = NETIF_F_IP_CSUM;
        else if (protocol == htons(ETH_P_IPV6))
                needed = NETIF_F_IPV6_CSUM;
        else
                return false;

        return (features & needed) != 0;
}

/*
 * netdev_rx_csum_fault() is only a real function when CONFIG_BUG is set;
 * without it the call collapses to an empty inline stub.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb);
#else
static inline void netdev_rx_csum_fault(struct net_device *dev,
                                        struct sk_buff *skb)
{
}
#endif
/*
 * rx skb timestamps: enable/disable pair, declared here and defined
 * elsewhere. Whether calls nest (reference counted) cannot be told from
 * this header -- confirm against the definitions.
 */
void net_enable_timestamp(void);
void net_disable_timestamp(void);

/*
 * With CONFIG_PROC_FS, dev_proc_init() is an __init function returning an
 * int. Without it, the name expands to the constant 0 so callers can
 * still test the "result".
 */
#ifdef CONFIG_PROC_FS
int __init dev_proc_init(void);
#else
#define dev_proc_init() 0
#endif

/*
 * __netdev_start_xmit - hand @skb to the driver's ndo_start_xmit hook
 * @ops:  device operations supplying ndo_start_xmit
 * @skb:  buffer to transmit
 * @dev:  device passed through to the hook
 * @more: value stored in this CPU's softnet_data.xmit.more first
 *
 * The per-CPU flag is written before the hook is invoked, so
 * netdev_xmit_more() called from inside the hook observes @more.
 * Returns the hook's result unchanged.
 */
static inline netdev_tx_t
__netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb,
                    struct net_device *dev, bool more)
{
        netdev_tx_t ret;

        __this_cpu_write(softnet_data.xmit.more, more);
        ret = ops->ndo_start_xmit(skb, dev);

        return ret;
}

/* Read back this CPU's softnet_data.xmit.more flag as a bool. */
static inline bool netdev_xmit_more(void)
{
        const bool more = __this_cpu_read(softnet_data.xmit.more);

        return more;
}

/*
 * netdev_start_xmit - transmit @skb on @dev and refresh @txq on success
 * @skb:  buffer to transmit
 * @dev:  device whose netdev_ops are used
 * @txq:  queue passed to txq_trans_update() when the driver accepted @skb
 * @more: forwarded to __netdev_start_xmit()
 *
 * Returns the driver's verdict. txq_trans_update() runs only for
 * NETDEV_TX_OK.
 */
static inline netdev_tx_t
netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
                  struct netdev_queue *txq, bool more)
{
        netdev_tx_t rc = __netdev_start_xmit(dev->netdev_ops, skb, dev, more);

        if (rc != NETDEV_TX_OK)
                return rc;

        txq_trans_update(txq);
        return rc;
}

/*
 * Create/remove a class attribute file for namespace tag @ns
 * (declarations only; create returns an int status).
 */
int netdev_class_create_file_ns(const struct class_attribute *class_attr,
                                const void *ns);
void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
                                 const void *ns);

/* kobject namespace-type operations object, defined outside this header. */
extern const struct kobj_ns_type_operations net_ns_type_operations;

/* Returns a string for @dev (the driver name, judging by the identifier). */
const char *netdev_drivername(const struct net_device *dev);

/* Declared here, defined elsewhere; takes no arguments, returns nothing. */
void linkwatch_run_queue(void);

/*
 * netdev_intersect_features - AND two feature sets, HW_CSUM aware
 * @f1: first feature set
 * @f2: second feature set
 *
 * When exactly one side advertises NETIF_F_HW_CSUM, that side is first
 * widened with NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM so those two bits are
 * not lost by the intersection. Returns f1 & f2 after that adjustment.
 */
static inline netdev_features_t netdev_intersect_features(netdev_features_t f1,
                                                          netdev_features_t f2)
{
        /* Only act when the two sides disagree about NETIF_F_HW_CSUM. */
        if ((f1 & NETIF_F_HW_CSUM) != (f2 & NETIF_F_HW_CSUM)) {
                if (f1 & NETIF_F_HW_CSUM)
                        f1 |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
                else
                        f2 |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
        }

        return f1 & f2;
}

/*
 * netdev_get_wanted_features - feature set currently wanted for @dev
 *
 * Combines the active features that are not covered by dev->hw_features
 * with everything in dev->wanted_features.
 */
static inline netdev_features_t
netdev_get_wanted_features(struct net_device *dev)
{
        netdev_features_t outside_hw = dev->features & ~dev->hw_features;

        return outside_hw | dev->wanted_features;
}
/*
 * Declared here, defined elsewhere. Takes an accumulated set @all, one
 * further set @one and a @mask, and returns a feature set (the exact
 * merge rule lives in the definition).
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask);

/*
 * Allow TSO to be used on a stacked device: doing the GSO segmentation
 * only in front of the last device is a performance improvement.
 *
 * Implemented as netdev_increment_features() with NETIF_F_ALL_TSO as the
 * set being merged into @features under @mask.
 */
static inline netdev_features_t
netdev_add_tso_features(netdev_features_t features, netdev_features_t mask)
{
        netdev_features_t merged;

        merged = netdev_increment_features(features, NETIF_F_ALL_TSO, mask);

        return merged;
}

/*
 * Feature recomputation entry points (declarations only). The
 * double-underscore variant reports an int result, the other two return
 * nothing.
 */
int __netdev_update_features(struct net_device *dev);
void netdev_update_features(struct net_device *dev);
void netdev_change_features(struct net_device *dev);

/* Takes a read-only @rootdev and a writable @dev; defined elsewhere. */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
                                        struct net_device *dev);

/*
 * Per-skb feature helpers (declarations only). Both return a feature set;
 * passthru_features_check() has the (skb, dev, features) shape of a
 * features-check callback -- presumably returning @features untouched,
 * confirm against the definition.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
                                          struct net_device *dev,
                                          netdev_features_t features);
netdev_features_t netif_skb_features(struct sk_buff *skb);

/*
 * net_gso_ok - does @features cover every GSO type bit in @gso_type?
 * @features: device feature set
 * @gso_type: SKB_GSO_* bits of the packet
 *
 * @gso_type is shifted up by NETIF_F_GSO_SHIFT into feature-bit position
 * and must then be a subset of @features. The BUILD_BUG_ON() lines are
 * compile-time checks that each SKB_GSO_* value equals its NETIF_F_*
 * counterpart shifted down by NETIF_F_GSO_SHIFT, which is what makes the
 * plain shift valid.
 */
static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
        netdev_features_t wanted;

        /* check flags correspondence */
        BUILD_BUG_ON((NETIF_F_TSO >> NETIF_F_GSO_SHIFT) != SKB_GSO_TCPV4);
        BUILD_BUG_ON((NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT) != SKB_GSO_DODGY);
        BUILD_BUG_ON((NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT) != SKB_GSO_TCP_ECN);
        BUILD_BUG_ON((NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT) != SKB_GSO_TCP_FIXEDID);
        BUILD_BUG_ON((NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT) != SKB_GSO_TCPV6);
        BUILD_BUG_ON((NETIF_F_FSO >> NETIF_F_GSO_SHIFT) != SKB_GSO_FCOE);
        BUILD_BUG_ON((NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT) != SKB_GSO_GRE);
        BUILD_BUG_ON((NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT) != SKB_GSO_GRE_CSUM);
        BUILD_BUG_ON((NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT) != SKB_GSO_IPXIP4);
        BUILD_BUG_ON((NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT) != SKB_GSO_IPXIP6);
        BUILD_BUG_ON((NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT) != SKB_GSO_UDP_TUNNEL);
        BUILD_BUG_ON((NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT) != SKB_GSO_UDP_TUNNEL_CSUM);
        BUILD_BUG_ON((NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT) != SKB_GSO_PARTIAL);
        BUILD_BUG_ON((NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT) != SKB_GSO_TUNNEL_REMCSUM);
        BUILD_BUG_ON((NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT) != SKB_GSO_SCTP);
        BUILD_BUG_ON((NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT) != SKB_GSO_ESP);
        BUILD_BUG_ON((NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT) != SKB_GSO_UDP);
        BUILD_BUG_ON((NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT) != SKB_GSO_UDP_L4);
        BUILD_BUG_ON((NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT) != SKB_GSO_FRAGLIST);

        wanted = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT;

        /* OK only if no wanted bit is missing from @features. */
        return !(wanted & ~features);
}

/*
 * skb_gso_ok - can @skb be handed over as-is to a device with @features?
 *
 * Requires net_gso_ok() for the skb's gso_type and, if the skb carries a
 * frag list, NETIF_F_FRAGLIST in @features as well.
 */
static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
{
        if (!net_gso_ok(features, skb_shinfo(skb)->gso_type))
                return false;

        if (!skb_has_frag_list(skb))
                return true;

        return (features & NETIF_F_FRAGLIST) != 0;
}

/*
 * netif_needs_gso - must @skb be segmented in software for @features?
 *
 * False for non-GSO skbs. For GSO skbs the answer is true when
 * skb_gso_ok() rejects the feature set, or (flagged unlikely) when
 * ip_summed is neither CHECKSUM_PARTIAL nor CHECKSUM_UNNECESSARY.
 */
static inline bool netif_needs_gso(struct sk_buff *skb,
                                   netdev_features_t features)
{
        if (!skb_is_gso(skb))
                return false;

        if (!skb_gso_ok(skb, features))
                return true;

        return unlikely(skb->ip_summed != CHECKSUM_PARTIAL &&
                        skb->ip_summed != CHECKSUM_UNNECESSARY);
}

/*
 * netif_set_gso_max_size - store @size in dev->gso_max_size
 *
 * Plain assignment; no range check or clamping is done here.
 */
static inline void
netif_set_gso_max_size(struct net_device *dev, unsigned int size)
{
        dev->gso_max_size = size;
}

/*
 * skb_gso_error_unwind - put back header state on @skb after a GSO failure
 * @skb:         buffer to repair
 * @protocol:    value stored in skb->protocol
 * @pulled_hlen: number of bytes handed to skb_push()
 * @mac_offset:  value stored in skb->mac_header
 * @mac_len:     MAC header length; also used to derive network_header
 *
 * Statement order matters: the transport header is reset only after
 * skb_push(), and network_header is computed from the mac_header stored
 * just before it.
 */
static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
                                        int pulled_hlen, u16 mac_offset,
                                        int mac_len)
{
        skb->protocol = protocol;
        skb->encapsulation = 1;
        /* Re-expose the previously pulled bytes, then re-anchor transport. */
        skb_push(skb, pulled_hlen);
        skb_reset_transport_header(skb);
        skb->mac_header = mac_offset;
        /* Network header sits mac_len bytes after the MAC header. */
        skb->network_header = skb->mac_header + mac_len;
        skb->mac_len = mac_len;
}

/* True when IFF_MACSEC is set in dev->priv_flags. */
static inline bool netif_is_macsec(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_MACSEC) != 0;
}

/* True when IFF_MACVLAN is set in dev->priv_flags. */
static inline bool netif_is_macvlan(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_MACVLAN) != 0;
}

/* True when IFF_MACVLAN_PORT is set in dev->priv_flags. */
static inline bool netif_is_macvlan_port(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_MACVLAN_PORT) != 0;
}

static inline bool netif_is_bond_master(const struct net_device *dev)
{
        return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
}

static inline bool netif_is_bond_slave(const struct net_device *dev)
{
        return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING;
}

static inline bool netif_supports_nofcs(struct net_device *dev)
{
        return dev->priv_flags & IFF_SUPP_NOFCS;
}

/* True when IFF_L3MDEV_RX_HANDLER is set in dev->priv_flags. */
static inline bool netif_has_l3_rx_handler(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_L3MDEV_RX_HANDLER) != 0;
}

/* True when IFF_L3MDEV_MASTER is set in dev->priv_flags. */
static inline bool netif_is_l3_master(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_L3MDEV_MASTER) != 0;
}

/* True when IFF_L3MDEV_SLAVE is set in dev->priv_flags. */
static inline bool netif_is_l3_slave(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_L3MDEV_SLAVE) != 0;
}

/* True when IFF_EBRIDGE is set in dev->priv_flags. */
static inline bool netif_is_bridge_master(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_EBRIDGE) != 0;
}

/* True when IFF_BRIDGE_PORT is set in dev->priv_flags. */
static inline bool netif_is_bridge_port(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_BRIDGE_PORT) != 0;
}

/* True when IFF_OPENVSWITCH is set in dev->priv_flags. */
static inline bool netif_is_ovs_master(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_OPENVSWITCH) != 0;
}

/* True when IFF_OVS_DATAPATH is set in dev->priv_flags. */
static inline bool netif_is_ovs_port(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_OVS_DATAPATH) != 0;
}

static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
        return netif_is_bridge_port(dev) || netif_is_ovs_port(dev);
}

/* True when IFF_TEAM is set in dev->priv_flags. */
static inline bool netif_is_team_master(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_TEAM) != 0;
}

/* True when IFF_TEAM_PORT is set in dev->priv_flags. */
static inline bool netif_is_team_port(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_TEAM_PORT) != 0;
}

static inline bool netif_is_lag_master(const struct net_device *dev)
{
        return netif_is_bond_master(dev) || netif_is_team_master(dev);
}

static inline bool netif_is_lag_port(const struct net_device *dev)
{
        return netif_is_bond_slave(dev) || netif_is_team_port(dev);
}

/* True when IFF_RXFH_CONFIGURED is set in dev->priv_flags. */
static inline bool netif_is_rxfh_configured(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_RXFH_CONFIGURED) != 0;
}

/* True when IFF_FAILOVER is set in dev->priv_flags. */
static inline bool netif_is_failover(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_FAILOVER) != 0;
}

/* True when IFF_FAILOVER_SLAVE is set in dev->priv_flags. */
static inline bool netif_is_failover_slave(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_FAILOVER_SLAVE) != 0;
}

/*
 * This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit().
 *
 * Clears both IFF_XMIT_DST_RELEASE and IFF_XMIT_DST_RELEASE_PERM in
 * dev->priv_flags with a single plain (non-atomic) read-modify-write.
 */
static inline void netif_keep_dst(struct net_device *dev)
{
        dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
}

/* return true if dev can't cope with mtu frames that need vlan tag insertion */
static inline bool netif_reduces_vlan_mtu(struct net_device *dev)
{
        /* TODO: reserve and use an additional IFF bit, if we get more users */
        return dev->priv_flags & IFF_MACSEC;
}

/*
 * Per-network-namespace operations object placed in __net_initdata;
 * defined outside this header (for the loopback device, judging by the
 * name).
 */
extern struct pernet_operations __net_initdata loopback_net_ops;

/* Logging, debugging and troubleshooting/diagnostic helpers. */

/* netdev_printk helpers, similar to dev_printk */

/*
 * netdev_name - printable name of @dev
 *
 * Returns dev->name, except when that name is empty or still contains a
 * '%' character; in those cases a fixed placeholder string is returned
 * instead.
 */
static inline const char *netdev_name(const struct net_device *dev)
{
        const char *name = dev->name;

        if (name[0] == '\0')
                return "(unnamed net_device)";

        if (strchr(name, '%') != NULL)
                return "(unnamed net_device)";

        return name;
}

/* True exactly when dev->reg_state equals NETREG_UNREGISTERING. */
static inline bool netdev_unregistering(const struct net_device *dev)
{
        const bool going_away = (dev->reg_state == NETREG_UNREGISTERING);

        return going_away;
}

/*
 * netdev_reg_state - suffix describing the registration state of @dev
 *
 * Returns "" for NETREG_REGISTERED and a parenthesised, space-prefixed
 * label for each other known state. A value not handled by the switch
 * triggers a one-time warning and yields " (unknown)".
 */
static inline const char *netdev_reg_state(const struct net_device *dev)
{
        switch (dev->reg_state) {
        case NETREG_UNINITIALIZED:
                return " (uninitialized)";
        case NETREG_REGISTERED:
                return "";
        case NETREG_UNREGISTERING:
                return " (unregistering)";
        case NETREG_UNREGISTERED:
                return " (unregistered)";
        case NETREG_RELEASED:
                return " (released)";
        case NETREG_DUMMY:
                return " (dummy)";
        }

        /* Only reached for a reg_state value outside the cases above. */
        WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, dev->reg_state);
        return " (unknown)";
}

/*
 * printf-style logging functions for a net_device (declarations only).
 * netdev_printk() takes the log level as its first argument; the others
 * carry the level in their name. __printf() tells the compiler which
 * argument is the format string so the varargs are type-checked, and
 * __cold marks them as rarely executed.
 */
__printf(3, 4) __cold
void netdev_printk(const char *level, const struct net_device *dev,
                   const char *format, ...);
__printf(2, 3) __cold
void netdev_emerg(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_alert(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_crit(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_err(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_warn(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_notice(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_info(const struct net_device *dev, const char *format, ...);

/*
 * Print via netdev_printk() at most once per expansion site. Each site
 * gets its own static flag, which is set just before the first print.
 * The test-and-set is a plain read followed by a plain write, so two
 * racing first callers could presumably both print.
 */
#define netdev_level_once(level, dev, fmt, ...)                        \
do {                                                                \
        static bool __print_once __read_mostly;                        \
                                                                \
        if (!__print_once) {                                        \
                __print_once = true;                                \
                netdev_printk(level, dev, fmt, ##__VA_ARGS__);        \
        }                                                        \
} while (0)

/* Per-level shorthands for netdev_level_once(), one for each KERN_* level. */
#define netdev_emerg_once(dev, fmt, ...) \
        netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__)
#define netdev_alert_once(dev, fmt, ...) \
        netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__)
#define netdev_crit_once(dev, fmt, ...) \
        netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__)
#define netdev_err_once(dev, fmt, ...) \
        netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__)
#define netdev_warn_once(dev, fmt, ...) \
        netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__)
#define netdev_notice_once(dev, fmt, ...) \
        netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__)
#define netdev_info_once(dev, fmt, ...) \
        netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__)

/*
 * Declare a module alias of the form "netdev-<device>". @device must be a
 * string literal because it is pasted after the "netdev-" prefix by
 * literal concatenation.
 */
#define MODULE_ALIAS_NETDEV(device) \
        MODULE_ALIAS("netdev-" device)

/*
 * netdev_dbg() has three build-time flavours:
 *  - dynamic debug enabled: forwards to dynamic_netdev_dbg();
 *  - otherwise, DEBUG defined: prints unconditionally at KERN_DEBUG;
 *  - otherwise: the call sits behind "if (0)", so the format and arguments
 *    are still compiled and type-checked but never executed.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define netdev_dbg(__dev, format, args...)                        \
do {                                                                \
        dynamic_netdev_dbg(__dev, format, ##args);                \
} while (0)
#elif defined(DEBUG)
#define netdev_dbg(__dev, format, args...)                        \
        netdev_printk(KERN_DEBUG, __dev, format, ##args)
#else
#define netdev_dbg(__dev, format, args...)                        \
({                                                                \
        if (0)                                                        \
                netdev_printk(KERN_DEBUG, __dev, format, ##args); \
})
#endif

/*
 * netdev_vdbg() is an alias for netdev_dbg() when VERBOSE_DEBUG is
 * defined. Otherwise the print is dead code behind "if (0)" (arguments
 * are still type-checked) and the expression evaluates to 0.
 */
#if defined(VERBOSE_DEBUG)
#define netdev_vdbg        netdev_dbg
#else

#define netdev_vdbg(dev, format, args...)                        \
({                                                                \
        if (0)                                                        \
                netdev_printk(KERN_DEBUG, dev, format, ##args);        \
        0;                                                        \
})
#endif

/*
 * netdev_WARN() acts like dev_printk(), but with the key difference
 * of using a WARN/WARN_ON to get the message out, including the
 * file/line information and a backtrace.
 *
 * Both variants prefix the message with "netdevice: ", the result of
 * netdev_name(dev) and the state suffix from netdev_reg_state(dev).
 * @format must be a string literal because it is concatenated onto that
 * prefix. netdev_WARN_ONCE() uses WARN_ONCE() and therefore fires only
 * once per call site.
 */
#define netdev_WARN(dev, format, args...)                        \
        WARN(1, "netdevice: %s%s: " format, netdev_name(dev),        \
             netdev_reg_state(dev), ##args)

#define netdev_WARN_ONCE(dev, format, args...)                                \
        WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev),        \
                  netdev_reg_state(dev), ##args)

/*
 * netif printk helpers, similar to netdev_printk.
 *
 * netif_printk() emits through netdev_printk() at @level, but only when
 * netif_msg_<type>(priv) evaluates true. @type is pasted onto the
 * "netif_msg_" prefix, so it must be a bare identifier.
 */

#define netif_printk(priv, type, level, dev, fmt, args...)        \
do {                                                                  \
        if (netif_msg_##type(priv))                                \
                netdev_printk(level, (dev), fmt, ##args);        \
} while (0)

/*
 * Like netif_printk(), but @level is a bare suffix (emerg, err, info, ...)
 * pasted onto "netdev_" to select the logging function, and the
 * netif_msg_<type>(priv) test gates the call.
 */
#define netif_level(level, priv, type, dev, fmt, args...)        \
do {                                                                \
        if (netif_msg_##type(priv))                                \
                netdev_##level(dev, fmt, ##args);                \
} while (0)

/* Per-level shorthands for netif_level(). */
#define netif_emerg(priv, type, dev, fmt, args...)                \
        netif_level(emerg, priv, type, dev, fmt, ##args)
#define netif_alert(priv, type, dev, fmt, args...)                \
        netif_level(alert, priv, type, dev, fmt, ##args)
#define netif_crit(priv, type, dev, fmt, args...)                \
        netif_level(crit, priv, type, dev, fmt, ##args)
#define netif_err(priv, type, dev, fmt, args...)                \
        netif_level(err, priv, type, dev, fmt, ##args)
#define netif_warn(priv, type, dev, fmt, args...)                \
        netif_level(warn, priv, type, dev, fmt, ##args)
#define netif_notice(priv, type, dev, fmt, args...)                \
        netif_level(notice, priv, type, dev, fmt, ##args)
#define netif_info(priv, type, dev, fmt, args...)                \
        netif_level(info, priv, type, dev, fmt, ##args)

/*
 * netif_dbg() follows the same three build-time flavours as netdev_dbg(),
 * with the netif_msg_<type>(priv) gate added:
 *  - dynamic debug enabled: gated call to dynamic_netdev_dbg();
 *  - otherwise, DEBUG defined: netif_printk() at KERN_DEBUG;
 *  - otherwise: dead code behind "if (0)" (still type-checked), and the
 *    expression evaluates to 0.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define netif_dbg(priv, type, netdev, format, args...)                \
do {                                                                \
        if (netif_msg_##type(priv))                                \
                dynamic_netdev_dbg(netdev, format, ##args);        \
} while (0)
#elif defined(DEBUG)
#define netif_dbg(priv, type, dev, format, args...)                \
        netif_printk(priv, type, KERN_DEBUG, dev, format, ##args)
#else
#define netif_dbg(priv, type, dev, format, args...)                        \
({                                                                        \
        if (0)                                                                \
                netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \
        0;                                                                \
})
#endif

/*
 * if @cond then downgrade to debug, else print at @level.
 * @level is a bare suffix (err, warn, info, ...) pasted onto "netif_", so
 * it selects one of the netif_<level>() macros.
 */
#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...)     \
        do {                                                              \
                if (cond)                                                 \
                        netif_dbg(priv, type, netdev, fmt, ##args);       \
                else                                                      \
                        netif_ ## level(priv, type, netdev, fmt, ##args); \
        } while (0)

/*
 * netif_vdbg() is an alias for netif_dbg() when VERBOSE_DEBUG is defined.
 * Otherwise the print is dead code behind "if (0)" (arguments are still
 * type-checked) and the expression evaluates to 0.
 */
#if defined(VERBOSE_DEBUG)
#define netif_vdbg        netif_dbg
#else
#define netif_vdbg(priv, type, dev, format, args...)                \
({                                                                \
        if (0)                                                        \
                netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \
        0;                                                        \
})
#endif

/*
 *        The list of packet types we will receive (as opposed to discard)
 *        and the routines to invoke.
 *
 *        Why 16? Because with 16 buckets, hashing on the low nibble of the
 *        protocol value, the only collision among the protocols below is
 *        RARP/SNAP/X.25 (all end in 5).
 *
 *                0800        IP
 *                0001        802.3
 *                0002        AX.25
 *                0004        802.2
 *                8035        RARP
 *                0005        SNAP
 *                0805        X.25
 *                0806        ARP
 *                8137        IPX
 *                0009        Localtalk
 *                86DD        IPv6
 *
 *        PTYPE_HASH_MASK is PTYPE_HASH_SIZE - 1, i.e. it keeps exactly
 *        that low nibble.
 */
#define PTYPE_HASH_SIZE        (16)
#define PTYPE_HASH_MASK        (PTYPE_HASH_SIZE - 1)

/* Global net_device pointer, defined outside this header. */
extern struct net_device *blackhole_netdev;

/*
 * Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters.
 *
 * Each macro applies an atomic_long_*() operation to the member of
 * (DEV)->stats whose name is FIELD with a "__" prefix pasted on.
 */
#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD)
#define DEV_STATS_ADD(DEV, FIELD, VAL)         \
                atomic_long_add((VAL), &(DEV)->stats.__##FIELD)
#define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD)

#endif        /* _LINUX_NETDEVICE_H */


























































































































































    1 




    1 





    1 

    1 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  NSA Security-Enhanced Linux (SELinux) security module
 *
 *  This file contains the SELinux security data structures for kernel objects.
 *
 *  Author(s):  Stephen Smalley, <sds@tycho.nsa.gov>
 *                Chris Vance, <cvance@nai.com>
 *                Wayne Salamon, <wsalamon@nai.com>
 *                James Morris <jmorris@redhat.com>
 *
 *  Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 *  Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *  Copyright (C) 2016 Mellanox Technologies
 */
#ifndef _SELINUX_OBJSEC_H_
#define _SELINUX_OBJSEC_H_

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/binfmts.h>
#include <linux/in.h>
#include <linux/spinlock.h>
#include <linux/lsm_hooks.h>
#include <linux/msg.h>
#include <net/net_namespace.h>
#include "flask.h"
#include "avc.h"

/*
 * SELinux state attached to a credential; selinux_cred() returns a pointer
 * to this structure. Field order is subject to __randomize_layout.
 */
struct task_security_struct {
        u32 osid;                /* SID prior to last execve */
        u32 sid;                /* current SID */
        u32 exec_sid;                /* exec SID */
        u32 create_sid;                /* fscreate SID */
        u32 keycreate_sid;        /* keycreate SID */
        u32 sockcreate_sid;        /* sockcreate SID */
} __randomize_layout;

/* Labeling state of an object's security label. */
enum label_initialized {
        LABEL_INVALID,                /* invalid or not initialized */
        LABEL_INITIALIZED,        /* initialized */
        LABEL_PENDING                /* presumably: initialization in progress -- confirm */
};

/* SELinux state attached to an inode; see selinux_inode(). */
struct inode_security_struct {
        struct inode *inode;        /* back pointer to inode object */
        struct list_head list;        /* list of inode_security_struct */
        u32 task_sid;                /* SID of creating task */
        u32 sid;                /* SID of this object */
        u16 sclass;                /* security class of this object */
        /* presumably holds an enum label_initialized value -- confirm */
        unsigned char initialized;        /* initialization flag */
        spinlock_t lock;        /* NOTE(review): protected fields not stated here */
};

/* SELinux state attached to an open file; see selinux_file(). */
struct file_security_struct {
        u32 sid;                /* SID of open file description */
        u32 fown_sid;                /* SID of file owner (for SIGIO) */
        u32 isid;                /* SID of inode at the time of file open */
        u32 pseqno;                /* Policy seqno at the time of file open */
};

/* SELinux state kept per superblock. */
struct superblock_security_struct {
        struct super_block *sb;                /* back pointer to sb object */
        u32 sid;                        /* SID of file system superblock */
        u32 def_sid;                        /* default SID for labeling */
        u32 mntpoint_sid;                /* SECURITY_FS_USE_MNTPOINT context for files */
        unsigned short behavior;        /* labeling behavior */
        unsigned short flags;                /* which mount options were specified */
        struct mutex lock;                /* NOTE(review): protected fields not stated here */
        struct list_head isec_head;        /* presumably a list of inode_security_struct -- confirm */
        spinlock_t isec_lock;                /* presumably guards isec_head -- confirm */
};

/* SELinux state attached to a message; see selinux_msg_msg(). */
struct msg_security_struct {
        u32 sid;        /* SID of message */
};

/* SELinux state attached to an IPC permission object; see selinux_ipc(). */
struct ipc_security_struct {
        u16 sclass;        /* security class of this object */
        u32 sid;        /* SID of IPC resource */
};

/* SID record for a network interface, keyed by namespace and ifindex. */
struct netif_security_struct {
        struct net *ns;                        /* network namespace */
        int ifindex;                        /* device index */
        u32 sid;                        /* SID for this interface */
};

/*
 * SID record for a network node address. @family tells which member of
 * the addr union is meaningful.
 */
struct netnode_security_struct {
        union {
                __be32 ipv4;                /* IPv4 node address */
                struct in6_addr ipv6;        /* IPv6 node address */
        } addr;
        u32 sid;                        /* SID for this node */
        u16 family;                        /* address family */
};

/* SID record for a network port, keyed by port number and protocol. */
struct netport_security_struct {
        u32 sid;                        /* SID for this port */
        u16 port;                        /* port number */
        u8 protocol;                        /* transport protocol */
};

/*
 * SELinux state kept per socket. The NetLabel members exist only when
 * CONFIG_NETLABEL is enabled, so the structure's size and layout depend
 * on that option.
 */
struct sk_security_struct {
#ifdef CONFIG_NETLABEL
        enum {                                /* NetLabel state */
                NLBL_UNSET = 0,
                NLBL_REQUIRE,
                NLBL_LABELED,
                NLBL_REQSKB,
                NLBL_CONNLABELED,
        } nlbl_state;
        struct netlbl_lsm_secattr *nlbl_secattr; /* NetLabel sec attributes */
#endif
        u32 sid;                        /* SID of this object */
        u32 peer_sid;                        /* SID of peer */
        u16 sclass;                        /* sock security class */
        enum {                                /* SCTP association state */
                SCTP_ASSOC_UNSET = 0,
                SCTP_ASSOC_SET,
        } sctp_assoc_state;
};

/* SELinux state for a tun device: a single SID. */
struct tun_security_struct {
        u32 sid;                        /* SID for the tun device sockets */
};

/* SELinux state for a key: a single SID. */
struct key_security_struct {
        u32 sid;        /* SID of key */
};

/* SELinux state for an InfiniBand queue pair or MAD agent: a single SID. */
struct ib_security_struct {
        u32 sid;        /* SID of the queue pair or MAD agent */
};

/* SID record for a PKey, keyed by subnet prefix and PKey number. */
struct pkey_security_struct {
        u64        subnet_prefix; /* Port subnet prefix */
        u16        pkey;        /* PKey number */
        u32        sid;        /* SID of pkey */
};

/* SELinux state for a bpf object: the SID of whoever created it. */
struct bpf_security_struct {
        u32 sid;  /* SID of bpf obj creator */
};

/* SELinux state for a perf_event object: the SID of whoever created it. */
struct perf_event_security_struct {
        u32 sid;  /* SID of perf_event obj creator */
};

/*
 * Defined outside this header. Its lbs_* members are added to the generic
 * security pointers by the accessors below to locate SELinux's own data.
 */
extern struct lsm_blob_sizes selinux_blob_sizes;
/*
 * selinux_cred - locate the SELinux data inside @cred's security blob
 *
 * Returns cred->security advanced by selinux_blob_sizes.lbs_cred bytes.
 * The blob pointer is not checked for NULL.
 */
static inline struct task_security_struct *selinux_cred(const struct cred *cred)
{
        void *blob = cred->security;

        return blob + selinux_blob_sizes.lbs_cred;
}

/*
 * selinux_file - locate the SELinux data inside @file's security blob
 *
 * Returns file->f_security advanced by selinux_blob_sizes.lbs_file bytes.
 * The blob pointer is not checked for NULL.
 */
static inline struct file_security_struct *selinux_file(const struct file *file)
{
        void *blob = file->f_security;

        return blob + selinux_blob_sizes.lbs_file;
}

/*
 * selinux_inode - locate the SELinux data inside @inode's security blob
 *
 * Returns NULL when inode->i_security is NULL (flagged as the unlikely
 * case); otherwise that pointer advanced by selinux_blob_sizes.lbs_inode
 * bytes.
 */
static inline struct inode_security_struct *
selinux_inode(const struct inode *inode)
{
        void *blob = inode->i_security;

        if (unlikely(blob == NULL))
                return NULL;

        return blob + selinux_blob_sizes.lbs_inode;
}

/*
 * selinux_msg_msg - locate the SELinux data inside @msg_msg's security blob
 *
 * Returns msg_msg->security advanced by selinux_blob_sizes.lbs_msg_msg
 * bytes. The blob pointer is not checked for NULL.
 */
static inline struct msg_security_struct *
selinux_msg_msg(const struct msg_msg *msg_msg)
{
        void *blob = msg_msg->security;

        return blob + selinux_blob_sizes.lbs_msg_msg;
}

/*
 * selinux_ipc - locate the SELinux data inside @ipc's security blob
 *
 * Returns ipc->security advanced by selinux_blob_sizes.lbs_ipc bytes.
 * The blob pointer is not checked for NULL.
 */
static inline struct ipc_security_struct *
selinux_ipc(const struct kern_ipc_perm *ipc)
{
        void *blob = ipc->security;

        return blob + selinux_blob_sizes.lbs_ipc;
}

/*
 * get the subjective security ID of the current task
 */
static inline u32 current_sid(void)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());

        return tsec->sid;
}

#endif /* _SELINUX_OBJSEC_H_ */










































































































































































    1 
































































    1 



    1 

    1 



























    1 



    1 

    1 



























    1 



    1 

    1 



















































    1 

    1 



    1 
    1 



    1 

    1 






    1 

    1 



    1 





    1 
    1 










    1 





    1 


    1 



    1 









    1 
















    1 








    1 

































    1 
    1 
    1 










































































































    1 




    1 

    1 









































    1 











































































































































































































































































































































































































































































































































































































    1 















    1 









    1 











    1 
































































    1 





    1 














    1 




    1 












    1 





























    1 















    1 












    1 
    1 














    1 



    1 












    1 

    1 

    1 



    1 

    1 







    1 




    1 



    1 
    1 

    1 
    1 

    1 


















    1 



























    1 


    1 
    1 















































































































































































































































































































































































































































































































































































































































































    1 




    1 
    1 

























    1 

    1 



















    1 




    1 













    1 


    1 




    1 




















    1 

























    1 





    1 



    1 


    1 



    1 
    1 



    1 


    1 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


































    1 


    1 








    1 






    1 







































































    1 


    1 

    1 





    1 


    1 
    1 



    1 

    1 






















































































































































































    1 


    1 






































































































































































































    1 


    1 
    1 




    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *                Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *                (Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
 * Global max_mapnr and mem_map.  Both are exported to modules and are only
 * defined here when CONFIG_NEED_MULTIPLE_NODES is not set.
 *
 * use the per-pgdat data instead for discontigmem - mbligh
 */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 *
 * The variable is only defined (and exported) here; it is not assigned in
 * this part of the file.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 *
 * The built-in default is therefore 2, or 1 when CONFIG_COMPAT_BRK=y.
 * The "norandmaps" boot parameter (see disable_randmaps()) sets it to 0.
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
                                        1;
#else
                                        2;
#endif

#ifndef arch_faults_on_old_pte
/*
 * Fallback definition, compiled only when arch_faults_on_old_pte has not
 * already been defined by the time this point is reached.  Always reports
 * true.
 */
static inline bool arch_faults_on_old_pte(void)
{
        /*
         * Those arches which don't have hw access flag feature need to
         * implement their own helper. By default, "true" means pagefault
         * will be hit on old pte.
         */
        return true;
}
#endif

/*
 * Handler registered for the "norandmaps" setup string: switches address
 * space randomization off by storing 0 in randomize_va_space.  The option
 * argument @s is not looked at.
 *
 * Always returns 1 (NOTE(review): presumably "option consumed" in __setup()
 * terms - confirm against the __setup() contract).
 */
static int __init disable_randmaps(char *s)
{
        randomize_va_space = 0;
        return 1;
}
__setup("norandmaps", disable_randmaps);

/* PFN of ZERO_PAGE(0); filled in by init_zero_pfn() below. */
unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

/*
 * Upper bound used by vm_normal_page() and vm_normal_page_pmd(): a pfn
 * greater than this is not translated to a struct page.  It is not assigned
 * anywhere in this part of the file.
 */
unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 *
 * Early initcall that records the pfn of ZERO_PAGE(0) in zero_pfn.
 * Always returns 0.
 */
static int __init init_zero_pfn(void)
{
        zero_pfn = page_to_pfn(ZERO_PAGE(0));
        return 0;
}
early_initcall(init_zero_pfn);

/*
 * Out-of-line wrapper that passes an rss counter update (@mm, counter index
 * @member, value @count) straight on to trace_rss_stat().
 */
void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
        trace_rss_stat(mm, member, count);
}

#if defined(SPLIT_RSS_COUNTING)

void sync_mm_rss(struct mm_struct *mm)
{
        int i;

        for (i = 0; i < NR_MM_COUNTERS; i++) {
                if (current->rss_stat.count[i]) {
                        add_mm_counter(mm, i, current->rss_stat.count[i]);
                        current->rss_stat.count[i] = 0;
                }
        }
        current->rss_stat.events = 0;
}

/*
 * Adjust rss counter @member of @mm by @val.
 *
 * When @mm is the running task's own mm the change is only accumulated in
 * the task's rss_stat cache; for any other mm it is applied to the mm
 * counter immediately.
 */
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
        struct task_struct *self = current;

        if (unlikely(self->mm != mm)) {
                add_mm_counter(mm, member, val);
                return;
        }

        self->rss_stat.count[member] += val;
}

/* Single-step convenience forms of add_mm_counter_fast(). */
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/*
 * Event budget before the cached counters are synced (the original note
 * reads: sync counter once per 64 page faults).
 */
#define TASK_RSS_EVENTS_THRESH        (64)

/*
 * Account one event against @task and, once its event count has gone past
 * TASK_RSS_EVENTS_THRESH, flush the cached rss deltas into task->mm.
 * Does nothing unless @task is the running task.
 */
static void check_sync_rss_stat(struct task_struct *task)
{
        if (likely(task == current)) {
                if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
                        sync_mm_rss(task->mm);
        }
}
#else /* SPLIT_RSS_COUNTING */

/*
 * Without SPLIT_RSS_COUNTING the "fast" helpers are plain aliases for the
 * regular mm counter helpers.
 */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

/* Nothing is cached per task on this path, so there is nothing to sync. */
static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

/*
 * Detach the pte table referenced by @pmd and queue it for freeing.
 *
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                           unsigned long addr)
{
        pgtable_t pte_table;

        /* Fetch the table before the entry that points at it is cleared. */
        pte_table = pmd_pgtable(*pmd);
        pmd_clear(pmd);

        pte_free_tlb(tlb, pte_table, addr);
        mm_dec_nr_ptes(tlb->mm);
}

/*
 * Release the pte tables below the pmd entries covering [addr, end), then
 * release the pmd table itself provided the PUD-aligned region it belongs
 * to does not reach below @floor or beyond @ceiling (a @ceiling of 0 is
 * compared via "ceiling - 1", i.e. as the largest address).
 */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long base = addr & PUD_MASK;
        unsigned long next;
        pmd_t *pmd = pmd_offset(pud, addr);

        do {
                next = pmd_addr_end(addr, end);
                if (!pmd_none_or_clear_bad(pmd))
                        free_pte_range(tlb, pmd, addr);
        } while (pmd++, addr = next, addr != end);

        /* Keep the pmd table if anything outside the range may share it. */
        if (base < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (ceiling == 0)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, base);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, base);
        mm_dec_nr_pmds(tlb->mm);
}

/*
 * Release the pmd-level tables below the pud entries covering [addr, end),
 * then release the pud table itself provided the P4D-aligned region it
 * belongs to does not reach below @floor or beyond @ceiling (a @ceiling of
 * 0 is compared via "ceiling - 1", i.e. as the largest address).
 */
static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long base = addr & P4D_MASK;
        unsigned long next;
        pud_t *pud = pud_offset(p4d, addr);

        do {
                next = pud_addr_end(addr, end);
                if (!pud_none_or_clear_bad(pud))
                        free_pmd_range(tlb, pud, addr, next, floor, ceiling);
        } while (pud++, addr = next, addr != end);

        /* Keep the pud table if anything outside the range may share it. */
        if (base < floor)
                return;
        if (ceiling) {
                ceiling &= P4D_MASK;
                if (ceiling == 0)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(p4d, base);
        p4d_clear(p4d);
        pud_free_tlb(tlb, pud, base);
        mm_dec_nr_puds(tlb->mm);
}

/*
 * Release the pud-level tables below the p4d entries covering [addr, end),
 * then release the p4d table itself provided the PGDIR-aligned region it
 * belongs to does not reach below @floor or beyond @ceiling (a @ceiling of
 * 0 is compared via "ceiling - 1", i.e. as the largest address).
 */
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long base = addr & PGDIR_MASK;
        unsigned long next;
        p4d_t *p4d = p4d_offset(pgd, addr);

        do {
                next = p4d_addr_end(addr, end);
                if (!p4d_none_or_clear_bad(p4d))
                        free_pud_range(tlb, p4d, addr, next, floor, ceiling);
        } while (p4d++, addr = next, addr != end);

        /* Keep the p4d table if anything outside the range may share it. */
        if (base < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (ceiling == 0)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        p4d = p4d_offset(pgd, base);
        pgd_clear(pgd);
        p4d_free_tlb(tlb, p4d, base);
}

/*
 * This function frees user-level page tables of a process.
 *
 * @tlb:     gather structure; tlb->mm supplies the page tables to walk
 * @addr:    start of the range whose page tables are candidates for freeing
 * @end:     end of that range
 * @floor:   a table whose rounded-down start lies below this is kept
 * @ceiling: a table extending past this is kept; 0 stands for the top of
 *           the address space (see the long comment in the body)
 */
void free_pgd_range(struct mmu_gather *tlb,
                        unsigned long addr, unsigned long end,
                        unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * The next few lines have given us lots of grief...
         *
         * Why are we testing PMD* at this top level?  Because often
         * there will be no work to do at all, and we'd prefer not to
         * go all the way down to the bottom just to discover that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we must
         * be careful to reject "the opposite 0" before it confuses the
         * subsequent tests.  But what about where end is brought down
         * by PMD_SIZE below? no, end can't go down to 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */

        /* Round addr down to a PMD boundary; step up if that dips below floor. */
        addr &= PMD_MASK;
        if (addr < floor) {
                addr += PMD_SIZE;
                if (!addr)
                        return;
        }
        /* Round a non-zero ceiling down; if it collapses to 0 there is nothing to free. */
        if (ceiling) {
                ceiling &= PMD_MASK;
                if (!ceiling)
                        return;
        }
        /* Pull end back by one PMD when the range reaches past the ceiling. */
        if (end - 1 > ceiling - 1)
                end -= PMD_SIZE;
        /* Nothing left after the adjustments above. */
        if (addr > end - 1)
                return;
        /*
         * We add page table cache pages with PAGE_SIZE,
         * (see pte_free_tlb()), flush the tlb if we need
         */
        tlb_change_page_size(tlb, PAGE_SIZE);
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}

/*
 * Walk the vma list starting at @vma and free the page tables behind it.
 *
 * Each vma is first unlinked via unlink_anon_vmas() and unlink_file_vma().
 * Hugetlb vmas are handed to hugetlb_free_pgd_range() one at a time; runs
 * of other vmas that start within PMD_SIZE of the previous one's end are
 * batched into a single free_pgd_range() call.  The upper limit passed
 * down is the start of the following vma, or @ceiling after the last one.
 */
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                unsigned long floor, unsigned long ceiling)
{
        struct vm_area_struct *next;

        for (; vma; vma = next) {
                unsigned long addr = vma->vm_start;
                unsigned long limit;

                next = vma->vm_next;

                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
                unlink_anon_vmas(vma);
                unlink_file_vma(vma);

                if (is_vm_hugetlb_page(vma)) {
                        limit = next ? next->vm_start : ceiling;
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
                                floor, limit);
                        continue;
                }

                /* Optimization: gather nearby vmas into one call down */
                while (next && next->vm_start <= vma->vm_end + PMD_SIZE
                       && !is_vm_hugetlb_page(next)) {
                        vma = next;
                        next = vma->vm_next;
                        unlink_anon_vmas(vma);
                        unlink_file_vma(vma);
                }

                limit = next ? next->vm_start : ceiling;
                free_pgd_range(tlb, addr, vma->vm_end, floor, limit);
        }
}

/*
 * Allocate a pte table for @mm and install it in @pmd if the entry is still
 * empty once the pmd lock is held.  If the entry was populated in the
 * meantime, the freshly allocated table is freed again.
 *
 * Returns 0 in both of those cases, -ENOMEM if the allocation fails.
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
        pgtable_t table = pte_alloc_one(mm);
        spinlock_t *lock;

        if (!table)
                return -ENOMEM;

        /*
         * Everything done to set the pte page up (e.g. its lock and the
         * page clearing) has to be visible before other CPUs can reach
         * the page through the page tables.
         *
         * The reader side is the pointer chasing done by page table
         * walkers, which mostly run without locks.  Those accesses are a
         * chain of data-dependent loads, which most CPUs (alpha being the
         * notable exception) already observe in order.  See the alpha
         * page table accessors for the smp_rmb() barriers used by page
         * table walking code.
         */
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

        lock = pmd_lock(mm, pmd);
        if (unlikely(!pmd_none(*pmd))) {
                /* The entry is already populated: our table is surplus. */
                spin_unlock(lock);
                pte_free(mm, table);
                return 0;
        }

        mm_inc_nr_ptes(mm);
        pmd_populate(mm, pmd, table);
        spin_unlock(lock);
        return 0;
}

/*
 * Kernel counterpart of __pte_alloc(): allocate a pte table for init_mm and
 * install it in @pmd under init_mm.page_table_lock if the entry is still
 * empty; otherwise free the new table again.
 *
 * Returns 0 in both of those cases, -ENOMEM if the allocation fails.
 */
int __pte_alloc_kernel(pmd_t *pmd)
{
        pte_t *table = pte_alloc_one_kernel(&init_mm);

        if (!table)
                return -ENOMEM;

        smp_wmb(); /* See comment in __pte_alloc */

        spin_lock(&init_mm.page_table_lock);
        if (unlikely(!pmd_none(*pmd))) {
                /* The entry is already populated: our table is surplus. */
                spin_unlock(&init_mm.page_table_lock);
                pte_free_kernel(&init_mm, table);
                return 0;
        }

        pmd_populate_kernel(&init_mm, pmd, table);
        spin_unlock(&init_mm.page_table_lock);
        return 0;
}

/* Zero all NR_MM_COUNTERS slots of the caller-supplied rss delta array. */
static inline void init_rss_vec(int *rss)
{
        memset(rss, 0, NR_MM_COUNTERS * sizeof(*rss));
}

/*
 * Apply the per-counter deltas in @rss to @mm, skipping zero entries.
 * If @mm belongs to the running task, sync_mm_rss() is called first.
 */
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
        int member = 0;

        if (current->mm == mm)
                sync_mm_rss(mm);

        while (member < NR_MM_COUNTERS) {
                if (rss[member] != 0)
                        add_mm_counter(mm, member, rss[member]);
                member++;
        }
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 *
 * @vma:  vma the bad pte was found in
 * @addr: virtual address the pte maps
 * @pte:  the offending pte value
 * @page: page to dump alongside the report, or NULL to skip the page dump
 *
 * Output is rate limited through function-local static counters; the
 * kernel is tainted with TAINT_BAD_PAGE for every report that is printed.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
                          pte_t pte, struct page *page)
{
        /* Walk down to the pmd entry for addr so it can be printed too. */
        pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
        p4d_t *p4d = p4d_offset(pgd, addr);
        pud_t *pud = pud_offset(p4d, addr);
        pmd_t *pmd = pmd_offset(pud, addr);
        struct address_space *mapping;
        pgoff_t index;
        /* Rate-limit state, kept across calls. */
        static unsigned long resume;
        static unsigned long nr_shown;
        static unsigned long nr_unshown;

        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
         */
        if (nr_shown == 60) {
                if (time_before(jiffies, resume)) {
                        nr_unshown++;
                        return;
                }
                if (nr_unshown) {
                        pr_alert("BUG: Bad page map: %lu messages suppressed\n",
                                 nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;
        }
        /* First report of a new burst: open a 60 second window. */
        if (nr_shown++ == 0)
                resume = jiffies + 60 * HZ;

        mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
        index = linear_page_index(vma, addr);

        pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
                 current->comm,
                 (long long)pte_val(pte), (long long)pmd_val(*pmd));
        if (page)
                dump_page(page, "bad pte");
        pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
                 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
        pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
                 vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->fault : NULL,
                 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
                 mapping ? mapping->a_ops->readpage : NULL);
        dump_stack();
        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *        pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 * Returns the struct page for @pte, or NULL when the pte is treated as
 * special.  A pfn above highest_memmap_pfn, or a special pte that matches
 * none of the accepted cases, is reported through print_bad_pte() and also
 * yields NULL.
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);

        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
                /* Common case: not marked special, only the pfn bound is checked. */
                if (likely(!pte_special(pte)))
                        goto check_pfn;
                /* A vma may supply its own lookup for special ptes. */
                if (vma->vm_ops && vma->vm_ops->find_special_page)
                        return vma->vm_ops->find_special_page(vma, addr);
                if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
                        return NULL;
                if (is_zero_pfn(pfn))
                        return NULL;
                if (pte_devmap(pte))
                        return NULL;

                /* Special bit set, but none of the known reasons applies. */
                print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }

        /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                if (vma->vm_flags & VM_MIXEDMAP) {
                        if (!pfn_valid(pfn))
                                return NULL;
                        /* Valid pfn in a mixed map: skips the checks below. */
                        goto out;
                } else {
                        unsigned long off;
                        off = (addr - vma->vm_start) >> PAGE_SHIFT;
                        /* Obeys the linear pfn rule described above: special. */
                        if (pfn == vma->vm_pgoff + off)
                                return NULL;
                        /* Not a COW mapping: every pte is special. */
                        if (!is_cow_mapping(vma->vm_flags))
                                return NULL;
                }
        }

        if (is_zero_pfn(pfn))
                return NULL;

check_pfn:
        if (unlikely(pfn > highest_memmap_pfn)) {
                print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
out:
        return pfn_to_page(pfn);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * pmd-level counterpart of vm_normal_page(): returns the struct page mapped
 * by @pmd, or NULL when the mapping is to be treated as special.
 */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd)
{
        unsigned long pfn = pmd_pfn(pmd);

        /*
         * There is no pmd_special() but there may be special pmds, e.g.
         * in a direct-access (dax) mapping, so the checks of the
         * !CONFIG_ARCH_HAS_PTE_SPECIAL case of vm_normal_page() are
         * replicated here.
         */
        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                /* Mixed map: a valid pfn is returned without further checks. */
                if (vma->vm_flags & VM_MIXEDMAP)
                        return pfn_valid(pfn) ? pfn_to_page(pfn) : NULL;

                /* VM_PFNMAP only: linear-rule pfns and non-COW vmas are special. */
                if (pfn == vma->vm_pgoff +
                           ((addr - vma->vm_start) >> PAGE_SHIFT))
                        return NULL;
                if (!is_cow_mapping(vma->vm_flags))
                        return NULL;
        }

        if (pmd_devmap(pmd) || is_huge_zero_pmd(pmd))
                return NULL;
        if (unlikely(pfn > highest_memmap_pfn))
                return NULL;

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
        return pfn_to_page(pfn);
}
#endif

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

/*
 * Copy one non-present pte (a swap, migration or device-private entry) from
 * the source page table to the destination one.
 *
 * Writable migration and device-private entries in a COW mapping are
 * downgraded to read-only in the source as well as in the copy.  The uffd-wp
 * marker is dropped from the copy when userfaultfd_wp() is false for
 * @dst_vma.  Counters are accumulated in @rss.  @src_vma is not referenced
 * in the body.
 *
 * Returns 0 on success.  If swap_duplicate() fails, the raw swap entry value
 * is returned instead and the destination pte is left unwritten
 * (copy_pte_range() reacts by calling add_swap_count_continuation() and
 * retrying).
 */
static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
        unsigned long vm_flags = dst_vma->vm_flags;
        pte_t pte = *src_pte;
        struct page *page;
        swp_entry_t entry = pte_to_swp_entry(pte);

        if (likely(!non_swap_entry(entry))) {
                if (swap_duplicate(entry) < 0)
                        return entry.val;

                /* make sure dst_mm is on swapoff's mmlist. */
                if (unlikely(list_empty(&dst_mm->mmlist))) {
                        spin_lock(&mmlist_lock);
                        /* Re-check now that mmlist_lock is held. */
                        if (list_empty(&dst_mm->mmlist))
                                list_add(&dst_mm->mmlist,
                                                &src_mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
                rss[MM_SWAPENTS]++;
        } else if (is_migration_entry(entry)) {
                page = migration_entry_to_page(entry);

                rss[mm_counter(page)]++;

                if (is_write_migration_entry(entry) &&
                                is_cow_mapping(vm_flags)) {
                        /*
                         * COW mappings require pages in both
                         * parent and child to be set to read.
                         */
                        make_migration_entry_read(&entry);
                        pte = swp_entry_to_pte(entry);
                        /* Carry the soft-dirty and uffd-wp bits over to the rebuilt pte. */
                        if (pte_swp_soft_dirty(*src_pte))
                                pte = pte_swp_mksoft_dirty(pte);
                        if (pte_swp_uffd_wp(*src_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_private_entry(entry)) {
                page = device_private_entry_to_page(entry);

                /*
                 * Update rss count even for unaddressable pages, as
                 * they should be treated just like normal pages in this
                 * respect.
                 *
                 * We will likely want to have some new rss counters
                 * for unaddressable pages, at some point. But for now
                 * keep things as they are.
                 */
                get_page(page);
                rss[mm_counter(page)]++;
                page_dup_rmap(page, false);

                /*
                 * We do not preserve soft-dirty information, because so
                 * far, checkpoint/restore is the only feature that
                 * requires that. And checkpoint/restore does not work
                 * when a device driver is involved (you cannot easily
                 * save and restore device driver state).
                 */
                if (is_write_device_private_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
                        make_device_private_entry_read(&entry);
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_uffd_wp(*src_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        }
        /* The copy keeps the uffd-wp marker only if userfaultfd_wp(dst_vma) holds. */
        if (!userfaultfd_wp(dst_vma))
                pte = pte_swp_clear_uffd_wp(pte);
        set_pte_at(dst_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Copy a present and normal page if necessary.
 *
 * NOTE! The usual case is that this doesn't need to do
 * anything, and can just return a positive value. That
 * will let the caller know that it can just increase
 * the page refcount and re-use the pte the traditional
 * way.
 *
 * But _if_ we need to copy it because it needs to be
 * pinned in the parent (and the child should get its own
 * copy rather than just a reference to the same page),
 * we'll do that here and return zero to let the caller
 * know we're done.
 *
 * And if we need a pre-allocated page but don't yet have
 * one, return a negative error to let the preallocation
 * code know so that it can do so outside the page table
 * lock.
 *
 * Return values as implemented below:
 *   1        nothing copied; caller shares the page
 *   0        *prealloc was consumed, filled with a copy and mapped at dst_pte
 *   -EAGAIN  a copy is needed but *prealloc is NULL
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
                  struct page **prealloc, pte_t pte, struct page *page)
{
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct page *new_page;

        if (!is_cow_mapping(src_vma->vm_flags))
                return 1;

        /*
         * What we want to do is to check whether this page may
         * have been pinned by the parent process.  If so,
         * instead of wrprotect the pte on both sides, we copy
         * the page immediately so that we'll always guarantee
         * the pinned page won't be randomly replaced in the
         * future.
         *
         * The page pinning checks are just "has this mm ever
         * seen pinning", along with the (inexact) check of
         * the page count. That might give false positives
         * for pinning, but it will work correctly.
         */
        if (likely(!atomic_read(&src_mm->has_pinned)))
                return 1;
        if (likely(!page_maybe_dma_pinned(page)))
                return 1;

        /*
         * The vma->anon_vma of the child process may be NULL
         * because the entire vma does not contain anonymous pages.
         * A BUG will occur when the copy_present_page() passes
         * a copy of a non-anonymous page of that vma to the
         * page_add_new_anon_rmap() to set up new anonymous rmap.
         * Return 1 if the page is not an anonymous page.
         */
        if (!PageAnon(page))
                return 1;

        new_page = *prealloc;
        if (!new_page)
                return -EAGAIN;

        /*
         * We have a prealloc page, all good!  Take it
         * over and copy the page & arm it.
         */
        *prealloc = NULL;
        copy_user_highpage(new_page, page, addr, src_vma);
        __SetPageUptodate(new_page);
        page_add_new_anon_rmap(new_page, dst_vma, addr, false);
        lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
        rss[mm_counter(new_page)]++;

        /* All done, just insert the new page copy in the child */
        pte = mk_pte(new_page, dst_vma->vm_page_prot);
        pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
        if (userfaultfd_pte_wp(dst_vma, *src_pte))
                /* Uffd-wp needs to be delivered to dest pte as well */
                pte = pte_wrprotect(pte_mkuffd_wp(pte));
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Copy one present pte from the source page table into the destination.
 *
 * Returns 0 on success, or -EAGAIN when copy_present_page() needs a
 * preallocated page that the caller has not supplied yet.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
                 struct page **prealloc)
{
        unsigned long vm_flags = src_vma->vm_flags;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pte_t pte = *src_pte;
        struct page *page = vm_normal_page(src_vma, addr, pte);

        if (page) {
                int rc = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                                           addr, rss, prealloc, pte, page);

                /* 0: dst_pte already set up; negative: hand the error back. */
                if (rc <= 0)
                        return rc;

                /* Positive: share the page, so take a reference and an rmap. */
                get_page(page);
                page_dup_rmap(page, false);
                rss[mm_counter(page)]++;
        }

        /* A writable COW pte is write-protected in parent and child alike. */
        if (is_cow_mapping(vm_flags) && pte_write(pte)) {
                ptep_set_wrprotect(src_mm, addr, src_pte);
                pte = pte_wrprotect(pte);
        }

        /* The child's pte of a shared mapping starts clean; all start old. */
        if (vm_flags & VM_SHARED)
                pte = pte_mkclean(pte);
        pte = pte_mkold(pte);

        if (!userfaultfd_wp(dst_vma))
                pte = pte_clear_uffd_wp(pte);

        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Allocate a page for @vma at @addr and charge it to @src_mm's memory
 * cgroup, for later use by copy_present_page().
 *
 * Returns the page, or NULL if either the allocation or the charge fails
 * (the page is released again in the latter case).
 */
static inline struct page *
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
                   unsigned long addr)
{
        struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);

        if (page) {
                if (!mem_cgroup_charge(page, src_mm, GFP_KERNEL)) {
                        cgroup_throttle_swaprate(page, GFP_KERNEL);
                        return page;
                }
                /* Charging failed: give the page back. */
                put_page(page);
        }

        return NULL;
}

/*
 * Copy the ptes covering [addr, end) below @src_pmd into the pte table below
 * @dst_pmd, allocating the destination table if needed.
 *
 * The work is done in batches with both pte locks held.  A batch ends early
 * when rescheduling or lock contention is detected, when a swap entry needs
 * a swap count continuation, or when a page must be preallocated; the locks
 * are then dropped, the condition is dealt with, and copying resumes at the
 * same address.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
        spinlock_t *src_ptl, *dst_ptl;
        int progress, ret = 0;
        int rss[NR_MM_COUNTERS];
        swp_entry_t entry = (swp_entry_t){0};
        struct page *prealloc = NULL;

again:
        /* Each batch starts with a fresh work budget and zeroed rss deltas. */
        progress = 0;
        init_rss_vec(rss);

        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte) {
                ret = -ENOMEM;
                goto out;
        }
        src_pte = pte_offset_map(src_pmd, addr);
        src_ptl = pte_lockptr(src_mm, src_pmd);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        /* Remember the mapped bases; the loop advances src_pte/dst_pte. */
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
        arch_enter_lazy_mmu_mode();

        do {
                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
                 */
                if (progress >= 32) {
                        progress = 0;
                        if (need_resched() ||
                            spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
                                break;
                }
                /* Empty slots cost 1 unit of progress, copied entries 8. */
                if (pte_none(*src_pte)) {
                        progress++;
                        continue;
                }
                if (unlikely(!pte_present(*src_pte))) {
                        entry.val = copy_nonpresent_pte(dst_mm, src_mm,
                                                        dst_pte, src_pte,
                                                        dst_vma, src_vma,
                                                        addr, rss);
                        /* Non-zero: handled after the locks are dropped. */
                        if (entry.val)
                                break;
                        progress += 8;
                        continue;
                }
                /* copy_present_pte() will clear `*prealloc' if consumed */
                ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
                                       addr, rss, &prealloc);
                /*
                 * If we need a pre-allocated page for this pte, drop the
                 * locks, allocate, and try again.
                 */
                if (unlikely(ret == -EAGAIN))
                        break;
                if (unlikely(prealloc)) {
                        /*
                         * pre-alloc page cannot be reused by next time so as
                         * to strictly follow mempolicy (e.g., alloc_page_vma()
                         * will allocate page according to address).  This
                         * could only happen if one pinned pte changed.
                         */
                        put_page(prealloc);
                        prealloc = NULL;
                }
                progress += 8;
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

        arch_leave_lazy_mmu_mode();
        spin_unlock(src_ptl);
        pte_unmap(orig_src_pte);
        /* Publish this batch's counter deltas to dst_mm. */
        add_mm_rss_vec(dst_mm, rss);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();

        if (entry.val) {
                /* copy_nonpresent_pte() bailed out on this swap entry. */
                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
                        ret = -ENOMEM;
                        goto out;
                }
                entry.val = 0;
        } else if (ret) {
                WARN_ON_ONCE(ret != -EAGAIN);
                prealloc = page_copy_prealloc(src_mm, src_vma, addr);
                /* prealloc is NULL here, so there is nothing to release at out. */
                if (!prealloc)
                        return -ENOMEM;
                /* We've captured and resolved the error. Reset, try again. */
                ret = 0;
        }
        /* An early break leaves addr at the entry that still needs copying. */
        if (addr != end)
                goto again;
out:
        if (unlikely(prealloc))
                put_page(prealloc);
        return ret;
}

/*
 * Copy the PMD entries covering [addr, end) from the source PUD entry to
 * the destination PUD entry.  The destination PMD is obtained with
 * pmd_alloc().
 *
 * Entries matching is_swap_pmd(), pmd_trans_huge() or pmd_devmap() are
 * offered to copy_huge_pmd() first; whatever it does not fully handle is
 * copied through copy_pte_range().
 *
 * Returns 0 once the whole range has been processed, -ENOMEM otherwise.
 */
static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *to_mm = dst_vma->vm_mm;
        struct mm_struct *from_mm = src_vma->vm_mm;
        pmd_t *from, *to;
        unsigned long next;

        to = pmd_alloc(to_mm, dst_pud, addr);
        if (!to)
                return -ENOMEM;

        from = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*from) || pmd_trans_huge(*from) ||
                    pmd_devmap(*from)) {
                        int err;

                        VM_BUG_ON_VMA(next - addr != HPAGE_PMD_SIZE, src_vma);
                        err = copy_huge_pmd(to_mm, from_mm, to, from,
                                            addr, dst_vma, src_vma);
                        /* Fully handled: move on to the next entry. */
                        if (!err)
                                continue;
                        if (err == -ENOMEM)
                                return -ENOMEM;
                        /* Any other result: fall through to the pte copy. */
                }
                if (!pmd_none_or_clear_bad(from) &&
                    copy_pte_range(dst_vma, src_vma, to, from, addr, next))
                        return -ENOMEM;
        } while (to++, from++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the PUD entries covering [addr, end) from the source P4D entry to
 * the destination P4D entry.  The destination PUD is obtained with
 * pud_alloc().
 *
 * Entries matching pud_trans_huge() or pud_devmap() are offered to
 * copy_huge_pud() first; whatever it does not fully handle is copied
 * through copy_pmd_range().
 *
 * Returns 0 once the whole range has been processed, -ENOMEM otherwise.
 */
static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *to_mm = dst_vma->vm_mm;
        struct mm_struct *from_mm = src_vma->vm_mm;
        pud_t *from, *to;
        unsigned long next;

        to = pud_alloc(to_mm, dst_p4d, addr);
        if (!to)
                return -ENOMEM;

        from = pud_offset(src_p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_trans_huge(*from) || pud_devmap(*from)) {
                        int err;

                        VM_BUG_ON_VMA(next - addr != HPAGE_PUD_SIZE, src_vma);
                        err = copy_huge_pud(to_mm, from_mm,
                                            to, from, addr, src_vma);
                        /* Fully handled: move on to the next entry. */
                        if (!err)
                                continue;
                        if (err == -ENOMEM)
                                return -ENOMEM;
                        /* Any other result: fall through to the pmd copy. */
                }
                if (!pud_none_or_clear_bad(from) &&
                    copy_pmd_range(dst_vma, src_vma, to, from, addr, next))
                        return -ENOMEM;
        } while (to++, from++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the P4D entries covering [addr, end) from the source PGD entry to
 * the destination PGD entry.  The destination P4D is obtained with
 * p4d_alloc(); every source entry that passes p4d_none_or_clear_bad() is
 * handed to copy_pud_range().
 *
 * Returns 0 once the whole range has been processed, -ENOMEM otherwise.
 */
static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *to_mm = dst_vma->vm_mm;
        p4d_t *from, *to;
        unsigned long next;

        to = p4d_alloc(to_mm, dst_pgd, addr);
        if (!to)
                return -ENOMEM;

        from = p4d_offset(src_pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (!p4d_none_or_clear_bad(from) &&
                    copy_pud_range(dst_vma, src_vma, to, from, addr, next))
                        return -ENOMEM;
        } while (to++, from++, addr = next, addr != end);

        return 0;
}

/*
 * copy_page_range - copy the page tables backing @src_vma into @dst_vma's mm
 *
 * Walks every PGD entry spanning [src_vma->vm_start, src_vma->vm_end) and
 * hands each one that passes pgd_none_or_clear_bad() to copy_p4d_range().
 * hugetlb vmas are delegated wholesale to copy_hugetlb_page_range().
 *
 * Returns 0 on success (including when the copy is skipped entirely), the
 * result of copy_hugetlb_page_range() for a hugetlb vma, the error from
 * track_pfn_copy() for a VM_PFNMAP vma, or -ENOMEM if the page-table walk
 * fails at any level.
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        pgd_t *src_pgd, *dst_pgd;
        unsigned long next;
        unsigned long addr = src_vma->vm_start;
        unsigned long end = src_vma->vm_end;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct mmu_notifier_range range;
        bool is_cow;
        int ret;

        /*
         * Don't copy ptes where a page fault will fill them correctly.
         * Fork becomes much lighter when there are big shared or private
         * readonly mappings. The tradeoff is that copy_page_range is more
         * efficient than faulting.
         */
        if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
            !src_vma->anon_vma)
                return 0;

        if (is_vm_hugetlb_page(src_vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);

        if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
                /*
                 * We do not free on error cases below as remove_vma
                 * gets called on error from higher level routine
                 */
                ret = track_pfn_copy(src_vma);
                if (ret)
                        return ret;
        }

        /*
         * We need to invalidate the secondary MMU mappings only when
         * there could be a permission downgrade on the ptes of the
         * parent mm. And a permission downgrade will only happen if
         * is_cow_mapping() returns true.
         */
        is_cow = is_cow_mapping(src_vma->vm_flags);

        if (is_cow) {
                mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                        0, src_vma, src_mm, addr, end);
                mmu_notifier_invalidate_range_start(&range);
                /*
                 * Disabling preemption is not needed for the write side, as
                 * the read side doesn't spin, but goes to the mmap_lock.
                 *
                 * Use the raw variant of the seqcount_t write API to avoid
                 * lockdep complaining about preemptibility.
                 */
                mmap_assert_write_locked(src_mm);
                raw_write_seqcount_begin(&src_mm->write_protect_seq);
        }

        ret = 0;
        dst_pgd = pgd_offset(dst_mm, addr);
        src_pgd = pgd_offset(src_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(src_pgd))
                        continue;
                /*
                 * Stop at the first failure, but break rather than return so
                 * that the seqcount and notifier opened above are closed.
                 */
                if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                            addr, next))) {
                        ret = -ENOMEM;
                        break;
                }
        } while (dst_pgd++, src_pgd++, addr = next, addr != end);

        /* Undo the is_cow setup in reverse order. */
        if (is_cow) {
                raw_write_seqcount_end(&src_mm->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        }
        return ret;
}

/*
 * Whether we should zap all COWed (private) pages too.
 *
 * With no @details everything is zapped; otherwise COWed pages are zapped
 * only when the caller left details->check_mapping unset.
 */
static inline bool should_zap_cows(struct zap_details *details)
{
        return !details || !details->check_mapping;
}

/*
 * zap_pte_range - tear down the PTEs of one page table within [addr, end)
 *
 * Runs with the PTE lock taken by pte_offset_map_lock().  Present PTEs are
 * cleared and their pages handed to @tlb through __tlb_remove_page();
 * non-present entries (device private, swap, migration) are cleared with
 * the matching rss counter adjusted.  When @details->check_mapping is set,
 * entries whose page has a different page_rmapping() are left in place.
 *
 * The inner loop is left early when need_resched() is true or when
 * __tlb_remove_page() returns non-zero; the lock is then dropped, pending
 * TLB work is flushed and the walk restarts at the current @addr.
 *
 * Returns the address at which the walk finished, which equals @end.
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        struct mm_struct *mm = tlb->mm;
        int force_flush = 0;
        int rss[NR_MM_COUNTERS];
        spinlock_t *ptl;
        pte_t *start_pte;
        pte_t *pte;
        swp_entry_t entry;

        tlb_change_page_size(tlb, PAGE_SIZE);
again:
        /* Each pass accumulates its own rss deltas and applies them below. */
        init_rss_vec(rss);
        start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        pte = start_pte;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        do {
                pte_t ptent = *pte;
                if (pte_none(ptent))
                        continue;

                /* Leave the loop so the lock is dropped; resumed via 'again'. */
                if (need_resched())
                        break;

                if (pte_present(ptent)) {
                        struct page *page;

                        page = vm_normal_page(vma, addr, ptent);
                        if (unlikely(details) && page) {
                                /*
                                 * unmap_shared_mapping_pages() wants to
                                 * invalidate cache without truncating:
                                 * unmap shared but keep private pages.
                                 */
                                if (details->check_mapping &&
                                    details->check_mapping != page_rmapping(page))
                                        continue;
                        }
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                        /* No struct page behind this pte: nothing to account. */
                        if (unlikely(!page))
                                continue;

                        if (!PageAnon(page)) {
                                if (pte_dirty(ptent)) {
                                        force_flush = 1;
                                        set_page_dirty(page);
                                }
                                if (pte_young(ptent) &&
                                    likely(!(vma->vm_flags & VM_SEQ_READ)))
                                        mark_page_accessed(page);
                        }
                        rss[mm_counter(page)]--;
                        page_remove_rmap(page, false);
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
                        if (unlikely(__tlb_remove_page(tlb, page))) {
                                force_flush = 1;
                                /*
                                 * This pte is already handled; step past it
                                 * by hand because 'break' skips the loop's
                                 * own increment.
                                 */
                                addr += PAGE_SIZE;
                                break;
                        }
                        continue;
                }

                /* Not present: decode the pte as a swap-style entry. */
                entry = pte_to_swp_entry(ptent);
                if (is_device_private_entry(entry)) {
                        struct page *page = device_private_entry_to_page(entry);

                        if (unlikely(details && details->check_mapping)) {
                                /*
                                 * unmap_shared_mapping_pages() wants to
                                 * invalidate cache without truncating:
                                 * unmap shared but keep private pages.
                                 */
                                if (details->check_mapping !=
                                    page_rmapping(page))
                                        continue;
                        }

                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        rss[mm_counter(page)]--;
                        page_remove_rmap(page, false);
                        put_page(page);
                        continue;
                }

                if (!non_swap_entry(entry)) {
                        /* Genuine swap entry, hence a private anon page */
                        if (!should_zap_cows(details))
                                continue;
                        rss[MM_SWAPENTS]--;
                } else if (is_migration_entry(entry)) {
                        struct page *page;

                        page = migration_entry_to_page(entry);
                        if (details && details->check_mapping &&
                            details->check_mapping != page_rmapping(page))
                                continue;
                        rss[mm_counter(page)]--;
                }
                if (unlikely(!free_swap_and_cache(entry)))
                        print_bad_pte(vma, addr, ptent, NULL);
                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
        } while (pte++, addr += PAGE_SIZE, addr != end);

        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();

        /* Do the actual TLB flush before dropping ptl */
        if (force_flush)
                tlb_flush_mmu_tlbonly(tlb);
        pte_unmap_unlock(start_pte, ptl);

        /*
         * If we forced a TLB flush (either due to running out of
         * batch buffers or because we needed to flush dirty TLB
         * entries before releasing the ptl), free the batched
         * memory too. Restart if we didn't do everything.
         */
        if (force_flush) {
                force_flush = 0;
                tlb_flush_mmu(tlb);
        }

        if (addr != end) {
                cond_resched();
                goto again;
        }

        return addr;
}

/*
 * zap_pmd_range - zap the PMD entries of one PUD entry within [addr, end)
 *
 * Entries matching is_swap_pmd(), pmd_trans_huge() or pmd_devmap() are
 * either split (when the range covers only part of the entry) or removed
 * with zap_huge_pmd(); everything else is passed to zap_pte_range().
 * cond_resched() is called after every entry.
 *
 * Returns the address at which the walk finished, which equals @end.
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        /* Partial coverage: split, then zap at pte level. */
                        if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
                        else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
                } else if (details && details->single_page &&
                           PageTransCompound(details->single_page) &&
                           next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
                        spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
                        /*
                         * Take and drop THP pmd lock so that we cannot return
                         * prematurely, while zap_huge_pmd() has cleared *pmd,
                         * but not yet decremented compound_mapcount().
                         */
                        spin_unlock(ptl);
                }

                /*
                 * Here there can be other concurrent MADV_DONTNEED or
                 * trans huge page faults running, and if the pmd is
                 * none or trans huge it can change under us. This is
                 * because MADV_DONTNEED holds the mmap_lock in read
                 * mode.
                 */
                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        goto next;
                /* zap_pte_range() returns where it stopped; continue from there. */
                next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
                cond_resched();
        } while (pmd++, addr = next, addr != end);

        return addr;
}

/*
 * zap_pud_range - zap the PUD entries of one P4D entry within [addr, end)
 *
 * Entries matching pud_trans_huge() or pud_devmap() are either split (when
 * the range covers only part of the entry) or removed with zap_huge_pud();
 * everything else that passes pud_none_or_clear_bad() is passed to
 * zap_pmd_range().
 *
 * Returns the address at which the walk finished, which equals @end.
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
                        /* Partial coverage: split, then zap at pmd level. */
                        if (next - addr != HPAGE_PUD_SIZE) {
                                mmap_assert_locked(tlb->mm);
                                split_huge_pud(vma, pud, addr);
                        } else if (zap_huge_pud(tlb, vma, pud, addr))
                                goto next;
                        /* fall through */
                }
                /*
                 * 'continue' goes straight to the loop condition, so
                 * cond_resched() below is not called for this entry.
                 */
                if (pud_none_or_clear_bad(pud))
                        continue;
                next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
                cond_resched();
        } while (pud++, addr = next, addr != end);

        return addr;
}

/*
 * zap_p4d_range - zap the P4D entries of one PGD entry within [addr, end)
 *
 * Every entry that passes p4d_none_or_clear_bad() is handed to
 * zap_pud_range(), and the walk resumes at the address it returns.
 *
 * Returns the address at which the walk finished, which equals @end.
 */
static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        p4d_t *cursor = p4d_offset(pgd, addr);
        unsigned long next;

        do {
                next = p4d_addr_end(addr, end);
                if (!p4d_none_or_clear_bad(cursor))
                        next = zap_pud_range(tlb, vma, cursor, addr, next,
                                             details);
        } while (cursor++, addr = next, addr != end);

        return addr;
}

/*
 * unmap_page_range - zap the page tables of @vma within [addr, end)
 *
 * The walk is bracketed by tlb_start_vma()/tlb_end_vma().  Every PGD entry
 * that passes pgd_none_or_clear_bad() is handed to zap_p4d_range(), and
 * the walk resumes at the address it returns.  @addr must be below @end.
 */
void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details)
{
        pgd_t *cursor;
        unsigned long next;

        BUG_ON(addr >= end);

        tlb_start_vma(tlb, vma);
        cursor = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (!pgd_none_or_clear_bad(cursor))
                        next = zap_p4d_range(tlb, vma, cursor, addr, next,
                                             details);
        } while (cursor++, addr = next, addr != end);
        tlb_end_vma(tlb, vma);
}


/*
 * unmap_single_vma - unmap the part of @vma that overlaps
 * [start_addr, end_addr)
 *
 * The requested range is clamped to the vma; nothing is done when the two
 * do not overlap.  hugetlb vmas go through __unmap_hugepage_range_final()
 * under the i_mmap write lock, all others through unmap_page_range().
 */
static void unmap_single_vma(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr,
                struct zap_details *details)
{
        unsigned long start = max(vma->vm_start, start_addr);
        unsigned long end;

        if (start >= vma->vm_end)
                return;
        end = min(vma->vm_end, end_addr);
        if (end <= vma->vm_start)
                return;

        if (vma->vm_file)
                uprobe_munmap(vma, start, end);

        if (unlikely(vma->vm_flags & VM_PFNMAP))
                untrack_pfn(vma, 0, 0);

        /* An empty clamped range leaves nothing to unmap. */
        if (start == end)
                return;

        if (likely(!is_vm_hugetlb_page(vma))) {
                unmap_page_range(tlb, vma, start, end, details);
                return;
        }

        /*
         * It is undesirable to test vma->vm_file as it should be non-null
         * for valid hugetlb area.  However, vm_file will be NULL in the
         * error cleanup path of mmap_region.  When hugetlbfs ->mmap method
         * fails, mmap_region() nullifies vma->vm_file before calling this
         * function to clean up.  Since no pte has actually been setup, it
         * is safe to do nothing in this case.
         */
        if (vma->vm_file) {
                i_mmap_lock_write(vma->vm_file->f_mapping);
                __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
                i_mmap_unlock_write(vma->vm_file->f_mapping);
        }
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Starting at @vma, walk the ->vm_next chain and unmap every vma that
 * begins below @end_addr.  Only addresses between @start_addr and
 * @end_addr are unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr)
{
        struct mmu_notifier_range range;

        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);
        while (vma && vma->vm_start < end_addr) {
                unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
                vma = vma->vm_next;
        }
        mmu_notifier_invalidate_range_end(&range);
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * Starting at @vma, every vma on the ->vm_next chain that begins below the
 * end of the range is unmapped, inside one mmu_gather and one mmu notifier
 * invalidation.
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
                unsigned long size)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                start, start + size);
        tlb_gather_mmu(&tlb, mm, start, range.end);
        update_hiwater_rss(mm);
        mmu_notifier_invalidate_range_start(&range);
        while (vma && vma->vm_start < range.end) {
                unmap_single_vma(&tlb, vma, start, range.end, NULL);
                vma = vma->vm_next;
        }
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb, start, range.end);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * Only @vma itself is unmapped, so the range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                address, address + size);
        tlb_gather_mmu(&tlb, mm, address, range.end);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(&range);
        unmap_single_vma(&tlb, vma, address, range.end, details);
        mmu_notifier_invalidate_range_end(&range);

        tlb_finish_mmu(&tlb, address, range.end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma;
 * otherwise the call silently does nothing.
 *
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                unsigned long size)
{
        /* Only VM_PFNMAP vmas are handled here. */
        if (!(vma->vm_flags & VM_PFNMAP))
                return;

        /* The whole range has to lie inside the vma. */
        if (address < vma->vm_start || address + size > vma->vm_end)
                return;

        zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

/*
 * Walk from the PGD of @mm down to the PMD entry for @addr, going through
 * p4d_alloc(), pud_alloc() and pmd_alloc() on the way.
 *
 * Returns the PMD pointer, or NULL if any of the three allocations fails.
 */
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d = p4d_alloc(mm, pgd, addr);
        pud_t *pud = p4d ? pud_alloc(mm, p4d, addr) : NULL;
        pmd_t *pmd = pud ? pmd_alloc(mm, pud, addr) : NULL;

        if (pmd)
                VM_BUG_ON(pmd_trans_huge(*pmd));
        return pmd;
}

/*
 * Look up the PTE for @addr in @mm through walk_to_pmd() and
 * pte_alloc_map_lock(); the lock pointer is stored in @ptl.
 *
 * Returns NULL if walk_to_pmd() fails, otherwise whatever
 * pte_alloc_map_lock() returns.
 */
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
{
        pmd_t *pmd = walk_to_pmd(mm, addr);

        return pmd ? pte_alloc_map_lock(mm, pmd, addr, ptl) : NULL;
}

static int validate_page_before_insert(struct page *page)
{
        if (PageAnon(page) || PageSlab(page) || page_has_type(page))
                return -EINVAL;
        flush_dcache_page(page);
        return 0;
}

/*
 * Install @page at @pte for @addr with protection @prot.
 *
 * On success a page reference is taken, the mm's file counter is bumped
 * and a file rmap is added.  The "_locked" suffix suggests the caller
 * already holds the PTE lock; nothing is locked here.
 *
 * Returns 0 on success, -EBUSY if the pte is already populated.
 */
static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        if (pte_none(*pte)) {
                /* Ok, finally just insert the thing.. */
                get_page(page);
                inc_mm_counter_fast(mm, mm_counter_file(page));
                page_add_file_rmap(page, false);
                set_pte_at(mm, addr, pte, mk_pte(page, prot));
                return 0;
        }

        return -EBUSY;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 *
 * Returns 0 on success, the error from validate_page_before_insert(),
 * -ENOMEM if no locked pte could be obtained, or the error from
 * insert_page_into_pte_locked().
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page, pgprot_t prot)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pte_t *pte;
        int err;

        err = validate_page_before_insert(page);
        if (err)
                return err;

        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
                return -ENOMEM;

        err = insert_page_into_pte_locked(mm, pte, addr, page, prot);
        pte_unmap_unlock(pte, ptl);
        return err;
}

#ifdef pte_index
/*
 * Per-page step of insert_pages(): check the page, then install it.
 *
 * Returns -EINVAL for a page whose page_count() is zero, the error from
 * validate_page_before_insert(), or the result of
 * insert_page_into_pte_locked().
 */
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        int err = page_count(page) ? validate_page_before_insert(page)
                                   : -EINVAL;

        if (!err)
                err = insert_page_into_pte_locked(mm, pte, addr, page, prot);
        return err;
}

/*
 * insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop. Arch *must* define pte_index.
 *
 * @num is in/out: on entry the number of pages to insert, on return the
 * number that were not inserted.  Pages are inserted one PTE table at a
 * time, in batches of at most 8 per hold of the PTE lock.
 *
 * Returns 0 when every page was inserted, -EFAULT if walk_to_pmd() fails,
 * -ENOMEM if pte_alloc() fails, or the error from
 * insert_page_in_batch_locked() for the first page that could not be
 * inserted.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num, pgprot_t prot)
{
        pmd_t *pmd = NULL;
        pte_t *start_pte, *pte;
        spinlock_t *pte_lock;
        struct mm_struct *const mm = vma->vm_mm;
        unsigned long curr_page_idx = 0;
        unsigned long remaining_pages_total = *num;
        unsigned long pages_to_write_in_pmd;
        int ret;
more:
        /* One pass per PMD entry; 'more' is re-entered until all pages are done. */
        ret = -EFAULT;
        pmd = walk_to_pmd(mm, addr);
        if (!pmd)
                goto out;

        /* Never run past the end of the PTE table that @addr falls in. */
        pages_to_write_in_pmd = min_t(unsigned long,
                remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

        /* Allocate the PTE if necessary; takes PMD lock once only. */
        ret = -ENOMEM;
        if (pte_alloc(mm, pmd))
                goto out;

        while (pages_to_write_in_pmd) {
                int pte_idx = 0;
                const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

                start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
                for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
                        int err = insert_page_in_batch_locked(mm, pte,
                                addr, pages[curr_page_idx], prot);
                        if (unlikely(err)) {
                                pte_unmap_unlock(start_pte, pte_lock);
                                ret = err;
                                /* Only the pages of this batch already inserted count. */
                                remaining_pages_total -= pte_idx;
                                goto out;
                        }
                        addr += PAGE_SIZE;
                        ++curr_page_idx;
                }
                pte_unmap_unlock(start_pte, pte_lock);
                pages_to_write_in_pmd -= batch_size;
                remaining_pages_total -= batch_size;
        }
        if (remaining_pages_total)
                goto more;
        ret = 0;
out:
        /* Report how many pages are still unmapped, on success and on error. */
        *num = remaining_pages_total;
        return ret;
}
#endif  /* ifdef pte_index */

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 *
 * Return: %0 when every page was mapped, negative error code otherwise.
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num)
{
#ifdef pte_index
        const unsigned long last_addr = addr + (*num * PAGE_SIZE) - 1;

        /* The whole [addr, last_addr] span has to lie inside the vma. */
        if (addr < vma->vm_start || last_addr >= vma->vm_end)
                return -EFAULT;

        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vma->vm_flags |= VM_MIXEDMAP;
        }

        /* Defer page refcount checking till we're about to map that page. */
        return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
        const unsigned long total = *num;
        unsigned long done = 0;
        /* Stays -EINVAL when there are no pages to insert. */
        int err = -EINVAL;

        /* No pte_index: fall back to inserting the pages one at a time. */
        while (done < total) {
                err = vm_insert_page(vma, addr + (PAGE_SIZE * done),
                                     pages[done]);
                if (err)
                        break;
                ++done;
        }
        *num = total - done;
        return err;
#endif  /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page)
{
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
        if (!page_count(page))
                return -EINVAL;
        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vma->vm_flags |= VM_MIXEDMAP;
        }
        return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 * Pages @offset .. @offset + vma_pages(@vma) - 1 of @pages are inserted
 * one by one with vm_insert_page(), starting at vma->vm_start.
 *
 * Return: 0 on success, -ENXIO if @offset or the vma size does not fit in
 * the @num pages available, or the error from vm_insert_page().  Pages
 * inserted before a failure are left in place.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num, unsigned long offset)
{
        unsigned long count = vma_pages(vma);
        unsigned long uaddr = vma->vm_start;
        /*
         * Same type as @count: an int index would be compared after
         * conversion to unsigned long and could overflow for very large
         * vmas.
         */
        unsigned long i;
        int ret;

        /* Fail if the user requested offset is beyond the end of the object */
        if (offset >= num)
                return -ENXIO;

        /* Fail if the user requested size exceeds available object size */
        if (count > num - offset)
                return -ENXIO;

        for (i = 0; i < count; i++) {
                ret = vm_insert_page(vma, uaddr, pages[offset + i]);
                if (ret < 0)
                        return ret;
                uaddr += PAGE_SIZE;
        }

        return 0;
}

/**
 * vm_map_pages - map a range of kernel pages, honouring the vma's vm_pgoff
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object made of @num pages into @vma.  The page offset the user
 * asked for (@vma->vm_pgoff) selects the first page of the array that is
 * mapped.
 *
 * When inserting a page fails, the function returns at once and the pages
 * inserted so far stay mapped.  A caller running from an mmap handler can
 * simply propagate the error, because its own caller destroys the vma and
 * with it the pages already inserted.  Any other caller has to arrange for
 * unmap_region() to be called itself.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        unsigned long first_page = vma->vm_pgoff;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map a range of kernel pages starting at page zero
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Works like vm_map_pages(), but the offset into @pages is forced to 0
 * instead of being taken from the vma.  Meant for drivers that never
 * took vm_pgoff into account.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        const unsigned long first_page = 0;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages_zero);

/*
 * Install a pte for @pfn at @addr in @vma, using protection @prot.
 *
 * An empty slot receives a new devmap or special pte.  An occupied slot
 * is left alone unless @mkwrite is set and it already maps @pfn, in which
 * case the existing pte is made young, dirty and (if the vma allows it)
 * writable.
 *
 * Returns VM_FAULT_OOM when the pte cannot be obtained, VM_FAULT_NOPAGE
 * in every other case.
 */
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, pgprot_t prot, bool mkwrite)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pte_t *ptep;
        pte_t newpte;

        ptep = get_locked_pte(mm, addr, &ptl);
        if (!ptep)
                return VM_FAULT_OOM;

        if (pte_none(*ptep)) {
                /* Nothing mapped yet: build and install a fresh entry. */
                if (pfn_t_devmap(pfn))
                        newpte = pte_mkdevmap(pfn_t_pte(pfn, prot));
                else
                        newpte = pte_mkspecial(pfn_t_pte(pfn, prot));

                if (mkwrite) {
                        newpte = pte_mkyoung(newpte);
                        newpte = maybe_mkwrite(pte_mkdirty(newpte), vma);
                }

                set_pte_at(mm, addr, ptep, newpte);
                /* XXX: why not for insert_page? */
                update_mmu_cache(vma, addr, ptep);
        } else if (mkwrite) {
                /*
                 * On a read fault in a private mapping the PFN handed in
                 * may differ from the mapped one when the mapped PFN is a
                 * writeable COW page.  Here, however, a writable PTE for
                 * a shared mapping is being created and the PFNs are
                 * expected to agree.  A mismatch most likely means a race
                 * with block allocation and mapping invalidation, so the
                 * update is skipped.
                 */
                if (pte_pfn(*ptep) != pfn_t_to_pfn(pfn)) {
                        WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*ptep)));
                } else {
                        newpte = pte_mkyoung(*ptep);
                        newpte = maybe_mkwrite(pte_mkdirty(newpte), vma);
                        if (ptep_set_access_flags(vma, addr, ptep, newpte, 1))
                                update_mmu_cache(vma, addr, ptep);
                }
        }

        pte_unmap_unlock(ptep, ptl);
        return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * Same as vmf_insert_pfn(), with the difference that the driver may pick
 * the pgprot for each page individually.
 *
 * Only meaningful for IO mappings; it makes no sense for COW mappings.
 * Splitting the range over several vmas is generally the better choice,
 * so use vmf_insert_pfn_prot only where multiple VMAs are impractical.
 *
 * vmf_insert_mixed_prot() discusses what it implies to pass a @pgprot
 * that differs from @vma->vm_page_prot.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot)
{
        const unsigned long both = VM_PFNMAP | VM_MIXEDMAP;
        unsigned long kind = vma->vm_flags & both;
        pfn_t dev_pfn = __pfn_to_pfn_t(pfn, PFN_DEV);

        /*
         * Architectures that have pte_special could technically do without
         * any of these restrictions (remap_pfn_range likewise).  They are
         * kept for everyone so that testing stays consistent and all
         * architectures offer the same features.
         *
         * Exactly one of VM_PFNMAP / VM_MIXEDMAP must be set, a PFNMAP vma
         * must not be a COW mapping, and a MIXEDMAP vma must not be given
         * a pfn for which pfn_valid() is true.
         */
        BUG_ON(!kind);
        BUG_ON(kind == both);
        BUG_ON((kind & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
        BUG_ON((kind & VM_MIXEDMAP) && pfn_valid(pfn));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        if (!pfn_modify_allowed(pfn, pgprot))
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, dev_pfn);

        return insert_pfn(vma, addr, dev_pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
{
        return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);

/*
 * Tell whether @pfn may be inserted into @vma through the mixed-map path.
 * The tests mirror the abort conditions in vm_normal_page: the vma is
 * VM_MIXEDMAP, or the pfn is devmap, special, or the zero pfn.
 */
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
        return (vma->vm_flags & VM_MIXEDMAP) ||
               pfn_t_devmap(pfn) ||
               pfn_t_special(pfn) ||
               is_zero_pfn(pfn_t_to_pfn(pfn));
}

/*
 * Common backend of the vmf_insert_mixed*() family: map @pfn at @addr in
 * @vma with protection @pgprot, either as a raw pfn (insert_pfn) or as a
 * refcounted page (insert_page).  @mkwrite is only forwarded to
 * insert_pfn().
 */
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn, pgprot_t pgprot,
                bool mkwrite)
{
        int rc;

        BUG_ON(!vm_mixed_ok(vma, pfn));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, pfn);

        if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
                return VM_FAULT_SIGBUS;

        /*
         * Without pte special the pfn_valid() based VM_MIXEDMAP scheme
         * (see vm_normal_page) is in use, which means a page *must* be
         * refcounted whenever pfn_valid is true -- so insert_page has to
         * be used instead of insert_pfn.  Were a zero_pfn inserted into a
         * VM_MIXEDMAP without pte special, it would be refcounted there
         * like a normal page.
         *
         * Everything else takes the raw pfn path.
         */
        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) ||
            pfn_t_devmap(pfn) || !pfn_t_valid(pfn))
                return insert_pfn(vma, addr, pfn, pgprot, mkwrite);

        /*
         * From here on insert_page() is used no matter whether the flags
         * given by the caller make pfn_t_has_page() false.
         */
        rc = insert_page(vma, addr, pfn_to_page(pfn_t_to_pfn(pfn)), pgprot);

        if (rc == -ENOMEM)
                return VM_FAULT_OOM;
        /* -EBUSY is deliberately reported as VM_FAULT_NOPAGE. */
        if (rc < 0 && rc != -EBUSY)
                return VM_FAULT_SIGBUS;

        return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * Identical to vmf_insert_mixed(), except that the driver chooses the
 * pgprot for each page.
 *
 * The typical use is a driver that needs caching or encryption bits other
 * than those in @vma->vm_page_prot, because the caching or encryption
 * mode is not necessarily known when mmap() is called.  That is fine as
 * long as the core vm does not rely on @vma->vm_page_prot to set caching
 * and encryption bits for such vmas (COW pages excepted).  The core vm
 * guarantees this by modifying these page table entries only through
 * functions that leave caching and encryption bits alone, using
 * pte_modify() where required (mprotect() is an example).  Furthermore,
 * new page-table entries are only ever created through the fault()
 * callback and never from the value of vma->vm_page_prot, apart from
 * entries that point to anonymous pages produced by COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
                                 pfn_t pfn, pgprot_t pgprot)
{
        const bool mkwrite = false;

        return __vm_insert_mixed(vma, addr, pfn, pgprot, mkwrite);
}
EXPORT_SYMBOL(vmf_insert_mixed_prot);

/*
 * Insert @pfn at @addr using the vma's own page protection, without
 * requesting a writable entry.
 */
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                pfn_t pfn)
{
        pgprot_t vma_prot = vma->vm_page_prot;
        const bool mkwrite = false;

        return __vm_insert_mixed(vma, addr, pfn, vma_prot, mkwrite);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 *  Write-enabling variant of vmf_insert_mixed().
 *
 *  Should inserting the PTE fail because somebody else installed an entry
 *  in the meantime, that counts as success: the assumption is that the
 *  very same entry was inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn)
{
        pgprot_t vma_prot = vma->vm_page_prot;
        const bool mkwrite = true;

        return __vm_insert_mixed(vma, addr, pfn, vma_prot, mkwrite);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

/*
 * Map a range of physical memory into the requested pages: one special
 * pte per page for [addr, end), starting with @pfn.  Every slot must be
 * empty on entry (enforced with BUG_ON).
 *
 * Returns 0 on success, -ENOMEM if the pte table cannot be allocated, or
 * -EACCES as soon as pfn_modify_allowed() rejects a pfn; entries written
 * before that point are left in place.
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        pte_t *first_pte, *cur;
        spinlock_t *ptl;
        int err = 0;

        first_pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!first_pte)
                return -ENOMEM;
        cur = first_pte;

        arch_enter_lazy_mmu_mode();
        for (;;) {
                BUG_ON(!pte_none(*cur));
                if (!pfn_modify_allowed(pfn, prot)) {
                        err = -EACCES;
                        break;
                }
                set_pte_at(mm, addr, cur, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;

                cur++;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
        }
        arch_leave_lazy_mmu_mode();

        /* Unlock through the pointer that was originally mapped. */
        pte_unmap_unlock(first_pte, ptl);
        return err;
}

/*
 * PMD level of remap_pfn_range(): walk the pmd entries covering
 * [addr, end) and remap each piece at pte level.  Returns 0, -ENOMEM if
 * the pmd cannot be allocated, or the first error from remap_pte_range().
 */
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        unsigned long next;
        pmd_t *pmd;
        int err;

        /* Bias pfn so that "pfn + (addr >> PAGE_SHIFT)" tracks addr. */
        pfn -= addr >> PAGE_SHIFT;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return -ENOMEM;
        VM_BUG_ON(pmd_trans_huge(*pmd));

        for (;;) {
                next = pmd_addr_end(addr, end);
                err = remap_pte_range(mm, pmd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;

                pmd++;
                addr = next;
                if (addr == end)
                        break;
        }
        return 0;
}

/*
 * PUD level of remap_pfn_range(): walk the pud entries covering
 * [addr, end) and hand each piece to remap_pmd_range().  Returns 0,
 * -ENOMEM if the pud cannot be allocated, or the first error from below.
 */
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        unsigned long next;
        pud_t *pud;
        int err;

        /* Bias pfn so that "pfn + (addr >> PAGE_SHIFT)" tracks addr. */
        pfn -= addr >> PAGE_SHIFT;

        pud = pud_alloc(mm, p4d, addr);
        if (!pud)
                return -ENOMEM;

        for (;;) {
                next = pud_addr_end(addr, end);
                err = remap_pmd_range(mm, pud, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;

                pud++;
                addr = next;
                if (addr == end)
                        break;
        }
        return 0;
}

/*
 * P4D level of remap_pfn_range(): walk the p4d entries covering
 * [addr, end) and hand each piece to remap_pud_range().  Returns 0,
 * -ENOMEM if the p4d cannot be allocated, or the first error from below.
 */
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        unsigned long next;
        p4d_t *p4d;
        int err;

        /* Bias pfn so that "pfn + (addr >> PAGE_SHIFT)" tracks addr. */
        pfn -= addr >> PAGE_SHIFT;

        p4d = p4d_alloc(mm, pgd, addr);
        if (!p4d)
                return -ENOMEM;

        for (;;) {
                next = p4d_addr_end(addr, end);
                err = remap_pud_range(mm, p4d, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;

                p4d++;
                addr = next;
                if (addr == end)
                        break;
        }
        return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: calling this is only safe while the mm semaphore is held.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                    unsigned long pfn, unsigned long size, pgprot_t prot)
{
        const unsigned long len = PAGE_ALIGN(size);
        const unsigned long first_pfn = pfn;
        unsigned long end = addr + len;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long next;
        pgd_t *pgd;
        int err;

        if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
                return -EINVAL;

        /*
         * Pages that are physically remapped are special, and the rest of
         * the kernel is told so through the vma flags set below:
         *   VM_IO: do not look at these pages (an access may have side
         *        effects).
         *   VM_PFNMAP: for the core MM the base pages are raw PFN
         *        mappings with no "struct page" behind them.
         *   VM_DONTEXPAND: no vma merging and no expanding with mremap().
         *   VM_DONTDUMP: leave the vma out of core dumps, even when VM_IO
         *        is turned off.
         *
         * Copy-on-write behaviour that some programs rely on needs a
         * horrible special case: the "original" un-COW'ed pages are
         * identified by matching them against "vma->vm_pgoff", which is
         * why a COW mapping must be remapped as a whole.  See
         * vm_normal_page() for details.
         */
        if (is_cow_mapping(vma->vm_flags)) {
                if (addr != vma->vm_start || end != vma->vm_end)
                        return -EINVAL;
                vma->vm_pgoff = pfn;
        }

        /* Any tracking failure is reported as -EINVAL. */
        if (track_pfn_remap(vma, &prot, first_pfn, addr, len))
                return -EINVAL;

        vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;

        BUG_ON(addr >= end);
        /* Bias pfn so that "pfn + (addr >> PAGE_SHIFT)" tracks addr. */
        pfn -= addr >> PAGE_SHIFT;
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);

        for (;;) {
                next = pgd_addr_end(addr, end);
                err = remap_p4d_range(mm, pgd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        break;

                pgd++;
                addr = next;
                if (addr == end)
                        break;
        }

        /* Undo the pfn tracking when the walk failed part-way. */
        if (err)
                untrack_pfn(vma, first_pfn, len);

        return err;
}
EXPORT_SYMBOL(remap_pfn_range);

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * A simplified io_remap_pfn_range() for the common driver case: the
 * driver only supplies the physical memory range, everything else is
 * derived from the vma.
 *
 * NOTE! A driver that wants write-combining or similar may have to
 * adjust vma->vm_page_prot before calling this.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
        unsigned long first_pfn, npages, span;

        /* Reject a physical range that wraps around. */
        if (start + len < start)
                return -EINVAL;

        /*
         * Mapping something that is not page-aligned is a *really* bad
         * idea, but it has been tolerated historically since IO memory
         * may simply have a smaller alignment.  Extend the length by the
         * sub-page offset of @start and round up to whole pages.
         */
        len += start & ~PAGE_MASK;
        first_pfn = start >> PAGE_SHIFT;
        npages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
        if (first_pfn + npages < first_pfn)
                return -EINVAL;

        /* The mapping begins 'vm_pgoff' pages into the area. */
        if (vma->vm_pgoff > npages)
                return -EINVAL;
        first_pfn += vma->vm_pgoff;
        npages -= vma->vm_pgoff;

        /* The whole vma has to fit into what is left of the area. */
        span = vma->vm_end - vma->vm_start;
        if ((span >> PAGE_SHIFT) > npages)
                return -EINVAL;

        return io_remap_pfn_range(vma, vma->vm_start, first_pfn, span,
                                  vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

/*
 * PTE level of __apply_to_page_range(): call @fn(pte, addr, @data) for
 * the ptes covering [addr, end) below @pmd.
 *
 * @create: when true the pte table is allocated if needed and @fn is
 *          called for every pte; when false only ptes that are not
 *          pte_none() are passed to @fn.
 * @mask:   PGTBL_PTE_MODIFIED is or-ed into *@mask.
 *
 * For init_mm the kernel page-table accessors are used and no lock is
 * taken; for any other mm the pte table is mapped and locked for the
 * duration of the walk.
 *
 * Returns 0, -ENOMEM if the pte table cannot be allocated, or the first
 * non-zero value returned by @fn (the walk stops there).
 */
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pte_t *pte, *mapped_pte;
        int err = 0;
        spinlock_t *ptl;

        if (create) {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_alloc_kernel_track(pmd, addr, mask) :
                        pte_alloc_map_lock(mm, pmd, addr, &ptl);
                if (!pte)
                        return -ENOMEM;
        } else {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_offset_kernel(pmd, addr) :
                        pte_offset_map_lock(mm, pmd, addr, &ptl);
        }

        BUG_ON(pmd_huge(*pmd));

        arch_enter_lazy_mmu_mode();

        if (fn) {
                /*
                 * The pte cursor must advance on every iteration, not only
                 * when @fn is called: otherwise, with !create, an empty
                 * pte would leave the cursor stuck while addr moves on,
                 * and later addresses would be checked against (and @fn
                 * applied to) the wrong pte.
                 */
                do {
                        if (create || !pte_none(*pte)) {
                                err = fn(pte, addr, data);
                                if (err)
                                        break;
                        }
                } while (pte++, addr += PAGE_SIZE, addr != end);
        }
        *mask |= PGTBL_PTE_MODIFIED;

        arch_leave_lazy_mmu_mode();

        /*
         * Unlock via the pointer that was mapped.  Deriving it from the
         * cursor ("pte - 1") is wrong whenever the cursor never moved,
         * e.g. when @fn is NULL.
         */
        if (mm != &init_mm)
                pte_unmap_unlock(mapped_pte, ptl);
        return err;
}

/*
 * PMD level of __apply_to_page_range(): visit the pmd entries covering
 * [addr, end) and descend to pte level.  With @create the pmd is
 * allocated and every entry is descended into; without it, entries for
 * which pmd_none_or_clear_bad() is true are skipped.
 *
 * Returns 0, -ENOMEM on allocation failure, or the first error from
 * apply_to_pte_range().
 */
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        unsigned long next;
        pmd_t *pmd;
        int err = 0;

        BUG_ON(pud_huge(*pud));

        if (!create) {
                pmd = pmd_offset(pud, addr);
        } else {
                pmd = pmd_alloc_track(mm, pud, addr, mask);
                if (!pmd)
                        return -ENOMEM;
        }

        for (;;) {
                next = pmd_addr_end(addr, end);
                if (create || !pmd_none_or_clear_bad(pmd)) {
                        err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
                                                 create, mask);
                        if (err)
                                break;
                }

                pmd++;
                addr = next;
                if (addr == end)
                        break;
        }
        return err;
}

/*
 * PUD level of __apply_to_page_range(): visit the pud entries covering
 * [addr, end) and descend to pmd level.  With @create the pud is
 * allocated and every entry is descended into; without it, entries for
 * which pud_none_or_clear_bad() is true are skipped.
 *
 * Returns 0, -ENOMEM on allocation failure, or the first error from
 * apply_to_pmd_range().
 */
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        unsigned long next;
        pud_t *pud;
        int err = 0;

        if (!create) {
                pud = pud_offset(p4d, addr);
        } else {
                pud = pud_alloc_track(mm, p4d, addr, mask);
                if (!pud)
                        return -ENOMEM;
        }

        for (;;) {
                next = pud_addr_end(addr, end);
                if (create || !pud_none_or_clear_bad(pud)) {
                        err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
                                                 create, mask);
                        if (err)
                                break;
                }

                pud++;
                addr = next;
                if (addr == end)
                        break;
        }
        return err;
}

/*
 * P4D level of __apply_to_page_range(): visit the p4d entries covering
 * [addr, end) and descend to pud level.  With @create the p4d is
 * allocated and every entry is descended into; without it, entries for
 * which p4d_none_or_clear_bad() is true are skipped.
 *
 * Returns 0, -ENOMEM on allocation failure, or the first error from
 * apply_to_pud_range().
 */
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        unsigned long next;
        p4d_t *p4d;
        int err = 0;

        if (!create) {
                p4d = p4d_offset(pgd, addr);
        } else {
                p4d = p4d_alloc_track(mm, pgd, addr, mask);
                if (!p4d)
                        return -ENOMEM;
        }

        for (;;) {
                next = p4d_addr_end(addr, end);
                if (create || !p4d_none_or_clear_bad(p4d)) {
                        err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
                                                 create, mask);
                        if (err)
                                break;
                }

                p4d++;
                addr = next;
                if (addr == end)
                        break;
        }
        return err;
}

/*
 * Walk the page tables of @mm over [addr, addr + size) and call @fn on
 * the leaf entries.  With @create set, missing page-table levels are
 * allocated; otherwise top-level entries for which
 * pgd_none_or_clear_bad() is true are skipped.
 *
 * When the walk touched a level listed in ARCH_PAGE_TABLE_SYNC_MASK,
 * arch_sync_kernel_mappings() is called for the whole range afterwards,
 * also if the walk ended with an error.
 *
 * Returns -EINVAL (with a warning) for an empty or wrapping range,
 * otherwise 0 or the first error from the lower levels.
 */
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn,
                                 void *data, bool create)
{
        const unsigned long start = addr;
        unsigned long end = addr + size;
        unsigned long next;
        pgtbl_mod_mask mask = 0;
        pgd_t *pgd;
        int err = 0;

        if (WARN_ON(addr >= end))
                return -EINVAL;

        pgd = pgd_offset(mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (create || !pgd_none_or_clear_bad(pgd)) {
                        err = apply_to_p4d_range(mm, pgd, addr, next, fn,
                                                 data, create, &mask);
                        if (err)
                                break;
                }
        } while (pgd++, addr = next, addr != end);

        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, start + size);

        return err;
}

/*
 * Walk a region of virtual memory, allocating page tables wherever they
 * are missing, and invoke the supplied function on each leaf page table
 * entry.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                        unsigned long size, pte_fn_t fn, void *data)
{
        const bool create = true;

        return __apply_to_page_range(mm, addr, size, fn, data, create);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Walk a region of virtual memory and invoke the supplied function on
 * each leaf page table entry that exists.
 *
 * In contrast to apply_to_page_range, absent page tables are _not_
 * filled in.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn, void *data)
{
        const bool create = false;

        return __apply_to_page_range(mm, addr, size, fn, data, create);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);

/*
 * The entry on which handle_pte_fault bases its choice of fault handler
 * was read non-atomically.  On architectures or configurations where such
 * a read can yield a mix of unmatched parts (e.g. i386 with PAE),
 * do_swap_page has to re-check the entry under the lock before it unmaps
 * the pte and commits to anything.  (do_wp_page is only reached after
 * such a check has been made, and do_anonymous_page can safely check
 * later.)
 *
 * Returns non-zero when *page_table still equals @orig_pte, or when no
 * re-check is performed; the pte is unmapped in either case.
 */
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
                                pte_t *page_table, pte_t orig_pte)
{
        int unchanged = 1;

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
        /* Only a pte wider than a long is compared again under the lock. */
        if (sizeof(pte_t) > sizeof(unsigned long)) {
                spinlock_t *lock = pte_lockptr(mm, pmd);

                spin_lock(lock);
                unchanged = pte_same(*page_table, orig_pte);
                spin_unlock(lock);
        }
#endif
        pte_unmap(page_table);

        return unchanged;
}

/*
 * Fill @dst with the contents to be used for a copy-on-write fault.
 *
 * @dst: destination page
 * @src: source page, or NULL when there is no "struct page" for the
 *       source (PFN mapping)
 * @vmf: fault descriptor; vmf->vma, vmf->address, vmf->pmd and
 *       vmf->orig_pte are read, vmf->pte and vmf->ptl are written when
 *       the page table lock has to be taken
 *
 * With a source page the data is copied with copy_user_highpage().
 * Without one, the data is read back through the faulting user address;
 * if that read cannot be done the destination is zero-filled.
 *
 * Returns true when @dst has been filled, false when the pte was found to
 * differ from vmf->orig_pte under the page table lock.
 */
static inline bool cow_user_page(struct page *dst, struct page *src,
                                 struct vm_fault *vmf)
{
        bool ret;
        void *kaddr;
        void __user *uaddr;
        bool locked = false;    /* true once vmf->ptl is held */
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = vmf->address;

        /* Common case: there is a struct page to copy from. */
        if (likely(src)) {
                copy_user_highpage(dst, src, addr, vma);
                return true;
        }

        /*
         * If the source page was a PFN mapping, we don't have
         * a "struct page" for it. We do a best-effort copy by
         * just copying from the original user address. If that
         * fails, we just zero-fill it. Live with it.
         */
        kaddr = kmap_atomic(dst);
        uaddr = (void __user *)(addr & PAGE_MASK);

        /*
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
        if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
                pte_t entry;

                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                locked = true;
                if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                        /*
                         * Other thread has already handled the fault
                         * and update local tlb only
                         */
                        update_mmu_tlb(vma, addr, vmf->pte);
                        ret = false;
                        goto pte_unlock;
                }

                entry = pte_mkyoung(vmf->orig_pte);
                if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
                        update_mmu_cache(vma, addr, vmf->pte);
        }

        /*
         * This really shouldn't fail, because the page is there
         * in the page tables. But it might just be unreadable,
         * in which case we just give up and fill the result with
         * zeroes.
         */
        if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                /*
                 * The copy already ran with the PTL held and the pte
                 * verified, so there is nothing left to retry: warn and
                 * zero-fill.
                 */
                if (locked)
                        goto warn;

                /* Re-validate under PTL if the page is still mapped */
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                locked = true;
                if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                        /* The PTE changed under us, update local tlb */
                        update_mmu_tlb(vma, addr, vmf->pte);
                        ret = false;
                        goto pte_unlock;
                }

                /*
                 * The same page can be mapped back since last copy attempt.
                 * Try to copy again under PTL.
                 */
                if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                        /*
                         * Give a warn in case there can be some obscure
                         * use-case
                         */
warn:
                        WARN_ON_ONCE(1);
                        clear_page(kaddr);
                }
        }

        ret = true;

pte_unlock:
        /* Common exit: drop the PTL if taken, then undo the atomic kmap. */
        if (locked)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        kunmap_atomic(kaddr);
        flush_dcache_page(dst);

        return ret;
}

/*
 * Pick the gfp mask for handling a fault in @vma: the gfp mask of the
 * backing file's mapping with __GFP_FS and __GFP_IO added, or GFP_KERNEL
 * when the vma has no file.
 */
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
        /*
         * Special mappings (the VDSO for instance) have no file behind
         * them, so pretend they asked for a default GFP_KERNEL.
         */
        if (!vma->vm_file)
                return GFP_KERNEL;

        return mapping_gfp_mask(vma->vm_file->f_mapping) | __GFP_FS | __GFP_IO;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 *
 * vmf->flags is replaced by FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE only for
 * the duration of the ->page_mkwrite() call; on every return path the
 * caller sees its original flags.
 *
 * Returns VM_FAULT_SIGBUS for a vma backed by a swapfile, the
 * ->page_mkwrite() result when it carries VM_FAULT_ERROR or
 * VM_FAULT_NOPAGE bits, 0 when the page lost its mapping before it could
 * be locked (the caller is expected to retry), and otherwise the
 * ->page_mkwrite() result with VM_FAULT_LOCKED set and the page locked.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
        vm_fault_t ret;
        struct page *page = vmf->page;
        unsigned int old_flags = vmf->flags;

        /*
         * Reject swapfile-backed mappings before touching vmf->flags, so
         * that this early return does not leave the caller with
         * overwritten flags.
         */
        if (vmf->vma->vm_file &&
            IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
                return VM_FAULT_SIGBUS;

        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

        ret = vmf->vma->vm_ops->page_mkwrite(vmf);
        /* Restore original flags so that caller is not surprised */
        vmf->flags = old_flags;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
        if (unlikely(!(ret & VM_FAULT_LOCKED))) {
                /* The handler left the page unlocked: lock it here. */
                lock_page(page);
                if (!page->mapping) {
                        unlock_page(page);
                        return 0; /* retry */
                }
                ret |= VM_FAULT_LOCKED;
        } else
                VM_BUG_ON_PAGE(!PageLocked(page), page);
        return ret;
}

/*
 * Dirty a page of a shared file mapping in response to a write fault.
 *
 * The page must be locked on entry; it is unlocked here.
 *
 * Returns VM_FAULT_RETRY when mmap_lock was dropped around the dirty
 * throttling, 0 otherwise.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = vmf->page;
        bool has_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
        struct address_space *mapping;
        struct file *fpin;
        bool newly_dirty;

        newly_dirty = set_page_dirty(page);
        VM_BUG_ON_PAGE(PageAnon(page), page);

        /*
         * Keep a local copy of the address_space: once the page is
         * unlocked, truncate may zero page.mapping.  The address_space
         * itself stays pinned through the reference held by vma->vm_file.
         * unlock_page()'s release semantics are relied upon to stop the
         * compiler from undoing this copy.
         */
        mapping = page_rmapping(page);
        unlock_page(page);

        /* Without a ->page_mkwrite handler the file time is updated here. */
        if (!has_mkwrite)
                file_update_time(vma->vm_file);

        /*
         * Throttle the rate of page dirtying down to writeback speed.
         *
         * mapping can be NULL at this point, since some device drivers
         * dirty their pages without ever setting page.mapping.
         */
        if (!mapping || !(newly_dirty || has_mkwrite))
                return 0;

        /*
         * If possible, drop the mmap_lock before waiting on IO.  As noted
         * above, the file keeps the mapping pinned.
         */
        fpin = maybe_unlock_mmap_for_io(vmf, NULL);
        balance_dirty_pages_ratelimited(mapping);
        if (!fpin)
                return 0;

        fput(fpin);
        return VM_FAULT_RETRY;
}

/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 *
 * Entered with vmf->pte mapped and vmf->ptl held; both are released via
 * pte_unmap_unlock() before returning. vmf->page may be NULL.
 */
static inline void wp_page_reuse(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = vmf->page;
        pte_t entry;
        /*
         * Clear the pages cpupid information as the existing
         * information potentially belongs to a now completely
         * unrelated process.
         */
        if (page)
                page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);

        flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
        /* Build the new entry from the original pte: young, dirty, maybe writable. */
        entry = pte_mkyoung(vmf->orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        /* Only refresh the MMU cache when ptep_set_access_flags() reports a change. */
        if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
                update_mmu_cache(vma, vmf->address, vmf->pte);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        count_vm_event(PGREUSE);
}

/*
 * Handle the case of a page which we actually need to copy to a new page.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 *
 * vmf->page (the old page) may be NULL.
 *
 * Returns VM_FAULT_WRITE when the new page was installed, 0 when the pte
 * changed under us or cow_user_page() failed, and VM_FAULT_OOM when
 * anon_vma_prepare(), the page allocation or the memcg charge failed.
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct page *old_page = vmf->page;
        struct page *new_page = NULL;
        pte_t entry;
        int page_copied = 0;
        struct mmu_notifier_range range;

        if (unlikely(anon_vma_prepare(vma)))
                goto oom;

        /* The zero page needs no copy: a freshly zeroed page is enough. */
        if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
                new_page = alloc_zeroed_user_highpage_movable(vma,
                                                              vmf->address);
                if (!new_page)
                        goto oom;
        } else {
                new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
                                vmf->address);
                if (!new_page)
                        goto oom;

                if (!cow_user_page(new_page, old_page, vmf)) {
                        /*
                         * COW failed, if the fault was solved by other,
                         * it's fine. If not, userspace would re-fault on
                         * the same address and we will handle the fault
                         * from the second attempt.
                         */
                        put_page(new_page);
                        if (old_page)
                                put_page(old_page);
                        return 0;
                }
        }

        if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
        cgroup_throttle_swaprate(new_page, GFP_KERNEL);

        __SetPageUptodate(new_page);

        /* The notifier range covers exactly the page containing the fault address. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Re-check the pte - we dropped the lock
         */
        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
        if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                /*
                 * Counter accounting: a non-anon old page moves from its
                 * file counter to MM_ANONPAGES; with no old page at all the
                 * anon counter is simply incremented.
                 */
                if (old_page) {
                        if (!PageAnon(old_page)) {
                                dec_mm_counter_fast(mm,
                                                mm_counter_file(old_page));
                                inc_mm_counter_fast(mm, MM_ANONPAGES);
                        }
                } else {
                        inc_mm_counter_fast(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
                entry = pte_sw_mkyoung(entry);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry. This will avoid a race condition
                 * seen in the presence of one thread doing SMC and another
                 * thread doing COW.
                 */
                ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
                page_add_new_anon_rmap(new_page, vma, vmf->address, false);
                lru_cache_add_inactive_or_unevictable(new_page, vma);
                /*
                 * We call the notify macro here because, when using secondary
                 * mmu page tables (such as kvm shadow page tables), we want the
                 * new page to be mapped directly into the secondary page table.
                 */
                set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
                update_mmu_cache(vma, vmf->address, vmf->pte);
                if (old_page) {
                        /*
                         * Only after switching the pte to the new page may
                         * we remove the mapcount here. Otherwise another
                         * process may come and find the rmap count decremented
                         * before the pte is switched to the new page, and
                         * "reuse" the old page writing into it while our pte
                         * here still points into it and can be read by other
                         * threads.
                         *
                         * The critical issue is to order this
                         * page_remove_rmap with the ptep_clear_flush above.
                         * Those stores are ordered by (if nothing else,)
                         * the barrier present in the atomic_add_negative
                         * in page_remove_rmap.
                         *
                         * Then the TLB flush in ptep_clear_flush ensures that
                         * no process can access the old page before the
                         * decremented mapcount is visible. And the old page
                         * cannot be reused until after the decremented
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
                        page_remove_rmap(old_page, false);
                }

                /*
                 * Free the old page: new_page is repointed at old_page so
                 * that the put_page(new_page) below drops a reference on the
                 * old page instead of on the page we just installed.
                 */
                new_page = old_page;
                page_copied = 1;
        } else {
                /* The pte changed while unlocked: the new page goes unused. */
                update_mmu_tlb(vma, vmf->address, vmf->pte);
        }

        if (new_page)
                put_page(new_page);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        /*
         * No need to double call mmu_notifier->invalidate_range() callback as
         * the above ptep_clear_flush_notify() did already call it.
         */
        mmu_notifier_invalidate_range_only_end(&range);
        if (old_page) {
                /*
                 * Don't let another task, with possibly unlocked vma,
                 * keep the mlocked page.
                 */
                if (page_copied && (vma->vm_flags & VM_LOCKED)) {
                        lock_page(old_page);        /* LRU manipulation */
                        if (PageMlocked(old_page))
                                munlock_vma_page(old_page);
                        unlock_page(old_page);
                }
                put_page(old_page);
        }
        return page_copied ? VM_FAULT_WRITE : 0;
        /* Error exits: release the new page (if any), then the old page reference. */
oom_free_new:
        put_page(new_page);
oom:
        if (old_page)
                put_page(old_page);
        return VM_FAULT_OOM;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *                          writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
 * we acquired PTE lock.
 */
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
        WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
                                       &vmf->ptl);
        /*
         * We might have raced with another page fault while we released the
         * pte_offset_map_lock.
         */
        if (!pte_same(*vmf->pte, vmf->orig_pte)) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return VM_FAULT_NOPAGE;
        }
        wp_page_reuse(vmf);
        return 0;
}

/*
 * Write fault on a VM_MIXEDMAP or VM_PFNMAP pte inside a VM_SHARED mapping.
 *
 * Without a ->pfn_mkwrite() handler the pte is simply made writable and
 * VM_FAULT_WRITE is returned. With one, the pte lock is dropped, the handler
 * is called with FAULT_FLAG_MKWRITE set, and unless it reports an error or
 * VM_FAULT_NOPAGE the fault is completed by finish_mkwrite_fault().
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t mkwrite_ret;

        if (!vma->vm_ops || !vma->vm_ops->pfn_mkwrite) {
                wp_page_reuse(vmf);
                return VM_FAULT_WRITE;
        }

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        vmf->flags |= FAULT_FLAG_MKWRITE;
        mkwrite_ret = vma->vm_ops->pfn_mkwrite(vmf);
        if (mkwrite_ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
                return mkwrite_ret;

        return finish_mkwrite_fault(vmf);
}

/*
 * Handle a write fault on vmf->page in a shared mapping.
 *
 * Entered with the pte mapped and vmf->ptl held; the lock is released on
 * every path. A reference on vmf->page is taken for the duration of the call
 * and dropped before returning.
 *
 * Returns the result of do_page_mkwrite() or finish_mkwrite_fault() when
 * either of them bails out, otherwise VM_FAULT_WRITE combined with the
 * result of fault_dirty_shared_page().
 */
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = VM_FAULT_WRITE;

        get_page(vmf->page);

        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                vm_fault_t tmp;

                /* The handler is called without the pte lock held. */
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                tmp = do_page_mkwrite(vmf);
                /* A zero result also bails out; the page is not locked then. */
                if (unlikely(!tmp || (tmp &
                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        put_page(vmf->page);
                        return tmp;
                }
                tmp = finish_mkwrite_fault(vmf);
                if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                        /* The page is still locked from do_page_mkwrite(). */
                        unlock_page(vmf->page);
                        put_page(vmf->page);
                        return tmp;
                }
        } else {
                wp_page_reuse(vmf);
                /* fault_dirty_shared_page() expects a locked page. */
                lock_page(vmf->page);
        }
        ret |= fault_dirty_shared_page(vmf);
        put_page(vmf->page);

        return ret;
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 *
 * The return value is that of whichever handler the fault is dispatched to
 * (handle_userfault(), wp_pfn_shared(), wp_page_shared() or wp_page_copy()),
 * or VM_FAULT_WRITE when an anonymous page is reused in place.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;

        /* A userfaultfd write-protected pte is handed to handle_userfault(). */
        if (userfaultfd_pte_wp(vma, *vmf->pte)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return handle_userfault(vmf, VM_UFFD_WP);
        }

        /*
         * Userfaultfd write-protect can defer flushes. Ensure the TLB
         * is flushed in this case before copying.
         */
        if (unlikely(userfaultfd_wp(vmf->vma) &&
                     mm_tlb_flush_pending(vmf->vma->vm_mm)))
                flush_tlb_page(vmf->vma, vmf->address);

        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
        if (!vmf->page) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
                 * VM_PFNMAP VMA.
                 *
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
                if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                     (VM_WRITE|VM_SHARED))
                        return wp_pfn_shared(vmf);

                /* No struct page to reference: copy with vmf->page == NULL. */
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return wp_page_copy(vmf);
        }

        /*
         * Take out anonymous pages first, anonymous shared vmas are
         * not dirty accountable.
         */
        if (PageAnon(vmf->page)) {
                struct page *page = vmf->page;

                /* PageKsm() doesn't necessarily raise the page refcount */
                if (PageKsm(page) || page_count(page) != 1)
                        goto copy;
                if (!trylock_page(page))
                        goto copy;
                /* Re-check now that the page lock is held. */
                if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
                        unlock_page(page);
                        goto copy;
                }
                /*
                 * Ok, we've got the only map reference, and the only
                 * page count reference, and the page is locked,
                 * it's dark out, and we're wearing sunglasses. Hit it.
                 */
                unlock_page(page);
                wp_page_reuse(vmf);
                return VM_FAULT_WRITE;
        } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                        (VM_WRITE|VM_SHARED))) {
                return wp_page_shared(vmf);
        }
copy:
        /*
         * Ok, we need to copy. Oh, well..
         *
         * Take the reference on the old page that wp_page_copy() expects,
         * then drop the pte lock before calling it.
         */
        get_page(vmf->page);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return wp_page_copy(vmf);
}

/*
 * Zap the address range [start_addr, end_addr) of a single vma by converting
 * it to the (start, size) form zap_page_range_single() takes.
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
                unsigned long start_addr, unsigned long end_addr,
                struct zap_details *details)
{
        unsigned long size = end_addr - start_addr;

        zap_page_range_single(vma, start_addr, size, details);
}

/*
 * Walk every vma in @root that overlaps the page-index range
 * [details->first_index, details->last_index], clamp that range to the part
 * the vma actually maps, and unmap the corresponding virtual addresses.
 */
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
                                            struct zap_details *details)
{
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, root,
                        details->first_index, details->last_index) {
                /* First and last file page index covered by this vma. */
                pgoff_t vma_first = vma->vm_pgoff;
                pgoff_t vma_last = vma_first + vma_pages(vma) - 1;
                /* Requested range, clamped to what the vma covers. */
                pgoff_t zap_first = details->first_index;
                pgoff_t zap_last = details->last_index;
                unsigned long start, end;

                zap_first = zap_first < vma_first ? vma_first : zap_first;
                zap_last = zap_last > vma_last ? vma_last : zap_last;

                /* Translate page indices into addresses inside the vma. */
                start = vma->vm_start +
                        ((zap_first - vma_first) << PAGE_SHIFT);
                end = vma->vm_start +
                        ((zap_last - vma_first + 1) << PAGE_SHIFT);

                unmap_mapping_range_vma(vma, start, end, details);
        }
}

/**
 * unmap_mapping_page() - Unmap single page from processes.
 * @page: The locked page to be unmapped.
 *
 * Unmap this page from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
 * truncation or invalidation holds the lock on a page, it may find that
 * the page has been remapped again: and then uses unmap_mapping_page()
 * to unmap it finally.
 */
void unmap_mapping_page(struct page *page)
{
        struct address_space *mapping = page->mapping;
        struct zap_details details = { };

        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(PageTail(page));

        details.check_mapping = mapping;
        details.first_index = page->index;
        details.last_index = page->index + thp_nr_pages(page) - 1;
        details.single_page = page;

        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
        i_mmap_unlock_write(mapping);
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Remove the given pages of @mapping from every userspace process that has
 * them mmaped.  COWed pages should generally go too when a file is being
 * truncated, but must be kept when pages are merely invalidated from the
 * page cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
                pgoff_t nr, bool even_cows)
{
        struct zap_details details = { };
        pgoff_t last = start + nr - 1;

        /* nr == 0, or start + nr wrapping around, means "up to the end". */
        if (last < start)
                last = ULONG_MAX;

        /* A NULL check_mapping is used when private COWed pages go as well. */
        if (even_cows)
                details.check_mapping = NULL;
        else
                details.check_mapping = mapping;
        details.first_index = start;
        details.last_index = last;

        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
        i_mmap_unlock_write(mapping);
}

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows)
{
        /*
         * Convert bytes to page indices in an unsigned 64-bit type.  The
         * shift must be unsigned so that a set top bit is not sign-extended
         * into the index, and it must happen before narrowing to pgoff_t:
         * where pgoff_t is narrower than loff_t, casting first would drop
         * the offset bits above the pgoff_t width and yield the wrong index.
         */
        pgoff_t hba = (unsigned long long)holebegin >> PAGE_SHIFT;
        pgoff_t hlen = ((unsigned long long)holelen + PAGE_SIZE - 1) >>
                        PAGE_SHIFT;

        /* Check for overflow. */
        if (sizeof(holelen) > sizeof(hlen)) {
                long long holeend =
                        (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
                /* End index does not fit in pgoff_t: unmap up to the maximum. */
                if (holeend & ~(long long)ULONG_MAX)
                        hlen = ULONG_MAX - hba + 1;
        }

        unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * Handle a fault on a pte that holds a swap entry (or another non-present
 * special entry encoded the same way).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 *
 * The return value is a set of VM_FAULT_* bits: 0 when nothing further is
 * needed, VM_FAULT_MAJOR when the page had to be read in, VM_FAULT_RETRY
 * when the page lock could not be taken, and VM_FAULT_OOM, VM_FAULT_SIGBUS
 * or VM_FAULT_HWPOISON on the corresponding failures.
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        /*
         * 'swapcache' remembers the page that came from the swap cache or
         * readahead; it stays NULL on the path that skips the swap cache.
         * 'page' may later be replaced by a copy from
         * ksm_might_need_to_copy().
         */
        struct page *page = NULL, *swapcache;
        swp_entry_t entry;
        pte_t pte;
        int locked;
        int exclusive = 0;
        vm_fault_t ret = 0;
        void *shadow = NULL;

        if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
                goto out;

        entry = pte_to_swp_entry(vmf->orig_pte);
        /* Entries that are not real swap entries are dispatched by kind. */
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
                } else if (is_device_private_entry(entry)) {
                        vmf->page = device_private_entry_to_page(entry);
                        ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else {
                        print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
                        ret = VM_FAULT_SIGBUS;
                }
                goto out;
        }


        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
        page = lookup_swap_cache(entry, vma, vmf->address);
        swapcache = page;

        if (!page) {
                struct swap_info_struct *si = swp_swap_info(entry);

                if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
                    __swap_count(entry) == 1) {
                        /* skip swapcache */
                        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
                                                        vmf->address);
                        if (page) {
                                int err;

                                __SetPageLocked(page);
                                __SetPageSwapBacked(page);
                                set_page_private(page, entry.val);

                                /* Tell memcg to use swap ownership records */
                                SetPageSwapCache(page);
                                err = mem_cgroup_charge(page, vma->vm_mm,
                                                        GFP_KERNEL);
                                ClearPageSwapCache(page);
                                if (err) {
                                        ret = VM_FAULT_OOM;
                                        goto out_page;
                                }

                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
                                        workingset_refault(page, shadow);

                                lru_cache_add(page);
                                swap_readpage(page, true);
                        }
                } else {
                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                vmf);
                        swapcache = page;
                }

                if (!page) {
                        /*
                         * Back out if somebody else faulted in this pte
                         * while we released the pte lock.
                         */
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
                        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                        goto unlock;
                }

                /* Had to read the page from swap area: Major fault */
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
        } else if (PageHWPoison(page)) {
                /*
                 * hwpoisoned dirty swapcache pages are kept for killing
                 * owner processes (which may be unknown at hwpoison time)
                 */
                ret = VM_FAULT_HWPOISON;
                delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                goto out_release;
        }

        locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);

        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
        if (!locked) {
                ret |= VM_FAULT_RETRY;
                goto out_release;
        }

        /*
         * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
         * release the swapcache from under us.  The page pin, and pte_same
         * test below, are not enough to exclude that.  Even if it is still
         * swapcache, we need to check that the page's swap has not changed.
         */
        if (unlikely((!PageSwapCache(page) ||
                        page_private(page) != entry.val)) && swapcache)
                goto out_page;

        page = ksm_might_need_to_copy(page, vma, vmf->address);
        if (unlikely(!page)) {
                ret = VM_FAULT_OOM;
                /* Restore 'page' so the cleanup path releases the original. */
                page = swapcache;
                goto out_page;
        }

        cgroup_throttle_swaprate(page, GFP_KERNEL);

        /*
         * Back out if somebody else already faulted in this pte.
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
                goto out_nomap;

        if (unlikely(!PageUptodate(page))) {
                ret = VM_FAULT_SIGBUS;
                goto out_nomap;
        }

        /*
         * The page isn't present yet, go ahead with the fault.
         *
         * Be careful about the sequence of operations here.
         * To get its accounting right, reuse_swap_page() must be called
         * while the page is counted on swap but not yet in mapcount i.e.
         * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
         * must be called after the swap_free(), or it will never succeed.
         */

        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
        pte = mk_pte(page, vma->vm_page_prot);
        /*
         * On a write fault with a reusable page the pte is made dirty and
         * (maybe) writable right away, and FAULT_FLAG_WRITE is cleared so
         * that the do_wp_page() call further down is skipped.
         */
        if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                vmf->flags &= ~FAULT_FLAG_WRITE;
                ret |= VM_FAULT_WRITE;
                exclusive = RMAP_EXCLUSIVE;
        }
        flush_icache_page(vma, page);
        /* Carry the soft-dirty and uffd-wp bits over from the swap pte. */
        if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
        if (pte_swp_uffd_wp(vmf->orig_pte)) {
                pte = pte_mkuffd_wp(pte);
                pte = pte_wrprotect(pte);
        }
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
        vmf->orig_pte = pte;

        /* ksm created a completely new copy */
        if (unlikely(page != swapcache && swapcache)) {
                page_add_new_anon_rmap(page, vma, vmf->address, false);
                lru_cache_add_inactive_or_unevictable(page, vma);
        } else {
                do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
        }

        swap_free(entry);
        if (mem_cgroup_swap_full(page) ||
            (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
                try_to_free_swap(page);
        unlock_page(page);
        if (page != swapcache && swapcache) {
                /*
                 * Hold the lock to avoid the swap entry to be reused
                 * until we take the PT lock for the pte_same() check
                 * (to avoid false positives from pte_same). For
                 * further safety release the lock after the swap_free
                 * so that the swap count won't change under a
                 * parallel locked swapcache.
                 */
                unlock_page(swapcache);
                put_page(swapcache);
        }

        /* Still a write fault: let do_wp_page() finish it; it drops the pte lock. */
        if (vmf->flags & FAULT_FLAG_WRITE) {
                ret |= do_wp_page(vmf);
                /* On error, report only the error bits. */
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
        }

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
        return ret;
        /*
         * Failure exits, entered at the label matching what is currently
         * held: pte lock, then page lock, then page reference.  The original
         * swapcache page is released as well when 'page' is a separate copy.
         */
out_nomap:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
        unlock_page(page);
out_release:
        put_page(page);
        if (page != swapcache && swapcache) {
                unlock_page(swapcache);
                put_page(swapcache);
        }
        return ret;
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page;
        vm_fault_t ret = 0;
        pte_t entry;

        /*
         * Populate a so-far empty pte of an anonymous vma: read faults get
         * the zero page (unless the mm forbids it), every other fault gets a
         * freshly zeroed, memcg-charged private page.
         *
         * Returns 0 when the pte was installed or when the fault merely
         * needs to be retried, VM_FAULT_SIGBUS for a shared vma,
         * VM_FAULT_OOM when an allocation or the memcg charge fails, or the
         * value reported by check_stable_address_space() /
         * handle_userfault().
         */

        /* File mapping without ->vm_ops ? */
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;

        /*
         * Use pte_alloc() instead of pte_alloc_map().  We can't run
         * pte_offset_map() on pmds where a huge pmd might be created
         * from a different thread.
         *
         * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
         * parallel threads are excluded by other means.
         *
         * Here we only have mmap_read_lock(mm).
         */
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;

        /* See the comment in pte_alloc_one_map() */
        if (unlikely(pmd_trans_unstable(vmf->pmd)))
                return 0;

        /* Use the zero-page for reads */
        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm)) {
                entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
                /* Somebody else already filled the pte: nothing to install. */
                if (!pte_none(*vmf->pte)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
                }
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock;
                /* Deliver the page fault to userland, check inside PT lock */
                if (userfaultfd_missing(vma)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return handle_userfault(vmf, VM_UFFD_MISSING);
                }
                /* No page to account or add to rmap for the zero page. */
                goto setpte;
        }

        /* Allocate our own private page. */
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
        page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
        if (!page)
                goto oom;

        if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
                goto oom_free_page;
        cgroup_throttle_swaprate(page, GFP_KERNEL);

        /*
         * The memory barrier inside __SetPageUptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __SetPageUptodate(page);

        entry = mk_pte(page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        if (!pte_none(*vmf->pte)) {
                /*
                 * Lost the race: the pte was populated before we took the
                 * page table lock.  We install nothing ourselves, so only
                 * refresh the local TLB, exactly like the zero-page path
                 * above and alloc_set_pte() do in the same situation.
                 */
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                goto release;
        }

        ret = check_stable_address_space(vma->vm_mm);
        if (ret)
                goto release;

        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                put_page(page);
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }

        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, vma, vmf->address, false);
        lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
release:
        /* Drop the page we allocated but did not map. */
        put_page(page);
        goto unlock;
oom_free_page:
        put_page(page);
oom:
        return VM_FAULT_OOM;
}

/*
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        /*
         * Preallocate pte before we take page_lock because this might lead to
         * deadlocks for memcg reclaim which waits for pages under writeback:
         *                                lock_page(A)
         *                                SetPageWriteback(A)
         *                                unlock_page(A)
         * lock_page(B)
         *                                lock_page(B)
         * pte_alloc_one
         *   shrink_page_list
         *     wait_on_page_writeback(A)
         *                                SetPageWriteback(B)
         *                                unlock_page(B)
         *                                # flush A, B to clear the writeback
         */
        if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
                smp_wmb(); /* See comment in __pte_alloc() */
        }

        /* Let the vma's ->fault() handler produce the page (vmf->page). */
        ret = vma->vm_ops->fault(vmf);
        /*
         * On error, NOPAGE, RETRY or DONE_COW there is nothing further to
         * do here: hand the result straight back to the caller.
         */
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
                            VM_FAULT_DONE_COW)))
                return ret;

        /*
         * Never hand out a hardware-poisoned page.  If the handler returned
         * it locked, unmap it and try to drop it from the page cache first;
         * in every case drop the reference and clear vmf->page.
         */
        if (unlikely(PageHWPoison(vmf->page))) {
                struct page *page = vmf->page;
                vm_fault_t poisonret = VM_FAULT_HWPOISON;
                if (ret & VM_FAULT_LOCKED) {
                        if (page_mapped(page))
                                unmap_mapping_pages(page_mapping(page),
                                                    page->index, 1, false);
                        /* Retry if a clean page was removed from the cache. */
                        if (invalidate_inode_page(page))
                                poisonret = VM_FAULT_NOPAGE;
                        unlock_page(page);
                }
                put_page(page);
                vmf->page = NULL;
                return poisonret;
        }

        /*
         * Callers get the page locked on success: take the lock ourselves
         * if the handler did not, otherwise sanity-check its claim.
         */
        if (unlikely(!(ret & VM_FAULT_LOCKED)))
                lock_page(vmf->page);
        else
                VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);

        return ret;
}

/*
 * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
 * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
 * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
 * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
 */
static int pmd_devmap_trans_unstable(pmd_t *pmd)
{
        /* The devmap test has to come first, see the ordering note above. */
        if (pmd_devmap(*pmd))
                return 1;

        return pmd_trans_unstable(pmd) != 0;
}

/*
 * Make sure vmf->pmd points to a pte page table (using vmf->prealloc_pte if
 * one is available), then map and lock the pte for vmf->address.
 *
 * Returns 0 with vmf->pte mapped and vmf->ptl held, VM_FAULT_OOM if a page
 * table could not be allocated, or VM_FAULT_NOPAGE if the pmd is not a
 * regular one that can be walked (nothing is mapped or locked then).
 */
static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        /* A pmd entry already exists: skip the allocation step. */
        if (!pmd_none(*vmf->pmd))
                goto map_pte;
        if (vmf->prealloc_pte) {
                vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
                /* Re-check under the pmd lock: somebody may have beaten us. */
                if (unlikely(!pmd_none(*vmf->pmd))) {
                        spin_unlock(vmf->ptl);
                        goto map_pte;
                }

                /* Install the preallocated table and account for it. */
                mm_inc_nr_ptes(vma->vm_mm);
                pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
                spin_unlock(vmf->ptl);
                /* Consumed: the caller must not free it any more. */
                vmf->prealloc_pte = NULL;
        } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
                return VM_FAULT_OOM;
        }
map_pte:
        /*
         * If a huge pmd materialized under us just retry later.  Use
         * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
         * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
         * under us and then back to pmd_none, as a result of MADV_DONTNEED
         * running immediately after a huge pmd fault in a different thread of
         * this mm, in turn leading to a misleading pmd_trans_huge() retval.
         * All we have to ensure is that it is a regular pmd that we can walk
         * with pte_offset_map() and we can do that through an atomic read in
         * C, which is what pmd_trans_unstable() provides.
         */
        if (pmd_devmap_trans_unstable(vmf->pmd))
                return VM_FAULT_NOPAGE;

        /*
         * At this point we know that our vmf->pmd points to a page of ptes
         * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
         * for the duration of the fault.  If a racing MADV_DONTNEED runs and
         * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
         * be valid and we will re-check to make sure the vmf->pte isn't
         * pte_none() under vmf->ptl protection when we return to
         * alloc_set_pte().
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        /* Deposit the preallocated page table for vmf->pmd. */
        pgtable_trans_huge_deposit(mm, vmf->pmd, vmf->prealloc_pte);

        /*
         * The preallocated table is consumed by the deposit, so it now
         * counts towards nr_ptes and must not be freed by the caller.
         */
        mm_inc_nr_ptes(mm);
        vmf->prealloc_pte = NULL;
}

/*
 * Try to map the compound page containing @page with a single huge pmd.
 *
 * Returns 0 when the huge pmd was installed, VM_FAULT_FALLBACK when a huge
 * mapping is not possible here (unsuitable vma/address, wrong page order, or
 * the pmd got populated meanwhile), or VM_FAULT_OOM when the deposit page
 * table could not be allocated.
 */
static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t entry;
        int i;
        vm_fault_t ret = VM_FAULT_FALLBACK;

        if (!transhuge_vma_suitable(vma, haddr))
                return ret;

        /* Only a compound page of exactly PMD order can be pmd-mapped. */
        page = compound_head(page);
        if (compound_order(page) != HPAGE_PMD_ORDER)
                return ret;

        /*
         * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
                smp_wmb(); /* See comment in __pte_alloc() */
        }

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        /* Re-check under the pmd lock; fall back if the pmd is now in use. */
        if (unlikely(!pmd_none(*vmf->pmd)))
                goto out;

        for (i = 0; i < HPAGE_PMD_NR; i++)
                flush_icache_page(vma, page + i);

        entry = mk_huge_pmd(page, vma->vm_page_prot);
        if (write)
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

        /* Account all HPAGE_PMD_NR subpages and add the compound rmap. */
        add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
        page_add_file_rmap(page, true);
        /*
         * deposit and withdraw with pmd lock held
         */
        if (arch_needs_pgtable_deposit())
                deposit_prealloc_pte(vmf);

        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

        update_mmu_cache_pmd(vma, haddr, vmf->pmd);

        /* fault is handled */
        ret = 0;
        count_vm_event(THP_FILE_MAPPED);
out:
        spin_unlock(vmf->ptl);
        return ret;
}
#else
/*
 * Stub used when CONFIG_TRANSPARENT_HUGEPAGE is not set.  It is not meant to
 * be reachable: BUILD_BUG() is there to catch any call that survives
 * compilation.
 */
static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        BUILD_BUG();
        return 0;
}
#endif

/**
 * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping. If needed, the function allocates page table or use pre-allocated.
 *
 * @vmf: fault environment
 * @page: page to map
 *
 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
 * return.
 *
 * Target users are page handler itself and implementations of
 * vm_ops->map_pages.
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
{
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        pte_t entry;
        vm_fault_t ret;

        /*
         * Empty pmd and a compound page: try a huge pmd mapping first and
         * only continue with a pte mapping if that asks us to fall back.
         */
        if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
                ret = do_set_pmd(vmf, page);
                if (ret != VM_FAULT_FALLBACK)
                        return ret;
        }

        /* No pte mapped yet: allocate/map the page table and take the ptl. */
        if (!vmf->pte) {
                ret = pte_alloc_one_map(vmf);
                if (ret)
                        return ret;
        }

        /* Re-check under ptl */
        if (unlikely(!pte_none(*vmf->pte))) {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                return VM_FAULT_NOPAGE;
        }

        flush_icache_page(vma, page);
        entry = mk_pte(page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (write)
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
                /* Private write fault: the page is a new anonymous page. */
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
                page_add_new_anon_rmap(page, vma, vmf->address, false);
                lru_cache_add_inactive_or_unevictable(page, vma);
        } else {
                /* Otherwise account and map it as a file page. */
                inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
                page_add_file_rmap(page, false);
        }
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

        /* no need to invalidate: a not-present page won't be cached */
        update_mmu_cache(vma, vmf->address, vmf->pte);

        return 0;
}


/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
        struct page *page;
        vm_fault_t ret = 0;

        /* Did we COW the page? */
        if ((vmf->flags & FAULT_FLAG_WRITE) &&
            !(vmf->vma->vm_flags & VM_SHARED))
                page = vmf->cow_page;
        else
                page = vmf->page;

        /*
         * check even for read faults because we might have lost our CoWed
         * page
         */
        if (!(vmf->vma->vm_flags & VM_SHARED))
                ret = check_stable_address_space(vmf->vma->vm_mm);
        if (!ret)
                ret = alloc_set_pte(vmf, page);
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
}

/*
 * Size in bytes of the window do_fault_around() tries to map around a
 * faulting address.  Starts out as 65536 rounded down to a power of two and
 * can be changed through debugfs when CONFIG_DEBUG_FS is enabled.
 */
static unsigned long fault_around_bytes __read_mostly =
        rounddown_pow_of_two(65536);

#ifdef CONFIG_DEBUG_FS
/*
 * debugfs getter: store the current fault_around_bytes in *val.
 * @data is unused.  Always returns 0.
 */
static int fault_around_bytes_get(void *data, u64 *val)
{
        *val = fault_around_bytes;
        return 0;
}

/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
        /* Floor of one page; rounddown_pow_of_two(0) is undefined. */
        unsigned long bytes = PAGE_SIZE;

        /* Reject windows spanning more pages than one page table holds. */
        if (val / PAGE_SIZE > PTRS_PER_PTE)
                return -EINVAL;

        if (val > PAGE_SIZE)
                bytes = rounddown_pow_of_two(val);

        fault_around_bytes = bytes;
        return 0;
}
/*
 * debugfs attribute for fault_around_bytes, wired to
 * fault_around_bytes_get()/fault_around_bytes_set() and formatted as an
 * unsigned decimal followed by a newline.
 */
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
                fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

/*
 * Create the "fault_around_bytes" debugfs file (mode 0644) backed by
 * fault_around_bytes_fops.  Always returns 0; runs as a late initcall.
 */
static int __init fault_around_debugfs(void)
{
        debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
                                   &fault_around_bytes_fops);
        return 0;
}
late_initcall(fault_around_debugfs);
#endif

/*
 * do_fault_around() tries to map few pages around the fault address. The hope
 * is that the pages will be needed soon and this will lower the number of
 * faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function is called with the page table lock taken. In the split ptlock
 * case the page table lock protects only those entries which belong to
 * the page table corresponding to the fault address.
 *
 * This function doesn't cross the VMA boundaries, in order to call map_pages()
 * only once.
 *
 * fault_around_bytes defines how many bytes we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_bytes rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
        unsigned long address = vmf->address, nr_pages, mask;
        pgoff_t start_pgoff = vmf->pgoff;
        pgoff_t end_pgoff;
        int off;
        vm_fault_t ret = 0;

        /* Window size in pages, and the mask aligning an address to it. */
        nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
        mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

        /*
         * Start of the window: the faulting address aligned down to the
         * window size, but never before the start of the vma.  vmf->address
         * is temporarily redirected there and restored at "out".
         */
        vmf->address = max(address & mask, vmf->vma->vm_start);
        /* Pages between the window start and the faulting address. */
        off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
        start_pgoff -= off;

        /*
         *  end_pgoff is either the end of the page table, the end of
         *  the vma or nr_pages from start_pgoff, depending what is nearest.
         */
        end_pgoff = start_pgoff -
                ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
                PTRS_PER_PTE - 1;
        end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
                        start_pgoff + nr_pages - 1);

        /*
         * No page table yet: preallocate one for ->map_pages() to use.
         * Failing to allocate is not an error here, we just give up on
         * faulting around (ret stays 0).
         */
        if (pmd_none(*vmf->pmd)) {
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
                if (!vmf->prealloc_pte)
                        goto out;
                smp_wmb(); /* See comment in __pte_alloc() */
        }

        vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);

        /* Huge page is mapped? Page fault is solved */
        if (pmd_trans_huge(*vmf->pmd)) {
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /* ->map_pages() haven't done anything useful. Cold page cache? */
        if (!vmf->pte)
                goto out;

        /*
         * check if the page fault is solved
         *
         * Move vmf->pte from the entry for the current vmf->address to the
         * entry for the original faulting address.
         * NOTE(review): this relies on ->map_pages() keeping vmf->pte and
         * vmf->address in step and returning with vmf->ptl held - confirm
         * against the map_pages implementations.
         */
        vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
        if (!pte_none(*vmf->pte))
                ret = VM_FAULT_NOPAGE;
        pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
        /* Put back the real faulting address and forget the pte pointer. */
        vmf->address = address;
        vmf->pte = NULL;
        return ret;
}

static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
        /* Result bits that mean "the page was not mapped by us". */
        const vm_fault_t stop = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                VM_FAULT_RETRY;
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        /*
         * Try ->map_pages() first when faulting around is enabled for more
         * than a single page; ->fault() below is the fallback for when the
         * page at this offset is not ready to be mapped (cold cache or
         * something).
         */
        if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
                ret = do_fault_around(vmf);
                if (ret)
                        return ret;
        }

        ret = __do_fault(vmf);
        if (unlikely(ret & stop))
                return ret;

        ret |= finish_fault(vmf);
        unlock_page(vmf->page);

        /* Not mapped after all: drop the reference __do_fault() gave us. */
        if (unlikely(ret & stop))
                put_page(vmf->page);

        return ret;
}

static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
        /* Result bits that mean "the COW page was not mapped". */
        const vm_fault_t stop = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                VM_FAULT_RETRY;
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;

        /* Allocate and charge the private copy before touching the file. */
        vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
        if (!vmf->cow_page)
                return VM_FAULT_OOM;

        if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
                ret = VM_FAULT_OOM;
                goto uncharge_out;
        }
        cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);

        ret = __do_fault(vmf);
        if (unlikely(ret & stop))
                goto uncharge_out;
        /* The ->fault() handler completed the COW itself. */
        if (ret & VM_FAULT_DONE_COW)
                return ret;

        /* Fill the private copy from the page ->fault() handed us. */
        copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
        __SetPageUptodate(vmf->cow_page);

        ret |= finish_fault(vmf);

        /* The source page is no longer needed once the copy is in place. */
        unlock_page(vmf->page);
        put_page(vmf->page);
        if (likely(!(ret & stop)))
                return ret;

uncharge_out:
        put_page(vmf->cow_page);
        return ret;
}

static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
        /* Result bits that mean "the page was not mapped by us". */
        const vm_fault_t stop = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                VM_FAULT_RETRY;
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret, mkwrite_ret;

        ret = __do_fault(vmf);
        if (unlikely(ret & stop))
                return ret;

        /*
         * Give the backing address space a chance to react to the page
         * becoming writable.  The page lock is dropped around the call.
         */
        if (vma->vm_ops->page_mkwrite) {
                unlock_page(vmf->page);
                mkwrite_ret = do_page_mkwrite(vmf);
                /* A zero return is treated as "do not map" as well. */
                if (unlikely(!mkwrite_ret ||
                             (mkwrite_ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        put_page(vmf->page);
                        return mkwrite_ret;
                }
        }

        ret |= finish_fault(vmf);
        if (likely(!(ret & stop)))
                return ret | fault_dirty_shared_page(vmf);

        /* Mapping failed: release the page ourselves. */
        unlock_page(vmf->page);
        put_page(vmf->page);
        return ret;
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        /* Cached up front: vma may be invalid once mmap_lock is released. */
        struct mm_struct *vm_mm = vma->vm_mm;
        vm_fault_t ret;

        /*
         * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
         */
        if (!vma->vm_ops->fault) {
                /*
                 * If we find a migration pmd entry or a none pmd entry, which
                 * should never happen, return SIGBUS
                 */
                if (unlikely(!pmd_present(*vmf->pmd)))
                        ret = VM_FAULT_SIGBUS;
                else {
                        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
                                                       vmf->pmd,
                                                       vmf->address,
                                                       &vmf->ptl);
                        /*
                         * Make sure this is not a temporary clearing of pte
                         * by holding ptl and checking again. A R/M/W update
                         * of pte involves: take ptl, clearing the pte so that
                         * we don't have concurrent modification by hardware
                         * followed by an update.
                         */
                        if (unlikely(pte_none(*vmf->pte)))
                                ret = VM_FAULT_SIGBUS;
                        else
                                ret = VM_FAULT_NOPAGE;

                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                }
        } else if (!(vmf->flags & FAULT_FLAG_WRITE))
                /* Read fault. */
                ret = do_read_fault(vmf);
        else if (!(vma->vm_flags & VM_SHARED))
                /* Write fault on a private mapping: copy-on-write. */
                ret = do_cow_fault(vmf);
        else
                /* Write fault on a shared mapping. */
                ret = do_shared_fault(vmf);

        /* preallocated pagetable is unused: free it */
        if (vmf->prealloc_pte) {
                pte_free(vm_mm, vmf->prealloc_pte);
                vmf->prealloc_pte = NULL;
        }
        return ret;
}

static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
                                unsigned long addr, int page_nid,
                                int *flags)
{
        /* Take a page reference and count the hinting fault. */
        get_page(page);
        count_vm_numa_event(NUMA_HINT_FAULTS);

        /* Page lives on another node: nothing more to record. */
        if (page_nid != numa_node_id())
                return mpol_misplaced(page, vma, addr);

        /* Page is on the node we are running on: record a local fault. */
        count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
        *flags |= TNF_FAULT_LOCAL;

        return mpol_misplaced(page, vma, addr);
}

/*
 * Handle a NUMA hinting fault: make the pte accessible again, then, if the
 * policy says the page is misplaced, try to migrate it and report the fault
 * to the NUMA balancing code.  Always returns 0.
 */
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = NULL;
        int page_nid = NUMA_NO_NODE;
        int last_cpupid;
        int target_nid;
        bool migrated = false;
        pte_t pte, old_pte;
        bool was_writable = pte_savedwrite(vmf->orig_pte);
        int flags = 0;

        /*
         * The "pte" at this point cannot be used safely without
         * validation through pte_unmap_same(). It's of NUMA type but
         * the pfn may be screwed if the read is non atomic.
         */
        vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
        spin_lock(vmf->ptl);
        /* The pte changed since it was sampled: leave it to a later fault. */
        if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto out;
        }

        /*
         * Make it present again, Depending on how arch implements non
         * accessible ptes, some can allow access by kernel mode.
         */
        old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
        pte = pte_modify(old_pte, vma->vm_page_prot);
        pte = pte_mkyoung(pte);
        if (was_writable)
                pte = pte_mkwrite(pte);
        ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
        update_mmu_cache(vma, vmf->address, vmf->pte);

        /* No normal page behind the pte: nothing to migrate or account. */
        page = vm_normal_page(vma, vmf->address, pte);
        if (!page) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }

        /* TODO: handle PTE-mapped THP */
        if (PageCompound(page)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }

        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
         * much anyway since they can be in shared cache state. This misses
         * the case where a mapping is writable but the process never writes
         * to it but pte_write gets cleared during protection updates and
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
        if (!pte_write(pte))
                flags |= TNF_NO_GROUP;

        /*
         * Flag if the page is shared between multiple address spaces. This
         * is later used when determining whether to group tasks together
         */
        if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
                flags |= TNF_SHARED;

        last_cpupid = page_cpupid_last(page);
        page_nid = page_to_nid(page);
        /* Takes a page reference and asks the policy for a target node. */
        target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
                        &flags);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        /* No target node: drop the reference taken above, only account. */
        if (target_nid == NUMA_NO_NODE) {
                put_page(page);
                goto out;
        }

        /* Migrate to the requested node */
        migrated = migrate_misplaced_page(page, vma, target_nid);
        if (migrated) {
                page_nid = target_nid;
                flags |= TNF_MIGRATED;
        } else
                flags |= TNF_MIGRATE_FAIL;

out:
        /* page_nid stays NUMA_NO_NODE when we bailed before finding a page. */
        if (page_nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, page_nid, 1, flags);
        return 0;
}

static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
        if (vma_is_anonymous(vmf->vma))
                return do_huge_pmd_anonymous_page(vmf);
        if (vmf->vma->vm_ops->huge_fault)
                return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
        return VM_FAULT_FALLBACK;
}

/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma_is_anonymous(vma)) {
                /* userfaultfd write-protection takes precedence over COW. */
                if (userfaultfd_huge_pmd_wp(vma, orig_pmd))
                        return handle_userfault(vmf, VM_UFFD_WP);
                return do_huge_pmd_wp_page(vmf, orig_pmd);
        }

        if (vma->vm_ops->huge_fault) {
                vm_fault_t ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);

                /* Done unless the handler asks for the pte-level fallback. */
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        }

        /* COW or write-notify handled on pte level: split pmd. */
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);

        return VM_FAULT_FALLBACK;
}

static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;

        /*
         * No support for anonymous transparent PUD pages yet, so only a
         * non-anonymous vma providing ->huge_fault() gets a chance here.
         */
        if (!vma_is_anonymous(vma) && vma->vm_ops->huge_fault)
                return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;

        /*
         * No support for anonymous transparent PUD pages yet: those go
         * straight to the split below.  Other vmas may handle the fault
         * through ->huge_fault().
         */
        if (!vma_is_anonymous(vma) && vma->vm_ops->huge_fault) {
                vm_fault_t ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);

                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        }

        /* COW or write-notify not handled on PUD level: split pud.*/
        __split_huge_pud(vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
        pte_t entry;

        /*
         * Step 1: take a lockless snapshot of the PTE (if a page table
         * exists at all) into vmf->orig_pte.  vmf->pte is left NULL when
         * there is nothing mapped yet.
         */
        if (unlikely(pmd_none(*vmf->pmd))) {
                /*
                 * Leave __pte_alloc() until later: because vm_ops->fault may
                 * want to allocate huge page, and if we expose page table
                 * for an instant, it will be difficult to retract from
                 * concurrent faults and from rmap lookups.
                 */
                vmf->pte = NULL;
        } else {
                /* See comment in pte_alloc_one_map() */
                if (pmd_devmap_trans_unstable(vmf->pmd))
                        return 0;
                /*
                 * A regular pmd is established and it can't morph into a huge
                 * pmd from under us anymore at this point because we hold the
                 * mmap_lock read mode and khugepaged takes it in write mode.
                 * So now it's safe to run pte_offset_map().
                 */
                vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
                vmf->orig_pte = *vmf->pte;

                /*
                 * some architectures can have larger ptes than wordsize,
                 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
                 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
                 * accesses.  The code below just needs a consistent view
                 * for the ifs and we later double check anyway with the
                 * ptl lock held. So here a barrier will do.
                 */
                barrier();
                if (pte_none(vmf->orig_pte)) {
                        /* Empty slot: treat it like a missing page table. */
                        pte_unmap(vmf->pte);
                        vmf->pte = NULL;
                }
        }

        /* Step 2: dispatch on the snapshot.  Nothing mapped yet. */
        if (!vmf->pte) {
                if (vma_is_anonymous(vmf->vma))
                        return do_anonymous_page(vmf);
                else
                        return do_fault(vmf);
        }

        /* Non-present, non-none entry: handed to the swap path. */
        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);

        /* protnone entry in an accessible VMA: handed to the NUMA path. */
        if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
                return do_numa_page(vmf);

        /*
         * Step 3: present PTE.  Take the page-table lock and re-validate
         * the lockless snapshot before touching the entry.
         */
        vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
        spin_lock(vmf->ptl);
        entry = vmf->orig_pte;
        if (unlikely(!pte_same(*vmf->pte, entry))) {
                /* The PTE changed under us; do not modify it. */
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                goto unlock;
        }
        if (vmf->flags & FAULT_FLAG_WRITE) {
                /*
                 * Write to a read-only PTE: do_wp_page() is entered with
                 * vmf->ptl still held and this function does not unlock
                 * afterwards.
                 * NOTE(review): do_wp_page() is presumably responsible for
                 * dropping the lock - confirm against its definition.
                 */
                if (!pte_write(entry))
                        return do_wp_page(vmf);
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                                vmf->flags & FAULT_FLAG_WRITE)) {
                /* The PTE was actually changed. */
                update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
        } else {
                /* Skip spurious TLB flush for retried page fault */
                if (vmf->flags & FAULT_FLAG_TRIED)
                        goto unlock;
                /*
                 * This is needed only for protection faults but the arch code
                 * is not yet telling us if this is a protection fault or not.
                 * This still avoids useless tlb flushes for .text page faults
                 * with threads.
                 */
                if (vmf->flags & FAULT_FLAG_WRITE)
                        flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
        }
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
{
        /* Fault descriptor passed to every lower-level handler. */
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & PAGE_MASK,
                .flags = flags,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
        };
        /* Non-zero when this is a write fault. */
        unsigned int dirty = flags & FAULT_FLAG_WRITE;
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        p4d_t *p4d;
        vm_fault_t ret;

        /* Walk down the page tables, allocating missing levels. */
        pgd = pgd_offset(mm, address);
        p4d = p4d_alloc(mm, pgd, address);
        if (!p4d)
                return VM_FAULT_OOM;

        vmf.pud = pud_alloc(mm, p4d, address);
        if (!vmf.pud)
                return VM_FAULT_OOM;
retry_pud:
        /*
         * PUD level: either try to install a huge PUD (empty slot), or
         * handle a fault on an existing huge/devmap PUD.  In both cases a
         * VM_FAULT_FALLBACK result means "continue at the PMD level".
         */
        if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
                ret = create_huge_pud(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                /* Lockless snapshot; barrier() keeps the checks on one value. */
                pud_t orig_pud = *vmf.pud;

                barrier();
                if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

                        /* NUMA case for anonymous PUDs would go here */

                        if (dirty && !pud_write(orig_pud)) {
                                ret = wp_huge_pud(&vmf, orig_pud);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                /* Read or already-writable: just mark accessed. */
                                huge_pud_set_accessed(&vmf, orig_pud);
                                return 0;
                        }
                }
        }

        vmf.pmd = pmd_alloc(mm, vmf.pud, address);
        if (!vmf.pmd)
                return VM_FAULT_OOM;

        /* Huge pud page fault raced with pmd_alloc? */
        if (pud_trans_unstable(vmf.pud))
                goto retry_pud;

        /* PMD level: same structure as the PUD level above. */
        if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
                ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                /* Lockless snapshot of the PMD. */
                pmd_t orig_pmd = *vmf.pmd;

                barrier();
                if (unlikely(is_swap_pmd(orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                          !is_pmd_migration_entry(orig_pmd));
                        /* Wait for the migration and report the fault as handled. */
                        if (is_pmd_migration_entry(orig_pmd))
                                pmd_migration_entry_wait(mm, vmf.pmd);
                        return 0;
                }
                if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
                        if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf, orig_pmd);

                        if (dirty && !pmd_write(orig_pmd)) {
                                ret = wp_huge_pmd(&vmf, orig_pmd);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                /* Read or already-writable: just mark accessed. */
                                huge_pmd_set_accessed(&vmf, orig_pmd);
                                return 0;
                        }
                }
        }

        /* No huge mapping applies: handle the fault at PTE granularity. */
        return handle_pte_fault(&vmf);
}

/**
 * mm_account_fault - Do page fault accountings
 *
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task who triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accountings.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
 * still be in per-arch page fault handlers at the entry of page fault.
 */
static inline void mm_account_fault(struct pt_regs *regs,
                                    unsigned long address, unsigned int flags,
                                    vm_fault_t ret)
{
        bool is_major;

        /*
         * Two kinds of fault are deliberately left uncounted:
         *
         * - Failed faults (e.g. an invalid address), which includes
         *   arch_vma_access_permitted() rejecting the access before we get
         *   here.  This is therefore not a count of hardware page faults;
         *   hw profiling is the tool for that.
         *
         * - Incomplete faults (VM_FAULT_RETRY).  Those are counted once,
         *   when they finally complete.
         */
        if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
                return;

        /*
         * A fault is "major" if the successful attempt reported
         * VM_FAULT_MAJOR, or if it needed a retry (meaning the first
         * attempt could not be served immediately).
         */
        is_major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);

        /* Per-task counters are always updated. */
        if (is_major)
                current->maj_flt++;
        else
                current->min_flt++;

        /*
         * GUP-initiated faults pass regs == NULL: the per-task counters
         * above are still bumped, but perf events are skipped.
         */
        if (!regs)
                return;

        perf_sw_event(is_major ? PERF_COUNT_SW_PAGE_FAULTS_MAJ :
                                 PERF_COUNT_SW_PAGE_FAULTS_MIN,
                      1, regs, address);
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
{
        vm_fault_t ret;

        __set_current_state(TASK_RUNNING);

        count_vm_event(PGFAULT);
        count_memcg_event_mm(vma->vm_mm, PGFAULT);

        /* do counter updates before entering really critical section. */
        check_sync_rss_stat(current);

        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                            flags & FAULT_FLAG_INSTRUCTION,
                                            flags & FAULT_FLAG_REMOTE))
                return VM_FAULT_SIGSEGV;

        /*
         * Enable the memcg OOM handling for faults triggered in user
         * space.  Kernel faults are handled more gracefully.
         */
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_enter_user_fault();

        if (unlikely(is_vm_hugetlb_page(vma)))
                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
        else
                ret = __handle_mm_fault(vma, address, flags);

        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_exit_user_fault();
                /*
                 * The task may have entered a memcg OOM situation but
                 * if the allocation error was handled gracefully (no
                 * VM_FAULT_OOM), there is no need to kill anything.
                 * Just clean up the OOM state peacefully.
                 */
                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                        mem_cgroup_oom_synchronize(false);
        }

        mm_account_fault(regs, address, flags, ret);

        return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Slow path of p4d allocation (the fast path is handled inline by the
 * caller).  Allocates a p4d table and installs it in @pgd unless another
 * thread installed one first.
 *
 * Returns 0 on success (including a lost race), -ENOMEM on allocation
 * failure.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
        p4d_t *table = p4d_alloc_one(mm, address);

        if (!table)
                return -ENOMEM;

        smp_wmb(); /* See comment in __pte_alloc */

        spin_lock(&mm->page_table_lock);
        if (!pgd_present(*pgd))
                pgd_populate(mm, pgd, table);
        else        /* Another has populated it */
                p4d_free(mm, table);
        spin_unlock(&mm->page_table_lock);

        return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Slow path of page upper directory allocation (the fast path is handled
 * inline by the caller).  Allocates a pud table and installs it in @p4d,
 * bumping the mm's pud count, unless another thread installed one first.
 *
 * Returns 0 on success (including a lost race), -ENOMEM on allocation
 * failure.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
        pud_t *table = pud_alloc_one(mm, address);

        if (!table)
                return -ENOMEM;

        smp_wmb(); /* See comment in __pte_alloc */

        spin_lock(&mm->page_table_lock);
        if (p4d_present(*p4d)) {
                /* Another has populated it */
                pud_free(mm, table);
        } else {
                mm_inc_nr_puds(mm);
                p4d_populate(mm, p4d, table);
        }
        spin_unlock(&mm->page_table_lock);

        return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Slow path of page middle directory allocation (the fast path is handled
 * inline by the caller).  Allocates a pmd table and installs it in @pud
 * under the pud lock, bumping the mm's pmd count, unless another thread
 * installed one first.
 *
 * Returns 0 on success (including a lost race), -ENOMEM on allocation
 * failure.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        pmd_t *table = pmd_alloc_one(mm, address);
        spinlock_t *lock;

        if (!table)
                return -ENOMEM;

        smp_wmb(); /* See comment in __pte_alloc */

        lock = pud_lock(mm, pud);
        if (pud_present(*pud)) {
                /* Another has populated it */
                pmd_free(mm, table);
        } else {
                mm_inc_nr_pmds(mm);
                pud_populate(mm, pud, table);
        }
        spin_unlock(lock);

        return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

/*
 * Walk @mm's page tables for @address and return, locked, either the PTE
 * (via @ptepp) or - when @pmdpp is non-NULL and the PMD is huge - the PMD
 * (via @pmdpp).  The lock taken is stored in @ptlp.
 *
 * When @range is non-NULL it is initialised to cover the page (or the
 * PMD-sized region) and mmu_notifier_invalidate_range_start() is called on
 * it before the lock is taken; on the success paths it is NOT ended here.
 * On the failure paths that started it, it is ended before returning.
 *
 * Returns 0 on success with the lock in *@ptlp held, or -EINVAL if any
 * level is none/bad, the PTE is not present, or a huge PMD is found while
 * @pmdpp is NULL.
 */
int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
                          struct mmu_notifier_range *range, pte_t **ptepp,
                          pmd_t **pmdpp, spinlock_t **ptlp)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep;

        /* Descend level by level; bail out on any empty or bad entry. */
        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                goto out;

        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
                goto out;

        pud = pud_offset(p4d, address);
        if (pud_none(*pud) || unlikely(pud_bad(*pud)))
                goto out;

        pmd = pmd_offset(pud, address);
        VM_BUG_ON(pmd_trans_huge(*pmd));

        if (pmd_huge(*pmd)) {
                /* Caller did not ask for huge PMDs: treat as not found. */
                if (!pmdpp)
                        goto out;

                if (range) {
                        mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
                                                NULL, mm, address & PMD_MASK,
                                                (address & PMD_MASK) + PMD_SIZE);
                        mmu_notifier_invalidate_range_start(range);
                }
                *ptlp = pmd_lock(mm, pmd);
                /* Re-check under the lock: the PMD may have changed. */
                if (pmd_huge(*pmd)) {
                        /* Success: return with the pmd lock held. */
                        *pmdpp = pmd;
                        return 0;
                }
                /* No longer huge: undo and fall through to the PTE path. */
                spin_unlock(*ptlp);
                if (range)
                        mmu_notifier_invalidate_range_end(range);
        }

        if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
                goto out;

        if (range) {
                mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
                                        address & PAGE_MASK,
                                        (address & PAGE_MASK) + PAGE_SIZE);
                mmu_notifier_invalidate_range_start(range);
        }
        ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
        if (!pte_present(*ptep))
                goto unlock;
        /* Success: return with the PTE mapped and its lock held. */
        *ptepp = ptep;
        return 0;
unlock:
        pte_unmap_unlock(ptep, *ptlp);
        if (range)
                mmu_notifier_invalidate_range_end(range);
out:
        return -EINVAL;
}

/**
 * follow_pte - look up PTE at a user virtual address
 * @mm: the mm_struct of the target address space
 * @address: user virtual address
 * @ptepp: location to store found PTE
 * @ptlp: location to store the lock for the PTE
 *
 * On a successful return, the pointer to the PTE is stored in @ptepp;
 * the corresponding lock is taken and its location is stored in @ptlp.
 * The contents of the PTE are only stable until @ptlp is released;
 * any further use, if any, must be protected against invalidation
 * with MMU notifiers.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read.
 *
 * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
 * it is not a good general-purpose API.
 *
 * Return: zero on success, -ve otherwise.
 */
int follow_pte(struct mm_struct *mm, unsigned long address,
               pte_t **ptepp, spinlock_t **ptlp)
{
        int err;

        /* Plain PTE lookup: no notifier range and no huge-PMD result. */
        err = follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);

        return err;
}
EXPORT_SYMBOL_GPL(follow_pte);

/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * This function does not allow the caller to read the permissions
 * of the PTE.  Do not use it.
 *
 * Return: zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
        unsigned long *pfn)
{
        spinlock_t *lock;
        pte_t *pte;
        int err;

        /* Only IO and raw PFN mappings may be looked up this way. */
        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                return -EINVAL;

        err = follow_pte(vma->vm_mm, address, &pte, &lock);
        if (err)
                return err;

        /* Read the PFN while the PTE lock is still held, then drop it. */
        *pfn = pte_pfn(*pte);
        pte_unmap_unlock(pte, lock);

        return 0;
}
EXPORT_SYMBOL(follow_pfn);

#ifdef CONFIG_HAVE_IOREMAP_PROT
/*
 * Look up the physical address and protection bits behind @address in a
 * VM_IO / VM_PFNMAP mapping.
 *
 * On success returns 0 and fills *@prot and *@phys.  Returns -EINVAL if the
 * VMA is of the wrong kind, the PTE lookup fails, the PTE maps a normal
 * page, or FOLL_WRITE is requested in @flags on a read-only PTE.
 */
int follow_phys(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags,
                unsigned long *prot, resource_size_t *phys)
{
        spinlock_t *lock;
        pte_t *ptep;
        pte_t entry;
        int ret = -EINVAL;

        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                return ret;

        if (follow_pte(vma->vm_mm, address, &ptep, &lock))
                return ret;
        entry = *ptep;

        /*
         * Never return PFNs of anon folios in COW mappings, and refuse
         * write access through a PTE that is not writable.
         */
        if (!vm_normal_page(vma, address, entry) &&
            (!(flags & FOLL_WRITE) || pte_write(entry))) {
                *prot = pgprot_val(pte_pgprot(entry));
                *phys = (resource_size_t)pte_pfn(entry) << PAGE_SHIFT;
                ret = 0;
        }

        pte_unmap_unlock(ptep, lock);
        return ret;
}

int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write)
{
        resource_size_t phys_addr;
        unsigned long prot = 0;
        void __iomem *maddr;
        int offset = addr & (PAGE_SIZE-1);

        if (follow_phys(vma, addr, write, &prot, &phys_addr))
                return -EINVAL;

        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
        if (!maddr)
                return -ENOMEM;

        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
                memcpy_fromio(buf, maddr + offset, len);
        iounmap(maddr);

        return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif

/*
 * Access another process' address space as given in mm.  If non-NULL, use the
 * given task for page fault accounting.
 */
/*
 * Copies up to @len bytes between @buf and @mm's address space starting at
 * @addr; FOLL_WRITE in @gup_flags selects writing into @mm.  Returns the
 * number of bytes actually transferred, which is 0 if the mmap lock could
 * not be taken (killable wait interrupted).
 *
 * NOTE(review): @tsk is not referenced anywhere in this body.
 */
int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
        struct vm_area_struct *vma;
        void *old_buf = buf;        /* start of buffer, to compute the total */
        int write = gup_flags & FOLL_WRITE;

        if (mmap_read_lock_killable(mm))
                return 0;

        /* ignore errors, just check how much was successfully transferred */
        while (len) {
                int bytes, ret, offset;
                void *maddr;
                struct page *page = NULL;

                /* Pin exactly one page covering @addr. */
                ret = get_user_pages_remote(mm, addr, 1,
                                gup_flags, &page, &vma, NULL);
                if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
                        break;
#else
                        /*
                         * Check if this is a VM_IO | VM_PFNMAP VMA, which
                         * we can access using slightly different code.
                         */
                        vma = find_vma(mm, addr);
                        if (!vma || vma->vm_start > addr)
                                break;
                        /*
                         * If the VMA has no ->access hook, ret keeps the
                         * failing GUP value and the loop ends below.
                         */
                        if (vma->vm_ops && vma->vm_ops->access)
                                ret = vma->vm_ops->access(vma, addr, buf,
                                                          len, write);
                        if (ret <= 0)
                                break;
                        bytes = ret;
#endif
                } else {
                        /* Clamp this step to the end of the current page. */
                        bytes = len;
                        offset = addr & (PAGE_SIZE-1);
                        if (bytes > PAGE_SIZE-offset)
                                bytes = PAGE_SIZE-offset;

                        maddr = kmap(page);
                        if (write) {
                                copy_to_user_page(vma, page, addr,
                                                  maddr + offset, buf, bytes);
                                set_page_dirty_lock(page);
                        } else {
                                copy_from_user_page(vma, page, addr,
                                                    buf, maddr + offset, bytes);
                        }
                        kunmap(page);
                        /* Drop the reference taken by get_user_pages_remote(). */
                        put_page(page);
                }
                len -= bytes;
                buf += bytes;
                addr += bytes;
        }
        mmap_read_unlock(mm);

        return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:                the mm_struct of the target address space
 * @addr:        start address to access
 * @buf:        source or destination buffer
 * @len:        number of bytes to transfer
 * @gup_flags:        flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        int copied;

        /* No task is associated with this access. */
        copied = __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);

        return copied;
}

/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *mm = get_task_mm(tsk);
        int copied;

        /* The task has no mm to access: nothing transferred. */
        if (!mm)
                return 0;

        copied = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);

        /* Release the reference obtained by get_task_mm(). */
        mmput(mm);

        return copied;
}
EXPORT_SYMBOL_GPL(access_process_vm);

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        char *page_buf;
        char *path;

        /*
         * This may be called from atomic context, so sleeping on the mmap
         * lock is not an option: give up silently if it is contended.
         */
        if (!mmap_read_trylock(mm))
                return;

        vma = find_vma(mm, ip);
        if (!vma || !vma->vm_file)
                goto out_unlock;

        /* Scratch page for the path; GFP_NOWAIT for the same reason. */
        page_buf = (char *)__get_free_page(GFP_NOWAIT);
        if (!page_buf)
                goto out_unlock;

        path = file_path(vma->vm_file, page_buf, PAGE_SIZE);
        if (IS_ERR(path))
                path = "?";
        printk("%s%s[%lx+%lx]", prefix, kbasename(path),
                        vma->vm_start,
                        vma->vm_end - vma->vm_start);
        free_page((unsigned long)page_buf);

out_unlock:
        mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
        /*
         * Two cases are exempt from the annotations below:
         *
         * - Kernel-memory accesses: some code (nfs/sunrpc) runs socket ops
         *   on kernel memory while holding the mmap_lock.  That is safe
         *   because kernel memory is never paged out, so no fault can
         *   happen, and annotating it would only yield false positives.
         *
         * - Sections that run with page faults disabled.
         */
        if (uaccess_kernel() || pagefault_disabled())
                return;

        __might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
        if (current->mm)
                might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
/*
 * @addr_hint:           address inside the huge page; its subpage is the
 *                       "target" and is processed last.
 * @pages_per_huge_page: number of base pages in the huge page.
 * @process_subpage:     callback invoked once per subpage with the
 *                       subpage's address, its index and @arg.
 * @arg:                 opaque pointer forwarded to @process_subpage.
 *
 * cond_resched() is called before every callback invocation.
 */
static inline void process_huge_page(
        unsigned long addr_hint, unsigned int pages_per_huge_page,
        void (*process_subpage)(unsigned long addr, int idx, void *arg),
        void *arg)
{
        int i, n, base, l;
        /* Start address of the huge page: addr_hint rounded down to its size. */
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

        /* Process target subpage last to keep its cache lines hot */
        might_sleep();
        /* n: index of the target subpage within the huge page. */
        n = (addr_hint - addr) / PAGE_SIZE;
        if (2 * n <= pages_per_huge_page) {
                /* If target subpage in first half of huge page */
                base = 0;
                l = n;
                /*
                 * Process subpages at the end of huge page: indices
                 * [2n, pages_per_huge_page), highest first.  This leaves
                 * the window [0, 2n) for the converging loop below.
                 */
                for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
                        cond_resched();
                        process_subpage(addr + i * PAGE_SIZE, i, arg);
                }
        } else {
                /* If target subpage in second half of huge page */
                base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
                l = pages_per_huge_page - n;
                /*
                 * Process subpages at the begin of huge page: indices
                 * [0, base), lowest first.  This leaves the window
                 * [base, pages_per_huge_page) for the loop below.
                 */
                for (i = 0; i < base; i++) {
                        cond_resched();
                        process_subpage(addr + i * PAGE_SIZE, i, arg);
                }
        }
        /*
         * Process remaining subpages in left-right-left-right pattern
         * towards the target subpage.  The window is [base, base + 2l);
         * in the final iteration (i == l - 1) right_idx equals n, so the
         * target subpage is the very last one processed.
         */
        for (i = 0; i < l; i++) {
                int left_idx = base + i;
                int right_idx = base + 2 * l - 1 - i;

                cond_resched();
                process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
                cond_resched();
                process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
        }
}

/*
 * Clear every subpage of a gigantic page in ascending order.  Subpages are
 * reached through mem_map_next() rather than plain pointer arithmetic, and
 * the loop yields between subpages.
 */
static void clear_gigantic_page(struct page *page,
                                unsigned long addr,
                                unsigned int pages_per_huge_page)
{
        struct page *cur = page;
        int idx;

        might_sleep();
        for (idx = 0; idx < pages_per_huge_page; ) {
                cond_resched();
                clear_user_highpage(cur, addr + idx * PAGE_SIZE);

                /* Advance the index first: mem_map_next() takes the new one. */
                idx++;
                cur = mem_map_next(cur, page, idx);
        }
}

/* process_huge_page() callback: clear subpage @idx of the page in @arg. */
static void clear_subpage(unsigned long addr, int idx, void *arg)
{
        struct page *head = arg;
        struct page *target = head + idx;

        clear_user_highpage(target, addr);
}

/*
 * Clear all subpages of a huge page.  Pages larger than MAX_ORDER_NR_PAGES
 * take the gigantic-page path; everything else goes through
 * process_huge_page(), which handles the subpage at @addr_hint last.
 */
void clear_huge_page(struct page *page,
                     unsigned long addr_hint, unsigned int pages_per_huge_page)
{
        unsigned long huge_bytes = (unsigned long)pages_per_huge_page << PAGE_SHIFT;
        unsigned long start = addr_hint & ~(huge_bytes - 1);

        if (likely(pages_per_huge_page <= MAX_ORDER_NR_PAGES)) {
                process_huge_page(addr_hint, pages_per_huge_page,
                                  clear_subpage, page);
                return;
        }

        clear_gigantic_page(page, start, pages_per_huge_page);
}

/*
 * Copy a gigantic page subpage by subpage in ascending order.  Both the
 * source and destination subpages are reached through mem_map_next(), and
 * the loop yields before each copy.
 */
static void copy_user_gigantic_page(struct page *dst, struct page *src,
                                    unsigned long addr,
                                    struct vm_area_struct *vma,
                                    unsigned int pages_per_huge_page)
{
        struct page *const dst_head = dst;
        struct page *const src_head = src;
        int idx;

        for (idx = 0; idx < pages_per_huge_page;
             idx++,
             dst = mem_map_next(dst, dst_head, idx),
             src = mem_map_next(src, src_head, idx)) {
                cond_resched();
                copy_user_highpage(dst, src, addr + idx*PAGE_SIZE, vma);
        }
}

/*
 * Context handed to copy_subpage() through process_huge_page()'s opaque
 * argument pointer.
 */
struct copy_subpage_arg {
        struct page *dst;                /* first subpage of the destination */
        struct page *src;                /* first subpage of the source */
        struct vm_area_struct *vma;        /* forwarded to copy_user_highpage() */
};

/* process_huge_page() callback: copy subpage @idx from src to dst. */
static void copy_subpage(unsigned long addr, int idx, void *arg)
{
        struct copy_subpage_arg *ctx = arg;
        struct page *to = ctx->dst + idx;
        struct page *from = ctx->src + idx;

        copy_user_highpage(to, from, addr, ctx->vma);
}

void copy_user_huge_page(struct page *dst, struct page *src,
                         unsigned long addr_hint, struct vm_area_struct *vma,
                         unsigned int pages_per_huge_page)
{
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
        struct copy_subpage_arg arg = {
                .dst = dst,
                .src = src,
                .vma = vma,
        };

        if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
                copy_user_gigantic_page(dst, src, addr, vma,
                                        pages_per_huge_page);
                return;
        }

        process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}

/*
 * Fill huge page @dst_page from the user buffer @usr_src, one base page at
 * a time, stopping at the first subpage that cannot be copied completely.
 *
 * @allow_pagefault selects the mapping primitive: kmap() when true,
 * kmap_atomic() when false.
 *
 * Returns the number of bytes that were NOT copied (0 when the whole huge
 * page was filled).
 */
long copy_huge_page_from_user(struct page *dst_page,
                                const void __user *usr_src,
                                unsigned int pages_per_huge_page,
                                bool allow_pagefault)
{
        void *src = (void *)usr_src;
        void *page_kaddr;
        unsigned long i, rc = 0;
        /* Starts as the full size; each copied byte is subtracted below. */
        unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
        struct page *subpage = dst_page;

        for (i = 0; i < pages_per_huge_page;
             i++, subpage = mem_map_next(subpage, dst_page, i)) {
                if (allow_pagefault)
                        page_kaddr = kmap(subpage);
                else
                        page_kaddr = kmap_atomic(subpage);
                /* rc is treated as the number of bytes left uncopied. */
                rc = copy_from_user(page_kaddr,
                                (const void __user *)(src + i * PAGE_SIZE),
                                PAGE_SIZE);
                /* Undo the mapping with the primitive matching the one used. */
                if (allow_pagefault)
                        kunmap(subpage);
                else
                        kunmap_atomic(page_kaddr);

                ret_val -= (PAGE_SIZE - rc);
                /* Short copy: stop; the partial subpage is not flushed. */
                if (rc)
                        break;

                flush_dcache_page(subpage);

                cond_resched();
        }
        return ret_val;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

/*
 * Create the slab cache from which page-table locks are allocated.
 * SLAB_PANIC is passed, so no failure check is made here.
 */
void __init ptlock_cache_init(void)
{
        struct kmem_cache *cache;

        cache = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
                                  SLAB_PANIC, NULL);
        page_ptl_cachep = cache;
}

bool ptlock_alloc(struct page *page)
{
        spinlock_t *ptl;

        ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
        if (!ptl)
                return false;
        page->ptl = ptl;
        return true;
}

void ptlock_free(struct page *page)
{
        kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif









































































































































































































































































































































    1 
























    1 







































    1 














    1 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SPINLOCK_H
#define __LINUX_SPINLOCK_H

/*
 * include/linux/spinlock.h - generic spinlock/rwlock declarations
 *
 * here's the role of the various spinlock/rwlock related include files:
 *
 * on SMP builds:
 *
 *  asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the
 *                        initializers
 *
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  asm/spinlock.h:       contains the arch_spin_*()/etc. lowlevel
 *                        implementations, mostly inline assembly code
 *
 *   (also included on UP-debug builds:)
 *
 *  linux/spinlock_api_smp.h:
 *                        contains the prototypes for the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 *
 * on UP builds:
 *
 *  linux/spinlock_type_up.h:
 *                        contains the generic, simplified UP spinlock type.
 *                        (which is an empty structure on non-debug builds)
 *
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  linux/spinlock_up.h:
 *                        contains the arch_spin_*()/etc. version of UP
 *                        builds. (which are NOPs on non-debug, non-preempt
 *                        builds)
 *
 *   (included on UP-non-debug builds:)
 *
 *  linux/spinlock_api_up.h:
 *                        builds the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 */

#include <linux/typecheck.h>
#include <linux/preempt.h>
#include <linux/linkage.h>
#include <linux/compiler.h>
#include <linux/irqflags.h>
#include <linux/thread_info.h>
#include <linux/kernel.h>
#include <linux/stringify.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <asm/barrier.h>
#include <asm/mmiowb.h>


/*
 * Must define these before including other files, inline functions need them
 */
#define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME

#define LOCK_SECTION_START(extra)               \
        ".subsection 1\n\t"                     \
        extra                                   \
        ".ifndef " LOCK_SECTION_NAME "\n\t"     \
        LOCK_SECTION_NAME ":\n\t"               \
        ".endif\n"

#define LOCK_SECTION_END                        \
        ".previous\n\t"

#define __lockfunc __section(".spinlock.text")

/*
 * Pull the arch_spinlock_t and arch_rwlock_t definitions:
 */
#include <linux/spinlock_types.h>

/*
 * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them):
 */
#ifdef CONFIG_SMP
# include <asm/spinlock.h>
#else
# include <linux/spinlock_up.h>
#endif

#ifdef CONFIG_DEBUG_SPINLOCK
  /* Out-of-line initializer used by the debug build; defined elsewhere. */
  extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
                                   struct lock_class_key *key, short inner);

/*
 * Debug build: each expansion site gets its own static lock_class_key and
 * passes the stringified @lock expression as the name, with wait type
 * LD_WAIT_SPIN.
 */
# define raw_spin_lock_init(lock)                                        \
do {                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN);        \
} while (0)

#else
/* Non-debug build: plain assignment of the static unlocked initializer. */
# define raw_spin_lock_init(lock)                                \
        do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0)
#endif

/* Forward to the arch-level query on the embedded ->raw_lock. */
#define raw_spin_is_locked(lock)        arch_spin_is_locked(&(lock)->raw_lock)

/*
 * Use the arch-level contention query when the architecture defines one;
 * otherwise evaluate @lock (for side effects / unused warnings) and yield 0.
 */
#ifdef arch_spin_is_contended
#define raw_spin_is_contended(lock)        arch_spin_is_contended(&(lock)->raw_lock)
#else
#define raw_spin_is_contended(lock)        (((void)(lock), 0))
#endif /*arch_spin_is_contended*/

/*
 * smp_mb__after_spinlock() provides the equivalent of a full memory barrier
 * between program-order earlier lock acquisitions and program-order later
 * memory accesses.
 *
 * This guarantees that the following two properties hold:
 *
 *   1) Given the snippet:
 *
 *          { X = 0;  Y = 0; }
 *
 *          CPU0                                CPU1
 *
 *          WRITE_ONCE(X, 1);                WRITE_ONCE(Y, 1);
 *          spin_lock(S);                        smp_mb();
 *          smp_mb__after_spinlock();        r1 = READ_ONCE(X);
 *          r0 = READ_ONCE(Y);
 *          spin_unlock(S);
 *
 *      it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0)
 *      and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments
 *      preceding the call to smp_mb__after_spinlock() in __schedule() and in
 *      try_to_wake_up().
 *
 *   2) Given the snippet:
 *
 *  { X = 0;  Y = 0; }
 *
 *  CPU0                CPU1                                CPU2
 *
 *  spin_lock(S);        spin_lock(S);                        r1 = READ_ONCE(Y);
 *  WRITE_ONCE(X, 1);        smp_mb__after_spinlock();        smp_rmb();
 *  spin_unlock(S);        r0 = READ_ONCE(X);                r2 = READ_ONCE(X);
 *                        WRITE_ONCE(Y, 1);
 *                        spin_unlock(S);
 *
 *      it is forbidden that CPU0's critical section executes before CPU1's
 *      critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1)
 *      and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments
 *      preceding the calls to smp_rmb() in try_to_wake_up() for similar
 *      snippets but "projected" onto two CPUs.
 *
 * Property (2) upgrades the lock to an RCsc lock.
 *
 * Since most load-store architectures implement ACQUIRE with an smp_mb() after
 * the LL/SC loop, they need no further barriers. Similarly all our TSO
 * architectures imply an smp_mb() for each atomic instruction and equally don't
 * need more.
 *
 * Architectures that can implement ACQUIRE better need to take care.
 */
/* Default: an empty statement unless a definition was already provided. */
#ifndef smp_mb__after_spinlock
#define smp_mb__after_spinlock()        do { } while (0)
#endif

#ifdef CONFIG_DEBUG_SPINLOCK
 /*
  * CONFIG_DEBUG_SPINLOCK: the do_raw_spin_*() operations are out-of-line
  * functions declared here. The _flags variant ignores @flags and maps
  * straight onto do_raw_spin_lock().
  */
 extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
#define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock)
 extern int do_raw_spin_trylock(raw_spinlock_t *lock);
 extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
#else
/*
 * Non-debug lock path: note the acquisition via the __acquire() annotation,
 * take the arch-level lock embedded in @lock, then run mmiowb_spin_lock().
 */
static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
{
        __acquire(lock);
        arch_spin_lock(&lock->raw_lock);
        mmiowb_spin_lock();
}

/*
 * Fallback for architectures that do not define a flags-aware lock
 * operation: @flags is dropped and the plain arch_spin_lock() is used.
 */
#ifndef arch_spin_lock_flags
#define arch_spin_lock_flags(lock, flags)        arch_spin_lock(lock)
#endif

/*
 * Same sequence as do_raw_spin_lock(), but the arch-level operation is
 * arch_spin_lock_flags() and receives the value pointed to by @flags.
 */
static inline void
do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock)
{
        __acquire(lock);
        arch_spin_lock_flags(&lock->raw_lock, *flags);
        mmiowb_spin_lock();
}

/*
 * Non-debug trylock path. Returns the result of arch_spin_trylock() on the
 * embedded raw lock unchanged; mmiowb_spin_lock() runs only when that
 * result is non-zero.
 */
static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
{
        int acquired = arch_spin_trylock(&lock->raw_lock);

        /* Attempt failed: hand the (zero) result straight back. */
        if (!acquired)
                return acquired;

        mmiowb_spin_lock();
        return acquired;
}

/*
 * Non-debug unlock path, mirroring do_raw_spin_lock() in reverse order:
 * mmiowb_spin_unlock() first, then the arch-level unlock, then the
 * __release() annotation.
 */
static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
{
        mmiowb_spin_unlock();
        arch_spin_unlock(&lock->raw_lock);
        __release(lock);
}
#endif

/*
 * Define the various spin_lock methods.  Note we define these
 * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The
 * various methods are defined as nops in the case they are not
 * required.
 */
/* Trylock: the _raw_spin_trylock() result is wrapped in __cond_lock(). */
#define raw_spin_trylock(lock)        __cond_lock(lock, _raw_spin_trylock(lock))

/* Unconditional lock: forwards directly to _raw_spin_lock(). */
#define raw_spin_lock(lock)        _raw_spin_lock(lock)

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* With lock debugging, @subclass is passed through to the _nested variant. */
# define raw_spin_lock_nested(lock, subclass) \
        _raw_spin_lock_nested(lock, subclass)

/*
 * @nest_lock must be a pointer to an object with a 'dep_map' member of type
 * struct lockdep_map (enforced by typecheck()); that map is what gets passed
 * down.
 */
# define raw_spin_lock_nest_lock(lock, nest_lock)                        \
         do {                                                                \
                 typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\
                 _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map);        \
         } while (0)
#else
/*
 * Always evaluate the 'subclass' argument to avoid that the compiler
 * warns about set-but-not-used variables when building with
 * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1.
 */
# define raw_spin_lock_nested(lock, subclass)                \
        _raw_spin_lock(((void)(subclass), (lock)))
/* Note: in this configuration @nest_lock is not evaluated at all. */
# define raw_spin_lock_nest_lock(lock, nest_lock)        _raw_spin_lock(lock)
#endif

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)

/*
 * SMP or debug builds: _raw_spin_lock_irqsave() returns the saved state,
 * which is assigned to @flags. @flags must be an unsigned long lvalue
 * (checked by typecheck()).
 */
#define raw_spin_lock_irqsave(lock, flags)                        \
        do {                                                \
                typecheck(unsigned long, flags);        \
                flags = _raw_spin_lock_irqsave(lock);        \
        } while (0)

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* With lock debugging, @subclass is forwarded to the _nested variant. */
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave_nested(lock, subclass);        \
        } while (0)
#else
/* Without lock debugging, @subclass is ignored (and not evaluated). */
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave(lock);                        \
        } while (0)
#endif

#else

/*
 * Remaining (UP, non-debug) builds: here _raw_spin_lock_irqsave() takes
 * @flags as a second argument instead of returning a value.
 */
#define raw_spin_lock_irqsave(lock, flags)                \
        do {                                                \
                typecheck(unsigned long, flags);        \
                _raw_spin_lock_irqsave(lock, flags);        \
        } while (0)

/* @subclass is ignored in this configuration. */
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)        \
        raw_spin_lock_irqsave(lock, flags)

#endif

/* One-to-one mappings from the raw_spin_*() names onto the _raw_spin_*() API. */
#define raw_spin_lock_irq(lock)                _raw_spin_lock_irq(lock)
#define raw_spin_lock_bh(lock)                _raw_spin_lock_bh(lock)
#define raw_spin_unlock(lock)                _raw_spin_unlock(lock)
#define raw_spin_unlock_irq(lock)        _raw_spin_unlock_irq(lock)

/* @flags must have type unsigned long (checked by typecheck()). */
#define raw_spin_unlock_irqrestore(lock, flags)                \
        do {                                                        \
                typecheck(unsigned long, flags);                \
                _raw_spin_unlock_irqrestore(lock, flags);        \
        } while (0)
#define raw_spin_unlock_bh(lock)        _raw_spin_unlock_bh(lock)

/* Trylock variant: the result is wrapped in __cond_lock(). */
#define raw_spin_trylock_bh(lock) \
        __cond_lock(lock, _raw_spin_trylock_bh(lock))

/*
 * Disable local interrupts, then try the lock. Evaluates to 1 on success
 * (interrupts stay disabled); on failure interrupts are re-enabled and the
 * expression evaluates to 0.
 */
#define raw_spin_trylock_irq(lock) \
({ \
        local_irq_disable(); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_enable(); 0;  }); \
})

/*
 * As above, but the interrupt state is saved into @flags first and restored
 * from @flags when the lock attempt fails.
 */
#define raw_spin_trylock_irqsave(lock, flags) \
({ \
        local_irq_save(flags); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_restore(flags); 0; }); \
})

/* Include rwlock functions */
#include <linux/rwlock.h>

/*
 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
# include <linux/spinlock_api_smp.h>
#else
# include <linux/spinlock_api_up.h>
#endif

/*
 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
 */

/*
 * Return the raw lock embedded in @lock. Because the parameter is typed
 * spinlock_t *, routing a macro argument through this helper also makes the
 * compiler check the argument's type.
 */
static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
{
        raw_spinlock_t *raw = &lock->rlock;

        return raw;
}

#ifdef CONFIG_DEBUG_SPINLOCK

/*
 * Debug build: initialize the embedded raw lock through
 * __raw_spin_lock_init(), with a per-expansion static lock_class_key, the
 * stringified @lock expression as name, and wait type LD_WAIT_CONFIG.
 */
# define spin_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __raw_spin_lock_init(spinlock_check(lock),                \
                             #lock, &__key, LD_WAIT_CONFIG);        \
} while (0)

#else

/*
 * Non-debug build: spinlock_check() is called only for its type check; the
 * lock is then overwritten with the static unlocked initializer.
 */
# define spin_lock_init(_lock)                        \
do {                                                \
        spinlock_check(_lock);                        \
        *(_lock) = __SPIN_LOCK_UNLOCKED(_lock);        \
} while (0)

#endif

/* Acquire @lock by passing its embedded raw lock to raw_spin_lock(). */
static __always_inline void spin_lock(spinlock_t *lock)
{
        raw_spin_lock(&lock->rlock);
}

/* Acquire @lock via raw_spin_lock_bh() on its embedded raw lock. */
static __always_inline void spin_lock_bh(spinlock_t *lock)
{
        raw_spin_lock_bh(&lock->rlock);
}

/*
 * Try to acquire @lock without spinning. Returns the result of
 * raw_spin_trylock() on the embedded raw lock.
 */
static __always_inline int spin_trylock(spinlock_t *lock)
{
        return raw_spin_trylock(&lock->rlock);
}

/*
 * Macro (not a function) so that @subclass reaches raw_spin_lock_nested()
 * untouched; spinlock_check() supplies the raw lock and the type check.
 */
#define spin_lock_nested(lock, subclass)                        \
do {                                                                \
        raw_spin_lock_nested(spinlock_check(lock), subclass);        \
} while (0)

/* As above, forwarding @nest_lock to raw_spin_lock_nest_lock(). */
#define spin_lock_nest_lock(lock, nest_lock)                                \
do {                                                                        \
        raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);        \
} while (0)

/* Acquire @lock via raw_spin_lock_irq() on its embedded raw lock. */
static __always_inline void spin_lock_irq(spinlock_t *lock)
{
        raw_spin_lock_irq(&lock->rlock);
}

/*
 * Macro because @flags is written by the raw variant and so must be passed
 * as an lvalue, not by value.
 */
#define spin_lock_irqsave(lock, flags)                                \
do {                                                                \
        raw_spin_lock_irqsave(spinlock_check(lock), flags);        \
} while (0)

/* Same, additionally forwarding @subclass to the raw _nested variant. */
#define spin_lock_irqsave_nested(lock, flags, subclass)                        \
do {                                                                        \
        raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
} while (0)

/* Release @lock via raw_spin_unlock() on its embedded raw lock. */
static __always_inline void spin_unlock(spinlock_t *lock)
{
        raw_spin_unlock(&lock->rlock);
}

/* Release @lock via raw_spin_unlock_bh() on its embedded raw lock. */
static __always_inline void spin_unlock_bh(spinlock_t *lock)
{
        raw_spin_unlock_bh(&lock->rlock);
}

/* Release @lock via raw_spin_unlock_irq() on its embedded raw lock. */
static __always_inline void spin_unlock_irq(spinlock_t *lock)
{
        raw_spin_unlock_irq(&lock->rlock);
}

/*
 * Release @lock via raw_spin_unlock_irqrestore(), handing back the @flags
 * value (taken by value here, unlike the irqsave side).
 */
static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
        raw_spin_unlock_irqrestore(&lock->rlock, flags);
}

/* Returns the result of raw_spin_trylock_bh() on the embedded raw lock. */
static __always_inline int spin_trylock_bh(spinlock_t *lock)
{
        return raw_spin_trylock_bh(&lock->rlock);
}

/*
 * Returns the result of raw_spin_trylock_irq() on the embedded raw lock:
 * 1 with local interrupts left disabled on success, 0 with them re-enabled
 * on failure.
 */
static __always_inline int spin_trylock_irq(spinlock_t *lock)
{
        return raw_spin_trylock_irq(&lock->rlock);
}

/*
 * Macro because @flags is written by the raw variant; evaluates to the
 * result of raw_spin_trylock_irqsave().
 */
#define spin_trylock_irqsave(lock, flags)                        \
({                                                                \
        raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
})

/**
 * spin_is_locked() - Check whether a spinlock is locked.
 * @lock: Pointer to the spinlock.
 *
 * This function is NOT required to provide any memory ordering
 * guarantees; it could be used for debugging purposes or, when
 * additional synchronization is needed, accompanied with other
 * constructs (memory barriers) enforcing the synchronization.
 *
 * Returns: 1 if @lock is locked, 0 otherwise.
 *
 * Note that the function only tells you that the spinlock is
 * seen to be locked, not that it is locked on your CPU.
 *
 * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n,
 * the return value is always 0 (see include/linux/spinlock_up.h).
 * Therefore you should not rely heavily on the return value.
 */
static __always_inline int spin_is_locked(spinlock_t *lock)
{
        /* Delegates to raw_spin_is_locked() on the embedded raw lock. */
        return raw_spin_is_locked(&lock->rlock);
}

/*
 * Returns the result of raw_spin_is_contended() on the embedded raw lock;
 * that is constant 0 when the architecture provides no
 * arch_spin_is_contended.
 */
static __always_inline int spin_is_contended(spinlock_t *lock)
{
        return raw_spin_is_contended(&lock->rlock);
}

/* Forwards to assert_raw_spin_locked() (defined elsewhere) on ->rlock. */
#define assert_spin_locked(lock)        assert_raw_spin_locked(&(lock)->rlock)

/*
 * Pull the atomic_t declaration:
 * (asm-mips/atomic.h needs above definitions)
 */
#include <linux/atomic.h>
/**
 * atomic_dec_and_lock - lock on reaching reference count zero
 * @atomic: the atomic counter
 * @lock: the spinlock in question
 *
 * Decrements @atomic by 1.  If the result is 0, returns true and locks
 * @lock.  Returns false for all other cases.
 */
/* Out-of-line implementation; callers use the atomic_dec_and_lock() macro. */
extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
/* __cond_lock() wrapper around the implementation's return value. */
#define atomic_dec_and_lock(atomic, lock) \
                __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))

/*
 * irqsave flavour of atomic_dec_and_lock(). The macro takes @flags as an
 * lvalue and passes its address to the out-of-line implementation.
 */
extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
                                        unsigned long *flags);
#define atomic_dec_and_lock_irqsave(atomic, lock, flags) \
                __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags)))

/*
 * Out-of-line worker behind alloc_bucket_spinlocks(); the macro supplies
 * @name and @key. Declaration only -- NOTE(review): the meaning of
 * @max_size and @cpu_mult is not visible from this header; confirm against
 * the implementation.
 */
int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask,
                             size_t max_size, unsigned int cpu_mult,
                             gfp_t gfp, const char *name,
                             struct lock_class_key *key);

/*
 * alloc_bucket_spinlocks - call __alloc_bucket_spinlocks() with a
 * per-call-site lock class key and the stringified @locks expression as
 * the name. Evaluates to the int returned by __alloc_bucket_spinlocks().
 *
 * The internal temporaries carry a double-underscore prefix (the same
 * convention as the __key in raw_spin_lock_init()/spin_lock_init()) so that
 * caller arguments mentioning variables called 'key' or 'ret' are not
 * captured by the macro's own locals.
 */
#define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp)    \
        ({                                                                     \
                static struct lock_class_key __key;                             \
                int __ret;                                                     \
                                                                             \
                __ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size,   \
                                               cpu_mult, gfp, #locks, &__key); \
                __ret;                                                             \
        })

/*
 * Declaration only. Presumably releases an array obtained through
 * alloc_bucket_spinlocks() -- confirm against the implementation.
 */
void free_bucket_spinlocks(spinlock_t *locks);

#endif /* __LINUX_SPINLOCK_H */



















    1 
    1 



    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TLB_H
#define _ASM_X86_TLB_H

/*
 * These three hooks are empty statements here; they are defined before
 * <asm-generic/tlb.h> is included below.
 */
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)

/*
 * Self-referential define so that preprocessor checks can see tlb_flush is
 * provided by this header (presumably tested by <asm-generic/tlb.h> --
 * confirm there), plus a forward declaration ahead of the include; the body
 * follows the include.
 */
#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);

#include <asm-generic/tlb.h>

/*
 * Flush the TLB for the address space described by @tlb.
 *
 * When either tlb->fullmm or tlb->need_flush_all is set, the range passed
 * on is [0, TLB_FLUSH_ALL); otherwise it is the gathered range
 * [tlb->start, tlb->end). The stride shift comes from
 * tlb_get_unmap_shift() and tlb->freed_tables is forwarded unchanged.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        unsigned int stride_shift = tlb_get_unmap_shift(tlb);
        const int whole_mm = tlb->fullmm || tlb->need_flush_all;
        unsigned long start = whole_mm ? 0UL : tlb->start;
        unsigned long end = whole_mm ? TLB_FLUSH_ALL : tlb->end;

        flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}

/*
 * While x86 architecture in general requires an IPI to perform TLB
 * shootdown, enablement code for several hypervisors overrides
 * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
 * a hypercall. To keep software pagetable walkers safe in this case we
 * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment
 * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
 * for more details.
 */
static inline void __tlb_remove_table(void *table)
{
        /* The table is released through free_page_and_swap_cache(). */
        free_page_and_swap_cache(table);
}

#endif /* _ASM_X86_TLB_H */









































































    1 





































































































































































































































    1 




    1 

























































































































































































































































































































































































































    1 
















































    1 


























































    1 

















    1 






















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright IBM Corporation, 2001
 *
 * Author: Dipankar Sarma <dipankar@in.ibm.com>
 *
 * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 * Papers:
 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                http://lse.sourceforge.net/locking/rcupdate.html
 *
 */

#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <asm/processor.h>
#include <linux/cpumask.h>

/*
 * Wraparound-tolerant ordering tests: the operands are compared through
 * their difference, which is "greater or equal" when it lies in the lower
 * half of the unsigned range and "less than" when it lies in the upper half.
 * ulong2long() reinterprets an unsigned long lvalue as a long.
 */
#define ULONG_CMP_GE(a, b)        ((a) - (b) <= ULONG_MAX / 2)
#define ULONG_CMP_LT(a, b)        ((a) - (b) > ULONG_MAX / 2)
#define ulong2long(a)                (*(long *)&(a))
#define USHORT_CMP_GE(a, b)        ((unsigned short)((a) - (b)) <= USHRT_MAX / 2)
#define USHORT_CMP_LT(a, b)        ((unsigned short)((a) - (b)) > USHRT_MAX / 2)

/* Exported common interfaces */
/* Declarations only; the implementations live outside this header. */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void rcu_barrier_tasks_rude(void);
void synchronize_rcu(void);

#ifdef CONFIG_PREEMPT_RCU

/* CONFIG_PREEMPT_RCU: the read-side primitives are out-of-line functions. */
void __rcu_read_lock(void);
void __rcu_read_unlock(void);

/*
 * Defined as a macro as it is a very low level header included from
 * areas that don't even know about current.  This gives the rcu_read_lock()
 * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
 */
#define rcu_preempt_depth() (current->rcu_read_lock_nesting)

#else /* #ifdef CONFIG_PREEMPT_RCU */

/*
 * rcu_read_unlock_strict() is an empty statement under CONFIG_TINY_RCU and
 * an out-of-line function otherwise.
 */
#ifdef CONFIG_TINY_RCU
#define rcu_read_unlock_strict() do { } while (0)
#else
void rcu_read_unlock_strict(void);
#endif

/* Non-PREEMPT_RCU read-side entry: simply disables preemption. */
static inline void __rcu_read_lock(void)
{
        preempt_disable();
}

/*
 * Non-PREEMPT_RCU read-side exit: re-enable preemption, then invoke
 * rcu_read_unlock_strict() (an empty statement under CONFIG_TINY_RCU).
 */
static inline void __rcu_read_unlock(void)
{
        preempt_enable();
        rcu_read_unlock_strict();
}

/*
 * Non-PREEMPT_RCU stand-in for the nesting-depth query: always reports 0.
 */
static inline int rcu_preempt_depth(void)
{
        const int depth = 0;

        return depth;
}

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

/* Internal to kernel */
/* Declarations only; rcu_scheduler_active is tagged __read_mostly. */
void rcu_init(void);
extern int rcu_scheduler_active __read_mostly;
void rcu_sched_clock_irq(int user);
void rcu_report_dead(unsigned int cpu);
void rcutree_migrate_callbacks(int cpu);

/* Real function with CONFIG_TASKS_RCU_GENERIC, empty inline stub without. */
#ifdef CONFIG_TASKS_RCU_GENERIC
void rcu_init_tasks_generic(void);
#else
static inline void rcu_init_tasks_generic(void) { }
#endif

/* Real functions with CONFIG_RCU_STALL_COMMON, empty inline stubs without. */
#ifdef CONFIG_RCU_STALL_COMMON
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
#else /* #ifdef CONFIG_RCU_STALL_COMMON */
static inline void rcu_sysrq_start(void) { }
static inline void rcu_sysrq_end(void) { }
#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */

/* Real functions with CONFIG_NO_HZ_FULL, empty inline stubs without. */
#ifdef CONFIG_NO_HZ_FULL
void rcu_user_enter(void);
void rcu_user_exit(void);
#else
static inline void rcu_user_enter(void) { }
static inline void rcu_user_exit(void) { }
#endif /* CONFIG_NO_HZ_FULL */

/* Real functions with CONFIG_RCU_NOCB_CPU, empty inline stubs without. */
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_init_nohz(void);
void rcu_nocb_flush_deferred_wakeup(void);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_init_nohz(void) { }
static inline void rcu_nocb_flush_deferred_wakeup(void) { }
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */

/**
 * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers
 * @a: Code that RCU needs to pay attention to.
 *
 * RCU read-side critical sections are forbidden in the inner idle loop,
 * that is, between the rcu_idle_enter() and the rcu_idle_exit() -- RCU
 * will happily ignore any such read-side critical sections.  However,
 * things like powertop need tracepoints in the inner idle loop.
 *
 * This macro provides the way out:  RCU_NONIDLE(do_something_with_RCU())
 * will tell RCU that it needs to pay attention, invoke its argument
 * (in this example, calling the do_something_with_RCU() function),
 * and then tell RCU to go back to ignoring this CPU.  It is permissible
 * to nest RCU_NONIDLE() wrappers, but not indefinitely (but the limit is
 * on the order of a million or so, even on 32-bit systems).  It is
 * not legal to block within RCU_NONIDLE(), nor is it permissible to
 * transfer control either into or out of RCU_NONIDLE()'s statement.
 */
#define RCU_NONIDLE(a) \
        do { \
                rcu_irq_enter_irqson(); /* RCU starts paying attention. */ \
                do { a; } while (0); /* Inner loop keeps @a a lone statement. */ \
                rcu_irq_exit_irqson(); /* RCU goes back to ignoring us. */ \
        } while (0)

/*
 * Note a quasi-voluntary context switch for RCU-tasks's benefit.
 * This is a macro rather than an inline function to avoid #include hell.
 */
#ifdef CONFIG_TASKS_RCU_GENERIC

# ifdef CONFIG_TASKS_RCU
/*
 * Clear @t's rcu_tasks_holdout flag, but only when @preempt is false and the
 * flag is currently set (so the store is skipped when nothing would change).
 */
# define rcu_tasks_classic_qs(t, preempt)                                \
        do {                                                                \
                if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout))        \
                        WRITE_ONCE((t)->rcu_tasks_holdout, false);        \
        } while (0)
void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks(void);
# else
/* Without CONFIG_TASKS_RCU the tasks flavour aliases plain RCU. */
# define rcu_tasks_classic_qs(t, preempt) do { } while (0)
# define call_rcu_tasks call_rcu
# define synchronize_rcu_tasks synchronize_rcu
# endif

# ifdef CONFIG_TASKS_TRACE_RCU
/*
 * If @t has not yet been marked checked (trc_reader_checked is false) and
 * its trc_reader_nesting is zero, mark it checked with a release store and
 * follow that with a full memory barrier.
 */
# define rcu_tasks_trace_qs(t)                                                \
        do {                                                                \
                if (!likely(READ_ONCE((t)->trc_reader_checked)) &&        \
                    !unlikely(READ_ONCE((t)->trc_reader_nesting))) {        \
                        smp_store_release(&(t)->trc_reader_checked, true); \
                        smp_mb(); /* Readers partitioned by store. */        \
                }                                                        \
        } while (0)
# else
/* Empty statement when CONFIG_TASKS_TRACE_RCU is off. */
# define rcu_tasks_trace_qs(t) do { } while (0)
# endif

/* Run the classic hook and then the trace hook for @t. */
#define rcu_tasks_qs(t, preempt)                                        \
do {                                                                        \
        rcu_tasks_classic_qs((t), (preempt));                                \
        rcu_tasks_trace_qs((t));                                        \
} while (0)

# ifdef CONFIG_TASKS_RUDE_RCU
void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks_rude(void);
# endif

/* A voluntary context switch is rcu_tasks_qs() with @preempt == false. */
#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false)
/* Task-exit hooks; declarations only. */
void exit_tasks_rcu_start(void);
void exit_tasks_rcu_stop(void);
void exit_tasks_rcu_finish(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
/*
 * Without CONFIG_TASKS_RCU_GENERIC: the hooks are empty statements, the
 * tasks API aliases plain RCU, and the exit hooks are empty inline stubs.
 */
#define rcu_tasks_qs(t, preempt) do { } while (0)
#define rcu_note_voluntary_context_switch(t) do { } while (0)
#define call_rcu_tasks call_rcu
#define synchronize_rcu_tasks synchronize_rcu
static inline void exit_tasks_rcu_start(void) { }
static inline void exit_tasks_rcu_stop(void) { }
static inline void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */

/**
 * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
 *
 * As an accident of implementation, an RCU Tasks Trace grace period also
 * acts as an RCU grace period.  However, this could change at any time.
 * Code relying on this accident must call this function to verify that
 * this accident is still happening.
 *
 * You have been warned!
 */
static inline bool rcu_trace_implies_rcu_gp(void)
{
        /* Currently unconditional; callers must keep checking this. */
        return true;
}

/**
 * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
 *
 * This macro resembles cond_resched(), except that it is defined to
 * report potential quiescent states to RCU-tasks even if the cond_resched()
 * machinery were to be shut off, as some advocate for PREEMPTION kernels.
 */
#define cond_resched_tasks_rcu_qs() \
do { \
        rcu_tasks_qs(current, false); \
        cond_resched(); \
} while (0)

/**
 * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states
 * @old_ts: jiffies at start of processing.
 *
 * This helper is for long-running softirq handlers, such as NAPI threads in
 * networking. The caller should initialize the variable passed in as @old_ts
 * at the beginning of the softirq handler. When invoked frequently, this macro
 * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will
 * provide both RCU and RCU-Tasks quiescent states. Note that this macro
 * modifies its old_ts argument.
 *
 * Because regions of code that have disabled softirq act as RCU read-side
 * critical sections, this macro should be invoked with softirq (and
 * preemption) enabled.
 *
 * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would
 * have more chance to invoke schedule() calls and provide necessary quiescent
 * states. By contrast, calling cond_resched() alone won't achieve the same
 * effect because cond_resched() does not provide RCU-Tasks quiescent states.
 */
#define rcu_softirq_qs_periodic(old_ts) \
do { \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \
            time_after(jiffies, (old_ts) + HZ / 10)) { \
                preempt_disable(); \
                rcu_softirq_qs(); \
                preempt_enable(); \
                (old_ts) = jiffies; \
        } \
} while (0)

/*
 * Infrastructure to implement the synchronize_() primitives in
 * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
 */

#if defined(CONFIG_TREE_RCU)
#include <linux/rcutree.h>
#elif defined(CONFIG_TINY_RCU)
#include <linux/rcutiny.h>
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif

/*
 * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls
 * are needed for dynamic initialization and destruction of rcu_head
 * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for
 * dynamic initialization and destruction of statically allocated rcu_head
 * structures.  However, rcu_head structures allocated dynamically in the
 * heap don't need any initialization.
 */
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
/* Real implementations live out of line. */
void init_rcu_head(struct rcu_head *head);
void destroy_rcu_head(struct rcu_head *head);
void init_rcu_head_on_stack(struct rcu_head *head);
void destroy_rcu_head_on_stack(struct rcu_head *head);
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
/* With the option off, all four calls are empty inline functions. */
static inline void init_rcu_head(struct rcu_head *head) { }
static inline void destroy_rcu_head(struct rcu_head *head) { }
static inline void init_rcu_head_on_stack(struct rcu_head *head) { }
static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { }
#endif        /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */

#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU)
/* Out-of-line check, built only when both CPU hotplug and RCU proving are on. */
bool rcu_lockdep_current_cpu_online(void);
#else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
/* Otherwise the current CPU is unconditionally reported as online. */
static inline bool rcu_lockdep_current_cpu_online(void)
{
        return true;
}
#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Tell lockdep that @map is being acquired, recording _THIS_IP_ as the
 * acquisition address.
 * NOTE(review): going by lockdep's lock_acquire() prototype the literal
 * arguments appear to be subclass 0, trylock 0, read 2 (recursive read),
 * check 0 and no nest_lock -- confirm against linux/lockdep.h.
 */
static inline void rcu_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}

/* Counterpart of rcu_lock_acquire(): hand @map and _THIS_IP_ to lock_release(). */
static inline void rcu_lock_release(struct lockdep_map *map)
{
        lock_release(map, _THIS_IP_);
}

/* lockdep maps used by the RCU lock/unlock helpers and checks; defined elsewhere. */
extern struct lockdep_map rcu_lock_map;
extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
extern struct lockdep_map rcu_callback_map;
/*
 * Out-of-line lockdep queries.  The #else arm below supplies inline
 * stand-ins for the four *_held() functions when lockdep is not built.
 */
int debug_lockdep_rcu_enabled(void);
int rcu_read_lock_held(void);
int rcu_read_lock_bh_held(void);
int rcu_read_lock_sched_held(void);
int rcu_read_lock_any_held(void);

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/* No lockdep: both hooks are empty statements and never evaluate their argument. */
# define rcu_lock_acquire(a)                do { } while (0)
# define rcu_lock_release(a)                do { } while (0)

/* No lockdep to consult, so always answer "held" (1). */
static inline int rcu_read_lock_held(void) { return 1; }

/* No lockdep to consult, so always answer "held" (1). */
static inline int rcu_read_lock_bh_held(void) { return 1; }

/* Without lockdep, answer 1 exactly when the current context is not preemptible. */
static inline int rcu_read_lock_sched_held(void)
{
        if (preemptible())
                return 0;
        return 1;
}

/* Without lockdep, answer 1 exactly when the current context is not preemptible. */
static inline int rcu_read_lock_any_held(void)
{
        if (preemptible())
                return 0;
        return 1;
}

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

#ifdef CONFIG_PROVE_RCU

/**
 * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
 * @c: condition to check
 * @s: informative message
 *
 * This checks debug_lockdep_rcu_enabled() before checking (c) to
 * prevent early boot splats due to lockdep not yet being initialized,
 * and rechecks it after checking (c) to prevent false-positive splats
 * due to races with lockdep being disabled.  See commit 3066820034b5dd
 * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail.
 */
#define RCU_LOCKDEP_WARN(c, s)                                                \
        do {                                                                \
                static bool __section(".data.unlikely") __warned;        \
                if (debug_lockdep_rcu_enabled() && (c) &&                \
                    debug_lockdep_rcu_enabled() && !__warned) {                \
                        __warned = true;                                \
                        lockdep_rcu_suspicious(__FILE__, __LINE__, s);        \
                }                                                        \
        } while (0)

#if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU)
/* Non-preemptible RCU: warn if rcu_lock_map is held when this check is reached. */
static inline void rcu_preempt_sleep_check(void)
{
        RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
                         "Illegal context switch in RCU read-side critical section");
}
#else /* #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU) */
/* Otherwise this flavor has nothing to check. */
static inline void rcu_preempt_sleep_check(void) { }
#endif /* #else #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU) */

/*
 * Run the three "illegal context switch" checks in turn: the plain-RCU
 * one (empty when CONFIG_PREEMPT_RCU is set), then RCU-bh, then RCU-sched.
 */
#define rcu_sleep_check()                                                \
        do {                                                                \
                rcu_preempt_sleep_check();                                \
                RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),        \
                                 "Illegal context switch in RCU-bh read-side critical section"); \
                RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),        \
                                 "Illegal context switch in RCU-sched read-side critical section"); \
        } while (0)

#else /* #ifdef CONFIG_PROVE_RCU */

/* CONFIG_PROVE_RCU off: both expand to empty statements; (c) and (s) are never evaluated. */
#define RCU_LOCKDEP_WARN(c, s) do { } while (0)
#define rcu_sleep_check() do { } while (0)

#endif /* #else #ifdef CONFIG_PROVE_RCU */

/*
 * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
 * and rcu_assign_pointer().  Some of these could be folded into their
 * callers, but they are left separate in order to ease introduction of
 * multiple pointers markings to match different RCU implementations
 * (e.g., __srcu), should this make sense in the future.
 */

#ifdef __CHECKER__
/*
 * Seen only by the checker: compares @p with itself cast to a pointer in
 * address space @space, so a missing annotation on @p gets flagged.
 */
#define rcu_check_sparse(p, space) \
        ((void)(((typeof(*p) space *)p) == p))
#else /* #ifdef __CHECKER__ */
/* Ordinary builds: expands to nothing, @p is not evaluated. */
#define rcu_check_sparse(p, space)
#endif /* #else #ifdef __CHECKER__ */

/**
 * unrcu_pointer - mark a pointer as not being RCU protected
 * @p: pointer needing to lose its __rcu property
 *
 * Converts @p from an __rcu pointer to a __kernel pointer.
 * This allows an __rcu pointer to be used with xchg() and friends.
 */
#define unrcu_pointer(p)                                                \
({                                                                        \
        typeof(*p) *_________p1 = (typeof(*p) *__force)(p);                \
        rcu_check_sparse(p, __rcu);                                         \
        ((typeof(*p) __force __kernel *)(_________p1));                 \
})

#define __rcu_access_pointer(p, space) \
({ \
        typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(_________p1)); \
})
#define __rcu_dereference_check(p, c, space) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(________p1)); \
})
#define __rcu_dereference_protected(p, c, space) \
({ \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(p)); \
})
#define rcu_dereference_raw(p) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(p) ________p1 = READ_ONCE(p); \
        ((typeof(*p) __force __kernel *)(________p1)); \
})

/**
 * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
 * @v: The value to statically initialize with.
 *
 * Expands to @v cast to an __rcu-annotated pointer to the same pointed-to
 * type.  @v appears once inside typeof(), which does not evaluate it, and
 * once as the cast operand.  The whole expansion is parenthesized so that
 * a postfix operator applied to the result (->, [], a call) binds to the
 * cast pointer rather than to @v ahead of the cast.
 */
#define RCU_INITIALIZER(v) ((typeof(*(v)) __force __rcu *)(v))

/**
 * rcu_assign_pointer() - assign to RCU-protected pointer
 * @p: pointer to assign to
 * @v: value to assign (publish)
 *
 * Assigns the specified value to the specified RCU-protected
 * pointer, ensuring that any concurrent RCU readers will see
 * any prior initialization.
 *
 * Inserts memory barriers on architectures that require them
 * (which is most of them), and also prevents the compiler from
 * reordering the code that initializes the structure after the pointer
 * assignment.  More importantly, this call documents which pointers
 * will be dereferenced by RCU read-side code.
 *
 * In some special cases, you may use RCU_INIT_POINTER() instead
 * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
 * to the fact that it does not constrain either the CPU or the compiler.
 * That said, using RCU_INIT_POINTER() when you should have used
 * rcu_assign_pointer() is a very bad thing that results in
 * impossible-to-diagnose memory corruption.  So please be careful.
 * See the RCU_INIT_POINTER() comment header for details.
 *
 * Note that rcu_assign_pointer() evaluates each of its arguments only
 * once, appearances notwithstanding.  One of the "extra" evaluations
 * is in typeof() and the other visible only to sparse (__CHECKER__),
 * neither of which actually execute the argument.  As with most cpp
 * macros, this execute-arguments-only-once property is important, so
 * please be careful when making changes to rcu_assign_pointer() and the
 * other macros that it invokes.
 */
#define rcu_assign_pointer(p, v)                                              \
do {                                                                              \
        uintptr_t _r_a_p__v = (uintptr_t)(v);                                      \
        rcu_check_sparse(p, __rcu);                                              \
                                                                              \
        if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)              \
                WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
        else                                                                      \
                smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
} while (0)

/**
 * rcu_replace_pointer() - replace an RCU pointer, returning its old value
 * @rcu_ptr: RCU pointer, whose old value is returned
 * @ptr: regular pointer
 * @c: the lockdep conditions under which the dereference will take place
 *
 * Perform a replacement, where @rcu_ptr is an RCU-annotated
 * pointer and @c is the lockdep argument that is passed to the
 * rcu_dereference_protected() call used to read that pointer.  The old
 * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr.
 */
#define rcu_replace_pointer(rcu_ptr, ptr, c)                                \
({                                                                        \
        typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c));        \
        rcu_assign_pointer((rcu_ptr), (ptr));                                \
        __tmp;                                                                \
})

/**
 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
 * @p: The pointer to read
 *
 * Return the value of the specified RCU-protected pointer, but omit the
 * lockdep checks for being in an RCU read-side critical section.  This is
 * useful when the value of this pointer is accessed, but the pointer is
 * not dereferenced, for example, when testing an RCU-protected pointer
 * against NULL.  Although rcu_access_pointer() may also be used in cases
 * where update-side locks prevent the value of the pointer from changing,
 * you should instead use rcu_dereference_protected() for this use case.
 *
 * It is also permissible to use rcu_access_pointer() when read-side
 * access to the pointer was removed at least one grace period ago, as
 * is the case in the context of the RCU callback that is freeing up
 * the data, or after a synchronize_rcu() returns.  This can be useful
 * when tearing down multi-linked structures after a grace period
 * has elapsed.
 */
#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu) /* READ_ONCE() fetch, no lockdep check */

/**
 * rcu_dereference_check() - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Do an rcu_dereference(), but check that the conditions under which the
 * dereference will take place are correct.  Typically the conditions
 * indicate the various locking conditions that should be held at that
 * point.  The check should return true if the conditions are satisfied.
 * An implicit check for being in an RCU read-side critical section
 * (rcu_read_lock()) is included.
 *
 * For example:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
 *
 * could be used to indicate to lockdep that foo->bar may only be dereferenced
 * if either rcu_read_lock() is held, or that the lock required to replace
 * the bar struct at foo->bar is held.
 *
 * Note that the list of conditions may also include indications of when a lock
 * need not be held, for example during initialisation or destruction of the
 * target struct:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
 *                                              atomic_read(&foo->usage) == 0);
 *
 * Inserts memory barriers on architectures that require them
 * (currently only the Alpha), prevents the compiler from refetching
 * (and from merging fetches), and, more importantly, documents exactly
 * which pointers are protected by RCU and checks that the pointer is
 * annotated as __rcu.
 */
#define rcu_dereference_check(p, c) \
        __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu)

/**
 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-bh counterpart to rcu_dereference_check().
 */
#define rcu_dereference_bh_check(p, c) \
        __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)

/**
 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-sched counterpart to rcu_dereference_check().
 */
#define rcu_dereference_sched_check(p, c) \
        __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
                                __rcu)

/*
 * The tracing infrastructure traces RCU (we want that), but unfortunately
 * some of the RCU checks cause tracing to lock up the system.
 *
 * The no-tracing version of rcu_dereference_raw() must not call
 * rcu_read_lock_held().  Passing the constant 1 as the condition achieves
 * that: no rcu_read_lock_held() call appears in the expansion, while the
 * sparse check for the __rcu annotation is still made.
 */
#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu)

/**
 * rcu_dereference_protected() - fetch RCU pointer when updates prevented
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Return the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE().  This is useful in cases where update-side locks
 * prevent the value of the pointer from changing.  Please note that this
 * primitive does *not* prevent the compiler from repeating this reference
 * or combining it with other references, so it should not be used without
 * protection of appropriate locks.
 *
 * This function is only for update-side use.  Using this function
 * when protected only by rcu_read_lock() will result in infrequent
 * but very ugly failures.
 */
#define rcu_dereference_protected(p, c) \
        __rcu_dereference_protected((p), (c), __rcu)


/**
 * rcu_dereference() - fetch RCU-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * This is a simple wrapper around rcu_dereference_check() that passes 0
 * as the extra condition, so only rcu_read_lock_held() can satisfy the
 * lockdep check.
 */
#define rcu_dereference(p) rcu_dereference_check(p, 0)

/**
 * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_bh_check() do the dirty work, passing 0 as the
 * extra condition so that only rcu_read_lock_bh_held() can satisfy the
 * lockdep check.
 */
#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)

/**
 * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_sched_check() do the dirty work, passing 0 as the
 * extra condition so that only rcu_read_lock_sched_held() can satisfy the
 * lockdep check.
 */
#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)

/**
 * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism
 * @p: The pointer to hand off
 *
 * This is simply an identity function, but it documents where a pointer
 * is handed off from RCU to some other synchronization mechanism, for
 * example, reference counting or locking.  In C11, it would map to
 * kill_dependency().  It could be used as follows::
 *
 *        rcu_read_lock();
 *        p = rcu_dereference(gp);
 *        long_lived = is_long_lived(p);
 *        if (long_lived) {
 *                if (!atomic_inc_not_zero(p->refcnt))
 *                        long_lived = false;
 *                else
 *                        p = rcu_pointer_handoff(p);
 *        }
 *        rcu_read_unlock();
 */
#define rcu_pointer_handoff(p) (p) /* identity: expands to its argument and nothing else */

/**
 * rcu_read_lock() - mark the beginning of an RCU read-side critical section
 *
 * When synchronize_rcu() is invoked on one CPU while other CPUs
 * are within RCU read-side critical sections, then the
 * synchronize_rcu() is guaranteed to block until after all the other
 * CPUs exit their critical sections.  Similarly, if call_rcu() is invoked
 * on one CPU while other CPUs are within RCU read-side critical
 * sections, invocation of the corresponding RCU callback is deferred
 * until after all the other CPUs exit their critical sections.
 *
 * Note, however, that RCU callbacks are permitted to run concurrently
 * with new RCU read-side critical sections.  One way that this can happen
 * is via the following sequence of events: (1) CPU 0 enters an RCU
 * read-side critical section, (2) CPU 1 invokes call_rcu() to register
 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
 * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU
 * callback is invoked.  This is legal, because the RCU read-side critical
 * section that was running concurrently with the call_rcu() (and which
 * therefore might be referencing something that the corresponding RCU
 * callback would free up) has completed before the corresponding
 * RCU callback is invoked.
 *
 * RCU read-side critical sections may be nested.  Any deferred actions
 * will be deferred until the outermost RCU read-side critical section
 * completes.
 *
 * You can avoid reading and understanding the next paragraph by
 * following this rule: don't put anything in an rcu_read_lock() RCU
 * read-side critical section that would block in a !PREEMPTION kernel.
 * But if you want the full story, read on!
 *
 * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU),
 * it is illegal to block while in an RCU read-side critical section.
 * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
 * kernel builds, RCU read-side critical sections may be preempted,
 * but explicit blocking is illegal.  Finally, in preemptible RCU
 * implementations in real-time (with -rt patchset) kernel builds, RCU
 * read-side critical sections may be preempted and they may also block, but
 * only when acquiring spinlocks that are subject to priority inheritance.
 */
static __always_inline void rcu_read_lock(void)
{
        /* Enter the read-side critical section itself. */
        __rcu_read_lock();
        /* Lock-context annotation (presumably for sparse only -- confirm). */
        __acquire(RCU);
        /* Lockdep bookkeeping; expands to nothing without CONFIG_DEBUG_LOCK_ALLOC. */
        rcu_lock_acquire(&rcu_lock_map);
        /* Checked after entry, and only when CONFIG_PROVE_RCU is enabled. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock() used illegally while idle");
}

/*
 * So where is rcu_write_lock()?  It does not exist, as there is no
 * way for writers to lock out RCU readers.  This is a feature, not
 * a bug -- this property is what provides RCU's performance benefits.
 * Of course, writers must coordinate with each other.  The normal
 * spinlock primitives work well for this, but any other technique may be
 * used as well.  RCU does not care how the writers keep out of each
 * others' way, as long as they do so.
 */

/**
 * rcu_read_unlock() - marks the end of an RCU read-side critical section.
 *
 * In most situations, rcu_read_unlock() is immune from deadlock.
 * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock()
 * is responsible for deboosting, which it does via rt_mutex_unlock().
 * Unfortunately, this function acquires the scheduler's runqueue and
 * priority-inheritance spinlocks.  This means that deadlock could result
 * if the caller of rcu_read_unlock() already holds one of these locks or
 * any lock that is ever acquired while holding them.
 *
 * That said, RCU readers are never priority boosted unless they were
 * preempted.  Therefore, one way to avoid deadlock is to make sure
 * that preemption never happens within any RCU read-side critical
 * section whose outermost rcu_read_unlock() is called with one of
 * rt_mutex_unlock()'s locks held.  Such preemption can be avoided in
 * a number of ways, for example, by invoking preempt_disable() before
 * the critical section's outermost rcu_read_lock().
 *
 * Given that the set of locks acquired by rt_mutex_unlock() might change
 * at any time, a somewhat more future-proofed approach is to make sure
 * that preemption never happens within any RCU read-side critical
 * section whose outermost rcu_read_unlock() is called with irqs disabled.
 * This approach relies on the fact that rt_mutex_unlock() currently only
 * acquires irq-disabled locks.
 *
 * The second of these two approaches is best in most situations,
 * however, the first approach can also be useful, at least to those
 * developers willing to keep abreast of the set of locks acquired by
 * rt_mutex_unlock().
 *
 * See rcu_read_lock() for more information.
 */
static inline void rcu_read_unlock(void)
{
        /* Checked before leaving, and only when CONFIG_PROVE_RCU is enabled. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock() used illegally while idle");
        /* Lock-context annotation (presumably for sparse only -- confirm). */
        __release(RCU);
        /* Leave the read-side critical section itself. */
        __rcu_read_unlock();
        /* Lockdep is told last, after the section has actually been left. */
        rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
}

/**
 * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
 *
 * This is equivalent of rcu_read_lock(), but also disables softirqs.
 * Note that anything else that disables softirqs can also serve as
 * an RCU read-side critical section.
 *
 * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
 * was invoked from some other task.
 */
static inline void rcu_read_lock_bh(void)
{
        /* Disabling softirqs is what opens the RCU-bh critical section. */
        local_bh_disable();
        /* Lock-context annotation (presumably for sparse only -- confirm). */
        __acquire(RCU_BH);
        /* Lockdep bookkeeping; expands to nothing without CONFIG_DEBUG_LOCK_ALLOC. */
        rcu_lock_acquire(&rcu_bh_lock_map);
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_bh() used illegally while idle");
}

/**
 * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section
 *
 * See rcu_read_lock_bh() for more information.
 */
static inline void rcu_read_unlock_bh(void)
{
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_bh() used illegally while idle");
        /* Undo rcu_read_lock_bh() in reverse order: lockdep, annotation, softirqs. */
        rcu_lock_release(&rcu_bh_lock_map);
        __release(RCU_BH);
        local_bh_enable();
}

/**
 * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
 *
 * This is equivalent of rcu_read_lock(), but disables preemption.
 * Read-side critical sections can also be introduced by anything else
 * that disables preemption, including local_irq_disable() and friends.
 *
 * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_sched() from process context if the matching
 * rcu_read_lock_sched() was invoked from an NMI handler.
 */
static inline void rcu_read_lock_sched(void)
{
        /* Disabling preemption is what opens the RCU-sched critical section. */
        preempt_disable();
        /* Lock-context annotation (presumably for sparse only -- confirm). */
        __acquire(RCU_SCHED);
        /* Lockdep bookkeeping; expands to nothing without CONFIG_DEBUG_LOCK_ALLOC. */
        rcu_lock_acquire(&rcu_sched_lock_map);
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_sched() used illegally while idle");
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 * Compared with rcu_read_lock_sched() this therefore uses the _notrace
 * preemption primitive and omits both rcu_lock_acquire() and the
 * RCU_LOCKDEP_WARN() check.
 */
static inline notrace void rcu_read_lock_sched_notrace(void)
{
        preempt_disable_notrace();
        __acquire(RCU_SCHED);
}

/**
 * rcu_read_unlock_sched() - marks the end of an RCU-sched critical section
 *
 * See rcu_read_lock_sched() for more information.
 */
static inline void rcu_read_unlock_sched(void)
{
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_sched() used illegally while idle");
        /* Undo rcu_read_lock_sched() in reverse order: lockdep, annotation, preemption. */
        rcu_lock_release(&rcu_sched_lock_map);
        __release(RCU_SCHED);
        preempt_enable();
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 * Mirror image of rcu_read_lock_sched_notrace(): drop the annotation,
 * then re-enable preemption with the _notrace primitive.
 */
static inline notrace void rcu_read_unlock_sched_notrace(void)
{
        __release(RCU_SCHED);
        preempt_enable_notrace();
}

/**
 * RCU_INIT_POINTER() - initialize an RCU protected pointer
 * @p: The pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * Initialize an RCU-protected pointer in special cases where readers
 * do not need ordering constraints on the CPU or the compiler.  These
 * special cases are:
 *
 * 1.        This use of RCU_INIT_POINTER() is NULLing out the pointer *or*
 * 2.        The caller has taken whatever steps are required to prevent
 *        RCU readers from concurrently accessing this pointer *or*
 * 3.        The referenced data structure has already been exposed to
 *        readers either at compile time or via rcu_assign_pointer() *and*
 *
 *        a.        You have not made *any* reader-visible changes to
 *                this structure since then *or*
 *        b.        It is OK for readers accessing this structure from its
 *                new location to see the old state of the structure.  (For
 *                example, the changes were to statistical counters or to
 *                other state where exact synchronization is not required.)
 *
 * Failure to follow these rules governing use of RCU_INIT_POINTER() will
 * result in impossible-to-diagnose memory corruption.  As in the structures
 * will look OK in crash dumps, but any concurrent RCU readers might
 * see pre-initialized values of the referenced data structure.  So
 * please be very careful how you use RCU_INIT_POINTER()!!!
 *
 * If you are creating an RCU-protected linked structure that is accessed
 * by a single external-to-structure RCU-protected pointer, then you may
 * use RCU_INIT_POINTER() to initialize the internal RCU-protected
 * pointers, but you must use rcu_assign_pointer() to initialize the
 * external-to-structure pointer *after* you have completely initialized
 * the reader-accessible portions of the linked structure.
 *
 * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no
 * ordering guarantees for either the CPU or the compiler.
 */
#define RCU_INIT_POINTER(p, v) \
        do { \
                rcu_check_sparse(p, __rcu); \
                WRITE_ONCE(p, RCU_INITIALIZER(v)); \
        } while (0)

/**
 * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer
 * @p: The pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * GCC-style initialization for an RCU-protected pointer in a structure field.
 * Expands to the designated initializer ".p = RCU_INITIALIZER(v)", so @p must
 * be the bare field name.
 */
#define RCU_POINTER_INITIALIZER(p, v) \
                .p = RCU_INITIALIZER(v)

/*
 * True when @offset is small enough (at most 4095) to be treated as an
 * rcu_head offset that kvfree_rcu() can handle rather than as a callback
 * function address.
 */
#define __is_kvfree_rcu_offset(offset) ((offset) <= 4095)

/*
 * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
 */
#define __kvfree_rcu(head, offset) \
        do { \
                BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
                kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
        } while (0)

/**
 * kfree_rcu() - kfree an object after a grace period.
 * @ptr:        pointer to kfree
 * @rhf:        the name of the struct rcu_head within the type of @ptr.
 *
 * Many rcu callbacks functions just call kfree() on the base structure.
 * These functions are trivial, but their size adds up, and furthermore
 * when they are used in a kernel module, that module must invoke the
 * high-latency rcu_barrier() function at module-unload time.
 *
 * The kfree_rcu() function handles this issue.  Rather than encoding a
 * function address in the embedded rcu_head structure, kfree_rcu() instead
 * encodes the offset of the rcu_head structure within the base structure.
 * Because the functions are not allowed in the low-order 4096 bytes of
 * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
 * If the offset is larger than 4095 bytes, a compile-time error will
 * be generated in __kvfree_rcu(). If this error is triggered, you can
 * either fall back to use of call_rcu() or rearrange the structure to
 * position the rcu_head structure into the first 4096 bytes.
 *
 * Note that the allowable offset might decrease in the future, for example,
 * to allow something like kmem_cache_free_rcu().
 *
 * The BUILD_BUG_ON check must not involve any function calls, hence the
 * checks are done in macros here.
 */
#define kfree_rcu(ptr, rhf)                                                \
do {                                                                        \
        typeof (ptr) ___p = (ptr);                                        \
                                                                        \
        if (___p)                                                        \
                __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
} while (0)

/**
 * kvfree_rcu() - kvfree an object after a grace period.
 *
 * This macro consists of one or two arguments and it is
 * based on whether an object is head-less or not. If it
 * has a head then a semantic stays the same as it used
 * to be before:
 *
 *     kvfree_rcu(ptr, rhf);
 *
 * where @ptr is a pointer to kvfree(), @rhf is the name
 * of the rcu_head structure within the type of @ptr.
 *
 * When it comes to head-less variant, only one argument
 * is passed and that is just a pointer which has to be
 * freed after a grace period. Therefore the semantic is
 *
 *     kvfree_rcu(ptr);
 *
 * where @ptr is a pointer to kvfree().
 *
 * Please note, head-less way of freeing is permitted to
 * use from a context that has to follow might_sleep()
 * annotation. Otherwise, please switch and embed the
 * rcu_head structure within the type of @ptr.
 */
#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__,                \
        kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)

/*
 * Argument-count dispatch: yields its third argument.  With two caller
 * arguments that slot holds kvfree_rcu_arg_2, with one it holds
 * kvfree_rcu_arg_1.
 */
#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
/* The two-argument form is exactly kfree_rcu(). */
#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf)
/*
 * Single-argument (head-less) form of kvfree_rcu(): a NULL @ptr is
 * ignored; otherwise kvfree_call_rcu() is invoked with NULL as its
 * first argument and the pointer itself, cast to rcu_callback_t, as
 * its second.
 */
#define kvfree_rcu_arg_1(ptr)                                        \
do {                                                                \
        typeof(ptr) ___p = (ptr);                                \
                                                                \
        if (___p)                                                \
                kvfree_call_rcu(NULL, (rcu_callback_t) (___p));        \
} while (0)

/*
 * Place this after a lock-acquisition primitive to guarantee that
 * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
 * if the UNLOCK and LOCK are executed by the same CPU or if the
 * UNLOCK and LOCK operate on the same lock variable.
 *
 * Expands to smp_mb() when CONFIG_ARCH_WEAK_RELEASE_ACQUIRE is defined
 * and to an empty statement otherwise.
 */
#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
#define smp_mb__after_unlock_lock()        smp_mb()  /* Full ordering for lock. */
#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
#define smp_mb__after_unlock_lock()        do { } while (0)
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */


/* Has the specified rcu_head structure been handed to call_rcu()? */

/**
 * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
 * @rhp: The rcu_head structure to initialize.
 *
 * If you intend to invoke rcu_head_after_call_rcu() to test whether a
 * given rcu_head structure has already been passed to call_rcu(), then
 * you must also invoke this rcu_head_init() function on it just after
 * allocating that structure.  Calls to this function must not race with
 * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
 */
static inline void rcu_head_init(struct rcu_head *rhp)
{
        rhp->func = (rcu_callback_t)~0L;
}

/**
 * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()?
 * @rhp: The rcu_head structure to test.
 * @f: The function passed to call_rcu() along with @rhp.
 *
 * Returns @true if @rhp->func currently equals @f, and @false otherwise.
 * When returning @false, a one-time warning is emitted unless @rhp->func
 * still holds the all-ones sentinel stored by rcu_head_init(); this
 * includes the case where @rhp has already been invoked after a grace
 * period.  Calls to this function must not race with callback
 * invocation.  One way to avoid such races is to enclose the call to
 * rcu_head_after_call_rcu() in an RCU read-side critical section that
 * includes a read-side fetch of the pointer to the structure
 * containing @rhp.
 */
static inline bool
rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
{
        rcu_callback_t cur = READ_ONCE(rhp->func);

        if (cur != f) {
                /* Anything other than @f or the init sentinel is unexpected. */
                WARN_ON_ONCE(cur != (rcu_callback_t)~0L);
                return false;
        }

        return true;
}

/* kernel/ksysfs.c definitions */
extern int rcu_expedited;
extern int rcu_normal;

#endif /* __LINUX_RCUPDATE_H */











































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1994 Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 */
#include <asm/fpu/internal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>

#include <linux/hardirq.h>
#include <linux/pkeys.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>

/*
 * Represents the initial FPU state. It's mostly (but not completely) zeroes,
 * depending on the FPU hardware format:
 */
union fpregs_state init_fpstate __read_mostly;

/*
 * Track in-kernel FPU usage.
 *
 * Per-CPU flag: set by kernel_fpu_begin_mask(), cleared by kernel_fpu_end()
 * and consulted by irq_fpu_usable().
 */
static DEFINE_PER_CPU(bool, in_kernel_fpu);

/*
 * Track which context is using the FPU on the CPU:
 */
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * Tell the caller whether the FPU may be used in kernel mode with the
 * kernel_fpu_begin()/kernel_fpu_end() sequence in the current context.
 */
bool irq_fpu_usable(void)
{
        /* Never usable from NMI context; asking from there also warns once. */
        if (WARN_ON_ONCE(in_nmi()))
                return false;

        /* A kernel FPU section is already active on this CPU. */
        if (this_cpu_read(in_kernel_fpu))
                return false;

        /*
         * Hard interrupt context: safe only while soft interrupts are
         * enabled, which means the interrupt did not hit inside a
         * fpregs_lock()'ed critical region.
         */
        if (in_irq())
                return !softirq_count();

        /*
         * Neither NMI nor hard interrupt context.  The FPU can be used in:
         *
         * - task context, except from within fpregs_lock()'ed critical
         *   regions;
         *
         * - soft interrupt processing context, which cannot happen while
         *   in a fpregs_lock()'ed critical region.
         */
        return true;
}
EXPORT_SYMBOL(irq_fpu_usable);

/*
 * Save the live FPU registers into @fpu's in-memory state.
 *
 * Must be called with preemption disabled.  Returns 1 if the register
 * contents are still intact after the save and may stay active, 0 if
 * the save destroyed them.
 *
 * Only the legacy FNSAVE path returns 0: that instruction clears all
 * FPU state unconditionally.  Modern FPU state can be kept in the
 * registers, if there are no pending FP exceptions.
 */
int copy_fpregs_to_fpstate(struct fpu *fpu)
{
        int regs_intact = 1;

        if (likely(use_xsave())) {
                copy_xregs_to_kernel(&fpu->state.xsave);

                /*
                 * Remember when AVX512 state was last seen, because its
                 * use is known to slow the max clock speed of the core.
                 */
                if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
                        fpu->avx512_timestamp = jiffies;
        } else if (likely(use_fxsr())) {
                copy_fxregs_to_kernel(fpu);
        } else {
                /*
                 * Legacy save: FNSAVE always clears the FPU registers,
                 * so report them as no longer intact.
                 */
                asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
                regs_intact = 0;
        }

        return regs_intact;
}
EXPORT_SYMBOL(copy_fpregs_to_fpstate);

/*
 * Begin a section of in-kernel FPU usage on the current CPU.
 *
 * @kfpu_mask: KFPU_* bits selecting which control registers get sane
 *             initial values (KFPU_MXCSR and KFPU_387 are tested below).
 *
 * Preemption is disabled here and is still disabled on return;
 * kernel_fpu_end() performs the matching preempt_enable().
 */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
        preempt_disable();

        /* Complain about use from an unusable context and about nesting. */
        WARN_ON_FPU(!irq_fpu_usable());
        WARN_ON_FPU(this_cpu_read(in_kernel_fpu));

        this_cpu_write(in_kernel_fpu, true);

        /*
         * For a non-kthread task whose registers are not already flagged
         * for reload: set TIF_NEED_FPU_LOAD and save the live registers
         * into current->thread.fpu.
         */
        if (!(current->flags & PF_KTHREAD) &&
            !test_thread_flag(TIF_NEED_FPU_LOAD)) {
                set_thread_flag(TIF_NEED_FPU_LOAD);
                /*
                 * Ignore return value -- we don't care if reg state
                 * is clobbered.
                 */
                copy_fpregs_to_fpstate(&current->thread.fpu);
        }
        __cpu_invalidate_fpregs_state();

        /* Put sane initial values into the control registers. */
        if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
                ldmxcsr(MXCSR_DEFAULT);

        if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
                asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);

/*
 * End a section of in-kernel FPU usage: clear this CPU's in_kernel_fpu
 * flag and re-enable preemption.  Warns via WARN_ON_FPU() if the flag
 * was not set on entry.
 */
void kernel_fpu_end(void)
{
        WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

        this_cpu_write(in_kernel_fpu, false);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);

/*
 * Save the current task's FPU registers into @fpu's in-memory state.
 *
 * This only ever gets called for the current task; passing any other
 * fpu triggers WARN_ON_FPU().  The work is done under fpregs_lock()
 * and bracketed by the before/after save tracepoints.
 */
void fpu__save(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        fpregs_lock();
        trace_x86_fpu_before_save(fpu);

        /*
         * Nothing to save when the registers are already flagged for
         * reload.  Otherwise save them; a zero return means the save
         * did not leave the registers intact, so load them back from
         * the state that was just written.
         */
        if (!test_thread_flag(TIF_NEED_FPU_LOAD) &&
            !copy_fpregs_to_fpstate(fpu))
                copy_kernel_to_fpregs(&fpu->state);

        trace_x86_fpu_after_save(fpu);
        fpregs_unlock();
}

/*
 * Fill in the non-zero fields of a legacy x87 (FNSAVE format) state
 * image.  Only cwd, swd, twd and fos are written; every other field is
 * left as the caller provided it.
 */
static inline void fpstate_init_fstate(struct fregs_state *fp)
{
        /* The four stores are independent of one another. */
        fp->fos = 0xffff0000u;
        fp->twd = 0xffffffffu;
        fp->swd = 0xffff0000u;
        fp->cwd = 0xffff037fu;
}

/*
 * Initialize an in-memory FPU state image.
 *
 * Without X86_FEATURE_FPU only the soft-FPU part is initialized.
 * Otherwise the first fpu_kernel_xstate_size bytes are zeroed and the
 * format-specific init helpers are applied: the xstate helper when
 * XSAVES is available, then either the FXSR helper or the legacy x87
 * one.
 */
void fpstate_init(union fpregs_state *state)
{
        if (static_cpu_has(X86_FEATURE_FPU)) {
                memset(state, 0, fpu_kernel_xstate_size);

                if (static_cpu_has(X86_FEATURE_XSAVES))
                        fpstate_init_xstate(&state->xsave);

                /* FXSR and legacy init are mutually exclusive. */
                if (!static_cpu_has(X86_FEATURE_FXSR))
                        fpstate_init_fstate(&state->fsave);
                else
                        fpstate_init_fxstate(&state->fxsave);
        } else {
                fpstate_init_soft(&state->soft);
        }
}
EXPORT_SYMBOL_GPL(fpstate_init);

/*
 * Copy the FPU state of @src into @dst.
 *
 * @dst: task receiving the copy
 * @src: task providing the state; expected to be the current task
 *       (WARN_ON_FPU() fires otherwise)
 *
 * @dst's last_cpu is always reset to -1.  When the CPU has an FPU, @dst
 * additionally ends up flagged TIF_NEED_FPU_LOAD.  Always returns 0.
 */
int fpu__copy(struct task_struct *dst, struct task_struct *src)
{
        struct fpu *dst_fpu = &dst->thread.fpu;
        struct fpu *src_fpu = &src->thread.fpu;

        dst_fpu->last_cpu = -1;

        if (!static_cpu_has(X86_FEATURE_FPU))
                return 0;

        WARN_ON_FPU(src_fpu != &current->thread.fpu);

        /*
         * Don't let 'init optimized' areas of the XSAVE area
         * leak into the child task:
         */
        memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);

        /*
         * If the FPU registers are not current just memcpy() the state.
         * Otherwise save current FPU registers directly into the child's FPU
         * context, without any memory-to-memory copying.
         *
         * ( The function 'fails' in the FNSAVE case, which destroys
         *   register contents so we have to load them back. )
         */
        fpregs_lock();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size);

        else if (!copy_fpregs_to_fpstate(dst_fpu))
                copy_kernel_to_fpregs(&dst_fpu->state);

        fpregs_unlock();

        /* The copied state lives in memory only, so flag it for loading. */
        set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);

        trace_x86_fpu_copy_src(src_fpu);
        trace_x86_fpu_copy_dst(dst_fpu);

        return 0;
}

/*
 * Activate the current task's in-memory FPU context,
 * if it has not been used before:
 *
 * Sets TIF_NEED_FPU_LOAD, initializes @fpu->state with fpstate_init()
 * and emits the init_state tracepoint.  @fpu is expected to belong to
 * the current task (WARN_ON_FPU() fires otherwise).
 */
static void fpu__initialize(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        set_thread_flag(TIF_NEED_FPU_LOAD);
        fpstate_init(&fpu->state);
        trace_x86_fpu_init_state(fpu);
}

/*
 * Call this before reading a task's fpstate.
 *
 * Two kinds of callers exist:
 *
 * - the current task (when coredumping): the latest FPU registers
 *   have to be saved into the fpstate first, which is done here;
 *
 * - a stopped task (ptrace): its registers were already saved by the
 *   context-switch code when the task scheduled out, so there is
 *   nothing to do.
 */
void fpu__prepare_read(struct fpu *fpu)
{
        /* Someone else's fpu: the in-memory state is used as it is. */
        if (fpu != &current->thread.fpu)
                return;

        fpu__save(fpu);
}

/*
 * This function must be called before we write a task's fpstate.
 *
 * Invalidate any cached FPU registers.
 *
 * After this function call, after registers in the fpstate are
 * modified and the child task has woken up, the child task will
 * restore the modified FPU state from the modified context. If we
 * didn't clear its cached status here then the cached in-registers
 * state pending on its former CPU could be restored, corrupting
 * the modifications.
 *
 * @fpu must not be the current task's fpu; WARN_ON_FPU() fires if it is.
 */
void fpu__prepare_write(struct fpu *fpu)
{
        /*
         * Only stopped child tasks can be used to modify the FPU
         * state in the fpstate buffer:
         */
        WARN_ON_FPU(fpu == &current->thread.fpu);

        /* Invalidate any cached state: */
        __fpu_invalidate_fpregs_state(fpu);
}

/*
 * Drops current FPU state: deactivates the fpregs and
 * the fpstate. NOTE: it still leaves previous contents
 * in the fpregs in the eager-FPU case.
 *
 * This function can be used in cases where we know that
 * a state-restore is coming: either an explicit one,
 * or a reschedule.
 *
 * The fwait and fpregs_deactivate() are only performed when @fpu
 * belongs to the current task; the tracepoint fires in either case.
 * The whole body runs with preemption disabled.
 */
void fpu__drop(struct fpu *fpu)
{
        preempt_disable();

        if (fpu == &current->thread.fpu) {
                /* Ignore delayed exceptions from user space */
                /*
                 * NOTE(review): the _ASM_EXTABLE(1b, 2b) entry appears to
                 * resume at label 2 if the fwait at label 1 faults --
                 * confirm against the macro's definition.
                 */
                asm volatile("1: fwait\n"
                             "2:\n"
                             _ASM_EXTABLE(1b, 2b));
                fpregs_deactivate(fpu);
        }

        trace_x86_fpu_dropped(fpu);

        preempt_enable();
}

/*
 * Reset the FPU registers by loading them from init_fpstate.
 *
 * @features_mask is only used on the XSAVE path, where it is passed
 * to copy_kernel_to_xregs().  When OSPKE is available the init PKRU
 * value is loaded as well.
 *
 * The caller must wrap the call in fpregs_lock()/fpregs_unlock().
 */
static inline void copy_init_fpstate_to_fpregs(u64 features_mask)
{
        if (!use_xsave()) {
                /* No XSAVE: pick the FXSR image or the legacy x87 one. */
                if (static_cpu_has(X86_FEATURE_FXSR))
                        copy_kernel_to_fxregs(&init_fpstate.fxsave);
                else
                        copy_kernel_to_fregs(&init_fpstate.fsave);
        } else {
                copy_kernel_to_xregs(&init_fpstate.xsave, features_mask);
        }

        if (boot_cpu_has(X86_FEATURE_OSPKE))
                copy_init_pkru_to_fpregs();
}

/*
 * Reset the current task's FPU state to the init state.
 *
 * @fpu:       must be the current task's fpu (WARN_ON_FPU() otherwise)
 * @user_only: when true only the user xfeatures are reset and the
 *             supervisor ones are preserved; when false everything
 *             in xfeatures_mask_all is reset
 *
 * Called by sys_execve(), by the signal handler code and by various
 * error paths.
 */
static void fpu__clear(struct fpu *fpu, bool user_only)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        /* No hardware FPU: drop and re-initialize the in-memory state. */
        if (!static_cpu_has(X86_FEATURE_FPU)) {
                fpu__drop(fpu);
                fpu__initialize(fpu);
                return;
        }

        fpregs_lock();

        if (!user_only) {
                copy_init_fpstate_to_fpregs(xfeatures_mask_all);
        } else {
                /*
                 * If the registers do not hold this task's state, bring
                 * the supervisor states in from memory before the user
                 * states are reset.
                 */
                if (!fpregs_state_valid(fpu, smp_processor_id()) &&
                    xfeatures_mask_supervisor()) {
                        copy_kernel_to_xregs(&fpu->state.xsave,
                                             xfeatures_mask_supervisor());
                }
                copy_init_fpstate_to_fpregs(xfeatures_mask_user());
        }

        fpregs_mark_activate();
        fpregs_unlock();
}

/* Reset only the user FPU states: fpu__clear() with user_only == true. */
void fpu__clear_user_states(struct fpu *fpu)
{
        fpu__clear(fpu, true);
}

/* Reset all FPU states: fpu__clear() with user_only == false. */
void fpu__clear_all(struct fpu *fpu)
{
        fpu__clear(fpu, false);
}

/*
 * Load the FPU context before returning to userspace.  Does nothing on
 * CPUs without X86_FEATURE_FPU.
 */
void switch_fpu_return(void)
{
        if (static_cpu_has(X86_FEATURE_FPU))
                __fpregs_load_activate();
}
EXPORT_SYMBOL_GPL(switch_fpu_return);

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Debug check of the FPU tracking invariant for the current task: when
 * TIF_NEED_FPU_LOAD is not set, the registers on this CPU must hold
 * the task's state.  If the tracked state is not valid, the flag has
 * to be set so that the context is loaded on return to userland.
 */
void fpregs_assert_state_consistent(void)
{
        struct fpu *fpu = &current->thread.fpu;

        if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
                WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
        }
}
EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
#endif

/*
 * Mark the current task's FPU registers as active on this CPU: call
 * fpregs_activate(), record the CPU in fpu->last_cpu and clear
 * TIF_NEED_FPU_LOAD.
 */
void fpregs_mark_activate(void)
{
        struct fpu *fpu = &current->thread.fpu;

        fpregs_activate(fpu);
        fpu->last_cpu = smp_processor_id();
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}
EXPORT_SYMBOL_GPL(fpregs_mark_activate);

/*
 * x87 math exception handling:
 */

/*
 * Translate the saved FPU state of @fpu into an FPE_* code for the
 * math exception identified by @trap_nr.
 *
 * X86_TRAP_MF is decoded from the x87 control/status words; any other
 * trap number is decoded from MXCSR.  Returns 0 when no unmasked
 * exception bit is found.
 */
int fpu__exception_code(struct fpu *fpu, int trap_nr)
{
        int err;

        if (trap_nr != X86_TRAP_MF) {
                /*
                 * The SIMD FPU exceptions are handled a little
                 * differently, as there is only a single status/control
                 * register.  To find the unmasked exception that was
                 * caught, the exception mask bits at 0x1f80 are shifted
                 * down and used to mask the exception bits at 0x3f.
                 */
                unsigned short mxcsr = MXCSR_DEFAULT;

                if (boot_cpu_has(X86_FEATURE_XMM))
                        mxcsr = fpu->state.fxsave.mxcsr;

                err = ~(mxcsr >> 7) & mxcsr;
        } else {
                unsigned short cwd, swd;

                /*
                 * (~cwd & swd) masks out exceptions that are not set to
                 * unmasked status.  0x3f is the exception bits in these
                 * regs, 0x200 is the C1 reg you need in case of a stack
                 * fault, 0x040 is the stack fault bit.  Only one
                 * exception should be taken at a time, so if this
                 * combination doesn't produce any single exception, the
                 * program is not synchronizing its FPU usage and the
                 * context of the exception cannot be fully reproduced.
                 */
                if (boot_cpu_has(X86_FEATURE_FXSR)) {
                        cwd = fpu->state.fxsave.cwd;
                        swd = fpu->state.fxsave.swd;
                } else {
                        cwd = (unsigned short)fpu->state.fsave.cwd;
                        swd = (unsigned short)fpu->state.fsave.swd;
                }

                err = swd & ~cwd;
        }

        /*
         * Invalid op.
         * swd & 0x240 == 0x040: Stack Underflow
         * swd & 0x240 == 0x240: Stack Overflow
         * User must clear the SF bit (0x40) if set
         */
        if (err & 0x001)
                return FPE_FLTINV;

        /* Divide by Zero */
        if (err & 0x004)
                return FPE_FLTDIV;

        /* Overflow */
        if (err & 0x008)
                return FPE_FLTOVF;

        /* Denormal, Underflow */
        if (err & 0x012)
                return FPE_FLTUND;

        /* Precision */
        if (err & 0x020)
                return FPE_FLTRES;

        /*
         * If we're using IRQ 13, or supposedly even some trap
         * X86_TRAP_MF implementations, it's possible
         * we get a spurious trap, which is not an error.
         */
        return 0;
}




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include <linux/bit_spinlock.h>

#include <linux/rhashtable-types.h>
/*
 * Objects in an rhashtable have an embedded struct rhash_head
 * which is linked into a hash chain from the hash table - or one
 * of two or more hash tables when the rhashtable is being resized.
 * The end of the chain is marked with a special nulls marker which has
 * the least significant bit set but otherwise stores the address of
 * the hash bucket.  This allows us to be sure we've found the end
 * of the right list.
 * The value stored in the hash bucket has BIT(0) used as a lock bit.
 * This bit must be atomically set before any changes are made to
 * the chain.  To avoid dereferencing this pointer without clearing
 * the bit first, we use an opaque 'struct rhash_lock_head *' for the
 * pointer stored in the bucket.  This struct needs to be defined so
 * that rcu_dereference() works on it, but it has no content so a
 * cast is needed for it to be useful.  This ensures it isn't
 * used by mistake without clearing the lock bit first.
 */
struct rhash_lock_head {};

/* Maximum chain length before rehash
 *
 * The maximum (not average) chain length grows with the size of the hash
 * table, at a rate of (log N)/(log log N).
 *
 * The value of 16 is selected so that even if the hash table grew to
 * 2^32 you would not expect the maximum chain length to exceed it
 * unless we are under attack (or extremely unlucky).
 *
 * As this limit is only to detect attacks, we don't need to set it to a
 * lower value as you'd need the chain length to vastly exceed 16 to have
 * any real effect on the system.
 *
 * The constant is spelled as an unsigned literal (16u).
 */
#define RHT_ELASTICITY        16u

/**
 * struct bucket_table - Table of hash buckets
 * @size: Number of hash buckets
 * @nest: Number of bits of first-level nested table.
 * @hash_rnd: Random seed to fold into hash
 * @walkers: List of active walkers
 * @rcu: RCU structure for freeing the table
 * @future_tbl: Table under construction during rehashing
 * @dep_map: Lockdep map (struct lockdep_map) belonging to this table
 * @buckets: size * hash buckets
 */
struct bucket_table {
        unsigned int                size;
        unsigned int                nest;
        u32                        hash_rnd;
        struct list_head        walkers;
        struct rcu_head                rcu;

        struct bucket_table __rcu *future_tbl;

        struct lockdep_map        dep_map;

        struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};

/*
 * NULLS_MARKER() expects a hash value with the low
 * bits most likely to be significant, and it discards
 * the msb.
 * We give it an address, in which the bottom bit is
 * always 0, and the msb might be significant.
 * So we shift the address down one bit to align with
 * expectations and avoid losing a significant bit.
 *
 * We never store the NULLS_MARKER in the hash table
 * itself as we need the lsb for locking.
 * Instead we store a NULL, which is what
 * INIT_RHT_NULLS_HEAD() assigns to its argument.
 */
#define        RHT_NULLS_MARKER(ptr)        \
        ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
#define INIT_RHT_NULLS_HEAD(ptr)        \
        ((ptr) = NULL)

/* Return true if @ptr has its least significant bit set (a nulls marker). */
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
        const unsigned long addr = (unsigned long) ptr;

        return (addr & 1UL) != 0;
}

/*
 * Map an embedded rhash_head back to the start of its containing
 * object by stepping back ht->p.head_offset bytes from @he.
 */
static inline void *rht_obj(const struct rhashtable *ht,
                            const struct rhash_head *he)
{
        char *head_addr = (char *)he;

        return head_addr - ht->p.head_offset;
}

/*
 * Reduce @hash to a bucket index by masking it with (tbl->size - 1).
 * NOTE(review): this is only a uniform mapping if tbl->size is a power
 * of two -- presumably guaranteed by the table allocator; confirm.
 */
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
                                            unsigned int hash)
{
        const unsigned int mask = tbl->size - 1;

        return hash & mask;
}

/**
 * rht_key_get_hash - compute the raw hash value of a key
 * @ht:                hash table
 * @key:        pointer to the key
 * @params:        hash table parameters
 * @hash_rnd:        seed handed to the hash function
 *
 * Returns the hash as produced by the selected hash function; it is
 * not reduced to a bucket index here.
 */
static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
        const void *key, const struct rhashtable_params params,
        unsigned int hash_rnd)
{
        unsigned int hash;

        /* params must be equal to ht->p if it isn't constant. */
        if (!__builtin_constant_p(params.key_len))
                /* Not a compile-time constant: use the table's own hashfn. */
                hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
        else if (params.key_len) {
                /* Constant, non-zero key length. */
                unsigned int key_len = params.key_len;

                if (params.hashfn)
                        hash = params.hashfn(key, key_len, hash_rnd);
                else if (key_len & (sizeof(u32) - 1))
                        /* Length is not a multiple of sizeof(u32). */
                        hash = jhash(key, key_len, hash_rnd);
                else
                        /* Whole number of u32 words: hash word-wise. */
                        hash = jhash2(key, key_len / sizeof(u32), hash_rnd);
        } else {
                /* Constant zero key length: take it from the table. */
                unsigned int key_len = ht->p.key_len;

                if (params.hashfn)
                        hash = params.hashfn(key, key_len, hash_rnd);
                else
                        hash = jhash(key, key_len, hash_rnd);
        }

        return hash;
}

/*
 * Hash @key with @tbl's seed and return the index of the bucket it
 * falls into within @tbl.
 */
static inline unsigned int rht_key_hashfn(
        struct rhashtable *ht, const struct bucket_table *tbl,
        const void *key, const struct rhashtable_params params)
{
        return rht_bucket_index(tbl,
                                rht_key_get_hash(ht, key, params,
                                                 tbl->hash_rnd));
}

/*
 * Compute the bucket index in @tbl for the object that embeds @he.
 *
 * If params.obj_hashfn is set it hashes the whole object (with
 * params.key_len, or the table's key_len when that is zero); otherwise the
 * key found at params.key_offset inside the object is hashed.
 */
static inline unsigned int rht_head_hashfn(
        struct rhashtable *ht, const struct bucket_table *tbl,
        const struct rhash_head *he, const struct rhashtable_params params)
{
        const char *ptr = rht_obj(ht, he);

        if (likely(params.obj_hashfn)) {
                return rht_bucket_index(tbl,
                                        params.obj_hashfn(ptr,
                                                          params.key_len ?:
                                                          ht->p.key_len,
                                                          tbl->hash_rnd));
        }

        return rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
}

/**
 * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
 * @ht:                hash table
 * @tbl:        current table
 *
 * Always false once @tbl has reached a non-zero max_size.
 */
static inline bool rht_grow_above_75(const struct rhashtable *ht,
                                     const struct bucket_table *tbl)
{
        /* At or below 75% load there is nothing to expand. */
        if (atomic_read(&ht->nelems) <= (tbl->size / 4 * 3))
                return false;

        return !ht->p.max_size || tbl->size < ht->p.max_size;
}

/**
 * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
 * @ht:                hash table
 * @tbl:        current table
 *
 * Always false when @tbl is not larger than min_size.
 */
static inline bool rht_shrink_below_30(const struct rhashtable *ht,
                                       const struct bucket_table *tbl)
{
        /* At or above 30% load there is nothing to shrink. */
        if (atomic_read(&ht->nelems) >= (tbl->size * 3 / 10))
                return false;

        return tbl->size > ht->p.min_size;
}

/**
 * rht_grow_above_100 - returns true if nelems > table-size
 * @ht:                hash table
 * @tbl:        current table
 *
 * Always false once @tbl has reached a non-zero max_size.
 */
static inline bool rht_grow_above_100(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
{
        if (atomic_read(&ht->nelems) <= tbl->size)
                return false;

        return !ht->p.max_size || tbl->size < ht->p.max_size;
}

/**
 * rht_grow_above_max - returns true if table has reached its element limit
 * @ht:                hash table
 * @tbl:        current table (not consulted)
 *
 * True as soon as nelems is greater than or equal to ht->max_elems.
 */
static inline bool rht_grow_above_max(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
{
        const bool at_limit = atomic_read(&ht->nelems) >= ht->max_elems;

        return at_limit;
}

/*
 * Lock-held predicates used as the condition argument of the
 * rht_dereference*() macros.  With CONFIG_PROVE_LOCKING they are real,
 * out-of-line checks; without it they are stubs that always report 1, so
 * the conditions are trivially satisfied.
 */
#ifdef CONFIG_PROVE_LOCKING
int lockdep_rht_mutex_is_held(struct rhashtable *ht);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
#else
/* Stub: always claims the table mutex is held. */
static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
        return 1;
}

/* Stub: always claims the bucket lock for @hash is held. */
static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
                                             u32 hash)
{
        return 1;
}
#endif /* CONFIG_PROVE_LOCKING */

/*
 * Out-of-line insertion path.  __rhashtable_insert_fast() hands over to it
 * (after dropping the bucket lock and the RCU read lock) when a future table
 * exists, when the walked chain used up the elasticity budget, or when the
 * table holds more elements than buckets; its return value is passed
 * straight back to the caller.
 */
void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
                             struct rhash_head *obj);

/*
 * Table walker API (implemented out of line).  Per the sparse annotations,
 * rhashtable_walk_start_check() acquires the RCU context that
 * rhashtable_walk_stop() releases.
 */
void rhashtable_walk_enter(struct rhashtable *ht,
                           struct rhashtable_iter *iter);
void rhashtable_walk_exit(struct rhashtable_iter *iter);
int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);

/*
 * Convenience form of rhashtable_walk_start_check() for callers that do not
 * need its return value; the result is explicitly discarded.
 */
static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
{
        (void)rhashtable_walk_start_check(iter);
}

/* Remaining walker entry points; rhashtable_walk_stop() releases RCU. */
void *rhashtable_walk_next(struct rhashtable_iter *iter);
void *rhashtable_walk_peek(struct rhashtable_iter *iter);
void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);

/* Table teardown; @free_fn is invoked with an element pointer and @arg. */
void rhashtable_free_and_destroy(struct rhashtable *ht,
                                 void (*free_fn)(void *ptr, void *arg),
                                 void *arg);
void rhashtable_destroy(struct rhashtable *ht);

/*
 * Bucket resolution for nested tables (tbl->nest set).  These back
 * rht_bucket(), rht_bucket_var() and rht_bucket_insert() respectively.
 * Callers of the latter two check the result for NULL before locking.
 */
struct rhash_lock_head __rcu **rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **__rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **rht_bucket_nested_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);

/* Dereference @p where the table mutex of @ht is expected to be held. */
#define rht_dereference(p, ht) \
        rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))

/*
 * Dereference @p via rcu_dereference_check(), passing "table mutex of @ht
 * held" as the additional condition.
 */
#define rht_dereference_rcu(p, ht) \
        rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht))

/* Dereference @p where the bucket lock for @hash in @tbl is expected held. */
#define rht_dereference_bucket(p, tbl, hash) \
        rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash))

/*
 * Dereference @p via rcu_dereference_check(), passing "bucket lock for
 * @hash in @tbl held" as the additional condition.
 */
#define rht_dereference_bucket_rcu(p, tbl, hash) \
        rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash))

/*
 * Point @tpos at the object embedding @pos through @member.  The statement
 * expression always evaluates to 1 so it can be chained with && inside a
 * loop condition.
 */
#define rht_entry(tpos, pos, member) \
        ({ tpos = container_of(pos, typeof(*tpos), member); 1; })

/*
 * Return the read-only bucket slot for @hash in @tbl, resolving through
 * rht_bucket_nested() when the table is nested.
 */
static inline struct rhash_lock_head __rcu *const *rht_bucket(
        const struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return rht_bucket_nested(tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * Return the writable bucket slot for @hash in @tbl, resolving through
 * __rht_bucket_nested() when the table is nested.  Callers check the result
 * for NULL.
 */
static inline struct rhash_lock_head __rcu **rht_bucket_var(
        struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return __rht_bucket_nested(tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * Return the writable bucket slot for an insertion at @hash in @tbl,
 * resolving through rht_bucket_nested_insert() when the table is nested.
 * Callers check the result for NULL.
 */
static inline struct rhash_lock_head __rcu **rht_bucket_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return rht_bucket_nested_insert(ht, tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * We lock a bucket by setting BIT(0) in the pointer - this is always
 * zero in real pointers.  The NULLS mark is never stored in the bucket,
 * rather we store NULL if the bucket is empty.
 * bit_spin_locks do not handle contention well, but the whole point
 * of the hashtable design is to achieve minimum per-bucket contention.
 * A nested hash table might not have a bucket pointer.  In that case
 * we cannot get a lock.  For remove and replace the bucket cannot be
 * interesting and doesn't need locking.
 * For insert we allocate the bucket if this is the last bucket_table,
 * and then take the lock.
 * Sometimes we unlock a bucket by writing a new pointer there.  In that
 * case we don't need to unlock, but we do need to reset state such as
 * local_bh. For that we have rht_assign_unlock().  As rcu_assign_pointer()
 * provides the same release semantics that bit_spin_unlock() provides,
 * this is safe.
 * When we write to a bucket without unlocking, we use rht_assign_locked().
 */

/*
 * Lock the bucket at @bkt: disable bottom halves, take the bit spinlock on
 * bit 0 of the bucket word, then record the acquisition on @tbl's lockdep
 * map.  Paired with rht_unlock() or rht_assign_unlock().
 */
static inline void rht_lock(struct bucket_table *tbl,
                            struct rhash_lock_head __rcu **bkt)
{
        local_bh_disable();
        bit_spin_lock(0, (unsigned long *)bkt);
        lock_map_acquire(&tbl->dep_map);
}

/*
 * Same locking sequence as rht_lock(), but the lockdep acquisition is
 * recorded as exclusive with the caller-chosen @subclass.
 */
static inline void rht_lock_nested(struct bucket_table *tbl,
                                   struct rhash_lock_head __rcu **bucket,
                                   unsigned int subclass)
{
        local_bh_disable();
        bit_spin_lock(0, (unsigned long *)bucket);
        lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
}

/*
 * Unlock the bucket at @bkt, undoing rht_lock() in reverse order: drop the
 * lockdep record, release the bit spinlock on bit 0, re-enable bottom
 * halves.
 */
static inline void rht_unlock(struct bucket_table *tbl,
                              struct rhash_lock_head __rcu **bkt)
{
        lock_map_release(&tbl->dep_map);
        bit_spin_unlock(0, (unsigned long *)bkt);
        local_bh_enable();
}

/*
 * Turn the raw bucket word @p into a chain head: the lock bit (bit 0) is
 * masked off, and if nothing remains (empty bucket) the nulls marker
 * derived from @bkt is returned instead.
 */
static inline struct rhash_head *__rht_ptr(
        struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
{
        unsigned long addr = (unsigned long)p & ~BIT(0);

        if (!addr)
                addr = (unsigned long)RHT_NULLS_MARKER(bkt);

        return (struct rhash_head *)addr;
}

/*
 * Where 'bkt' is a bucket and might be locked:
 *   rht_ptr_rcu() dereferences that pointer and clears the lock bit.
 *   rht_ptr() dereferences in a context where the bucket is locked.
 *   rht_ptr_exclusive() dereferences in a context where exclusive
 *            access is guaranteed, such as when destroying the table.
 */
/* Read the chain head of @bkt with rcu_dereference(), lock bit stripped. */
static inline struct rhash_head *rht_ptr_rcu(
        struct rhash_lock_head __rcu *const *bkt)
{
        struct rhash_lock_head *raw = rcu_dereference(*bkt);

        return __rht_ptr(raw, bkt);
}

/*
 * Read the chain head of @bkt in a context where the bucket lock for @hash
 * in @tbl is held, lock bit stripped.
 */
static inline struct rhash_head *rht_ptr(
        struct rhash_lock_head __rcu *const *bkt,
        struct bucket_table *tbl,
        unsigned int hash)
{
        struct rhash_lock_head *raw = rht_dereference_bucket(*bkt, tbl, hash);

        return __rht_ptr(raw, bkt);
}

/*
 * Read the chain head of @bkt when the caller has exclusive access; the
 * protection condition is unconditionally true.
 */
static inline struct rhash_head *rht_ptr_exclusive(
        struct rhash_lock_head __rcu *const *bkt)
{
        struct rhash_lock_head *raw = rcu_dereference_protected(*bkt, 1);

        return __rht_ptr(raw, bkt);
}

/*
 * Publish @obj as the new head of the bucket at @bkt while keeping the
 * bucket locked: the stored word always has bit 0 set.  A nulls-marked
 * @obj is stored as NULL (plus the lock bit).
 */
static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
                                     struct rhash_head *obj)
{
        unsigned long word = rht_is_a_nulls(obj) ? 0UL : (unsigned long)obj;

        rcu_assign_pointer(*bkt, (void *)(word | BIT(0)));
}

/*
 * Publish @obj as the new head of the bucket at @bkt and unlock the bucket
 * in one step.  The stored pointer has bit 0 clear, so the store itself
 * drops the bit lock; the remaining calls reset the state that rht_lock()
 * set up.  A nulls-marked @obj is stored as NULL.
 */
static inline void rht_assign_unlock(struct bucket_table *tbl,
                                     struct rhash_lock_head __rcu **bkt,
                                     struct rhash_head *obj)
{
        if (rht_is_a_nulls(obj))
                obj = NULL;
        lock_map_release(&tbl->dep_map);
        /* Writing an untagged pointer releases the lock bit. */
        rcu_assign_pointer(*bkt, (void *)obj);
        /*
         * NOTE(review): presumably mirrors the bookkeeping that
         * bit_spin_unlock() would otherwise perform - confirm.
         */
        preempt_enable();
        __release(bitlock);
        local_bh_enable();
}

/**
 * rht_for_each_from - iterate over hash chain from given head
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * Each step goes through rht_dereference_bucket(), i.e. the bucket lock for
 * @hash in @tbl is expected to be held.  Iteration ends at the first cursor
 * value for which rht_is_a_nulls() is true.
 */
#define rht_for_each_from(pos, head, tbl, hash) \
        for (pos = head;                        \
             !rht_is_a_nulls(pos);                \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))

/**
 * rht_for_each - iterate over hash chain
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * Starts from the head of bucket @hash as read by rht_ptr(), then behaves
 * like rht_for_each_from().
 */
#define rht_for_each(pos, tbl, hash) \
        rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash),  \
                          tbl, hash)

/**
 * rht_for_each_entry_from - iterate over hash chain from given head
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * @tpos is only updated (via rht_entry()) while @pos is not a nulls marker.
 * Each step goes through rht_dereference_bucket(), i.e. the bucket lock for
 * @hash in @tbl is expected to be held.
 */
#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member)        \
        for (pos = head;                                                \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);        \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))

/**
 * rht_for_each_entry - iterate over hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * Starts from the head of bucket @hash as read by rht_ptr(), then behaves
 * like rht_for_each_entry_from().
 */
#define rht_for_each_entry(tpos, pos, tbl, hash, member)                \
        rht_for_each_entry_from(tpos, pos,                                \
                                rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
                                tbl, hash, member)

/**
 * rht_for_each_entry_safe - safely iterate over hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @next:        the &struct rhash_head to use as next in loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive allows for the looped code to
 * remove the loop cursor from the list: the successor is fetched into @next
 * before the loop body runs.  @next is set to NULL once @pos is a nulls
 * marker.
 */
#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)              \
        for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash),                      \
             next = !rht_is_a_nulls(pos) ?                                      \
                       rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);              \
             pos = next,                                                      \
             next = !rht_is_a_nulls(pos) ?                                      \
                       rht_dereference_bucket(pos->next, tbl, hash) : NULL)

/**
 * rht_for_each_rcu_from - iterate over rcu hash chain from given head
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table (not referenced by the expansion)
 * @hash:        the hash value / bucket index (not referenced by the expansion)
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * A compiler barrier() is issued before @head is evaluated.
 * NOTE(review): presumably so that @head is re-read when the loop is
 * restarted by an enclosing retry loop - confirm.
 */
#define rht_for_each_rcu_from(pos, head, tbl, hash)                        \
        for (({barrier(); }),                                                \
             pos = head;                                                \
             !rht_is_a_nulls(pos);                                        \
             pos = rcu_dereference_raw(pos->next))

/**
 * rht_for_each_rcu - iterate over rcu hash chain
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * Starts from the head of bucket @hash as read by rht_ptr_rcu(), after a
 * compiler barrier().
 */
#define rht_for_each_rcu(pos, tbl, hash)                        \
        for (({barrier(); }),                                        \
             pos = rht_ptr_rcu(rht_bucket(tbl, hash));                \
             !rht_is_a_nulls(pos);                                \
             pos = rcu_dereference_raw(pos->next))

/**
 * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * Unlike rht_for_each_rcu_from(), each step uses
 * rht_dereference_bucket_rcu() and therefore references @tbl and @hash.
 */
#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
        for (({barrier(); }),                                                    \
             pos = head;                                                    \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);            \
             pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))

/**
 * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * Starts from the head of bucket @hash as read by rht_ptr_rcu(), then
 * behaves like rht_for_each_entry_rcu_from().
 */
#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)                   \
        rht_for_each_entry_rcu_from(tpos, pos,                                   \
                                    rht_ptr_rcu(rht_bucket(tbl, hash)),           \
                                    tbl, hash, member)

/**
 * rhl_for_each_rcu - iterate over rcu hash table list
 * @pos:        the &struct rhlist_head to use as a loop cursor.
 * @list:        the head of the list
 *
 * This hash chain list-traversal primitive should be used on the
 * list returned by rhltable_lookup.  The list is NULL-terminated.
 */
#define rhl_for_each_rcu(pos, list)                                        \
        for (pos = list; pos; pos = rcu_dereference_raw(pos->next))

/**
 * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhlist_head to use as a loop cursor.
 * @list:        the head of the list
 * @member:        name of the &struct rhlist_head within the hashable struct.
 *
 * This hash chain list-traversal primitive should be used on the
 * list returned by rhltable_lookup.  The list is NULL-terminated; @tpos is
 * only updated while @pos is non-NULL.
 */
#define rhl_for_each_entry_rcu(tpos, pos, list, member)                        \
        for (pos = list; pos && rht_entry(tpos, pos, member);                \
             pos = rcu_dereference_raw(pos->next))

/*
 * Default key comparison: memcmp() the key stored inside @obj (at the
 * table's key_offset, key_len bytes long) against arg->key.  Returns zero
 * when the keys are equal.
 */
static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
                                     const void *obj)
{
        const char *stored_key = (const char *)obj + arg->ht->p.key_offset;

        return memcmp(stored_key, arg->key, arg->ht->p.key_len);
}

/* Internal function, do not use.
 *
 * Walks the bucket for @key in the current table and then in every
 * future_tbl chained behind it.  Returns the matching hash head, or NULL
 * when no table contains the key.
 */
static inline struct rhash_head *__rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        struct rhash_lock_head __rcu *const *bkt;
        struct bucket_table *tbl;
        struct rhash_head *he;
        unsigned int hash;

        tbl = rht_dereference_rcu(ht->tbl, ht);

        for (;;) {
                hash = rht_key_hashfn(ht, tbl, key, params);
                bkt = rht_bucket(tbl, hash);

                do {
                        rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
                                int differs;

                                /* Compare callbacks return zero on a match. */
                                if (params.obj_cmpfn)
                                        differs = params.obj_cmpfn(&arg,
                                                        rht_obj(ht, he));
                                else
                                        differs = rhashtable_compare(&arg,
                                                        rht_obj(ht, he));

                                if (!differs)
                                        return he;
                        }
                        /* An object might have been moved to a different
                         * hash chain while we walked along it: if the chain
                         * did not end on this bucket's own marker, retry.
                         */
                } while (he != RHT_NULLS_MARKER(bkt));

                /* Ensure we see any new tables. */
                smp_rmb();

                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
                if (likely(!tbl))
                        return NULL;
        }
}

/**
 * rhashtable_lookup - search hash table
 * @ht:                hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Computes the hash value for the key and traverses the bucket chain looking
 * for an entry with an identical key. The first matching entry is returned.
 *
 * This must only be called under the RCU read lock.
 *
 * Returns the object of the first matching entry, or NULL if none matches.
 */
static inline void *rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        struct rhash_head *he = __rhashtable_lookup(ht, key, params);

        if (!he)
                return NULL;

        return rht_obj(ht, he);
}

/**
 * rhashtable_lookup_fast - search hash table, without RCU read lock
 * @ht:                hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Wraps rhashtable_lookup() in its own rcu_read_lock()/rcu_read_unlock()
 * pair, so the caller does not need to hold the RCU read lock.
 *
 * Only use this function when you have other mechanisms guaranteeing
 * that the object won't go away after the RCU read lock is released.
 *
 * Returns the object of the first matching entry, or NULL if none matches.
 */
static inline void *rhashtable_lookup_fast(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        void *found;

        rcu_read_lock();
        found = rhashtable_lookup(ht, key, params);
        rcu_read_unlock();

        return found;
}

/**
 * rhltable_lookup - search hash list table
 * @hlt:        hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Computes the hash value for the key and traverses the bucket chain looking
 * for an entry with an identical key.  All matching entries are returned
 * in a list.
 *
 * This must only be called under the RCU read lock.
 *
 * Returns the list of entries that match the given key, or NULL if there
 * is none.
 */
static inline struct rhlist_head *rhltable_lookup(
        struct rhltable *hlt, const void *key,
        const struct rhashtable_params params)
{
        struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params);

        if (!he)
                return NULL;

        return container_of(he, struct rhlist_head, rhead);
}

/* Internal function, please use rhashtable_insert_fast() instead.
 *
 * Returns the existing element if there is a key clash (only when @rhlist
 * is false), NULL if @obj was inserted, or an ERR_PTR() on failure:
 * -ENOMEM when no bucket could be obtained, -E2BIG when the table is at its
 * element limit.  When the insertion is handed to rhashtable_insert_slow(),
 * that function's return value is passed through unchanged.
 *
 * A NULL @key disables the duplicate check: every chain entry is treated as
 * non-matching.
 */
static inline void *__rhashtable_insert_fast(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params, bool rhlist)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct bucket_table *tbl;
        struct rhash_head *head;
        unsigned int hash;
        int elasticity;
        void *data;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);
        hash = rht_head_hashfn(ht, tbl, obj, params);
        elasticity = RHT_ELASTICITY;
        bkt = rht_bucket_insert(ht, tbl, hash);
        /* No bucket available: report -ENOMEM without taking any lock. */
        data = ERR_PTR(-ENOMEM);
        if (!bkt)
                goto out;
        pprev = NULL;
        rht_lock(tbl, bkt);

        /*
         * A newer table exists: leave the fast path.  The slow_path label is
         * also reached from further down, always with the bucket lock and
         * the RCU read lock still held, so both are dropped here.
         */
        if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
slow_path:
                rht_unlock(tbl, bkt);
                rcu_read_unlock();
                return rhashtable_insert_slow(ht, key, obj);
        }

        rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *plist;
                struct rhlist_head *list;

                /* One unit of the elasticity budget per chain entry walked. */
                elasticity--;
                /* Non-zero compare result (or no key at all) means no match. */
                if (!key ||
                    (params.obj_cmpfn ?
                     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
                     rhashtable_compare(&arg, rht_obj(ht, head)))) {
                        pprev = &head->next;
                        continue;
                }

                /* Key clash: hand the existing object back to the caller. */
                data = rht_obj(ht, head);

                if (!rhlist)
                        goto out_unlock;


                /*
                 * rhlist: @obj takes the place of the matching entry in the
                 * bucket chain and the old entry becomes the next element of
                 * @obj's per-key list.  nelems is not changed on this path.
                 */
                list = container_of(obj, struct rhlist_head, rhead);
                plist = container_of(head, struct rhlist_head, rhead);

                RCU_INIT_POINTER(list->next, plist);
                head = rht_dereference_bucket(head->next, tbl, hash);
                RCU_INIT_POINTER(list->rhead.next, head);
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj);
                        rht_unlock(tbl, bkt);
                } else
                        rht_assign_unlock(tbl, bkt, obj);
                data = NULL;
                goto out;
        }

        /* Chain used up the whole elasticity budget: use the slow path. */
        if (elasticity <= 0)
                goto slow_path;

        data = ERR_PTR(-E2BIG);
        if (unlikely(rht_grow_above_max(ht, tbl)))
                goto out_unlock;

        /* More elements than buckets: use the slow path. */
        if (unlikely(rht_grow_above_100(ht, tbl)))
                goto slow_path;

        /* Inserting at head of list makes unlocking free. */
        head = rht_ptr(bkt, tbl, hash);

        RCU_INIT_POINTER(obj->next, head);
        if (rhlist) {
                struct rhlist_head *list;

                /* First entry for this key: its per-key list is empty. */
                list = container_of(obj, struct rhlist_head, rhead);
                RCU_INIT_POINTER(list->next, NULL);
        }

        atomic_inc(&ht->nelems);
        rht_assign_unlock(tbl, bkt, obj);

        /* Above 75% load: schedule the deferred work on ht->run_work. */
        if (rht_grow_above_75(ht, tbl))
                schedule_work(&ht->run_work);

        data = NULL;
out:
        rcu_read_unlock();

        return data;

out_unlock:
        rht_unlock(tbl, bkt);
        goto out;
}

/**
 * rhashtable_insert_fast - insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * No key is passed down, so no duplicate check is performed.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns zero on success, -EEXIST if an existing object was reported, or
 * a negative errno taken from the ERR_PTR() of the internal helper.
 */
static inline int rhashtable_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *clash = __rhashtable_insert_fast(ht, NULL, obj, params, false);

        if (IS_ERR(clash))
                return PTR_ERR(clash);

        if (clash)
                return -EEXIST;

        return 0;
}

/**
 * rhltable_insert_key - insert object into hash list table
 * @hlt:        hash list table
 * @key:        the pointer to the key
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns PTR_ERR() of the internal helper's result.
 */
static inline int rhltable_insert_key(
        struct rhltable *hlt, const void *key, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        void *res = __rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
                                             params, true);

        return PTR_ERR(res);
}

/**
 * rhltable_insert - insert object into hash list table
 * @hlt:        hash list table
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * The key is taken from the object itself, at params.key_offset, and the
 * call is forwarded to rhltable_insert_key().
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 */
static inline int rhltable_insert(
        struct rhltable *hlt, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        const char *obj_start = rht_obj(&hlt->ht, &list->rhead);

        return rhltable_insert_key(hlt, obj_start + params.key_offset, list,
                                   params);
}

/**
 * rhashtable_lookup_insert_fast - lookup and insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * The key is read from the object at the table's key_offset.  This function
 * may only be used for fixed key hash tables (key_len parameter set); it
 * will BUG() if the table was configured with an obj_hashfn.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns zero on success, -EEXIST if an entry with the same key already
 * exists, or another negative errno on failure.
 */
static inline int rhashtable_lookup_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        const char *key = rht_obj(ht, obj);
        void *clash;

        BUG_ON(ht->p.obj_hashfn);

        key += ht->p.key_offset;
        clash = __rhashtable_insert_fast(ht, key, obj, params, false);
        if (IS_ERR(clash))
                return PTR_ERR(clash);

        if (clash)
                return -EEXIST;

        return 0;
}

/**
 * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Just like rhashtable_lookup_insert_fast(), but this function returns the
 * object if it exists, NULL if it did not and the insertion was successful,
 * and an ERR_PTR otherwise.
 */
static inline void *rhashtable_lookup_get_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        const char *key = rht_obj(ht, obj);

        BUG_ON(ht->p.obj_hashfn);

        key += ht->p.key_offset;

        return __rhashtable_insert_fast(ht, key, obj, params, false);
}

/**
 * rhashtable_lookup_insert_key - search and insert object to hash table
 *                                  with explicit key
 * @ht:                hash table
 * @key:        key
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Lookups may occur in parallel with hashtable mutations and resizing.
 *
 * Will BUG() if the table has no obj_hashfn or if @key is NULL.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns zero on success, -EEXIST if an entry with the same key already
 * exists, or another negative errno on failure.
 */
static inline int rhashtable_lookup_insert_key(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *clash;

        BUG_ON(!ht->p.obj_hashfn || !key);

        clash = __rhashtable_insert_fast(ht, key, obj, params, false);
        if (IS_ERR(clash))
                return PTR_ERR(clash);

        if (clash)
                return -EEXIST;

        return 0;
}

/**
 * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
 * @ht:                hash table
 * @key:        key
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Just like rhashtable_lookup_insert_key(), but this function returns the
 * object if it exists, NULL if it does not and the insertion was successful,
 * and an ERR_PTR otherwise.
 */
static inline void *rhashtable_lookup_get_insert_key(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *result;

        BUG_ON(!ht->p.obj_hashfn || !key);

        result = __rhashtable_insert_fast(ht, key, obj, params, false);

        return result;
}

/* Internal function, please use rhashtable_remove_fast() instead.
 *
 * Tries to unlink @obj from the single table @tbl.  Returns 0 if @obj was
 * removed and -ENOENT if it was not found in this table (or the bucket does
 * not exist).
 *
 * Internally err == 1 marks "an entry left the bucket chain": only then is
 * nelems decremented and a shrink considered, after which err is reset to 0.
 */
static inline int __rhashtable_remove_fast_one(
        struct rhashtable *ht, struct bucket_table *tbl,
        struct rhash_head *obj, const struct rhashtable_params params,
        bool rhlist)
{
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
        unsigned int hash;
        int err = -ENOENT;

        hash = rht_head_hashfn(ht, tbl, obj, params);
        bkt = rht_bucket_var(tbl, hash);
        if (!bkt)
                return -ENOENT;
        pprev = NULL;
        rht_lock(tbl, bkt);

        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *list;

                list = container_of(he, struct rhlist_head, rhead);

                if (he != obj) {
                        struct rhlist_head __rcu **lpprev;

                        pprev = &he->next;

                        if (!rhlist)
                                continue;

                        /*
                         * rhlist: @obj may sit further down the per-key list
                         * hanging off this chain entry; search that list.
                         */
                        do {
                                lpprev = &list->next;
                                list = rht_dereference_bucket(list->next,
                                                              tbl, hash);
                        } while (list && obj != &list->rhead);

                        if (!list)
                                continue;

                        /*
                         * Found inside the per-key list: unlink it there.
                         * The bucket chain is untouched, so err is 0 and
                         * nelems stays as it is.
                         */
                        list = rht_dereference_bucket(list->next, tbl, hash);
                        RCU_INIT_POINTER(*lpprev, list);
                        err = 0;
                        break;
                }

                /* @obj is the chain entry itself; obj now names its successor. */
                obj = rht_dereference_bucket(obj->next, tbl, hash);
                err = 1;

                if (rhlist) {
                        /*
                         * If the per-key list has another element, promote
                         * it into the bucket chain in place of the removed
                         * entry; the chain length is then unchanged (err 0).
                         */
                        list = rht_dereference_bucket(list->next, tbl, hash);
                        if (list) {
                                RCU_INIT_POINTER(list->rhead.next, obj);
                                obj = &list->rhead;
                                err = 0;
                        }
                }

                /* Splice the replacement in; both branches unlock the bucket. */
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj);
                        rht_unlock(tbl, bkt);
                } else {
                        rht_assign_unlock(tbl, bkt, obj);
                }
                goto unlocked;
        }

        /* Reached when nothing matched or after the per-key list unlink. */
        rht_unlock(tbl, bkt);
unlocked:
        if (err > 0) {
                atomic_dec(&ht->nelems);
                if (unlikely(ht->p.automatic_shrinking &&
                             rht_shrink_below_30(ht, tbl)))
                        schedule_work(&ht->run_work);
                err = 0;
        }

        return err;
}

/* Internal function, please use rhashtable_remove_fast() instead.
 *
 * Tries the removal on the current table and, while it keeps failing, on
 * each future_tbl chained behind it.  Returns the result of the last
 * attempt.
 */
static inline int __rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params, bool rhlist)
{
        struct bucket_table *tbl;
        int err;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);

        /* Because we have already taken (and released) the bucket
         * lock in old_tbl, if we find that future_tbl is not yet
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
        for (;;) {
                err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
                                                   rhlist);
                if (!err)
                        break;

                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
                if (!tbl)
                        break;
        }

        rcu_read_unlock();

        return err;
}

/**
 * rhashtable_remove_fast - remove object from hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * The hash chain is singly linked, so removing an entry requires walking
 * the bucket chain.  Removal is therefore considerably slower if the hash
 * table is not correctly sized.
 *
 * Will automatically shrink the table if permitted when residency drops
 * below 30%.
 *
 * Returns zero on success, -ENOENT if the entry could not be found.
 */
static inline int rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        /* Plain hash table: the object is not part of an rhlist. */
        const bool is_rhlist = false;

        return __rhashtable_remove_fast(ht, obj, params, is_rhlist);
}

/**
 * rhltable_remove - remove object from hash list table
 * @hlt:        hash list table
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * The hash chain is singly linked, so removing an entry requires walking
 * the bucket chain.  Removal is therefore considerably slower if the hash
 * table is not correctly sized.
 *
 * Will automatically shrink the table if permitted when residency drops
 * below 30%.
 *
 * Returns zero on success, -ENOENT if the entry could not be found.
 */
static inline int rhltable_remove(
        struct rhltable *hlt, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        struct rhashtable *ht = &hlt->ht;
        struct rhash_head *obj = &list->rhead;

        /* List table: ask the common helper to handle the rhlist case. */
        return __rhashtable_remove_fast(ht, obj, params, true);
}

/* Internal function, please use rhashtable_replace_fast() instead.
 *
 * Replaces @obj_old with @obj_new in the bucket of @tbl that @obj_old
 * hashes to.  Returns 0 on success, -EINVAL if the two objects hash
 * differently in @tbl, and -ENOENT if the bucket does not exist or
 * @obj_old is not found on its chain.
 */
static inline int __rhashtable_replace_fast(
        struct rhashtable *ht, struct bucket_table *tbl,
        struct rhash_head *obj_old, struct rhash_head *obj_new,
        const struct rhashtable_params params)
{
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev = NULL;
        struct rhash_head *he;
        unsigned int hash;

        /* Minimally, the old and new objects must have same hash
         * (which should mean identifiers are the same).
         */
        hash = rht_head_hashfn(ht, tbl, obj_old, params);
        if (rht_head_hashfn(ht, tbl, obj_new, params) != hash)
                return -EINVAL;

        bkt = rht_bucket_var(tbl, hash);
        if (!bkt)
                return -ENOENT;

        rht_lock(tbl, bkt);

        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                if (he == obj_old) {
                        /* Splice the new object in where the old one was. */
                        rcu_assign_pointer(obj_new->next, obj_old->next);
                        if (!pprev) {
                                /* Old object was the bucket head. */
                                rht_assign_unlock(tbl, bkt, obj_new);
                        } else {
                                rcu_assign_pointer(*pprev, obj_new);
                                rht_unlock(tbl, bkt);
                        }
                        return 0;
                }

                /* Remember the link that points at the next candidate. */
                pprev = &he->next;
        }

        rht_unlock(tbl, bkt);

        return -ENOENT;
}

/**
 * rhashtable_replace_fast - replace an object in hash table
 * @ht:                hash table
 * @obj_old:        pointer to hash head inside object being replaced
 * @obj_new:        pointer to hash head inside object which is new
 * @params:        hash table parameters
 *
 * Replacing an object doesn't affect the number of elements in the hash table
 * or bucket, so we don't need to worry about shrinking or expanding the
 * table here.
 *
 * Returns zero on success, -ENOENT if the entry could not be found,
 * -EINVAL if hash is not the same for the old and new objects.
 */
static inline int rhashtable_replace_fast(
        struct rhashtable *ht, struct rhash_head *obj_old,
        struct rhash_head *obj_new,
        const struct rhashtable_params params)
{
        struct bucket_table *tbl;
        int err;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);

        /* Because we have already taken (and released) the bucket
         * lock in old_tbl, if we find that future_tbl is not yet
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
        for (;;) {
                err = __rhashtable_replace_fast(ht, tbl, obj_old,
                                                obj_new, params);
                if (!err)
                        break;

                /* Any failure: retry in the next table, if there is one. */
                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
                if (!tbl)
                        break;
        }

        rcu_read_unlock();

        return err;
}

/**
 * rhltable_walk_enter - Initialise an iterator
 * @hlt:        Table to walk over
 * @iter:        Hash table Iterator
 *
 * This function prepares a hash table walk.
 *
 * Note that if you restart a walk after rhashtable_walk_stop you
 * may see the same object twice.  Also, you may miss objects if
 * there are removals in between rhashtable_walk_stop and the next
 * call to rhashtable_walk_start.
 *
 * For a completely stable walk you should construct your own data
 * structure outside the hash table.
 *
 * This function may be called from any process context, including
 * non-preemptable context, but cannot be called from softirq or
 * hardirq context.
 *
 * You must call rhashtable_walk_exit after this function returns.
 */
static inline void rhltable_walk_enter(struct rhltable *hlt,
                                       struct rhashtable_iter *iter)
{
        /* Void function: call the rhashtable helper without "return". */
        rhashtable_walk_enter(&hlt->ht, iter);
}

/**
 * rhltable_free_and_destroy - free elements and destroy hash list table
 * @hlt:        the hash list table to destroy
 * @free_fn:        callback to release resources of element
 * @arg:        pointer passed to free_fn
 *
 * See documentation for rhashtable_free_and_destroy.
 */
static inline void rhltable_free_and_destroy(struct rhltable *hlt,
                                             void (*free_fn)(void *ptr,
                                                             void *arg),
                                             void *arg)
{
        /* Void function: call the rhashtable helper without "return". */
        rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
}

/**
 * rhltable_destroy - destroy hash list table
 * @hlt:        the hash list table to destroy
 *
 * Calls rhltable_free_and_destroy() with no free callback and no
 * callback argument.
 */
static inline void rhltable_destroy(struct rhltable *hlt)
{
        rhltable_free_and_destroy(hlt, NULL, NULL);
}

#endif /* _LINUX_RHASHTABLE_H */




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TASK_WORK_H
#define _LINUX_TASK_WORK_H

#include <linux/list.h>
#include <linux/sched.h>

/* Task-work callback type: takes a struct callback_head pointer, returns nothing. */
typedef void (*task_work_func_t)(struct callback_head *);

/*
 * init_task_work - set the callback of a task-work item
 * @twork: callback_head to initialise
 * @func:  function stored in @twork->func
 *
 * Only the func member is written; no other field of @twork is touched.
 */
static inline void
init_task_work(struct callback_head *twork, task_work_func_t func)
{
        twork->func = func;
}

/*
 * Notification mode, passed as the @mode argument of task_work_add().
 *
 * NOTE(review): the behaviour of each mode is defined by the
 * task_work_add() implementation, which is not visible in this header;
 * the per-value notes below are inferred from the names only - confirm.
 */
enum task_work_notify_mode {
        TWA_NONE,        /* presumably: do not notify the target task */
        TWA_RESUME,        /* presumably: notify on resume to user mode */
        TWA_SIGNAL,        /* presumably: signal-style notification */
};

/*
 * Task-work API, implemented outside this header.
 *
 * task_work_add() takes a target task, a callback_head (see
 * init_task_work()) and a notification mode, and returns an int status.
 */
int task_work_add(struct task_struct *task, struct callback_head *twork,
                        enum task_work_notify_mode mode);

/*
 * Cancellation helpers; both return a struct callback_head pointer.
 * task_work_cancel_match() selects by a caller-supplied @match predicate
 * that receives @data; task_work_cancel() takes a callback function.
 * NOTE(review): presumably NULL is returned when nothing matched - confirm.
 */
struct callback_head *task_work_cancel_match(struct task_struct *task,
        bool (*match)(struct callback_head *, void *data), void *data);
struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
/* Takes no task argument. NOTE(review): presumably acts on current - confirm. */
void task_work_run(void);

/*
 * exit_task_work - task-exit hook that calls task_work_run()
 * @task: not used by this wrapper; task_work_run() takes no argument
 */
static inline void exit_task_work(struct task_struct *task)
{
        task_work_run();
}

#endif        /* _LINUX_TASK_WORK_H */













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM skb

#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SKB_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tracepoint.h>

/*
 * Tracepoint for freeing an sk_buff.
 *
 * Records the skb pointer, the caller-supplied @location pointer and
 * skb->protocol converted with ntohs().  Printed as
 * "skbaddr=... protocol=... location=...".
 */
TRACE_EVENT(kfree_skb,

        TP_PROTO(struct sk_buff *skb, void *location),

        TP_ARGS(skb, location),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        void *,                location        )
                __field(        unsigned short,        protocol        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
                __entry->protocol = ntohs(skb->protocol);
        ),

        TP_printk("skbaddr=%p protocol=%u location=%p",
                __entry->skbaddr, __entry->protocol, __entry->location)
);

/*
 * Tracepoint consume_skb: records only the sk_buff pointer, printed as
 * "skbaddr=...".
 */
TRACE_EVENT(consume_skb,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __field(        void *,        skbaddr        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
        ),

        TP_printk("skbaddr=%p", __entry->skbaddr)
);

/*
 * Tracepoint skb_copy_datagram_iovec: records the sk_buff pointer and the
 * @len value passed by the caller, printed as "skbaddr=... len=...".
 */
TRACE_EVENT(skb_copy_datagram_iovec,

        TP_PROTO(const struct sk_buff *skb, int len),

        TP_ARGS(skb, len),

        TP_STRUCT__entry(
                __field(        const void *,                skbaddr                )
                __field(        int,                        len                )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = len;
        ),

        TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len)
);

#endif /* _TRACE_SKB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>







































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
// SPDX-License-Identifier: GPL-2.0+
/*
 * ext4_jbd2.h
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
 *
 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
 *
 * Ext4-specific journaling extensions.
 */

#ifndef _EXT4_JBD2_H
#define _EXT4_JBD2_H

#include <linux/fs.h>
#include <linux/jbd2.h>
#include "ext4.h"

/* s_journal of the superblock that @inode belongs to. */
#define EXT4_JOURNAL(inode)        (EXT4_SB((inode)->i_sb)->s_journal)

/* Define the number of blocks we need to account to a transaction to
 * modify one block of data.
 *
 * We may have to touch one inode, one bitmap buffer, up to three
 * indirection blocks, the group and superblock summaries, and the data
 * block to complete the transaction.
 *
 * For an extents-enabled fs we may have to allocate and modify up to
 * 5 levels of tree, data block (for each of these we need bitmap + group
 * summaries), root which is stored in the inode, sb.
 */

#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)                                \
        (ext4_has_feature_extents(sb) ? 20U : 8U)

/* Extended attribute operations touch at most two data buffers,
 * two bitmap buffers, and two group summaries, in addition to the inode
 * and the superblock, which are already accounted for. */

#define EXT4_XATTR_TRANS_BLOCKS                6U

/* Define the minimum size for a transaction which modifies data.  This
 * needs to take into account the fact that we may end up modifying two
 * quota files too (one for the group, one for the user quota).  The
 * superblock only gets updated once, of course, so don't bother
 * counting that again for the quota updates. */

#define EXT4_DATA_TRANS_BLOCKS(sb)        (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
                                         EXT4_XATTR_TRANS_BLOCKS - 2 + \
                                         EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/*
 * Define the number of metadata blocks we need to account to modify data.
 *
 * This includes the super block, inode block, quota blocks and xattr blocks.
 */
#define EXT4_META_TRANS_BLOCKS(sb)        (EXT4_XATTR_TRANS_BLOCKS + \
                                        EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/* Define an arbitrary limit for the amount of data we will anticipate
 * writing to any given transaction.  For unbounded transactions such as
 * write(2) and truncate(2) we can write more than this, but we always
 * start off at the maximum transaction size and grow the transaction
 * optimistically as we go. */

#define EXT4_MAX_TRANS_DATA                64U

/* We break up a large truncate or write transaction once the handle's
 * buffer credits get this low; we then need either to extend the
 * transaction or to start a new one.  Reserve enough space here for
 * inode, bitmap, superblock, group and indirection updates for at least
 * one block, plus two quota updates.  Quota allocations are not
 * needed. */

#define EXT4_RESERVE_TRANS_BLOCKS        12U

/*
 * Number of credits needed if we need to insert an entry into a
 * directory.  For each new index block, we need 4 blocks (old index
 * block, new index block, bitmap block, bg summary).  For normal
 * htree directories there are 2 levels; if the largedir feature is
 * enabled it's 3 levels.
 */
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS        12U

/*
 * Per-quota-type credit counts.  Each evaluates to 0 when neither the
 * QUOTA mount option nor the quota feature is set, and all are defined
 * as 0 when CONFIG_QUOTA is disabled.
 */
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
 * allocated so we need to update only data block */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
                ext4_has_feature_quota(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
 * but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
                ext4_has_feature_quota(sb)) ?\
                (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_INIT_REWRITE) : 0)

#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
                ext4_has_feature_quota(sb)) ?\
                (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_DEL_REWRITE) : 0)
#else
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
/* The same counts multiplied by EXT4_MAXQUOTAS (one share per quota type). */
#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))

/*
 * Ext4 handle operation types -- for logging purposes.
 *
 * Values are consecutive from 0; EXT4_HT_MAX is one past the last type.
 * They are passed as the @type argument of the journal start helpers.
 */
#define EXT4_HT_MISC             0
#define EXT4_HT_INODE            1
#define EXT4_HT_WRITE_PAGE       2
#define EXT4_HT_MAP_BLOCKS       3
#define EXT4_HT_DIR              4
#define EXT4_HT_TRUNCATE         5
#define EXT4_HT_QUOTA            6
#define EXT4_HT_RESIZE           7
#define EXT4_HT_MIGRATE          8
#define EXT4_HT_MOVE_EXTENTS     9
#define EXT4_HT_XATTR           10
#define EXT4_HT_EXT_CONVERT     11
#define EXT4_HT_MAX             12

/**
 *   struct ext4_journal_cb_entry - Base structure for callback information.
 *
 *   This struct is a 'seed' structure for use with your own callback
 *   structs. If you are using callbacks you must allocate one of these
 *   or another struct of your own definition which has this struct
 *   as its first element and pass it to ext4_journal_callback_add().
 */
struct ext4_journal_cb_entry {
        /* list information for other callbacks attached to the same handle */
        struct list_head jce_list;

        /*  Function to call with this callback structure */
        void (*jce_func)(struct super_block *sb,
                         struct ext4_journal_cb_entry *jce, int error);

        /* user data goes here */
};

/**
 * ext4_journal_callback_add: add a function to call after transaction commit
 * @handle: active journal transaction handle to register callback on
 * @func: callback function to call after the transaction has committed:
 *        @sb: superblock of current filesystem for transaction
 *        @jce: returned journal callback data
 *        @rc: journal state at commit (0 = transaction committed properly)
 * @jce: journal callback data (internal and function private data struct)
 *
 * The registered function will be called in the context of the journal thread
 * after the transaction for which the handle was created has completed.
 *
 * No locks are held when the callback function is called, so it is safe to
 * call blocking functions from within the callback, but the callback should
 * not block or run for too long, or the filesystem will be blocked waiting for
 * the next transaction to commit. No journaling functions can be used, or
 * there is a risk of deadlock.
 *
 * There is no guaranteed calling order of multiple registered callbacks on
 * the same transaction.
 */
/*
 * Helper for the API documented above: only links @jce onto the
 * transaction's private list.  It takes no lock and does not set
 * jce->jce_func.
 */
static inline void _ext4_journal_callback_add(handle_t *handle,
                        struct ext4_journal_cb_entry *jce)
{
        /* Add the jce to transaction's private list */
        list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
}

/*
 * Register @func as the callback of @jce and queue @jce on the private
 * list of @handle's transaction.  The list insertion is done under
 * sbi->s_md_lock; jce->jce_func is stored before the lock is taken.
 */
static inline void ext4_journal_callback_add(handle_t *handle,
                        void (*func)(struct super_block *sb,
                                     struct ext4_journal_cb_entry *jce,
                                     int rc),
                        struct ext4_journal_cb_entry *jce)
{
        struct ext4_sb_info *sbi;

        sbi = EXT4_SB(handle->h_transaction->t_journal->j_private);
        jce->jce_func = func;

        /* Link the jce onto the transaction's private list */
        spin_lock(&sbi->s_md_lock);
        _ext4_journal_callback_add(handle, jce);
        spin_unlock(&sbi->s_md_lock);
}


/**
 * ext4_journal_callback_try_del: delete a registered callback
 * @handle: active journal transaction handle on which callback was registered
 * @jce: registered journal callback entry to unregister
 *
 * The entry is unlinked and re-initialised under sbi->s_md_lock whether
 * or not it was on a list.
 *
 * Return true if @jce was on a list before the call, false if its list
 * head was already empty.
 */
static inline bool ext4_journal_callback_try_del(handle_t *handle,
                                             struct ext4_journal_cb_entry *jce)
{
        struct ext4_sb_info *sbi =
                        EXT4_SB(handle->h_transaction->t_journal->j_private);
        bool was_queued;

        spin_lock(&sbi->s_md_lock);
        was_queued = !list_empty(&jce->jce_list);
        list_del_init(&jce->jce_list);
        spin_unlock(&sbi->s_md_lock);

        return was_queued;
}

/* Inode-dirtying helpers, implemented outside this header. */
int
ext4_mark_iloc_dirty(handle_t *handle,
                     struct inode *inode,
                     struct ext4_iloc *iloc);

/*
 * On success, we end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.
 */

int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                        struct ext4_iloc *iloc);

/* Passes the caller's __func__ and __LINE__ to __ext4_mark_inode_dirty(). */
#define ext4_mark_inode_dirty(__h, __i)                                        \
                __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__)
int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
                                const char *func, unsigned int line);

int ext4_expand_extra_isize(struct inode *inode,
                            unsigned int new_extra_isize,
                            struct ext4_iloc *iloc);
/*
 * Wrapper functions with which ext4 calls into JBD.
 *
 * Each __ext4_*() function takes the call site as @where/@line; the
 * lower-case macros further down supply __func__ and __LINE__ for them.
 */
int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct buffer_head *bh);

int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
                  int is_metadata, struct inode *inode,
                  struct buffer_head *bh, ext4_fsblk_t blocknr);

int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct buffer_head *bh);

int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                 handle_t *handle, struct inode *inode,
                                 struct buffer_head *bh);

int __ext4_handle_dirty_super(const char *where, unsigned int line,
                              handle_t *handle, struct super_block *sb);

/* Call-site-recording wrappers around the __ext4_*() functions above. */
#define ext4_journal_get_write_access(handle, bh) \
        __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
        __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
                      (bh), (block_nr))
#define ext4_journal_get_create_access(handle, bh) \
        __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
        __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
                                     (bh))
#define ext4_handle_dirty_super(handle, sb) \
        __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))

/* Handle start/stop primitives; the start variant records only a line. */
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
                                  int type, int blocks, int rsv_blocks,
                                  int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);

/* Handle values numerically below this are not treated as journal handles. */
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)

/* Note:  Do not use this for NULL handles.  This is only to determine if
 * a properly allocated handle is using a journal or not.
 *
 * Returns 1 when the handle value is at or above
 * EXT4_NOJOURNAL_MAX_REF_COUNT, 0 otherwise. */
static inline int ext4_handle_valid(handle_t *handle)
{
        return (unsigned long)handle >= EXT4_NOJOURNAL_MAX_REF_COUNT;
}

/* Set h_sync on @handle; does nothing for a non-journal handle. */
static inline void ext4_handle_sync(handle_t *handle)
{
        if (!ext4_handle_valid(handle))
                return;

        handle->h_sync = 1;
}

/*
 * Return is_handle_aborted() for a journal handle, and 0 for a
 * non-journal handle.
 */
static inline int ext4_handle_is_aborted(handle_t *handle)
{
        return ext4_handle_valid(handle) ? is_handle_aborted(handle) : 0;
}

/*
 * Revoke credits for freeing @blocks metadata blocks: @blocks scaled by
 * the superblock's s_cluster_ratio.
 */
static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
                                                    int blocks)
{
        /* Freeing each metadata block can result in freeing one cluster */
        return EXT4_SB(sb)->s_cluster_ratio * blocks;
}

/*
 * Default revoke credits for a transaction: the amount needed to free
 * 8 metadata blocks on @sb.
 */
static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
{
        const int default_blocks = 8;

        return ext4_free_metadata_revoke_credits(sb, default_blocks);
}

/*
 * Handle-start convenience macros.  All record __LINE__ as the call site.
 * Variants without a revoke argument use
 * ext4_trans_default_revoke_credits(); variants without a rsv_blocks
 * argument pass 0 for it.
 */
#define ext4_journal_start_sb(sb, type, nblocks)                        \
        __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0,        \
                                ext4_trans_default_revoke_credits(sb))

#define ext4_journal_start(inode, type, nblocks)                        \
        __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0,        \
                             ext4_trans_default_revoke_credits((inode)->i_sb))

#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
        __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
                             ext4_trans_default_revoke_credits((inode)->i_sb))

#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
        __ext4_journal_start((inode), __LINE__, (type), (blocks), 0,        \
                             (revoke_creds))

/*
 * Inode flavour of __ext4_journal_start_sb(): starts the handle on the
 * superblock that @inode belongs to, forwarding all other arguments.
 */
static inline handle_t *__ext4_journal_start(struct inode *inode,
                                             unsigned int line, int type,
                                             int blocks, int rsv_blocks,
                                             int revoke_creds)
{
        struct super_block *sb = inode->i_sb;

        return __ext4_journal_start_sb(sb, line, type, blocks, rsv_blocks,
                                       revoke_creds);
}

/* Stop @handle, recording the caller's __func__ and __LINE__. */
#define ext4_journal_stop(handle) \
        __ext4_journal_stop(__func__, __LINE__, (handle))

/* Calls __ext4_journal_start_reserved() with the caller's __LINE__. */
#define ext4_journal_start_reserved(handle, type) \
        __ext4_journal_start_reserved((handle), __LINE__, (type))

handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
                                        int type);

/* Return whatever journal_current_handle() reports. */
static inline handle_t *ext4_journal_current_handle(void)
{
        handle_t *handle = journal_current_handle();

        return handle;
}

/*
 * Extend @handle through jbd2_journal_extend().  For a non-journal
 * handle nothing is done and 0 is returned.
 */
static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_extend(handle, nblocks, revoke);
}

/*
 * Restart @handle through jbd2__journal_restart() with GFP_NOFS.  For a
 * non-journal handle nothing is done and 0 is returned.
 */
static inline int ext4_journal_restart(handle_t *handle, int nblocks,
                                       int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
}

int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
                                  int extend_cred, int revoke_cred);


/*
 * Ensure @handle has at least @check_cred credits available. If not,
 * transaction will be extended or restarted to contain at least @extend_cred
 * credits. Before restarting transaction @fn is executed to allow for cleanup
 * before the transaction is restarted.
 *
 * The return value is < 0 in case of error, 0 in case the handle has enough
 * credits or transaction extension succeeded, 1 in case transaction had to be
 * restarted.
 *
 * @fn is an expression evaluated in the caller's scope; a negative result
 * is returned as the error and the restart is skipped.  The macro's local
 * uses a reserved-style name so that an identifier such as "err" inside
 * @fn refers to the caller's variable, not to the macro's.
 *
 * Note that @handle, @extend_cred and @revoke_cred may be evaluated twice.
 */
#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred,        \
                                       revoke_cred, fn) \
({                                                                        \
        __label__ __ensure_end;                                                \
        int __ensure_err = __ext4_journal_ensure_credits((handle),        \
                        (check_cred), (extend_cred), (revoke_cred));        \
                                                                        \
        if (__ensure_err <= 0)                                                \
                goto __ensure_end;                                        \
        __ensure_err = (fn);                                                \
        if (__ensure_err < 0)                                                \
                goto __ensure_end;                                        \
        __ensure_err = ext4_journal_restart((handle), (extend_cred),        \
                                            (revoke_cred));                \
        if (__ensure_err == 0)                                                \
                __ensure_err = 1;                                        \
__ensure_end:                                                                \
        __ensure_err;                                                        \
})

/*
 * Ensure given handle has at least @credits credits available, possibly
 * restarting the transaction if needed.  @credits is used both as the
 * amount to check for and as the amount to extend or restart with;
 * @revoke_creds is forwarded unchanged as the revoke credit request.
 * No cleanup expression is run before a restart.
 *
 * Returns < 0 on error, 0 if the handle had enough credits or was
 * extended, 1 if the transaction had to be restarted.
 */
static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
                                              int revoke_creds)
{
        int ret;

        ret = ext4_journal_ensure_credits_fn(handle, credits, credits,
                                             revoke_creds, 0);
        return ret;
}

/*
 * Return jbd2_journal_blocks_per_page() for @inode, or 0 when the
 * inode's filesystem has no journal.
 */
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
        if (EXT4_JOURNAL(inode) == NULL)
                return 0;

        return jbd2_journal_blocks_per_page(inode);
}

/*
 * Call jbd2_journal_force_commit() on @journal and return its result;
 * a NULL @journal is accepted and yields 0.
 */
static inline int ext4_journal_force_commit(journal_t *journal)
{
        if (!journal)
                return 0;

        return jbd2_journal_force_commit(journal);
}

/*
 * Pass the byte range [@start_byte, @start_byte + @length) of @inode to
 * jbd2_journal_inode_ranged_write() using the inode's jinode.  Returns 0
 * without doing anything for a non-journal handle.
 */
static inline int ext4_jbd2_inode_add_write(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_write(handle,
                        EXT4_I(inode)->jinode, start_byte, length);
}

/*
 * Pass the byte range [@start_byte, @start_byte + @length) of @inode to
 * jbd2_journal_inode_ranged_wait() using the inode's jinode.  Returns 0
 * without doing anything for a non-journal handle.
 */
static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_wait(handle,
                        EXT4_I(inode)->jinode, start_byte, length);
}

/*
 * Record the tid of @handle's transaction in the inode's i_sync_tid and,
 * when @datasync is non-zero, in i_datasync_tid as well.  Nothing is
 * recorded for a non-journal or aborted handle.
 */
static inline void ext4_update_inode_fsync_trans(handle_t *handle,
                                                 struct inode *inode,
                                                 int datasync)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!ext4_handle_valid(handle) || is_handle_aborted(handle))
                return;

        ei->i_sync_tid = handle->h_transaction->t_tid;
        if (datasync)
                ei->i_datasync_tid = handle->h_transaction->t_tid;
}

/* super.c */
int ext4_force_commit(struct super_block *sb);

/*
 * Ext4 inode journal modes
 *
 * Distinct bit values; the ext4_should_*_data() helpers test the result
 * of ext4_inode_journal_mode() against them.
 */
#define EXT4_INODE_JOURNAL_DATA_MODE        0x01 /* journal data mode */
#define EXT4_INODE_ORDERED_DATA_MODE        0x02 /* ordered data mode */
#define EXT4_INODE_WRITEBACK_DATA_MODE        0x04 /* writeback data mode */

int ext4_inode_journal_mode(struct inode *inode);

/* Non-zero when @inode's journal mode has EXT4_INODE_JOURNAL_DATA_MODE set. */
static inline int ext4_should_journal_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_JOURNAL_DATA_MODE;
}

/* Non-zero when @inode's journal mode has EXT4_INODE_ORDERED_DATA_MODE set. */
static inline int ext4_should_order_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_ORDERED_DATA_MODE;
}

/* Non-zero when @inode's journal mode has EXT4_INODE_WRITEBACK_DATA_MODE set. */
static inline int ext4_should_writeback_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_WRITEBACK_DATA_MODE;
}

/*
 * Credit estimate for @blocks data blocks of @inode (per the function name,
 * revoke credits needed when freeing them).
 *
 * Returns 0 when the DATA_FLAGS mount option equals EXT4_MOUNT_JOURNAL_DATA
 * or when ext4_should_journal_data() is false for @inode.
 */
static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
{
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            !ext4_should_journal_data(inode))
                return 0;

        /*
         * The data blocks of one extent are contiguous, so only the partial
         * clusters at the two extent boundaries need extra accounting.
         */
        return blocks + 2 * (EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
}

/*
 * Decide whether the dioread_nolock code paths may be used for @inode; those
 * paths make it safe to avoid taking i_mutex for direct I/O reads.  They only
 * work for extent-based regular files and not with data journaling, because
 * the dioread_nolock code passes information back to the I/O completion
 * handler through b_private, which conflicts with the jbd's use of b_private.
 *
 * Returns 1 when every condition holds, 0 otherwise.
 */
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
        return test_opt(inode->i_sb, DIOREAD_NOLOCK) &&
               S_ISREG(inode->i_mode) &&
               ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
               !ext4_should_journal_data(inode) &&
               /* temporary fix to prevent generic/422 test failures */
               test_opt(inode->i_sb, DELALLOC);
}

#endif        /* _EXT4_JBD2_H */






































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VMSTAT_H
#define _LINUX_VMSTAT_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/mmzone.h>
#include <linux/vm_event_item.h>
#include <linux/atomic.h>
#include <linux/static_key.h>
#include <linux/mmdebug.h>

extern int sysctl_stat_interval;

#ifdef CONFIG_NUMA
#define ENABLE_NUMA_STAT   1
#define DISABLE_NUMA_STAT   0
extern int sysctl_vm_numa_stat;
DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos);
#endif

/*
 * Bundle of unsigned counters describing a reclaim pass.  The meaning of each
 * field is defined by the code that fills the structure in, not by this
 * header.
 */
struct reclaim_stat {
        unsigned int nr_dirty;
        unsigned int nr_unqueued_dirty;
        unsigned int nr_congested;
        unsigned int nr_writeback;
        unsigned int nr_immediate;
        unsigned int nr_pageout;
        unsigned int nr_activate[ANON_AND_FILE];        /* ANON_AND_FILE slots */
        unsigned int nr_ref_keep;
        unsigned int nr_unmap_fail;
        unsigned int nr_lazyfree_fail;
};

/*
 * Writeback threshold items.  The values are used as offsets into the
 * writeback section of vmstat_text[] (see writeback_stat_name()).
 */
enum writeback_stat_item {
        NR_DIRTY_THRESHOLD = 0,
        NR_DIRTY_BG_THRESHOLD,
        NR_VM_WRITEBACK_STAT_ITEMS,        /* number of items; keep last */
};

#ifdef CONFIG_VM_EVENT_COUNTERS
/*
 * Light weight per cpu counter implementation.
 *
 * Counters should only be incremented and no critical kernel component
 * should rely on the counter values.
 *
 * Counters are handled completely inline. On many platforms the code
 * generated will simply be the increment of a global address.
 */

/* One counter slot per vm_event_item (NR_VM_EVENT_ITEMS entries). */
struct vm_event_state {
        unsigned long event[NR_VM_EVENT_ITEMS];
};

/* Per-CPU instance that the count_vm_event*() helpers below update. */
DECLARE_PER_CPU(struct vm_event_state, vm_event_states);

/*
 * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
 * local_irq_disable overhead.
 */
/* Add one to this CPU's counter for @item using raw_cpu_inc(). */
static inline void __count_vm_event(enum vm_event_item item)
{
        raw_cpu_inc(vm_event_states.event[item]);
}

/* Add one to this CPU's counter for @item using this_cpu_inc(). */
static inline void count_vm_event(enum vm_event_item item)
{
        this_cpu_inc(vm_event_states.event[item]);
}

/* Add @delta to this CPU's counter for @item using raw_cpu_add(). */
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
        raw_cpu_add(vm_event_states.event[item], delta);
}

/* Add @delta to this CPU's counter for @item using this_cpu_add(). */
static inline void count_vm_events(enum vm_event_item item, long delta)
{
        this_cpu_add(vm_event_states.event[item], delta);
}

extern void all_vm_events(unsigned long *);

extern void vm_events_fold_cpu(int cpu);

#else

/*
 * Counters disabled (CONFIG_VM_EVENT_COUNTERS not set): every helper is an
 * empty inline stub.  Note that all_vm_events() leaves its output buffer
 * untouched in this configuration.
 */
static inline void count_vm_event(enum vm_event_item item)
{
}
static inline void count_vm_events(enum vm_event_item item, long delta)
{
}
static inline void __count_vm_event(enum vm_event_item item)
{
}
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
}
static inline void all_vm_events(unsigned long *ret)
{
}
static inline void vm_events_fold_cpu(int cpu)
{
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * NUMA balancing events are only counted with CONFIG_NUMA_BALANCING.
 * Otherwise the macros expand to empty statements: the event argument is not
 * evaluated, the count argument is evaluated and discarded.
 */
#ifdef CONFIG_NUMA_BALANCING
#define count_vm_numa_event(x)     count_vm_event(x)
#define count_vm_numa_events(x, y) count_vm_events(x, y)
#else
#define count_vm_numa_event(x) do {} while (0)
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
#endif /* CONFIG_NUMA_BALANCING */

/*
 * TLB flush events are only counted with CONFIG_DEBUG_TLBFLUSH.  Otherwise
 * the macros expand to empty statements: the event argument is not
 * evaluated, the count argument is evaluated and discarded.
 */
#ifdef CONFIG_DEBUG_TLBFLUSH
#define count_vm_tlb_event(x)           count_vm_event(x)
#define count_vm_tlb_events(x, y)  count_vm_events(x, y)
#else
#define count_vm_tlb_event(x)     do {} while (0)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif

/*
 * vmacache events are only counted with CONFIG_DEBUG_VM_VMACACHE; otherwise
 * the macro is an empty statement and its argument is not evaluated.
 */
#ifdef CONFIG_DEBUG_VM_VMACACHE
#define count_vm_vmacache_event(x) count_vm_event(x)
#else
#define count_vm_vmacache_event(x) do {} while (0)
#endif

/*
 * Count @delta events of the per-zone variant of @item for zone index @zid.
 * The event id is derived from the <item>_NORMAL enumerator, offset by the
 * distance of @zid from ZONE_NORMAL.  @zid and @delta are parenthesized so
 * that arbitrary expressions (e.g. a conditional) can be passed safely.
 */
#define __count_zid_vm_events(item, zid, delta) \
        __count_vm_events(item##_NORMAL - ZONE_NORMAL + (zid), (delta))

/*
 * Zone and node-based page accounting with per cpu differentials.
 */
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];

#ifdef CONFIG_NUMA
/* Add @x to both the per-zone and the global NUMA counter for @item. */
static inline void zone_numa_state_add(long x, struct zone *zone,
                                       enum numa_stat_item item)
{
        atomic_long_add(x, zone->vm_numa_stat + item);
        atomic_long_add(x, vm_numa_stat + item);
}

/* Current value of the global NUMA counter for @item. */
static inline unsigned long global_numa_state(enum numa_stat_item item)
{
        return atomic_long_read(vm_numa_stat + item);
}

static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
                                        enum numa_stat_item item)
{
        long x = atomic_long_read(&zone->vm_numa_stat[item]);
        int cpu;

        for_each_online_cpu(cpu)
                x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];

        return x;
}
#endif /* CONFIG_NUMA */

/* Add @x to both the per-zone and the global zone counter for @item. */
static inline void zone_page_state_add(long x, struct zone *zone,
                                       enum zone_stat_item item)
{
        atomic_long_add(x, zone->vm_stat + item);
        atomic_long_add(x, vm_zone_stat + item);
}

/* Add @x to both the per-node and the global node counter for @item. */
static inline void node_page_state_add(long x, struct pglist_data *pgdat,
                                       enum node_stat_item item)
{
        atomic_long_add(x, pgdat->vm_stat + item);
        atomic_long_add(x, vm_node_stat + item);
}

/*
 * Global value of the zone counter @item.  On SMP builds a negative reading
 * is reported as 0.
 */
static inline unsigned long global_zone_page_state(enum zone_stat_item item)
{
        long total = atomic_long_read(vm_zone_stat + item);

#ifdef CONFIG_SMP
        if (total < 0)
                return 0;
#endif
        return total;
}

/*
 * Global value of the node counter @item, read as-is from vm_node_stat[].
 * On SMP builds a negative reading is reported as 0.
 */
static inline
unsigned long global_node_page_state_pages(enum node_stat_item item)
{
        long total = atomic_long_read(vm_node_stat + item);

#ifdef CONFIG_SMP
        if (total < 0)
                return 0;
#endif
        return total;
}

/*
 * Global value of the node counter @item.  Same result as
 * global_node_page_state_pages(), but VM_WARN_ON_ONCE() flags callers that
 * pass an item for which vmstat_item_in_bytes() is true.
 */
static inline unsigned long global_node_page_state(enum node_stat_item item)
{
        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

        return global_node_page_state_pages(item);
}

/*
 * Value of @zone's counter @item.  On SMP builds a negative reading is
 * reported as 0.
 */
static inline unsigned long zone_page_state(struct zone *zone,
                                        enum zone_stat_item item)
{
        long total = atomic_long_read(zone->vm_stat + item);

#ifdef CONFIG_SMP
        if (total < 0)
                return 0;
#endif
        return total;
}

/*
 * More accurate variant of zone_page_state(): on SMP it also adds the deltas
 * currently pending in every online CPU's pageset.  The CPUs are walked
 * without synchronization, so the result is still not exactly accurate.
 * A negative sum is reported as 0 on SMP.
 */
static inline unsigned long zone_page_state_snapshot(struct zone *zone,
                                        enum zone_stat_item item)
{
        long total = atomic_long_read(zone->vm_stat + item);

#ifdef CONFIG_SMP
        int cpu;

        for_each_online_cpu(cpu) {
                total += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
        }

        if (total < 0)
                return 0;
#endif
        return total;
}

#ifdef CONFIG_NUMA
extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
extern unsigned long sum_zone_node_page_state(int node,
                                              enum zone_stat_item item);
extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
                                                enum node_stat_item item);
extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
                                           enum node_stat_item item);
#else
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
#define node_page_state_pages(node, item) global_node_page_state_pages(item)
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);

void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
void __inc_node_page_state(struct page *, enum node_stat_item);
void __dec_node_page_state(struct page *, enum node_stat_item);

void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item);

void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
void inc_node_page_state(struct page *, enum node_stat_item);
void dec_node_page_state(struct page *, enum node_stat_item);

extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);

void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);

struct ctl_table;
int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
                loff_t *ppos);

void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);

int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
                                int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */

/*
 * We do not maintain differentials in a single processor configuration.
 * The functions directly modify the zone and global counters.
 */
/* UP variant: apply @delta directly via zone_page_state_add(). */
static inline void __mod_zone_page_state(struct zone *zone,
                        enum zone_stat_item item, long delta)
{
        zone_page_state_add(delta, zone, item);
}

/*
 * UP variant: apply @delta directly to the node and global counters via
 * node_page_state_add().
 *
 * For items where vmstat_item_in_bytes() is true, @delta is shifted right by
 * PAGE_SHIFT before being added; a delta that is not a multiple of PAGE_SIZE
 * triggers VM_WARN_ON_ONCE().
 *
 * @delta is a long, matching the SMP prototype and __mod_zone_page_state(),
 * so large (byte-based) deltas are not truncated to int.
 */
static inline void __mod_node_page_state(struct pglist_data *pgdat,
                        enum node_stat_item item, long delta)
{
        if (vmstat_item_in_bytes(item)) {
                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
                delta >>= PAGE_SHIFT;
        }

        node_page_state_add(delta, pgdat, item);
}

/* UP variant: bump the per-zone and the global zone counter for @item. */
static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_inc(zone->vm_stat + item);
        atomic_long_inc(vm_zone_stat + item);
}

/* UP variant: bump the per-node and the global node counter for @item. */
static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_inc(pgdat->vm_stat + item);
        atomic_long_inc(vm_node_stat + item);
}

/* UP variant: drop the per-zone and the global zone counter for @item by one. */
static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_dec(zone->vm_stat + item);
        atomic_long_dec(vm_zone_stat + item);
}

/* UP variant: drop the per-node and the global node counter for @item by one. */
static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_dec(pgdat->vm_stat + item);
        atomic_long_dec(vm_node_stat + item);
}

/* UP variant: increment @item for the zone that @page belongs to. */
static inline void __inc_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        struct zone *zone = page_zone(page);

        __inc_zone_state(zone, item);
}

/* UP variant: increment @item for the node that @page belongs to. */
static inline void __inc_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        struct pglist_data *pgdat = page_pgdat(page);

        __inc_node_state(pgdat, item);
}


/* UP variant: decrement @item for the zone that @page belongs to. */
static inline void __dec_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        struct zone *zone = page_zone(page);

        __dec_zone_state(zone, item);
}

/* UP variant: decrement @item for the node that @page belongs to. */
static inline void __dec_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        struct pglist_data *pgdat = page_pgdat(page);

        __dec_node_state(pgdat, item);
}


/*
 * We only use atomic operations to update counters. So there is no need to
 * disable interrupts.
 */
#define inc_zone_page_state __inc_zone_page_state
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state

#define inc_node_page_state __inc_node_page_state
#define dec_node_page_state __dec_node_page_state
#define mod_node_page_state __mod_node_page_state

#define inc_zone_state __inc_zone_state
#define inc_node_state __inc_node_state
#define dec_zone_state __dec_zone_state

/* UP: no per-cpu thresholds to set; expands to a single empty statement. */
#define set_pgdat_percpu_threshold(pgdat, callback) do { } while (0)

/* UP: these SMP-only operations are empty inline stubs. */
static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
static inline void quiet_vmstat(void) { }

static inline void drain_zonestat(struct zone *zone,
                        struct per_cpu_pageset *pset) { }
#endif                /* CONFIG_SMP */

/*
 * Adjust @zone's NR_FREE_PAGES by @nr_pages, and NR_FREE_CMA_PAGES as well
 * when is_migrate_cma() is true for @migratetype.
 */
static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
                                             int migratetype)
{
        __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);

        if (!is_migrate_cma(migratetype))
                return;

        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
}

extern const char * const vmstat_text[];

/* Name of zone counter @item; the zone names start at vmstat_text[0]. */
static inline const char *zone_stat_name(enum zone_stat_item item)
{
        const char * const *zone_names = vmstat_text;

        return zone_names[item];
}

#ifdef CONFIG_NUMA
/* Name of NUMA counter @item; the NUMA names follow the zone names. */
static inline const char *numa_stat_name(enum numa_stat_item item)
{
        const char * const *numa_names = vmstat_text + NR_VM_ZONE_STAT_ITEMS;

        return numa_names[item];
}
#endif /* CONFIG_NUMA */

/* Name of node counter @item; node names follow the zone and NUMA names. */
static inline const char *node_stat_name(enum node_stat_item item)
{
        const char * const *node_names = vmstat_text +
                                         NR_VM_ZONE_STAT_ITEMS +
                                         NR_VM_NUMA_STAT_ITEMS;

        return node_names[item];
}

/*
 * Name of LRU list @lru: the name of node counter NR_LRU_BASE + @lru with
 * its leading "nr_" prefix (3 characters) skipped.
 */
static inline const char *lru_list_name(enum lru_list lru)
{
        const char *stat_name = node_stat_name(NR_LRU_BASE + lru);

        return &stat_name[3];
}

/*
 * Name of writeback item @item; writeback names follow the zone, NUMA and
 * node names in vmstat_text[].
 */
static inline const char *writeback_stat_name(enum writeback_stat_item item)
{
        const char * const *wb_names = vmstat_text +
                                       NR_VM_ZONE_STAT_ITEMS +
                                       NR_VM_NUMA_STAT_ITEMS +
                                       NR_VM_NODE_STAT_ITEMS;

        return wb_names[item];
}

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
/*
 * Name of VM event @item; event names follow the zone, NUMA, node and
 * writeback names in vmstat_text[].
 */
static inline const char *vm_event_name(enum vm_event_item item)
{
        const char * const *event_names = vmstat_text +
                                          NR_VM_ZONE_STAT_ITEMS +
                                          NR_VM_NUMA_STAT_ITEMS +
                                          NR_VM_NODE_STAT_ITEMS +
                                          NR_VM_WRITEBACK_STAT_ITEMS;

        return event_names[item];
}
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */

#endif /* _LINUX_VMSTAT_H */
























































































































































































    1 














    1 
    1 





































    1 














































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/srcu.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/*
 * Clear all of the marks on an inode when it is being evicted from core.
 * Thin wrapper around fsnotify_clear_marks_by_inode().
 */
void __fsnotify_inode_delete(struct inode *inode)
{
        fsnotify_clear_marks_by_inode(inode);
}
EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);

/*
 * Clear all of the marks attached to mount @mnt.
 * Thin wrapper around fsnotify_clear_marks_by_mount().
 */
void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        fsnotify_clear_marks_by_mount(mnt);
}

/**
 * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
 * @sb: superblock being unmounted.
 *
 * Called during unmount with no locks held, so needs to be safe against
 * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
 */
static void fsnotify_unmount_inodes(struct super_block *sb)
{
        /*
         * iput_inode is the inode handled in the previous iteration; its
         * iput() is deferred until s_inode_list_lock has been dropped again.
         */
        struct inode *inode, *iput_inode = NULL;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                /*
                 * We cannot __iget() an inode in state I_FREEING,
                 * I_WILL_FREE, or I_NEW which is fine because by that point
                 * the inode cannot have any associated watches.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /*
                 * If i_count is zero, the inode cannot have any watches and
                 * doing an __iget/iput with SB_ACTIVE clear would actually
                 * evict all inodes with zero i_count from icache which is
                 * unnecessarily violent and may in fact be illegal to do.
                 * However, we should have been called /after/ evict_inodes
                 * removed all zero refcount inodes, in any case.  Test to
                 * be sure.
                 */
                if (!atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /* Take a reference before both locks are dropped */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&sb->s_inode_list_lock);

                /* No spinlock is held here: release the previous inode */
                if (iput_inode)
                        iput(iput_inode);

                /* for each watch, send FS_UNMOUNT and then remove it */
                fsnotify_inode(inode, FS_UNMOUNT);

                fsnotify_inode_delete(inode);

                /* Keep our reference until the next unlocked section */
                iput_inode = inode;

                cond_resched();
                spin_lock(&sb->s_inode_list_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);

        /* Release the reference taken in the last iteration, if any */
        if (iput_inode)
                iput(iput_inode);
        /* Wait for outstanding inode references from connectors */
        wait_var_event(&sb->s_fsnotify_inode_refs,
                       !atomic_long_read(&sb->s_fsnotify_inode_refs));
}

/*
 * Tear down fsnotify state for superblock @sb: first handle the watched
 * inodes (fsnotify_unmount_inodes()), then clear the marks attached to the
 * superblock itself.
 */
void fsnotify_sb_delete(struct super_block *sb)
{
        fsnotify_unmount_inodes(sb);
        fsnotify_clear_marks_by_sb(sb);
}

/*
 * Given an inode, first check if we care what happens to our children.  Inotify
 * and dnotify both tell their parents about events.  If we care about any event
 * on a child we run all of our children and set a dentry flag saying that the
 * parent cares.  Thus when an event happens on a child it can quickly tell if
 * there is a need to find a parent and send the event to the parent.
 */
void __fsnotify_update_child_dentry_flags(struct inode *inode)
{
        struct dentry *alias;
        int watched;

        /* Only directories have children to flag */
        if (!S_ISDIR(inode->i_mode))
                return;

        /* determine if the children should tell inode about their events */
        watched = fsnotify_inode_watches_children(inode);

        /* Lock nesting below: inode->i_lock, alias->d_lock, child->d_lock */
        spin_lock(&inode->i_lock);
        /* run all of the dentries associated with this inode.  Since this is a
         * directory, there damn well better only be one item on this list */
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                struct dentry *child;

                /* run all of the children of the original inode and fix their
                 * d_flags to indicate parental interest (their parent is the
                 * original inode) */
                spin_lock(&alias->d_lock);
                list_for_each_entry(child, &alias->d_subdirs, d_child) {
                        /* Children without an inode are skipped */
                        if (!child->d_inode)
                                continue;

                        spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
                        if (watched)
                                child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
                        else
                                child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                        spin_unlock(&child->d_lock);
                }
                spin_unlock(&alias->d_lock);
        }
        spin_unlock(&inode->i_lock);
}

/*
 * Are inode/sb/mount interested in parent and name info with this event?
 *
 * Returns true when @mask is not a directory event and at least one of the
 * inode, its superblock or (if given) @mnt subscribed to it with parent/name
 * info.
 */
static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
                                        __u32 mask)
{
        __u32 interested;

        /* Parent/name is only sent to inode/sb/mount for events on non-dir */
        if (mask & FS_ISDIR)
                return false;

        /*
         * All events that are possible on child can also be reported with
         * parent/name info to inode/sb/mount.  Otherwise, a watching parent
         * could result in events reported with unexpected name info to sb/mount.
         */
        BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT);

        /* Collect who subscribed for events with parent/name */
        interested = fsnotify_parent_needed_mask(inode->i_fsnotify_mask);
        interested |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask);
        if (mnt)
                interested |= fsnotify_parent_needed_mask(mnt->mnt_fsnotify_mask);

        /* Was this particular event subscribed with parent/name info? */
        return (mask & interested) != 0;
}

/*
 * Notify this dentry's parent about a child's events with child name info
 * if parent is watching or if inode/sb/mount are interested in events with
 * parent and name info.
 *
 * Notify only the child without name info if parent is not watching and
 * inode/sb/mount are not interested in events with parent and name info.
 */
int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                      int data_type)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        struct mount *mnt = path ? real_mount(path->mnt) : NULL;
        struct inode *inode = d_inode(dentry);
        struct dentry *parent;
        bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
        bool parent_needed, parent_interested;
        __u32 p_mask;
        struct inode *p_inode = NULL;
        /* name is only initialized (and released) when file_name is set */
        struct name_snapshot name;
        struct qstr *file_name = NULL;
        int ret = 0;

        /*
         * Do inode/sb/mount care about parent and name info on non-dir?
         * Do they care about any event at all?
         */
        if (!inode->i_fsnotify_marks && !inode->i_sb->s_fsnotify_marks &&
            (!mnt || !mnt->mnt_fsnotify_marks) && !parent_watched)
                return 0;

        parent = NULL;
        parent_needed = fsnotify_event_needs_parent(inode, mnt, mask);
        /* No parent involvement: notify with p_inode and file_name NULL */
        if (!parent_watched && !parent_needed)
                goto notify;

        /* Does parent inode care about events on children? */
        parent = dget_parent(dentry);
        p_inode = parent->d_inode;
        p_mask = fsnotify_inode_watches_children(p_inode);
        /* Dentry flag says "watched" but the parent is not: refresh flags */
        if (unlikely(parent_watched && !p_mask))
                __fsnotify_update_child_dentry_flags(p_inode);

        /*
         * Include parent/name in notification either if some notification
         * groups require parent info or the parent is interested in this event.
         */
        parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS;
        if (parent_needed || parent_interested) {
                /* When notifying parent, child should be passed as data */
                WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));

                /* Notify both parent and child with child name info */
                take_dentry_name_snapshot(&name, dentry);
                file_name = &name.name;
                if (parent_interested)
                        mask |= FS_EVENT_ON_CHILD;
        }

notify:
        ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0);

        if (file_name)
                release_dentry_name_snapshot(&name);
        /* parent is NULL here when the early "goto notify" path was taken */
        dput(parent);

        return ret;
}
EXPORT_SYMBOL_GPL(__fsnotify_parent);

/*
 * Deliver an event to @inode_mark through the group's ->handle_inode_event()
 * callback.
 *
 * Returns 0 without calling the backend when the group has no such callback
 * (warning once), when the mark has FS_EXCL_UNLINK set and the event's path
 * refers to an unlinked dentry, or when the mark is not interested in any
 * event in @mask.  Otherwise returns the callback's result.
 */
static int fsnotify_handle_inode_event(struct fsnotify_group *group,
                                       struct fsnotify_mark *inode_mark,
                                       u32 mask, const void *data, int data_type,
                                       struct inode *dir, const struct qstr *name,
                                       u32 cookie)
{
        const struct path *evt_path = fsnotify_data_path(data, data_type);
        struct inode *evt_inode = fsnotify_data_inode(data, data_type);
        const struct fsnotify_ops *handlers = group->ops;
        u32 wanted;

        if (WARN_ON_ONCE(!handlers->handle_inode_event))
                return 0;

        if ((inode_mark->mask & FS_EXCL_UNLINK) &&
            evt_path && d_unlinked(evt_path->dentry))
                return 0;

        /* Check interest of this mark in case event was sent with two marks */
        wanted = mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS;
        if (!wanted)
                return 0;

        return handlers->handle_inode_event(inode_mark, mask, evt_inode, dir,
                                            name, cookie);
}

/*
 * Dispatch one event to the parent mark and/or the inode mark found in
 * @iter_info.  sb and vfsmount marks are not expected here; if either is
 * present a warning is issued once and 0 is returned.
 *
 * A non-zero result from the parent mark's handler is returned immediately;
 * otherwise the result of the inode mark's handler is returned (0 if there
 * is a parent mark but no inode mark).
 */
static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
                                 const void *data, int data_type,
                                 struct inode *dir, const struct qstr *name,
                                 u32 cookie, struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_mark *child_mark = fsnotify_iter_inode_mark(iter_info);
        struct fsnotify_mark *dir_mark = fsnotify_iter_parent_mark(iter_info);

        if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) ||
            WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
                return 0;

        /*
         * A parent mark indicates that the parent inode is watching children
         * and interested in this event, which is an event possible on child.
         * Only report to it if *this mark* is itself watching children.
         */
        if (dir_mark && (dir_mark->mask & FS_EVENT_ON_CHILD)) {
                int err = fsnotify_handle_inode_event(group, dir_mark, mask,
                                                      data, data_type, dir, name, 0);

                if (err)
                        return err;
        }

        /* Parent mark only: nothing left to report */
        if (dir_mark && !child_mark)
                return 0;

        /*
         * Some events can be sent on both parent dir and child marks
         * (e.g. FS_ATTRIB).  If both parent dir and child are watching,
         * report the event once to parent dir with name (if interested) and
         * once to child without name (if interested).  The child watcher is
         * expecting an event without a file name and without the
         * FS_EVENT_ON_CHILD flag.
         */
        if (mask & FS_EVENT_ON_CHILD) {
                mask &= ~FS_EVENT_ON_CHILD;
                dir = NULL;
                name = NULL;
        }

        return fsnotify_handle_inode_event(group, child_mark, mask, data, data_type,
                                           dir, name, cookie);
}

/*
 * Hand one event to the group owning the marks currently selected by
 * @iter_info->report_mask.
 *
 * For FS_MODIFY events the ignored_mask of each selected mark is cleared
 * first, unless the mark carries FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY.
 * The event masks and ignore masks of all selected marks are then combined;
 * the event is delivered only if at least one requested bit survives.
 *
 * Delivery goes through the group's ->handle_event() callback when it is set
 * and through fsnotify_handle_event() otherwise.
 *
 * Returns 0 when nothing was delivered, else the handler's return value.
 */
static int send_to_group(__u32 mask, const void *data, int data_type,
                         struct inode *dir, const struct qstr *file_name,
                         u32 cookie, struct fsnotify_iter_info *iter_info)
{
        __u32 events = (mask & ALL_FSNOTIFY_EVENTS);
        __u32 wanted = 0;
        __u32 ignored = 0;
        struct fsnotify_group *group = NULL;
        struct fsnotify_mark *mark;
        int type;

        if (WARN_ON(!iter_info->report_mask))
                return 0;

        /* a modification resets ignore masks that were not asked to survive */
        if (mask & FS_MODIFY) {
                fsnotify_foreach_obj_type(type) {
                        if (!fsnotify_iter_should_report_type(iter_info, type))
                                continue;
                        mark = iter_info->marks[type];
                        if (!mark)
                                continue;
                        if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
                                continue;
                        mark->ignored_mask = 0;
                }
        }

        /* accumulate what the selected marks want and what they ignore */
        fsnotify_foreach_obj_type(type) {
                if (!fsnotify_iter_should_report_type(iter_info, type))
                        continue;
                mark = iter_info->marks[type];
                if (!mark)
                        continue;
                group = mark->group;
                wanted |= mark->mask;
                ignored |= mark->ignored_mask;
        }

        pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
                 __func__, group, mask, wanted, ignored,
                 data, data_type, dir, cookie);

        /* no interested, non-ignoring mark: nothing to deliver */
        if ((events & wanted & ~ignored) == 0)
                return 0;

        if (!group->ops->handle_event)
                return fsnotify_handle_event(group, mask, data, data_type, dir,
                                             file_name, cookie, iter_info);

        return group->ops->handle_event(group, mask, data, data_type, dir,
                                        file_name, cookie, iter_info);
}

/*
 * Fetch the first mark on the list hanging off the connector at *@connp.
 * Both the connector pointer and the list head are loaded with
 * srcu_dereference() against fsnotify_mark_srcu.  With no connector attached
 * the result is whatever hlist_entry_safe() yields for a NULL node.
 */
static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
{
        struct fsnotify_mark_connector *conn;
        struct hlist_node *head;

        conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
        if (conn)
                head = srcu_dereference(conn->list.first, &fsnotify_mark_srcu);
        else
                head = NULL;

        return hlist_entry_safe(head, struct fsnotify_mark, obj_list);
}

/*
 * Step from @mark to its successor on the same object list.  The next
 * pointer is loaded with srcu_dereference() against fsnotify_mark_srcu.
 * A NULL @mark is accepted and is passed on to hlist_entry_safe() as a NULL
 * node.
 */
static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
{
        struct hlist_node *succ;

        if (mark)
                succ = srcu_dereference(mark->obj_list.next,
                                        &fsnotify_mark_srcu);
        else
                succ = NULL;

        return hlist_entry_safe(succ, struct fsnotify_mark, obj_list);
}

/*
 * iter_info acts as a priority queue with one head per object type.
 * Among the non-empty heads, find the max priority group as decided by
 * fsnotify_compare_groups(), then rebuild report_mask so that it flags
 * exactly the heads whose mark belongs to that group.
 *
 * Returns the new report_mask.  Returns 0 when every head is empty; in that
 * case report_mask is left untouched.
 */
static unsigned int fsnotify_iter_select_report_types(
                struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_group *winner = NULL;
        struct fsnotify_mark *head;
        int type;

        /* First pass: which group wins the comparison across all heads? */
        fsnotify_foreach_obj_type(type) {
                head = iter_info->marks[type];
                if (!head)
                        continue;
                if (fsnotify_compare_groups(winner, head->group) > 0)
                        winner = head->group;
        }

        if (!winner)
                return 0;

        /* Second pass: flag every head that compares equal to the winner */
        iter_info->report_mask = 0;
        fsnotify_foreach_obj_type(type) {
                head = iter_info->marks[type];
                if (!head)
                        continue;
                if (fsnotify_compare_groups(winner, head->group) == 0)
                        fsnotify_iter_set_report_type(iter_info, type);
        }

        return iter_info->report_mask;
}

/*
 * Advance the multi head queue in iter_info: every head that was selected
 * for reporting in the current step is replaced by its successor on the
 * same list.  Heads that were not selected stay where they are.
 */
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
        int type;

        fsnotify_foreach_obj_type(type) {
                if (!fsnotify_iter_should_report_type(iter_info, type))
                        continue;

                iter_info->marks[type] =
                        fsnotify_next_mark(iter_info->marks[type]);
        }
}

/*
 * fsnotify - This is the main call to fsnotify.
 *
 * The VFS calls into hook specific functions in linux/fsnotify.h.
 * Those functions then in turn call here.  Here will call out to all of the
 * registered fsnotify_group.  Those groups can then use the notification event
 * in whatever means they feel necessary.
 *
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @inode:        optional inode associated with event -
 *                either @dir or @inode must be non-NULL.
 *                if both are non-NULL event may be reported to both.
 * @cookie:        inotify rename cookie
 *
 * Returns 0, except for permission events (@mask has a bit from
 * ALL_FSNOTIFY_PERM_EVENTS): there the first non-zero result from
 * send_to_group() stops the walk and is returned.  Non-zero results for all
 * other events are discarded.
 */
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
             const struct qstr *file_name, struct inode *inode, u32 cookie)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        struct fsnotify_iter_info iter_info = {};
        struct super_block *sb;
        struct mount *mnt = NULL;
        struct inode *parent = NULL;
        int ret = 0;
        __u32 test_mask, marks_mask;

        /* Mount marks are only considered when the event data carries a path */
        if (path)
                mnt = real_mount(path->mnt);

        if (!inode) {
                /* Dirent event - report on TYPE_INODE to dir */
                inode = dir;
        } else if (mask & FS_EVENT_ON_CHILD) {
                /*
                 * Event on child - report on TYPE_PARENT to dir if it is
                 * watching children and on TYPE_INODE to child.
                 */
                parent = dir;
        }
        /*
         * NOTE(review): inode is dereferenced unconditionally here, so the
         * "!inode" / "if (inode)" tests further down can never see NULL.
         */
        sb = inode->i_sb;

        /*
         * Optimization: srcu_read_lock() has a memory barrier which can
         * be expensive.  It protects walking the *_fsnotify_marks lists.
         * However, if we do not walk the lists, we do not have to do
         * SRCU because we have no references to any objects and do not
         * need SRCU to keep them "alive".
         */
        if (!sb->s_fsnotify_marks &&
            (!mnt || !mnt->mnt_fsnotify_marks) &&
            (!inode || !inode->i_fsnotify_marks) &&
            (!parent || !parent->i_fsnotify_marks))
                return 0;

        /* Union of the event masks cached on every object involved */
        marks_mask = sb->s_fsnotify_mask;
        if (mnt)
                marks_mask |= mnt->mnt_fsnotify_mask;
        if (inode)
                marks_mask |= inode->i_fsnotify_mask;
        if (parent)
                marks_mask |= parent->i_fsnotify_mask;


        /*
         * if this is a modify event we may need to clear the ignored masks
         * otherwise return if none of the marks care about this type of event.
         */
        test_mask = (mask & ALL_FSNOTIFY_EVENTS);
        if (!(mask & FS_MODIFY) && !(test_mask & marks_mask))
                return 0;

        iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);

        /* Seed one queue head per object type; absent objects stay NULL */
        iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
                fsnotify_first_mark(&sb->s_fsnotify_marks);
        if (mnt) {
                iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
                        fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
        }
        if (inode) {
                iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
                        fsnotify_first_mark(&inode->i_fsnotify_marks);
        }
        if (parent) {
                iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] =
                        fsnotify_first_mark(&parent->i_fsnotify_marks);
        }

        /*
         * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
         * ignore masks are properly reflected for mount/sb mark notifications.
         * That's why this traversal is so complicated...
         */
        while (fsnotify_iter_select_report_types(&iter_info)) {
                ret = send_to_group(mask, data, data_type, dir, file_name,
                                    cookie, &iter_info);

                /* Only permission events abort the walk on a non-zero result */
                if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
                        goto out;

                fsnotify_iter_next(&iter_info);
        }
        /* Walk completed: any leftover non-zero result is dropped */
        ret = 0;
out:
        srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);

        return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);

/*
 * Boot-time setup, registered as a core_initcall: checks at build time that
 * ALL_FSNOTIFY_BITS has exactly 25 bits set, initializes fsnotify_mark_srcu
 * (panicking on failure) and creates the slab cache for mark connectors.
 * Always returns 0.
 */
static __init int fsnotify_init(void)
{
        BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);

        if (init_srcu_struct(&fsnotify_mark_srcu))
                panic("initializing fsnotify_mark_srcu");

        fsnotify_mark_connector_cachep =
                KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC);

        return 0;
}
core_initcall(fsnotify_init);








































































































































    1 































































    1 









































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Variant of atomic_t specialized for reference counts.
 *
 * The interface matches the atomic_t interface (to aid in porting) but only
 * provides the few functions one should use for reference counting.
 *
 * Saturation semantics
 * ====================
 *
 * refcount_t differs from atomic_t in that the counter saturates at
 * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the
 * counter and causing 'spurious' use-after-free issues. In order to avoid the
 * cost associated with introducing cmpxchg() loops into all of the saturating
 * operations, we temporarily allow the counter to take on an unchecked value
 * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow
 * or overflow has occurred. Although this is racy when multiple threads
 * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly
 * equidistant from 0 and INT_MAX we minimise the scope for error:
 *
 *                                    INT_MAX     REFCOUNT_SATURATED   UINT_MAX
 *   0                          (0x7fff_ffff)    (0xc000_0000)    (0xffff_ffff)
 *   +--------------------------------+----------------+----------------+
 *                                     <---------- bad value! ---------->
 *
 * (in a signed view of the world, the "bad value" range corresponds to
 * a negative counter value).
 *
 * As an example, consider a refcount_inc() operation that causes the counter
 * to overflow:
 *
 *         int old = atomic_fetch_add_relaxed(r);
 *        // old is INT_MAX, refcount now INT_MIN (0x8000_0000)
 *        if (old < 0)
 *                atomic_set(r, REFCOUNT_SATURATED);
 *
 * If another thread also performs a refcount_inc() operation between the two
 * atomic operations, then the count will continue to edge closer to 0. If it
 * reaches a value of 1 before /any/ of the threads reset it to the saturated
 * value, then a concurrent refcount_dec_and_test() may erroneously free the
 * underlying object.
 * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
 * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
 * With the current PID limit, if no batched refcounting operations are used and
 * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
 * operations, this makes it impossible for a saturated refcount to leave the
 * saturation range, even if it is possible for multiple uses of the same
 * refcount to nest in the context of a single task:
 *
 *     (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
 *     0x40000000 / 0x400000 = 0x100 = 256
 *
 * If hundreds of references are added/removed with a single refcounting
 * operation, it may potentially be possible to leave the saturation range; but
 * given the precise timing details involved with the round-robin scheduling of
 * each thread manipulating the refcount and the need to hit the race multiple
 * times in succession, there doesn't appear to be a practical avenue of attack
 * even if using refcount_add() operations with larger increments.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 * and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object we're increasing the
 * reference count on will provide the ordering. For locked data structures,
 * it's the lock acquire; for RCU/lockless data structures, it's the dependent
 * load.
 *
 * Do note that inc_not_zero() provides a control dependency which will order
 * future stores against the inc, this ensures we'll never modify the object
 * if we did not in fact acquire a reference.
 *
 * The decrements will provide release order, such that all the prior loads and
 * stores will be issued before, it also provides a control dependency, which
 * will order us against the subsequent free().
 *
 * The control dependency is against the load of the cmpxchg (ll/sc) that
 * succeeded. This means the stores aren't fully ordered, but this is fine
 * because the 1->0 transition indicates no concurrency.
 *
 * Note that the allocator is responsible for ordering things between free()
 * and alloc().
 *
 * The decrements dec_and_test() and sub_and_test() also provide acquire
 * ordering on success.
 *
 */

#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/spinlock_types.h>

struct mutex;

/**
 * typedef refcount_t - variant of atomic_t specialized for reference counts
 * @refs: atomic_t counter field
 *
 * The counter saturates at REFCOUNT_SATURATED and will not move once
 * there. This avoids wrapping the counter and causing 'spurious'
 * use-after-free bugs.
 *
 * The underlying struct tag is refcount_struct; the helpers in this header
 * take a pointer to the refcount_t typedef.
 */
typedef struct refcount_struct {
        atomic_t refs;
} refcount_t;

/* Static initializer for a refcount_t whose counter starts at @n */
#define REFCOUNT_INIT(n)        { .refs = ATOMIC_INIT(n), }
/* Largest counter value, INT_MAX (0x7fff_ffff in the diagram at the top of this file) */
#define REFCOUNT_MAX                INT_MAX
/* Saturation value, INT_MIN / 2 (shown as 0xc000_0000 in the same diagram) */
#define REFCOUNT_SATURATED        (INT_MIN / 2)

/*
 * Reason codes that the inline helpers in this header pass to
 * refcount_warn_saturate().
 */
enum refcount_saturation_type {
        /* __refcount_add_not_zero(): old value, or old value + i, was negative */
        REFCOUNT_ADD_NOT_ZERO_OVF,
        /* __refcount_add(): old value non-zero and it, or old value + i, was negative */
        REFCOUNT_ADD_OVF,
        /* __refcount_add(): old value was 0 */
        REFCOUNT_ADD_UAF,
        /* __refcount_sub_and_test(): old value, or old value - i, was negative */
        REFCOUNT_SUB_UAF,
        /* __refcount_dec(): old value was <= 1 */
        REFCOUNT_DEC_LEAK,
};

void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t);

/**
 * refcount_set - set a refcount's value
 * @r: the refcount
 * @n: value to which the refcount will be set
 *
 * Stores @n with a plain atomic_set(); no saturation or use-after-free
 * check is performed here.
 */
static inline void refcount_set(refcount_t *r, int n)
{
        atomic_set(&r->refs, n);
}

/**
 * refcount_read - get a refcount's value
 * @r: the refcount
 *
 * The counter is fetched with atomic_read() and returned as unsigned int, so
 * a counter in the "bad value" range described at the top of this file reads
 * back as a value above INT_MAX rather than as a negative number.
 *
 * Return: the refcount's value
 */
static inline unsigned int refcount_read(const refcount_t *r)
{
        return atomic_read(&r->refs);
}

/*
 * Add @i to @r unless the counter is observed to be 0.
 *
 * The update is a relaxed compare-and-exchange retried until it succeeds or
 * the observed value is 0.  The value observed last is stored in *@oldp when
 * @oldp is non-NULL.  If that value, or that value plus @i, is negative,
 * refcount_warn_saturate() is called with REFCOUNT_ADD_NOT_ZERO_OVF.
 *
 * Returns false if the counter was 0, true otherwise.
 */
static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
        int seen = refcount_read(r);

        /*
         * 'seen' is handed over by address so that a failed exchange can
         * refresh it before the next attempt.
         */
        while (seen) {
                if (atomic_try_cmpxchg_relaxed(&r->refs, &seen, seen + i))
                        break;
        }

        if (oldp)
                *oldp = seen;

        if (unlikely(seen < 0 || seen + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

        return seen;
}

/**
 * refcount_add_not_zero - add a value to a refcount unless it is 0
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 *
 * This is __refcount_add_not_zero() with the previous counter value
 * discarded (NULL is passed for its oldp argument).
 *
 * Return: false if the passed refcount is 0, true otherwise
 */
static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
{
        return __refcount_add_not_zero(i, r, NULL);
}

/*
 * Unconditionally add @i to @r with a relaxed fetch-and-add.
 *
 * The value held before the addition is stored in *@oldp when @oldp is
 * non-NULL.  refcount_warn_saturate() is called with REFCOUNT_ADD_UAF if
 * that value was 0, or with REFCOUNT_ADD_OVF if it, or it plus @i, was
 * negative.
 */
static inline void __refcount_add(int i, refcount_t *r, int *oldp)
{
        int prev = atomic_fetch_add_relaxed(i, &r->refs);

        if (oldp)
                *oldp = prev;

        /* adding to a counter that already dropped to zero */
        if (unlikely(!prev)) {
                refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
                return;
        }

        /* counter was, or has just become, negative */
        if (unlikely(prev < 0 || prev + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}

/**
 * refcount_add - add a value to a refcount
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 *
 * This is __refcount_add() with the previous counter value discarded (NULL
 * is passed for its oldp argument).
 */
static inline void refcount_add(int i, refcount_t *r)
{
        __refcount_add(i, r, NULL);
}

/*
 * Increment @r by one unless it is 0; __refcount_add_not_zero() with i == 1.
 * The previous counter value is stored in *@oldp when @oldp is non-NULL.
 * Returns false if the counter was 0, true otherwise.
 */
static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp)
{
        return __refcount_add_not_zero(1, r, oldp);
}

/**
 * refcount_inc_not_zero - increment a refcount unless it is 0
 * @r: the refcount to increment
 *
 * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED
 * and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * This is __refcount_inc_not_zero() with the previous counter value
 * discarded (NULL is passed for its oldp argument).
 *
 * Return: true if the increment was successful, false otherwise
 */
static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
{
        return __refcount_inc_not_zero(r, NULL);
}

/*
 * Increment @r by one; __refcount_add() with i == 1.  The previous counter
 * value is stored in *@oldp when @oldp is non-NULL.
 */
static inline void __refcount_inc(refcount_t *r, int *oldp)
{
        __refcount_add(1, r, oldp);
}

/**
 * refcount_inc - increment a refcount
 * @r: the refcount to increment
 *
 * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller already has a
 * reference on the object.
 *
 * Will WARN if the refcount is 0, as this represents a possible use-after-free
 * condition.
 *
 * This is __refcount_inc() with the previous counter value discarded (NULL
 * is passed for its oldp argument).
 */
static inline void refcount_inc(refcount_t *r)
{
        __refcount_inc(r, NULL);
}

/*
 * Subtract @i from @r with release ordering and report whether the counter
 * reached 0, i.e. whether the value held before the subtraction was exactly
 * @i.  On that path smp_acquire__after_ctrl_dep() is issued before
 * returning true.
 *
 * The previous value is stored in *@oldp when @oldp is non-NULL.  When the
 * previous value, or the previous value minus @i, is negative,
 * refcount_warn_saturate() is called with REFCOUNT_SUB_UAF.
 */
static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
{
        int prev = atomic_fetch_sub_release(i, &r->refs);

        if (oldp)
                *oldp = prev;

        if (prev != i) {
                /* not the final reference: only check for underflow */
                if (unlikely(prev < 0 || prev - i < 0))
                        refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
                return false;
        }

        smp_acquire__after_ctrl_dep();
        return true;
}

/**
 * refcount_sub_and_test - subtract from a refcount and test if it is 0
 * @i: amount to subtract from the refcount
 * @r: the refcount
 *
 * Similar to atomic_dec_and_test(), but it will WARN, return false and
 * ultimately leak on underflow and will fail to decrement when saturated
 * at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_dec(), or one of its variants, should instead be used to
 * decrement a reference count.
 *
 * This is __refcount_sub_and_test() with the previous counter value
 * discarded (NULL is passed for its oldp argument).
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r)
{
        return __refcount_sub_and_test(i, r, NULL);
}

/*
 * Decrement @r by one and test for 0; __refcount_sub_and_test() with i == 1.
 * The previous counter value is stored in *@oldp when @oldp is non-NULL.
 */
static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp)
{
        return __refcount_sub_and_test(1, r, oldp);
}

/**
 * refcount_dec_and_test - decrement a refcount and test if it is 0
 * @r: the refcount
 *
 * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
 * decrement when saturated at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * This is __refcount_dec_and_test() with the previous counter value
 * discarded (NULL is passed for its oldp argument).
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
        return __refcount_dec_and_test(r, NULL);
}

/*
 * Decrement @r by one with release ordering, without reporting whether it
 * reached 0.
 *
 * The value held before the decrement is stored in *@oldp when @oldp is
 * non-NULL.  If that value was 1 or less, which includes the decrement that
 * takes the counter to 0, refcount_warn_saturate() is called with
 * REFCOUNT_DEC_LEAK.
 */
static inline void __refcount_dec(refcount_t *r, int *oldp)
{
        int prev = atomic_fetch_sub_release(1, &r->refs);

        if (oldp)
                *oldp = prev;

        if (unlikely(prev < 2))
                refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
}

/**
 * refcount_dec - decrement a refcount
 * @r: the refcount
 *
 * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
 * when saturated at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before.
 *
 * This is __refcount_dec() with the previous counter value discarded (NULL
 * is passed for its oldp argument).  Note that __refcount_dec() also WARNs
 * when the previous value was 1, i.e. when this call drops the counter to 0.
 */
static inline void refcount_dec(refcount_t *r)
{
        __refcount_dec(r, NULL);
}

extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
                                                       spinlock_t *lock,
                                                       unsigned long *flags);
#endif /* _LINUX_REFCOUNT_H */



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_SCHED_GENERIC_H
#define __NET_SCHED_GENERIC_H

#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/pkt_sched.h>
#include <linux/pkt_cls.h>
#include <linux/percpu.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <net/gen_stats.h>
#include <net/rtnetlink.h>
#include <net/flow_offload.h>

struct Qdisc_ops;
struct qdisc_walker;
struct tcf_walker;
struct module;
struct bpf_flow_keys;

/*
 * A rate specification together with a 256-entry lookup table.
 * NOTE(review): field meanings below are inferred from names and types only;
 * confirm against the code that fills and reads this structure.
 */
struct qdisc_rate_table {
        struct tc_ratespec rate;        /* rate specification this table belongs to */
        u32                data[256];        /* table payload, 256 u32 entries */
        struct qdisc_rate_table *next;        /* link to another table - presumably a singly linked list */
        int                refcnt;                /* presumably a user count - TODO confirm */
};

/*
 * Qdisc state identifiers, numbered consecutively from 0.
 * NOTE(review): presumably used as bit numbers in a qdisc state word -
 * confirm against the users of these constants.
 */
enum qdisc_state_t {
        __QDISC_STATE_SCHED,
        __QDISC_STATE_DEACTIVATED,
        __QDISC_STATE_MISSED,
};

/* Size table referenced from struct Qdisc through its __rcu ->stab pointer.
 * Carries an rcu_head, list linkage, the tc_sizespec options, a plain int
 * reference count and a trailing flexible array of u16 entries.
 */
struct qdisc_size_table {
        struct rcu_head                rcu;
        struct list_head        list;
        struct tc_sizespec        szopts;
        int                        refcnt;
        u16                        data[];
};

/* Similar to sk_buff_head, but the skb->prev pointer is undefined: packets
 * are chained through skb->next only, between @head and @tail, and @qlen
 * counts them.
 */
struct qdisc_skb_head {
        struct sk_buff        *head;
        struct sk_buff        *tail;
        __u32                qlen;
        spinlock_t        lock;
};

/* A queueing discipline instance.
 *
 * The first part holds the enqueue/dequeue hooks, the TCQ_F_* flag word and
 * mostly-read configuration; the cacheline-aligned tail holds the fields
 * that are modified on the packet path. Per-qdisc private data follows in
 * the trailing @privdata flexible array.
 */
struct Qdisc {
        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *sch);
        unsigned int                flags;
#define TCQ_F_BUILTIN                1
#define TCQ_F_INGRESS                2
#define TCQ_F_CAN_BYPASS        4
#define TCQ_F_MQROOT                8
#define TCQ_F_ONETXQUEUE        0x10 /* dequeue_skb() can assume all skbs are for
                                      * q->dev_queue : It can test
                                      * netif_xmit_frozen_or_stopped() before
                                      * dequeueing next packet.
                                      * It's true for MQ/MQPRIO slaves, or non
                                      * multiqueue device.
                                      */
#define TCQ_F_WARN_NONWC        (1 << 16)
#define TCQ_F_CPUSTATS                0x20 /* run using percpu statistics */
#define TCQ_F_NOPARENT                0x40 /* root of its hierarchy :
                                      * qdisc_tree_decrease_qlen() should stop.
                                      */
#define TCQ_F_INVISIBLE                0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK                0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED                0x200 /* qdisc is offloaded to HW */
        u32                        limit;
        const struct Qdisc_ops        *ops;
        struct qdisc_size_table        __rcu *stab;
        struct hlist_node       hash;
        u32                        handle;
        u32                        parent;

        struct netdev_queue        *dev_queue;

        struct net_rate_estimator __rcu *rate_est;
        struct gnet_stats_basic_cpu __percpu *cpu_bstats;
        struct gnet_stats_queue        __percpu *cpu_qstats;
        int                        pad;
        refcount_t                refcnt;

        /*
         * For performance sake on SMP, we put highly modified fields at the end
         */
        struct sk_buff_head        gso_skb ____cacheline_aligned_in_smp;
        struct qdisc_skb_head        q;
        struct gnet_stats_basic_packed bstats;
        seqcount_t                running;
        struct gnet_stats_queue        qstats;
        unsigned long                state;
        struct Qdisc            *next_sched;
        struct sk_buff_head        skb_bad_txq;

        spinlock_t                busylock ____cacheline_aligned_in_smp;
        spinlock_t                seqlock;

        /* for NOLOCK qdisc, true if there are no enqueued skbs */
        bool                        empty;
        struct rcu_head                rcu;

        /* private data */
        long privdata[] ____cacheline_aligned;
};

/* Take a reference on @qdisc. Qdiscs flagged TCQ_F_BUILTIN are not
 * reference counted and are left untouched.
 */
static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_BUILTIN))
                refcount_inc(&qdisc->refcnt);
}

/* Intended to be used by unlocked users, when concurrent qdisc release is
 * possible.
 *
 * Returns @qdisc when it is TCQ_F_BUILTIN (not counted) or when the
 * reference count was non-zero and has been incremented; NULL otherwise.
 */
static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc)
{
        bool builtin = qdisc->flags & TCQ_F_BUILTIN;

        /* Short-circuit keeps the refcount untouched for builtin qdiscs. */
        if (builtin || refcount_inc_not_zero(&qdisc->refcnt))
                return qdisc;
        return NULL;
}

/* Report whether @qdisc is currently being run.
 *
 * TCQ_F_NOLOCK qdiscs are considered running while their seqlock is held;
 * all others while the low bit of the ->running sequence count is set.
 */
static inline bool qdisc_is_running(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_NOLOCK))
                return raw_read_seqcount(&qdisc->running) & 1;

        return spin_is_locked(&qdisc->seqlock);
}

/* True when @q has TCQ_F_CPUSTATS set, i.e. runs using percpu statistics. */
static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
{
        return (q->flags & TCQ_F_CPUSTATS) != 0;
}

/* Lockless emptiness check: percpu-stats qdiscs report their ->empty flag,
 * the others report whether ->q.qlen is zero. Both reads use READ_ONCE().
 */
static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
{
        return qdisc_is_percpu_stats(qdisc) ? READ_ONCE(qdisc->empty)
                                            : !READ_ONCE(qdisc->q.qlen);
}

/* Try to become the runner of @qdisc.
 *
 * Returns true when the caller obtained the running state (and is then
 * expected to release it with qdisc_run_end()), false when it did not.
 *
 * TCQ_F_NOLOCK qdiscs are serialized with spin_trylock() on ->seqlock; a
 * caller that loses the race records that in __QDISC_STATE_MISSED and
 * retries the trylock once. Other qdiscs bail out when qdisc_is_running()
 * reports an active runner. On success the ->running seqcount write
 * section is opened.
 */
static inline bool qdisc_run_begin(struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_NOLOCK) {
                if (spin_trylock(&qdisc->seqlock))
                        goto nolock_empty;

                /* No need to insist if the MISSED flag was already set.
                 * Note that test_and_set_bit() also gives us memory ordering
                 * guarantees wrt potential earlier enqueue() and below
                 * spin_trylock(), both of which are necessary to prevent races
                 */
                if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state))
                        return false;

                /* Try to take the lock again to make sure that we will either
                 * grab it or the CPU that still has it will see MISSED set
                 * when testing it in qdisc_run_end()
                 */
                if (!spin_trylock(&qdisc->seqlock))
                        return false;

nolock_empty:
                WRITE_ONCE(qdisc->empty, false);
        } else if (qdisc_is_running(qdisc)) {
                return false;
        }
        /* Variant of write_seqcount_begin() telling lockdep a trylock
         * was attempted.
         */
        raw_write_seqcount_begin(&qdisc->running);
        seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
        return true;
}

/* Release the running state obtained by a successful qdisc_run_begin().
 *
 * Closes the ->running seqcount write section. For TCQ_F_NOLOCK qdiscs the
 * seqlock is dropped as well and, if __QDISC_STATE_MISSED is found set
 * afterwards, the bit is cleared and the qdisc is handed to
 * __netif_schedule().
 */
static inline void qdisc_run_end(struct Qdisc *qdisc)
{
        write_seqcount_end(&qdisc->running);
        if (qdisc->flags & TCQ_F_NOLOCK) {
                spin_unlock(&qdisc->seqlock);

                /* spin_unlock() only has store-release semantic. The unlock
                 * and test_bit() ordering is a store-load ordering, so a full
                 * memory barrier is needed here.
                 */
                smp_mb();

                if (unlikely(test_bit(__QDISC_STATE_MISSED,
                                      &qdisc->state))) {
                        clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
                        __netif_schedule(qdisc);
                }
        }
}

/* True when @qdisc has TCQ_F_ONETXQUEUE set. */
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
{
        return (qdisc->flags & TCQ_F_ONETXQUEUE) != 0;
}

/* Return dql_avail() for @txq when CONFIG_BQL is enabled, 0 otherwise. */
static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq)
{
#ifndef CONFIG_BQL
        return 0;
#else
        /* Non-BQL migrated drivers will return 0, too. */
        return dql_avail(&txq->dql);
#endif
}

/* Operations a classful qdisc provides for its classes, grouped below into
 * child qdisc manipulation, class manipulation, filter manipulation and
 * rtnetlink dumping. @flags takes enum qdisc_class_ops_flags values.
 */
struct Qdisc_class_ops {
        unsigned int                flags;
        /* Child qdisc manipulation */
        struct netdev_queue *        (*select_queue)(struct Qdisc *, struct tcmsg *);
        int                        (*graft)(struct Qdisc *, unsigned long cl,
                                        struct Qdisc *, struct Qdisc **,
                                        struct netlink_ext_ack *extack);
        struct Qdisc *                (*leaf)(struct Qdisc *, unsigned long cl);
        void                        (*qlen_notify)(struct Qdisc *, unsigned long);

        /* Class manipulation routines */
        unsigned long                (*find)(struct Qdisc *, u32 classid);
        int                        (*change)(struct Qdisc *, u32, u32,
                                        struct nlattr **, unsigned long *,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct Qdisc *, unsigned long);
        void                        (*walk)(struct Qdisc *, struct qdisc_walker * arg);

        /* Filter manipulation */
        struct tcf_block *        (*tcf_block)(struct Qdisc *sch,
                                             unsigned long arg,
                                             struct netlink_ext_ack *extack);
        unsigned long                (*bind_tcf)(struct Qdisc *, unsigned long,
                                        u32 classid);
        void                        (*unbind_tcf)(struct Qdisc *, unsigned long);

        /* rtnetlink specific */
        int                        (*dump)(struct Qdisc *, unsigned long,
                                        struct sk_buff *skb, struct tcmsg*);
        int                        (*dump_stats)(struct Qdisc *, unsigned long,
                                        struct gnet_dump *);
};

/* Flag values for Qdisc_class_ops::flags. */
enum qdisc_class_ops_flags {
        /* Implements API that doesn't require rtnl lock */
        QDISC_CLASS_OPS_DOIT_UNLOCKED = 1 << 0,
};

/* Operations table describing one qdisc type: its name (@id), the size of
 * its private area (@priv_size), optional class operations (@cl_ops) and
 * the callbacks for the packet path, lifecycle, dumping and block
 * get/set. Tables are chained through @next.
 */
struct Qdisc_ops {
        struct Qdisc_ops        *next;
        const struct Qdisc_class_ops        *cl_ops;
        char                        id[IFNAMSIZ];
        int                        priv_size;
        unsigned int                static_flags;

        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *);
        struct sk_buff *        (*peek)(struct Qdisc *);

        int                        (*init)(struct Qdisc *sch, struct nlattr *arg,
                                        struct netlink_ext_ack *extack);
        void                        (*reset)(struct Qdisc *);
        void                        (*destroy)(struct Qdisc *);
        int                        (*change)(struct Qdisc *sch,
                                          struct nlattr *arg,
                                          struct netlink_ext_ack *extack);
        void                        (*attach)(struct Qdisc *sch);
        int                        (*change_tx_queue_len)(struct Qdisc *, unsigned int);
        void                        (*change_real_num_tx)(struct Qdisc *sch,
                                                      unsigned int new_real_tx);

        int                        (*dump)(struct Qdisc *, struct sk_buff *);
        int                        (*dump_stats)(struct Qdisc *, struct gnet_dump *);

        void                        (*ingress_block_set)(struct Qdisc *sch,
                                                     u32 block_index);
        void                        (*egress_block_set)(struct Qdisc *sch,
                                                    u32 block_index);
        u32                        (*ingress_block_get)(struct Qdisc *sch);
        u32                        (*egress_block_get)(struct Qdisc *sch);

        struct module                *owner;
};


/* Classification result. The members share storage through an anonymous
 * union, so only one of the three views is meaningful at a time:
 * class/classid, the goto_tp target, or the ingress/qstats pair.
 */
struct tcf_result {
        union {
                struct {
                        unsigned long        class;
                        u32                classid;
                };
                const struct tcf_proto *goto_tp;

                /* used in the skb_tc_reinsert function */
                struct {
                        bool                ingress;
                        struct gnet_stats_queue *qstats;
                };
        };
};

struct tcf_chain;

/* Operations table describing one classifier type, identified by @kind and
 * linked through @head. @flags takes enum tcf_proto_ops_flags values.
 */
struct tcf_proto_ops {
        struct list_head        head;
        char                        kind[IFNAMSIZ];

        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        int                        (*init)(struct tcf_proto*);
        void                        (*destroy)(struct tcf_proto *tp, bool rtnl_held,
                                           struct netlink_ext_ack *extack);

        void*                        (*get)(struct tcf_proto*, u32 handle);
        void                        (*put)(struct tcf_proto *tp, void *f);
        int                        (*change)(struct net *net, struct sk_buff *,
                                        struct tcf_proto*, unsigned long,
                                        u32 handle, struct nlattr **,
                                        void **, bool, bool,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct tcf_proto *tp, void *arg,
                                          bool *last, bool rtnl_held,
                                          struct netlink_ext_ack *);
        bool                        (*delete_empty)(struct tcf_proto *tp);
        void                        (*walk)(struct tcf_proto *tp,
                                        struct tcf_walker *arg, bool rtnl_held);
        int                        (*reoffload)(struct tcf_proto *tp, bool add,
                                             flow_setup_cb_t *cb, void *cb_priv,
                                             struct netlink_ext_ack *extack);
        void                        (*hw_add)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*hw_del)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*bind_class)(void *, u32, unsigned long,
                                              void *, unsigned long);
        void *                        (*tmplt_create)(struct net *net,
                                                struct tcf_chain *chain,
                                                struct nlattr **tca,
                                                struct netlink_ext_ack *extack);
        void                        (*tmplt_destroy)(void *tmplt_priv);

        /* rtnetlink specific */
        int                        (*dump)(struct net*, struct tcf_proto*, void *,
                                        struct sk_buff *skb, struct tcmsg*,
                                        bool);
        int                        (*terse_dump)(struct net *net,
                                              struct tcf_proto *tp, void *fh,
                                              struct sk_buff *skb,
                                              struct tcmsg *t, bool rtnl_held);
        int                        (*tmplt_dump)(struct sk_buff *skb,
                                              struct net *net,
                                              void *tmplt_priv);

        struct module                *owner;
        int                        flags;
};

/* Flag values for tcf_proto_ops::flags.
 *
 * A classifier that sets TCF_PROTO_OPS_DOIT_UNLOCKED is expected to also
 * implement tcf_proto_ops->delete_empty(); without it, race conditions can
 * occur when filters are inserted and deleted simultaneously.
 */
enum tcf_proto_ops_flags {
        TCF_PROTO_OPS_DOIT_UNLOCKED = 1 << 0,
};

/* One classifier instance on a filter chain. The leading members form the
 * fast-access part; the rest holds configuration, the owning chain, a
 * spinlock, a refcount and the RCU / destroy-hashtable linkage.
 */
struct tcf_proto {
        /* Fast access part */
        struct tcf_proto __rcu        *next;
        void __rcu                *root;

        /* called under RCU BH lock */
        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        __be16                        protocol;

        /* All the rest */
        u32                        prio;
        void                        *data;
        const struct tcf_proto_ops        *ops;
        struct tcf_chain        *chain;
        /* Lock protects tcf_proto shared state and can be used by unlocked
         * classifiers to protect their private data.
         */
        spinlock_t                lock;
        bool                        deleting;
        refcount_t                refcnt;
        struct rcu_head                rcu;
        struct hlist_node        destroy_ht_node;
};

/* Qdisc view of the skb control buffer; qdisc_skb_cb() casts skb->cb to
 * this type. @data is a QDISC_CB_PRIV_LEN-byte private area whose size is
 * checked at build time by qdisc_cb_private_validate().
 */
struct qdisc_skb_cb {
        struct {
                unsigned int                pkt_len;
                u16                        slave_dev_queue_mapping;
                u16                        tc_classid;
        };
#define QDISC_CB_PRIV_LEN 20
        unsigned char                data[QDISC_CB_PRIV_LEN];
        u16                        mru;
};

/* Callback type taking the new chain head @tp_head and an opaque @priv. */
typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);

/* A chain of filters inside a tcf_block, identified by @index. The
 * lifetime-management members (refcnt, action_refcnt, explicitly_created)
 * are documented on struct tcf_block as being protected by the block lock.
 */
struct tcf_chain {
        /* Protects filter_chain. */
        struct mutex filter_chain_lock;
        struct tcf_proto __rcu *filter_chain;
        struct list_head list;
        struct tcf_block *block;
        u32 index; /* chain index */
        unsigned int refcnt;
        unsigned int action_refcnt;
        bool explicitly_created;
        bool flushing;
        const struct tcf_proto_ops *tmplt_ops;
        void *tmplt_priv;
        struct rcu_head rcu;
};

/* A block of filter chains, optionally shared (see @index), together with
 * its offload bookkeeping and the hashtable of tcf_proto instances that
 * are being destroyed.
 */
struct tcf_block {
        /* Lock protects tcf_block and lifetime-management data of chains
         * attached to the block (refcnt, action_refcnt, explicitly_created).
         */
        struct mutex lock;
        struct list_head chain_list;
        u32 index; /* block index for shared blocks */
        u32 classid; /* which class this block belongs to */
        refcount_t refcnt;
        struct net *net;
        struct Qdisc *q;
        struct rw_semaphore cb_lock; /* protects cb_list and offload counters */
        struct flow_block flow_block;
        struct list_head owner_list;
        bool keep_dst;
        atomic_t offloadcnt; /* Number of offloaded filters */
        unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
        unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
        struct {
                struct tcf_chain *chain;
                struct list_head filter_chain_list;
        } chain0;
        struct rcu_head rcu;
        DECLARE_HASHTABLE(proto_destroy_ht, 7);
        struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */
};

/* Lockdep predicates backing tcf_chain_dereference() and
 * tcf_proto_dereference().
 *
 * With CONFIG_PROVE_LOCKING they ask lockdep whether the chain's
 * filter_chain_lock / the proto's lock is held. Without it there is
 * nothing to verify, so the stubs always return true. Both variants must
 * expose identical prototypes so callers compile the same way in either
 * configuration.
 */
#ifdef CONFIG_PROVE_LOCKING
static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain)
{
        return lockdep_is_held(&chain->filter_chain_lock);
}

static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp)
{
        return lockdep_is_held(&tp->lock);
}
#else
/* Takes a tcf_chain, like the lockdep variant above (the stub used to be
 * declared with a struct tcf_block pointer by mistake).
 */
static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain)
{
        return true;
}

static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp)
{
        return true;
}
#endif /* #ifdef CONFIG_PROVE_LOCKING */

/* Dereference RCU pointer @p under the condition reported by
 * lockdep_tcf_chain_is_locked(@chain).
 */
#define tcf_chain_dereference(p, chain)                                        \
        rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain))

/* Dereference RCU pointer @p under the condition reported by
 * lockdep_tcf_proto_is_locked(@tp).
 */
#define tcf_proto_dereference(p, tp)                                        \
        rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp))

/* Build-time check that struct qdisc_skb_cb fits in skb->cb and that its
 * private data[] area is at least @sz bytes. Generates no runtime work
 * beyond the BUILD_BUG_ON() expansions.
 */
static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
{
        BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct qdisc_skb_cb));
        BUILD_BUG_ON(sizeof(((struct qdisc_skb_cb *)NULL)->data) < sz);
}

static inline int qdisc_qlen_cpu(const struct Qdisc *q)
{
        return this_cpu_ptr(q->cpu_qstats)->qlen;
}

static inline int qdisc_qlen(const struct Qdisc *q)
{
        return q->q.qlen;
}

static inline int qdisc_qlen_sum(const struct Qdisc *q)
{
        __u32 qlen = q->qstats.qlen;
        int i;

        if (qdisc_is_percpu_stats(q)) {
                for_each_possible_cpu(i)
                        qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
        } else {
                qlen += q->q.qlen;
        }

        return qlen;
}

/* View @skb's control buffer (skb->cb) as a struct qdisc_skb_cb. */
static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
{
        struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;

        return cb;
}

static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc)
{
        return &qdisc->q.lock;
}

/* Qdisc currently installed on @qdisc's device queue, read with
 * rcu_dereference_rtnl().
 */
static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc)
{
        return rcu_dereference_rtnl(qdisc->dev_queue->qdisc);
}

/* Same lookup as qdisc_root(), but read with rcu_dereference_bh(). */
static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc)
{
        struct Qdisc *root = rcu_dereference_bh(qdisc->dev_queue->qdisc);

        return root;
}

static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
{
        return qdisc->dev_queue->qdisc_sleeping;
}

/* The qdisc root lock is a mechanism by which to top level
 * of a qdisc tree can be locked from any qdisc node in the
 * forest.  This allows changing the configuration of some
 * aspect of the qdisc tree while blocking out asynchronous
 * qdisc access in the packet processing paths.
 *
 * It is only legal to do this when the root will not change
 * on us.  Otherwise we'll potentially lock the wrong qdisc
 * root.  This is enforced by holding the RTNL semaphore, which
 * all users of this lock accessor must do.
 */
static inline spinlock_t *qdisc_root_lock(const struct Qdisc *qdisc)
{
        struct Qdisc *root = qdisc_root(qdisc);

        ASSERT_RTNL();
        return qdisc_lock(root);
}

static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
{
        struct Qdisc *root = qdisc_root_sleeping(qdisc);

        ASSERT_RTNL();
        return qdisc_lock(root);
}

static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
{
        struct Qdisc *root = qdisc_root_sleeping(qdisc);

        ASSERT_RTNL();
        return &root->running;
}

static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
{
        return qdisc->dev_queue->dev;
}

static inline void sch_tree_lock(const struct Qdisc *q)
{
        spin_lock_bh(qdisc_root_sleeping_lock(q));
}

static inline void sch_tree_unlock(const struct Qdisc *q)
{
        spin_unlock_bh(qdisc_root_sleeping_lock(q));
}

/* Qdisc instance and operation tables defined outside this header. */
extern struct Qdisc noop_qdisc;
extern struct Qdisc_ops noop_qdisc_ops;
extern struct Qdisc_ops pfifo_fast_ops;
extern struct Qdisc_ops mq_qdisc_ops;
extern struct Qdisc_ops noqueue_qdisc_ops;
extern const struct Qdisc_ops *default_qdisc_ops;
/* Pick the ops for TX queue @ntx of @dev: default_qdisc_ops for queues
 * below dev->real_num_tx_queues, pfifo_fast_ops for the others.
 */
static inline const struct Qdisc_ops *
get_default_qdisc_ops(const struct net_device *dev, int ntx)
{
        if (ntx < dev->real_num_tx_queues)
                return default_qdisc_ops;

        return &pfifo_fast_ops;
}

/* Part common to hashed classes: the class id used as the lookup key by
 * qdisc_class_find() and the hash-list linkage.
 */
struct Qdisc_class_common {
        u32                        classid;
        struct hlist_node        hnode;
};

/* Class hash table: an array of bucket heads plus its size, the mask that
 * qdisc_class_find() feeds to qdisc_class_hash(), and an element count.
 */
struct Qdisc_class_hash {
        struct hlist_head        *hash;
        unsigned int                hashsize;
        unsigned int                hashmask;
        unsigned int                hashelems;
};

/* Fold class id @id (xor with itself shifted right by 8, then by 4) and
 * reduce the result with @mask to get a bucket index.
 */
static inline unsigned int qdisc_class_hash(u32 id, u32 mask)
{
        u32 folded = id ^ (id >> 8);

        folded ^= folded >> 4;
        return folded & mask;
}

/* Look up the class with id @id in @hash.
 *
 * Returns the matching entry, or NULL when @id is 0 or no entry in the
 * selected bucket carries that class id.
 */
static inline struct Qdisc_class_common *
qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id)
{
        struct Qdisc_class_common *entry;
        struct hlist_head *bucket;

        if (id == 0)
                return NULL;

        bucket = &hash->hash[qdisc_class_hash(id, hash->hashmask)];
        hlist_for_each_entry(entry, bucket, hnode)
                if (entry->classid == id)
                        return entry;

        return NULL;
}

/* Map @classid to a hardware traffic class index of @dev.
 *
 * Returns TC_H_MIN(classid) - TC_H_MIN_PRIORITY when that value is below
 * netdev_get_num_tc(dev), -EINVAL otherwise. The subtraction is done in
 * u32, so a minor below TC_H_MIN_PRIORITY wraps and is rejected too.
 */
static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
{
        u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY;

        if (hwtc >= netdev_get_num_tc(dev))
                return -EINVAL;

        return hwtc;
}

/* Class hash table management; implemented outside this header. */
int qdisc_class_hash_init(struct Qdisc_class_hash *);
void qdisc_class_hash_insert(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_remove(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *);
void qdisc_class_hash_destroy(struct Qdisc_class_hash *);

/* Device-level scheduler control and qdisc lifecycle entry points;
 * implemented outside this header.
 */
int dev_qdisc_change_tx_queue_len(struct net_device *dev);
void dev_qdisc_change_real_num_tx(struct net_device *dev,
                                  unsigned int new_real_tx);
void dev_init_scheduler(struct net_device *dev);
void dev_shutdown(struct net_device *dev);
void dev_activate(struct net_device *dev);
void dev_deactivate(struct net_device *dev);
void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
                              struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
/* Offload helpers. With CONFIG_NET_SCHED they are provided out of line;
 * otherwise the inline stubs below are used.
 */
#ifdef CONFIG_NET_SCHED
int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                              void *type_data);
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                                struct Qdisc *new, struct Qdisc *old,
                                enum tc_setup_type type, void *type_data,
                                struct netlink_ext_ack *extack);
#else
/* Stub: clears TCQ_F_OFFLOADED on @q and reports success. */
static inline int
qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                          void *type_data)
{
        q->flags &= ~TCQ_F_OFFLOADED;
        return 0;
}

/* Stub: intentionally does nothing. */
static inline void
qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                           struct Qdisc *new, struct Qdisc *old,
                           enum tc_setup_type type, void *type_data,
                           struct netlink_ext_ack *extack)
{
}
#endif
/* Qdisc allocation/creation, packet-length calculation and redirect entry
 * points; implemented outside this header.
 */
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
                          const struct Qdisc_ops *ops,
                          struct netlink_ext_ack *extack);
void qdisc_free(struct Qdisc *qdisc);
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
                                const struct Qdisc_ops *ops, u32 parentid,
                                struct netlink_ext_ack *extack);
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
                               const struct qdisc_size_table *stab);
int skb_do_redirect(struct sk_buff *);

/* Value of @skb's tc_at_ingress flag; always false when CONFIG_NET_CLS_ACT
 * is not enabled.
 */
static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
{
#ifndef CONFIG_NET_CLS_ACT
        return false;
#else
        return skb->tc_at_ingress;
#endif
}

/* Test and consume @skb's tc_skip_classify flag.
 *
 * Returns true (and clears the flag) when it was set; false otherwise, and
 * always false when CONFIG_NET_CLS_ACT is not enabled.
 */
static inline bool skb_skip_tc_classify(struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
        bool skip = skb->tc_skip_classify;

        /* One-shot flag: only written back when it was actually set. */
        if (skip)
                skb->tc_skip_classify = 0;
        return skip;
#else
        return false;
#endif
}

/* Reset all TX qdiscs greater than index of a device.  */
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
        struct Qdisc *qdisc;

        for (; i < dev->num_tx_queues; i++) {
                qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
                if (qdisc) {
                        spin_lock_bh(qdisc_lock(qdisc));
                        qdisc_reset(qdisc);
                        spin_unlock_bh(qdisc_lock(qdisc));
                }
        }
}

/* Are all TX queues of the device empty?
 *
 * Walks every TX queue under rcu_read_lock() and stops at the first qdisc
 * for which qdisc_is_empty() is false.
 */
static inline bool qdisc_all_tx_empty(const struct net_device *dev)
{
        bool all_empty = true;
        unsigned int idx;

        rcu_read_lock();
        for (idx = 0; idx < dev->num_tx_queues; idx++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);
                const struct Qdisc *q = rcu_dereference(txq->qdisc);

                if (!qdisc_is_empty(q)) {
                        all_empty = false;
                        break;
                }
        }
        rcu_read_unlock();

        return all_empty;
}

/* Are any of the TX qdiscs changing?
 *
 * A queue counts as changing when its active qdisc pointer differs from
 * its qdisc_sleeping pointer.
 */
static inline bool qdisc_tx_changing(const struct net_device *dev)
{
        unsigned int idx;

        for (idx = 0; idx < dev->num_tx_queues; idx++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);

                if (rcu_access_pointer(txq->qdisc) == txq->qdisc_sleeping)
                        continue;
                return true;
        }

        return false;
}

/* Is the device using the noop qdisc on all queues?
 *
 * Returns false as soon as one TX queue's qdisc pointer is not
 * &noop_qdisc; true otherwise (including for a device with no TX queues).
 */
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
{
        unsigned int idx;

        for (idx = 0; idx < dev->num_tx_queues; idx++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);

                if (rcu_access_pointer(txq->qdisc) == &noop_qdisc)
                        continue;
                return false;
        }

        return true;
}

static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
{
        return qdisc_skb_cb(skb)->pkt_len;
}

/* Additional qdisc xmit flags (see NET_XMIT_MASK in linux/netdevice.h). */
enum net_xmit_qdisc_t {
        __NET_XMIT_STOLEN = 1 << 16,
        __NET_XMIT_BYPASS = 1 << 17,
};

/* Number of drops (0 or 1) to account for xmit result @e: with
 * CONFIG_NET_CLS_ACT a result carrying __NET_XMIT_STOLEN counts as 0,
 * everything else as 1; without it the count is always 1.
 */
#ifndef CONFIG_NET_CLS_ACT
#define net_xmit_drop_count(e)        (1)
#else
#define net_xmit_drop_count(e)        (!((e) & __NET_XMIT_STOLEN))
#endif

/* When CONFIG_NET_SCHED is enabled and @sch has a size table attached,
 * hand @skb to __qdisc_calculate_pkt_len(); otherwise do nothing. The
 * table pointer is read with rcu_dereference_bh().
 */
static inline void qdisc_calculate_pkt_len(struct sk_buff *skb,
                                           const struct Qdisc *sch)
{
#ifdef CONFIG_NET_SCHED
        struct qdisc_size_table *table = rcu_dereference_bh(sch->stab);

        if (!table)
                return;

        __qdisc_calculate_pkt_len(skb, table);
#endif
}

/* Run qdisc_calculate_pkt_len() on @skb, then pass it to @sch's ->enqueue
 * hook and return that hook's result.
 */
static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                                struct sk_buff **to_free)
{
        int ret;

        qdisc_calculate_pkt_len(skb, sch);
        ret = sch->enqueue(skb, sch, to_free);

        return ret;
}

/* Add @bytes and @packets to the byte and packet counters of @bstats.
 * Plain (non-atomic) additions.
 */
static inline void _bstats_update(struct gnet_stats_basic_packed *bstats,
                                  __u64 bytes, __u32 packets)
{
        bstats->bytes += bytes;
        bstats->packets += packets;
}

static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
                                 const struct sk_buff *skb)
{
        _bstats_update(bstats,
                       qdisc_pkt_len(skb),
                       skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
}

/* Add @bytes/@packets to the counters embedded in @bstats, bracketed by
 * u64_stats_update_begin()/u64_stats_update_end() on its syncp.
 */
static inline void _bstats_cpu_update(struct gnet_stats_basic_cpu *bstats,
                                      __u64 bytes, __u32 packets)
{
        u64_stats_update_begin(&bstats->syncp);
        _bstats_update(&bstats->bstats, bytes, packets);
        u64_stats_update_end(&bstats->syncp);
}

/* Account @skb in the counters embedded in @bstats via bstats_update(),
 * bracketed by u64_stats_update_begin()/u64_stats_update_end() on its
 * syncp.
 */
static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats,
                                     const struct sk_buff *skb)
{
        u64_stats_update_begin(&bstats->syncp);
        bstats_update(&bstats->bstats, skb);
        u64_stats_update_end(&bstats->syncp);
}

/* Account @skb in the current CPU's slot of @sch->cpu_bstats. */
static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
                                           const struct sk_buff *skb)
{
        struct gnet_stats_basic_cpu *local = this_cpu_ptr(sch->cpu_bstats);

        bstats_cpu_update(local, skb);
}

/* Account @skb in @sch's own (non-percpu) basic statistics. */
static inline void qdisc_bstats_update(struct Qdisc *sch,
                                       const struct sk_buff *skb)
{
        struct gnet_stats_basic_packed *stats = &sch->bstats;

        bstats_update(stats, skb);
}

/* Subtract @skb's qdisc packet length from @sch->qstats.backlog. */
static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        sch->qstats.backlog -= len;
}

/* Subtract @skb's qdisc packet length from the percpu backlog counter of
 * @sch using this_cpu_sub().
 */
static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
}

/* Add @skb's qdisc packet length to @sch->qstats.backlog. */
static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        sch->qstats.backlog += len;
}

/* Add @skb's qdisc packet length to the percpu backlog counter of @sch
 * using this_cpu_add().
 */
static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
}

/* Increment the percpu qlen counter of @sch using this_cpu_inc(). */
static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->qlen);
}

/* Decrement the percpu qlen counter of @sch using this_cpu_dec(). */
static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
{
        this_cpu_dec(sch->cpu_qstats->qlen);
}

/* Increment the percpu requeues counter of @sch using this_cpu_inc(). */
static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->requeues);
}

/* Add @count to @sch->qstats.drops (plain, non-atomic addition). */
static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
{
        sch->qstats.drops += count;
}

/* Increment the drops counter of @qstats by one. */
static inline void qstats_drop_inc(struct gnet_stats_queue *qstats)
{
        qstats->drops++;
}

/* Increment the overlimits counter of @qstats by one. */
static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats)
{
        qstats->overlimits++;
}

/* Count one dropped packet in @sch's own (non per-CPU) queue stats. */
static inline void qdisc_qstats_drop(struct Qdisc *sch)
{
        struct gnet_stats_queue *stats = &sch->qstats;

        qstats_drop_inc(stats);
}

/* Increment this CPU's drops counter in @sch->cpu_qstats. */
static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->drops);
}

/* Count one overlimit event in @sch's own (non per-CPU) queue stats. */
static inline void qdisc_qstats_overlimit(struct Qdisc *sch)
{
        qstats_overlimit_inc(&sch->qstats);
}

/*
 * Hand @sch's queue statistics (per-CPU and own) to gnet_stats_copy_queue()
 * for dump @d, using qdisc_qlen_sum() as the queue length.
 * Returns whatever gnet_stats_copy_queue() returns.
 */
static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch)
{
        return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats,
                                     qdisc_qlen_sum(sch));
}

/*
 * Report @sch's current queue length and byte backlog through @qlen and
 * @backlog.  The values are gathered into a zeroed scratch
 * gnet_stats_queue by __gnet_stats_copy_queue().
 */
static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch,  __u32 *qlen,
                                             __u32 *backlog)
{
        struct gnet_stats_queue snapshot = { 0 };
        __u32 total_qlen = qdisc_qlen_sum(sch);

        __gnet_stats_copy_queue(&snapshot, sch->cpu_qstats, &sch->qstats,
                                total_qlen);

        /* Keep this store order: qlen first, then backlog. */
        *qlen = snapshot.qlen;
        *backlog = snapshot.backlog;
}

/*
 * Read @sch's current qlen/backlog and pass both to
 * qdisc_tree_reduce_backlog().
 */
static inline void qdisc_tree_flush_backlog(struct Qdisc *sch)
{
        __u32 cur_backlog;
        __u32 cur_qlen;

        qdisc_qstats_qlen_backlog(sch, &cur_qlen, &cur_backlog);
        qdisc_tree_reduce_backlog(sch, cur_qlen, cur_backlog);
}

/*
 * Reset @sch and pass the qlen/backlog it held to
 * qdisc_tree_reduce_backlog().  The counters are sampled before
 * qdisc_reset() is called.
 */
static inline void qdisc_purge_queue(struct Qdisc *sch)
{
        __u32 old_backlog;
        __u32 old_qlen;

        qdisc_qstats_qlen_backlog(sch, &old_qlen, &old_backlog);
        qdisc_reset(sch);
        qdisc_tree_reduce_backlog(sch, old_qlen, old_backlog);
}

/* Put @qh into the empty state: no head, no tail, zero length. */
static inline void qdisc_skb_head_init(struct qdisc_skb_head *qh)
{
        qh->qlen = 0;
        qh->tail = NULL;
        qh->head = NULL;
}

/*
 * Append @skb to the singly linked list in @qh and bump qh->qlen.
 * Note that skb->next is only cleared when the list was non-empty.
 */
static inline void __qdisc_enqueue_tail(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        struct sk_buff *old_tail = qh->tail;

        if (!old_tail) {
                /* Empty list: @skb becomes the head as well. */
                qh->head = skb;
        } else {
                skb->next = NULL;
                old_tail->next = skb;
        }
        qh->tail = skb;
        qh->qlen++;
}

/*
 * Append @skb to @sch->q and add its length to @sch's own backlog counter.
 * Always returns NET_XMIT_SUCCESS.
 */
static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
{
        __qdisc_enqueue_tail(skb, &sch->q);
        qdisc_qstats_backlog_inc(sch, skb);
        return NET_XMIT_SUCCESS;
}

static inline void __qdisc_enqueue_head(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        skb->next = qh->head;

        if (!qh->head)
                qh->tail = skb;
        qh->head = skb;
        qh->qlen++;
}

/*
 * Unlink and return the first skb of @qh, or NULL if the list is empty.
 * The returned skb has its ->next cleared.
 */
static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
{
        struct sk_buff *first = qh->head;

        if (unlikely(first == NULL))
                return NULL;

        qh->head = first->next;
        qh->qlen--;
        /* Removed the last element: the list has no tail any more. */
        if (!qh->head)
                qh->tail = NULL;
        first->next = NULL;

        return first;
}

/*
 * Dequeue the first skb of @sch->q.  On success the skb's length is removed
 * from the backlog and the skb is accounted in @sch's own bstats.
 * Returns the skb, or NULL if the queue is empty.
 */
static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
{
        struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);

        if (unlikely(skb == NULL))
                return NULL;

        qdisc_qstats_backlog_dec(sch, skb);
        qdisc_bstats_update(sch, skb);

        return skb;
}

/* Instead of calling kfree_skb() while root qdisc lock is held,
 * queue the skb for future freeing at end of __dev_xmit_skb()
 *
 * @skb is pushed onto the front of the list that @to_free points to:
 * skb->next takes the old list head and *to_free becomes @skb.
 */
static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free)
{
        skb->next = *to_free;
        *to_free = skb;
}

/*
 * Prepend @skb to the *@to_free list.  If skb->prev is set, the old list is
 * linked behind skb->prev instead of behind @skb itself (presumably
 * skb->prev is the last element of a chain starting at @skb; verify against
 * callers).
 */
static inline void __qdisc_drop_all(struct sk_buff *skb,
                                    struct sk_buff **to_free)
{
        struct sk_buff *link_from = skb->prev ? skb->prev : skb;

        link_from->next = *to_free;
        *to_free = skb;
}

/*
 * Remove the first skb of @qh, subtract its length from @sch's backlog and
 * queue it on *@to_free.
 * Returns the packet length of the dropped skb, or 0 if @qh was empty.
 */
static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch,
                                                   struct qdisc_skb_head *qh,
                                                   struct sk_buff **to_free)
{
        struct sk_buff *victim = __qdisc_dequeue_head(qh);
        unsigned int len;

        if (unlikely(victim == NULL))
                return 0;

        len = qdisc_pkt_len(victim);
        qdisc_qstats_backlog_dec(sch, victim);
        __qdisc_drop(victim, to_free);

        return len;
}

/* Return the first skb of @sch->q without removing it (NULL if empty). */
static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
{
        return sch->q.head;
}

/* generic pseudo peek method for non-work-conserving qdisc */
static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
{
        struct sk_buff *skb = skb_peek(&sch->gso_skb);

        /* we can reuse ->gso_skb because peek isn't called for root qdiscs */
        if (!skb) {
                skb = sch->dequeue(sch);

                if (skb) {
                        __skb_queue_head(&sch->gso_skb, skb);
                        /* it's still part of the queue */
                        qdisc_qstats_backlog_inc(sch, skb);
                        sch->q.qlen++;
                }
        }

        return skb;
}

/*
 * Update backlog, bstats and queue length of @sch for a dequeued @skb,
 * using the per-CPU counters when qdisc_is_percpu_stats() says so and the
 * qdisc's own counters otherwise.
 */
static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
                                                 struct sk_buff *skb)
{
        if (!qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_bstats_update(sch, skb);
                sch->q.qlen--;
                return;
        }

        qdisc_qstats_cpu_backlog_dec(sch, skb);
        qdisc_bstats_cpu_update(sch, skb);
        qdisc_qstats_cpu_qlen_dec(sch);
}

/*
 * Account one enqueued packet of @pkt_len bytes in @sch: queue length plus
 * one, backlog plus @pkt_len, on either the per-CPU or the qdisc's own
 * counters depending on qdisc_is_percpu_stats().
 */
static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
                                                 unsigned int pkt_len)
{
        if (!qdisc_is_percpu_stats(sch)) {
                sch->qstats.backlog += pkt_len;
                sch->q.qlen++;
                return;
        }

        qdisc_qstats_cpu_qlen_inc(sch);
        this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
}

/*
 * Use instead of qdisc->dequeue() for all qdiscs queried with ->peek().
 *
 * An skb parked on ->gso_skb is taken from there and removed from the
 * backlog/qlen accounting; otherwise the call falls through to ->dequeue().
 */
static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
{
        struct sk_buff *skb;

        if (!skb_peek(&sch->gso_skb))
                return sch->dequeue(sch);

        skb = __skb_dequeue(&sch->gso_skb);
        if (qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_cpu_backlog_dec(sch, skb);
                qdisc_qstats_cpu_qlen_dec(sch);
        } else {
                qdisc_qstats_backlog_dec(sch, skb);
                sch->q.qlen--;
        }

        return skb;
}

/*
 * Free every skb on @qh through rtnl_kfree_skbs() and leave the list empty.
 * The byte backlog of the list is not known here; correcting it is up to
 * the caller.  Asserts that RTNL is held.
 */
static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh)
{
        ASSERT_RTNL();

        if (!qh->qlen)
                return;

        rtnl_kfree_skbs(qh->head, qh->tail);
        qdisc_skb_head_init(qh);
}

/* Empty @sch->q via __qdisc_reset_queue(); the backlog is not adjusted here. */
static inline void qdisc_reset_queue(struct Qdisc *sch)
{
        __qdisc_reset_queue(&sch->q);
}

/*
 * Store @new in *@pold and purge the queue of the qdisc that was there
 * before (if any), all under sch_tree_lock(@sch).
 * Returns the previous qdisc, which may be NULL.
 */
static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
                                          struct Qdisc **pold)
{
        struct Qdisc *prev;

        sch_tree_lock(sch);
        prev = *pold;
        *pold = new;
        if (prev)
                qdisc_purge_queue(prev);
        sch_tree_unlock(sch);

        return prev;
}

/*
 * Hand the single @skb to rtnl_kfree_skbs() (head == tail == @skb) and count
 * one drop in @sch's own queue stats.
 */
static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
{
        rtnl_kfree_skbs(skb, skb);
        qdisc_qstats_drop(sch);
}

/*
 * Queue @skb on *@to_free and count one drop in @sch's per-CPU queue stats.
 * Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_cpu_drop(sch);

        return NET_XMIT_DROP;
}

/*
 * Queue @skb on *@to_free and count one drop in @sch's own queue stats.
 * Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
                             struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

/*
 * Queue @skb on *@to_free with __qdisc_drop_all() and count a single drop in
 * @sch's own queue stats.  Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop_all(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

/*
 * Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
 * long it will take to send a packet given its size.
 *
 * The table slot is (pktlen + cell_align + overhead) >> cell_log, with a
 * negative sum clamped to 0.  Slots above 255 are outside the table and are
 * computed from data[255] and the low eight bits of the slot.
 */
static inline u32 qdisc_l2t(struct qdisc_rate_table* rtab, unsigned int pktlen)
{
        int slot = pktlen + rtab->rate.cell_align + rtab->rate.overhead;
        if (slot < 0)
                slot = 0;
        slot >>= rtab->rate.cell_log;
        /* Beyond the last entry: data[255] once per 256 slots + remainder */
        if (slot > 255)
                return rtab->data[255]*(slot >> 8) + rtab->data[slot & 0xFF];
        return rtab->data[slot];
}

/*
 * Rate configuration consumed by psched_l2t_ns(), which computes
 * (len * mult) >> shift after adjusting len with overhead, mpu and linklayer.
 */
struct psched_ratecfg {
        u64        rate_bytes_ps; /* bytes per second */
        u32        mult;          /* multiplier applied to the length */
        u16        overhead;      /* added to every packet length */
        u16        mpu;           /* lower bound for the adjusted length */
        u8        linklayer;     /* compared against TC_LINKLAYER_ATM */
        u8        shift;         /* right shift applied after multiplying */
};

/*
 * Convert a packet length to a time value using rate configuration @r.
 *
 * @len is first increased by r->overhead and raised to at least r->mpu.
 * The result is (len * r->mult) >> r->shift, computed in 64 bits.
 * (The function name suggests nanoseconds; the unit is fixed by how
 * mult/shift were precomputed, which is not visible here.)
 */
static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
                                unsigned int len)
{
        len += r->overhead;
        if (len < r->mpu)
                len = r->mpu;

        /* ATM link layer: round up to 48-byte units, charge 53 bytes each */
        if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
                len = DIV_ROUND_UP(len, 48) * 53;

        return ((u64)len * r->mult) >> r->shift;
}

void psched_ratecfg_precompute(struct psched_ratecfg *r,
                               const struct tc_ratespec *conf,
                               u64 rate64);

/*
 * Fill the legacy struct tc_ratespec @res from rate configuration @r.
 * @res is zeroed first; only rate, overhead, mpu and linklayer are set.
 */
static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
                                          const struct psched_ratecfg *r)
{
        memset(res, 0, sizeof(*res));

        res->overhead = r->overhead;
        res->mpu = r->mpu;
        res->linklayer = (r->linklayer & TC_LINKLAYER_MASK);

        /* legacy struct tc_ratespec has a 32bit @rate field
         * Qdisc using 64bit rate should add new attributes
         * in order to maintain compatibility.
         * Rates that do not fit are reported as ~0U.
         */
        res->rate = min_t(u64, r->rate_bytes_ps, ~0U);
}

/* Mini Qdisc serves for specific needs of ingress/clsact Qdisc.
 * The fast path only needs to access filter list and to update stats
 */
struct mini_Qdisc {
        struct tcf_proto *filter_list;
        struct tcf_block *block;
        /* per-CPU stats, updated by mini_qdisc_bstats_cpu_update() */
        struct gnet_stats_basic_cpu __percpu *cpu_bstats;
        /* per-CPU stats, drops counted by mini_qdisc_qstats_cpu_drop() */
        struct gnet_stats_queue        __percpu *cpu_qstats;
        struct rcu_head rcu;
};

/* Account @skb in the calling CPU's instance of @miniq->cpu_bstats. */
static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq,
                                                const struct sk_buff *skb)
{
        struct gnet_stats_basic_cpu *cpu_stats = this_cpu_ptr(miniq->cpu_bstats);

        bstats_cpu_update(cpu_stats, skb);
}

/* Increment this CPU's drops counter in @miniq->cpu_qstats. */
static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq)
{
        this_cpu_inc(miniq->cpu_qstats->drops);
}

/*
 * Two mini_Qdisc instances plus the location of an RCU-annotated pointer to
 * a mini_Qdisc.  NOTE(review): presumably *p_miniq points at whichever of
 * the two is currently active and mini_qdisc_pair_swap() flips it - confirm
 * against the implementation.
 */
struct mini_Qdisc_pair {
        struct mini_Qdisc miniq1;
        struct mini_Qdisc miniq2;
        struct mini_Qdisc __rcu **p_miniq;
};

void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
                          struct tcf_proto *tp_head);
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
                          struct mini_Qdisc __rcu **p_miniq);
void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
                                struct tcf_block *block);

/*
 * Make sure qdisc is no longer in SCHED state.
 * Polls __QDISC_STATE_SCHED in q->state, sleeping 1 ms between checks.
 */
static inline void qdisc_synchronize(const struct Qdisc *q)
{
        for (;;) {
                if (!test_bit(__QDISC_STATE_SCHED, &q->state))
                        break;
                msleep(1);
        }
}

#endif










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEMAP_H
#define _LINUX_PAGEMAP_H

/*
 * Copyright 1995 Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
#include <linux/hardirq.h> /* for in_interrupt() */
#include <linux/hugetlb_inline.h>

struct pagevec;

/*
 * Bits in mapping->flags.
 *
 * These are bit numbers (not masks); they are used with set_bit(),
 * clear_bit() and test_bit() on mapping->flags.
 */
enum mapping_flags {
        AS_EIO                = 0,        /* IO error on async write */
        AS_ENOSPC        = 1,        /* ENOSPC on async write */
        AS_MM_ALL_LOCKS        = 2,        /* under mm_take_all_locks() */
        AS_UNEVICTABLE        = 3,        /* e.g., ramdisk, SHM_LOCK */
        AS_EXITING        = 4,         /* final truncate in progress */
        /* writeback related tags are not used */
        AS_NO_WRITEBACK_TAGS = 5,
        AS_THP_SUPPORT = 6,        /* THPs supported */
};

/**
 * mapping_set_error - record a writeback error in the address_space
 * @mapping: the mapping in which an error should be set
 * @error: the error to set in the mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * mapping_set_error to record the error in the mapping so that it can be
 * reported when the application calls fsync(2).
 */
static inline void mapping_set_error(struct address_space *mapping, int error)
{
        if (likely(!error))
                return;

        /* Record in wb_err for checkers using errseq_t based tracking */
        __filemap_set_wb_err(mapping, error);

        /* Record it in superblock */
        if (mapping->host)
                errseq_set(&mapping->host->i_sb->s_wb_err, error);

        /* Record it in flags for now, for legacy callers */
        if (error == -ENOSPC)
                set_bit(AS_ENOSPC, &mapping->flags);
        else
                set_bit(AS_EIO, &mapping->flags);
}

/* Set the AS_UNEVICTABLE bit in mapping->flags. */
static inline void mapping_set_unevictable(struct address_space *mapping)
{
        set_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Clear the AS_UNEVICTABLE bit in mapping->flags. */
static inline void mapping_clear_unevictable(struct address_space *mapping)
{
        clear_bit(AS_UNEVICTABLE, &mapping->flags);
}

static inline bool mapping_unevictable(struct address_space *mapping)
{
        return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Set the AS_EXITING bit in mapping->flags. */
static inline void mapping_set_exiting(struct address_space *mapping)
{
        set_bit(AS_EXITING, &mapping->flags);
}

/* Return non-zero if the AS_EXITING bit is set in mapping->flags. */
static inline int mapping_exiting(struct address_space *mapping)
{
        return test_bit(AS_EXITING, &mapping->flags);
}

/* Set the AS_NO_WRITEBACK_TAGS bit in mapping->flags. */
static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
{
        set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}

/* Return 1 unless AS_NO_WRITEBACK_TAGS is set in mapping->flags, else 0. */
static inline int mapping_use_writeback_tags(struct address_space *mapping)
{
        return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}

/* Return the gfp mask stored in @mapping. */
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
        return mapping->gfp_mask;
}

/*
 * Restrict @gfp_mask to what the mapping allows: the result contains only
 * the bits set in both @gfp_mask and the mapping's own gfp mask.
 */
static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                gfp_t gfp_mask)
{
        gfp_t allowed = mapping_gfp_mask(mapping);

        return allowed & gfp_mask;
}

/*
 * Store @mask as the gfp mask of mapping @m.
 *
 * This is non-atomic.  Only to be used before the mapping is activated.
 * Probably needs a barrier...
 */
static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
{
        m->gfp_mask = mask;
}

/* Return true if the AS_THP_SUPPORT bit is set in mapping->flags. */
static inline bool mapping_thp_support(struct address_space *mapping)
{
        return test_bit(AS_THP_SUPPORT, &mapping->flags);
}

/*
 * Return the current value of mapping->nr_thps.  Without
 * CONFIG_READ_ONLY_THP_FOR_FS this always returns 0.
 */
static inline int filemap_nr_thps(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        return atomic_read(&mapping->nr_thps);
#else
        return 0;
#endif
}

/*
 * Increment mapping->nr_thps, but only for mappings that do not have
 * AS_THP_SUPPORT set.  Without CONFIG_READ_ONLY_THP_FOR_FS a call is
 * unexpected and triggers a one-time warning.
 */
static inline void filemap_nr_thps_inc(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (!mapping_thp_support(mapping))
                atomic_inc(&mapping->nr_thps);
#else
        WARN_ON_ONCE(1);
#endif
}

/*
 * Decrement mapping->nr_thps, but only for mappings that do not have
 * AS_THP_SUPPORT set.  Without CONFIG_READ_ONLY_THP_FOR_FS a call is
 * unexpected and triggers a one-time warning.
 */
static inline void filemap_nr_thps_dec(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (!mapping_thp_support(mapping))
                atomic_dec(&mapping->nr_thps);
#else
        WARN_ON_ONCE(1);
#endif
}

void release_pages(struct page **pages, int nr);

/*
 * speculatively take a reference to a page.
 * If the page is free (_refcount == 0), then _refcount is untouched, and 0
 * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
 *
 * This function must be called inside the same rcu_read_lock() section as has
 * been used to lookup the page in the pagecache radix-tree (or page table):
 * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
 *
 * Unless an RCU grace period has passed, the count of all pages coming out
 * of the allocator must be considered unstable. page_count may return higher
 * than expected, and put_page must be able to do the right thing when the
 * page has been finished with, no matter what it is subsequently allocated
 * for (because put_page is what is used here to drop an invalid speculative
 * reference).
 *
 * This is the interesting part of the lockless pagecache (and lockless
 * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
 * has the following pattern:
 * 1. find page in radix tree
 * 2. conditionally increment refcount
 * 3. check the page is still in pagecache (if no, goto 1)
 *
 * Remove-side that cares about stability of _refcount (eg. reclaim) has the
 * following (with the i_pages lock held):
 * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
 * B. remove page from pagecache
 * C. free the page
 *
 * There are 2 critical interleavings that matter:
 * - 2 runs before A: in this case, A sees elevated refcount and bails out
 * - A runs before 2: in this case, 2 sees zero refcount and retries;
 *   subsequently, B will complete and 1 will find no page, causing the
 *   lookup to return NULL.
 *
 * It is possible that between 1 and 2, the page is removed then the exact same
 * page is inserted into the same position in pagecache. That's OK: the
 * old find_get_page using a lock could equally have run before or after
 * such a re-insertion, depending on order that locks are granted.
 *
 * Lookups racing against pagecache insertion isn't a big problem: either 1
 * will find the page or it will not. Likewise, the old find_get_page could run
 * either before the insertion or afterwards, depending on timing.
 */
static inline int __page_cache_add_speculative(struct page *page, int count)
{
#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
        VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
        /*
         * Preempt must be disabled here - we rely on rcu_read_lock doing
         * this for us.
         *
         * Pagecache won't be truncated from interrupt context, so if we have
         * found a page in the radix tree here, we have pinned its refcount by
         * disabling preempt, and hence no need for the "speculative get" that
         * SMP requires.
         */
        VM_BUG_ON_PAGE(page_count(page) == 0, page);
        page_ref_add(page, count);

#else
        /* Add @count references unless the refcount is currently zero */
        if (unlikely(!page_ref_add_unless(page, count, 0))) {
                /*
                 * Either the page has been freed, or will be freed.
                 * In either case, retry here and the caller should
                 * do the right thing (see comments above).
                 */
                return 0;
        }
#endif
        /* Callers are expected to pass a non-tail page */
        VM_BUG_ON_PAGE(PageTail(page), page);

        /* @count references were taken */
        return 1;
}

/*
 * Speculatively take one reference on @page.
 * Returns 1 on success, 0 if the reference could not be taken.
 */
static inline int page_cache_get_speculative(struct page *page)
{
        return __page_cache_add_speculative(page, 1);
}

/*
 * Speculatively take @count references on @page.
 * Returns 1 on success, 0 if the references could not be taken.
 */
static inline int page_cache_add_speculative(struct page *page, int count)
{
        return __page_cache_add_speculative(page, count);
}

/**
 * attach_page_private - Attach private data to a page.
 * @page: Page to attach data to.
 * @data: Data to attach to page.
 *
 * Attaching private data to a page increments the page's reference count.
 * The data must be detached before the page will be freed.
 */
static inline void attach_page_private(struct page *page, void *data)
{
        /* Reference held for the private data; detach_page_private() drops it */
        get_page(page);
        set_page_private(page, (unsigned long)data);
        SetPagePrivate(page);
}

/**
 * detach_page_private - Detach private data from a page.
 * @page: Page to detach data from.
 *
 * Removes the data that was previously attached to the page and decrements
 * the refcount on the page.  If the page does not have PagePrivate set,
 * nothing is changed.
 *
 * Return: Data that was attached to the page, or %NULL if PagePrivate was
 * not set.
 */
static inline void *detach_page_private(struct page *page)
{
        /* Read up front; only returned if PagePrivate turns out to be set */
        void *data = (void *)page_private(page);

        if (!PagePrivate(page))
                return NULL;
        ClearPagePrivate(page);
        set_page_private(page, 0);
        /* Drop the reference taken by attach_page_private() */
        put_page(page);

        return data;
}

/*
 * Allocate a page for the page cache using @gfp.  With CONFIG_NUMA the
 * implementation lives out of line; otherwise it is a plain order-0
 * alloc_pages() call.
 */
#ifdef CONFIG_NUMA
extern struct page *__page_cache_alloc(gfp_t gfp);
#else
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
        return alloc_pages(gfp, 0);
}
#endif

static inline struct page *page_cache_alloc(struct address_space *x)
{
        return __page_cache_alloc(mapping_gfp_mask(x));
}

/* Return @x's gfp mask with __GFP_NORETRY and __GFP_NOWARN added. */
static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
        return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
}

typedef int filler_t(void *, struct page *);

pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);

/*
 * Flag bits for the fgp_flags argument of pagecache_get_page().  Each flag
 * is a distinct bit, so flags can be OR'ed together.
 */
#define FGP_ACCESSED                0x00000001
#define FGP_LOCK                0x00000002
#define FGP_CREAT                0x00000004
#define FGP_WRITE                0x00000008
#define FGP_NOFS                0x00000010
#define FGP_NOWAIT                0x00000020
#define FGP_FOR_MMAP                0x00000040
#define FGP_HEAD                0x00000080

struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                int fgp_flags, gfp_t cache_gfp_mask);

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
 *
 * Otherwise, %NULL is returned.
 *
 * This is pagecache_get_page() with no FGP flags and a zero gfp mask.
 */
static inline struct page *find_get_page(struct address_space *mapping,
                                        pgoff_t offset)
{
        return pagecache_get_page(mapping, offset, 0, 0);
}

/*
 * Like find_get_page(), but passes the caller's @fgp_flags (FGP_* bits)
 * through to pagecache_get_page().  The gfp mask passed is zero.
 */
static inline struct page *find_get_page_flags(struct address_space *mapping,
                                        pgoff_t offset, int fgp_flags)
{
        return pagecache_get_page(mapping, offset, fgp_flags, 0);
}

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @index: the page index
 *
 * Looks up the page cache entry at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * This is pagecache_get_page() with %FGP_LOCK and a zero gfp mask.
 *
 * Context: May sleep.
 * Return: A struct page or %NULL if there is no page in the cache for this
 * index.
 */
static inline struct page *find_lock_page(struct address_space *mapping,
                                        pgoff_t index)
{
        return pagecache_get_page(mapping, index, FGP_LOCK, 0);
}

/**
 * find_lock_head - Locate, pin and lock a pagecache page.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If there is a
 * page cache page, its head page is returned locked and with an increased
 * refcount.
 *
 * This is pagecache_get_page() with %FGP_LOCK | %FGP_HEAD and a zero gfp
 * mask.
 *
 * Context: May sleep.
 * Return: A struct page which is !PageTail, or %NULL if there is no page
 * in the cache for this index.
 */
static inline struct page *find_lock_head(struct address_space *mapping,
                                        pgoff_t index)
{
        return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
}

/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Looks up the page cache slot at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * If the page is not present, a new page is allocated using @gfp_mask
 * and added to the page cache and the VM's LRU list.  The page is
 * returned locked and with an increased refcount.
 *
 * On memory exhaustion, %NULL is returned.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an
 * atomic allocation!
 */
static inline struct page *find_or_create_page(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask)
{
        return pagecache_get_page(mapping, index,
                                        FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
                                        gfp_mask);
}

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 *
 * This is pagecache_get_page() with
 * %FGP_LOCK | %FGP_CREAT | %FGP_NOFS | %FGP_NOWAIT and the mapping's own
 * gfp mask.
 */
static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
                                pgoff_t index)
{
        return pagecache_get_page(mapping, index,
                        FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
                        mapping_gfp_mask(mapping));
}

/*
 * Does this page contain this index?
 *
 * For non-hugetlb pages @index is rounded down to a multiple of
 * thp_nr_pages(@head) before it is compared with the page's index; the mask
 * arithmetic assumes thp_nr_pages() is a power of two.
 */
static inline bool thp_contains(struct page *head, pgoff_t index)
{
        /* HugeTLBfs indexes the page cache in units of hpage_size */
        if (PageHuge(head))
                return head->index == index;
        return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL));
}

/*
 * Given the page we found in the page cache, return the page corresponding
 * to this index in the file
 *
 * The subpage is @head plus the low bits of @index, i.e. @index modulo
 * thp_nr_pages(@head) (assumed to be a power of two).
 */
static inline struct page *find_subpage(struct page *head, pgoff_t index)
{
        /* HugeTLBfs wants the head page regardless */
        if (PageHuge(head))
                return head;

        return head + (index & (thp_nr_pages(head) - 1));
}

unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
                          unsigned int nr_entries, struct page **entries,
                          pgoff_t *indices);
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
                        pgoff_t end, unsigned int nr_pages,
                        struct page **pages);
/*
 * find_get_pages_range() with (pgoff_t)-1, the largest pgoff_t value,
 * passed as the @end argument.
 */
static inline unsigned find_get_pages(struct address_space *mapping,
                        pgoff_t *start, unsigned int nr_pages,
                        struct page **pages)
{
        const pgoff_t no_end = (pgoff_t)-1;

        return find_get_pages_range(mapping, start, no_end, nr_pages, pages);
}
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
                               unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
                        pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
                        struct page **pages);
/*
 * find_get_pages_range_tag() with (pgoff_t)-1, the largest pgoff_t value,
 * passed as the @end argument.
 */
static inline unsigned find_get_pages_tag(struct address_space *mapping,
                        pgoff_t *index, xa_mark_t tag, unsigned int nr_pages,
                        struct page **pages)
{
        const pgoff_t no_end = (pgoff_t)-1;

        return find_get_pages_range_tag(mapping, index, no_end, tag,
                                        nr_pages, pages);
}

struct page *grab_cache_page_write_begin(struct address_space *mapping,
                        pgoff_t index, unsigned flags);

/*
 * Returns locked page at given index in given cache, creating it if needed.
 * The allocation mask is the mapping's own gfp mask.
 */
static inline struct page *grab_cache_page(struct address_space *mapping,
                                                                pgoff_t index)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return find_or_create_page(mapping, index, gfp);
}

extern struct page * read_cache_page(struct address_space *mapping,
                                pgoff_t index, filler_t *filler, void *data);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);
extern int read_cache_pages(struct address_space *mapping,
                struct list_head *pages, filler_t *filler, void *data);

/*
 * Convenience wrapper around read_cache_page() that supplies no explicit
 * filler (NULL) and forwards @data unchanged.
 */
static inline struct page *read_mapping_page(struct address_space *mapping,
                                pgoff_t index, void *data)
{
        filler_t *no_filler = NULL;

        return read_cache_page(mapping, index, no_filler, data);
}

/*
 * Get index of the page within radix-tree (but not for hugetlb pages).
 * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
 */
static inline pgoff_t page_to_index(struct page *page)
{
        /* Anything that is not a THP tail page carries its own ->index. */
        if (likely(!PageTransTail(page)))
                return page->index;

        /*
         * ->index is not initialized for tail pages: take the head page's
         * index and add this page's distance from the head.
         */
        return compound_head(page)->index + (page - compound_head(page));
}

extern pgoff_t hugetlb_basepage_index(struct page *page);

/*
 * Get the offset in PAGE_SIZE (even for hugetlb pages).
 * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
 */
static inline pgoff_t page_to_pgoff(struct page *page)
{
        /* Common case: not a hugetlb page, so page_to_index() applies. */
        if (likely(!PageHuge(page)))
                return page_to_index(page);

        return hugetlb_basepage_index(page);
}

/*
 * Return byte-offset into filesystem object for page.
 */
static inline loff_t page_offset(struct page *page)
{
        return ((loff_t)page->index) << PAGE_SHIFT;
}

/* Like page_offset(), but based on page_index(page) instead of page->index. */
static inline loff_t page_file_offset(struct page *page)
{
        loff_t pos = page_index(page);

        return pos << PAGE_SHIFT;
}

extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
                                     unsigned long address);

/*
 * Convert a virtual @address inside @vma into a page offset: the number of
 * pages from vma->vm_start plus vma->vm_pgoff.  Hugetlb VMAs are delegated
 * to linear_hugepage_index().
 */
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
                                        unsigned long address)
{
        if (unlikely(is_vm_hugetlb_page(vma)))
                return linear_hugepage_index(vma, address);

        return ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}

/*
 * Key that wake_page_match() compares against each waiter.
 */
struct wait_page_key {
        /* page and bit_nr are compared with the waiter's fields of the same name */
        struct page *page;
        int bit_nr;
        /* set to 1 by wake_page_match() once a waiter for the same page is seen */
        int page_match;
};

/*
 * A waiter on a page bit; wake_page_match() compares its page and bit_nr
 * against a struct wait_page_key.
 */
struct wait_page_queue {
        struct page *page;
        int bit_nr;
        /* NOTE(review): presumably the entry linked on the page's wait queue - confirm */
        wait_queue_entry_t wait;
};

/*
 * Decide whether @wait_page is waiting for the event described by @key.
 *
 * A waiter on a different page never matches.  Once the page matches,
 * key->page_match is set to 1 even if the bit number then turns out to
 * differ; the return value is true only when both page and bit match.
 */
static inline bool wake_page_match(struct wait_page_queue *wait_page,
                                  struct wait_page_key *key)
{
        if (wait_page->page != key->page)
                return false;

        /* Record that at least one waiter for this page was seen. */
        key->page_match = 1;

        return wait_page->bit_nr == key->bit_nr;
}

extern void __lock_page(struct page *page);
extern int __lock_page_killable(struct page *page);
extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                unsigned int flags);
extern void unlock_page(struct page *page);

/*
 * Try to set PG_locked on the compound head of @page without waiting.
 * Return true if the page was successfully locked.
 */
static inline int trylock_page(struct page *page)
{
        struct page *head = compound_head(page);

        return likely(!test_and_set_bit_lock(PG_locked, &head->flags));
}

/*
 * lock_page may only be called if we have the page's inode pinned.
 *
 * Tries trylock_page() first and only falls back to __lock_page() when
 * that attempt fails.
 */
static inline void lock_page(struct page *page)
{
        might_sleep();

        if (trylock_page(page))
                return;

        __lock_page(page);
}

/*
 * lock_page_killable is like lock_page but can be interrupted by fatal
 * signals.  It returns 0 if it locked the page and -EINTR if it was
 * killed while waiting.
 */
static inline int lock_page_killable(struct page *page)
{
        might_sleep();

        /* Fast path: the page was not locked by anyone else. */
        if (trylock_page(page))
                return 0;

        return __lock_page_killable(page);
}

/*
 * lock_page_async - Lock the page, unless this would block. If the page
 * is already locked, then queue a callback when the page becomes unlocked.
 * This callback can then retry the operation.
 *
 * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page
 * was already locked and the callback defined in 'wait' was queued.
 */
static inline int lock_page_async(struct page *page,
                                  struct wait_page_queue *wait)
{
        /* Fast path: took the lock immediately, nothing to queue. */
        if (trylock_page(page))
                return 0;

        return __lock_page_async(page, wait);
}

/*
 * lock_page_or_retry - Lock the page, unless this would block and the
 * caller indicated that it can handle a retry.
 *
 * Returns 1 when the page was locked by the trylock fast path; otherwise
 * the result of __lock_page_or_retry() normalized to 0 or 1.  Return value
 * and mmap_lock implications depend on flags; see __lock_page_or_retry().
 */
static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                     unsigned int flags)
{
        might_sleep();

        if (trylock_page(page))
                return 1;

        return __lock_page_or_retry(page, mm, flags) != 0;
}

/*
 * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc.,
 * and should not be used directly.
 */
extern void wait_on_page_bit(struct page *page, int bit_nr);
extern int wait_on_page_bit_killable(struct page *page, int bit_nr);

/*
 * Wait for a page to be unlocked.
 *
 * This must be called with the caller "holding" the page,
 * i.e. with increased "page->count" so that the page won't
 * go away during the wait.
 */
static inline void wait_on_page_locked(struct page *page)
{
        /* Nothing to wait for if the page is not locked. */
        if (!PageLocked(page))
                return;

        wait_on_page_bit(compound_head(page), PG_locked);
}

/*
 * Killable variant of wait_on_page_locked(): returns 0 straight away when
 * the page is not locked, otherwise the result of
 * wait_on_page_bit_killable() on the compound head.
 */
static inline int wait_on_page_locked_killable(struct page *page)
{
        if (PageLocked(page))
                return wait_on_page_bit_killable(compound_head(page),
                                                 PG_locked);

        return 0;
}

extern void put_and_wait_on_page_locked(struct page *page);

void wait_on_page_writeback(struct page *page);
extern void end_page_writeback(struct page *page);
void wait_for_stable_page(struct page *page);

void page_endio(struct page *page, bool is_write, int err);

/*
 * Add an arbitrary waiter to a page's wait queue
 */
extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);

/*
 * Fault everything in given userspace address range in.
 *
 * Writes a zero byte at @uaddr and then at every PAGE_SIZE step after it
 * up to the last byte of the range, plus one more write to the last byte
 * itself when the stepping did not already reach its page.
 *
 * Returns 0 when @size is 0 or all writes succeed, -EFAULT when the range
 * wraps around or a write in the loop fails, and otherwise the result of
 * the final __put_user().
 */
static inline int fault_in_pages_writeable(char __user *uaddr, int size)
{
        /* address of the last byte in the range */
        char __user *end = uaddr + size - 1;

        if (unlikely(size == 0))
                return 0;

        /* start beyond the last byte: the range is not usable */
        if (unlikely(uaddr > end))
                return -EFAULT;
        /*
         * Writing zeroes into userspace here is OK, because we know that if
         * the zero gets there, we'll be overwriting it.
         */
        do {
                if (unlikely(__put_user(0, uaddr) != 0))
                        return -EFAULT;
                uaddr += PAGE_SIZE;
        } while (uaddr <= end);

        /* Check whether the range spilled into the next page. */
        if (((unsigned long)uaddr & PAGE_MASK) ==
                        ((unsigned long)end & PAGE_MASK))
                return __put_user(0, end);

        return 0;
}

/*
 * Read-side counterpart of fault_in_pages_writeable(): reads one byte at
 * @uaddr and then at every PAGE_SIZE step after it up to the last byte of
 * the range, plus one more read of the last byte itself when the stepping
 * did not already reach its page.
 *
 * Returns 0 when @size is 0 or all reads succeed, -EFAULT when the range
 * wraps around or a read in the loop fails, and otherwise the result of
 * the final __get_user().
 */
static inline int fault_in_pages_readable(const char __user *uaddr, int size)
{
        /* NOTE(review): volatile presumably keeps the reads from being optimized away - confirm */
        volatile char c;
        /* address of the last byte in the range */
        const char __user *end = uaddr + size - 1;

        if (unlikely(size == 0))
                return 0;

        /* start beyond the last byte: the range is not usable */
        if (unlikely(uaddr > end))
                return -EFAULT;

        do {
                if (unlikely(__get_user(c, uaddr) != 0))
                        return -EFAULT;
                uaddr += PAGE_SIZE;
        } while (uaddr <= end);

        /* Check whether the range spilled into the next page. */
        if (((unsigned long)uaddr & PAGE_MASK) ==
                        ((unsigned long)end & PAGE_MASK)) {
                return __get_user(c, end);
        }

        /* the value read is never used; NOTE(review): presumably silences a warning */
        (void)c;
        return 0;
}

int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);
extern void delete_from_page_cache(struct page *page);
extern void __delete_from_page_cache(struct page *page, void *shadow);
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct pagevec *pvec);

/*
 * Like add_to_page_cache_locked, but used to add newly allocated pages:
 * the page is new, so we can just run __SetPageLocked() against it.
 *
 * On failure the lock bit is cleared again and the error code from
 * add_to_page_cache_locked() is returned.
 */
static inline int add_to_page_cache(struct page *page,
                struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
        int ret;

        __SetPageLocked(page);
        ret = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
        if (likely(!ret))
                return 0;

        /* Insertion failed: undo the lock taken above. */
        __ClearPageLocked(page);
        return ret;
}

/**
 * struct readahead_control - Describes a readahead request.
 *
 * A readahead request is for consecutive pages.  Filesystems which
 * implement the ->readahead method should call readahead_page() or
 * readahead_page_batch() in a loop and attempt to start I/O against
 * each page in the request.
 *
 * Most of the fields in this struct are private and should be accessed
 * by the functions below.
 *
 * @file: The file, used primarily by network filesystems for authentication.
 *          May be NULL if invoked internally by the filesystem.
 * @mapping: Readahead this filesystem object.
 */
struct readahead_control {
        struct file *file;
        struct address_space *mapping;
/* private: use the readahead_* accessors instead */
        /* index of the first page not yet handed out; see readahead_index() */
        pgoff_t _index;
        /* pages remaining in the request; see readahead_count() */
        unsigned int _nr_pages;
        /* pages covered by the most recent readahead_page()/batch call */
        unsigned int _batch_count;
};

/*
 * Declare a struct readahead_control named @rac with .file = @f,
 * .mapping = @m and ._index = @i.  The remaining members are not named in
 * the initializer, so they start out as zero.
 */
#define DEFINE_READAHEAD(rac, f, m, i)                                        \
        struct readahead_control rac = {                                \
                .file = f,                                                \
                .mapping = m,                                                \
                ._index = i,                                                \
        }

/* Number of pages that make up SZ_128K bytes. */
#define VM_READAHEAD_PAGES        (SZ_128K / PAGE_SIZE)

void page_cache_ra_unbounded(struct readahead_control *,
                unsigned long nr_to_read, unsigned long lookahead_count);
void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *,
                unsigned long req_count);
void page_cache_async_ra(struct readahead_control *, struct file_ra_state *,
                struct page *, unsigned long req_count);

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * Call this on a cache miss: it will submit the read.  The readahead logic
 * may decide to piggyback more pages onto the read request if access
 * patterns suggest it will improve performance.
 *
 * This wrapper builds an on-stack readahead_control from @file, @mapping
 * and @index and hands it to page_cache_sync_ra().
 */
static inline
void page_cache_sync_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file, pgoff_t index,
                unsigned long req_count)
{
        DEFINE_READAHEAD(request, file, mapping, index);

        page_cache_sync_ra(&request, ra, req_count);
}

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @page: The page at @index which triggered the readahead call.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * Call this when a page is used which is marked as PageReadahead; this is
 * a marker to suggest that the application has used up enough of the
 * readahead window that we should start pulling in more pages.
 *
 * This wrapper builds an on-stack readahead_control from @file, @mapping
 * and @index and hands it to page_cache_async_ra().
 */
static inline
void page_cache_async_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file,
                struct page *page, pgoff_t index, unsigned long req_count)
{
        DEFINE_READAHEAD(request, file, mapping, index);

        page_cache_async_ra(&request, ra, page, req_count);
}

/**
 * readahead_page - Get the next page to read.
 * @rac: The current readahead request.
 *
 * Each call first steps the request past whatever the previous call
 * handed out, then looks up the page at the new start index.
 *
 * Context: The page is locked and has an elevated refcount.  The caller
 * should decrease the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: A pointer to the next page, or %NULL if we are done.
 */
static inline struct page *readahead_page(struct readahead_control *rac)
{
        struct page *page;

        /* Consume the pages returned by the previous call. */
        BUG_ON(rac->_batch_count > rac->_nr_pages);
        rac->_nr_pages -= rac->_batch_count;
        rac->_index += rac->_batch_count;

        if (rac->_nr_pages) {
                page = xa_load(&rac->mapping->i_pages, rac->_index);
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                /* Remember how many base pages this one covers. */
                rac->_batch_count = thp_nr_pages(page);
                return page;
        }

        /* Request exhausted. */
        rac->_batch_count = 0;
        return NULL;
}

/*
 * Fill @array with up to @array_sz pages of the readahead request @rac.
 *
 * The request is first stepped past the pages handed out by the previous
 * call.  The pages found in the mapping between the new start index and
 * the end of the request are then stored in @array, and the number of base
 * pages they cover is accumulated in rac->_batch_count.
 *
 * Returns the number of entries stored in @array.
 */
static inline unsigned int __readahead_batch(struct readahead_control *rac,
                struct page **array, unsigned int array_sz)
{
        unsigned int i = 0;
        XA_STATE(xas, &rac->mapping->i_pages, 0);
        struct page *page;

        /* Consume the pages returned by the previous call. */
        BUG_ON(rac->_batch_count > rac->_nr_pages);
        rac->_nr_pages -= rac->_batch_count;
        rac->_index += rac->_batch_count;
        rac->_batch_count = 0;

        /* Walk the remaining range of the request under the RCU read lock. */
        xas_set(&xas, rac->_index);
        rcu_read_lock();
        xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) {
                /* skip entries for which xas_retry() asks for a retry */
                if (xas_retry(&xas, page))
                        continue;
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(PageTail(page), page);
                array[i++] = page;
                rac->_batch_count += thp_nr_pages(page);

                /*
                 * The page cache isn't using multi-index entries yet,
                 * so the xas cursor needs to be manually moved to the
                 * next index.  This can be removed once the page cache
                 * is converted.
                 */
                if (PageHead(page))
                        xas_set(&xas, rac->_index + rac->_batch_count);

                /* stop once the caller's array is full */
                if (i == array_sz)
                        break;
        }
        rcu_read_unlock();

        return i;
}

/**
 * readahead_page_batch - Get a batch of pages to read.
 * @rac: The current readahead request.
 * @array: An array of pointers to struct page.
 *
 * The capacity is taken with ARRAY_SIZE(@array), so @array must be a real
 * array rather than a pointer.
 *
 * Context: The pages are locked and have an elevated refcount.  The caller
 * should decrease the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: The number of pages placed in the array.  0 indicates the request
 * is complete.
 */
#define readahead_page_batch(rac, array)                                \
        __readahead_batch(rac, array, ARRAY_SIZE(array))

/**
 * readahead_pos - The byte offset into the file of this readahead request.
 * @rac: The readahead request.
 */
static inline loff_t readahead_pos(struct readahead_control *rac)
{
        return (loff_t)rac->_index * PAGE_SIZE;
}

/**
 * readahead_length - The number of bytes in this readahead request.
 * @rac: The readahead request.
 */
static inline loff_t readahead_length(struct readahead_control *rac)
{
        return (loff_t)rac->_nr_pages * PAGE_SIZE;
}

/**
 * readahead_index - The index of the first page in this readahead request.
 * @rac: The readahead request.
 *
 * Return: The private _index field of @rac.
 */
static inline pgoff_t readahead_index(struct readahead_control *rac)
{
        const pgoff_t first = rac->_index;

        return first;
}

/**
 * readahead_count - The number of pages in this readahead request.
 * @rac: The readahead request.
 *
 * Return: The private _nr_pages field of @rac.
 */
static inline unsigned int readahead_count(struct readahead_control *rac)
{
        const unsigned int remaining = rac->_nr_pages;

        return remaining;
}

/* Number of pages needed to hold inode->i_size bytes, rounded up. */
static inline unsigned long dir_pages(struct inode *inode)
{
        unsigned long rounded = (unsigned long)(inode->i_size + PAGE_SIZE - 1);

        return rounded >> PAGE_SHIFT;
}

/**
 * page_mkwrite_check_truncate - check if page was truncated
 * @page: the page to check
 * @inode: the inode to check the page against
 *
 * Returns the number of bytes in the page up to EOF,
 * or -EFAULT if the page was truncated.
 */
static inline int page_mkwrite_check_truncate(struct page *page,
                                              struct inode *inode)
{
        loff_t isize = i_size_read(inode);
        pgoff_t eof_index = isize >> PAGE_SHIFT;
        int eof_offset = offset_in_page(isize);

        /* page no longer belongs to this inode's mapping */
        if (page->mapping != inode->i_mapping)
                return -EFAULT;

        /* page is wholly inside EOF */
        if (page->index < eof_index)
                return PAGE_SIZE;

        /* page is partially inside EOF */
        if (page->index == eof_index && eof_offset)
                return eof_offset;

        /* page is wholly past EOF */
        return -EFAULT;
}

/**
 * i_blocks_per_page - How many blocks fit in this page.
 * @inode: The inode which contains the blocks.
 * @page: The page (head page if the page is a THP).
 *
 * The page size in bytes is shifted right by the inode's block-size bits,
 * so the result is zero if the block size is larger than this page.
 *
 * Context: The caller should hold a refcount on the page to prevent it
 * from being split.
 * Return: The number of filesystem blocks covered by this page.
 */
static inline
unsigned int i_blocks_per_page(struct inode *inode, struct page *page)
{
        unsigned long bytes = thp_size(page);

        return bytes >> inode->i_blkbits;
}
#endif /* _LINUX_PAGEMAP_H */




















































































    1 










































































































































































































































    1 







    1 































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/tracepoint-defs.h>

/*
 * The set of flags that only affect watermark checking and reclaim
 * behaviour. This is used by the MM to obey the caller constraints
 * about IO, FS and watermark checking while ignoring placement
 * hints such as HIGHMEM usage.
 */
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
                        __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
                        __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
                        __GFP_ATOMIC)

/*
 * The GFP flags allowed during early boot: every bit of __GFP_BITS_MASK
 * except the reclaim, IO and FS bits.
 */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))

/* Control allocation cpuset and node placement constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)

/*
 * Do not use these with a slab allocator: __GFP_DMA32, __GFP_HIGHMEM and
 * any bit outside __GFP_BITS_MASK.
 */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

void page_writeback_init(void);

vm_fault_t do_swap_page(struct vm_fault *vmf);

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);

/*
 * True when @vma has none of VM_LOCKED, VM_HUGETLB or VM_PFNMAP set in
 * its vm_flags.
 */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
        return (vma->vm_flags & (VM_LOCKED | VM_HUGETLB | VM_PFNMAP)) == 0;
}

void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details);

void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
                unsigned long lookahead_size);
void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
                unsigned long nr);
/*
 * Build an on-stack readahead_control from @file, @mapping and @index and
 * pass it, together with the file's own readahead state (&file->f_ra), to
 * force_page_cache_ra().
 */
static inline void force_page_cache_readahead(struct address_space *mapping,
                struct file *file, pgoff_t index, unsigned long nr_to_read)
{
        DEFINE_READAHEAD(request, file, mapping, index);

        force_page_cache_ra(&request, &file->f_ra, nr_to_read);
}

struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);

/**
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 *
 */
static inline bool page_evictable(struct page *page)
{
        bool evictable;

        /* Prevent address_space of inode and swap cache from being freed */
        rcu_read_lock();
        evictable = !(mapping_unevictable(page_mapping(page)) ||
                      PageMlocked(page));
        rcu_read_unlock();

        return evictable;
}

/*
 * Turn a non-refcounted page (->_refcount == 0) into refcounted with
 * a count of one.
 *
 * The two VM_BUG_ON_PAGE() checks assert that @page is not a tail page and
 * that its reference count is currently zero.
 */
static inline void set_page_refcounted(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(page_ref_count(page), page);
        set_page_count(page, 1);
}

extern unsigned long highest_memmap_pfn;

/*
 * Maximum number of reclaim retries without progress before the OOM
 * killer is considered the only way forward.
 */
#define MAX_RECLAIM_RETRIES 16

/*
 * in mm/vmscan.c:
 */
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);

/*
 * in mm/rmap.c:
 */
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);

/*
 * in mm/page_alloc.c
 */

/*
 * Structure for holding the mostly immutable allocation parameters passed
 * between functions involved in allocations, including the alloc_pages*
 * family of functions.
 *
 * nodemask, migratetype and highest_zoneidx are initialized only once in
 * __alloc_pages_nodemask() and then never change.
 *
 * zonelist, preferred_zoneref and highest_zoneidx are set first in
 * __alloc_pages_nodemask() for the fast path, and might be later changed
 * in __alloc_pages_slowpath(). All other functions pass the whole structure
 * by a const pointer.
 *
 * NOTE(review): highest_zoneidx appears in both lists above ("never
 * change" and "might be later changed") - confirm which one applies.
 */
struct alloc_context {
        struct zonelist *zonelist;
        nodemask_t *nodemask;
        struct zoneref *preferred_zoneref;
        int migratetype;

        /*
         * highest_zoneidx represents highest usable zone index of
         * the allocation request. Due to the nature of the zone,
         * memory on lower zone than the highest_zoneidx will be
         * protected by lowmem_reserve[highest_zoneidx].
         *
         * highest_zoneidx is also used by reclaim/compaction to limit
         * the target zone since higher zone than this index cannot be
         * usable for this allocation request.
         */
        enum zone_type highest_zoneidx;
        bool spread_dirty_pages;
};

/*
 * Compute the PFN of the buddy of the order-@order block that starts at
 * @page_pfn.
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (B1) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P (the combined block the
 * two buddies form) which satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 *
 * The bit is built as an unsigned long so that it has the same width as
 * the PFN: shifting a plain int is undefined once @order reaches the width
 * of int minus one, and a negative int would be sign-extended when widened.
 *
 * Returns the buddy's PFN, i.e. @page_pfn with bit @order flipped.
 */
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
        return page_pfn ^ (1UL << order);
}

extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone);

/*
 * Return the struct page for @start_pfn.  When the zone is flagged as
 * contiguous the PFN is converted directly; otherwise the lookup is left
 * to __pageblock_pfn_to_page().
 */
static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone)
{
        if (!zone->contiguous)
                return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);

        return pfn_to_page(start_pfn);
}

extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
                                    int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
                                        unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
extern void post_alloc_hook(struct page *page, unsigned int order,
                                        gfp_t gfp_flags);
extern int user_min_free_kbytes;

extern void zone_pcp_update(struct zone *zone);
extern void zone_pcp_reset(struct zone *zone);

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

/*
 * in mm/compaction.c
 */
/*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 * are moved to the end of a zone during a compaction run and the run
 * completes when free_pfn <= migrate_pfn
 */
struct compact_control {
        struct list_head freepages;        /* List of free pages to migrate to */
        struct list_head migratepages;        /* List of pages being migrated */
        unsigned int nr_freepages;        /* Number of isolated free pages */
        unsigned int nr_migratepages;        /* Number of pages to migrate */
        unsigned long free_pfn;                /* isolate_freepages search base */
        unsigned long migrate_pfn;        /* isolate_migratepages search base */
        unsigned long fast_start_pfn;        /* a pfn to start linear scan from */
        struct zone *zone;                /* NOTE(review): presumably the zone being compacted - confirm */
        unsigned long total_migrate_scanned;        /* NOTE(review): looks like a migrate-scanner page count - confirm */
        unsigned long total_free_scanned;        /* NOTE(review): looks like a free-scanner page count - confirm */
        unsigned short fast_search_fail;/* failures to use free list searches */
        short search_order;                /* order to start a fast search at */
        const gfp_t gfp_mask;                /* gfp mask of a direct compactor */
        int order;                        /* order a direct compactor needs */
        int migratetype;                /* migratetype of direct compactor */
        const unsigned int alloc_flags;        /* alloc flags of a direct compactor */
        const int highest_zoneidx;        /* zone index of a direct compactor */
        enum migrate_mode mode;                /* Async or sync migration mode */
        bool ignore_skip_hint;                /* Scan blocks even if marked skip */
        bool no_set_skip_hint;                /* Don't mark blocks for skipping */
        bool ignore_block_suitable;        /* Scan blocks considered unsuitable */
        bool direct_compaction;                /* False from kcompactd or /proc/... */
        bool proactive_compaction;        /* kcompactd proactive compaction */
        bool whole_zone;                /* Whole zone should/has been scanned */
        bool contended;                        /* Signal lock or sched contention */
        bool rescan;                        /* Rescanning the same pageblock */
        bool alloc_contig;                /* alloc_contig_range allocation */
};

/*
 * Used in direct compaction when a page should be taken from the freelists
 * immediately when one is created during the free path.
 */
struct capture_control {
        /* NOTE(review): presumably the compaction request doing the capture - confirm */
        struct compact_control *cc;
        /* NOTE(review): presumably the page handed over by the free path - confirm */
        struct page *page;
};

/*
 * Declarations only; no definitions are visible in this header.
 * NOTE(review): the pfn arguments look like a [start, end) range, and the
 * meaning of the unsigned long return values is not visible here - confirm
 * against the definitions.
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn);
unsigned long
isolate_migratepages_range(struct compact_control *cc,
                           unsigned long low_pfn, unsigned long end_pfn);
int find_suitable_fallback(struct free_area *area, unsigned int order,
                        int migratetype, bool only_stealable, bool *can_steal);

#endif

/*
 * This function returns the order of a free page in the buddy system. In
 * general, page_zone(page)->lock must be held by the caller to prevent the
 * page from being allocated in parallel and returning garbage as the order.
 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
 * page cannot be allocated or merged in parallel. Alternatively, it must
 * handle invalid values gracefully, and use buddy_order_unsafe() below.
 */
static inline unsigned int buddy_order(struct page *page)
{
        unsigned int order;

        /* Only meaningful for a PageBuddy() page; the caller checks that. */
        order = page_private(page);
        return order;
}

/*
 * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
 * PageBuddy() should be checked first by the caller to minimize race window,
 * and invalid values must be handled gracefully.
 *
 * READ_ONCE is used so that if the caller assigns the result into a local
 * variable and e.g. tests it for valid range before using, the compiler cannot
 * decide to remove the variable and inline the page_private(page) multiple
 * times, potentially observing different values in the tests and the actual
 * use of the result.
 *
 * Implemented as a macro that wraps page_private(page) in READ_ONCE().
 */
#define buddy_order_unsafe(page)        READ_ONCE(page_private(page))

/*
 * True when, of the VM_SHARED and VM_MAYWRITE bits, only VM_MAYWRITE is set.
 */
static inline bool is_cow_mapping(vm_flags_t flags)
{
        const vm_flags_t relevant = flags & (VM_SHARED | VM_MAYWRITE);

        return relevant == VM_MAYWRITE;
}

/*
 * These three helpers classify VMAs for virtual memory accounting.
 */

/*
 * Executable code area - executable, not writable, not stack
 */
static inline bool is_exec_mapping(vm_flags_t flags)
{
        /* Of the three bits examined, VM_EXEC must be the only one set. */
        const vm_flags_t examined = flags & (VM_EXEC | VM_WRITE | VM_STACK);

        return examined == VM_EXEC;
}

/*
 * Stack area - automatically grows in one direction
 *
 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
 * do_mmap() forbids all other combinations.
 */
static inline bool is_stack_mapping(vm_flags_t flags)
{
        /* Every bit of VM_STACK has to be present in flags. */
        const vm_flags_t stack_bits = flags & VM_STACK;

        return stack_bits == VM_STACK;
}

/*
 * Data area - private, writable, not stack
 */
static inline bool is_data_mapping(vm_flags_t flags)
{
        /* Of the three bits examined, VM_WRITE must be the only one set. */
        const vm_flags_t examined = flags & (VM_WRITE | VM_SHARED | VM_STACK);

        return examined == VM_WRITE;
}

/*
 * mm/util.c
 *
 * Declarations only.
 * NOTE(review): judging by the names these insert/remove a vma in the mm's
 * vma list, with 'prev' being the predecessor - confirm in mm/util.c.
 */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev);
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);

#ifdef CONFIG_MMU
/*
 * MMU-only declarations operating on the [start, end) address range of a vma.
 * NOTE(review): the range being half-open and the meaning of the long return
 * value / 'nonblocking' are not visible here - confirm against the definitions.
 */
extern long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *nonblocking);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
/* Run munlock_vma_pages_range() over the vma's full vm_start..vm_end span. */
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
{
        unsigned long start = vma->vm_start;
        unsigned long end = vma->vm_end;

        munlock_vma_pages_range(vma, start, end);
}

/*
 * Must be called with the vma's mmap_lock held for read or write, and with
 * the page locked.
 * NOTE(review): the meaning of munlock_vma_page()'s unsigned int return
 * value is not visible in this header - confirm against the definition.
 */
extern void mlock_vma_page(struct page *page);
extern unsigned int munlock_vma_page(struct page *page);

/*
 * Clear the page's PageMlocked().  This can be useful in a situation where
 * we want to unconditionally remove a page from the pagecache -- e.g.,
 * on truncation or freeing.
 *
 * It is legal to call this function for any page, mlocked or not.
 * If called for a page that is still mapped by mlocked vmas, all we do
 * is revert to lazy LRU behaviour -- semantics are not broken.
 *
 * A no-op stub with the same name is provided when CONFIG_MMU is not set.
 */
extern void clear_page_mlock(struct page *page);

/*
 * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
 * (because that does not go through the full procedure of migration ptes):
 * to migrate the Mlocked page flag; update statistics.
 */
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
        int nr_pages;

        /* Nothing to transfer unless the old page carried the Mlocked flag. */
        if (!TestClearPageMlocked(page))
                return;

        nr_pages = thp_nr_pages(page);

        /* Holding pmd lock, no change in irq context: __mod is safe */
        __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
        SetPageMlocked(newpage);
        __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
}

/* NOTE(review): presumably makes the pmd writable only if the vma allows it - confirm */
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);

/*
 * At what user virtual address is page expected in vma?
 * Returns -EFAULT if all of the page is outside the range of vma.
 * If page is a compound head, the entire compound page is considered.
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
        pgoff_t pgoff;
        unsigned long address;

        VM_BUG_ON_PAGE(PageKsm(page), page);        /* KSM page->index unusable */
        pgoff = page_to_pgoff(page);

        if (pgoff < vma->vm_pgoff) {
                /*
                 * The page starts before the vma: only a compound head whose
                 * last subpage reaches vm_pgoff overlaps it, at vm_start.
                 * Testing pgoff first avoids possibility of wrap to 0 on 32-bit.
                 */
                if (PageHead(page) &&
                    pgoff + compound_nr(page) - 1 >= vma->vm_pgoff)
                        return vma->vm_start;
                return -EFAULT;
        }

        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        /* Check for address beyond vma (or wrapped through 0?) */
        if (address < vma->vm_start || address >= vma->vm_end)
                return -EFAULT;
        return address;
}

/*
 * Then at what user virtual address will none of the page be found in vma?
 * Assumes that vma_address() already returned a good starting address.
 * If page is a compound head, the entire compound page is considered.
 */
static inline unsigned long
vma_address_end(struct page *page, struct vm_area_struct *vma)
{
        pgoff_t end_pgoff;
        unsigned long end_addr;

        VM_BUG_ON_PAGE(PageKsm(page), page);        /* KSM page->index unusable */

        /* File offset just past the last subpage of the (compound) page. */
        end_pgoff = page_to_pgoff(page) + compound_nr(page);
        end_addr = vma->vm_start + ((end_pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /* Clamp to vm_end when beyond the vma (or wrapped through 0). */
        if (end_addr < vma->vm_start || end_addr > vma->vm_end)
                return vma->vm_end;
        return end_addr;
}

static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
                                                    struct file *fpin)
{
        int flags = vmf->flags;

        if (fpin)
                return fpin;

        /*
         * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
         * anything, so we only pin the file and drop the mmap_lock if only
         * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
         */
        if (fault_flag_allow_retry_first(flags) &&
            !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
                fpin = get_file(vmf->vma->vm_file);
                mmap_read_unlock(vmf->vma->vm_mm);
        }
        return fpin;
}

#else /* !CONFIG_MMU */
/* Without CONFIG_MMU the mlock page helpers are empty stubs. */
static inline void clear_page_mlock(struct page *page)
{
}

static inline void mlock_vma_page(struct page *page)
{
}

static inline void mlock_migrate_page(struct page *new, struct page *old)
{
}

#endif /* !CONFIG_MMU */

/*
 * Return the mem_map entry representing the 'offset' subpage within
 * the maximally aligned gigantic page 'base'.  Handle any discontiguity
 * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
 */
static inline struct page *mem_map_offset(struct page *base, int offset)
{
        /* Plain pointer arithmetic unless offset reaches MAX_ORDER_NR_PAGES. */
        return unlikely(offset >= MAX_ORDER_NR_PAGES) ?
                nth_page(base, offset) : base + offset;
}

/*
 * Iterator over all subpages within the maximally aligned gigantic
 * page 'base'.  Handle any discontiguity in the mem_map.
 */
static inline struct page *mem_map_next(struct page *iter,
                                                struct page *base, int offset)
{
        /* At a MAX_ORDER_NR_PAGES multiple, look the page up through its pfn. */
        if (unlikely(!(offset & (MAX_ORDER_NR_PAGES - 1)))) {
                unsigned long pfn = page_to_pfn(base) + offset;

                return pfn_valid(pfn) ? pfn_to_page(pfn) : NULL;
        }

        /* Otherwise the next struct page directly follows the current one. */
        return iter + 1;
}

/*
 * Memory initialisation debug and verification
 *
 * Levels passed to mminit_dprintk().  With CONFIG_DEBUG_MEMORY_INIT a message
 * is emitted only when its level is below mminit_loglevel; levels up to
 * MMINIT_WARNING go through pr_warn(), higher ones are printed at KERN_DEBUG.
 */
enum mminit_level {
        MMINIT_WARNING,
        MMINIT_VERIFY,
        MMINIT_TRACE
};

#ifdef CONFIG_DEBUG_MEMORY_INIT

extern int mminit_loglevel;

/*
 * Print a memory-init debug message prefixed with "mminit::<prefix> ".
 *
 * @level:  an enum mminit_level value; the message is emitted only when it
 *          is below mminit_loglevel.  Levels <= MMINIT_WARNING use pr_warn(),
 *          all others are printed at KERN_DEBUG.
 * @prefix: string literal (it is concatenated at compile time).
 * @fmt:    printf-style format string literal, followed by its arguments.
 *
 * @level is parenthesized so that an expression argument (e.g. a conditional
 * expression) keeps its meaning inside the comparisons.  Note that it is
 * still evaluated up to twice, so it must not have side effects.
 */
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
        if ((level) < mminit_loglevel) { \
                if ((level) <= MMINIT_WARNING) \
                        pr_warn("mminit::" prefix " " fmt, ##arg);        \
                else \
                        printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
        } \
} while (0)

extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_zonelist(void);
#else

/* CONFIG_DEBUG_MEMORY_INIT is off: the debug hooks are empty functions. */
static inline void
mminit_dprintk(enum mminit_level level, const char *prefix, const char *fmt, ...) { }

static inline void mminit_verify_pageflags_layout(void) { }

static inline void mminit_verify_zonelist(void) { }
#endif /* CONFIG_DEBUG_MEMORY_INIT */

/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
#ifdef CONFIG_SPARSEMEM
extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
                                            unsigned long *end_pfn);
#else
/* Without CONFIG_SPARSEMEM both pfn limits are left untouched. */
static inline void
mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn) { }
#endif /* CONFIG_SPARSEMEM */

/*
 * Result codes used with node_reclaim() (its !CONFIG_NUMA stub returns
 * NODE_RECLAIM_NOSCAN).  The negative values are parenthesized so that they
 * stay a single operand wherever the macro is expanded.
 */
#define NODE_RECLAIM_NOSCAN        (-2)
#define NODE_RECLAIM_FULL        (-1)
#define NODE_RECLAIM_SOME        0
#define NODE_RECLAIM_SUCCESS        1

#ifdef CONFIG_NUMA
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
#else
/* Without CONFIG_NUMA this always reports NODE_RECLAIM_NOSCAN. */
static inline int
node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order)
{
        return NODE_RECLAIM_NOSCAN;
}
#endif

/*
 * hwpoison filtering: declarations only.
 * NOTE(review): the hwpoison_filter_* variables look like the criteria
 * consulted by hwpoison_filter() - confirm against the definition.
 */
extern int hwpoison_filter(struct page *p);

extern u32 hwpoison_filter_dev_major;
extern u32 hwpoison_filter_dev_minor;
extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;

/*
 * The result of vm_mmap_pgoff() must be checked by the caller (__must_check).
 * NOTE(review): the meaning of the five unsigned long parameters is not
 * visible from this unnamed prototype - confirm against the definition.
 */
extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

extern void set_pageblock_order(void);
/* NOTE(review): return value presumably counts reclaimed pages - confirm */
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
                                            struct list_head *page_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN                WMARK_MIN
#define ALLOC_WMARK_LOW                WMARK_LOW
#define ALLOC_WMARK_HIGH        WMARK_HIGH
#define ALLOC_NO_WATERMARKS        0x04 /* don't check watermarks at all */

/*
 * Mask to get the watermark bits: ALLOC_NO_WATERMARKS is 0x04, so the mask
 * is 0x03, i.e. the two bits below it.
 */
#define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS-1)

/*
 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
 * cannot assume a reduced access to memory reserves is sufficient for
 * !MMU
 */
#ifdef CONFIG_MMU
#define ALLOC_OOM                0x08
#else
/* On !MMU, ALLOC_OOM is the same bit as ALLOC_NO_WATERMARKS. */
#define ALLOC_OOM                ALLOC_NO_WATERMARKS
#endif

#define ALLOC_HARDER                 0x10 /* try to alloc harder */
#define ALLOC_HIGH                 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET                 0x40 /* check for correct cpuset */
#define ALLOC_CMA                 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT        0x100 /* avoid mixing pageblock types */
#else
/* Defined as 0 without CONFIG_ZONE_DMA32, so OR-ing it in sets no bit. */
#define ALLOC_NOFRAGMENT          0x0
#endif
#define ALLOC_KSWAPD                0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */

/* Forward declarations; the full types are defined elsewhere. */
enum ttu_flags;
struct tlbflush_unmap_batch;


/*
 * only for MM internal work items which do not depend on
 * any allocations or locks which might depend on allocations
 */
extern struct workqueue_struct *mm_percpu_wq;

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
/* Batched unmap TLB flushing is not configured: the hooks are empty. */
static inline void try_to_unmap_flush(void) { }
static inline void try_to_unmap_flush_dirty(void) { }
static inline void flush_tlb_batched_pending(struct mm_struct *mm) { }
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * Flag name tables, defined elsewhere.
 * NOTE(review): presumably used to pretty-print page, vma and gfp flags - confirm.
 */
extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];

/* True when @migratetype is MIGRATE_HIGHATOMIC. */
static inline bool is_migrate_highatomic(enum migratetype migratetype)
{
        const bool highatomic = (migratetype == MIGRATE_HIGHATOMIC);

        return highatomic;
}

/* True when the migratetype of @page's pageblock is MIGRATE_HIGHATOMIC. */
static inline bool is_migrate_highatomic_page(struct page *page)
{
        const bool highatomic =
                (get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC);

        return highatomic;
}

/* NOTE(review): presumably sets up the per-cpu pagesets of @zone - confirm */
void setup_zone_pageset(struct zone *zone);

/*
 * NOTE(review): presumably describes where migration target pages should be
 * allocated from - confirm against the users of this struct.
 */
struct migration_target_control {
        int nid;                /* preferred node id */
        nodemask_t *nmask;      /* NOTE(review): presumably the allowed nodes - confirm */
        gfp_t gfp_mask;         /* NOTE(review): presumably the gfp flags for the allocation - confirm */
};

#endif        /* __MM_INTERNAL_H */






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 *  bit-based spin_lock()
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 *
 * Uses bit @bitnum of the word at @addr as the lock.  Preemption is disabled
 * on return.  Without CONFIG_SMP and CONFIG_DEBUG_SPINLOCK the bit itself is
 * not touched and only preemption is disabled.
 */
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
        /*
         * Assuming the lock is uncontended, this never enters
         * the body of the outer loop. If it is contended, then
         * within the inner loop a non-atomic test is used to
         * busywait with less bus contention for a good time to
         * attempt to acquire the lock bit.
         */
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Preemption is re-enabled while waiting for the bit to clear. */
                preempt_enable();
                do {
                        cpu_relax();
                } while (test_bit(bitnum, addr));
                preempt_disable();
        }
#endif
        __acquire(bitlock);
}

/*
 * Single attempt to take the bit lock @bitnum in *@addr.
 *
 * Returns 1 if it was acquired (preemption is then left disabled), 0 if the
 * bit was already set (preemption is re-enabled before returning).  Without
 * CONFIG_SMP and CONFIG_DEBUG_SPINLOCK the bit is not examined and the call
 * always returns 1.
 */
static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                preempt_enable();
                return 0;
        }
#endif
        __acquire(bitlock);
        return 1;
}

/*
 *  bit-based spin_unlock()
 *
 * Clears bit @bitnum in *@addr with clear_bit_unlock() (only on CONFIG_SMP or
 * CONFIG_DEBUG_SPINLOCK builds) and then re-enables preemption.  With
 * CONFIG_DEBUG_SPINLOCK, unlocking a bit that is not set triggers BUG_ON().
 */
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 *  bit-based spin_unlock()
 *  non-atomic version, which can be used eg. if the bit lock itself is
 *  protecting the rest of the flags in the word.
 *
 *  Same sequence as bit_spin_unlock(), except that the bit is cleared with
 *  __clear_bit_unlock().
 */
static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        __clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 * Return true if the lock is held.
 *
 * The answer depends on the configuration:
 *  - CONFIG_SMP or CONFIG_DEBUG_SPINLOCK: the state of bit @bitnum in *@addr;
 *  - otherwise, with CONFIG_PREEMPT_COUNT: the current preempt_count(), which
 *    is non-zero after the preempt_disable() done by the lock functions;
 *  - otherwise: always 1, as there is nothing to inspect.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
        return preempt_count();
#else
        return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */


















































    1 








    1 

















































































































    1 
    1 

    1 

    1 






















    1 












    1 


    1 
    1 


    1 

    1 

    1 







    1 











    1 




















    1 

    1 








    1 




    1 


    1 



    1 

    1 












    1 
    1 







    1 
    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  (C) 2002  David Woodhouse <dwmw2@infradead.org>
  (C) 2012  Michel Lespinasse <walken@google.com>


  linux/include/linux/rbtree_augmented.h
*/

#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H

#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>

/*
 * Please note - only struct rb_augment_callbacks and the prototypes for
 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 * The rest are implementation details you are not expected to depend on.
 *
 * See Documentation/core-api/rbtree.rst for documentation and samples.
 */

/*
 * Callbacks supplied by the user of an augmented rbtree.  The erase code in
 * this header calls ->copy and ->propagate directly and passes ->rotate on to
 * the rebalancing functions.
 */
struct rb_augment_callbacks {
        /* Update augmented data from @node upwards, stopping at @stop (may be NULL) */
        void (*propagate)(struct rb_node *node, struct rb_node *stop);
        /* Transfer the augmented data of @old to @new */
        void (*copy)(struct rb_node *old, struct rb_node *new);
        /* Called on rebalancing: @new takes the place @old had in the tree */
        void (*rotate)(struct rb_node *old, struct rb_node *new);
};

/*
 * Out-of-line insertion fixup; rb_insert_augmented() calls it with the
 * ->rotate callback of the user's rb_augment_callbacks.
 */
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Fixup the rbtree and update the augmented information when rebalancing.
 *
 * On insertion, the user must update the augmented information on the path
 * leading to the inserted node, then call rb_link_node() as usual and
 * rb_insert_augmented() instead of the usual rb_insert_color() call.
 * If rb_insert_augmented() rebalances the rbtree, it will callback into
 * a user provided function to update the augmented information on the
 * affected subtrees.
 */
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
                    const struct rb_augment_callbacks *augment)
{
        /* Only the rotate callback is needed by the insertion fixup. */
        void (*rotate_cb)(struct rb_node *old, struct rb_node *new) =
                augment->rotate;

        __rb_insert_augmented(node, root, rotate_cb);
}

/*
 * Like rb_insert_augmented(), for a cached root: when @newleft is true the
 * inserted node is recorded as the new leftmost node before rebalancing.
 */
static inline void
rb_insert_augmented_cached(struct rb_node *node,
                           struct rb_root_cached *root, bool newleft,
                           const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (newleft)
                root->rb_leftmost = node;

        rb_insert_augmented(node, plain_root, augment);
}

/*
 * Template for declaring augmented rbtree callbacks (generic case)
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
 *
 * Expands to three functions, RBNAME_propagate, RBNAME_copy and
 * RBNAME_rotate, plus the 'RBNAME' callbacks structure pointing at them.
 *
 * RBCOMPUTE is invoked as RBCOMPUTE(node, exit) and its result is used as a
 * boolean: RBNAME_propagate passes exit == true and stops walking towards the
 * root as soon as RBCOMPUTE returns true (or 'stop' is reached), while
 * RBNAME_rotate passes exit == false and ignores the result.
 */

#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                \
                             RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)        \
static inline void                                                        \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)                \
{                                                                        \
        while (rb != stop) {                                                \
                RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);        \
                if (RBCOMPUTE(node, true))                                \
                        break;                                                \
                rb = rb_parent(&node->RBFIELD);                                \
        }                                                                \
}                                                                        \
static inline void                                                        \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)                \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
}                                                                        \
static void                                                                \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)        \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
        RBCOMPUTE(old, false);                                                \
}                                                                        \
RBSTATIC const struct rb_augment_callbacks RBNAME = {                        \
        .propagate = RBNAME ## _propagate,                                \
        .copy = RBNAME ## _copy,                                        \
        .rotate = RBNAME ## _rotate                                        \
};

/*
 * Template for declaring augmented rbtree callbacks,
 * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBTYPE:      type of the RBAUGMENTED field
 * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
 *
 * Expands to RBNAME_compute_max(node, exit) and then RB_DECLARE_CALLBACKS()
 * with that function as its RBCOMPUTE.  RBNAME_compute_max() takes the
 * maximum of RBCOMPUTE(node) and the RBAUGMENTED values of the left and right
 * children (when present).  If 'exit' is true and node->RBAUGMENTED already
 * equals that maximum it returns true without writing; otherwise it stores
 * the maximum and returns false.
 */

#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,              \
                                 RBTYPE, RBAUGMENTED, RBCOMPUTE)              \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)              \
{                                                                              \
        RBSTRUCT *child;                                                      \
        RBTYPE max = RBCOMPUTE(node);                                              \
        if (node->RBFIELD.rb_left) {                                              \
                child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (node->RBFIELD.rb_right) {                                              \
                child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (exit && node->RBAUGMENTED == max)                                      \
                return true;                                                      \
        node->RBAUGMENTED = max;                                              \
        return false;                                                              \
}                                                                              \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                              \
                     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)


/*
 * Node colours.  The colour is kept in bit 0 of a node's __rb_parent_color
 * word; the parent pointer is that word with its two low bits masked off.
 */
#define        RB_RED                0
#define        RB_BLACK        1

/*
 * Helpers taking a raw __rb_parent_color value ('pc').  The argument is
 * parenthesized so that an expression such as 'a | b' is masked as a whole.
 */
#define __rb_parent(pc)    ((struct rb_node *)((pc) & ~3))

#define __rb_color(pc)     ((pc) & 1)
#define __rb_is_black(pc)  __rb_color(pc)
#define __rb_is_red(pc)    (!__rb_color(pc))
/* Helpers taking a node pointer ('rb'). */
#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)

/* Point @rb at parent @p while keeping @rb's current colour bit. */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
        unsigned long color = rb_color(rb);

        rb->__rb_parent_color = color | (unsigned long)p;
}

/* Store parent @p and @color together in @rb's __rb_parent_color word. */
static inline void rb_set_parent_color(struct rb_node *rb,
                                       struct rb_node *p, int color)
{
        unsigned long pc = (unsigned long)p;

        pc |= color;
        rb->__rb_parent_color = pc;
}

/*
 * Make @new occupy the slot that currently points at @old: the matching
 * child link of @parent, or the root pointer when @parent is NULL.
 */
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
                  struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                WRITE_ONCE(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old)
                WRITE_ONCE(parent->rb_left, new);
        else
                WRITE_ONCE(parent->rb_right, new);
}

/*
 * Same as __rb_change_child(), but the link is published with
 * rcu_assign_pointer() instead of WRITE_ONCE().
 */
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
                      struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                rcu_assign_pointer(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old)
                rcu_assign_pointer(parent->rb_left, new);
        else
                rcu_assign_pointer(parent->rb_right, new);
}

/*
 * Out-of-line rebalancing after an erase; rb_erase_augmented() calls it with
 * the node returned by __rb_erase_augmented() when that is non-NULL.
 */
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Unlink @node from the tree rooted at @root, keeping the augmented data up
 * to date through @augment->copy and @augment->propagate.
 *
 * Returns the node at which colour rebalancing has to start, or NULL when the
 * tree needs no rebalancing.  The caller is expected to hand a non-NULL
 * result to __rb_erase_color().
 */
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                     const struct rb_augment_callbacks *augment)
{
        struct rb_node *child = node->rb_right;
        struct rb_node *tmp = node->rb_left;
        struct rb_node *parent, *rebalance;
        unsigned long pc;

        if (!tmp) {
                /*
                 * Case 1: node to erase has no more than 1 child (easy!)
                 *
                 * Note that if there is one child it must be red due to 5)
                 * and node must be black due to 4). We adjust colors locally
                 * so as to bypass __rb_erase_color() later on.
                 */
                pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, child, parent, root);
                if (child) {
                        /* The child inherits both parent and colour of node */
                        child->__rb_parent_color = pc;
                        rebalance = NULL;
                } else
                        rebalance = __rb_is_black(pc) ? parent : NULL;
                tmp = parent;
        } else if (!child) {
                /* Still case 1, but this time the child is node->rb_left */
                tmp->__rb_parent_color = pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, tmp, parent, root);
                rebalance = NULL;
                tmp = parent;
        } else {
                struct rb_node *successor = child, *child2;

                tmp = child->rb_left;
                if (!tmp) {
                        /*
                         * Case 2: node's successor is its right child
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (s)  ->  (x) (c)
                         *        \
                         *        (c)
                         */
                        parent = successor;
                        child2 = successor->rb_right;

                        augment->copy(node, successor);
                } else {
                        /*
                         * Case 3: node's successor is leftmost under
                         * node's right child subtree
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (y)  ->  (x) (y)
                         *      /            /
                         *    (p)          (p)
                         *    /            /
                         *  (s)          (c)
                         *    \
                         *    (c)
                         */
                        do {
                                parent = successor;
                                successor = tmp;
                                tmp = tmp->rb_left;
                        } while (tmp);
                        child2 = successor->rb_right;
                        WRITE_ONCE(parent->rb_left, child2);
                        WRITE_ONCE(successor->rb_right, child);
                        rb_set_parent(child, successor);

                        augment->copy(node, successor);
                        augment->propagate(parent, successor);
                }

                /* Cases 2 and 3: the successor takes over node's left subtree */
                tmp = node->rb_left;
                WRITE_ONCE(successor->rb_left, tmp);
                rb_set_parent(tmp, successor);

                /* ... and node's place under node's parent (or as the root) */
                pc = node->__rb_parent_color;
                tmp = __rb_parent(pc);
                __rb_change_child(node, successor, tmp, root);

                if (child2) {
                        rb_set_parent_color(child2, parent, RB_BLACK);
                        rebalance = NULL;
                } else {
                        /* Successor's own colour is tested before it is overwritten below */
                        rebalance = rb_is_black(successor) ? parent : NULL;
                }
                successor->__rb_parent_color = pc;
                tmp = successor;
        }

        /* Refresh augmented data from the lowest modified node up to the root */
        augment->propagate(tmp, NULL);
        return rebalance;
}

/*
 * Erase @node from the augmented tree at @root, then rebalance colours if
 * the unlink step reports that this is needed.
 */
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                   const struct rb_augment_callbacks *augment)
{
        struct rb_node *rebalance;

        rebalance = __rb_erase_augmented(node, root, augment);
        if (!rebalance)
                return;

        __rb_erase_color(rebalance, root, augment->rotate);
}

/*
 * Like rb_erase_augmented(), for a cached root: if @node is the cached
 * leftmost node, the cache is moved to rb_next(@node) before the erase.
 */
static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
                          const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        /* Must be done while @node is still linked into the tree. */
        if (root->rb_leftmost == node)
                root->rb_leftmost = rb_next(node);

        rb_erase_augmented(node, plain_root, augment);
}

#endif        /* _LINUX_RBTREE_AUGMENTED_H */






































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef LLIST_H
#define LLIST_H
/*
 * Lock-less NULL terminated single linked list
 *
 * Cases where locking is not needed:
 * If there are multiple producers and multiple consumers, llist_add can be
 * used in producers and llist_del_all can be used in consumers simultaneously
 * without locking. Also a single consumer can use llist_del_first while
 * multiple producers simultaneously use llist_add, without any locking.
 *
 * Cases where locking is needed:
 * If we have multiple consumers with llist_del_first used in one consumer, and
 * llist_del_first or llist_del_all used in other consumers, then a lock is
 * needed.  This is because llist_del_first depends on list->first->next not
 * changing, but without lock protection, there's no way to be sure about that
 * if a preemption happens in the middle of the delete operation and on being
 * preempted back, the list->first is the same as before causing the cmpxchg in
 * llist_del_first to succeed. For example, while a llist_del_first operation
 * is in progress in one consumer, then a llist_del_first, llist_add,
 * llist_add (or llist_del_all, llist_add, llist_add) sequence in another
 * consumer may cause violations.
 *
 * This can be summarized as follows:
 *
 *           |   add    | del_first |  del_all
 * add       |    -     |     -     |     -
 * del_first |          |     L     |     L
 * del_all   |          |           |     -
 *
 * Where, a particular row's operation can happen concurrently with a column's
 * operation, with "-" being no lock needed, while "L" being lock is needed.
 *
 * The list entries deleted via llist_del_all can be traversed with
 * traversing function such as llist_for_each etc.  But the list
 * entries can not be traversed safely before deleted from the list.
 * The order of deleted entries is from the newest to the oldest added
 * one.  If you want to traverse from the oldest to the newest, you
 * must reverse the order by yourself before traversing.
 *
 * The basic atomic operation of this list is cmpxchg on long.  On
 * architectures that don't have NMI-safe cmpxchg implementation, the
 * list can NOT be used in NMI handlers.  So code that uses the list in
 * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 */

#include <linux/atomic.h>
#include <linux/kernel.h>

/* Head of a lock-less, NULL terminated singly linked list. */
struct llist_head {
        struct llist_node *first;        /* first entry; NULL after init_llist_head() */
};

/* Link embedded in every list entry; entries are chained through ->next. */
struct llist_node {
        struct llist_node *next;        /* following entry, NULL at the end of the list */
};

/* Static initializer for a struct llist_head: ->first = NULL (@name is unused). */
#define LLIST_HEAD_INIT(name)        { NULL }
/* Define a struct llist_head variable called @name, initialized as above. */
#define LLIST_HEAD(name)        struct llist_head name = LLIST_HEAD_INIT(name)

/**
 * init_llist_head - initialize lock-less list head
 * @list:        the head for your lock-less list
 *
 * Sets @list->first to NULL with a plain store.
 */
static inline void init_llist_head(struct llist_head *list)
{
        list->first = NULL;
}

/**
 * llist_entry - get the struct of this entry
 * @ptr:        the &struct llist_node pointer.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the llist_node within the struct.
 *
 * This is a direct wrapper around container_of(); the three arguments are
 * passed through unchanged.
 */
#define llist_entry(ptr, type, member)                \
        container_of(ptr, type, member)

/**
 * member_address_is_nonnull - check whether the member address is not NULL
 * @ptr:        the object pointer (struct type * that contains the llist_node)
 * @member:        the name of the llist_node within the struct.
 *
 * This macro is conceptually the same as
 *        &ptr->member != NULL
 * but it works around the fact that compilers can decide that taking a member
 * address is never a NULL pointer.
 *
 * Real objects that start at a high address and have a member at NULL are
 * unlikely to exist, but such pointers may be returned e.g. by the
 * container_of() macro.
 *
 * The address is computed with integer arithmetic (@ptr cast to uintptr_t
 * plus offsetof() of @member) instead of forming &ptr->member.  It serves as
 * the loop condition of llist_for_each_entry() and
 * llist_for_each_entry_safe().
 */
#define member_address_is_nonnull(ptr, member)        \
        ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0)

/**
 * llist_for_each - iterate over some deleted entries of a lock-less list
 * @pos:        the &struct llist_node to use as a loop cursor
 * @node:        the first entry of deleted list entries
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being deleted from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 *
 * The loop ends when @pos becomes NULL.  @pos is evaluated several times,
 * so it must be free of side effects; every use is parenthesized so that
 * any lvalue expression can be passed.
 */
#define llist_for_each(pos, node)                        \
        for ((pos) = (node); (pos); (pos) = (pos)->next)

/**
 * llist_for_each_safe - iterate over some deleted entries of a lock-less list
 *                         safe against removal of list entry
 * @pos:        the &struct llist_node to use as a loop cursor
 * @n:                another &struct llist_node to use as temporary storage
 * @node:        the first entry of deleted list entries
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being deleted from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 *
 * The successor of @pos is saved in @n inside the loop condition, i.e.
 * before the loop body runs; the ", true" keeps that comma expression from
 * ending the loop when the saved successor is NULL.
 */
#define llist_for_each_safe(pos, n, node)                        \
        for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))

/**
 * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
 * @pos:        the type * to use as a loop cursor.
 * @node:        the first entry of deleted list entries.
 * @member:        the name of the llist_node within the struct.
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being removed from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 */
#define llist_for_each_entry(pos, node, member)                                \
        for ((pos) = llist_entry((node), typeof(*(pos)), member);        \
             member_address_is_nonnull(pos, member);                        \
             (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))

/**
 * llist_for_each_entry_safe - iterate over some deleted entries of lock-less list of given type
 *                               safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @node:        the first entry of deleted list entries.
 * @member:        the name of the llist_node within the struct.
 *
 * In general, some entries of the lock-less list can be traversed
 * safely only after being removed from list, so start with an entry
 * instead of list head.
 *
 * If being used on entries deleted from lock-less list directly, the
 * traverse order is from the newest to the oldest added entry.  If
 * you want to traverse from the oldest to the newest, you must
 * reverse the order by yourself before traversing.
 *
 * The entry following @pos is stored in @n inside the loop condition, before
 * the loop body runs.  @pos and @n are parenthesized at every use, matching
 * llist_for_each_entry().
 */
#define llist_for_each_entry_safe(pos, n, node, member)                               \
        for ((pos) = llist_entry((node), typeof(*(pos)), member);               \
             member_address_is_nonnull(pos, member) &&                               \
                ((n) = llist_entry((pos)->member.next, typeof(*(n)), member), true); \
             (pos) = (n))

/**
 * llist_empty - tests whether a lock-less list is empty
 * @head:        the list to test
 *
 * Not guaranteed to be accurate or up to date.  Just a quick way to
 * test whether the list is empty without deleting something from the
 * list.
 */
static inline bool llist_empty(const struct llist_head *head)
{
        /* Single READ_ONCE() snapshot of the head pointer. */
        const struct llist_node *first = READ_ONCE(head->first);

        return !first;
}

/**
 * llist_next - return the entry that follows @node
 * @node:        the entry whose successor is wanted
 *
 * Plain read of @node->next (no READ_ONCE()); the result is NULL when
 * @node->next is NULL.
 */
static inline struct llist_node *llist_next(struct llist_node *node)
{
        return node->next;
}

extern bool llist_add_batch(struct llist_node *new_first,
                            struct llist_node *new_last,
                            struct llist_head *head);
/**
 * llist_add - add a new entry
 * @new:        new entry to be added
 * @head:        the head for your lock-less list
 *
 * Returns true if the list was empty prior to adding this entry.
 */
static inline bool llist_add(struct llist_node *new, struct llist_head *head)
{
        /* A batch of one: @new is passed as both the first and the last node. */
        return llist_add_batch(new, new, head);
}

/**
 * llist_del_all - delete all entries from lock-less list
 * @head:        the head of lock-less list to delete all entries
 *
 * If list is empty, return NULL, otherwise, delete all entries and
 * return the pointer to the first entry.  The order of entries
 * deleted is from the newest to the oldest added one.
 */
static inline struct llist_node *llist_del_all(struct llist_head *head)
{
        /* Store NULL into head->first with xchg() and hand its result to the caller. */
        return xchg(&head->first, NULL);
}

extern struct llist_node *llist_del_first(struct llist_head *head);

struct llist_node *llist_reverse_order(struct llist_node *head);

#endif /* LLIST_H */


























































































































































































    1 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPUFEATURE_H
#define _ASM_X86_CPUFEATURE_H

#include <asm/processor.h>

#if defined(__KERNEL__) && !defined(__ASSEMBLY__)

#include <asm/asm.h>
#include <linux/bitops.h>
#include <asm/alternative.h>

/*
 * Symbolic indices for the capability words.  The enumerators are numbered
 * consecutively from 0, so NR_CPUID_WORDS (the last one) equals the number
 * of words listed before it, currently 22.
 *
 * NOTE(review): the names look like they encode the CPUID leaf/sub-leaf and
 * output register, with CPUID_LNX_* being Linux-defined words - confirm
 * against cpufeatures.h.
 */
enum cpuid_leafs
{
        CPUID_1_EDX                = 0,
        CPUID_8000_0001_EDX,
        CPUID_8086_0001_EDX,
        CPUID_LNX_1,
        CPUID_1_ECX,
        CPUID_C000_0001_EDX,
        CPUID_8000_0001_ECX,
        CPUID_LNX_2,
        CPUID_LNX_3,
        CPUID_7_0_EBX,
        CPUID_D_1_EAX,
        CPUID_LNX_4,
        CPUID_7_1_EAX,
        CPUID_8000_0008_EBX,
        CPUID_6_EAX,
        CPUID_8000_000A_EDX,
        CPUID_7_ECX,
        CPUID_8000_0007_EBX,
        CPUID_7_EDX,
        CPUID_8000_001F_EAX,
        CPUID_8000_0021_EAX,
        CPUID_LNX_5,
        NR_CPUID_WORDS,
};

/*
 * X86_CAP_FMT / x86_cap_flag() form a printf-style format/argument pair.
 * With CONFIG_X86_FEATURE_NAMES the flag is looked up in x86_cap_flags[] and
 * formatted with "%s"; otherwise x86_cap_flag() expands to TWO arguments,
 * the word (flag >> 5) and the bit within it (flag & 31), for "%d:%d".
 */
#ifdef CONFIG_X86_FEATURE_NAMES
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
#define X86_CAP_FMT "%s"
#define x86_cap_flag(flag) x86_cap_flags[flag]
#else
#define X86_CAP_FMT "%d:%d"
#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31)
#endif

/*
 * In order to save room, we index into this array by doing
 * X86_BUG_<name> - NCAPINTS*32.
 */
extern const char * const x86_bug_flags[NBUGINTS*32];

/* Test @bit in @c->x86_capability, viewed as an unsigned long bitmap, via arch_test_bit(). */
#define test_cpu_cap(c, bit)                                                \
         arch_test_bit(bit, (unsigned long *)((c)->x86_capability))

/*
 * There are 32 bits/features in each mask word.  The high bits
 * (selected with (bit)>>5) give us the word number and the low 5
 * bits give us the bit/feature number inside the word.
 * 1UL<<((bit)&31) gives us a mask for the feature_bit so we can
 * see if it is set in the mask word.
 *
 * The mask word itself is the identifier formed by pasting @maskname and
 * @word together (maskname##word).
 */
#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit)        \
        (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word ))

/*
 * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the
 * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all
 * header macros which use NCAPINTS need to be changed. The duplicated macro
 * use causes the compiler to issue errors for all headers so that all usage
 * sites can be corrected.
 */
/*
 * True when @feature_bit is set in the REQUIRED_MASK<word> constant of the
 * word it belongs to; words 0 through 21 are checked one by one.
 */
#define REQUIRED_MASK_BIT_SET(feature_bit)                \
         ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  0, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  1, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  2, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  3, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  4, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  5, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  6, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  7, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  8, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  9, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) ||        \
           REQUIRED_MASK_CHECK                                          ||        \
           BUILD_BUG_ON_ZERO(NCAPINTS != 22))

/*
 * True when @feature_bit is set in the DISABLED_MASK<word> constant of the
 * word it belongs to; words 0 through 21 are checked one by one.  The
 * trailing BUILD_BUG_ON_ZERO() ties the list to NCAPINTS being 22.
 */
#define DISABLED_MASK_BIT_SET(feature_bit)                                \
         ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  0, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  1, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  2, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  3, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  4, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  5, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  6, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  7, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  8, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  9, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) ||        \
           DISABLED_MASK_CHECK                                          ||        \
           BUILD_BUG_ON_ZERO(NCAPINTS != 22))

/*
 * cpu_has - test feature @bit for the CPU described by @c.
 * A compile-time-constant @bit that is set in the REQUIRED mask yields 1
 * without looking at @c; anything else is tested with test_cpu_cap().
 */
#define cpu_has(c, bit)                                                        \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         test_cpu_cap(c, bit))

/*
 * this_cpu_has - same short-circuit as cpu_has(), but the fallback tests the
 * bit in the per-CPU cpu_info.x86_capability via x86_this_cpu_test_bit().
 */
#define this_cpu_has(bit)                                                \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         x86_this_cpu_test_bit(bit,                                        \
                (unsigned long __percpu *)&cpu_info.x86_capability))

/*
 * This macro is for detection of features which need kernel
 * infrastructure to be used.  It may *not* directly test the CPU
 * itself.  Use the cpu_has() family if you want true runtime
 * testing of CPU features, like in hypervisor code where you are
 * supporting a possible guest feature where host support for it
 * is not relevant.
 *
 * A compile-time-constant @bit that is set in the DISABLED mask evaluates
 * to 0 without any runtime test; every other bit goes to static_cpu_has().
 */
#define cpu_feature_enabled(bit)        \
        (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))

/* cpu_has() applied to boot_cpu_data. */
#define boot_cpu_has(bit)        cpu_has(&boot_cpu_data, bit)

/* Set @bit in @c->x86_capability with set_bit(). */
#define set_cpu_cap(c, bit)        set_bit(bit, (unsigned long *)((c)->x86_capability))

extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);

/* Set @bit both in boot_cpu_data's capability words and in cpu_caps_set. */
#define setup_force_cpu_cap(bit) do { \
        set_cpu_cap(&boot_cpu_data, bit);        \
        set_bit(bit, (unsigned long *)cpu_caps_set);        \
} while (0)

/* Bug bits are forced through the same path as capability bits. */
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)

#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO)

/*
 * Workaround for the sake of BPF compilation which utilizes kernel
 * headers, but clang does not support ASM GOTO and fails the build.
 */
#ifndef __BPF_TRACING__
#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments"
#endif

/* No asm goto available in this configuration: use the plain boot_cpu_has() test. */
#define static_cpu_has(bit)            boot_cpu_has(bit)

#else

/*
 * Static testing of CPU features. Used the same as boot_cpu_has(). It
 * statically patches the target code for additional performance. Use
 * static_cpu_has() only in fast paths, where every cycle counts. Which
 * means that the boot_cpu_has() variant is already fast enough for the
 * majority of cases and you should stick to using it as it is generally
 * only two instructions: a RIP-relative MOV and a TEST.
 */
/*
 * _static_cpu_has - asm-goto implementation behind static_cpu_has().
 * @bit: feature bit number; used as an immediate operand, so it has to be a
 *       compile-time constant at each (inlined) call site.
 *
 * Returns true via the t_yes label and false via the t_no label.
 *
 * NOTE(review): ALTERNATIVE_TERNARY() is defined elsewhere.  From its
 * arguments it appears that the initial "jmp 6f" is later replaced either by
 * nothing (falling through to t_yes) or by "jmp t_no", depending on the
 * feature - confirm against <asm/alternative.h>.
 *
 * The code at label 6 is emitted into the .altinstr_aux section.  It tests
 * the feature directly in boot_cpu_data.x86_capability: [cap_byte] is the
 * byte at index bit >> 3 and [bitnum] is the mask 1 << (bit & 7) inside
 * that byte; a non-zero result jumps to t_yes, otherwise to t_no.
 */
static __always_inline bool _static_cpu_has(u16 bit)
{
        asm_volatile_goto(
                ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
                ".section .altinstr_aux,\"ax\"\n"
                "6:\n"
                " testb %[bitnum],%[cap_byte]\n"
                " jnz %l[t_yes]\n"
                " jmp %l[t_no]\n"
                ".previous\n"
                 : : [feature]  "i" (bit),
                     [bitnum]   "i" (1 << (bit & 7)),
                     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
                 : : t_yes, t_no);
t_yes:
        return true;
t_no:
        return false;
}

/*
 * When boot_cpu_has(bit) already folds to a compile-time constant, use that
 * value; otherwise fall back to the asm-goto based _static_cpu_has().
 */
#define static_cpu_has(bit)                                        \
(                                                                \
        __builtin_constant_p(boot_cpu_has(bit)) ?                \
                boot_cpu_has(bit) :                                \
                _static_cpu_has(bit)                                \
)
#endif

/* The *_bug() helpers are aliases of the corresponding capability helpers. */
#define cpu_has_bug(c, bit)                cpu_has(c, (bit))
#define set_cpu_bug(c, bit)                set_cpu_cap(c, (bit))
#define clear_cpu_bug(c, bit)                clear_cpu_cap(c, (bit))

/* Same aliases for the static and boot-CPU variants. */
#define static_cpu_has_bug(bit)                static_cpu_has((bit))
#define boot_cpu_has_bug(bit)                cpu_has_bug(&boot_cpu_data, (bit))
#define boot_cpu_set_bug(bit)                set_cpu_cap(&boot_cpu_data, (bit))

/* Total number of feature bits: 32 per capability word. */
#define MAX_CPU_FEATURES                (NCAPINTS * 32)
#define cpu_have_feature                boot_cpu_has

/*
 * Format string and matching argument list (vendor, family, model taken from
 * boot_cpu_data); CPU_FEATURE_TYPEVAL expands to three arguments.
 */
#define CPU_FEATURE_TYPEFMT                "x86,ven%04Xfam%04Xmod%04X"
#define CPU_FEATURE_TYPEVAL                boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
                                        boot_cpu_data.x86_model

#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
#endif /* _ASM_X86_CPUFEATURE_H */











    1 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_INLINE_H
#define _LINUX_HUGETLB_INLINE_H

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/mm.h>

/* Report whether @vma has the VM_HUGETLB flag set in vm_flags. */
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_HUGETLB) != 0;
}

#else

/* Stub for builds without CONFIG_HUGETLB_PAGE: @vma is ignored, always false. */
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
        return false;
}

#endif

#endif


































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H

#include <linux/refcount.h>

struct io_wq;

/*
 * IO_WQ_WORK_* are single-bit flag values; IO_WQ_WORK_HASHED is the one
 * tested against io_wq_work.flags by io_wq_is_hashed().
 * IO_WQ_HASH_SHIFT is a shift count, not a flag.
 */
enum {
        IO_WQ_WORK_CANCEL        = 1,
        IO_WQ_WORK_HASHED        = 2,
        IO_WQ_WORK_UNBOUND        = 4,
        IO_WQ_WORK_CONCURRENT        = 16,

        IO_WQ_HASH_SHIFT        = 24,        /* upper 8 bits are used for hash key */
};

/* Result codes returned by io_wq_cancel_cb(). */
enum io_wq_cancel {
        IO_WQ_CANCEL_OK,        /* cancelled before it started */
        IO_WQ_CANCEL_RUNNING,        /* found running; cancellation was attempted */
        IO_WQ_CANCEL_NOTFOUND,        /* work not found */
};

/* Link of the singly linked work list; embedded in struct io_wq_work. */
struct io_wq_work_node {
        struct io_wq_work_node *next;        /* following node, NULL for the last one */
};

/* Singly linked list with a tail pointer; both fields are NULL after INIT_WQ_LIST(). */
struct io_wq_work_list {
        struct io_wq_work_node *first;        /* head; written with WRITE_ONCE() by the list helpers */
        struct io_wq_work_node *last;        /* tail, kept up to date by the add/cut helpers */
};

/*
 * Link @node into @list directly behind @pos.  If @pos had no successor,
 * @node becomes the new tail of @list.
 */
static inline void wq_list_add_after(struct io_wq_work_node *node,
                                     struct io_wq_work_node *pos,
                                     struct io_wq_work_list *list)
{
        struct io_wq_work_node *after = pos->next;

        pos->next = node;
        node->next = after;

        /* @pos was the tail, so the tail pointer has to move to @node. */
        if (after == NULL)
                list->last = node;
}

/*
 * Append @node to the end of @list.  For an empty list the tail pointer is
 * set first and the head is then published with WRITE_ONCE().
 */
static inline void wq_list_add_tail(struct io_wq_work_node *node,
                                    struct io_wq_work_list *list)
{
        node->next = NULL;

        if (list->first) {
                /* Non-empty: hook behind the current tail, then advance it. */
                list->last->next = node;
                list->last = node;
                return;
        }

        list->last = node;
        WRITE_ONCE(list->first, node);
}

/*
 * Unlink the part of @list that ends at @last.  The link in front of it,
 * @prev->next or list->first when @prev is NULL, is pointed at whatever
 * followed @last.  If @last was the tail, the tail pointer moves back to
 * @prev.  Finally @last->next is cleared.
 */
static inline void wq_list_cut(struct io_wq_work_list *list,
                               struct io_wq_work_node *last,
                               struct io_wq_work_node *prev)
{
        struct io_wq_work_node *rest = last->next;

        if (prev)
                prev->next = rest;
        else
                WRITE_ONCE(list->first, rest);        /* cutting from the head */

        if (list->last == last)
                list->last = prev;

        last->next = NULL;
}

/*
 * Remove the single entry @node from @list.  @prev is the entry in front of
 * @node, or NULL when @node is the first one.  Implemented as a one-node
 * wq_list_cut().
 */
static inline void wq_list_del(struct io_wq_work_list *list,
                               struct io_wq_work_node *node,
                               struct io_wq_work_node *prev)
{
        wq_list_cut(list, node, prev);
}

/*
 * wq_list_for_each - walk a work list starting at (head)->first
 * @pos:        node pointer used as the loop cursor
 * @prv:        node pointer that trails @pos by one entry; NULL on the first pass
 * @head:        pointer to the list to walk
 *
 * The walk ends when @pos becomes NULL.  @pos and @prv are evaluated several
 * times and must be side-effect free; all uses are parenthesized.
 */
#define wq_list_for_each(pos, prv, head)                        \
        for ((pos) = (head)->first, (prv) = NULL; (pos);        \
             (prv) = (pos), (pos) = (pos)->next)

/* True when the list head, read once with READ_ONCE(), is NULL. */
#define wq_list_empty(list)        (READ_ONCE((list)->first) == NULL)
/* Reset @list to the empty state: both head and tail become NULL. */
#define INIT_WQ_LIST(list)        do {                                \
        (list)->first = NULL;                                        \
        (list)->last = NULL;                                        \
} while (0)

/* One unit of work; linked into a struct io_wq_work_list through @list. */
struct io_wq_work {
        struct io_wq_work_node list;        /* list linkage, see wq_next_work() */
        unsigned flags;                        /* IO_WQ_WORK_* bits, see io_wq_is_hashed() */
};

/*
 * Return the work item linked behind @work, or NULL when @work has no
 * successor.
 */
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
{
        struct io_wq_work_node *succ = work->list.next;

        return succ ? container_of(succ, struct io_wq_work, list) : NULL;
}

/* Callback type: takes a work item and returns a work item pointer. */
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
/* Callback type: takes a work item and returns nothing. */
typedef void (io_wq_work_fn)(struct io_wq_work *);

/*
 * Reference-counted hash state; released with io_wq_put_hash().
 * NOTE(review): the roles of @map and @wait are not visible in this header -
 * confirm against io-wq.c before relying on them.
 */
struct io_wq_hash {
        refcount_t refs;                /* dropped by io_wq_put_hash(); object freed at zero */
        unsigned long map;
        struct wait_queue_head wait;
};

/*
 * Drop one reference on @hash.  When refcount_dec_and_test() reports that
 * this was the last one, the object is released with kfree().
 */
static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
        if (!refcount_dec_and_test(&hash->refs))
                return;

        kfree(hash);
}

/*
 * Creation parameters handed to io_wq_create().
 * NOTE(review): field meanings below are read off the names and types only -
 * confirm against io_wq_create().
 */
struct io_wq_data {
        struct io_wq_hash *hash;        /* presumably the hash state used for hashed work */
        struct task_struct *task;
        io_wq_work_fn *do_work;                /* callback of type io_wq_work_fn */
        free_work_fn *free_work;        /* callback of type free_work_fn */
};

struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);

void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);

int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
bool io_wq_worker_stopped(void);

/* Tell whether @work has the IO_WQ_WORK_HASHED bit set in its flags. */
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
        return (work->flags & IO_WQ_WORK_HASHED) != 0;
}

typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
                                        void *data, bool cancel_all);

#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
/* No-op stand-ins used when CONFIG_IO_WQ is not defined; @tsk is ignored. */
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif

/*
 * True only if all three hold, checked in this order: in_task() is true,
 * current has PF_IO_WORKER set in its flags, and current->pf_io_worker is
 * non-NULL.
 */
static inline bool io_wq_current_is_worker(void)
{
        if (!in_task())
                return false;
        if (!(current->flags & PF_IO_WORKER))
                return false;

        return current->pf_io_worker != NULL;
}
#endif







    1 




1
2
3
4
5
6
7
8
9
10
11
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>

/*
 * Check @addr against the physical address width of the boot CPU.
 *
 * With CONFIG_PHYS_ADDR_T_64BIT the result is 1 when no bit at or above
 * boot_cpu_data.x86_phys_bits is set in @addr, and 0 otherwise.  Without that
 * option every address is reported as valid.
 */
static inline int phys_addr_valid(resource_size_t addr)
{
#ifndef CONFIG_PHYS_ADDR_T_64BIT
        return 1;
#else
        return (addr >> boot_cpu_data.x86_phys_bits) == 0;
#endif
}









































    1 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_GETORDER_H
#define __ASM_GENERIC_GETORDER_H

#ifndef __ASSEMBLY__

#include <linux/compiler.h>
#include <linux/log2.h>

/**
 * get_order - Determine the allocation order of a memory size
 * @size: The size for which to get the order
 *
 * Determine the allocation order of a particular sized block of memory.  This
 * is on a logarithmic scale, where:
 *
 *        0 -> 2^0 * PAGE_SIZE and below
 *        1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
 *        2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
 *        3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
 *        4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
 *        ...
 *
 * The order returned is used to find the smallest allocation granule required
 * to hold an object of the specified size.
 *
 * The result is undefined if the size is 0.
 */
static inline __attribute_const__ int get_order(unsigned long size)
{
        /*
         * Compile-time constant sizes are resolved with ilog2() so that the
         * whole call can fold to a constant.
         */
        if (__builtin_constant_p(size)) {
                if (size == 0)
                        return BITS_PER_LONG - PAGE_SHIFT;

                if (size < (1UL << PAGE_SHIFT))
                        return 0;

                return ilog2(size - 1) - PAGE_SHIFT + 1;
        }

        /*
         * Run-time sizes: take (size - 1) in units of pages and return the
         * position of its highest set bit as reported by fls()/fls64().
         */
        size = (size - 1) >> PAGE_SHIFT;
#if BITS_PER_LONG == 32
        return fls(size);
#else
        return fls64(size);
#endif
}

#endif        /* __ASSEMBLY__ */

#endif        /* __ASM_GENERIC_GETORDER_H */



























































































































































































































































































































































































































































    1 


    1 



































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_H
#define _LINUX_HUGETLB_H

#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/fs.h>
#include <linux/hugetlb_inline.h>
#include <linux/cgroup.h>
#include <linux/page_ref.h>
#include <linux/list.h>
#include <linux/kref.h>
#include <linux/pgtable.h>
#include <linux/gfp.h>

/* Forward declarations: this header only uses pointers to these types. */
struct ctl_table;
struct user_struct;
struct mmu_gather;

/*
 * Fallback used when is_hugepd has not been defined before this point:
 * hugepd_t is a plain wrapper around an unsigned long, is_hugepd() always
 * evaluates to 0 and __hugepd() builds a hugepd_t from a raw value.
 */
#ifndef is_hugepd
typedef struct { unsigned long pd; } hugepd_t;
#define is_hugepd(hugepd) (0)
#define __hugepd(x) ((hugepd_t) { (x) })
#endif

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/mempolicy.h>
#include <linux/shm.h>
#include <asm/tlbflush.h>

/*
 * Huge page accounting for one subpool, bound to a single hstate.
 * Limits of -1 mean "no limit" (see the per-field comments).
 */
struct hugepage_subpool {
        spinlock_t lock;
        long count;                /* NOTE(review): presumably a reference count — confirm */
        long max_hpages;        /* Maximum huge pages or -1 if no maximum. */
        long used_hpages;        /* Used count against maximum, includes */
                                /* both alloced and reserved pages. */
        struct hstate *hstate;
        long min_hpages;        /* Minimum huge pages or -1 if no minimum. */
        long rsv_hpages;        /* Pages reserved against global pool to */
                                /* satisfy minimum size. */
};

/*
 * Reservation map: a kref-counted container holding a list of regions
 * together with the spinlock that sits next to it.
 */
struct resv_map {
        struct kref refs;
        spinlock_t lock;
        struct list_head regions;
        long adds_in_progress;
        /* NOTE(review): presumably spare file_region entries kept for reuse — confirm */
        struct list_head region_cache;
        long region_cache_count;
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On private mappings, the counter to uncharge reservations is stored
         * here. If these fields are 0, then either the mapping is shared, or
         * cgroup accounting is disabled for this resv_map.
         */
        struct page_counter *reservation_counter;
        unsigned long pages_per_hpage;
        struct cgroup_subsys_state *css;
#endif
};

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and protected
 * by a resv_map's lock.  The set of regions within the resv_map represent
 * reservations for huge pages, or huge pages that have already been
 * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of the region.
 *
 * For example, a file region structure with from == 0 and to == 4 represents
 * four huge pages in a mapping.  It is important to note that the to element
 * represents the first element past the end of the region. This is used in
 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
 *
 * Interval notation of the form [from, to) will be used to indicate that
 * the endpoint from is inclusive and to is exclusive.
 */
struct file_region {
        struct list_head link;
        long from;                /* first huge page index in the region (inclusive) */
        long to;                /* first huge page index past the region (exclusive) */
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On shared mappings, each reserved region appears as a struct
         * file_region in resv_map. These fields hold the info needed to
         * uncharge each reservation.
         */
        struct page_counter *reservation_counter;
        struct cgroup_subsys_state *css;
#endif
};

/* resv_map allocation; resv_map_release() takes the embedded kref. */
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);

extern spinlock_t hugetlb_lock;
extern int hugetlb_max_hstate __read_mostly;
/* Walk @h over hstates[0] .. hstates[hugetlb_max_hstate - 1]. */
#define for_each_hstate(h) \
        for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)

/* Subpool creation and release; defined outside this header. */
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                                long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);

/*
 * Core hugetlb interface available when CONFIG_HUGETLB_PAGE is enabled.
 * Everything below is declared here and defined outside this header; the
 * !CONFIG_HUGETLB_PAGE branch further down provides inline stubs for a
 * subset of these names.
 */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
/* sysctl handlers */
int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);

/* page table copy, lookup, unmap and fault handling */
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
                         struct page **, struct vm_area_struct **,
                         unsigned long *, unsigned long *, long, unsigned int,
                         int *);
void unmap_hugepage_range(struct vm_area_struct *,
                          unsigned long, unsigned long, struct page *);
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma,
                          unsigned long start, unsigned long end,
                          struct page *ref_page);
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                unsigned long start, unsigned long end,
                                struct page *ref_page);
/* reporting */
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(char *buf, int len, int nid);
void hugetlb_show_meminfo(void);
unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
                                struct vm_area_struct *dst_vma,
                                unsigned long dst_addr,
                                unsigned long src_addr,
                                struct page **pagep);
/* reservations over the huge page index range given by from/to, start/end */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                long freed);
int isolate_hugetlb(struct page *page, struct list_head *list);
void putback_active_hugepage(struct page *page);
void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
/* hugetlb_fault_mutex_hash() yields a u32 for a (mapping, idx) pair */
extern struct mutex *hugetlb_fault_mutex_table;
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);

pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);

struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);

extern int sysctl_hugetlb_shm_group;
extern struct list_head huge_boot_pages;

/*
 * arch callbacks
 *
 * Page table helpers for huge mappings; only the prototypes live here.
 */

pte_t *huge_pte_alloc(struct mm_struct *mm,
                        unsigned long addr, unsigned long sz);
pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz);
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long *addr, pte_t *ptep);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end);
/* follow_huge_*(): look up the struct page behind a huge mapping */
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
                              int write);
struct page *follow_huge_pd(struct vm_area_struct *vma,
                            unsigned long address, hugepd_t hpd,
                            int flags, int pdshift);
struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
                                 int flags);
struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
                                pud_t *pud, int flags);
struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
                             pgd_t *pgd, int flags);

int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot);

bool is_hugetlb_entry_migration(pte_t pte);

#else /* !CONFIG_HUGETLB_PAGE */

/* Empty stub for builds without CONFIG_HUGETLB_PAGE; @vma is not touched. */
static inline void
reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
        /* nothing to do */
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: the total is always reported as 0. */
static inline unsigned long hugetlb_total_pages(void)
{
        return 0UL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: no mapping is ever returned. */
static inline struct address_space *
hugetlb_page_mapping_lock_write(struct page *hpage)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns 0, arguments unused. */
static inline int
huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
                 unsigned long *addr, pte_t *ptep)
{
        return 0;
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds: *@start and *@end are left exactly
 * as the caller passed them.
 */
static inline void
adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                     unsigned long *start, unsigned long *end)
{
        /* nothing to do */
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds. Reaching it is treated as a bug:
 * it calls BUG(), and the trailing return only gives the function a value.
 */
static inline long
follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                    struct page **pages, struct vm_area_struct **vmas,
                    unsigned long *position, unsigned long *nr_pages,
                    long i, unsigned int flags, int *nonblocking)
{
        BUG();
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always yields ERR_PTR(-EINVAL). */
static inline struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        return ERR_PTR(-EINVAL);
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds. Calls BUG(); the return statement
 * after it only gives the function a value.
 */
static inline int
copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                        struct vm_area_struct *vma)
{
        BUG();
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: writes nothing to @m. */
static inline void hugetlb_report_meminfo(struct seq_file *m)
{
        /* nothing to report */
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds: @buf is left untouched and 0 is
 * returned.
 */
static inline int
hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: does nothing. */
static inline void hugetlb_show_meminfo(void)
{
        /* nothing to show */
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: no page is ever found. */
static inline struct page *
follow_huge_pd(struct vm_area_struct *vma, unsigned long address,
               hugepd_t hpd, int flags, int pdshift)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: no page is ever found. */
static inline struct page *
follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
                    int flags)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: no page is ever found. */
static inline struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int flags)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: no page is ever found. */
static inline struct page *
follow_huge_pgd(struct mm_struct *mm, unsigned long address,
                pgd_t *pgd, int flags)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: every range is rejected with -EINVAL. */
static inline int
prepare_hugepage_range(struct file *file, unsigned long addr,
                       unsigned long len)
{
        return -EINVAL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: a pmd is never reported as huge. */
static inline int
pmd_huge(pmd_t pmd)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: a pud is never reported as huge. */
static inline int
pud_huge(pud_t pud)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: no range is hugepage-only (returns 0). */
static inline int
is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
                       unsigned long len)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: unconditionally calls BUG(). */
static inline void
hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                       unsigned long end, unsigned long floor,
                       unsigned long ceiling)
{
        BUG();
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds. Calls BUG(); the return statement
 * after it only gives the function a value.
 */
static inline int
hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
                         struct vm_area_struct *dst_vma,
                         unsigned long dst_addr, unsigned long src_addr,
                         struct page **pagep)
{
        BUG();
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: there is never a huge pte to return. */
static inline pte_t *
huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: isolation always fails with -EBUSY. */
static inline int
isolate_hugetlb(struct page *page, struct list_head *list)
{
        return -EBUSY;
}

/* Empty stub for builds without CONFIG_HUGETLB_PAGE; @page is not touched. */
static inline void
putback_active_hugepage(struct page *page)
{
        /* nothing to do */
}

/* Empty stub for builds without CONFIG_HUGETLB_PAGE; no state is moved. */
static inline void
move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
{
        /* nothing to do */
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: changes nothing and returns 0. */
static inline unsigned long
hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address,
                          unsigned long end, pgprot_t newprot)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: unconditionally calls BUG(). */
static inline void
__unmap_hugepage_range_final(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long start, unsigned long end,
                             struct page *ref_page)
{
        BUG();
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: unconditionally calls BUG(). */
static inline void
__unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                       unsigned long start, unsigned long end,
                       struct page *ref_page)
{
        BUG();
}

/*
 * Stub for !CONFIG_HUGETLB_PAGE builds. Calls BUG(); the return statement
 * after it only gives the function a value.
 */
static inline vm_fault_t
hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
              unsigned long address, unsigned int flags)
{
        BUG();
        return 0;
}

#endif /* !CONFIG_HUGETLB_PAGE */
/*
 * Huge pages at the page global directory (pgd) and p4d levels. An
 * architecture that supports huge pages at the pgd level needs to define
 * pgd_huge itself; the fallbacks below evaluate to 0 when no definition
 * of pgd_huge / p4d_huge exists at this point.
 */
#ifndef pgd_huge
#define pgd_huge(x)        0
#endif
#ifndef p4d_huge
#define p4d_huge(x)        0
#endif

#ifndef pgd_write
/*
 * Fallback used when pgd_write has not been defined as a macro before this
 * point. Calls BUG(); the return statement only gives the function a value.
 */
static inline int
pgd_write(pgd_t pgd)
{
        BUG();
        return 0;
}
#endif

/* NOTE(review): presumably the name given to anonymous huge page files — confirm. */
#define HUGETLB_ANON_FILE "anon_hugepage"

/*
 * Kinds of hugetlb inode; they differ in whether shmfs accounting rules
 * apply to the file.
 */
enum {
        /*
         * The file will be used as an shm file so shmfs accounting rules
         * apply
         */
        HUGETLB_SHMFS_INODE     = 1,
        /*
         * The file is being created on the internal vfs mount and shmfs
         * accounting rules do not apply
         */
        HUGETLB_ANONHUGE_INODE  = 2,
};

#ifdef CONFIG_HUGETLBFS
/*
 * Per-superblock hugetlbfs data; HUGETLBFS_SB() reads it from
 * sb->s_fs_info.
 */
struct hugetlbfs_sb_info {
        long        max_inodes;   /* inodes allowed */
        long        free_inodes;  /* inodes free */
        spinlock_t        stat_lock;
        struct hstate *hstate;        /* returned by hstate_inode() for inodes on this sb */
        struct hugepage_subpool *spool;
        kuid_t        uid;
        kgid_t        gid;
        umode_t mode;
};

/* Return the hugetlbfs private data stored in @sb->s_fs_info. */
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
        struct hugetlbfs_sb_info *sbinfo = sb->s_fs_info;

        return sbinfo;
}

/*
 * hugetlbfs inode container. The VFS inode is embedded as vfs_inode;
 * HUGETLBFS_I() maps a pointer to that member back to this structure.
 */
struct hugetlbfs_inode_info {
        struct shared_policy policy;
        struct inode vfs_inode;
        unsigned int seals;
};

/*
 * Map @inode, which must be the vfs_inode member of a
 * struct hugetlbfs_inode_info, back to its containing structure.
 */
static inline struct hugetlbfs_inode_info *
HUGETLBFS_I(struct inode *inode)
{
        return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}

/* hugetlbfs operation tables and file creation; defined outside this header. */
extern const struct file_operations hugetlbfs_file_operations;
extern const struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
                                struct user_struct **user, int creat_flags,
                                int page_size_log);

/*
 * True when @file uses hugetlbfs_file_operations directly; otherwise the
 * answer comes from is_file_shm_hugepages().
 */
static inline bool is_file_hugepages(struct file *file)
{
        return file->f_op == &hugetlbfs_file_operations ||
               is_file_shm_hugepages(file);
}

/* Return the hstate recorded in the hugetlbfs superblock info of @i->i_sb. */
static inline struct hstate *
hstate_inode(struct inode *i)
{
        return HUGETLBFS_SB(i->i_sb)->hstate;
}
#else /* !CONFIG_HUGETLBFS */

/* Stub for !CONFIG_HUGETLBFS builds: evaluates to false without using @file. */
#define is_file_hugepages(file)                        false
/* Stub for !CONFIG_HUGETLBFS builds: always fails with ERR_PTR(-ENOSYS). */
static inline struct file *
hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
                   struct user_struct **user, int creat_flags,
                   int page_size_log)
{
        return ERR_PTR(-ENOSYS);
}

/* Stub for !CONFIG_HUGETLBFS builds: no inode has an hstate. */
static inline struct hstate *
hstate_inode(struct inode *i)
{
        return NULL;
}
#endif /* !CONFIG_HUGETLBFS */

/* Only declared when HAVE_ARCH_HUGETLB_UNMAPPED_AREA is defined. */
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags);
#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */

#ifdef CONFIG_HUGETLB_PAGE

/* Size of the hstate name buffer, in bytes. */
#define HSTATE_NAME_LEN 32
/* Defines one hugetlb page size */
struct hstate {
        int next_nid_to_alloc;
        int next_nid_to_free;
        unsigned int order;        /* huge page size is PAGE_SIZE << order */
        unsigned long mask;        /* value handed out by huge_page_mask() */
        unsigned long max_huge_pages;
        unsigned long nr_huge_pages;
        unsigned long free_huge_pages;
        unsigned long resv_huge_pages;
        unsigned long surplus_huge_pages;
        unsigned long nr_overcommit_huge_pages;
        struct list_head hugepage_activelist;
        /* the arrays below have one slot per node id (MAX_NUMNODES entries) */
        struct list_head hugepage_freelists[MAX_NUMNODES];
        unsigned int nr_huge_pages_node[MAX_NUMNODES];
        unsigned int free_huge_pages_node[MAX_NUMNODES];
        unsigned int surplus_huge_pages_node[MAX_NUMNODES];
#ifdef CONFIG_CGROUP_HUGETLB
        /* cgroup control files */
        struct cftype cgroup_files_dfl[7];
        struct cftype cgroup_files_legacy[9];
#endif
        char name[HSTATE_NAME_LEN];
};

/*
 * List node pairing an entry with the hstate it belongs to.
 * NOTE(review): presumably linked on huge_boot_pages during boot — confirm.
 */
struct huge_bootmem_page {
        struct list_head list;
        struct hstate *hstate;
};

/* Huge page allocation entry points; defined outside this header. */
struct page *alloc_huge_page(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                                nodemask_t *nmask, gfp_t gfp_mask);
struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
                                unsigned long address);
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
                        pgoff_t idx);

/* arch callback */
int __init __alloc_bootmem_huge_page(struct hstate *h);
int __init alloc_bootmem_huge_page(struct hstate *h);

/* hstate registration and lookup by size */
void __init hugetlb_add_hstate(unsigned order);
bool __init arch_hugetlb_valid_size(unsigned long size);
struct hstate *size_to_hstate(unsigned long size);

/* Number of hstates; defaults to 1 unless defined before this point. */
#ifndef HUGE_MAX_HSTATE
#define HUGE_MAX_HSTATE 1
#endif

extern struct hstate hstates[HUGE_MAX_HSTATE];
extern unsigned int default_hstate_idx;

/* The hstates[] entry selected by default_hstate_idx. */
#define default_hstate (hstates[default_hstate_idx])

/* Return the hstate of the inode behind @f (via file_inode()). */
static inline struct hstate *
hstate_file(struct file *f)
{
        return hstate_inode(file_inode(f));
}

/*
 * Translate a log2 page size into an hstate.
 *
 * @page_size_log == 0 selects default_hstate. Values of BITS_PER_LONG or
 * more yield NULL. Anything else is looked up with size_to_hstate().
 *
 * NOTE(review): a negative @page_size_log would reach the shift below;
 * presumably callers never pass one — confirm.
 */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
        if (page_size_log == 0)
                return &default_hstate;

        if (page_size_log >= BITS_PER_LONG)
                return NULL;

        return size_to_hstate(1UL << page_size_log);
}

/* Return the hstate of the file backing @vma (vma->vm_file). */
static inline struct hstate *
hstate_vma(struct vm_area_struct *vma)
{
        return hstate_file(vma->vm_file);
}

/* Size in bytes of one huge page of @h: PAGE_SIZE shifted left by the order. */
static inline unsigned long huge_page_size(struct hstate *h)
{
        const unsigned long base = (unsigned long)PAGE_SIZE;

        return base << h->order;
}

/* Page size queries for a vma; defined outside this header. */
extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma);

extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);

/* Accessor for the mask stored in @h. */
static inline unsigned long
huge_page_mask(struct hstate *h)
{
        return h->mask;
}

/* Accessor for the order stored in @h. */
static inline unsigned int
huge_page_order(struct hstate *h)
{
        return h->order;
}

/* Shift of a huge page of @h: its order on top of PAGE_SHIFT. */
static inline unsigned huge_page_shift(struct hstate *h)
{
        return PAGE_SHIFT + h->order;
}

/* An hstate counts as gigantic once its order reaches MAX_ORDER. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
        return !(huge_page_order(h) < MAX_ORDER);
}

/* Number of base pages in one huge page of @h, i.e. 2^order. */
static inline unsigned int
pages_per_huge_page(struct hstate *h)
{
        return 1 << h->order;
}

/* Number of 512-byte blocks covered by one huge page of @h. */
static inline unsigned int blocks_per_huge_page(struct hstate *h)
{
        const unsigned long bytes = huge_page_size(h);

        return bytes / 512;
}

#include <asm/hugetlb.h>

#ifndef is_hugepage_only_range
/*
 * Generic fallback, used when is_hugepage_only_range was not defined as a
 * macro earlier: no range is hugepage-only, so 0 is returned.
 */
static inline int
is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
                       unsigned long len)
{
        return 0;
}
#define is_hugepage_only_range is_hugepage_only_range
#endif

#ifndef arch_clear_hugepage_flags
/* Generic fallback: does nothing with @page. */
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
#define arch_clear_hugepage_flags arch_clear_hugepage_flags
#endif

#ifndef arch_make_huge_pte
/* Generic fallback: hands @entry back unchanged; other arguments are unused. */
static inline pte_t
arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
                   struct page *page, int writable)
{
        return entry;
}
#endif

/*
 * Look up the hstate matching the size of @page. VM_BUG_ON_PAGE() checks
 * that @page is a huge page (PageHuge()) before the lookup.
 */
static inline struct hstate *
page_hstate(struct page *page)
{
        VM_BUG_ON_PAGE(!PageHuge(page), page);

        return size_to_hstate(page_size(page));
}

/* Page shift of hstates[@index]: that entry's order on top of PAGE_SHIFT. */
static inline unsigned hstate_index_to_shift(unsigned index)
{
        return PAGE_SHIFT + hstates[index].order;
}

/* Position of @h within the hstates[] array. */
static inline int hstate_index(struct hstate *h)
{
        return (int)(h - hstates);
}

/* Dissolving free huge pages (single page / pfn range); defined elsewhere. */
extern int dissolve_free_huge_page(struct page *page);
extern int dissolve_free_huge_pages(unsigned long start_pfn,
                                    unsigned long end_pfn);

#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
#ifndef arch_hugetlb_migration_supported
/*
 * Default policy: migration is supported only for huge pages whose shift
 * equals PMD_SHIFT, PUD_SHIFT or PGDIR_SHIFT.
 */
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        const unsigned int shift = huge_page_shift(h);

        return shift == PMD_SHIFT ||
               shift == PUD_SHIFT ||
               shift == PGDIR_SHIFT;
}
#endif
#else
/* Without CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION nothing is migratable. */
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        return false;
}
#endif

/* Migration support for @h is whatever arch_hugetlb_migration_supported() says. */
static inline bool
hugepage_migration_supported(struct hstate *h)
{
        return arch_hugetlb_migration_supported(h);
}

/*
 * The movability check is different from the migration check.
 * It determines whether or not a huge page should be placed in the
 * movable zone. Movability of a huge page should be required only if
 * its size is supported for migration: there won't be any reason for
 * the huge page to be movable if it is not migratable to start with.
 * Also, the size of the huge page should be large enough to be placed
 * under a movable zone and still feasible enough to be migratable.
 * Just the presence in the movable zone does not make the migration
 * feasible.
 *
 * So even though large huge page sizes like the gigantic ones are
 * migratable, they should not be movable because it's not feasible to
 * migrate them from the movable zone.
 */
static inline bool hugepage_movable_supported(struct hstate *h)
{
        return hugepage_migration_supported(h) && !hstate_is_gigantic(h);
}

/*
 * Movability of hugepages depends on migration support: movable hstates
 * allocate with GFP_HIGHUSER_MOVABLE, all others with GFP_HIGHUSER.
 */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
        return hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE
                                             : GFP_HIGHUSER;
}

/*
 * Build the allocation mask for @h, carrying over from @gfp_mask only the
 * __GFP_THISNODE bit (some callers might want to enforce the node) and the
 * __GFP_NOWARN bit.
 */
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        return htlb_alloc_mask(h) |
               (gfp_mask & __GFP_THISNODE) |
               (gfp_mask & __GFP_NOWARN);
}

/*
 * Pick the lock for a huge pte of @h. PMD_SIZE huge pages use the lock
 * from pmd_lockptr(); every other size uses mm->page_table_lock. A huge
 * page size equal to PAGE_SIZE trips VM_BUG_ON().
 */
static inline spinlock_t *
huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte)
{
        const unsigned long sz = huge_page_size(h);

        if (sz == PMD_SIZE)
                return pmd_lockptr(mm, (pmd_t *) pte);

        VM_BUG_ON(sz == PAGE_SIZE);
        return &mm->page_table_lock;
}

#ifndef hugepages_supported
/*
 * Some platforms decide whether they support huge pages at boot
 * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0
 * when there is no such support.
 */
#define hugepages_supported() (HPAGE_SHIFT != 0)
#endif

/* Defined outside this header. */
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);

/* Reset the hugetlb_usage counter of @mm to zero. */
static inline void
hugetlb_count_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->hugetlb_usage, 0);
}

/* Atomically add @l to the hugetlb_usage counter of @mm. */
static inline void
hugetlb_count_add(long l, struct mm_struct *mm)
{
        atomic_long_add(l, &mm->hugetlb_usage);
}

/* Atomically subtract @l from the hugetlb_usage counter of @mm. */
static inline void
hugetlb_count_sub(long l, struct mm_struct *mm)
{
        atomic_long_sub(l, &mm->hugetlb_usage);
}

#ifndef set_huge_swap_pte_at
/*
 * Generic fallback: forwards to set_huge_pte_at(); @sz is not used here.
 */
static inline void
set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte, unsigned long sz)
{
        set_huge_pte_at(mm, addr, ptep, pte);
}
#endif

#ifndef huge_ptep_modify_prot_start
#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
/*
 * Generic fallback: returns the result of huge_ptep_get_and_clear() on the
 * entry at @ptep, using the mm of @vma.
 */
static inline pte_t
huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
                            pte_t *ptep)
{
        return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
#endif

#ifndef huge_ptep_modify_prot_commit
#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
/*
 * Generic fallback: installs @pte at @ptep with set_huge_pte_at(), using
 * the mm of @vma. @old_pte is not used here.
 */
static inline void
huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
                             pte_t *ptep, pte_t old_pte, pte_t pte)
{
        set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
}
#endif

/* Defined outside this header. */
void set_page_huge_active(struct page *page);

#else        /* CONFIG_HUGETLB_PAGE */
/*
 * Empty placeholder type for !CONFIG_HUGETLB_PAGE builds; the stubs below
 * only take or return pointers to it.
 */
struct hstate {};

/* Stub for !CONFIG_HUGETLB_PAGE builds: allocation never succeeds (NULL). */
static inline struct page *
alloc_huge_page(struct vm_area_struct *vma, unsigned long addr,
                int avoid_reserve)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: allocation never succeeds (NULL). */
static inline struct page *
alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                         nodemask_t *nmask, gfp_t gfp_mask)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: allocation never succeeds (NULL). */
static inline struct page *
alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
                    unsigned long address)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always returns 0. */
static inline int
__alloc_bootmem_huge_page(struct hstate *h)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: a file never has an hstate. */
static inline struct hstate *
hstate_file(struct file *f)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: no size maps to an hstate. */
static inline struct hstate *
hstate_sizelog(int page_size_log)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: a vma never has an hstate. */
static inline struct hstate *
hstate_vma(struct vm_area_struct *vma)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: a page never has an hstate. */
static inline struct hstate *
page_hstate(struct page *page)
{
        return NULL;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: the "huge" page size is PAGE_SIZE. */
static inline unsigned long
huge_page_size(struct hstate *h)
{
        return PAGE_SIZE;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: the mask is plain PAGE_MASK. */
static inline unsigned long
huge_page_mask(struct hstate *h)
{
        return PAGE_MASK;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: every vma reports PAGE_SIZE. */
static inline unsigned long
vma_kernel_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: every vma reports PAGE_SIZE. */
static inline unsigned long
vma_mmu_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: the order is always 0. */
static inline unsigned int
huge_page_order(struct hstate *h)
{
        return 0U;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: the shift is plain PAGE_SHIFT. */
static inline unsigned int
huge_page_shift(struct hstate *h)
{
        return PAGE_SHIFT;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: nothing is gigantic. */
static inline bool
hstate_is_gigantic(struct hstate *h)
{
        return false;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: one base page per "huge" page. */
static inline unsigned int
pages_per_huge_page(struct hstate *h)
{
        return 1U;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: every index maps to shift 0. */
static inline unsigned
hstate_index_to_shift(unsigned index)
{
        return 0U;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: the index is always 0. */
static inline int
hstate_index(struct hstate *h)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: nothing to dissolve, returns 0. */
static inline int
dissolve_free_huge_page(struct page *page)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: nothing to dissolve, returns 0. */
static inline int
dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: migration is never supported. */
static inline bool
hugepage_migration_supported(struct hstate *h)
{
        return false;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: nothing is movable. */
static inline bool
hugepage_movable_supported(struct hstate *h)
{
        return false;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: the allocation mask is 0. */
static inline gfp_t
htlb_alloc_mask(struct hstate *h)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: returns 0, ignoring @gfp_mask. */
static inline gfp_t
htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        return 0;
}

/* Stub for !CONFIG_HUGETLB_PAGE builds: always mm->page_table_lock. */
static inline spinlock_t *
huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte)
{
        return &mm->page_table_lock;
}

/* Empty stub for builds without CONFIG_HUGETLB_PAGE; @mm is not touched. */
static inline void
hugetlb_count_init(struct mm_struct *mm)
{
        /* nothing to do */
}

/* Empty stub for builds without CONFIG_HUGETLB_PAGE; nothing goes to @f. */
static inline void
hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
{
        /* nothing to report */
}

/* Empty stub for builds without CONFIG_HUGETLB_PAGE; @mm is not touched. */
static inline void
hugetlb_count_sub(long l, struct mm_struct *mm)
{
        /* nothing to do */
}

/* Empty stub for builds without CONFIG_HUGETLB_PAGE; no pte is written. */
static inline void
set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte, unsigned long sz)
{
        /* nothing to do */
}
#endif        /* CONFIG_HUGETLB_PAGE */

/*
 * Take the lock chosen by huge_pte_lockptr() for (@h, @mm, @pte) and
 * return it; the caller holds the lock on return.
 */
static inline spinlock_t *
huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte)
{
        spinlock_t *ptl = huge_pte_lockptr(h, mm, pte);

        spin_lock(ptl);
        return ptl;
}

/*
 * CMA hooks: real implementations are declared only when both
 * CONFIG_HUGETLB_PAGE and CONFIG_CMA are set; otherwise they are empty
 * inline stubs.
 */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
extern void __init hugetlb_cma_check(void);
#else
static inline __init void hugetlb_cma_reserve(int order)
{
}
static inline __init void hugetlb_cma_check(void)
{
}
#endif

/*
 * With CONFIG_ARCH_WANT_HUGE_PMD_SHARE: true when the page that holds
 * @pte has a page_count() above 1. Without it: always false.
 */
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
        return page_count(virt_to_page(pte)) > 1;
#else
        return false;
#endif
}

#endif /* _LINUX_HUGETLB_H */

































































































































































































































































































































































    1 
    1 










    1 








    1 
    1 










    1 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
// SPDX-License-Identifier: GPL-2.0-only
/*
 * lib/bitmap.c
 * Helper functions for bitmap.h.
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>

#include <asm/page.h>

#include "kstrtox.h"

/**
 * DOC: bitmap introduction
 *
 * bitmaps provide an array of bits, implemented using an
 * array of unsigned longs.  The number of valid bits in a
 * given bitmap does _not_ need to be an exact multiple of
 * BITS_PER_LONG.
 *
 * The possible unused bits in the last, partially used word
 * of a bitmap are 'don't care'.  The implementation makes
 * no particular effort to keep them zero.  It ensures that
 * their value will not affect the results of any operation.
 * The bitmap operations that return Boolean (bitmap_empty,
 * for example) or scalar (bitmap_weight, for example) results
 * carefully filter out these unused bits from impacting their
 * results.
 *
 * The byte ordering of bitmaps is more natural on little
 * endian architectures.  See the big-endian headers
 * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h
 * for the best explanations of this ordering.
 */

/*
 * Return 1 when the first @bits bits of @bitmap1 and @bitmap2 are identical,
 * 0 otherwise.  Bits of the last word that lie beyond @bits are ignored.
 */
int __bitmap_equal(const unsigned long *bitmap1,
                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx;

        /* Fully used words have to match exactly. */
        for (idx = 0; idx < full_words; idx++) {
                if (bitmap1[idx] != bitmap2[idx])
                        return 0;
        }

        /* No partially used trailing word: nothing left to compare. */
        if (!(bits % BITS_PER_LONG))
                return 1;

        /* Compare the trailing word with the unused bits masked off. */
        return !((bitmap1[idx] ^ bitmap2[idx]) & BITMAP_LAST_WORD_MASK(bits));
}
EXPORT_SYMBOL(__bitmap_equal);

/*
 * Return true when (@bitmap1 | @bitmap2) equals @bitmap3 over the first
 * @bits bits.  Bits of the last word that lie beyond @bits are ignored.
 */
bool __bitmap_or_equal(const unsigned long *bitmap1,
                       const unsigned long *bitmap2,
                       const unsigned long *bitmap3,
                       unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx = 0;

        /* Fully used words: the OR has to match word for word. */
        while (idx < full_words) {
                if ((bitmap1[idx] | bitmap2[idx]) != bitmap3[idx])
                        return false;
                idx++;
        }

        /* Partially used trailing word: only the valid bits may differ. */
        if (bits % BITS_PER_LONG) {
                unsigned long diff = (bitmap1[idx] | bitmap2[idx]) ^ bitmap3[idx];

                if (diff & BITMAP_LAST_WORD_MASK(bits))
                        return false;
        }

        return true;
}

/*
 * Store the bitwise complement of @src in @dst, one word at a time, for all
 * BITS_TO_LONGS(@bits) words (the unused bits of the last word included).
 */
void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits)
{
        const unsigned int nwords = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < nwords) {
                dst[idx] = ~src[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_complement);

/**
 * __bitmap_shift_right - logical right shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting right (dividing) means moving bits in the MS -> LS bit
 * direction.  Zeros are fed into the vacated MS positions and the
 * LS bits shifted off the bottom are lost.
 *
 * A @shift whose whole-word part covers every word of the bitmap clears
 * all BITS_TO_LONGS(@nbits) words of @dst.
 */
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                        unsigned shift, unsigned nbits)
{
        unsigned k, lim = BITS_TO_LONGS(nbits);
        unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
        unsigned long mask = BITMAP_LAST_WORD_MASK(nbits);

        /*
         * Nothing of @src survives once the word offset reaches the word
         * count.  Handle that here: with off > lim the trailing memset()
         * would index &dst[lim - off] with a wrapped unsigned value and
         * write outside @dst.
         */
        if (off >= lim) {
                memset(dst, 0, lim * sizeof(unsigned long));
                return;
        }

        for (k = 0; off + k < lim; ++k) {
                unsigned long upper, lower;

                /*
                 * If shift is not word aligned, take lower rem bits of
                 * word above and make them the top rem bits of result.
                 */
                if (!rem || off + k + 1 >= lim)
                        upper = 0;
                else {
                        upper = src[off + k + 1];
                        /* Unused bits of the last source word must not leak in. */
                        if (off + k + 1 == lim - 1)
                                upper &= mask;
                        upper <<= (BITS_PER_LONG - rem);
                }
                lower = src[off + k];
                if (off + k == lim - 1)
                        lower &= mask;
                lower >>= rem;
                dst[k] = lower | upper;
        }

        /* The top @off words received no source bits: zero them. */
        if (off)
                memset(&dst[lim - off], 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_right);


/**
 * __bitmap_shift_left - logical left shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting left (multiplying) means moving bits in the LS -> MS
 * direction.  Zeros are fed into the vacated LS bit positions
 * and those MS bits shifted off the top are lost.
 *
 * A @shift whose whole-word part covers every word of the bitmap clears
 * all BITS_TO_LONGS(@nbits) words of @dst.
 */
void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                        unsigned int shift, unsigned int nbits)
{
        int k;
        unsigned int lim = BITS_TO_LONGS(nbits);
        unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;

        /*
         * Nothing of @src survives once the word offset reaches the word
         * count.  Handle that here: with off > lim the trailing memset()
         * of @off words would run past the end of @dst.
         */
        if (off >= lim) {
                memset(dst, 0, lim * sizeof(unsigned long));
                return;
        }

        /* Walk from the top word down so an in-place shift stays correct. */
        for (k = lim - off - 1; k >= 0; --k) {
                unsigned long upper, lower;

                /*
                 * If shift is not word aligned, take upper rem bits of
                 * word below and make them the bottom rem bits of result.
                 */
                if (rem && k > 0)
                        lower = src[k - 1] >> (BITS_PER_LONG - rem);
                else
                        lower = 0;
                upper = src[k] << rem;
                dst[k + off] = lower | upper;
        }

        /* The bottom @off words received no source bits: zero them. */
        if (off)
                memset(dst, 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_left);

/**
 * bitmap_cut() - remove bit region from bitmap and right shift remaining bits
 * @dst: destination bitmap, might overlap with src
 * @src: source bitmap
 * @first: start bit of region to be removed
 * @cut: number of bits to remove
 * @nbits: bitmap size, in bits
 *
 * Set the n-th bit of @dst iff the n-th bit of @src is set and
 * n is less than @first, or the m-th bit of @src is set for any
 * m such that @first <= n < nbits, and m = n + @cut.
 *
 * In pictures, example for a big-endian 32-bit architecture:
 *
 * The @src bitmap is::
 *
 *   31                                   63
 *   |                                    |
 *   10000000 11000001 11110010 00010101  10000000 11000001 01110010 00010101
 *                   |  |              |                                    |
 *                  16  14             0                                   32
 *
 * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is::
 *
 *   31                                   63
 *   |                                    |
 *   10110000 00011000 00110010 00010101  00010000 00011000 00101110 01000010
 *                      |              |                                    |
 *                      14 (bit 17     0                                   32
 *                          from @src)
 *
 * Note that @dst and @src might overlap partially or entirely.
 *
 * This is implemented in the obvious way, with a shift and carry
 * step for each moved bit. Optimisation is left as an exercise
 * for the compiler.
 */
void bitmap_cut(unsigned long *dst, const unsigned long *src,
                unsigned int first, unsigned int cut, unsigned int nbits)
{
        const unsigned int first_word = first / BITS_PER_LONG;
        const unsigned int first_bit = first % BITS_PER_LONG;
        unsigned int len = BITS_TO_LONGS(nbits);
        unsigned long keep = 0, carry;
        int i;

        /*
         * Bits below @first inside its word must stay where they are.
         * Save them before @dst is written, since @dst may overlap @src.
         */
        if (first_bit)
                keep = src[first_word] & (~0UL >> (BITS_PER_LONG - first_bit));

        memmove(dst, src, len * sizeof(*dst));

        /*
         * For every bit to remove, shift all words from @first's word
         * upwards right by one, carrying bit 0 of the next word into the
         * top bit of the current one.
         */
        for (; cut; cut--) {
                for (i = first_word; i < len; i++) {
                        carry = (i < len - 1) ? dst[i + 1] & 1UL : 0;
                        dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1));
                }
        }

        /* Put the untouched low bits of the first word back. */
        dst[first_word] &= ~0UL << first_bit;
        dst[first_word] |= keep;
}
EXPORT_SYMBOL(bitmap_cut);

/*
 * @dst = @bitmap1 & @bitmap2 over @bits bits.  A partially used last word is
 * stored with its unused bits cleared.  Returns 1 if any bit ended up set in
 * the result, 0 otherwise.
 */
int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long any = 0;
        unsigned long word;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                word = bitmap1[idx] & bitmap2[idx];
                dst[idx] = word;
                any |= word;
        }

        if (bits % BITS_PER_LONG) {
                word = bitmap1[idx] & bitmap2[idx] & BITMAP_LAST_WORD_MASK(bits);
                dst[idx] = word;
                any |= word;
        }

        return any != 0;
}
EXPORT_SYMBOL(__bitmap_and);

/*
 * @dst = @bitmap1 | @bitmap2, computed word by word over all
 * BITS_TO_LONGS(@bits) words.
 */
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int nwords = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < nwords) {
                dst[idx] = bitmap1[idx] | bitmap2[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_or);

/*
 * @dst = @bitmap1 ^ @bitmap2, computed word by word over all
 * BITS_TO_LONGS(@bits) words.
 */
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int nwords = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < nwords) {
                dst[idx] = bitmap1[idx] ^ bitmap2[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_xor);

/*
 * @dst = @bitmap1 & ~@bitmap2 over @bits bits.  A partially used last word is
 * stored with its unused bits cleared.  Returns 1 if any bit ended up set in
 * the result, 0 otherwise.
 */
int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long any = 0;
        unsigned long word;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                word = bitmap1[idx] & ~bitmap2[idx];
                dst[idx] = word;
                any |= word;
        }

        if (bits % BITS_PER_LONG) {
                word = bitmap1[idx] & ~bitmap2[idx] & BITMAP_LAST_WORD_MASK(bits);
                dst[idx] = word;
                any |= word;
        }

        return any != 0;
}
EXPORT_SYMBOL(__bitmap_andnot);

/*
 * Build @dst by taking each bit from @new where @mask is set and from @old
 * where @mask is clear, over all BITS_TO_LONGS(@nbits) words.
 */
void __bitmap_replace(unsigned long *dst,
                      const unsigned long *old, const unsigned long *new,
                      const unsigned long *mask, unsigned int nbits)
{
        const unsigned int nwords = BITS_TO_LONGS(nbits);
        unsigned int idx = 0;

        while (idx < nwords) {
                const unsigned long kept = old[idx] & ~mask[idx];
                const unsigned long taken = new[idx] & mask[idx];

                dst[idx] = kept | taken;
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_replace);

/*
 * Return 1 when @bitmap1 and @bitmap2 share at least one set bit within the
 * first @bits bits, 0 otherwise.
 */
int __bitmap_intersects(const unsigned long *bitmap1,
                        const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                if (bitmap1[idx] & bitmap2[idx])
                        return 1;
        }

        /* No partially used trailing word: nothing in common was found. */
        if (!(bits % BITS_PER_LONG))
                return 0;

        /* Only the valid bits of the trailing word count. */
        return ((bitmap1[idx] & bitmap2[idx]) & BITMAP_LAST_WORD_MASK(bits)) != 0;
}
EXPORT_SYMBOL(__bitmap_intersects);

/*
 * Return 1 when every bit set in @bitmap1 (within the first @bits bits) is
 * also set in @bitmap2, 0 otherwise.
 */
int __bitmap_subset(const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                if (bitmap1[idx] & ~bitmap2[idx])
                        return 0;
        }

        /* No partially used trailing word: no stray bit was found. */
        if (!(bits % BITS_PER_LONG))
                return 1;

        /* Only the valid bits of the trailing word count. */
        return !((bitmap1[idx] & ~bitmap2[idx]) & BITMAP_LAST_WORD_MASK(bits));
}
EXPORT_SYMBOL(__bitmap_subset);

/*
 * Sum hweight_long() over the first @bits bits of @bitmap; the unused bits
 * of a partially used last word are masked off before counting.
 */
int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx = 0;
        int total = 0;

        while (idx < full_words) {
                total += hweight_long(bitmap[idx]);
                idx++;
        }

        if (bits % BITS_PER_LONG)
                total += hweight_long(bitmap[idx] & BITMAP_LAST_WORD_MASK(bits));

        return total;
}
EXPORT_SYMBOL(__bitmap_weight);

/*
 * Set @len consecutive bits of @map, beginning at bit @start.  The range is
 * handled one word at a time: a possibly partial first word, then whole
 * words, then a possibly partial last word.
 */
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
        const unsigned int size = start + len;
        unsigned long *word = map + BIT_WORD(start);
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        /* Bits from @start up to the end of the word containing it. */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        /* As long as the range reaches the end of the current word. */
        for (; len - chunk >= 0; word++) {
                *word |= mask;
                len -= chunk;
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Remainder ends inside the current word: trim the mask at @size. */
        if (len)
                *word |= mask & BITMAP_LAST_WORD_MASK(size);
}
EXPORT_SYMBOL(__bitmap_set);

/*
 * Clear @len consecutive bits of @map, beginning at bit @start.  The range
 * is handled one word at a time: a possibly partial first word, then whole
 * words, then a possibly partial last word.
 */
void __bitmap_clear(unsigned long *map, unsigned int start, int len)
{
        const unsigned int size = start + len;
        unsigned long *word = map + BIT_WORD(start);
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        /* Bits from @start up to the end of the word containing it. */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        /* As long as the range reaches the end of the current word. */
        for (; len - chunk >= 0; word++) {
                *word &= ~mask;
                len -= chunk;
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Remainder ends inside the current word: trim the mask at @size. */
        if (len) {
                mask &= BITMAP_LAST_WORD_MASK(size);
                *word &= ~mask;
        }
}
EXPORT_SYMBOL(__bitmap_clear);

/**
 * bitmap_find_next_zero_area_off - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 * @align_offset: Alignment offset for zero area.
 *
 * The @align_mask should be one less than a power of 2; the effect is that
 * the bit offset of all zero areas this function finds plus @align_offset
 * is multiple of that power of 2.
 */
unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                             unsigned long size,
                                             unsigned long start,
                                             unsigned int nr,
                                             unsigned long align_mask,
                                             unsigned long align_offset)
{
        unsigned long index, end, i;

        for (;;) {
                /* Candidate: next clear bit, rounded up to the alignment. */
                index = find_next_zero_bit(map, size, start);
                index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;

                /* Area would run past the bitmap: report failure as a value > @size. */
                end = index + nr;
                if (end > size)
                        return end;

                /* No set bit inside [index, end): the area is free. */
                i = find_next_bit(map, end, index);
                if (i >= end)
                        return index;

                /* Resume the search just past the set bit that got in the way. */
                start = i + 1;
        }
}
EXPORT_SYMBOL(bitmap_find_next_zero_area_off);

/*
 * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers,
 * second version by Paul Jackson, third by Joe Korty.
 */

/**
 * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap
 *
 * @ubuf: pointer to user buffer containing string.
 * @ulen: buffer size in bytes.  If string is smaller than this
 *    then it must be terminated with a \0.
 * @maskp: pointer to bitmap array that will contain result.
 * @nmaskbits: size of bitmap, in bits.
 */
int bitmap_parse_user(const char __user *ubuf,
                        unsigned int ulen, unsigned long *maskp,
                        int nmaskbits)
{
        int ret;
        /* Work on a kernel copy of the user buffer made by memdup_user_nul(). */
        char *kbuf = memdup_user_nul(ubuf, ulen);

        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /* UINT_MAX: let bitmap_parse() rely on the string terminator. */
        ret = bitmap_parse(kbuf, UINT_MAX, maskp, nmaskbits);
        kfree(kbuf);

        return ret;
}
EXPORT_SYMBOL(bitmap_parse_user);

/**
 * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string
 * @list: indicates whether the bitmap must be printed in list format
 * @buf: page aligned buffer into which string is placed
 * @maskp: pointer to bitmap to convert
 * @nmaskbits: size of bitmap, in bits
 *
 * Output format is a comma-separated list of decimal numbers and
 * ranges if list is specified or hex digits grouped into comma-separated
 * sets of 8 digits/set. Returns the number of characters written to buf.
 *
 * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned
 * area and that sufficient storage remains at @buf to accommodate the
 * bitmap_print_to_pagebuf() output. Returns the number of characters
 * actually printed to @buf, excluding terminating '\0'.
 */
int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
                            int nmaskbits)
{
        /* Room left between @buf and the end of the page it points into. */
        ptrdiff_t room = PAGE_SIZE - offset_in_page(buf);

        if (list)
                return scnprintf(buf, room, "%*pbl\n", nmaskbits, maskp);

        return scnprintf(buf, room, "%*pb\n", nmaskbits, maskp);
}
EXPORT_SYMBOL(bitmap_print_to_pagebuf);

/*
 * Region 9-38:4/10 describes the following bitmap structure:
 * 0           9  12    18                        38
 * .........****......****......****......
 *            ^  ^     ^                         ^
 *      start  off   group_len               end
 */
struct region {
        unsigned int start;
        unsigned int off;
        unsigned int group_len;
        unsigned int end;
};

/*
 * Set the bits described by @r in @bitmap: starting at r->start and stepping
 * by r->group_len, set up to r->off bits per group without going past
 * r->end.  Returns -ERANGE if r->end does not fit into @nbits, 0 otherwise.
 */
static int bitmap_set_region(const struct region *r,
                                unsigned long *bitmap, int nbits)
{
        unsigned int pos = r->start;

        if (r->end >= nbits)
                return -ERANGE;

        while (pos <= r->end) {
                /* The last group may be cut short by the end of the range. */
                bitmap_set(bitmap, pos, min(r->end - pos + 1, r->off));
                pos += r->group_len;
        }

        return 0;
}

/*
 * Sanity-check a parsed region.  Returns -EINVAL when the range is reversed,
 * the group length is zero, or more bits are requested per group than a
 * group holds; 0 otherwise.
 */
static int bitmap_check_region(const struct region *r)
{
        if (r->start > r->end)
                return -EINVAL;

        if (r->group_len == 0)
                return -EINVAL;

        if (r->off > r->group_len)
                return -EINVAL;

        return 0;
}

/*
 * Parse a decimal number at @str into @num.  Returns a pointer just past the
 * consumed characters, ERR_PTR(-EINVAL) if nothing was consumed, or
 * ERR_PTR(-EOVERFLOW) if the value does not fit into an unsigned int.
 */
static const char *bitmap_getnum(const char *str, unsigned int *num)
{
        unsigned long long value;
        unsigned int consumed = _parse_integer(str, 10, &value);

        if (consumed == 0)
                return ERR_PTR(-EINVAL);

        /* Overflow flagged by the parser itself... */
        if (consumed & KSTRTOX_OVERFLOW)
                return ERR_PTR(-EOVERFLOW);

        /* ...or a value that would be truncated by the narrower result type. */
        if (value != (unsigned int)value)
                return ERR_PTR(-EOVERFLOW);

        *num = value;

        return str + consumed;
}

/* True when @c terminates the input string: NUL or newline. */
static inline bool end_of_str(char c)
{
        switch (c) {
        case '\0':
        case '\n':
                return true;
        default:
                return false;
        }
}

/* True when @c separates two regions: a comma or any whitespace character. */
static inline bool __end_of_region(char c)
{
        if (c == ',')
                return true;

        return isspace(c) != 0;
}

/* True when @c ends a region: a region separator or the end of the string. */
static inline bool end_of_region(char c)
{
        if (__end_of_region(c))
                return true;

        return end_of_str(c);
}

/*
 * The format allows commas and whitespaces at the beginning
 * of the region.
 */
static const char *bitmap_find_region(const char *str)
{
        /* Step over any leading commas and whitespace. */
        for (; __end_of_region(*str); str++)
                ;

        /* Nothing but separators was left: there is no further region. */
        if (end_of_str(*str))
                return NULL;

        return str;
}

/*
 * Walk backwards from @end, skipping commas and whitespace, and return the
 * position of the first other character found.  The result is @start - 1
 * when only separators lie between @start and @end.
 */
static const char *bitmap_find_region_reverse(const char *start, const char *end)
{
        for (; start <= end; end--) {
                if (!__end_of_region(*end))
                        break;
        }

        return end;
}

/*
 * Parse one region at @str into @r.  Accepted forms are "N", "N-M" and
 * "N-M:off/group_len".
 *
 * Returns a pointer to where parsing stopped, NULL when a "N" or "N-M" region
 * is followed by the end of the string, or an ERR_PTR(): -EINVAL for an
 * unexpected character, or whatever bitmap_getnum() reports for a bad number.
 */
static const char *bitmap_parse_region(const char *str, struct region *r)
{
        /* Every form begins with the first bit number. */
        str = bitmap_getnum(str, &r->start);
        if (IS_ERR(str))
                return str;

        /* "N": a single bit. */
        if (end_of_region(*str))
                goto no_end;

        if (*str != '-')
                return ERR_PTR(-EINVAL);

        str = bitmap_getnum(str + 1, &r->end);
        if (IS_ERR(str))
                return str;

        /* "N-M": a plain range without a pattern. */
        if (end_of_region(*str))
                goto no_pattern;

        if (*str != ':')
                return ERR_PTR(-EINVAL);

        str = bitmap_getnum(str + 1, &r->off);
        if (IS_ERR(str))
                return str;

        if (*str != '/')
                return ERR_PTR(-EINVAL);

        /* Full form: the result of the last number parse is passed on as is. */
        return bitmap_getnum(str + 1, &r->group_len);

no_end:
        r->end = r->start;
        /* fall through: a single bit is a range without a pattern, too */
no_pattern:
        /* One group spanning past the end of the range: every bit gets set. */
        r->off = r->end + 1;
        r->group_len = r->end + 1;

        return end_of_str(*str) ? NULL : str;
}

/**
 * bitmap_parselist - convert list format ASCII string to bitmap
 * @buf: read user string from this buffer; must be terminated
 *    with a \0 or \n.
 * @maskp: write resulting mask here
 * @nmaskbits: number of bits in mask to be written
 *
 * Input format is a comma-separated list of decimal numbers and
 * ranges.  Consecutively set bits are shown as two hyphen-separated
 * decimal numbers, the smallest and largest bit numbers set in
 * the range.
 * Optionally each range can be postfixed to denote that only parts of it
 * should be set. The range will be divided into groups of a specific size,
 * and only the defined number of bits from each group will be used.
 * Syntax: range:used_size/group_size
 * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769
 *
 * Returns: 0 on success, -errno on invalid input strings. Error values:
 *
 *   - ``-EINVAL``: wrong region format
 *   - ``-EINVAL``: invalid character in string
 *   - ``-ERANGE``: bit number specified too large for mask
 *   - ``-EOVERFLOW``: integer overflow in the input parameters
 */
int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits)
{
        struct region r;
        long err;

        /* Start from an empty mask; regions only ever add bits. */
        bitmap_zero(maskp, nmaskbits);

        while (buf != NULL) {
                const char *cur = bitmap_find_region(buf);

                /* Only separators were left: parsing is complete. */
                if (cur == NULL)
                        break;

                /* buf becomes NULL once the end of the string has been reached. */
                buf = bitmap_parse_region(cur, &r);
                if (IS_ERR(buf))
                        return PTR_ERR(buf);

                err = bitmap_check_region(&r);
                if (!err)
                        err = bitmap_set_region(&r, maskp, nmaskbits);
                if (err)
                        return err;
        }

        return 0;
}
EXPORT_SYMBOL(bitmap_parselist);


/**
 * bitmap_parselist_user() - convert a list format ASCII string in a user buffer to a bitmap
 *
 * @ubuf: pointer to user buffer containing string.
 * @ulen: buffer size in bytes.  If string is smaller than this
 *    then it must be terminated with a \0.
 * @maskp: pointer to bitmap array that will contain result.
 * @nmaskbits: size of bitmap, in bits.
 *
 * Wrapper for bitmap_parselist(), providing it with user buffer.
 */
int bitmap_parselist_user(const char __user *ubuf,
                        unsigned int ulen, unsigned long *maskp,
                        int nmaskbits)
{
        int ret;
        /* Work on a kernel copy of the user buffer made by memdup_user_nul(). */
        char *kbuf = memdup_user_nul(ubuf, ulen);

        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        ret = bitmap_parselist(kbuf, maskp, nmaskbits);
        kfree(kbuf);

        return ret;
}
EXPORT_SYMBOL(bitmap_parselist_user);

/*
 * Parse one chunk of up to eight hex digits, reading backwards from @end
 * towards @start, and store its value in @num.  The digit at @end is the
 * least significant one.
 *
 * Returns a pointer to the character preceding the chunk (a separator, or
 * @start - 1 when the chunk begins the string), or an ERR_PTR():
 *
 *   - -EINVAL:    a character that is neither a hex digit nor a separator
 *   - -EOVERFLOW: the chunk has more than eight hex digits
 */
static const char *bitmap_get_x32_reverse(const char *start,
                                        const char *end, u32 *num)
{
        u32 ret = 0;
        int c, i;

        for (i = 0; i < 32; i += 4) {
                c = hex_to_bin(*end--);
                if (c < 0)
                        return ERR_PTR(-EINVAL);

                /*
                 * Shift as u32: a nibble of 8 or more moved to bits 28-31
                 * does not fit into a signed int.
                 */
                ret |= (u32)c << i;

                if (start > end || __end_of_region(*end))
                        goto out;
        }

        /*
         * Eight digits were consumed and *end is not a separator, so the
         * chunk is malformed: a ninth hex digit makes it too large, anything
         * else is an illegal character and must not be skipped silently.
         */
        if (hex_to_bin(*end) >= 0)
                return ERR_PTR(-EOVERFLOW);

        return ERR_PTR(-EINVAL);
out:
        *num = ret;
        return end;
}

/**
 * bitmap_parse - convert an ASCII hex string into a bitmap.
 * @start: pointer to buffer containing string.
 * @buflen: buffer size in bytes.  If string is smaller than this
 *    then it must be terminated with a \0 or \n. In that case,
 *    UINT_MAX may be provided instead of string length.
 * @maskp: pointer to bitmap array that will contain result.
 * @nmaskbits: size of bitmap, in bits.
 *
 * Commas group hex digits into chunks.  Each chunk defines exactly 32
 * bits of the resultant bitmask.  No chunk may specify a value larger
 * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value
 * then leading 0-bits are prepended.  %-EINVAL is returned for illegal
 * characters. Grouping such as "1,,5", ",44", "," or "" is allowed.
 * Leading, embedded and trailing whitespace accepted.
 */
int bitmap_parse(const char *start, unsigned int buflen,
                unsigned long *maskp, int nmaskbits)
{
        /* Last character to look at: one before what strnchrnul() returns. */
        const char *end = strnchrnul(start, buflen, '\n') - 1;
        /* 32-bit chunks the mask can still take; counted down while parsing. */
        int chunks = BITS_TO_U32(nmaskbits);
        /* The mask is filled in 32-bit units. */
        u32 *bitmap = (u32 *)maskp;
        int unset_bit;
        int chunk;

        /* Parse backwards: the last chunk of the string is chunk 0. */
        for (chunk = 0; ; chunk++) {
                /* Skip separators; stepping before @start means we are done. */
                end = bitmap_find_region_reverse(start, end);
                if (start > end)
                        break;

                /* More chunks in the string than fit into the mask. */
                if (!chunks--)
                        return -EOVERFLOW;

                /*
                 * NOTE(review): on 64-bit big-endian the two 32-bit halves of
                 * each pair are swapped (chunk ^ 1), presumably so that each
                 * chunk lands in the right half of its unsigned long - confirm.
                 */
#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
                end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]);
#else
                end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]);
#endif
                if (IS_ERR(end))
                        return PTR_ERR(end);
        }

        /* First bit position not covered by the chunks that were parsed. */
        unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32;
        if (unset_bit < nmaskbits) {
                /* The string was shorter than the mask: zero the remainder. */
                bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit);
                return 0;
        }

        /*
         * The parsed chunks reach up to or beyond @nmaskbits: any bit found
         * set in [nmaskbits, unset_bit) does not fit into the mask.
         */
        if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit)
                return -EOVERFLOW;

        return 0;
}
EXPORT_SYMBOL(bitmap_parse);


#ifdef CONFIG_NUMA
/**
 * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
 *        @buf: pointer to a bitmap
 *        @pos: a bit position in @buf (0 <= @pos < @nbits)
 *        @nbits: number of valid bit positions in @buf
 *
 * Map the bit at position @pos in @buf (of length @nbits) to the
 * ordinal of which set bit it is.  If it is not set or if @pos
 * is not a valid bit position, map to -1.
 *
 * If for example, just bits 4 through 7 are set in @buf, then @pos
 * values 4 through 7 will get mapped to 0 through 3, respectively,
 * and other @pos values will get mapped to -1.  When @pos value 7
 * gets mapped to (returns) @ord value 3 in this example, that means
 * that bit 7 is the 3rd (starting with 0th) set bit in @buf.
 *
 * The bit positions 0 through @nbits-1 are valid positions in @buf.
 */
static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits)
{
        /* A position outside the bitmap has no ordinal. */
        if (pos >= nbits)
                return -1;

        /* Neither does a bit that is not set. */
        if (!test_bit(pos, buf))
                return -1;

        /* The ordinal is the weight of the first @pos bits of @buf. */
        return __bitmap_weight(buf, pos);
}

/**
 * bitmap_ord_to_pos - find position of n-th set bit in bitmap
 *        @buf: pointer to bitmap
 *        @ord: ordinal bit position (n-th set bit, n >= 0)
 *        @nbits: number of valid bit positions in @buf
 *
 * Map the ordinal offset of bit @ord in @buf to its position in @buf.
 * Value of @ord should be in range 0 <= @ord < weight(buf). If @ord
 * >= weight(buf), returns @nbits.
 *
 * If for example, just bits 4 through 7 are set in @buf, then @ord
 * values 0 through 3 will get mapped to 4 through 7, respectively,
 * and all other @ord values return @nbits.  When @ord value 3
 * gets mapped to (returns) @pos value 7 in this example, that means
 * that the 3rd set bit (starting with 0th) is at position 7 in @buf.
 *
 * The bit positions 0 through @nbits-1 are valid positions in @buf.
 */
unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits)
{
        unsigned int remaining = ord;        /* set bits still to be skipped */
        unsigned int pos = find_first_bit(buf, nbits);

        /* Hop from set bit to set bit until @ord of them have been passed. */
        while (pos < nbits && remaining) {
                remaining--;
                pos = find_next_bit(buf, nbits, pos + 1);
        }

        /* pos >= @nbits here means the bitmap ran out of set bits first. */
        return pos;
}

/**
 * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap
 *        @dst: remapped result
 *        @src: subset to be remapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @nbits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * If either of the @old and @new bitmaps are empty, or if @src and
 * @dst point to the same location, then this routine copies @src
 * to @dst.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to @src, placing the result in
 * @dst, clearing any bits previously set in @dst.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @src comes into this routine
 * with bits 1, 5 and 7 set, then @dst should leave with bits 1,
 * 13 and 15 set.
 */
void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new,
                unsigned int nbits)
{
        unsigned int bit;                /* walks the set bits of @src */
        unsigned int new_weight;        /* number of set bits in @new */

        /* In-place remaps are not handled below: leave @dst as it is. */
        if (dst == src)
                return;

        bitmap_zero(dst, nbits);
        new_weight = bitmap_weight(new, nbits);

        for_each_set_bit(bit, src, nbits) {
                int ord = bitmap_pos_to_ord(old, bit, nbits);
                unsigned int target = bit;        /* identity map by default */

                /*
                 * Only bits that are set in @old get moved, and only when
                 * @new has somewhere to move them to; the ordinal wraps
                 * around the weight of @new.
                 */
                if (ord >= 0 && new_weight != 0)
                        target = bitmap_ord_to_pos(new, ord % new_weight, nbits);

                set_bit(target, dst);
        }
}

/**
 * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
 *        @oldbit: bit position to be mapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @bits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to bit position @oldbit, returning
 * the new bit position.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @oldbit is 5, then this routine
 * returns 13.
 */
int bitmap_bitremap(int oldbit, const unsigned long *old,
                                const unsigned long *new, int bits)
{
        int new_weight = bitmap_weight(new, bits);
        int ord = bitmap_pos_to_ord(old, oldbit, bits);

        /*
         * Remap only a bit that is set in @old, and only when @new has at
         * least one set bit to land on; the ordinal wraps around the weight
         * of @new.
         */
        if (ord >= 0 && new_weight != 0)
                return bitmap_ord_to_pos(new, ord % new_weight, bits);

        /* Everything else maps to itself. */
        return oldbit;
}

/**
 * bitmap_onto - translate one bitmap relative to another
 *        @dst: resulting translated bitmap
 *         @orig: original untranslated bitmap
 *         @relmap: bitmap relative to which translated
 *        @bits: number of bits in each of these bitmaps
 *
 * Set the n-th bit of @dst iff there exists some m such that the
 * n-th bit of @relmap is set, the m-th bit of @orig is set, and
 * the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
 * (If you understood the previous sentence the first time you
 * read it, you're overqualified for your current job.)
 *
 * In other words, @orig is mapped onto (surjectively) @dst,
 * using the map { <n, m> | the n-th bit of @relmap is the
 * m-th set bit of @relmap }.
 *
 * Any set bits in @orig above bit number W, where W is the
 * weight of (number of set bits in) @relmap are mapped nowhere.
 * In particular, if for all bits m set in @orig, m >= W, then
 * @dst will end up empty.  In situations where the possibility
 * of such an empty result is not desired, one way to avoid it is
 * to use the bitmap_fold() operator, below, to first fold the
 * @orig bitmap over itself so that all its set bits x are in the
 * range 0 <= x < W.  The bitmap_fold() operator does this by
 * setting the bit (m % W) in @dst, for each bit (m) set in @orig.
 *
 * Example [1] for bitmap_onto():
 *  Let's say @relmap has bits 30-39 set, and @orig has bits
 *  1, 3, 5, 7, 9 and 11 set.  Then on return from this routine,
 *  @dst will have bits 31, 33, 35, 37 and 39 set.
 *
 *  When bit 0 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the first bit (if any)
 *  that is turned on in @relmap.  Since bit 0 was off in the
 *  above example, we leave off that bit (bit 30) in @dst.
 *
 *  When bit 1 is set in @orig (as in the above example), it
 *  means turn on the bit in @dst corresponding to whatever
 *  is the second bit that is turned on in @relmap.  The second
 *  bit in @relmap that was turned on in the above example was
 *  bit 31, so we turned on bit 31 in @dst.
 *
 *  Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
 *  because they were the 4th, 6th, 8th and 10th set bits
 *  set in @relmap, and the 4th, 6th, 8th and 10th bits of
 *  @orig (i.e. bits 3, 5, 7 and 9) were also set.
 *
 *  When bit 11 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the twelfth bit that is
 *  turned on in @relmap.  In the above example, there were
 *  only ten bits turned on in @relmap (30..39), so the fact that
 *  bit 11 was set in @orig had no effect on @dst.
 *
 * Example [2] for bitmap_fold() + bitmap_onto():
 *  Let's say @relmap has these ten bits set::
 *
 *                40 41 42 43 45 48 53 61 74 95
 *
 *  (for the curious, that's 40 plus the first ten terms of the
 *  Fibonacci sequence.)
 *
 *  Further lets say we use the following code, invoking
 *  bitmap_fold() then bitmap_onto, as suggested above to
 *  avoid the possibility of an empty @dst result::
 *
 *        unsigned long *tmp;        // a temporary bitmap's bits
 *
 *        bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
 *        bitmap_onto(dst, tmp, relmap, bits);
 *
 *  Then this table shows what various values of @dst would be, for
 *  various @orig's.  I list the zero-based positions of each set bit.
 *  The tmp column shows the intermediate result, as computed by
 *  using bitmap_fold() to fold the @orig bitmap modulo ten
 *  (the weight of @relmap):
 *
 *      =============== ============== =================
 *      @orig           tmp            @dst
 *      0                0             40
 *      1                1             41
 *      9                9             95
 *      10               0             40 [#f1]_
 *      1 3 5 7          1 3 5 7       41 43 48 61
 *      0 1 2 3 4        0 1 2 3 4     40 41 42 43 45
 *      0 9 18 27        0 9 8 7       40 61 74 95
 *      0 10 20 30       0             40
 *      0 11 22 33       0 1 2 3       40 41 42 43
 *      0 12 24 36       0 2 4 6       40 42 45 53
 *      78 102 211       1 2 8         41 42 74 [#f1]_
 *      =============== ============== =================
 *
 * .. [#f1]
 *
 *     For these marked lines, if we hadn't first done bitmap_fold()
 *     into tmp, then the @dst result would have been empty.
 *
 * If either of @orig or @relmap is empty (no set bits), then @dst
 * will be returned empty.
 *
 * If (as explained above) the only set bits in @orig are in positions
 * m where m >= W, (where W is the weight of @relmap) then @dst will
 * once again be returned empty.
 *
 * All bits in @dst not set by the above rule are cleared.
 */
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                        const unsigned long *relmap, unsigned int bits)
{
        unsigned int pos;                /* position of a set bit in @relmap ("n") */
        unsigned int rank = 0;                /* set bits of @relmap seen before @pos ("m") */

        /* In-place mappings are not handled below: leave @dst as it is. */
        if (dst == orig)
                return;

        bitmap_zero(dst, bits);

        /*
         * One pass over the set bits of @relmap. It is a more efficient,
         * but less obvious, equivalent of the loop:
         *        for (m = 0; m < bitmap_weight(relmap, bits); m++) {
         *                n = bitmap_ord_to_pos(relmap, m, bits);
         *                if (test_bit(m, orig))
         *                        set_bit(n, dst);
         *        }
         * On every pass rank == bitmap_pos_to_ord(relmap, pos, bits).
         */
        for_each_set_bit(pos, relmap, bits) {
                if (test_bit(rank, orig))
                        set_bit(pos, dst);
                rank++;
        }
}

/**
 * bitmap_fold - fold larger bitmap into smaller, modulo specified size
 *        @dst: resulting smaller bitmap
 *        @orig: original larger bitmap
 *        @sz: specified size
 *        @nbits: number of bits in each of these bitmaps
 *
 * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
 * Clear all other bits in @dst.  See further the comment and
 * Example [2] for bitmap_onto() for why and how to use this.
 *
 * If @sz is zero there is no position to fold onto, so @dst is
 * returned empty.  If @dst and @orig point to the same location,
 * nothing is done.
 */
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                        unsigned int sz, unsigned int nbits)
{
        unsigned int oldbit;

        if (dst == orig)        /* following doesn't handle inplace mappings */
                return;
        bitmap_zero(dst, nbits);

        /*
         * "oldbit % 0" would be a division by zero, e.g. when the caller
         * passes the weight of an empty relmap as @sz.  Return the empty
         * @dst instead.
         */
        if (!sz)
                return;

        for_each_set_bit(oldbit, orig, nbits)
                set_bit(oldbit % sz, dst);
}
#endif /* CONFIG_NUMA */

/*
 * Common code for bitmap_*_region() routines.
 *        bitmap: array of unsigned longs corresponding to the bitmap
 *        pos: the beginning of the region
 *        order: region size (log base 2 of number of bits)
 *        reg_op: operation(s) to perform on that region of bitmap
 *
 * Can set, verify or release a region of bits in a bitmap,
 * depending on which one of the REG_OP_* operations is requested.
 *
 * A region of a bitmap is a sequence of bits in the bitmap, of
 * some size '1 << order' (a power of two), aligned to that same
 * '1 << order' power of two.
 *
 * Returns 1 if reg_op is REG_OP_ISFREE and the region is all zero bits.
 * Returns 0 in every other case, whatever the reg_op.
 */

/* Operations understood by __reg_op(); one value is passed per call. */
enum {
        REG_OP_ISFREE  = 0,        /* report whether every bit in the region is zero */
        REG_OP_ALLOC   = 1,        /* turn on every bit in the region */
        REG_OP_RELEASE = 2,        /* turn off every bit in the region */
};

static int __reg_op(unsigned long *bitmap, unsigned int pos, int order, int reg_op)
{
        int region_bits;        /* size of the region, in bits */
        int first;                /* index of the long holding the region's first bit */
        int shift;                /* bit offset of the region inside bitmap[first] */
        int nwords;                /* number of longs the region spans */
        int bits_per_word;        /* region bits that fall into each spanned long */
        unsigned long mask;        /* the region's bits within one long */
        int k;                        /* steps through the spanned longs */

        /*
         * Two shapes are possible: a small region living in a single long
         * (nwords == 1), or a multiword region that starts on a long
         * boundary and fills whole longs (shift == 0 && mask == ~0UL).
         */
        region_bits = 1 << order;
        first = pos / BITS_PER_LONG;
        shift = pos - (first * BITS_PER_LONG);
        nwords = BITS_TO_LONGS(region_bits);
        bits_per_word = min(region_bits,  BITS_PER_LONG);

        /*
         * Build the mask from its top bit downwards:
         * "(1UL << bits_per_word) - 1" would overflow when
         * bits_per_word == BITS_PER_LONG.
         */
        mask = (1UL << (bits_per_word - 1));
        mask += mask - 1;
        mask <<= shift;

        if (reg_op == REG_OP_ISFREE) {
                /* Any set bit inside the region means it is not free. */
                for (k = 0; k < nwords; k++) {
                        if (bitmap[first + k] & mask)
                                return 0;
                }
                return 1;        /* all bits in region free (zero) */
        }

        if (reg_op == REG_OP_ALLOC) {
                for (k = 0; k < nwords; k++)
                        bitmap[first + k] |= mask;
        } else if (reg_op == REG_OP_RELEASE) {
                for (k = 0; k < nwords; k++)
                        bitmap[first + k] &= ~mask;
        }

        /* Set, release and unknown operations all report 0. */
        return 0;
}

/**
 * bitmap_find_free_region - find a contiguous aligned mem region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @bits: number of bits in the bitmap
 *        @order: region size (log base 2 of number of bits) to find
 *
 * Find a region of free (zero) bits in a @bitmap of @bits bits and
 * allocate them (set them to one).  Only consider regions of length
 * a power (@order) of two, aligned to that power of two, which
 * makes the search algorithm much faster.
 *
 * Return the bit offset in bitmap of the allocated region,
 * or -errno on failure.
 */
int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
{
        const unsigned int step = 1U << order;        /* region size in bits */
        unsigned int start = 0;                        /* first bit of the candidate region */

        /* Try each aligned region that still fits entirely below @bits. */
        while (start + step <= bits) {
                if (__reg_op(bitmap, start, order, REG_OP_ISFREE)) {
                        /* Free: claim it and report where it begins. */
                        __reg_op(bitmap, start, order, REG_OP_ALLOC);
                        return start;
                }
                start += step;
        }

        return -ENOMEM;
}
EXPORT_SYMBOL(bitmap_find_free_region);

/**
 * bitmap_release_region - release allocated bitmap region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @pos: beginning of bit region to release
 *        @order: region size (log base 2 of number of bits) to release
 *
 * This is the complement to bitmap_find_free_region() and releases
 * the found region (by clearing it in the bitmap).
 *
 * No return value.
 */
void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
{
        /*
         * REG_OP_RELEASE clears every bit of the region; __reg_op() returns
         * 0 for it, so the result is not looked at.
         */
        __reg_op(bitmap, pos, order, REG_OP_RELEASE);
}
EXPORT_SYMBOL(bitmap_release_region);

/**
 * bitmap_allocate_region - allocate bitmap region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @pos: beginning of bit region to allocate
 *        @order: region size (log base 2 of number of bits) to allocate
 *
 * Allocate (set bits in) a specified region of a bitmap.
 *
 * Return 0 on success, or %-EBUSY if specified region wasn't
 * free (not all bits were zero).
 */
int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
{
        int region_is_free = __reg_op(bitmap, pos, order, REG_OP_ISFREE);

        /*
         * REG_OP_ALLOC sets the bits and yields 0, which doubles as the
         * success code of this function.
         */
        if (region_is_free)
                return __reg_op(bitmap, pos, order, REG_OP_ALLOC);

        return -EBUSY;
}
EXPORT_SYMBOL(bitmap_allocate_region);

/**
 * bitmap_copy_le - copy a bitmap, putting the bits into little-endian order.
 * @dst:   destination buffer
 * @src:   bitmap to copy
 * @nbits: number of bits in the bitmap
 *
 * Require nbits % BITS_PER_LONG == 0.
 */
#ifdef __BIG_ENDIAN
void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits)
{
        const unsigned int nwords = nbits / BITS_PER_LONG;        /* whole longs only */
        unsigned int w;

        /* The long size is fixed at build time, so choose the conversion once. */
        if (BITS_PER_LONG == 64) {
                for (w = 0; w < nwords; w++)
                        dst[w] = cpu_to_le64(src[w]);
        } else {
                for (w = 0; w < nwords; w++)
                        dst[w] = cpu_to_le32(src[w]);
        }
}
EXPORT_SYMBOL(bitmap_copy_le);
#endif

/**
 * bitmap_alloc - allocate storage for a bitmap
 *        @nbits: number of bits the bitmap must hold
 *        @flags: allocation flags handed to kmalloc_array()
 *
 * Allocates BITS_TO_LONGS(@nbits) unsigned longs with kmalloc_array().
 * No zeroing is requested here; see bitmap_zalloc() for that.
 *
 * Return the result of kmalloc_array().
 */
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
{
        return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long),
                             flags);
}
EXPORT_SYMBOL(bitmap_alloc);

/**
 * bitmap_zalloc - allocate storage for a bitmap, requesting zeroed memory
 *        @nbits: number of bits the bitmap must hold
 *        @flags: allocation flags; __GFP_ZERO is added to them
 *
 * Return the result of bitmap_alloc() called with @flags | __GFP_ZERO.
 */
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags)
{
        return bitmap_alloc(nbits, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(bitmap_zalloc);

/**
 * bitmap_free - release a bitmap
 *        @bitmap: bitmap to release; it is passed straight to kfree()
 */
void bitmap_free(const unsigned long *bitmap)
{
        kfree(bitmap);
}
EXPORT_SYMBOL(bitmap_free);

/*
 * Release callback registered by devm_bitmap_alloc(): @data is the bitmap
 * that was allocated there, and it is freed with bitmap_free().
 */
static void devm_bitmap_free(void *data)
{
        unsigned long *bitmap = data;

        bitmap_free(bitmap);
}

/**
 * devm_bitmap_alloc - allocate a bitmap tied to a device
 *        @dev: device the release action is registered for
 *        @nbits: number of bits the bitmap must hold
 *        @flags: allocation flags handed to bitmap_alloc()
 *
 * Allocates with bitmap_alloc() and registers devm_bitmap_free() for the
 * bitmap through devm_add_action_or_reset().
 *
 * Return the bitmap, or NULL if the allocation or the registration failed.
 */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags)
{
        unsigned long *map = bitmap_alloc(nbits, flags);

        if (!map)
                return NULL;

        /* A non-zero result means the release action was not registered. */
        if (devm_add_action_or_reset(dev, devm_bitmap_free, map))
                return NULL;

        return map;
}
EXPORT_SYMBOL_GPL(devm_bitmap_alloc);

/**
 * devm_bitmap_zalloc - allocate a device-tied bitmap, requesting zeroed memory
 *        @dev: device the release action is registered for
 *        @nbits: number of bits the bitmap must hold
 *        @flags: allocation flags; __GFP_ZERO is added to them
 *
 * Return the result of devm_bitmap_alloc() called with @flags | __GFP_ZERO.
 */
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags)
{
        return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO);
}
EXPORT_SYMBOL_GPL(devm_bitmap_zalloc);

#if BITS_PER_LONG == 64
/**
 * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u32 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
{
        const unsigned int nwords32 = DIV_ROUND_UP(nbits, 32);
        unsigned int lo;        /* index of the u32 that supplies the low half */

        /* Take @buf two u32s at a time: even index -> low half, odd -> high half. */
        for (lo = 0; lo < nwords32; lo += 2) {
                unsigned long *word = &bitmap[lo / 2];

                *word = (unsigned long) buf[lo];
                /* With an odd count the final long gets no high half. */
                if (lo + 1 < nwords32)
                        *word |= ((unsigned long) buf[lo + 1]) << 32;
        }

        /* Mask off whatever lies beyond @nbits in the last long. */
        if (nbits % BITS_PER_LONG)
                bitmap[(nwords32 - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr32);

/**
 * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits
 *        @buf: array of u32 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        const unsigned int nwords32 = DIV_ROUND_UP(nbits, 32);
        unsigned int lo;        /* index of the u32 that receives the low half */

        /* Split each long into two u32s: low half first, then high half. */
        for (lo = 0; lo < nwords32; lo += 2) {
                buf[lo] = (u32) (bitmap[lo / 2] & UINT_MAX);
                /* With an odd count the final high half is not stored. */
                if (lo + 1 < nwords32)
                        buf[lo + 1] = (u32) (bitmap[lo / 2] >> 32);
        }

        /* Mask off whatever lies beyond @nbits in the last array element. */
        if (nbits % BITS_PER_LONG)
                buf[nwords32 - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31));
}
EXPORT_SYMBOL(bitmap_to_arr32);

#endif

























































































    1 






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
/*
 * include/linux/topology.h
 *
 * Written by: Matthew Dobson, IBM Corporation
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to <colpatch@us.ibm.com>
 */
#ifndef _LINUX_TOPOLOGY_H
#define _LINUX_TOPOLOGY_H

#include <linux/arch_topology.h>
#include <linux/cpumask.h>
#include <linux/bitops.h>
#include <linux/mmzone.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/topology.h>

#ifndef nr_cpus_node
/* Default: the number of CPUs of @node is the weight of its cpumask. */
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif

/*
 * Iterate over every online node for which nr_cpus_node() is non-zero.
 *
 * The filter is spelled "if (!cond) {} else" so that the caller's loop body
 * becomes the else branch.  An "else" written after that body can then never
 * attach to the if hidden inside this macro.
 */
#define for_each_node_with_cpus(node)                        \
        for_each_online_node(node)                        \
                if (!nr_cpus_node(node)) {} else

int arch_update_cpu_topology(void);

/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE                10
#define REMOTE_DISTANCE                20
#define DISTANCE_BITS           8
#ifndef node_distance
/*
 * Default when no node_distance() has been defined yet: a node is
 * LOCAL_DISTANCE from itself and REMOTE_DISTANCE from every other node.
 */
#define node_distance(from,to)        ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
#ifndef RECLAIM_DISTANCE
/*
 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
 * (in whatever arch specific measurement units returned by node_distance())
 * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
 * on nodes within this distance.
 */
#define RECLAIM_DISTANCE 30
#endif

/*
 * The following tunable allows platforms to override the default node
 * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
 * sufficiently fast that the default value actually hurts
 * performance.
 *
 * AMD EPYC machines use this because even though the 2-hop distance
 * is 32 (3.2x slower than a local memory access) performance actually
 * *improves* if allowed to reclaim memory and load balance tasks
 * between NUMA nodes 2-hops apart.
 */
extern int __read_mostly node_reclaim_distance;

#ifndef PENALTY_FOR_NODE_WITH_CPUS
/*
 * Default value, used unless the macro was already defined before this point.
 * NOTE(review): the code that consumes this penalty is not in this header.
 */
#define PENALTY_FOR_NODE_WITH_CPUS        (1)
#endif

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DECLARE_PER_CPU(int, numa_node);

#ifndef numa_node_id
/*
 * Returns the number of the current node, as read from the numa_node
 * per-CPU variable with raw_cpu_read().
 */
static inline int numa_node_id(void)
{
        return raw_cpu_read(numa_node);
}
#endif

#ifndef cpu_to_node
/* Returns the node number held in @cpu's copy of the numa_node per-CPU variable. */
static inline int cpu_to_node(int cpu)
{
        return per_cpu(numa_node, cpu);
}
#endif

#ifndef set_numa_node
/* Stores @node in the numa_node per-CPU variable using this_cpu_write(). */
static inline void set_numa_node(int node)
{
        this_cpu_write(numa_node, node);
}
#endif

#ifndef set_cpu_numa_node
/* Stores @node in @cpu's copy of the numa_node per-CPU variable. */
static inline void set_cpu_numa_node(int cpu, int node)
{
        per_cpu(numa_node, cpu) = node;
}
#endif

#else        /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */

/* Returns the number of the current Node. */
#ifndef numa_node_id
static inline int numa_node_id(void)
{
        /*
         * No numa_node per-CPU variable in this configuration: look the node
         * up from the current CPU number.  cpu_to_node() is not defined in
         * this branch, so it has to be provided elsewhere.
         */
        return cpu_to_node(raw_smp_processor_id());
}
#endif

#endif        /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */

#ifdef CONFIG_HAVE_MEMORYLESS_NODES

/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
 */
DECLARE_PER_CPU(int, _numa_mem_);

#ifndef set_numa_mem
/* Stores @node in the _numa_mem_ per-CPU variable using this_cpu_write(). */
static inline void set_numa_mem(int node)
{
        this_cpu_write(_numa_mem_, node);
}
#endif

#ifndef numa_mem_id
/*
 * Returns the number of the nearest node with memory, as read from the
 * _numa_mem_ per-CPU variable with raw_cpu_read().
 */
static inline int numa_mem_id(void)
{
        return raw_cpu_read(_numa_mem_);
}
#endif

#ifndef cpu_to_mem
/* Returns the node number held in @cpu's copy of the _numa_mem_ per-CPU variable. */
static inline int cpu_to_mem(int cpu)
{
        return per_cpu(_numa_mem_, cpu);
}
#endif

#ifndef set_cpu_numa_mem
/* Stores @node in @cpu's copy of the _numa_mem_ per-CPU variable. */
static inline void set_cpu_numa_mem(int cpu, int node)
{
        per_cpu(_numa_mem_, cpu) = node;
}
#endif

#else        /* !CONFIG_HAVE_MEMORYLESS_NODES */

#ifndef numa_mem_id
/*
 * Returns the number of the nearest node with memory.  Without
 * CONFIG_HAVE_MEMORYLESS_NODES this is simply the current node.
 */
static inline int numa_mem_id(void)
{
        return numa_node_id();
}
#endif

#ifndef cpu_to_mem
/* Without CONFIG_HAVE_MEMORYLESS_NODES, the memory node of @cpu is @cpu's own node. */
static inline int cpu_to_mem(int cpu)
{
        return cpu_to_node(cpu);
}
#endif

#endif        /* [!]CONFIG_HAVE_MEMORYLESS_NODES */

/*
 * Fallbacks for topology accessors that have not been defined by this point
 * (<asm/topology.h> is included above): the package and die ids are -1, the
 * core id is 0, and each of the sibling/core/die masks is cpumask_of(cpu).
 * The id macros still evaluate @cpu through the (void) cast.
 */
#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu)        ((void)(cpu), -1)
#endif
#ifndef topology_die_id
#define topology_die_id(cpu)                        ((void)(cpu), -1)
#endif
#ifndef topology_core_id
#define topology_core_id(cpu)                        ((void)(cpu), 0)
#endif
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu)                cpumask_of(cpu)
#endif

#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
/* Default SMT mask of @cpu: whatever topology_sibling_cpumask() yields for it. */
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
        return topology_sibling_cpumask(cpu);
}
#endif

/* Returns the cpumask of the node that @cpu belongs to. */
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
        int node = cpu_to_node(cpu);

        return cpumask_of_node(node);
}


#endif /* _LINUX_TOPOLOGY_H */



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
12776
12777
12778
12779
12780
12781
12782
12783
12784
12785
12786
12787
12788
12789
12790
12791
12792
12793
12794
12795
12796
12797
12798
12799
12800
12801
12802
12803
12804
12805
12806
12807
12808
12809
12810
12811
12812
12813
12814
12815
12816
12817
12818
12819
12820
12821
12822
12823
12824
12825
12826
12827
12828
12829
12830
12831
12832
12833
12834
12835
12836
12837
12838
12839
12840
12841
12842
12843
12844
12845
12846
12847
12848
12849
12850
12851
12852
12853
12854
12855
12856
12857
12858
12859
12860
12861
12862
12863
12864
12865
12866
12867
12868
12869
12870
12871
12872
12873
12874
12875
12876
12877
12878
12879
12880
12881
12882
12883
12884
12885
12886
12887
12888
12889
12890
12891
12892
12893
12894
12895
12896
12897
12898
12899
12900
12901
12902
12903
12904
12905
12906
12907
12908
12909
12910
12911
12912
12913
12914
12915
12916
12917
12918
12919
12920
12921
12922
12923
12924
12925
12926
12927
12928
12929
12930
12931
12932
12933
12934
12935
12936
12937
12938
12939
12940
12941
12942
12943
12944
12945
12946
12947
12948
12949
12950
12951
12952
12953
12954
12955
12956
12957
12958
12959
12960
12961
12962
12963
12964
12965
12966
12967
12968
12969
12970
12971
12972
12973
12974
12975
12976
12977
12978
12979
12980
12981
12982
12983
12984
12985
12986
12987
12988
12989
12990
12991
12992
12993
12994
12995
12996
12997
12998
12999
13000
13001
13002
13003
13004
13005
13006
13007
13008
13009
13010
13011
13012
13013
13014
13015
13016
13017
13018
13019
13020
13021
13022
13023
13024
13025
13026
13027
13028
13029
13030
13031
13032
13033
13034
13035
13036
13037
13038
13039
13040
13041
13042
13043
13044
13045
13046
13047
13048
13049
13050
13051
13052
13053
13054
13055
13056
13057
13058
13059
13060
13061
13062
13063
13064
13065
13066
13067
13068
13069
13070
13071
13072
13073
13074
13075
13076
13077
13078
13079
13080
13081
13082
13083
13084
13085
13086
13087
13088
13089
13090
13091
13092
13093
13094
13095
13096
13097
13098
13099
13100
13101
13102
13103
13104
13105
13106
13107
13108
13109
13110
13111
13112
13113
13114
13115
13116
13117
13118
13119
13120
13121
13122
13123
13124
13125
13126
13127
13128
13129
13130
13131
13132
13133
13134
13135
13136
13137
13138
13139
13140
13141
13142
13143
13144
13145
13146
13147
13148
13149
13150
13151
13152
13153
13154
13155
13156
13157
13158
13159
13160
13161
13162
13163
13164
13165
13166
13167
13168
13169
13170
13171
13172
13173
13174
13175
13176
13177
13178
13179
13180
13181
13182
13183
13184
13185
13186
13187
13188
13189
13190
13191
13192
13193
13194
13195
13196
13197
13198
13199
13200
13201
13202
13203
13204
13205
13206
13207
13208
13209
13210
13211
13212
13213
13214
13215
13216
13217
13218
13219
13220
13221
13222
13223
13224
13225
13226
13227
13228
13229
13230
13231
13232
13233
13234
13235
13236
13237
13238
13239
13240
13241
13242
13243
13244
13245
13246
13247
13248
13249
13250
13251
13252
13253
13254
13255
13256
13257
13258
13259
13260
13261
13262
13263
13264
13265
13266
13267
13268
13269
13270
13271
13272
13273
13274
13275
13276
13277
13278
13279
13280
13281
13282
13283
13284
13285
13286
13287
13288
13289
13290
13291
13292
13293
13294
13295
13296
13297
13298
13299
13300
13301
13302
13303
13304
13305
13306
13307
13308
13309
13310
13311
13312
13313
13314
13315
13316
13317
13318
13319
13320
13321
13322
13323
13324
13325
13326
13327
13328
13329
13330
13331
13332
13333
13334
13335
13336
13337
13338
13339
13340
13341
13342
// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>

#include "internal.h"

#include <asm/irq_regs.h>

/* Signature of the callback that remote_function() invokes. */
typedef int (*remote_function_f)(void *);

/*
 * Argument bundle handed to remote_function() through
 * smp_call_function_single().
 */
struct remote_function_call {
        /* task that must be running on the target CPU; NULL skips the task checks */
        struct task_struct        *p;
        /* callback to invoke */
        remote_function_f        func;
        /* opaque argument passed to ->func */
        void                        *info;
        /* result: preset by the caller, overwritten by remote_function() */
        int                        ret;
};

/*
 * Cross-call callback shared by task_function_call() and cpu_function_call().
 *
 * @data points at a struct remote_function_call. When it names a task, the
 * callback is only invoked if that task is the one running on this CPU;
 * otherwise ->ret is left (or set) so that the caller can tell why not.
 */
static void remote_function(void *data)
{
        struct remote_function_call *call = data;
        struct task_struct *target = call->p;

        /*
         * The task is not on this CPU: keep whatever the caller preset in
         * ->ret (-EAGAIN for task_function_call()).
         */
        if (target && task_cpu(target) != smp_processor_id())
                return;

        if (target) {
                /*
                 * We are on the right CPU with IRQs disabled, so checking
                 * whether we hit the right task is free of races.
                 */
                call->ret = -ESRCH; /* No such (running) process */
                if (current != target)
                        return;
        }

        call->ret = call->func(call->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:                the task to evaluate
 * @func:        the function to be called
 * @info:        the function call argument
 *
 * Sends @func to the CPU reported by task_cpu(@p); remote_function() only
 * runs it there if @p is the task currently executing on that CPU. This
 * might be the current CPU. The attempt is repeated, with a cond_resched()
 * in between, for as long as the outcome is -EAGAIN, which is the preset
 * result that remote_function() leaves untouched when @p turns out to be on
 * another CPU.
 *
 * returns @func return value, -ESRCH when @p was not the running task, or
 * the error reported by smp_call_function_single() (e.g. -ENXIO)
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
        struct remote_function_call call = {
                .p        = p,
                .func        = func,
                .info        = info,
                .ret        = -EAGAIN,
        };
        int err;

        do {
                err = smp_call_function_single(task_cpu(p), remote_function,
                                               &call, 1);
                /* The cross-call went out: report what the callback stored. */
                if (err == 0)
                        err = call.ret;

                if (err == -EAGAIN)
                        cond_resched();
        } while (err == -EAGAIN);

        return err;
}

/**
 * cpu_function_call - call a function on the cpu
 * @cpu:        the cpu on which to run @func
 * @func:        the function to be called
 * @info:        the function call argument
 *
 * Calls the function @func on the remote cpu, without any task check.
 *
 * returns: @func return value, or -ENXIO when @func was never run
 *          (e.g. the cpu is offline)
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
        struct remote_function_call call = {
                .p        = NULL,
                .func        = func,
                .info        = info,
                .ret        = -ENXIO, /* No such CPU */
        };

        /*
         * The return value of the cross-call itself is not looked at; if
         * remote_function() never ran, ->ret still holds the preset -ENXIO.
         */
        smp_call_function_single(cpu, remote_function, &call, 1);

        return call.ret;
}

/*
 * Return the current CPU's instance of the per-CPU context that belongs to
 * the PMU of @ctx (ctx->pmu->pmu_cpu_context).
 */
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
        return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

/*
 * Take the lock of the CPU context embedded in @cpuctx and then, when @ctx
 * is non-NULL, the lock of @ctx as well. Undone by perf_ctx_unlock().
 */
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
{
        raw_spin_lock(&cpuctx->ctx.lock);

        /* No second context was given, so there is nothing more to lock. */
        if (!ctx)
                return;

        raw_spin_lock(&ctx->lock);
}

/*
 * Counterpart of perf_ctx_lock(): drop the lock of @ctx first (when @ctx is
 * non-NULL) and the CPU context lock last, i.e. in the reverse order of
 * acquisition.
 */
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx)
{
        if (ctx)
                raw_spin_unlock(&ctx->lock);
        raw_spin_unlock(&cpuctx->ctx.lock);
}

/*
 * Sentinel pointer value stored where a task pointer is expected. In this
 * file it is compared against event->owner (below) and against ctx->task
 * (the event_function_*() helpers do nothing for a context marked this way).
 */
#define TASK_TOMBSTONE ((void *)-1L)

/* True when the owner field of @event holds the TASK_TOMBSTONE sentinel. */
static bool is_kernel_event(struct perf_event *event)
{
        return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

/*
 * Callback type used by the event_function_*() helpers; the arguments are
 * (event, cpuctx, ctx, data). event_function_call() passes a NULL cpuctx
 * when it invokes the callback directly on an inactive context.
 */
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
                        struct perf_event_context *, void *);

/* Argument bundle that event_function_call() hands to event_function(). */
struct event_function_struct {
        /* event to operate on */
        struct perf_event *event;
        /* callback to run */
        event_f func;
        /* opaque argument forwarded to ->func */
        void *data;
};

/*
 * Cross-call trampoline used by event_function_call(): runs efs->func for
 * efs->event with the CPU context lock held, plus the lock of the task
 * context currently installed on this CPU (cpuctx->task_ctx), if any.
 *
 * @info points at a struct event_function_struct.
 *
 * Returns 0 after calling efs->func, or -ESRCH (without calling it) when the
 * event's context belongs to a task other than the one running here.
 */
static int event_function(void *info)
{
        struct event_function_struct *efs = info;
        struct perf_event *event = efs->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;

        lockdep_assert_irqs_disabled();

        perf_ctx_lock(cpuctx, task_ctx);
        /*
         * Since we do the IPI call without holding ctx->lock things can have
         * changed, double check we hit the task we set out to hit.
         */
        if (ctx->task) {
                if (ctx->task != current) {
                        ret = -ESRCH;
                        goto unlock;
                }

                /*
                 * We only use event_function_call() on established contexts,
                 * and event_function() is only ever called when active (or
                 * rather, we'll have bailed in task_function_call() or the
                 * above ctx->task != current test), therefore we must have
                 * ctx->is_active here.
                 */
                WARN_ON_ONCE(!ctx->is_active);
                /*
                 * And since we have ctx->is_active, cpuctx->task_ctx must
                 * match.
                 */
                WARN_ON_ONCE(task_ctx != ctx);
        } else {
                /* No task: @ctx is expected to be this CPU's own context. */
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        }

        efs->func(event, cpuctx, ctx, efs->data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

/*
 * Run @func for @event with the relevant context lock(s) held.
 *
 * For a context without a task the call is forwarded to event->cpu through
 * cpu_function_call(). For a task context it is sent to wherever the task is
 * running; if that does not succeed and the context is found inactive under
 * ctx->lock, @func is called directly with a NULL cpuctx. Nothing is done
 * once ctx->task is TASK_TOMBSTONE.
 */
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
        struct event_function_struct efs = {
                .event = event,
                .func = func,
                .data = data,
        };

        if (!event->parent) {
                /*
                 * If this is a !child event, we must hold ctx::mutex to
                 * stabilize the event->ctx relation. See
                 * perf_event_ctx_lock().
                 */
                lockdep_assert_held(&ctx->mutex);
        }

        if (!task) {
                cpu_function_call(event->cpu, event_function, &efs);
                return;
        }

        if (task == TASK_TOMBSTONE)
                return;

again:
        /* A zero return means event_function() ran @func on the task's CPU. */
        if (!task_function_call(task, event_function, &efs))
                return;

        raw_spin_lock_irq(&ctx->lock);
        /*
         * Reload the task pointer, it might have been changed by
         * a concurrent perf_event_context_sched_out().
         */
        task = ctx->task;
        if (task == TASK_TOMBSTONE) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        if (ctx->is_active) {
                /* The context is active after all: retry the cross-call. */
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
        /* Inactive context: call @func here, under ctx->lock, without a cpuctx. */
        func(event, NULL, ctx, data);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 *
 * Nothing is done when ctx->task is TASK_TOMBSTONE, or (with a warning) when
 * the context is active but does not belong to the task running here.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;

        lockdep_assert_irqs_disabled();

        if (task) {
                if (task == TASK_TOMBSTONE)
                        return;

                /* Task context: its lock is taken in addition to the CPU context lock. */
                task_ctx = ctx;
        }

        perf_ctx_lock(cpuctx, task_ctx);

        /* Read ctx->task again now that the locks are held. */
        task = ctx->task;
        if (task == TASK_TOMBSTONE)
                goto unlock;

        if (task) {
                /*
                 * We must be either inactive or active and the right task,
                 * otherwise we're screwed, since we cannot IPI to somewhere
                 * else.
                 */
                if (ctx->is_active) {
                        if (WARN_ON_ONCE(task != current))
                                goto unlock;

                        if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
                                goto unlock;
                }
        } else {
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        }

        func(event, cpuctx, ctx, data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);
}

/*
 * Union of the four PERF_FLAG_* bits listed below.
 * NOTE(review): presumably the complete set of flags accepted from user
 * space; the check that uses this mask is outside this chunk.
 */
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
                       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
        (PERF_SAMPLE_BRANCH_KERNEL |\
         PERF_SAMPLE_BRANCH_HV)

/*
 * Bit flags; every enumerator except EVENT_ALL is a single distinct bit.
 * NOTE(review): presumably selects what a context sched in/out pass applies
 * to; the users are outside this chunk, confirm there.
 */
enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
        EVENT_TIME = 0x4,
        /* see ctx_resched() for details */
        EVENT_CPU = 0x8,
        /* both FLEXIBLE and PINNED; does not include EVENT_TIME or EVENT_CPU */
        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */

/* Forward declaration: handler of the delayed work item perf_sched_work below. */
static void perf_sched_delayed(struct work_struct *work);
/* Static key, initially false. */
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
/* NOTE(review): presumably serializes updates of the key/count pair; users are outside this chunk. */
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

/* Per-CPU state; the code that maintains it is outside this chunk. */
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

/*
 * Global counters, one per record type named in the identifier.
 * NOTE(review): presumably the number of events that asked for each record
 * type; confirm against the code that increments them.
 */
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;

/*
 * List head "pmus" together with a mutex and an SRCU domain.
 * NOTE(review): presumably the list of registered PMUs and its protection;
 * the registration code is outside this chunk.
 */
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 *
 * The rate is in samples per second: the matching period is computed as
 * NSEC_PER_SEC / rate.
 */
#define DEFAULT_MAX_SAMPLE_RATE                100000
#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT        25

int sysctl_perf_event_sample_rate __read_mostly        = DEFAULT_MAX_SAMPLE_RATE;

/* Values derived from the sample rate: samples per tick (rounded up) and ns per sample. */
static int max_samples_per_tick __read_mostly        = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly        = DEFAULT_SAMPLE_PERIOD_NS;

/*
 * Longest average sample duration, in ns, tolerated by
 * perf_sample_event_took(): the CPU-time percentage applied to the sample
 * period. Zero disables the check.
 */
static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
        u64 tmp = perf_sample_period_ns;

        tmp *= sysctl_perf_cpu_time_max_percent;
        tmp = div_u64(tmp, 100);
        if (!tmp)
                tmp = 1;

        WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

static bool perf_rotate_context(struct perf_cpu_context *cpuctx);

/*
 * sysctl handler built on proc_dointvec_minmax(). After a successful write
 * it re-derives max_samples_per_tick, perf_sample_period_ns and the allowed
 * sample length from sysctl_perf_event_sample_rate.
 *
 * Returns -EINVAL for a write while throttling is disabled (CPU-time
 * percentage of 0 or 100), the proc_dointvec_minmax() error if any,
 * otherwise 0.
 */
int perf_proc_update_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int cpu_pct = sysctl_perf_cpu_time_max_percent;
        int err;

        /* If throttling is disabled don't allow the write. */
        if (write) {
                if (cpu_pct == 0 || cpu_pct == 100)
                        return -EINVAL;
        }

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err)
                return err;

        /* A read has nothing to recompute. */
        if (!write)
                return 0;

        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        update_perf_cpu_limits();

        return 0;
}

/* CPU-time percentage used for sample throttling; 0 or 100 switches throttling off. */
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

/*
 * sysctl handler built on proc_dointvec_minmax(). After a successful write
 * it either disables the sample duration check (value 0 or 100, with a
 * warning) or recomputes the allowed sample length.
 *
 * Returns the proc_dointvec_minmax() error if any, otherwise 0.
 */
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        int pct;

        if (err)
                return err;
        if (!write)
                return 0;

        pct = sysctl_perf_cpu_time_max_percent;
        if (pct != 0 && pct != 100) {
                update_perf_cpu_limits();
                return 0;
        }

        printk(KERN_WARNING
               "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
        /* Zero makes perf_sample_event_took() return immediately. */
        WRITE_ONCE(perf_sample_allowed_ns, 0);

        return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
/* Per-CPU decaying sum of sample durations, maintained by perf_sample_event_took(). */
static DEFINE_PER_CPU(u64, running_sample_length);

/* Average and limit (ns) captured for the "interrupt took too long" message. */
static u64 __report_avg;
static u64 __report_allowed;

/*
 * irq_work callback: print, rate limited, that sampling took too long and
 * the maximum sample rate was lowered. The numbers come from __report_avg,
 * __report_allowed and sysctl_perf_event_sample_rate.
 */
static void perf_duration_warn(struct irq_work *w)
{
        /* __report_avg and __report_allowed are u64, hence %llu. */
        printk_ratelimited(KERN_INFO
                "perf: interrupt took too long (%llu > %llu), lowering "
                "kernel.perf_event_max_sample_rate to %d\n",
                __report_avg, __report_allowed,
                sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

/*
 * perf_sample_event_took - account the time spent handling one sample
 * @sample_len_ns: duration of the sample, in nanoseconds
 *
 * Keeps a per-CPU decaying average of the sample durations. Once that
 * average exceeds perf_sample_allowed_ns, the allowed length is raised, the
 * maximum sample rate is lowered to fit the CPU-time budget, and a message
 * is emitted.
 */
void perf_sample_event_took(u64 sample_len_ns)
{
        u64 max_len = READ_ONCE(perf_sample_allowed_ns);
        u64 running_len;
        u64 avg_len;
        u32 max;

        /* Zero means dynamic throttling is disabled. */
        if (max_len == 0)
                return;

        /* Decay the counter by 1 average sample. */
        running_len = __this_cpu_read(running_sample_length);
        running_len -= running_len/NR_ACCUMULATED_SAMPLES;
        running_len += sample_len_ns;
        __this_cpu_write(running_sample_length, running_len);

        /*
         * Note: this will be biased artificially low until we have
         * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
         * from having to maintain a count.
         */
        avg_len = running_len/NR_ACCUMULATED_SAMPLES;
        if (avg_len <= max_len)
                return;

        /* Remember the offending values for the message below. */
        __report_avg = avg_len;
        __report_allowed = max_len;

        /*
         * Raise the allowed sample length to 25% above the measured average
         * and work out how many samples of that length fit into the
         * per-tick CPU-time budget (at least one).
         */
        avg_len += avg_len / 4;
        max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
        if (avg_len < max)
                max /= (u32)avg_len;
        else
                max = 1;

        WRITE_ONCE(perf_sample_allowed_ns, avg_len);
        WRITE_ONCE(max_samples_per_tick, max);

        sysctl_perf_event_sample_rate = max * HZ;
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

        /*
         * Report through irq_work; if queueing returns false, print directly.
         * __report_avg and __report_allowed are u64, hence %llu.
         */
        if (!irq_work_queue(&perf_duration_work)) {
                early_printk("perf: interrupt took too long (%llu > %llu), lowering "
                             "kernel.perf_event_max_sample_rate to %d\n",
                             __report_avg, __report_allowed,
                             sysctl_perf_event_sample_rate);
        }
}

/* NOTE(review): presumably the source of unique event ids; its users are outside this chunk. */
static atomic64_t perf_event_id;

/* Forward declarations of helpers defined later in the file. */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
                              enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
                             struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

/* Default no-op; __weak lets another definition take its place at link time. */
void __weak perf_event_print_debug(void)        { }

/*
 * Default PMU name: returns the string "pmu". Declared __weak so that
 * another definition can take its place at link time.
 */
extern __weak const char *perf_pmu_name(void)
{
        return "pmu";
}

/* Time base used by perf itself: a thin wrapper around local_clock(). */
static inline u64 perf_clock(void)
{
        return local_clock();
}

/* Read the clock selected for @event by calling its ->clock() callback. */
static inline u64 perf_event_clock(struct perf_event *event)
{
        return event->clock();
}

/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly requested
 * (read).
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */

/*
 * Effective state of @event once its group is taken into account: a group
 * leader whose state is PERF_EVENT_STATE_OFF or lower overrides the state of
 * every member; otherwise the event's own state applies.
 */
static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;

        return (leader->state <= PERF_EVENT_STATE_OFF) ? leader->state
                                                       : event->state;
}

/*
 * Compute what @event's enabled/running totals would be at time @now.
 *
 * The time elapsed since event->tstamp is added to *@enabled when the
 * effective state is at least INACTIVE, and to *@running when it is at
 * least ACTIVE.  @event itself is not modified unless the output pointers
 * alias its fields.
 */
static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
        const enum perf_event_state eff = __perf_effective_state(event);
        const u64 elapsed = now - event->tstamp;

        /* Keep the enabled store ahead of the running load, as callers may alias. */
        *enabled = event->total_time_enabled +
                   (eff >= PERF_EVENT_STATE_INACTIVE ? elapsed : 0);

        *running = event->total_time_running +
                   (eff >= PERF_EVENT_STATE_ACTIVE ? elapsed : 0);
}

/*
 * Fold the time elapsed since the last update into @event's enabled/running
 * totals and move event->tstamp forward to the current event time.
 */
static void perf_event_update_time(struct perf_event *event)
{
        const u64 stamp = perf_event_time(event);

        __perf_update_times(event, stamp,
                            &event->total_time_enabled,
                            &event->total_time_running);

        event->tstamp = stamp;
}

static void perf_event_update_sibling_time(struct perf_event *leader)
{
        struct perf_event *sibling;

        for_each_sibling_event(sibling, leader)
                perf_event_update_time(sibling);
}

static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
        if (event->state == state)
                return;

        perf_event_update_time(event);
        /*
         * If a group leader gets enabled/disabled all its siblings
         * are affected too.
         */
        if ((event->state < 0) ^ (state < 0))
                perf_event_update_sibling_time(event);

        WRITE_ONCE(event->state, state);
}

/*
 * UP store-release, load-acquire
 *
 * __store_release(ptr, val): issue a compiler barrier() and then store @val
 * to *@ptr with WRITE_ONCE(), so the compiler cannot move earlier accesses
 * past the store.  Only barrier() is used here; nothing else orders the
 * store.
 */

#define __store_release(ptr, val)                                        \
do {                                                                        \
        barrier();                                                        \
        WRITE_ONCE(*(ptr), (val));                                        \
} while (0)

/*
 * __load_acquire(ptr): load *@ptr with READ_ONCE() and then issue a compiler
 * barrier(), so the compiler cannot move later accesses ahead of the load.
 * Evaluates to the loaded value.  Counterpart of __store_release().
 */
#define __load_acquire(ptr)                                                \
({                                                                        \
        __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));        \
        barrier();                                                        \
        ___p;                                                                \
})

#ifdef CONFIG_CGROUP_PERF

/*
 * Does @event's cgroup filter match the cgroup currently associated with
 * its CPU context?
 *
 * Returns true when the event has no cgroup, false when the event wants a
 * cgroup but the CPU context has none, and otherwise whether the CPU
 * context's cgroup is the event's cgroup or one of its descendants.
 */
static inline bool
perf_cgroup_match(struct perf_event *event)
{
        struct perf_cpu_context *cpuctx = __get_cpu_context(event->ctx);

        /* An event without a cgroup is not filtered at all. */
        if (!event->cgrp)
                return true;

        /* The event is scoped to a cgroup but @cpuctx has none set. */
        if (!cpuctx->cgrp)
                return false;

        /*
         * Scoping is recursive: an event on a cgroup also covers every
         * descendant cgroup.  cgroup_is_descendant() handles the identity
         * case as well.
         */
        return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
                                    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
        css_put(&event->cgrp->css);
        event->cgrp = NULL;
}

/* Returns non-zero when @event is attached to a cgroup (event->cgrp set). */
static inline int is_cgroup_event(struct perf_event *event)
{
        return event->cgrp != NULL;
}

/*
 * Last recorded cgroup time for @event: the ->time field of the event's
 * cgroup info on the event's CPU.  The caller must ensure event->cgrp is set.
 */
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return per_cpu_ptr(event->cgrp->info, event->cpu)->time;
}

/*
 * Cgroup time for @event as of clock value @now.
 *
 * While the cgroup info on the event's CPU is marked active the result is
 * @now plus the published timeoffset; otherwise the last recorded ->time is
 * returned unchanged.
 */
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_cgroup_info *info = per_cpu_ptr(event->cgrp->info, event->cpu);

        if (__load_acquire(&info->active))
                return now + READ_ONCE(info->timeoffset);

        return info->time;
}

/*
 * Refresh the time bookkeeping in @info for clock value @now.
 *
 * When @adv is true the interval since the previous timestamp is added to
 * info->time; when false only the timestamp is reset to @now.  In both cases
 * info->timeoffset is republished as (time - timestamp).
 */
static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
{
        if (adv)
                info->time += now - info->timestamp;
        info->timestamp = now;
        /*
         * see update_context_time()
         */
        WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}

/*
 * Advance the cgroup time of @cpuctx's current cgroup and of all its
 * ancestors on this CPU, using a single clock sample for the whole chain.
 *
 * When @final is true each cgroup info is also marked inactive afterwards.
 * Does nothing if @cpuctx has no cgroup set.
 */
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
        struct perf_cgroup *cgrp = cpuctx->cgrp;
        struct cgroup_subsys_state *css;
        u64 now;

        if (!cgrp)
                return;

        now = perf_clock();

        css = &cgrp->css;
        while (css) {
                struct perf_cgroup_info *info;

                cgrp = container_of(css, struct perf_cgroup, css);
                info = this_cpu_ptr(cgrp->info);

                __update_cgrp_time(info, now, true);
                if (final)
                        __store_release(&info->active, 0);

                css = css->parent;
        }
}

/*
 * Advance the cgroup time of @event's cgroup on this CPU, but only if the
 * current task's cgroup is the event's cgroup or a descendant of it.
 * Non-cgroup events are ignored.
 */
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
        struct perf_cgroup *cur_cgrp;
        struct perf_cgroup_info *info;

        /*
         * Only touch cgroup data when it is needed and the cgroup is known
         * to be pinned (css_get).
         */
        if (!is_cgroup_event(event))
                return;

        cur_cgrp = perf_cgroup_from_task(current, event->ctx);

        /* The event's cgroup is not active for the current task: no update. */
        if (!cgroup_is_descendant(cur_cgrp->css.cgroup, event->cgrp->css.cgroup))
                return;

        info = this_cpu_ptr(event->cgrp->info);
        __update_cgrp_time(info, perf_clock(), true);
}

/*
 * Restart cgroup timekeeping for @task's cgroup and all of its ancestors on
 * this CPU: the timestamp is reset to ctx->timestamp without advancing the
 * accumulated time, and each cgroup info is marked active.
 *
 * Does nothing when @task is NULL or @ctx has no cgroup events.  The caller
 * holds ctx->lock.
 */
static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
        struct cgroup_subsys_state *css;
        struct perf_cgroup *cgrp;

        /*
         * Do not touch cgroup data unless the cgroup is pinned (css_get).
         */
        if (!task || !ctx->nr_cgroups)
                return;

        cgrp = perf_cgroup_from_task(task, ctx);

        css = &cgrp->css;
        while (css) {
                struct perf_cgroup_info *info;

                cgrp = container_of(css, struct perf_cgroup, css);
                info = this_cpu_ptr(cgrp->info);

                __update_cgrp_time(info, ctx->timestamp, false);
                __store_release(&info->active, 1);

                css = css->parent;
        }
}

/*
 * Per-CPU list of perf_cpu_contexts that currently have cgroup events;
 * entries are linked through perf_cpu_context::cgrp_cpuctx_entry.
 */
static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

/* Mode bits for perf_cgroup_switch(); they may be OR-ed together. */
#define PERF_CGROUP_SWOUT        0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 *
 * Walks every perf_cpu_context on this CPU's cgrp_cpuctx_list with
 * interrupts disabled.  For each one the context lock is taken and the PMU
 * is disabled around the switch.  @mode is a mask of PERF_CGROUP_SWOUT
 * and/or PERF_CGROUP_SWIN; when both are set, the switch-out is done first.
 */
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
        struct perf_cpu_context *cpuctx, *tmp;
        struct list_head *list;
        unsigned long flags;

        /*
         * Disable interrupts and preemption to avoid this CPU's
         * cgrp_cpuctx_entry to change under us.
         */
        local_irq_save(flags);

        list = this_cpu_ptr(&cgrp_cpuctx_list);
        list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
                /* Only contexts with cgroup events belong on this list. */
                WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

                perf_ctx_lock(cpuctx, cpuctx->task_ctx);
                perf_pmu_disable(cpuctx->ctx.pmu);

                if (mode & PERF_CGROUP_SWOUT) {
                        cpu_ctx_sched_out(cpuctx, EVENT_ALL);
                        /*
                         * must not be done before ctxswout due
                         * to event_filter_match() in event_sched_out()
                         */
                        cpuctx->cgrp = NULL;
                }

                if (mode & PERF_CGROUP_SWIN) {
                        /* A switch-in expects the previous cgroup to be gone. */
                        WARN_ON_ONCE(cpuctx->cgrp);
                        /*
                         * set cgrp before ctxsw in to allow
                         * event_filter_match() to not have to pass
                         * task around
                         * we pass the cpuctx->ctx to perf_cgroup_from_task()
                         * because cgroup events are only per-cpu
                         */
                        cpuctx->cgrp = perf_cgroup_from_task(task,
                                                             &cpuctx->ctx);
                        cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                }
                perf_pmu_enable(cpuctx->ctx.pmu);
                perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
        }

        local_irq_restore(flags);
}

/*
 * Switch-out side of cgroup event handling: when @task and @next belong to
 * different perf cgroups, schedule out the cgroup events on this CPU.
 */
static inline void perf_cgroup_sched_out(struct task_struct *task,
                                         struct task_struct *next)
{
        struct perf_cgroup *prev_cgrp, *next_cgrp;

        /*
         * We only get here when perf_cgroup_events > 0.  No ctx needs to be
         * passed to perf_cgroup_from_task() because the RCU read lock is
         * held across both lookups.
         */
        rcu_read_lock();

        prev_cgrp = perf_cgroup_from_task(task, NULL);
        next_cgrp = perf_cgroup_from_task(next, NULL);

        /*
         * Leave the cgroup events untouched unless we really are moving to
         * a different cgroup.
         */
        if (prev_cgrp != next_cgrp)
                perf_cgroup_switch(task, PERF_CGROUP_SWOUT);

        rcu_read_unlock();
}

/*
 * Switch-in side of cgroup event handling: when @task (incoming) and @prev
 * belong to different perf cgroups, schedule in the cgroup events that
 * apply to @task on this CPU.
 */
static inline void perf_cgroup_sched_in(struct task_struct *prev,
                                        struct task_struct *task)
{
        struct perf_cgroup *new_cgrp, *old_cgrp;

        /*
         * We only get here when perf_cgroup_events > 0.  No ctx needs to be
         * passed to perf_cgroup_from_task() because the RCU read lock is
         * held across both lookups.
         */
        rcu_read_lock();

        new_cgrp = perf_cgroup_from_task(task, NULL);
        old_cgrp = perf_cgroup_from_task(prev, NULL);

        /*
         * If the cgroup did not change across the context switch, nothing
         * was scheduled out on the way out, so nothing needs scheduling in.
         */
        if (new_cgrp != old_cgrp)
                perf_cgroup_switch(task, PERF_CGROUP_SWIN);

        rcu_read_unlock();
}

/*
 * Make sure the perf_cpu_context of @event's PMU on every possible CPU has
 * a heap array with enough slots for @css's nesting depth.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.  On failure the
 * loop stops; CPUs handled before the failure keep their enlarged array.
 */
static int perf_cgroup_ensure_storage(struct perf_event *event,
                                struct cgroup_subsys_state *css)
{
        struct perf_cpu_context *cpuctx;
        struct perf_event **storage;
        int cpu, heap_size, ret = 0;

        /*
         * Allow storage to have sufficient space for an iterator for each
         * possibly nested cgroup plus an iterator for events with no cgroup.
         */
        for (heap_size = 1; css; css = css->parent)
                heap_size++;

        for_each_possible_cpu(cpu) {
                cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
                /* Unlocked check; repeated under ctx.lock before installing. */
                if (heap_size <= cpuctx->heap_size)
                        continue;

                /* Allocate outside the lock, on the memory node of @cpu. */
                storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
                                       GFP_KERNEL, cpu_to_node(cpu));
                if (!storage) {
                        ret = -ENOMEM;
                        break;
                }

                raw_spin_lock_irq(&cpuctx->ctx.lock);
                if (cpuctx->heap_size < heap_size) {
                        /* Install the new array; @storage now holds the old one. */
                        swap(cpuctx->heap, storage);
                        /* The default heap must not be passed to kfree() below. */
                        if (storage == cpuctx->heap_default)
                                storage = NULL;
                        cpuctx->heap_size = heap_size;
                }
                raw_spin_unlock_irq(&cpuctx->ctx.lock);

                /*
                 * Frees the replaced array, or the fresh allocation if the
                 * heap turned out to be large enough under the lock.
                 */
                kfree(storage);
        }

        return ret;
}

/*
 * Attach @event to the perf cgroup identified by the cgroup directory file
 * descriptor @fd.
 *
 * @fd:           file descriptor of a cgroup directory
 * @event:        event to attach; event->cgrp is set on success
 * @attr:         unused here
 * @group_leader: optional group leader; it must monitor the same cgroup
 *
 * Returns 0 on success, -EBADF if @fd is not an open file, the error
 * reported by css_tryget_online_from_dir() or perf_cgroup_ensure_storage(),
 * or -EINVAL if @group_leader monitors a different cgroup.
 *
 * On success the css reference taken here stays with event->cgrp until
 * perf_detach_cgroup().  On every failure path the reference is dropped
 * before returning and event->cgrp is left NULL.
 */
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        struct perf_cgroup *cgrp;
        struct cgroup_subsys_state *css;
        struct fd f = fdget(fd);
        int ret = 0;

        if (!f.file)
                return -EBADF;

        css = css_tryget_online_from_dir(f.file->f_path.dentry,
                                         &perf_event_cgrp_subsys);
        if (IS_ERR(css)) {
                ret = PTR_ERR(css);
                goto out;
        }

        ret = perf_cgroup_ensure_storage(event, css);

        /*
         * Hand the css reference over to @event even if the storage step
         * failed, so that perf_detach_cgroup() below is the single place
         * that drops it on error.  Previously a storage failure returned
         * without ever releasing the reference.
         */
        cgrp = container_of(css, struct perf_cgroup, css);
        event->cgrp = cgrp;

        /*
         * all events in a group must monitor
         * the same cgroup because a task belongs
         * to only one perf cgroup at a time
         */
        if (!ret && group_leader && group_leader->cgrp != cgrp)
                ret = -EINVAL;

        if (ret)
                perf_detach_cgroup(event);
out:
        fdput(f);
        return ret;
}

/*
 * Account a cgroup event being added to @ctx.
 *
 * Bumps ctx->nr_cgroups and, for the first cgroup event, links the CPU
 * context onto the cgrp_cpuctx_list of the event's CPU.  If the context is
 * active and has no cgroup yet, cpuctx->cgrp is set to the current task's
 * cgroup, provided it matches the event's cgroup.  Non-cgroup events are
 * ignored.
 */
static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;
        struct perf_cgroup *cur_cgrp;

        if (!is_cgroup_event(event))
                return;

        /*
         * Cgroup events are always per-cpu events, hence
         * @ctx == &cpuctx->ctx.
         */
        cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

        /*
         * Setting cpuctx->cgrp depends on the current cgroup matching the
         * event's cgroup, so this has to be retried for every new event:
         * if the first one mismatched and later ones did not try again,
         * cpuctx->cgrp would stay unset.
         */
        if (ctx->is_active && !cpuctx->cgrp) {
                cur_cgrp = perf_cgroup_from_task(current, ctx);

                if (cgroup_is_descendant(cur_cgrp->css.cgroup, event->cgrp->css.cgroup))
                        cpuctx->cgrp = cur_cgrp;
        }

        /* Only the first cgroup event puts the cpuctx on the per-CPU list. */
        if (ctx->nr_cgroups++ == 0)
                list_add(&cpuctx->cgrp_cpuctx_entry,
                         per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}

/*
 * Account a cgroup event being removed from @ctx.
 *
 * Drops ctx->nr_cgroups; when the last cgroup event goes away the CPU
 * context's cgroup is cleared (if the context is active) and the CPU
 * context is unlinked from the cgroup list.  Non-cgroup events are ignored.
 */
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;

        if (!is_cgroup_event(event))
                return;

        /*
         * Cgroup events are always per-cpu events, hence
         * @ctx == &cpuctx->ctx.
         */
        cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

        /* Other cgroup events remain: nothing more to tear down. */
        if (--ctx->nr_cgroups != 0)
                return;

        if (ctx->is_active && cpuctx->cgrp)
                cpuctx->cgrp = NULL;

        list_del(&cpuctx->cgrp_cpuctx_entry);
}

#else /* !CONFIG_CGROUP_PERF */

/*
 * Stubs used when CONFIG_CGROUP_PERF is not set.  They keep the callers free
 * of #ifdefs: no event is ever a cgroup event, and all the cgroup
 * bookkeeping hooks do nothing.
 */

/* Without cgroup filtering every event matches. */
static inline bool
perf_cgroup_match(struct perf_event *event)
{
        return true;
}

/* Nothing to detach. */
static inline void perf_detach_cgroup(struct perf_event *event)
{}

/* No event is a cgroup event in this configuration. */
static inline int is_cgroup_event(struct perf_event *event)
{
        return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
                                                bool final)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
                                         struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
                                        struct task_struct *task)
{
}

/*
 * Attaching to a cgroup always fails with -EINVAL here.
 * NOTE(review): the first parameter is declared "pid_t pid" while the
 * CONFIG_CGROUP_PERF version takes "int fd" -- confirm whether the
 * signatures should be aligned.
 */
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
}

/*
 * NOTE(review): the second parameter is a task pointer here but "int mode"
 * in the CONFIG_CGROUP_PERF version -- confirm against the callers.
 */
static inline void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

/* Cgroup time is always reported as 0. */
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return 0;
}

/* Cgroup time is always reported as 0, regardless of @now. */
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        return 0;
}

static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}

static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
#endif

/*
 * set default to be dependent on timer tick just
 * like original code
 *
 * The value is 1000 / HZ and serves as the fallback for
 * pmu->hrtimer_interval_ms, which is a count of milliseconds, in
 * __perf_mux_hrtimer_init().
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
        struct perf_cpu_context *cpuctx;
        bool rotations;

        lockdep_assert_irqs_disabled();

        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
        rotations = perf_rotate_context(cpuctx);

        raw_spin_lock(&cpuctx->hrtimer_lock);
        if (rotations)
                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
        else
                cpuctx->hrtimer_active = 0;
        raw_spin_unlock(&cpuctx->hrtimer_lock);

        return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

/*
 * Set up the multiplexing hrtimer of @cpuctx.
 *
 * Software PMUs are skipped since they need no multiplexing.  A PMU whose
 * hrtimer_interval_ms is below 1 gets the PERF_CPU_HRTIMER default written
 * back into it.  @cpu is not used by this function.
 */
static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
        struct pmu *pmu = cpuctx->ctx.pmu;
        struct hrtimer *hrt;
        u64 interval_ms;

        /* no multiplexing needed for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
                return;

        /*
         * Sanity-check the configured interval; if it is unset, force the
         * default of one tick.
         */
        interval_ms = pmu->hrtimer_interval_ms;
        if (interval_ms < 1)
                interval_ms = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

        cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval_ms);

        raw_spin_lock_init(&cpuctx->hrtimer_lock);

        hrt = &cpuctx->hrtimer;
        hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
        hrt->function = perf_mux_hrtimer_handler;
}

/*
 * Arm the multiplexing hrtimer of @cpuctx unless it is already active.
 *
 * Software PMUs have no such timer and are skipped.  Always returns 0.
 */
static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
        struct hrtimer *hrt = &cpuctx->hrtimer;
        unsigned long flags;

        /* not for SW PMU */
        if (cpuctx->ctx.pmu->task_ctx_nr == perf_sw_context)
                return 0;

        raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
        if (!cpuctx->hrtimer_active) {
                /* Mark active first, then program the next expiry. */
                cpuctx->hrtimer_active = 1;
                hrtimer_forward_now(hrt, cpuctx->hrtimer_interval);
                hrtimer_start_expires(hrt, HRTIMER_MODE_ABS_PINNED_HARD);
        }
        raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);

        return 0;
}

/*
 * Adapter with a "void *" argument around perf_mux_hrtimer_restart();
 * @arg is the struct perf_cpu_context to restart the timer for.
 * NOTE(review): by its name this is meant as a cross-CPU call callback;
 * the call site is outside this part of the file.
 */
static int perf_mux_hrtimer_restart_ipi(void *arg)
{
        return perf_mux_hrtimer_restart(arg);
}

/*
 * Nestable PMU disable: bumps this CPU's disable count for @pmu and calls
 * pmu->pmu_disable() only on the 0 -> 1 transition.
 */
void perf_pmu_disable(struct pmu *pmu)
{
        int *depth = this_cpu_ptr(pmu->pmu_disable_count);
        int previous = (*depth)++;

        if (previous == 0)
                pmu->pmu_disable(pmu);
}

/*
 * Counterpart of perf_pmu_disable(): drops this CPU's disable count for
 * @pmu and calls pmu->pmu_enable() only when the count reaches zero.
 */
void perf_pmu_enable(struct pmu *pmu)
{
        int *depth = this_cpu_ptr(pmu->pmu_disable_count);

        if (--(*depth) == 0)
                pmu->pmu_enable(pmu);
}

/*
 * Per-CPU list of active perf_event_contexts, linked through
 * perf_event_context::active_ctx_list.
 */
static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized: they are strictly CPU
 * affine, the activate/deactivate pair runs with IRQs disabled, and
 * perf_event_task_tick() runs from IRQ context.
 *
 * Put @ctx on this CPU's list of active contexts.  Warns if @ctx is already
 * linked on a list.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
        struct list_head *cpu_active = this_cpu_ptr(&active_ctx_list);

        lockdep_assert_irqs_disabled();

        WARN_ON(!list_empty(&ctx->active_ctx_list));

        list_add(&ctx->active_ctx_list, cpu_active);
}

/*
 * Take @ctx off the active-context list it is linked on and reinitialise
 * its list node.  Must be called with IRQs disabled; warns if @ctx is not
 * currently linked.
 */
static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
        lockdep_assert_irqs_disabled();

        WARN_ON(list_empty(&ctx->active_ctx_list));

        list_del_init(&ctx->active_ctx_list);
}

/* Take an additional reference on @ctx; paired with put_ctx(). */
static void get_ctx(struct perf_event_context *ctx)
{
        refcount_inc(&ctx->refcount);
}

/*
 * Allocate zeroed per-task context data from @pmu's task_ctx_cache.
 *
 * Returns NULL if the PMU has no such cache or if the allocation fails.
 */
static void *alloc_task_ctx_data(struct pmu *pmu)
{
        if (!pmu->task_ctx_cache)
                return NULL;

        return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
}

/*
 * Return @task_ctx_data to @pmu's task_ctx_cache.  A NULL pointer or a PMU
 * without a cache is silently ignored.
 */
static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
        if (!pmu->task_ctx_cache || !task_ctx_data)
                return;

        kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}

/*
 * RCU callback that frees a perf_event_context together with its per-task
 * PMU data.  @head is the rcu_head embedded in the context.
 */
static void free_ctx(struct rcu_head *head)
{
        struct perf_event_context *ctx =
                container_of(head, struct perf_event_context, rcu_head);

        free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
        kfree(ctx);
}

/*
 * Drop a reference on @ctx.
 *
 * When the last reference goes away, the references held on the parent
 * context and on the owning task (unless it is TASK_TOMBSTONE) are released
 * and the context is freed via call_rcu().
 */
static void put_ctx(struct perf_event_context *ctx)
{
        if (!refcount_dec_and_test(&ctx->refcount))
                return;

        if (ctx->parent_ctx)
                put_ctx(ctx->parent_ctx);

        if (ctx->task && ctx->task != TASK_TOMBSTONE)
                put_task_struct(ctx->task);

        call_rcu(&ctx->rcu_head, free_ctx);
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There are two other sites where
 * perf_event_context::mutex nests and those are:
 *
 *  - perf_event_exit_task_context()        [ child , 0 ]
 *      perf_event_exit_event()
 *        put_event()                        [ parent, 1 ]
 *
 *  - perf_event_init_context()                [ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_init_event()
 *                perf_try_init_event()        [ child , 1 ]
 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
 *
 * But remember that these are parent<->child context relations, and
 * migration does not affect children, therefore these two orderings should not
 * interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to affect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location. This
 * means that only external vectors (perf_fops, prctl) can perturb the event
 * while in transit. Therefore all such accessors should also acquire
 * perf_event_context::mutex to serialize against this.
 *
 * However; because event->ctx can change while we're waiting to acquire
 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 * function.
 *
 * Lock order:
 *    exec_update_lock
 *        task_struct::perf_event_mutex
 *          perf_event_context::mutex
 *            perf_event::child_mutex;
 *              perf_event_context::lock
 *            perf_event::mmap_mutex
 *            mmap_lock
 *              perf_addr_filters_head::lock
 *
 *    cpu_hotplug_lock
 *      pmus_lock
 *          cpuctx->mutex / perf_event_context::mutex
 */
/*
 * Lock the context @event currently belongs to and return it.
 *
 * Because event->ctx may change while we sleep on the mutex, the context is
 * first pinned with a reference, then its mutex is taken with lockdep
 * subclass @nesting, and finally event->ctx is re-checked; on a mismatch
 * everything is undone and the sequence is retried.
 *
 * The returned context has ctx->mutex held and one extra reference; release
 * both with perf_event_ctx_unlock().
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
        struct perf_event_context *ctx;

again:
        rcu_read_lock();
        ctx = READ_ONCE(event->ctx);
        /* The refcount already hit zero: this ctx is going away, look again. */
        if (!refcount_inc_not_zero(&ctx->refcount)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        mutex_lock_nested(&ctx->mutex, nesting);
        /* The event moved to another context while we waited for the mutex. */
        if (event->ctx != ctx) {
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);
                goto again;
        }

        return ctx;
}

/*
 * Lock @event's current context with lockdep nesting level 0; see
 * perf_event_ctx_lock_nested().
 */
static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
        return perf_event_ctx_lock_nested(event, 0);
}

/*
 * Undo perf_event_ctx_lock(): release @ctx's mutex and drop the reference
 * that was taken when locking.  @event is not used.
 */
static void perf_event_ctx_unlock(struct perf_event *event,
                                  struct perf_event_context *ctx)
{
        mutex_unlock(&ctx->mutex);
        put_ctx(ctx);
}

/*
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 *
 * Detaches @ctx from its parent context (if any) and bumps ctx->generation.
 * Returns the former parent, or NULL if there was none.  The reference on
 * the parent is NOT dropped here; the caller must do so once ctx->lock has
 * been released, hence __must_check.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
        struct perf_event_context *parent_ctx = ctx->parent_ctx;

        lockdep_assert_held(&ctx->lock);

        if (parent_ctx)
                ctx->parent_ctx = NULL;
        /* Incremented even when there was no parent to detach. */
        ctx->generation++;

        return parent_ctx;
}

/*
 * Id of kind @type for task @p, translated into the pid namespace that
 * @event was created in.
 *
 * Only top-level events carry the namespace, so an inherited event defers
 * to its parent.  A zero id is returned as-is while the task is alive (for
 * instance the idle thread or a task in another namespace); for a task that
 * is no longer alive a zero id is reported as -1 instead.
 */
static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
                                enum pid_type type)
{
        struct perf_event *top = event->parent ? event->parent : event;
        u32 nr = __task_pid_nr_ns(p, type, top->ns);

        if (nr == 0 && !pid_alive(p))
                nr = -1;

        return nr;
}

/*
 * Thread-group id (PIDTYPE_TGID) of @p as seen from @event's pid namespace;
 * see perf_event_pid_type().
 */
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_TGID);
}

/*
 * Thread id (PIDTYPE_PID) of @p as seen from @event's pid namespace; see
 * perf_event_pid_type().
 */
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_PID);
}

/*
 * Id to report to userspace for @event: an inherited event reports the id
 * of its parent, any other event reports its own.
 */
static u64 primary_event_id(struct perf_event *event)
{
        return event->parent ? event->parent->id : event->id;
}

/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 *
 * @task:  task whose context is wanted
 * @ctxn:  index into task->perf_event_ctxp[]
 * @flags: where the saved interrupt state is stored
 *
 * On success the context is returned with ctx->lock held, interrupts
 * disabled (state saved in *@flags) and an extra reference taken.  Returns
 * NULL, with interrupts restored, if the task has no such context, the
 * context belongs to a dead task (TASK_TOMBSTONE) or its refcount already
 * dropped to zero.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
        struct perf_event_context *ctx;

retry:
        /*
         * One of the few rules of preemptible RCU is that one cannot do
         * rcu_read_unlock() while holding a scheduler (or nested) lock when
         * part of the read side critical section was irqs-enabled -- see
         * rcu_read_unlock_special().
         *
         * Since ctx->lock nests under rq->lock we must ensure the entire read
         * side critical section has interrupts disabled.
         */
        local_irq_save(*flags);
        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * get swapped for another underneath us by
                 * perf_event_task_sched_out, though the
                 * rcu_read_lock() protects us from any context
                 * getting freed.  Lock the context and check if it
                 * got swapped before we could get the lock, and retry
                 * if so.  If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
                raw_spin_lock(&ctx->lock);
                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                        raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
                        local_irq_restore(*flags);
                        goto retry;
                }

                /* Refuse contexts that are dead or already on their way out. */
                if (ctx->task == TASK_TOMBSTONE ||
                    !refcount_inc_not_zero(&ctx->refcount)) {
                        raw_spin_unlock(&ctx->lock);
                        ctx = NULL;
                } else {
                        WARN_ON_ONCE(ctx->task != task);
                }
        }
        rcu_read_unlock();
        /* On failure nothing is held, so re-enable interrupts here. */
        if (!ctx)
                local_irq_restore(*flags);
        return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
        struct perf_event_context *ctx;
        unsigned long flags;

        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
}

/*
 * Undo the pin_count increment done by perf_pin_task_context(), under
 * ctx->lock.  The context reference is not dropped here.
 */
static void perf_unpin_context(struct perf_event_context *ctx)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        --ctx->pin_count;
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 *
 * When @adv is true the interval since ctx->timestamp is added to ctx->time;
 * when false only the timestamp is reset to the current clock.  Either way
 * ctx->timeoffset is republished for lockless readers.
 */
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
        u64 now = perf_clock();

        if (adv)
                ctx->time += now - ctx->timestamp;
        ctx->timestamp = now;

        /*
         * The above: time' = time + (now - timestamp), can be re-arranged
         * into: time` = now + (time - timestamp), which gives a single value
         * offset to compute future time without locks on.
         *
         * See perf_event_time_now(), which can be used from NMI context where
         * it's (obviously) not possible to acquire ctx->lock in order to read
         * both the above values in a consistent manner.
         */
        WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
}

/* Advance @ctx's accumulated time up to the current perf clock. */
static void update_context_time(struct perf_event_context *ctx)
{
        __update_context_time(ctx, true);
}

/*
 * Last recorded time for @event: the cgroup time for cgroup events, the
 * context time otherwise.  Returns 0 for an event without a context.
 * No clock is sampled here; the caller is responsible for having updated
 * the relevant time beforehand.
 */
static u64 perf_event_time(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        return is_cgroup_event(event) ? perf_cgroup_event_time(event)
                                      : ctx->time;
}

/*
 * Time for @event as of clock value @now, computed without taking locks.
 *
 * Cgroup events defer to perf_cgroup_event_time_now().  For other events,
 * while the context has EVENT_TIME set in is_active, the result is @now
 * plus the published timeoffset; otherwise the last recorded ctx->time is
 * returned.  Returns 0 for an event without a context.
 */
static u64 perf_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        if (is_cgroup_event(event))
                return perf_cgroup_event_time_now(event, now);

        if (__load_acquire(&ctx->is_active) & EVENT_TIME)
                return now + READ_ONCE(ctx->timeoffset);

        return ctx->time;
}

static enum event_type_t get_event_type(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        enum event_type_t event_type;

        lockdep_assert_held(&ctx->lock);

        /*
         * It's 'group type', really, because if our group leader is
         * pinned, so are we.
         */
        if (event->group_leader != event)
                event = event->group_leader;

        event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
        if (!ctx->task)
                event_type |= EVENT_CPU;

        return event_type;
}

/*
 * Helper function to initialize event group nodes.
 *
 * Marks @event's rb-tree node as not linked into any tree and resets its
 * group_index to 0.
 */
static void init_event_group(struct perf_event *event)
{
        RB_CLEAR_NODE(&event->group_node);
        event->group_index = 0;
}

/*
 * Pick the group tree of @ctx that @event belongs to: the pinned tree when
 * attr.pinned is set, the flexible tree otherwise.
 */
static struct perf_event_groups *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        return event->attr.pinned ? &ctx->pinned_groups
                                  : &ctx->flexible_groups;
}

/*
 * Helper function to initialize a perf_event_groups tree: the tree is set
 * to an empty root and the running index to 0.
 */
static void perf_event_groups_init(struct perf_event_groups *groups)
{
        groups->tree = RB_ROOT;
        groups->index = 0;
}

/*
 * Ordering predicate for the event group trees.
 *
 * The key is compared field by field: first the CPU, then (with
 * CONFIG_CGROUP_PERF) the cgroup, and finally the group index, which orders
 * groups on the same CPU for rotation. Returns true when @left sorts
 * strictly before @right.
 */
static bool
perf_event_groups_less(struct perf_event *left, struct perf_event *right)
{
        if (left->cpu != right->cpu)
                return left->cpu < right->cpu;

#ifdef CONFIG_CGROUP_PERF
        if (left->cgrp != right->cgrp) {
                /* Events without a cgroup sort ahead of events with one. */
                if (!left->cgrp || !left->cgrp->css.cgroup)
                        return true;

                if (!right->cgrp || !right->cgrp->css.cgroup)
                        return false;

                /* Both sides have distinct cgroups: order by cgroup id. */
                return left->cgrp->css.cgroup->kn->id <
                       right->cgrp->css.cgroup->kn->id;
        }
#endif

        return left->group_index < right->group_index;
}

/*
 * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
 * key (see perf_event_groups_less). This places it last inside the CPU
 * subtree.
 */
static void
perf_event_groups_insert(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        struct perf_event *node_event;
        struct rb_node *parent;
        struct rb_node **node;

        event->group_index = ++groups->index;

        node = &groups->tree.rb_node;
        parent = *node;

        while (*node) {
                parent = *node;
                node_event = container_of(*node, struct perf_event, group_node);

                if (perf_event_groups_less(event, node_event))
                        node = &parent->rb_left;
                else
                        node = &parent->rb_right;
        }

        rb_link_node(&event->group_node, parent, node);
        rb_insert_color(&event->group_node, &groups->tree);
}

/*
 * Insert @event into whichever of @ctx's group trees (pinned or flexible)
 * get_event_groups() selects for it.
 */
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_insert(get_event_groups(event, ctx), event);
}

/*
 * Delete a group from a tree.
 *
 * Warns (once) if @event's node is not linked or if @groups' tree is empty;
 * the erase is attempted regardless. Afterwards the event's group
 * bookkeeping is reset via init_event_group().
 */
static void
perf_event_groups_delete(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
                     RB_EMPTY_ROOT(&groups->tree));

        rb_erase(&event->group_node, &groups->tree);
        init_event_group(event);
}

/*
 * Remove @event from whichever of @ctx's group trees (pinned or flexible)
 * get_event_groups() selects for it.
 */
static void
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_delete(get_event_groups(event, ctx), event);
}

/*
 * Find the leftmost event in @groups whose key matches @cpu and, with
 * CONFIG_CGROUP_PERF, the id of @cgrp (an id of 0 is used when @cgrp is
 * NULL). Returns NULL when no event matches.
 */
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
                        struct cgroup *cgrp)
{
        struct perf_event *cur = NULL, *leftmost = NULL;
        struct rb_node *node = groups->tree.rb_node;
#ifdef CONFIG_CGROUP_PERF
        u64 cur_cgrp_id, want_cgrp_id = 0;

        if (cgrp)
                want_cgrp_id = cgrp->kn->id;
#endif

        while (node) {
                cur = container_of(node, struct perf_event, group_node);

                if (cpu != cur->cpu) {
                        node = cpu < cur->cpu ? node->rb_left : node->rb_right;
                        continue;
                }
#ifdef CONFIG_CGROUP_PERF
                /* Events without a cgroup are keyed with id 0. */
                if (cur->cgrp && cur->cgrp->css.cgroup)
                        cur_cgrp_id = cur->cgrp->css.cgroup->kn->id;
                else
                        cur_cgrp_id = 0;

                if (want_cgrp_id != cur_cgrp_id) {
                        node = want_cgrp_id < cur_cgrp_id ? node->rb_left
                                                          : node->rb_right;
                        continue;
                }
#endif
                /* Key matches; remember it and keep looking further left. */
                leftmost = cur;
                node = node->rb_left;
        }

        return leftmost;
}

/*
 * Return the event that follows @event in tree order, or NULL when there is
 * no successor or the successor belongs to a different CPU (or, with
 * CONFIG_CGROUP_PERF, has a different cgroup id).
 */
static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
        struct perf_event *succ;
#ifdef CONFIG_CGROUP_PERF
        u64 event_id, succ_id;
#endif

        succ = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
        if (!succ || succ->cpu != event->cpu)
                return NULL;

#ifdef CONFIG_CGROUP_PERF
        /* Events without a cgroup compare as id 0. */
        event_id = (event->cgrp && event->cgrp->css.cgroup) ?
                        event->cgrp->css.cgroup->kn->id : 0;
        succ_id = (succ->cgrp && succ->cgrp->css.cgroup) ?
                        succ->cgrp->css.cgroup->kn->id : 0;

        if (event_id != succ_id)
                return NULL;
#endif
        return succ;
}

/*
 * Iterate through the whole groups tree.
 *
 * @event is the cursor (assigned each event in rb_first()/rb_next() order,
 * NULL once the walk ends); @groups is the perf_event_groups to walk.
 * The next node is fetched from the current event's group_node after the
 * loop body has run.
 */
#define perf_event_groups_for_each(event, groups)                        \
        for (event = rb_entry_safe(rb_first(&((groups)->tree)),                \
                                typeof(*event), group_node); event;        \
                event = rb_entry_safe(rb_next(&event->group_node),        \
                                typeof(*event), group_node))

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 *
 * Marks the event PERF_ATTACH_CONTEXT (warning once if it already was),
 * stamps it with the current event time and updates the context's counters.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
        lockdep_assert_held(&ctx->lock);

        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;

        event->tstamp = perf_event_time(event);

        /*
         * If we're a stand alone event or group leader, we go to the context
         * list, group events are kept attached to the group so that
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
                /* A leader's group capabilities start out as its own. */
                event->group_caps = event->event_caps;
                add_event_to_groups(event, ctx);
        }

        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;

        /* Only events in a state above OFF are reported to the cgroup code. */
        if (event->state > PERF_EVENT_STATE_OFF)
                perf_cgroup_event_enable(event, ctx);

        ctx->generation++;
}

/*
 * Set the initial state of @event from perf_event_attr::disabled: OFF when
 * the attribute is set, INACTIVE otherwise.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
        if (event->attr.disabled)
                event->state = PERF_EVENT_STATE_OFF;
        else
                event->state = PERF_EVENT_STATE_INACTIVE;
}

/*
 * Compute the number of bytes a read of an event produces for the given
 * @read_format when its group has @nr_siblings siblings.
 *
 * The result is a fixed part (optional enabled/running times and, for group
 * reads, one extra u64) plus one entry per reported event. Each entry holds
 * the value and, optionally, the id and the lost count. Without
 * PERF_FORMAT_GROUP only a single entry is counted.
 */
static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
        const int word = sizeof(u64);
        int per_event = word;        /* the counter value itself */
        int fixed = 0;
        int count = 1;

        if (read_format & PERF_FORMAT_GROUP) {
                count += nr_siblings;
                fixed += word;
        }

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                fixed += word;

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                fixed += word;

        if (read_format & PERF_FORMAT_ID)
                per_event += word;

        if (read_format & PERF_FORMAT_LOST)
                per_event += word;

        /*
         * perf_event_validate_size() caps this at 16k and refuses further
         * siblings beyond that, so the arithmetic cannot overflow.
         */
        return fixed + count * per_event;
}

/*
 * Sum the sizes of the perf_sample_data fields selected in @sample_type
 * (using event->read_size for PERF_SAMPLE_READ) and store the total in
 * event->header_size.
 */
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
        /* Never dereferenced; only used as an operand of sizeof. */
        struct perf_sample_data *data;
        u16 size = 0;

        if (sample_type & PERF_SAMPLE_IP)
                size += sizeof(data->ip);

        if (sample_type & PERF_SAMPLE_ADDR)
                size += sizeof(data->addr);

        if (sample_type & PERF_SAMPLE_PERIOD)
                size += sizeof(data->period);

        if (sample_type & PERF_SAMPLE_WEIGHT)
                size += sizeof(data->weight);

        if (sample_type & PERF_SAMPLE_READ)
                size += event->read_size;

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                size += sizeof(data->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                size += sizeof(data->txn);

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                size += sizeof(data->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                size += sizeof(data->cgroup);

        event->header_size = size;
}

/*
 * Recompute @event's read_size and header_size. Used at perf_event creation
 * and whenever events are attached to or detached from a group, because
 * read_size depends on the leader's sibling count.
 */
static void perf_event__header_size(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;

        event->read_size = __perf_event_read_size(event->attr.read_format,
                                                  leader->nr_siblings);
        __perf_event_header_size(event, event->attr.sample_type);
}

/*
 * Sum the sizes of the identification fields of perf_sample_data that the
 * event's attr.sample_type selects and store the total in
 * event->id_header_size.
 */
static void perf_event__id_header_size(struct perf_event *event)
{
        /* Never dereferenced; only used as an operand of sizeof. */
        struct perf_sample_data *data;
        u64 sample_type = event->attr.sample_type;
        u16 size = 0;

        if (sample_type & PERF_SAMPLE_TID)
                size += sizeof(data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                size += sizeof(data->time);

        /* IDENTIFIER and ID each contribute their own copy of the id. */
        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                size += sizeof(data->id);

        if (sample_type & PERF_SAMPLE_ID)
                size += sizeof(data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                size += sizeof(data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                size += sizeof(data->cpu_entry);

        event->id_header_size = size;
}

/*
 * Verify that adding @event to its group keeps every member below the 64k
 * event limit imposed by the output buffer.
 *
 * Only read_size grows with the size of the group, so it is the term that
 * is checked: it must stay at or below 16k. Because read_size depends on
 * each event's own read_format, the new event, the leader and all existing
 * siblings are checked against the prospective sibling count.
 *
 * The remaining 48k are left for the constant size fields and things like
 * callchains, branch stacks and register sets.
 *
 * Returns true when the group is still within the limit.
 */
static bool perf_event_validate_size(struct perf_event *event)
{
        struct perf_event *sibling, *group_leader = event->group_leader;
        const int limit = 16*1024;

        if (__perf_event_read_size(event->attr.read_format,
                                   group_leader->nr_siblings + 1) > limit ||
            __perf_event_read_size(group_leader->attr.read_format,
                                   group_leader->nr_siblings + 1) > limit)
                return false;

        /*
         * For a brand new group leader, group_leader->ctx is only set up
         * after this validation, and for_each_sibling_event() is not safe
         * to use before then. Such a leader cannot have siblings yet, so
         * there is nothing more to check.
         */
        if (event == group_leader)
                return true;

        for_each_sibling_event(sibling, group_leader) {
                if (__perf_event_read_size(sibling->attr.read_format,
                                           group_leader->nr_siblings + 1) > limit)
                        return false;
        }

        return true;
}

/*
 * Attach @event to its group. Requires event->ctx->lock to be held.
 *
 * Sets PERF_ATTACH_GROUP on the event. For a sibling (an event that is not
 * its own group leader) it also links the event onto the leader's sibling
 * list and refreshes the header sizes of the whole group.
 */
static void perf_group_attach(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader, *pos;

        lockdep_assert_held(&event->ctx->lock);

        /*
         * We can have double attach due to group movement in perf_event_open.
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;

        event->attach_state |= PERF_ATTACH_GROUP;

        /* A leader has no sibling list entry to set up. */
        if (group_leader == event)
                return;

        WARN_ON_ONCE(group_leader->ctx != event->ctx);

        /* The group keeps only the capabilities every member has. */
        group_leader->group_caps &= event->event_caps;

        list_add_tail(&event->sibling_list, &group_leader->sibling_list);
        group_leader->nr_siblings++;
        group_leader->group_generation++;

        /* nr_siblings changed, so recompute sizes for leader and siblings. */
        perf_event__header_size(group_leader);

        for_each_sibling_event(pos, group_leader)
                perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 *
 * Clears PERF_ATTACH_CONTEXT, updates the context's counters, unlinks the
 * event from ctx->event_list and, for group leaders, from the group tree.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
                return;

        event->attach_state &= ~PERF_ATTACH_CONTEXT;

        ctx->nr_events--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;

        list_del_rcu(&event->event_entry);

        /* Only group leaders live in the pinned/flexible group trees. */
        if (event->group_leader == event)
                del_event_from_groups(event, ctx);

        /*
         * If event was in error state, then keep it
         * that way, otherwise bogus counts will be
         * returned on read(). The only way to get out
         * of error state is by explicit re-enabling
         * of the event
         */
        if (event->state > PERF_EVENT_STATE_OFF) {
                perf_cgroup_event_disable(event, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        }

        ctx->generation++;
}

/*
 * Ask @event's PMU whether @aux_event is an acceptable AUX event for it.
 * Returns 0 when @aux_event has no AUX capability or the PMU provides no
 * aux_output_match() callback; otherwise returns the callback's result.
 */
static int
perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
{
        if (!has_aux(aux_event) || !event->pmu->aux_output_match)
                return 0;

        return event->pmu->aux_output_match(aux_event);
}

static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
                            struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx);

/*
 * Tear down the AUX linkage that involves @event.
 *
 * If @event itself points at an aux_event, the link is cleared and the
 * reference taken on the AUX event by perf_get_aux_event() is dropped.
 *
 * Otherwise @event is treated as a potential AUX event: every sibling in
 * its group whose aux_event points at @event is unlinked, the matching
 * reference on @event is dropped, and that sibling is scheduled out and
 * moved to the ERROR state so it is not scheduled again.
 */
static void perf_put_aux_event(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *iter;

        /*
         * If event uses aux_event tear down the link
         */
        if (event->aux_event) {
                iter = event->aux_event;
                event->aux_event = NULL;
                put_event(iter);
                return;
        }

        /*
         * If the event is an aux_event, tear down all links to
         * it from other events.
         */
        for_each_sibling_event(iter, event->group_leader) {
                if (iter->aux_event != event)
                        continue;

                iter->aux_event = NULL;
                put_event(event);

                /*
                 * If the dependent event is ACTIVE, schedule it out, and put
                 * it into ERROR state so that we don't try to schedule it
                 * again. Note that perf_event_enable() will clear the ERROR
                 * status.
                 *
                 * The state change must target @iter, the event that lost
                 * its AUX link, not @event, which is the AUX event itself.
                 */
                event_sched_out(iter, cpuctx, ctx);
                perf_event_set_state(iter, PERF_EVENT_STATE_ERROR);
        }
}

static bool perf_need_aux_event(struct perf_event *event)
{
        return !!event->attr.aux_output || !!event->attr.aux_sample_size;
}

/*
 * Try to link @event to @group_leader as its AUX event.
 *
 * Returns 1 when the link was established (a reference on @group_leader is
 * then held on behalf of @event) and 0 when it was not: no group leader,
 * conflicting attributes, a PMU that cannot serve the request, or a leader
 * whose refcount has already dropped to zero.
 */
static int perf_get_aux_event(struct perf_event *event,
                              struct perf_event *group_leader)
{
        /*
         * Our group leader must be an aux event if we want to be
         * an aux_output. This way, the aux event will precede its
         * aux_output events in the group, and therefore will always
         * schedule first.
         */
        if (!group_leader)
                return 0;

        /*
         * aux_output and aux_sample_size are mutually exclusive.
         */
        if (event->attr.aux_output && event->attr.aux_sample_size)
                return 0;

        if (event->attr.aux_output &&
            !perf_aux_output_match(event, group_leader))
                return 0;

        /* AUX sampling needs a leader PMU that implements snapshot_aux. */
        if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
                return 0;

        /* Pin the leader; fails if its refcount already reached zero. */
        if (!atomic_long_inc_not_zero(&group_leader->refcount))
                return 0;

        /*
         * Link aux_outputs to their aux event; this is undone in
         * perf_group_detach() by perf_put_aux_event(). When the
         * group is torn down, the aux_output events lose their
         * link to the aux_event and can't schedule any more.
         */
        event->aux_event = group_leader;

        return 1;
}

/*
 * Return the active list of @event's context that matches the event's
 * pinned attribute: pinned_active or flexible_active.
 */
static inline struct list_head *get_event_list(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        if (event->attr.pinned)
                return &ctx->pinned_active;

        return &ctx->flexible_active;
}

/*
 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
 * cannot exist on their own, schedule them out and move them into the ERROR
 * state. Also see _perf_event_enable(), it will not be able to recover
 * this ERROR state.
 *
 * The CPU context is looked up from the event's own context.
 */
static inline void perf_remove_sibling_event(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

        event_sched_out(event, cpuctx, ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}

/*
 * Detach @event from its group. Requires event->ctx->lock to be held.
 *
 * A sibling is simply unlinked from its leader. When a leader is detached,
 * each of its siblings is promoted to a stand-alone event (its own group
 * leader). In both cases the AUX links involving @event are torn down first
 * and the header sizes of the affected events are recomputed at the end.
 */
static void perf_group_detach(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling, *tmp;
        struct perf_event_context *ctx = event->ctx;

        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_GROUP))
                return;

        event->attach_state &= ~PERF_ATTACH_GROUP;

        perf_put_aux_event(event);

        /*
         * If this is a sibling, remove it from its group.
         */
        if (leader != event) {
                list_del_init(&event->sibling_list);
                event->group_leader->nr_siblings--;
                event->group_leader->group_generation++;
                goto out;
        }

        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {

                /* Siblings that cannot exist alone go to ERROR state. */
                if (sibling->event_caps & PERF_EV_CAP_SIBLING)
                        perf_remove_sibling_event(sibling);

                sibling->group_leader = sibling;
                list_del_init(&sibling->sibling_list);

                /* Inherit group flags from the previous leader */
                sibling->group_caps = event->group_caps;

                /*
                 * Only insert the new singleton into the group tree if the
                 * old leader is itself still linked into it.
                 */
                if (!RB_EMPTY_NODE(&event->group_node)) {
                        add_event_to_groups(sibling, event->ctx);

                        if (sibling->state == PERF_EVENT_STATE_ACTIVE)
                                list_add_tail(&sibling->active_list, get_event_list(sibling));
                }

                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }

out:
        /*
         * Group membership changed; recompute the sizes of the original
         * leader and of whatever siblings it still has.
         */
        for_each_sibling_event(tmp, leader)
                perf_event__header_size(tmp);

        perf_event__header_size(leader);
}

/* An event counts as orphaned once its state is PERF_EVENT_STATE_DEAD. */
static bool is_orphaned_event(struct perf_event *event)
{
        return event->state == PERF_EVENT_STATE_DEAD;
}

/*
 * Apply the PMU's optional filter_match() callback to @event. A PMU without
 * the callback accepts every event (returns 1).
 */
static inline int __pmu_filter_match(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (!pmu->filter_match)
                return 1;

        return pmu->filter_match(event);
}

/*
 * Check whether we should attempt to schedule an event group based on
 * PMU-specific filtering. An event group can consist of HW and SW events,
 * potentially with a SW leader, so we must check all the filters, to
 * determine whether a group is schedulable:
 *
 * Returns 1 only when @event and every one of its siblings pass their
 * PMU's filter; 0 as soon as any of them is rejected.
 */
static inline int pmu_filter_match(struct perf_event *event)
{
        struct perf_event *sibling;

        if (!__pmu_filter_match(event))
                return 0;

        for_each_sibling_event(sibling, event) {
                if (!__pmu_filter_match(sibling))
                        return 0;
        }

        return 1;
}

/*
 * Decide whether @event may run on the current CPU: it must either be
 * unbound (cpu == -1) or bound to this CPU, match the current cgroup, and
 * pass the PMU filters of its whole group. Returns 1 on match, 0 otherwise.
 */
static inline int
event_filter_match(struct perf_event *event)
{
        if (event->cpu != -1 && event->cpu != smp_processor_id())
                return 0;

        if (!perf_cgroup_match(event))
                return 0;

        return !!pmu_filter_match(event);
}

/*
 * Take a single @event off its PMU. Requires ctx->lock to be held.
 *
 * Does nothing unless the event is ACTIVE. Otherwise the event is removed
 * from the active list and from the PMU, moved to INACTIVE (or OFF when a
 * disable was pending), and the context / CPU-context counters are updated.
 */
static void
event_sched_out(struct perf_event *event,
                  struct perf_cpu_context *cpuctx,
                  struct perf_event_context *ctx)
{
        enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;

        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        /*
         * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
         * we can schedule events _OUT_ individually through things like
         * __perf_remove_from_context().
         */
        list_del_init(&event->active_list);

        perf_pmu_disable(event->pmu);

        event->pmu->del(event, 0);
        event->oncpu = -1;

        /*
         * A non-negative pending_disable means a disable was requested (see
         * perf_event_disable_inatomic()); consume it and end up OFF instead
         * of INACTIVE.
         */
        if (READ_ONCE(event->pending_disable) >= 0) {
                WRITE_ONCE(event->pending_disable, -1);
                perf_cgroup_event_disable(event, ctx);
                state = PERF_EVENT_STATE_OFF;
        }
        perf_event_set_state(event, state);

        /* Undo the accounting done by event_sched_in(). */
        if (!is_software_event(event))
                cpuctx->active_oncpu--;
        if (!--ctx->nr_active)
                perf_event_ctx_deactivate(ctx);
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq--;
        if (event->attr.exclusive || !cpuctx->active_oncpu)
                cpuctx->exclusive = 0;

        perf_pmu_enable(event->pmu);
}

/*
 * Schedule out a whole group: the leader @group_event followed by all of
 * its siblings. Nothing is done unless the leader is ACTIVE. The context's
 * PMU is disabled around the whole operation.
 */
static void
group_sched_out(struct perf_event *group_event,
                struct perf_cpu_context *cpuctx,
                struct perf_event_context *ctx)
{
        struct perf_event *event;

        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        perf_pmu_disable(ctx->pmu);

        event_sched_out(group_event, cpuctx, ctx);

        /*
         * Schedule out siblings (if any):
         */
        for_each_sibling_event(event, group_event)
                event_sched_out(event, cpuctx, ctx);

        perf_pmu_enable(ctx->pmu);
}

/* Flag for the removal helpers: also detach the event from its group. */
#define DETACH_GROUP        0x01UL

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 *
 * @info carries the removal flags (e.g. DETACH_GROUP) cast to a pointer.
 */
static void
__perf_remove_from_context(struct perf_event *event,
                           struct perf_cpu_context *cpuctx,
                           struct perf_event_context *ctx,
                           void *info)
{
        unsigned long flags = (unsigned long)info;

        /* Bring the time accounting up to date before touching the event. */
        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_cpuctx(cpuctx, false);
        }

        event_sched_out(event, cpuctx, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        list_del_event(event, ctx);

        /* That was the last event: mark the context as no longer active. */
        if (!ctx->nr_events && ctx->is_active) {
                if (ctx == &cpuctx->ctx)
                        update_cgrp_time_from_cpuctx(cpuctx, true);

                ctx->is_active = 0;
                ctx->rotate_necessary = 0;
                if (ctx->task) {
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                        cpuctx->task_ctx = NULL;
                }
        }
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 *
 * Must be called with ctx->mutex held. @flags may contain DETACH_GROUP to
 * also detach the event from its group.
 */
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
        struct perf_event_context *ctx = event->ctx;

        lockdep_assert_held(&ctx->mutex);

        event_function_call(event, __perf_remove_from_context, (void *)flags);

        /*
         * The above event_function_call() can NO-OP when it hits
         * TASK_TOMBSTONE. In that case we must already have been detached
         * from the context (by perf_event_exit_event()) but the grouping
         * might still be intact.
         */
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        if ((flags & DETACH_GROUP) &&
            (event->attach_state & PERF_ATTACH_GROUP)) {
                /*
                 * Since in that case we cannot possibly be scheduled, simply
                 * detach now.
                 */
                raw_spin_lock_irq(&ctx->lock);
                perf_group_detach(event);
                raw_spin_unlock_irq(&ctx->lock);
        }
}

/*
 * Cross CPU call to disable a performance event
 *
 * Events already below INACTIVE are left alone. Otherwise the event (or, if
 * it is a group leader, its whole group) is scheduled out and the event is
 * moved to the OFF state. @info is unused.
 */
static void __perf_event_disable(struct perf_event *event,
                                 struct perf_cpu_context *cpuctx,
                                 struct perf_event_context *ctx,
                                 void *info)
{
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return;

        /* Bring the time accounting up to date before changing state. */
        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }

        if (event == event->group_leader)
                group_sched_out(event, cpuctx, ctx);
        else
                event_sched_out(event, cpuctx, ctx);

        perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        perf_cgroup_event_disable(event, ctx);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void _perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);
        if (event->state <= PERF_EVENT_STATE_OFF) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_disable, NULL);
}

/*
 * Disable @event by running __perf_event_disable() through
 * event_function_local() rather than event_function_call().
 */
void perf_event_disable_local(struct perf_event *event)
{
        event_function_local(event, __perf_event_disable, NULL);
}

/*
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
 *
 * Exported entry point: takes the event's context lock via
 * perf_event_ctx_lock(), disables the event, and releases the lock again.
 */
void perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx;

        ctx = perf_event_ctx_lock(event);
        _perf_event_disable(event);
        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);

/*
 * Request that @event be disabled without doing the work here: record the
 * current CPU in event->pending_disable and queue the event's irq_work.
 */
void perf_event_disable_inatomic(struct perf_event *event)
{
        WRITE_ONCE(event->pending_disable, smp_processor_id());
        /* can fail, see perf_pending_event_disable() */
        irq_work_queue(&event->pending);
}

/* Value of hw.interrupts that marks an event as throttled. */
#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);

/*
 * Put a single @event onto its PMU. Requires ctx->lock to be held.
 *
 * Returns 0 on success, or when the event is OFF (or below) and is therefore
 * skipped; returns -EAGAIN when the PMU refuses to add the event, in which
 * case the event is put back to INACTIVE.
 */
static int
event_sched_in(struct perf_event *event,
                 struct perf_cpu_context *cpuctx,
                 struct perf_event_context *ctx)
{
        int ret = 0;

        WARN_ON_ONCE(event->ctx != ctx);

        lockdep_assert_held(&ctx->lock);

        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;

        WRITE_ONCE(event->oncpu, smp_processor_id());
        /*
         * Order event::oncpu write to happen before the ACTIVE state is
         * visible. This allows perf_event_{stop,read}() to observe the correct
         * ->oncpu if it sees ACTIVE.
         */
        smp_wmb();
        perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);

        /*
         * Unthrottle events, since we scheduled we might have missed several
         * ticks already, also for a heavily scheduling task there is little
         * guarantee it'll get a tick in a timely manner.
         */
        if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
                perf_log_throttle(event, 1);
                event->hw.interrupts = 0;
        }

        perf_pmu_disable(event->pmu);

        perf_log_itrace_start(event);

        /* The PMU could not take the event: roll back state and ->oncpu. */
        if (event->pmu->add(event, PERF_EF_START)) {
                perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                event->oncpu = -1;
                ret = -EAGAIN;
                goto out;
        }

        /* Accounting mirrored by event_sched_out(). */
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
        if (!ctx->nr_active++)
                perf_event_ctx_activate(ctx);
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq++;

        if (event->attr.exclusive)
                cpuctx->exclusive = 1;

out:
        perf_pmu_enable(event->pmu);

        return ret;
}

/*
 * Schedule in a whole group (leader plus siblings) as one PMU transaction.
 *
 * Returns 0 when the group was scheduled in, or when the leader is OFF and
 * the group is skipped. Returns -EAGAIN when any member fails to schedule
 * or the transaction cannot be committed; in that case every member that
 * was already scheduled in is scheduled out again.
 */
static int
group_sched_in(struct perf_event *group_event,
               struct perf_cpu_context *cpuctx,
               struct perf_event_context *ctx)
{
        struct perf_event *event, *partial_group = NULL;
        struct pmu *pmu = ctx->pmu;

        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;

        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);

        if (event_sched_in(group_event, cpuctx, ctx))
                goto error;

        /*
         * Schedule in siblings as one group (if any):
         */
        for_each_sibling_event(event, group_event) {
                if (event_sched_in(event, cpuctx, ctx)) {
                        /* Remember where we stopped for the undo loop. */
                        partial_group = event;
                        goto group_error;
                }
        }

        /* commit_txn() returns 0 on success. */
        if (!pmu->commit_txn(pmu))
                return 0;

group_error:
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
         * The events up to the failed event are scheduled out normally.
         */
        for_each_sibling_event(event, group_event) {
                if (event == partial_group)
                        break;

                event_sched_out(event, cpuctx, ctx);
        }
        event_sched_out(group_event, cpuctx, ctx);

error:
        pmu->cancel_txn(pmu);
        return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 *
 * @event is the group leader, @can_add_hw tells whether all previously
 * considered groups could be scheduled. Returns non-zero when the group may
 * be tried.
 */
static int group_can_go_on(struct perf_event *event,
                           struct perf_cpu_context *cpuctx,
                           int can_add_hw)
{
        /*
         * Groups consisting entirely of software events can always go on.
         */
        if (event->group_caps & PERF_EV_CAP_SOFTWARE)
                return 1;
        /*
         * If an exclusive group is already on, no other hardware
         * events can go on.
         */
        if (cpuctx->exclusive)
                return 0;
        /*
         * If this group is exclusive and there are already
         * events on the CPU, it can't go on.
         */
        if (event->attr.exclusive && !list_empty(get_event_list(event)))
                return 0;
        /*
         * Otherwise, try to add it if all previous groups were able
         * to go on.
         */
        return can_add_hw;
}

/*
 * Link @event into @ctx and then attach it to its group.
 */
static void
add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx)
{
        list_add_event(event, ctx);
        perf_group_attach(event);
}

/* Forward declarations: these helpers are used before they are defined. */
static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type);
static void
ctx_sched_in(struct perf_event_context *ctx,
             struct perf_cpu_context *cpuctx,
             enum event_type_t event_type,
             struct task_struct *task);

/*
 * Schedule out @event_type groups of the task context @ctx.
 *
 * Does nothing when no task context is installed on @cpuctx, and warns
 * (once) without scheduling anything out when @ctx is not the installed one.
 */
static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
                               struct perf_event_context *ctx,
                               enum event_type_t event_type)
{
        struct perf_event_context *installed = cpuctx->task_ctx;

        if (!installed)
                return;

        if (WARN_ON_ONCE(installed != ctx))
                return;

        ctx_sched_out(ctx, cpuctx, event_type);
}

static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                struct task_struct *task)
{
        cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
        if (ctx)
                ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
        if (ctx)
                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}

/*
 * We want to maintain the following priority of scheduling:
 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
 *  - task pinned (EVENT_PINNED)
 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
 *  - task flexible (EVENT_FLEXIBLE).
 *
 * In order to avoid unscheduling and scheduling back in everything every
 * time an event is added, only do it for the groups of equal priority and
 * below.
 *
 * This can be called after a batch operation on task events, in which case
 * event_type is a bit mask of the types of events involved. For CPU events,
 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
 */
static void ctx_resched(struct perf_cpu_context *cpuctx,
                        struct perf_event_context *task_ctx,
                        enum event_type_t event_type)
{
        enum event_type_t ctx_event_type;
        bool cpu_event = !!(event_type & EVENT_CPU);

        /*
         * If pinned groups are involved, flexible groups also need to be
         * scheduled out.
         */
        if (event_type & EVENT_PINNED)
                event_type |= EVENT_FLEXIBLE;

        ctx_event_type = event_type & EVENT_ALL;

        perf_pmu_disable(cpuctx->ctx.pmu);
        if (task_ctx)
                task_ctx_sched_out(cpuctx, task_ctx, event_type);

        /*
         * Decide which cpu ctx groups to schedule out based on the types
         * of events that caused rescheduling:
         *  - EVENT_CPU: schedule out corresponding groups;
         *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
         *  - otherwise, do nothing more.
         */
        if (cpu_event)
                cpu_ctx_sched_out(cpuctx, ctx_event_type);
        else if (ctx_event_type & EVENT_PINNED)
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

        perf_event_sched_in(cpuctx, task_ctx, current);
        perf_pmu_enable(cpuctx->ctx.pmu);
}

/*
 * Reschedule all event types (EVENT_ALL | EVENT_CPU) for @pmu's context on
 * the local CPU, under the CPU context and installed task context locks.
 */
void perf_pmu_resched(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *task_ctx;

        cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
        task_ctx = cpuctx->task_ctx;

        perf_ctx_lock(cpuctx, task_ctx);
        ctx_resched(cpuctx, task_ctx, EVENT_ALL | EVENT_CPU);
        perf_ctx_unlock(cpuctx, task_ctx);
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 *
 * @info: the struct perf_event to install, passed as an opaque pointer.
 *
 * Takes cpuctx->ctx.lock and, where there is one, the relevant task context
 * lock; both are dropped through perf_ctx_unlock() before returning.
 *
 * Returns 0 on success, or -ESRCH when the event's task is currently running
 * but is not the current task on this CPU.
 */
static int  __perf_install_in_context(void *info)
{
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        bool reprogram = true;
        int ret = 0;

        raw_spin_lock(&cpuctx->ctx.lock);
        if (ctx->task) {
                raw_spin_lock(&ctx->lock);
                task_ctx = ctx;

                reprogram = (ctx->task == current);

                /*
                 * If the task is running, it must be running on this CPU,
                 * otherwise we cannot reprogram things.
                 *
                 * If it's not running, we don't care, ctx->lock will
                 * serialize against it becoming runnable.
                 */
                if (task_curr(ctx->task) && !reprogram) {
                        ret = -ESRCH;
                        goto unlock;
                }

                WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
        } else if (task_ctx) {
                /* ctx has no task: still lock the task context installed here. */
                raw_spin_lock(&task_ctx->lock);
        }

#ifdef CONFIG_CGROUP_PERF
        if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
                /*
                 * If the current cgroup doesn't match the event's
                 * cgroup, we should not try to schedule it.
                 */
                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
                reprogram = cgroup_is_descendant(cgrp->css.cgroup,
                                        event->cgrp->css.cgroup);
        }
#endif

        /*
         * When reprogramming: take EVENT_TIME out of the context, add the
         * event, then reschedule for the event's type.  Otherwise the event
         * is only linked into the context.
         */
        if (reprogram) {
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx, get_event_type(event));
        } else {
                add_event_to_ctx(event, ctx);
        }

unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

/* Forward declaration: used by perf_install_in_context() below. */
static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx);

/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 *
 * @ctx:   context to attach to; ctx->mutex must be held (lockdep-asserted).
 * @event: event to install.
 * @cpu:   CPU to run the install on for CPU contexts; also stored in
 *         event->cpu unless event->cpu is -1.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
                        struct perf_event *event,
                        int cpu)
{
        struct task_struct *task = READ_ONCE(ctx->task);

        lockdep_assert_held(&ctx->mutex);

        WARN_ON_ONCE(!exclusive_event_installable(event, ctx));

        /* An event->cpu of -1 is left untouched. */
        if (event->cpu != -1)
                event->cpu = cpu;

        /*
         * Ensures that if we can observe event->ctx, both the event and ctx
         * will be 'complete'. See perf_iterate_sb_cpu().
         */
        smp_store_release(&event->ctx, ctx);

        /*
         * perf_event_attr::disabled events will not run and can be initialized
         * without IPI. Except when this is the first event for the context, in
         * that case we need the magic of the IPI to set ctx->is_active.
         *
         * The IOC_ENABLE that is sure to follow the creation of a disabled
         * event will issue the IPI and reprogram the hardware.
         */
        if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
                raw_spin_lock_irq(&ctx->lock);
                /* Context is marked dead: do not add the event. */
                if (ctx->task == TASK_TOMBSTONE) {
                        raw_spin_unlock_irq(&ctx->lock);
                        return;
                }
                add_event_to_ctx(event, ctx);
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }

        /* No task: this is a CPU context, install via a call on @cpu. */
        if (!task) {
                cpu_function_call(cpu, __perf_install_in_context, event);
                return;
        }

        /*
         * Should not happen, we validate the ctx is still alive before calling.
         */
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
                return;

        /*
         * Installing events is tricky because we cannot rely on ctx->is_active
         * to be set in case this is the nr_events 0 -> 1 transition.
         *
         * Instead we use task_curr(), which tells us if the task is running.
         * However, since we use task_curr() outside of rq::lock, we can race
         * against the actual state. This means the result can be wrong.
         *
         * If we get a false positive, we retry, this is harmless.
         *
         * If we get a false negative, things are complicated. If we are after
         * perf_event_context_sched_in() ctx::lock will serialize us, and the
         * value must be correct. If we're before, it doesn't matter since
         * perf_event_context_sched_in() will program the counter.
         *
         * However, this hinges on the remote context switch having observed
         * our task->perf_event_ctxp[] store, such that it will in fact take
         * ctx::lock in perf_event_context_sched_in().
         *
         * We do this by task_function_call(), if the IPI fails to hit the task
         * we know any future context switch of task must see the
         * perf_event_ctxp[] store.
         */

        /*
         * This smp_mb() orders the task->perf_event_ctxp[] store with the
         * task_cpu() load, such that if the IPI then does not find the task
         * running, a future context switch of that task must observe the
         * store.
         */
        smp_mb();
again:
        /* A zero return means the cross-call ran and did the install. */
        if (!task_function_call(task, __perf_install_in_context, event))
                return;

        raw_spin_lock_irq(&ctx->lock);
        task = ctx->task;
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
                /*
                 * Cannot happen because we already checked above (which also
                 * cannot happen), and we hold ctx->mutex, which serializes us
                 * against perf_event_exit_task_context().
                 */
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        /*
         * If the task is not running, ctx->lock will avoid it becoming so,
         * thus we can safely install the event.
         */
        if (task_curr(task)) {
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
        add_event_to_ctx(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to enable a performance event
 *
 * @event:  event to enable.
 * @cpuctx: CPU context the call runs against.
 * @ctx:    context the event belongs to.
 * @info:   unused here.
 *
 * Moves the event to PERF_EVENT_STATE_INACTIVE and, when the context is
 * active and the event is eligible, reschedules so it can go on.
 */
static void __perf_event_enable(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event_context *task_ctx;

        /* Only states strictly between ERROR and INACTIVE are handled. */
        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <= PERF_EVENT_STATE_ERROR)
                return;

        /*
         * Take EVENT_TIME out of an active context before changing state;
         * the early-return paths below put it back with ctx_sched_in().
         */
        if (ctx->is_active)
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);

        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
        perf_cgroup_event_enable(event, ctx);

        /* Inactive context: the state change is all there is to do. */
        if (!ctx->is_active)
                return;

        if (!event_filter_match(event)) {
                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
                return;
        }

        /*
         * If the event is in a group and isn't the group leader,
         * then don't put it on unless the group is on.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
                return;
        }

        /* For a task context, the installed task context must be @ctx. */
        task_ctx = cpuctx->task_ctx;
        if (ctx->task)
                WARN_ON_ONCE(task_ctx != ctx);

        ctx_resched(cpuctx, task_ctx, get_event_type(event));
}

/*
 * Enable an event.
 *
 * When event->ctx is a cloned context, the caller has to guarantee that
 * every task struct event->ctx->task could possibly point to remains
 * valid.  Calling through perf_event_for_each_child or perf_event_for_each
 * satisfies this, as described for perf_event_disable.
 */
static void _perf_event_enable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);

        /* States at or above INACTIVE, or below ERROR, need no work. */
        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <  PERF_EVENT_STATE_ERROR)
                goto unlock;

        /*
         * Clear an error state up front.
         *
         * If the event is later seen in error state again, we then know it
         * went back into error state, as distinct from the task having
         * been scheduled away before the cross-call arrived.
         */
        if (event->state == PERF_EVENT_STATE_ERROR) {
                /* Detached SIBLING events cannot leave ERROR state. */
                if ((event->event_caps & PERF_EV_CAP_SIBLING) &&
                    event->group_leader == event)
                        goto unlock;

                event->state = PERF_EVENT_STATE_OFF;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_enable, NULL);
        return;

unlock:
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Locked wrapper around _perf_event_enable(): takes the event's context
 * lock for the duration of the call.  See perf_event_disable().
 */
void perf_event_enable(struct perf_event *event)
{
        struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);

        _perf_event_enable(event);
        perf_event_ctx_unlock(event, locked_ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);

/* Argument bundle handed to __perf_event_stop() through a cross-CPU call. */
struct stop_event_data {
        /* event to stop */
        struct perf_event        *event;
        /* non-zero: start the event again right after stopping it */
        unsigned int                restart;
};

/*
 * Cross-CPU callback: stop (and optionally restart) an active event.
 *
 * @info: pointer to a struct stop_event_data.
 *
 * Returns 0 when the event is not ACTIVE or after it has been stopped, and
 * -EAGAIN when the event is not on the CPU this is running on.
 */
static int __perf_event_stop(void *info)
{
        struct stop_event_data *sd = info;
        struct perf_event *event = sd->event;

        /* if it's already INACTIVE, do nothing */
        if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                return 0;

        /* matches smp_wmb() in event_sched_in() */
        smp_rmb();

        /*
         * There is a window with interrupts enabled before we get here,
         * so we need to check again lest we try to stop another CPU's event.
         */
        if (READ_ONCE(event->oncpu) != smp_processor_id())
                return -EAGAIN;

        event->pmu->stop(event, PERF_EF_UPDATE);

        /*
         * May race with the actual stop (through perf_pmu_output_stop()),
         * but it is only used for events with AUX ring buffer, and such
         * events will refuse to restart because of rb::aux_mmap_count==0,
         * see comments in perf_aux_output_begin().
         *
         * Since this is happening on an event-local CPU, no trace is lost
         * while restarting.
         */
        if (sd->restart)
                event->pmu->start(event, 0);

        return 0;
}

/*
 * Stop @event on whichever CPU it is currently on, restarting it afterwards
 * when @restart is non-zero.
 *
 * The cross-call is repeated for as long as it reports -EAGAIN.  Returns 0
 * if the event is not ACTIVE, otherwise the result of the cross-call.
 */
static int perf_event_stop(struct perf_event *event, int restart)
{
        struct stop_event_data sd = {
                .event                = event,
                .restart        = restart,
        };
        int ret = 0;

        do {
                if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                        return 0;

                /* matches smp_wmb() in event_sched_in() */
                smp_rmb();

                /*
                 * We only want to restart ACTIVE events, so if the event goes
                 * inactive here (event->oncpu==-1), there's nothing more to do;
                 * fall through with ret==-ENXIO.
                 */
                ret = cpu_function_call(READ_ONCE(event->oncpu),
                                        __perf_event_stop, &sd);
        } while (ret == -EAGAIN);

        return ret;
}

/*
 * In order to contain the amount of racy and tricky code in the address
 * filter configuration management, it is a two-part process:
 *
 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
 *      we update the addresses of corresponding vmas in
 *        event::addr_filter_ranges array and bump the event::addr_filters_gen;
 * (p2) when an event is scheduled in (pmu::add), it calls
 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
 *      if the generation has changed since the previous call.
 *
 * If (p1) happens while the event is active, we restart it to force (p2).
 *
 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
 *     ioctl;
 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
 *     registered mapping, called for every new mmap(), with mm::mmap_lock down
 *     for reading;
 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
 *     of exec.
 */
void perf_event_addr_filters_sync(struct perf_event *event)
{
        struct perf_addr_filters_head *head = perf_event_addr_filters(event);

        /* Events without address filters have nothing to sync. */
        if (!has_addr_filter(event))
                return;

        raw_spin_lock(&head->lock);

        /* Same generation as the last sync: the PMU is already up to date. */
        if (event->hw.addr_filters_gen == event->addr_filters_gen)
                goto unlock;

        event->pmu->addr_filters_sync(event);
        event->hw.addr_filters_gen = event->addr_filters_gen;

unlock:
        raw_spin_unlock(&head->lock);
}
EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);

/*
 * Add @refresh to the event's limit and enable it.
 *
 * Returns 0 on success, or -EINVAL for inherited events and for events
 * that are not sampling events.
 */
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
        /* not supported on inherited events */
        if (event->attr.inherit)
                return -EINVAL;

        if (!is_sampling_event(event))
                return -EINVAL;

        atomic_add(refresh, &event->event_limit);
        _perf_event_enable(event);

        return 0;
}

/*
 * Locked wrapper around _perf_event_refresh(); returns its result.
 * See perf_event_disable().
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
        struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);
        int err = _perf_event_refresh(event, refresh);

        perf_event_ctx_unlock(event, locked_ctx);

        return err;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);

/*
 * Apply @attr to the breakpoint event @bp.
 *
 * The event is disabled around the modification and enabled again
 * afterwards unless its attr.disabled is set.  Returns the result of
 * modify_user_hw_breakpoint_check().
 */
static int perf_event_modify_breakpoint(struct perf_event *bp,
                                         struct perf_event_attr *attr)
{
        int ret;

        _perf_event_disable(bp);
        ret = modify_user_hw_breakpoint_check(bp, attr, true);

        if (bp->attr.disabled)
                return ret;

        _perf_event_enable(bp);
        return ret;
}

/*
 * Modify the attributes of an existing event.
 *
 * Returns -EINVAL if @attr is of a different type than the event,
 * -EOPNOTSUPP for any type other than PERF_TYPE_BREAKPOINT, and otherwise
 * the result of perf_event_modify_breakpoint().
 */
static int perf_event_modify_attr(struct perf_event *event,
                                  struct perf_event_attr *attr)
{
        if (event->attr.type != attr->type)
                return -EINVAL;

        if (event->attr.type == PERF_TYPE_BREAKPOINT)
                return perf_event_modify_breakpoint(event, attr);

        /* Place holder for future additions. */
        return -EOPNOTSUPP;
}

/*
 * Schedule out the groups of @ctx selected by @event_type and clear the
 * corresponding bits in ctx->is_active.
 *
 * Must be called with ctx->lock held (lockdep-asserted).  Only the bits
 * that actually changed in ctx->is_active cause pinned and/or flexible
 * groups to be scheduled out; if EVENT_TIME was set on entry the context
 * time is updated first.
 */
static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type)
{
        struct perf_event *event, *tmp;
        /* Snapshot of the active bits on entry; turned into a diff below. */
        int is_active = ctx->is_active;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events)) {
                /*
                 * See __perf_remove_from_context().
                 */
                WARN_ON_ONCE(ctx->is_active);
                if (ctx->task)
                        WARN_ON_ONCE(cpuctx->task_ctx);
                return;
        }

        /*
         * Always update time if it was set; not only when it changes.
         * Otherwise we can 'forget' to update time for any but the last
         * context we sched out. For example:
         *
         *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
         *   ctx_sched_out(.event_type = EVENT_PINNED)
         *
         * would only update time for the pinned events.
         */
        if (is_active & EVENT_TIME) {
                /* update (and stop) ctx time */
                update_context_time(ctx);
                update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
                /*
                 * CPU-release for the below ->is_active store,
                 * see __load_acquire() in perf_event_time_now()
                 */
                barrier();
        }

        /* With no EVENT_ALL bits left, drop every remaining bit as well. */
        ctx->is_active &= ~event_type;
        if (!(ctx->is_active & EVENT_ALL))
                ctx->is_active = 0;

        /* A task context that became fully inactive is uninstalled. */
        if (ctx->task) {
                WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                if (!ctx->is_active)
                        cpuctx->task_ctx = NULL;
        }

        is_active ^= ctx->is_active; /* changed bits */

        /* Nothing on the PMU, or no group type was switched off: done. */
        if (!ctx->nr_active || !(is_active & EVENT_ALL))
                return;

        perf_pmu_disable(ctx->pmu);
        if (is_active & EVENT_PINNED) {
                list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
                        group_sched_out(event, cpuctx, ctx);
        }

        if (is_active & EVENT_FLEXIBLE) {
                list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
                        group_sched_out(event, cpuctx, ctx);

                /*
                 * Since we cleared EVENT_FLEXIBLE, also clear
                 * rotate_necessary, it will be reset by
                 * ctx_flexible_sched_in() when needed.
                 */
                ctx->rotate_necessary = 0;
        }
        perf_pmu_enable(ctx->pmu);
}

/*
 * Test whether two contexts are equivalent, i.e. whether both were cloned
 * from the same version of the same context.
 *
 * Equivalence is judged by a generation number in the context that is
 * incremented on each modification to it; see unclone_ctx(),
 * list_add_event() and list_del_event().
 *
 * Both context locks must be held.  Returns 1 if equivalent, 0 otherwise.
 */
static int context_equiv(struct perf_event_context *ctx1,
                         struct perf_event_context *ctx2)
{
        lockdep_assert_held(&ctx1->lock);
        lockdep_assert_held(&ctx2->lock);

        /* Pinning disables the swap optimization */
        if (ctx1->pin_count || ctx2->pin_count)
                return 0;

        /*
         * Equivalent when one of the following holds, checked in order:
         *  - ctx1 is the parent of ctx2;
         *  - ctx2 is the parent of ctx1;
         *  - both have the same parent at the same generation (the parent
         *    hierarchy is flattened, see perf_event_init_context()).
         */
        return (ctx1 == ctx2->parent_ctx &&
                ctx1->generation == ctx2->parent_gen) ||
               (ctx1->parent_ctx == ctx2 &&
                ctx1->parent_gen == ctx2->generation) ||
               (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
                ctx1->parent_gen == ctx2->parent_gen);
}

/*
 * Exchange the count and the enabled/running times between @event and
 * @next_event.  Does nothing unless @event has attr.inherit_stat set.
 */
static void __perf_event_sync_stat(struct perf_event *event,
                                     struct perf_event *next_event)
{
        u64 incoming, outgoing;

        if (!event->attr.inherit_stat)
                return;

        /*
         * Bring the event value up to date.  perf_event_read() cannot be
         * used: we are in the middle of a context switch with IRQs
         * disabled, which upsets smp_call_function_single().  We do know
         * the event must be on the current CPU, so it is not needed.
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE)
                event->pmu->read(event);

        perf_event_update_time(event);

        /*
         * To keep per-task stats reliable the event values are flipped
         * whenever the contexts are flipped.
         */
        incoming = local64_read(&next_event->count);
        outgoing = local64_xchg(&event->count, incoming);
        local64_set(&next_event->count, outgoing);

        swap(event->total_time_enabled, next_event->total_time_enabled);
        swap(event->total_time_running, next_event->total_time_running);

        /* The values were swizzled, so refresh the user visible data too. */
        perf_event_update_userpage(event);
        perf_event_update_userpage(next_event);
}

/*
 * Walk the event lists of @ctx and @next_ctx in lockstep and sync the stats
 * of each pair, stopping when either list runs out.  Does nothing when
 * @ctx has no stat events (ctx->nr_stat == 0).
 */
static void perf_event_sync_stat(struct perf_event_context *ctx,
                                   struct perf_event_context *next_ctx)
{
        struct perf_event *event, *next_event;

        if (!ctx->nr_stat)
                return;

        update_context_time(ctx);

        for (event = list_first_entry(&ctx->event_list,
                                      struct perf_event, event_entry),
             next_event = list_first_entry(&next_ctx->event_list,
                                           struct perf_event, event_entry);
             &event->event_entry != &ctx->event_list &&
             &next_event->event_entry != &next_ctx->event_list;
             event = list_next_entry(event, event_entry),
             next_event = list_next_entry(next_event, event_entry))
                __perf_event_sync_stat(event, next_event);
}

/*
 * Switch out @task's perf context number @ctxn when switching from @task to
 * @next.
 *
 * Returns early when @task has no such context or no task context is
 * installed on this CPU.  If the contexts of @task and @next are equivalent
 * clones (see context_equiv()), they are swapped between the two tasks
 * instead of being scheduled out; otherwise the context is scheduled out
 * with EVENT_ALL.
 */
static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                                         struct task_struct *next)
{
        struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent, *next_parent;
        struct perf_cpu_context *cpuctx;
        /* Cleared only when the clone swap below succeeds. */
        int do_switch = 1;
        struct pmu *pmu;

        if (likely(!ctx))
                return;

        pmu = ctx->pmu;
        cpuctx = __get_cpu_context(ctx);
        if (!cpuctx->task_ctx)
                return;

        rcu_read_lock();
        next_ctx = next->perf_event_ctxp[ctxn];
        if (!next_ctx)
                goto unlock;

        parent = rcu_dereference(ctx->parent_ctx);
        next_parent = rcu_dereference(next_ctx->parent_ctx);

        /* If neither context has a parent context; they cannot be clones. */
        if (!parent && !next_parent)
                goto unlock;

        if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
                /*
                 * Looks like the two contexts are clones, so we might be
                 * able to optimize the context switch.  We lock both
                 * contexts and check that they are clones under the
                 * lock (including re-checking that neither has been
                 * uncloned in the meantime).  It doesn't matter which
                 * order we take the locks because no other cpu could
                 * be trying to lock both of these tasks.
                 */
                raw_spin_lock(&ctx->lock);
                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {

                        /* Exchange the owning tasks of the two contexts. */
                        WRITE_ONCE(ctx->task, next);
                        WRITE_ONCE(next_ctx->task, task);

                        perf_pmu_disable(pmu);

                        if (cpuctx->sched_cb_usage && pmu->sched_task)
                                pmu->sched_task(ctx, false);

                        /*
                         * PMU specific parts of task perf context can require
                         * additional synchronization. As an example of such
                         * synchronization see implementation details of Intel
                         * LBR call stack data profiling;
                         */
                        if (pmu->swap_task_ctx)
                                pmu->swap_task_ctx(ctx, next_ctx);
                        else
                                swap(ctx->task_ctx_data, next_ctx->task_ctx_data);

                        perf_pmu_enable(pmu);

                        /*
                         * RCU_INIT_POINTER here is safe because we've not
                         * modified the ctx and the above modification of
                         * ctx->task and ctx->task_ctx_data are immaterial
                         * since those values are always verified under
                         * ctx->lock which we're now holding.
                         */
                        RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
                        RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);

                        do_switch = 0;

                        perf_event_sync_stat(ctx, next_ctx);
                }
                raw_spin_unlock(&next_ctx->lock);
                raw_spin_unlock(&ctx->lock);
        }
unlock:
        rcu_read_unlock();

        /* No swap happened: schedule the context out the regular way. */
        if (do_switch) {
                raw_spin_lock(&ctx->lock);
                perf_pmu_disable(pmu);

                if (cpuctx->sched_cb_usage && pmu->sched_task)
                        pmu->sched_task(ctx, false);
                task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);

                perf_pmu_enable(pmu);
                raw_spin_unlock(&ctx->lock);
        }
}

/*
 * Per-CPU list of CPU contexts with a non-zero sched_cb_usage; entries are
 * added by perf_sched_cb_inc() and removed by perf_sched_cb_dec().
 */
static DEFINE_PER_CPU(struct list_head, sched_cb_list);

/*
 * Drop one sched_task callback user of @pmu on this CPU.  When the last
 * user goes away the CPU context is removed from the callback list.
 */
void perf_sched_cb_dec(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx;

        cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

        this_cpu_dec(perf_sched_cb_usages);

        cpuctx->sched_cb_usage--;
        if (cpuctx->sched_cb_usage == 0)
                list_del(&cpuctx->sched_cb_entry);
}


/*
 * Register one more sched_task callback user of @pmu on this CPU.  The
 * first user puts the CPU context on this CPU's callback list.
 */
void perf_sched_cb_inc(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx;
        bool first_user;

        cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

        first_user = (cpuctx->sched_cb_usage == 0);
        cpuctx->sched_cb_usage++;
        if (first_user)
                list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));

        this_cpu_inc(perf_sched_cb_usages);
}

/*
 * Deliver the context switch callback to the lower code layer.  It is
 * invoked ONLY when the context switch callback is enabled.
 *
 * The callback matters even for per-cpu events; multi event PEBS, for
 * example, needs it to provide PID/TID information, which requires all
 * queued PEBS records to be flushed before switching to a new task.
 */
static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
{
        /* software PMUs will not have sched_task */
        struct pmu *cpu_pmu = cpuctx->ctx.pmu;

        if (WARN_ON_ONCE(!cpu_pmu->sched_task))
                return;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(cpu_pmu);

        cpu_pmu->sched_task(cpuctx->task_ctx, sched_in);

        perf_pmu_enable(cpu_pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

/*
 * Run the sched_task callback for every CPU context on this CPU's callback
 * list that has no task context installed.  A switch from a task to itself
 * is ignored.
 */
static void perf_pmu_sched_task(struct task_struct *prev,
                                struct task_struct *next,
                                bool sched_in)
{
        struct perf_cpu_context *cpuctx;

        if (prev == next)
                return;

        list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
                /*
                 * Contexts with a task ctx will be handled in
                 * perf_event_context_sched_in/out.
                 */
                if (!cpuctx->task_ctx)
                        __perf_pmu_sched_task(cpuctx, sched_in);
        }
}

/* Forward declaration: used by __perf_event_task_sched_out() below. */
static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in);

/* Iterate @ctxn over the task context indices 0 .. perf_nr_task_contexts - 1. */
#define for_each_task_context_nr(ctxn)                                        \
        for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)

/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
 *
 * @task: the task being switched out.
 * @next: the task being switched in.
 */
void __perf_event_task_sched_out(struct task_struct *task,
                                 struct task_struct *next)
{
        int ctxn;

        /* PMU sched_task callbacks, only if any are registered on this CPU. */
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(task, next, false);

        /* Context-switch records, only if any switch events exist. */
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, next, false);

        /* Switch out (or swap) each of the task's perf contexts. */
        for_each_task_context_nr(ctxn)
                perf_event_context_sched_out(task, ctxn, next);

        /*
         * if cgroup events exist on this CPU, then we need
         * to check if we have to switch out PMU state.
         * cgroup events are system-wide mode only
         */
        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
                perf_cgroup_sched_out(task, next);
}

/*
 * Schedule out the events selected by @event_type from the CPU context's
 * own perf_event_context.
 *
 * Called with IRQs disabled
 */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
                              enum event_type_t event_type)
{
        struct perf_event_context *ctx = &cpuctx->ctx;

        ctx_sched_out(ctx, cpuctx, event_type);
}

/*
 * min_heap "less" callback. @l and @r each point at a heap slot that holds
 * a struct perf_event pointer; returns true when the left event's
 * group_index is smaller than the right one's.
 */
static bool perf_less_group_idx(const void *l, const void *r)
{
        const struct perf_event *const *lhs = l;
        const struct perf_event *const *rhs = r;

        return (*lhs)->group_index < (*rhs)->group_index;
}

/* min_heap "swp" callback: exchange the pointers stored in slots @l and @r. */
static void swap_ptr(void *l, void *r)
{
        void **lhs = l;
        void **rhs = r;
        void *tmp = *lhs;

        *lhs = *rhs;
        *rhs = tmp;
}

/*
 * Heap callbacks used by visit_groups_merge(): each heap element is a
 * pointer to a struct perf_event, ordered by group_index.
 */
static const struct min_heap_callbacks perf_min_heap = {
        .elem_size = sizeof(struct perf_event *),
        .less = perf_less_group_idx,
        .swp = swap_ptr,
};

/*
 * Store @event in the next free slot of @heap's backing array and bump the
 * element count. A NULL @event is ignored. No capacity check and no heap
 * re-ordering is done here.
 */
static void __heap_add(struct min_heap *heap, struct perf_event *event)
{
        struct perf_event **slots = heap->data;

        if (!event)
                return;

        slots[heap->nr++] = event;
}

/*
 * Call @func(event, @data) on the events of @groups, merging several
 * per-key iterators through a min-heap ordered by group_index.
 *
 * With a @cpuctx the heap storage is cpuctx->heap (cpuctx->ctx.lock must be
 * held) and, under CONFIG_CGROUP_PERF, one iterator is added for
 * cpuctx->cgrp and each of its ancestors. Without a @cpuctx the on-stack
 * two-slot array is used and an iterator for cpu == -1 is added. An
 * iterator for @cpu without a cgroup is added in both cases.
 *
 * Returns the first non-zero value returned by @func, or 0 once all
 * iterators are exhausted.
 */
static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
                                struct perf_event_groups *groups, int cpu,
                                int (*func)(struct perf_event *, void *),
                                void *data)
{
#ifdef CONFIG_CGROUP_PERF
        struct cgroup_subsys_state *css = NULL;
#endif
        /* Space for per CPU and/or any CPU event iterators. */
        struct perf_event *itrs[2];
        struct min_heap event_heap;
        struct perf_event **evt;
        int ret;

        if (cpuctx) {
                /* Use the heap storage that belongs to the CPU context. */
                event_heap = (struct min_heap){
                        .data = cpuctx->heap,
                        .nr = 0,
                        .size = cpuctx->heap_size,
                };

                lockdep_assert_held(&cpuctx->ctx.lock);

#ifdef CONFIG_CGROUP_PERF
                /* Start the cgroup walk below at the CPU context's cgroup. */
                if (cpuctx->cgrp)
                        css = &cpuctx->cgrp->css;
#endif
        } else {
                /* No CPU context: fall back to the small on-stack array. */
                event_heap = (struct min_heap){
                        .data = itrs,
                        .nr = 0,
                        .size = ARRAY_SIZE(itrs),
                };
                /* Events not within a CPU context may be on any CPU. */
                __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
        }
        /* evt refers to slot 0 of the heap storage (the heap's top element). */
        evt = event_heap.data;

        __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));

#ifdef CONFIG_CGROUP_PERF
        /* One iterator per cgroup, walking up through the parents. */
        for (; css; css = css->parent)
                __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
#endif

        min_heapify_all(&event_heap, &perf_min_heap);

        while (event_heap.nr) {
                ret = func(*evt, data);
                if (ret)
                        return ret;

                /*
                 * Advance the iterator in slot 0: re-heapify if it still has
                 * an event, otherwise drop it from the heap.
                 */
                *evt = perf_event_groups_next(*evt);
                if (*evt)
                        min_heapify(&event_heap, 0, &perf_min_heap);
                else
                        min_heap_pop(&event_heap, &perf_min_heap);
        }

        return 0;
}

/*
 * Because the userpage is strictly per-event (there is no concept of context,
 * so there cannot be a context indirection), every userpage must be updated
 * when context time starts :-(
 *
 * IOW, we must not miss EVENT_TIME edges.
 *
 * Returns false without touching anything when @event's mmap_count is zero;
 * otherwise updates the event time and the userpage and returns true.
 */
static inline bool event_update_userpage(struct perf_event *event)
{
        bool mapped = atomic_read(&event->mmap_count) != 0;

        if (likely(!mapped))
                return false;

        perf_event_update_time(event);
        perf_event_update_userpage(event);
        return true;
}

/*
 * Update the userpage of @group_event and, only when that update actually
 * happened, the userpage of each of its siblings as well.
 */
static inline void group_update_userpage(struct perf_event *group_event)
{
        struct perf_event *sibling;

        if (event_update_userpage(group_event)) {
                for_each_sibling_event(sibling, group_event)
                        event_update_userpage(sibling);
        }
}

/*
 * visit_groups_merge() callback: try to schedule in the group led by @event.
 * @data points at the caller's can_add_hw flag, which is cleared once a
 * group is found to be still inactive. Always returns 0.
 */
static int merge_sched_in(struct perf_event *event, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int *can_add_hw = data;

        /* Skip events at or below the OFF state and events filtered out. */
        if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
                return 0;

        /* On a successful group_sched_in(), track the event as active. */
        if (group_can_go_on(event, cpuctx, *can_add_hw) &&
            !group_sched_in(event, cpuctx, ctx))
                list_add_tail(&event->active_list, get_event_list(event));

        if (event->state != PERF_EVENT_STATE_INACTIVE)
                return 0;

        /* Still inactive: no more hardware groups for the rest of the walk. */
        *can_add_hw = 0;

        if (event->attr.pinned) {
                /* A pinned event that is still inactive goes to ERROR state. */
                perf_cgroup_event_disable(event, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
                return 0;
        }

        /* Otherwise flag the context for rotation and kick the mux timer. */
        ctx->rotate_necessary = 1;
        perf_mux_hrtimer_restart(cpuctx);
        group_update_userpage(event);

        return 0;
}

/*
 * Walk @ctx's pinned groups for the current CPU and try to schedule each one
 * in via merge_sched_in(). The CPU context is only handed to
 * visit_groups_merge() when @ctx is that CPU context's own ctx; otherwise
 * NULL is passed.
 */
static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
                    struct perf_cpu_context *cpuctx)
{
        struct perf_cpu_context *own_cpuctx;
        int can_add_hw = 1;

        own_cpuctx = (ctx == &cpuctx->ctx) ? cpuctx : NULL;

        visit_groups_merge(own_cpuctx, &ctx->pinned_groups,
                           smp_processor_id(),
                           merge_sched_in, &can_add_hw);
}

/*
 * Walk @ctx's flexible groups for the current CPU and try to schedule each
 * one in via merge_sched_in(). The CPU context is only handed to
 * visit_groups_merge() when @ctx is that CPU context's own ctx; otherwise
 * NULL is passed.
 */
static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
                      struct perf_cpu_context *cpuctx)
{
        struct perf_cpu_context *own_cpuctx;
        int can_add_hw = 1;

        own_cpuctx = (ctx == &cpuctx->ctx) ? cpuctx : NULL;

        visit_groups_merge(own_cpuctx, &ctx->flexible_groups,
                           smp_processor_id(),
                           merge_sched_in, &can_add_hw);
}

/*
 * Schedule in the events of @ctx selected by @event_type.
 *
 * Does nothing when @ctx has no events. Otherwise context time is started
 * if EVENT_TIME was not active yet, and @event_type plus EVENT_TIME are
 * or-ed into ctx->is_active. For a task context, cpuctx->task_ctx is set to
 * @ctx when nothing was active before, or checked with a warning otherwise.
 * Only the PINNED/FLEXIBLE bits newly set by this call are scheduled in,
 * pinned groups first.
 *
 * Must be called with ctx->lock held.
 */
static void
ctx_sched_in(struct perf_event_context *ctx,
             struct perf_cpu_context *cpuctx,
             enum event_type_t event_type,
             struct task_struct *task)
{
        /* Snapshot of the active bits before this call changes them. */
        int is_active = ctx->is_active;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events))
                return;

        if (!(is_active & EVENT_TIME)) {
                /* start ctx time */
                __update_context_time(ctx, false);
                perf_cgroup_set_timestamp(task, ctx);
                /*
                 * CPU-release for the below ->is_active store,
                 * see __load_acquire() in perf_event_time_now()
                 */
                barrier();
        }

        ctx->is_active |= (event_type | EVENT_TIME);
        if (ctx->task) {
                if (!is_active)
                        cpuctx->task_ctx = ctx;
                else
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
        }

        is_active ^= ctx->is_active; /* changed bits */

        /*
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
         */
        if (is_active & EVENT_PINNED)
                ctx_pinned_sched_in(ctx, cpuctx);

        /* Then walk through the lower prio flexible groups */
        if (is_active & EVENT_FLEXIBLE)
                ctx_flexible_sched_in(ctx, cpuctx);
}

/*
 * Schedule in the events selected by @event_type from the CPU context's own
 * perf_event_context; @task is passed through to ctx_sched_in().
 */
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
                             struct task_struct *task)
{
        ctx_sched_in(&cpuctx->ctx, cpuctx, event_type, task);
}

/*
 * Schedule the task context @ctx of @task in on this CPU.
 *
 * If @ctx is already installed as cpuctx->task_ctx, only the PMU sched_task
 * callback is run (when cpuctx->sched_cb_usage is set). Otherwise the
 * context is locked and, provided it has events, scheduled in with the PMU
 * disabled.
 */
static void perf_event_context_sched_in(struct perf_event_context *ctx,
                                        struct task_struct *task)
{
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu = ctx->pmu;

        cpuctx = __get_cpu_context(ctx);
        if (cpuctx->task_ctx == ctx) {
                /* Context already in place: just notify the PMU if needed. */
                if (cpuctx->sched_cb_usage)
                        __perf_pmu_sched_task(cpuctx, true);
                return;
        }

        perf_ctx_lock(cpuctx, ctx);
        /*
         * We must check ctx->nr_events while holding ctx->lock, such
         * that we serialize against perf_install_in_context().
         */
        if (!ctx->nr_events)
                goto unlock;

        perf_pmu_disable(pmu);
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
         * cpu flexible, task flexible.
         *
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
        if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);

        /* Run the sched_task callback directly; lock and PMU are already held/disabled. */
        if (cpuctx->sched_cb_usage && pmu->sched_task)
                pmu->sched_task(cpuctx->task_ctx, true);

        perf_pmu_enable(pmu);

unlock:
        perf_ctx_unlock(cpuctx, ctx);
}

/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
 *
 * @prev: the task that was switched out
 * @task: the task being switched in
 */
void __perf_event_task_sched_in(struct task_struct *prev,
                                struct task_struct *task)
{
        int ctxn;

        /*
         * If cgroup events exist on this CPU, then we need to check if we have
         * to switch in PMU state; cgroup events are system-wide mode only.
         *
         * Since cgroup events are CPU events, we must schedule these in before
         * we schedule in the task events.
         */
        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
                perf_cgroup_sched_in(prev, task);

        /* Schedule in every task context that @task actually has. */
        for_each_task_context_nr(ctxn) {
                struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];

                if (likely(!ctx))
                        continue;

                perf_event_context_sched_in(ctx, task);
        }

        /* Only call perf_event_switch() when switch events exist. */
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, prev, true);

        /* Finally run PMU sched_task callbacks registered on this CPU. */
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(prev, task, true);
}

/*
 * Compute the sample period that matches @event's attr.sample_freq, given
 * that @count events were observed over @nsec nanoseconds.
 *
 * The operands are shifted down as needed so that both the dividend
 * (@count * NSEC_PER_SEC) and the divisor (@nsec * sample_freq) fit in a
 * u64. Returns dividend / divisor, or the dividend itself when the divisor
 * is zero.
 */
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
        u64 frequency = event->attr.sample_freq;
        u64 sec = NSEC_PER_SEC;
        u64 divisor, dividend;

        int count_fls, nsec_fls, frequency_fls, sec_fls;

        /* Bit widths of the operands, used to predict product overflow. */
        count_fls = fls64(count);
        nsec_fls = fls64(nsec);
        frequency_fls = fls64(frequency);
        sec_fls = 30; /* 10^9 needs 30 bits (2^29 < 10^9 < 2^30) */

        /*
         * We got @count in @nsec, with a target of sample_freq HZ
         * the target period becomes:
         *
         *             @count * 10^9
         * period = -------------------
         *          @nsec * sample_freq
         *
         */

        /*
         * Reduce accuracy by one bit such that @a and @b converge
         * to a similar magnitude.
         */
#define REDUCE_FLS(a, b)                \
do {                                        \
        if (a##_fls > b##_fls) {        \
                a >>= 1;                \
                a##_fls--;                \
        } else {                        \
                b >>= 1;                \
                b##_fls--;                \
        }                                \
} while (0)

        /*
         * Reduce accuracy until either term fits in a u64, then proceed with
         * the other, so that finally we can do a u64/u64 division.
         */
        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
                REDUCE_FLS(nsec, frequency);
                REDUCE_FLS(sec, count);
        }

        if (count_fls + sec_fls > 64) {
                /* The divisor fits; keep shrinking the dividend's terms. */
                divisor = nsec * frequency;

                while (count_fls + sec_fls > 64) {
                        REDUCE_FLS(count, sec);
                        /* Halve the other side too, to keep the ratio. */
                        divisor >>= 1;
                }

                dividend = count * sec;
        } else {
                /* The dividend fits; keep shrinking the divisor's terms. */
                dividend = count * sec;

                while (nsec_fls + frequency_fls > 64) {
                        REDUCE_FLS(nsec, frequency);
                        /* Halve the other side too, to keep the ratio. */
                        dividend >>= 1;
                }

                divisor = nsec * frequency;
        }

        /* Avoid a division by zero. */
        if (!divisor)
                return dividend;

        return div64_u64(dividend, divisor);
}

/*
 * Per-CPU throttling state used by perf_event_task_tick(): on every tick
 * perf_throttled_count is read and reset to 0 (the old value is passed on as
 * the "needs unthrottle" hint) and perf_throttled_seq is incremented.
 */
static DEFINE_PER_CPU(int, perf_throttled_count);
static DEFINE_PER_CPU(u64, perf_throttled_seq);

/*
 * Re-target @event's sample period from the observed rate (@count events in
 * @nsec ns). The stored period moves by roughly one eighth of the distance
 * to the newly computed period and is never allowed to become zero.
 *
 * If the remaining period_left exceeds eight times the new period it is
 * reset to 0; when @disable is true the event is stopped before and
 * restarted (with reload) after that reset.
 */
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 target, new_period;
        s64 step;

        target = perf_calculate_period(event, nsec, count);

        step = (s64)(target - hwc->sample_period);
        step = (step + 7) / 8; /* low pass filter */

        new_period = hwc->sample_period + step;
        if (!new_period)
                new_period = 1;

        hwc->sample_period = new_period;

        /* Nothing more to do unless the pending period is far too long. */
        if (local64_read(&hwc->period_left) <= 8*new_period)
                return;

        if (disable)
                event->pmu->stop(event, PERF_EF_UPDATE);

        local64_set(&hwc->period_left, 0);

        if (disable)
                event->pmu->start(event, PERF_EF_RELOAD);
}

/*
 * combine freq adjustment with unthrottling to avoid two passes over the
 * events. At the same time, make sure, having freq events does not change
 * the rate of unthrottling as that would introduce bias.
 *
 * @ctx:         context whose events are walked
 * @needs_unthr: non-zero when the walk is needed even if @ctx has no
 *               frequency-mode events
 *
 * Takes ctx->lock and disables ctx->pmu for the duration of the walk.
 */
static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
                                           int needs_unthr)
{
        struct perf_event *event;
        struct hw_perf_event *hwc;
        u64 now, period = TICK_NSEC;
        s64 delta;

        /*
         * only need to iterate over all events iff:
         * - context have events in frequency mode (needs freq adjust)
         * - there are events to unthrottle on this cpu
         */
        if (!(ctx->nr_freq || needs_unthr))
                return;

        raw_spin_lock(&ctx->lock);
        perf_pmu_disable(ctx->pmu);

        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                /* Only active events that match the filter are considered. */
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;

                if (!event_filter_match(event))
                        continue;

                perf_pmu_disable(event->pmu);

                hwc = &event->hw;

                /* Unthrottle: reset the interrupt count, log it, restart. */
                if (hwc->interrupts == MAX_INTERRUPTS) {
                        hwc->interrupts = 0;
                        perf_log_throttle(event, 1);
                        event->pmu->start(event, 0);
                }

                /* Events not in frequency mode need no period adjustment. */
                if (!event->attr.freq || !event->attr.sample_freq)
                        goto next;

                /*
                 * stop the event and update event->count
                 */
                event->pmu->stop(event, PERF_EF_UPDATE);

                /* Events counted since the previous adjustment. */
                now = local64_read(&event->count);
                delta = now - hwc->freq_count_stamp;
                hwc->freq_count_stamp = now;

                /*
                 * restart the event
                 * reload only if value has changed
                 * we have stopped the event so tell that
                 * to perf_adjust_period() to avoid stopping it
                 * twice.
                 */
                if (delta > 0)
                        perf_adjust_period(event, period, delta, false);

                event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
        next:
                perf_pmu_enable(event->pmu);
        }

        perf_pmu_enable(ctx->pmu);
        raw_spin_unlock(&ctx->lock);
}

/*
 * Move @event to the tail of the @ctx's eligible events.
 */
static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{
        /*
         * Rotate the first entry last of non-pinned groups. Rotation might be
         * disabled by the inheritance code.
         */
        if (!ctx->rotate_disable) {
                /* Re-insert @event into the flexible groups. */
                perf_event_groups_delete(&ctx->flexible_groups, event);
                perf_event_groups_insert(&ctx->flexible_groups, event);
        }
}

/*
 * Pick an event from the flexible_groups to rotate and clear
 * ctx->rotate_necessary. Returns NULL when there is no candidate.
 */
static inline struct perf_event *
ctx_event_to_rotate(struct perf_event_context *ctx)
{
        /* Prefer the first active flexible event. */
        struct perf_event *event =
                list_first_entry_or_null(&ctx->flexible_active,
                                         struct perf_event, active_list);

        /* With no active flexible event, take the first one in the tree. */
        if (!event)
                event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
                                      struct perf_event, group_node);

        /*
         * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
         * finds there are unschedulable events, it will set it again.
         */
        ctx->rotate_necessary = 0;

        return event;
}

/*
 * Rotate the flexible events of @cpuctx's CPU context and/or its current
 * task context, for whichever of the two has rotate_necessary set.
 *
 * Returns false when neither context needed rotation (nothing is locked or
 * touched in that case), true otherwise.
 */
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
        struct perf_event *cpu_event = NULL, *task_event = NULL;
        struct perf_event_context *task_ctx = NULL;
        int cpu_rotate, task_rotate;

        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */

        cpu_rotate = cpuctx->ctx.rotate_necessary;
        task_ctx = cpuctx->task_ctx;
        task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;

        if (!(cpu_rotate || task_rotate))
                return false;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(cpuctx->ctx.pmu);

        /* Pick the events to rotate; this also clears rotate_necessary. */
        if (task_rotate)
                task_event = ctx_event_to_rotate(task_ctx);
        if (cpu_rotate)
                cpu_event = ctx_event_to_rotate(&cpuctx->ctx);

        /*
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         */
        if (task_event || (task_ctx && cpu_event))
                ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
        if (cpu_event)
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

        /* Move the picked events to the tail of their flexible groups. */
        if (task_event)
                rotate_ctx(task_ctx, task_event);
        if (cpu_event)
                rotate_ctx(&cpuctx->ctx, cpu_event);

        /* Schedule everything back in with the new ordering. */
        perf_event_sched_in(cpuctx, task_ctx, current);

        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);

        return true;
}

/*
 * Per-tick perf work for the current CPU; IRQs must be disabled.
 *
 * Bumps the per-CPU throttle sequence, fetches and resets the per-CPU
 * throttle count, clears the perf-events tick dependency for this CPU and
 * then runs frequency adjustment / unthrottling on every context on this
 * CPU's active_ctx_list.
 */
void perf_event_task_tick(void)
{
        struct list_head *ctx_list = this_cpu_ptr(&active_ctx_list);
        struct perf_event_context *ctx, *next;
        int needs_unthr;

        lockdep_assert_irqs_disabled();

        __this_cpu_inc(perf_throttled_seq);
        needs_unthr = __this_cpu_xchg(perf_throttled_count, 0);
        tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);

        list_for_each_entry_safe(ctx, next, ctx_list, active_ctx_list)
                perf_adjust_freq_unthr_context(ctx, needs_unthr);
}

/*
 * Handle the enable-on-exec attribute of a single @event.
 *
 * Returns 1 when the event was moved to PERF_EVENT_STATE_INACTIVE, 0 when
 * the attribute was not set or the event was already at or above that state.
 * The attribute is cleared in either of the latter two "set" cases.
 */
static int event_enable_on_exec(struct perf_event *event,
                                struct perf_event_context *ctx)
{
        if (!event->attr.enable_on_exec)
                return 0;

        /* One-shot: consume the flag whether or not the state changes. */
        event->attr.enable_on_exec = 0;

        if (event->state < PERF_EVENT_STATE_INACTIVE) {
                perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                return 1;
        }

        return 0;
}

/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 *
 * @ctxn: index of the task context (current->perf_event_ctxp[@ctxn]) to
 *        process. Nothing is done when that context is absent or empty.
 */
static void perf_event_enable_on_exec(int ctxn)
{
        struct perf_event_context *ctx, *clone_ctx = NULL;
        enum event_type_t event_type = 0;
        struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;

        local_irq_save(flags);
        ctx = current->perf_event_ctxp[ctxn];
        if (!ctx || !ctx->nr_events)
                goto out;

        cpuctx = __get_cpu_context(ctx);
        perf_ctx_lock(cpuctx, ctx);
        ctx_sched_out(ctx, cpuctx, EVENT_TIME);
        /*
         * Flip every enable-on-exec event and collect the event types of
         * all events in the context for the reschedule below.
         */
        list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
                event_type |= get_event_type(event);
        }

        /*
         * Unclone and reschedule this context if we enabled any event.
         */
        if (enabled) {
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
        } else {
                /* Nothing changed: undo the EVENT_TIME sched-out above. */
                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
        }
        perf_ctx_unlock(cpuctx, ctx);

out:
        local_irq_restore(flags);

        /* Drop the reference returned by unclone_ctx(), outside the lock. */
        if (clone_ctx)
                put_ctx(clone_ctx);
}

/* Arguments and result of the cross-CPU __perf_event_read() call. */
struct perf_read_data {
        struct perf_event *event;       /* event to read */
        bool group;                     /* also update and read the siblings */
        int ret;                        /* 0, or pmu->commit_txn() result for group reads */
};

/*
 * Choose the CPU on which @event should be read.
 *
 * Returns the current CPU when the event's group has
 * PERF_EV_CAP_READ_ACTIVE_PKG set and the current CPU is in the same physical
 * package as @event_cpu; returns @event_cpu in every other case.
 */
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
        u16 local_pkg, event_pkg;
        int local_cpu;

        if (!(event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG))
                return event_cpu;

        local_cpu = smp_processor_id();

        event_pkg = topology_physical_package_id(event_cpu);
        local_pkg = topology_physical_package_id(local_cpu);

        return (event_pkg == local_pkg) ? local_cpu : event_cpu;
}

/*
 * Cross CPU call to read the hardware event
 *
 * @info points at a struct perf_read_data. On return data->ret is 0 after a
 * single-event read, or the pmu->commit_txn() result after a group read; it
 * is left untouched when the function bails out early or the event is not
 * ACTIVE.
 */
static void __perf_event_read(void *info)
{
        struct perf_read_data *data = info;
        struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct pmu *pmu = event->pmu;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu.  If not it has been
         * scheduled out before the smp call arrived.  In that case
         * event->count would have been updated to a recent sample
         * when the event was scheduled out.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        raw_spin_lock(&ctx->lock);
        /* Context (and cgroup) time only advances while EVENT_TIME is active. */
        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }

        perf_event_update_time(event);
        if (data->group)
                perf_event_update_sibling_time(event);

        /* Only an ACTIVE event has a hardware counter to read. */
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                goto unlock;

        if (!data->group) {
                pmu->read(event);
                data->ret = 0;
                goto unlock;
        }

        /* Group read: wrap the individual reads in a READ transaction. */
        pmu->start_txn(pmu, PERF_PMU_TXN_READ);

        pmu->read(event);

        for_each_sibling_event(sub, event) {
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
                         * sibling could be on different (eg: software) PMU.
                         */
                        sub->pmu->read(sub);
                }
        }

        data->ret = pmu->commit_txn(pmu);

unlock:
        raw_spin_unlock(&ctx->lock);
}

/* Return @event's own count plus the accumulated child_count. */
static inline u64 perf_event_count(struct perf_event *event)
{
        u64 own = local64_read(&event->count);
        u64 children = atomic64_read(&event->child_count);

        return own + children;
}

/*
 * Sample perf_clock() into *@now and derive @event's enabled/running times
 * for that instant via perf_event_time_now() and __perf_update_times().
 */
static void calc_timer_values(struct perf_event *event,
                                u64 *now,
                                u64 *enabled,
                                u64 *running)
{
        u64 clock = perf_clock();
        u64 ctx_time;

        *now = clock;
        ctx_time = perf_event_time_now(event, clock);
        __perf_update_times(event, ctx_time, enabled, running);
}

/*
 * NMI-safe method to read a local event, that is an event that
 * is:
 *   - either for the current task, or for this CPU
 *   - does not have inherit set, for inherited task events
 *     will not be local and we cannot read them atomically
 *   - must not have a pmu::count method
 *
 * @event:   event to read
 * @value:   receives event->count on success
 * @enabled: optional (may be NULL); receives the enabled time
 * @running: optional (may be NULL); receives the running time
 *
 * Returns 0 on success, -EOPNOTSUPP for an event with inherit set, -EINVAL
 * when the event is not for current / not for this CPU, and -EBUSY when a
 * pinned event is not running on this CPU. On failure none of the output
 * pointers is written.
 */
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running)
{
        unsigned long flags;
        int ret = 0;

        /*
         * Disabling interrupts avoids all counter scheduling (context
         * switches, timer based rotation and IPIs).
         */
        local_irq_save(flags);

        /*
         * It must not be an event with inherit set, we cannot read
         * all child counters from atomic context.
         */
        if (event->attr.inherit) {
                ret = -EOPNOTSUPP;
                goto out;
        }

        /* If this is a per-task event, it must be for current */
        if ((event->attach_state & PERF_ATTACH_TASK) &&
            event->hw.target != current) {
                ret = -EINVAL;
                goto out;
        }

        /* If this is a per-CPU event, it must be for this CPU */
        if (!(event->attach_state & PERF_ATTACH_TASK) &&
            event->cpu != smp_processor_id()) {
                ret = -EINVAL;
                goto out;
        }

        /* If this is a pinned event it must be running on this CPU */
        if (event->attr.pinned && event->oncpu != smp_processor_id()) {
                ret = -EBUSY;
                goto out;
        }

        /*
         * If the event is currently on this CPU, it's either a per-task event,
         * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
         * oncpu == -1).
         */
        if (event->oncpu == smp_processor_id())
                event->pmu->read(event);

        *value = local64_read(&event->count);
        if (enabled || running) {
                u64 __enabled, __running, __now;

                /* Times are only computed when the caller asked for one. */
                calc_timer_values(event, &__now, &__enabled, &__running);
                if (enabled)
                        *enabled = __enabled;
                if (running)
                        *running = __running;
        }
out:
        local_irq_restore(flags);

        return ret;
}

/*
 * Bring @event's count and times up to date; with @group set the siblings
 * are included as well.
 *
 * An ACTIVE event is read through a cross-CPU call to __perf_event_read();
 * an INACTIVE event only gets its times updated under ctx->lock. Events in
 * any other state are left alone.
 *
 * Returns 0, or the value __perf_event_read() stored in data.ret.
 */
static int perf_event_read(struct perf_event *event, bool group)
{
        enum perf_event_state state = READ_ONCE(event->state);
        int event_cpu, ret = 0;

        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
again:
        if (state == PERF_EVENT_STATE_ACTIVE) {
                struct perf_read_data data;

                /*
                 * Orders the ->state and ->oncpu loads such that if we see
                 * ACTIVE we must also see the right ->oncpu.
                 *
                 * Matches the smp_wmb() from event_sched_in().
                 */
                smp_rmb();

                event_cpu = READ_ONCE(event->oncpu);
                /* The unsigned compare also rejects negative values such as -1. */
                if ((unsigned)event_cpu >= nr_cpu_ids)
                        return 0;

                data = (struct perf_read_data){
                        .event = event,
                        .group = group,
                        .ret = 0,
                };

                /* __perf_event_read_cpu() uses smp_processor_id(). */
                preempt_disable();
                event_cpu = __perf_event_read_cpu(event, event_cpu);

                /*
                 * Purposely ignore the smp_call_function_single() return
                 * value.
                 *
                 * If event_cpu isn't a valid CPU it means the event got
                 * scheduled out and that will have updated the event count.
                 *
                 * Therefore, either way, we'll have an up-to-date event count
                 * after this.
                 */
                (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
                preempt_enable();
                ret = data.ret;

        } else if (state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;

                raw_spin_lock_irqsave(&ctx->lock, flags);
                /* Re-check under the lock; retry if the state changed meanwhile. */
                state = event->state;
                if (state != PERF_EVENT_STATE_INACTIVE) {
                        raw_spin_unlock_irqrestore(&ctx->lock, flags);
                        goto again;
                }

                /*
                 * May read while context is not active (e.g., thread is
                 * blocked), in that case we cannot update context time
                 */
                if (ctx->is_active & EVENT_TIME) {
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }

                perf_event_update_time(event);
                if (group)
                        perf_event_update_sibling_time(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }

        return ret;
}

/*
 * Initialize a perf_event context: its locks, list heads, pinned/flexible
 * group trees, and a reference count of 1.
 */
static void __perf_event_init_context(struct perf_event_context *ctx)
{
        /* Locks. */
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);

        /* List heads. */
        INIT_LIST_HEAD(&ctx->active_ctx_list);
        INIT_LIST_HEAD(&ctx->event_list);
        INIT_LIST_HEAD(&ctx->pinned_active);
        INIT_LIST_HEAD(&ctx->flexible_active);

        /* Event group trees. */
        perf_event_groups_init(&ctx->pinned_groups);
        perf_event_groups_init(&ctx->flexible_groups);

        /* The initial reference. */
        refcount_set(&ctx->refcount, 1);
}

/*
 * Allocate and initialize a zeroed perf_event_context for @pmu.
 *
 * When @task is non-NULL, get_task_struct() is called on it and the result
 * is stored in ctx->task. Returns NULL when the allocation fails.
 */
static struct perf_event_context *
alloc_perf_context(struct pmu *pmu, struct task_struct *task)
{
        struct perf_event_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return NULL;

        __perf_event_init_context(ctx);
        ctx->pmu = pmu;
        if (task)
                ctx->task = get_task_struct(task);

        return ctx;
}

/*
 * Look up the task for @vpid (0 selects current) under rcu_read_lock() and
 * call get_task_struct() on it.
 *
 * Returns the task, or ERR_PTR(-ESRCH) when no task was found.
 */
static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
        struct task_struct *task;

        rcu_read_lock();
        task = vpid ? find_task_by_vpid(vpid) : current;
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        return task ? task : ERR_PTR(-ESRCH);
}

/*
 * Returns a matching context with refcount and pincount.
 *
 * @pmu:   PMU whose per-CPU or per-task context is wanted.
 * @task:  target task, or NULL to select the per-CPU context of event->cpu.
 * @event: event being set up; ->cpu, ->attr and ->attach_state are read.
 *
 * On success ctx->pin_count has been incremented on the returned context.
 * On failure an ERR_PTR() is returned: the error from perf_allow_cpu(),
 * -EINVAL when pmu->task_ctx_nr is negative, -ENOMEM on allocation
 * failure, or -ESRCH when the task is already exiting.
 */
static struct perf_event_context *
find_get_context(struct pmu *pmu, struct task_struct *task,
                struct perf_event *event)
{
        struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_cpu_context *cpuctx;
        void *task_ctx_data = NULL;
        unsigned long flags;
        int ctxn, err;
        int cpu = event->cpu;

        if (!task) {
                /* Must be root to operate on a CPU event: */
                err = perf_allow_cpu(&event->attr);
                if (err)
                        return ERR_PTR(err);

                /* CPU contexts are embedded in the PMU's per-CPU data. */
                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
                /* pin_count is only modified under ctx->lock. */
                raw_spin_lock_irqsave(&ctx->lock, flags);
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                return ctx;
        }

        err = -EINVAL;
        ctxn = pmu->task_ctx_nr;
        if (ctxn < 0)
                goto errout;

        /*
         * Allocate the PMU-specific per-task data up front, before any
         * lock is taken; it is handed to the context below if the context
         * does not already have one.
         */
        if (event->attach_state & PERF_ATTACH_TASK_DATA) {
                task_ctx_data = alloc_task_ctx_data(pmu);
                if (!task_ctx_data) {
                        err = -ENOMEM;
                        goto errout;
                }
        }

retry:
        /*
         * On success ctx->lock is held with interrupts saved in flags
         * (it is released below). NOTE(review): presumably the context is
         * also returned with a reference held -- not visible here.
         */
        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                clone_ctx = unclone_ctx(ctx);
                ++ctx->pin_count;

                /* Transfer ownership of task_ctx_data if ctx has none yet. */
                if (task_ctx_data && !ctx->task_ctx_data) {
                        ctx->task_ctx_data = task_ctx_data;
                        task_ctx_data = NULL;
                }
                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                /* Drop whatever unclone_ctx() handed back, outside the lock. */
                if (clone_ctx)
                        put_ctx(clone_ctx);
        } else {
                /* No context for this task yet: create and install one. */
                ctx = alloc_perf_context(pmu, task);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;

                if (task_ctx_data) {
                        ctx->task_ctx_data = task_ctx_data;
                        task_ctx_data = NULL;
                }

                err = 0;
                mutex_lock(&task->perf_event_mutex);
                /*
                 * If it has already passed perf_event_exit_task().
                 * we must see PF_EXITING, it takes this mutex too.
                 */
                if (task->flags & PF_EXITING)
                        err = -ESRCH;
                else if (task->perf_event_ctxp[ctxn])
                        /* Somebody else installed a context first; retry. */
                        err = -EAGAIN;
                else {
                        /*
                         * Extra reference on top of the initial one from
                         * allocation, then publish the context on the task.
                         */
                        get_ctx(ctx);
                        ++ctx->pin_count;
                        rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
                }
                mutex_unlock(&task->perf_event_mutex);

                if (unlikely(err)) {
                        /* Drop the never-published context. */
                        put_ctx(ctx);

                        if (err == -EAGAIN)
                                goto retry;
                        goto errout;
                }
        }

        /*
         * task_ctx_data is NULL here if ownership went to the context;
         * otherwise the unused allocation is released.
         */
        free_task_ctx_data(pmu, task_ctx_data);
        return ctx;

errout:
        free_task_ctx_data(pmu, task_ctx_data);
        return ERR_PTR(err);
}

static void perf_event_free_filter(struct perf_event *event);
static void perf_event_free_bpf_prog(struct perf_event *event);

/*
 * RCU callback that performs the final release of a perf_event: drops the
 * pid namespace reference (if one is held), frees the event's filter and
 * then frees the event memory itself.
 */
static void free_event_rcu(struct rcu_head *head)
{
        struct perf_event *event = container_of(head, struct perf_event, rcu_head);

        if (event->ns)
                put_pid_ns(event->ns);

        perf_event_free_filter(event);
        kfree(event);
}

static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb);

/*
 * Unlink @event from the pmu_sb_events list of its CPU (event->cpu).
 * The removal is RCU-aware and done under that list's lock.
 */
static void detach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *sb_list;

        sb_list = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&sb_list->lock);
        list_del_rcu(&event->sb_list);
        raw_spin_unlock(&sb_list->lock);
}

/*
 * Tell whether @event is tracked on the pmu_sb_events lists.
 *
 * Inherited events (event->parent set) and events attached to a task never
 * qualify. Otherwise the event qualifies when any of the mmap, comm, task,
 * ksymbol, context_switch, text_poke or bpf_event attribute bits is set.
 */
static bool is_sb_event(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        if (event->parent || (event->attach_state & PERF_ATTACH_TASK))
                return false;

        return attr->mmap || attr->mmap_data || attr->mmap2 ||
               attr->comm || attr->comm_exec ||
               attr->task || attr->ksymbol ||
               attr->context_switch || attr->text_poke ||
               attr->bpf_event;
}

/*
 * Remove @event from the pmu_sb_events list, but only if it is the kind
 * of event that is_sb_event() accepts.
 */
static void unaccount_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        detach_sb_event(event);
}

/*
 * Drop the per-CPU cgroup event count for @cpu when @event is a top-level
 * (non-inherited) cgroup event. Other events are left alone.
 */
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
        if (!event->parent && is_cgroup_event(event))
                atomic_dec(&per_cpu(perf_cgroup_events, cpu));
}

#ifdef CONFIG_NO_HZ_FULL
/* Serialises the nr_freq_events decrement with the tick dependency update. */
static DEFINE_SPINLOCK(nr_freq_lock);
#endif

/*
 * Drop one frequency-event count and, when it was the last one, clear the
 * perf tick dependency bit. Compiles to an empty function without
 * CONFIG_NO_HZ_FULL.
 */
static void unaccount_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        bool was_last;

        spin_lock(&nr_freq_lock);
        was_last = atomic_dec_and_test(&nr_freq_events);
        if (was_last)
                tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Drop one frequency-event count. When nohz-full is enabled the locked
 * variant is used so the tick dependency can be cleared; otherwise a plain
 * atomic decrement is enough.
 */
static void unaccount_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_dec(&nr_freq_events);
                return;
        }

        unaccount_freq_event_nohz();
}

static void unaccount_event(struct perf_event *event)
{
        bool dec = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                dec = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_dec(&nr_mmap_events);
        if (event->attr.comm)
                atomic_dec(&nr_comm_events);
        if (event->attr.namespaces)
                atomic_dec(&nr_namespaces_events);
        if (event->attr.cgroup)
                atomic_dec(&nr_cgroup_events);
        if (event->attr.task)
                atomic_dec(&nr_task_events);
        if (event->attr.freq)
                unaccount_freq_event();
        if (event->attr.context_switch) {
                dec = true;
                atomic_dec(&nr_switch_events);
        }
        if (is_cgroup_event(event))
                dec = true;
        if (has_branch_stack(event))
                dec = true;
        if (event->attr.ksymbol)
                atomic_dec(&nr_ksymbol_events);
        if (event->attr.bpf_event)
                atomic_dec(&nr_bpf_events);
        if (event->attr.text_poke)
                atomic_dec(&nr_text_poke_events);

        if (dec) {
                if (!atomic_add_unless(&perf_sched_count, -1, 1))
                        schedule_delayed_work(&perf_sched_work, HZ);
        }

        unaccount_event_cpu(event, event->cpu);

        unaccount_pmu_sb_event(event);
}

static void perf_sched_delayed(struct work_struct *work)
{
        mutex_lock(&perf_sched_mutex);
        if (atomic_dec_and_test(&perf_sched_count))
                static_branch_disable(&perf_sched_events);
        mutex_unlock(&perf_sched_mutex);
}

/*
 * The following implement mutual exclusion of events on "exclusive" pmus
 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
 * at a time, so we disallow creating events that might conflict, namely:
 *
 *  1) cpu-wide events in the presence of per-task events,
 *  2) per-task events in the presence of cpu-wide events,
 *  3) two matching events on the same context.
 *
 * The former two cases are handled in the allocation path (perf_event_alloc(),
 * _free_event()), the latter -- before the first perf_install_in_context().
 */
static int exclusive_event_init(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (!is_exclusive_pmu(pmu))
                return 0;

        /*
         * Prevent co-existence of per-task and cpu-wide events on the
         * same exclusive pmu.
         *
         * Negative pmu::exclusive_cnt means there are cpu-wide
         * events on this "exclusive" pmu, positive means there are
         * per-task events.
         *
         * Since this is called in perf_event_alloc() path, event::ctx
         * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
         * to mean "per-task event", because unlike other attach states it
         * never gets cleared.
         */
        if (event->attach_state & PERF_ATTACH_TASK) {
                if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
                        return -EBUSY;
        } else {
                if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
                        return -EBUSY;
        }

        return 0;
}

/*
 * Reverse the accounting done by exclusive_event_init(): per-task events
 * decrement pmu::exclusive_cnt, cpu-wide events increment it. Does nothing
 * for non-exclusive PMUs.
 */
static void exclusive_event_destroy(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (!is_exclusive_pmu(pmu))
                return;

        /* see comment in exclusive_event_init() */
        if (!(event->attach_state & PERF_ATTACH_TASK)) {
                atomic_inc(&pmu->exclusive_cnt);
                return;
        }

        atomic_dec(&pmu->exclusive_cnt);
}

/*
 * Two events match when they share a PMU and their CPUs overlap: either
 * the same CPU number, or at least one of them has cpu == -1.
 */
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
{
        if (e1->pmu != e2->pmu)
                return false;

        return e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1;
}

/*
 * Check whether @event may be installed into @ctx. For an exclusive PMU
 * this fails if any event already on ctx->event_list matches @event (see
 * exclusive_event_match()). Non-exclusive PMUs always pass.
 *
 * The caller must hold ctx->mutex.
 */
static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx)
{
        struct perf_event *other;

        lockdep_assert_held(&ctx->mutex);

        if (!is_exclusive_pmu(event->pmu))
                return true;

        list_for_each_entry(other, &ctx->event_list, event_entry) {
                if (exclusive_event_match(other, event))
                        return false;
        }

        return true;
}

static void perf_addr_filters_splice(struct perf_event *event,
                                       struct list_head *head);

/*
 * Tear down @event and queue its memory for freeing after an RCU grace
 * period (free_event_rcu()). The callers visible in this file (free_event()
 * and put_event()) only get here once the reference count has dropped to
 * zero. The order of the steps below matters; see the inline comments.
 */
static void _free_event(struct perf_event *event)
{
        /* Wait for any irq_work still pending on this event. */
        irq_work_sync(&event->pending);

        unaccount_event(event);

        security_perf_event_free(event);

        if (event->rb) {
                /*
                 * Can happen when we close an event with re-directed output.
                 *
                 * Since we have a 0 refcount, perf_mmap_close() will skip
                 * over us; possibly making our ring_buffer_put() the last.
                 */
                mutex_lock(&event->mmap_mutex);
                ring_buffer_attach(event, NULL);
                mutex_unlock(&event->mmap_mutex);
        }

        if (is_cgroup_event(event))
                perf_detach_cgroup(event);

        /* Only top-level events release callchain buffers here. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
        }

        perf_event_free_bpf_prog(event);
        perf_addr_filters_splice(event, NULL);
        kfree(event->addr_filter_ranges);

        /* PMU-specific destructor, if one was installed. */
        if (event->destroy)
                event->destroy(event);

        /*
         * Must be after ->destroy(), due to uprobe_perf_close() using
         * hw.target.
         */
        if (event->hw.target)
                put_task_struct(event->hw.target);

        /*
         * perf_event_free_task() relies on put_ctx() being 'last', in particular
         * all task references must be cleaned up.
         */
        if (event->ctx)
                put_ctx(event->ctx);

        exclusive_event_destroy(event);
        module_put(event->pmu->module);

        /* The memory itself is released from the RCU callback. */
        call_rcu(&event->rcu_head, free_event_rcu);
}

/*
 * Free an event whose reference count is known to be exactly 1, e.g. in
 * error paths where the event has not been exposed yet, or for inherited
 * events. If the count turns out to be anything else, warn and leak the
 * event rather than risk a use-after-free.
 */
static void free_event(struct perf_event *event)
{
        long old = atomic_long_cmpxchg(&event->refcount, 1, 0);

        if (WARN(old != 1,
                 "unexpected event refcount: %ld; ptr=%p\n",
                 atomic_long_read(&event->refcount), event)) {
                /* leak to avoid use-after-free */
                return;
        }

        _free_event(event);
}

/*
 * Remove user event from the owner task.
 *
 * If event->owner is still set, the event is unlinked from the owner's
 * list (event->owner_entry) under owner->perf_event_mutex and event->owner
 * is cleared. A temporary task reference keeps the owner alive while its
 * mutex is taken.
 */
static void perf_remove_from_owner(struct perf_event *event)
{
        struct task_struct *owner;

        rcu_read_lock();
        /*
         * Matches the smp_store_release() in perf_event_exit_task(). If we
         * observe !owner it means the list deletion is complete and we can
         * indeed free this event, otherwise we need to serialize on
         * owner->perf_event_mutex.
         */
        owner = READ_ONCE(event->owner);
        if (owner) {
                /*
                 * Since delayed_put_task_struct() also drops the last
                 * task reference we can safely take a new reference
                 * while holding the rcu_read_lock().
                 */
                get_task_struct(owner);
        }
        rcu_read_unlock();

        if (owner) {
                /*
                 * If we're here through perf_event_exit_task() we're already
                 * holding ctx->mutex which would be an inversion wrt. the
                 * normal lock order.
                 *
                 * However we can safely take this lock because it's the child
                 * ctx->mutex.
                 */
                mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);

                /*
                 * We have to re-check the event->owner field, if it is cleared
                 * we raced with perf_event_exit_task(), acquiring the mutex
                 * ensured they're done, and we can proceed with freeing the
                 * event.
                 */
                if (event->owner) {
                        list_del_init(&event->owner_entry);
                        smp_store_release(&event->owner, NULL);
                }
                mutex_unlock(&owner->perf_event_mutex);
                /* Drop the temporary reference taken under RCU above. */
                put_task_struct(owner);
        }
}

/*
 * Drop one reference on @event; the event is torn down via _free_event()
 * when this was the last reference.
 */
static void put_event(struct perf_event *event)
{
        if (atomic_long_dec_and_test(&event->refcount))
                _free_event(event);
}

/*
 * Kill an event dead; while event:refcount will preserve the event
 * object, it will not preserve its functionality. Once the last 'user'
 * gives up the object, we'll destroy the thing.
 *
 * The event is removed from its owner and context, marked
 * PERF_EVENT_STATE_DEAD, and every inherited child on event->child_list is
 * detached from its own context and freed. Finally the caller's reference
 * on @event is dropped.
 *
 * Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *child, *tmp;
        /* Children collected here are freed once all locks are dropped. */
        LIST_HEAD(free_list);

        /*
         * If we got here through err_file: fput(event_file); we will not have
         * attached to a context yet.
         */
        if (!ctx) {
                WARN_ON_ONCE(event->attach_state &
                                (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
                goto no_ctx;
        }

        if (!is_kernel_event(event))
                perf_remove_from_owner(event);

        ctx = perf_event_ctx_lock(event);
        WARN_ON_ONCE(ctx->parent_ctx);
        perf_remove_from_context(event, DETACH_GROUP);

        raw_spin_lock_irq(&ctx->lock);
        /*
         * Mark this event as STATE_DEAD, there is no external reference to it
         * anymore.
         *
         * Anybody acquiring event->child_mutex after the below loop _must_
         * also see this, most importantly inherit_event() which will avoid
         * placing more children on the list.
         *
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
        event->state = PERF_EVENT_STATE_DEAD;
        raw_spin_unlock_irq(&ctx->lock);

        perf_event_ctx_unlock(event, ctx);

        /*
         * Each pass through the loop body handles (at most) the first child
         * and then restarts from 'again', because child_mutex is dropped
         * in the middle and the list may have changed.
         */
again:
        mutex_lock(&event->child_mutex);
        list_for_each_entry(child, &event->child_list, child_list) {

                /*
                 * Cannot change, child events are not migrated, see the
                 * comment with perf_event_ctx_lock_nested().
                 */
                ctx = READ_ONCE(child->ctx);
                /*
                 * Since child_mutex nests inside ctx::mutex, we must jump
                 * through hoops. We start by grabbing a reference on the ctx.
                 *
                 * Since the event cannot get freed while we hold the
                 * child_mutex, the context must also exist and have a !0
                 * reference count.
                 */
                get_ctx(ctx);

                /*
                 * Now that we have a ctx ref, we can drop child_mutex, and
                 * acquire ctx::mutex without fear of it going away. Then we
                 * can re-acquire child_mutex.
                 */
                mutex_unlock(&event->child_mutex);
                mutex_lock(&ctx->mutex);
                mutex_lock(&event->child_mutex);

                /*
                 * Now that we hold ctx::mutex and child_mutex, revalidate our
                 * state, if child is still the first entry, it didn't get freed
                 * and we can continue doing so.
                 */
                tmp = list_first_entry_or_null(&event->child_list,
                                               struct perf_event, child_list);
                if (tmp == child) {
                        perf_remove_from_context(child, DETACH_GROUP);
                        list_move(&child->child_list, &free_list);
                        /*
                         * This matches the refcount bump in inherit_event();
                         * this can't be the last reference.
                         */
                        put_event(event);
                }

                mutex_unlock(&event->child_mutex);
                mutex_unlock(&ctx->mutex);
                /* Drop the temporary context reference taken above. */
                put_ctx(ctx);
                goto again;
        }
        mutex_unlock(&event->child_mutex);

        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
                /*
                 * Compute the wake-up key before free_event(): the child
                 * is not dereferenced again after it has been freed.
                 */
                void *var = &child->ctx->refcount;

                list_del(&child->child_list);
                free_event(child);

                /*
                 * Wake any perf_event_free_task() waiting for this event to be
                 * freed.
                 */
                smp_mb(); /* pairs with wait_var_event() */
                wake_up_var(var);
        }

no_ctx:
        put_event(event); /* Must be the 'last' reference */
        return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);

/*
 * File release hook, called when the last reference to the file is gone.
 * The perf event is stored in file->private_data. @inode is unused.
 * Always returns 0.
 */
static int perf_release(struct inode *inode, struct file *file)
{
        struct perf_event *event = file->private_data;

        perf_event_release_kernel(event);

        return 0;
}

/*
 * Read @event together with all of its inherited children and return the
 * summed count. *@enabled and *@running receive the summed enabled/running
 * times, including the child_total_time_* accumulators kept on @event.
 *
 * event->child_mutex is held while the child list is walked.
 */
static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event *pos;
        u64 sum = 0;

        *enabled = 0;
        *running = 0;

        mutex_lock(&event->child_mutex);

        /* The event itself first ... */
        (void)perf_event_read(event, false);
        sum += perf_event_count(event);

        *enabled += event->total_time_enabled +
                        atomic64_read(&event->child_total_time_enabled);
        *running += event->total_time_running +
                        atomic64_read(&event->child_total_time_running);

        /* ... then every child currently on the list. */
        list_for_each_entry(pos, &event->child_list, child_list) {
                (void)perf_event_read(pos, false);
                sum += perf_event_count(pos);
                *enabled += pos->total_time_enabled;
                *running += pos->total_time_running;
        }

        mutex_unlock(&event->child_mutex);

        return sum;
}

/*
 * Locked wrapper around __perf_event_read_value(): takes the event's
 * context lock for the duration of the read and returns the summed count.
 */
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        u64 value = __perf_event_read_value(event, enabled, running);

        perf_event_ctx_unlock(event, ctx);

        return value;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

/*
 * Read the group led by @leader and fold its numbers into @values.
 *
 * Counts and times are accumulated with '+=', so repeated calls (for the
 * parent and then each inherited child, see perf_read_group()) sum up in
 * the same slots; the id and lost-sample slots are overwritten with '='.
 * values[0] is not touched here.
 *
 * Returns 0 on success, the error from perf_event_read(), or -ECHILD when
 * the inherited group no longer matches its parent's layout.
 */
static int __perf_read_group_add(struct perf_event *leader,
                                        u64 read_format, u64 *values)
{
        struct perf_event_context *ctx = leader->ctx;
        struct perf_event *sub, *parent;
        unsigned long flags;
        int n = 1; /* skip @nr */
        int ret;

        ret = perf_event_read(leader, true);
        if (ret)
                return ret;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        /*
         * Verify the grouping between the parent and child (inherited)
         * events is still intact.
         *
         * Specifically:
         *  - leader->ctx->lock pins leader->sibling_list
         *  - parent->child_mutex pins parent->child_list
         *  - parent->ctx->mutex pins parent->sibling_list
         *
         * Because parent->ctx != leader->ctx (and child_list nests inside
         * ctx->mutex), group destruction is not atomic between children, also
         * see perf_event_release_kernel(). Additionally, parent can grow the
         * group.
         *
         * Therefore it is possible to have parent and child groups in a
         * different configuration and summing over such a beast makes no sense
         * whatsoever.
         *
         * Reject this.
         */
        parent = leader->parent;
        if (parent &&
            (parent->group_generation != leader->group_generation ||
             parent->nr_siblings != leader->nr_siblings)) {
                ret = -ECHILD;
                goto unlock;
        }

        /*
         * Since we co-schedule groups, {enabled,running} times of siblings
         * will be identical to those of the leader, so we only publish one
         * set.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
                values[n++] += leader->total_time_enabled +
                        atomic64_read(&leader->child_total_time_enabled);
        }

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
                values[n++] += leader->total_time_running +
                        atomic64_read(&leader->child_total_time_running);
        }

        /*
         * Write {count,id} tuples for every sibling.
         * With PERF_FORMAT_LOST each tuple also carries the lost-sample
         * count. The leader's tuple comes first.
         */
        values[n++] += perf_event_count(leader);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
        if (read_format & PERF_FORMAT_LOST)
                values[n++] = atomic64_read(&leader->lost_samples);

        for_each_sibling_event(sub, leader) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
                if (read_format & PERF_FORMAT_LOST)
                        values[n++] = atomic64_read(&sub->lost_samples);
        }

unlock:
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
        return ret;
}

/*
 * Read a whole event group into a temporary buffer and copy it to @buf.
 *
 * values[0] holds the number of events in the group (leader plus
 * siblings); the rest is filled by __perf_read_group_add(), once for the
 * leader and once per inherited child of the leader.
 *
 * The caller must hold the leader's ctx->mutex.
 *
 * Returns event->read_size on success, -ENOMEM if the buffer cannot be
 * allocated, -EFAULT if the copy to user space fails, or the error
 * reported by __perf_read_group_add().
 */
static int perf_read_group(struct perf_event *event,
                                   u64 read_format, char __user *buf)
{
        struct perf_event *leader = event->group_leader, *child;
        struct perf_event_context *ctx = leader->ctx;
        u64 *values;
        int ret;

        lockdep_assert_held(&ctx->mutex);

        values = kzalloc(event->read_size, GFP_KERNEL);
        if (!values)
                return -ENOMEM;

        values[0] = 1 + leader->nr_siblings;

        mutex_lock(&leader->child_mutex);

        ret = __perf_read_group_add(leader, read_format, values);
        if (!ret) {
                list_for_each_entry(child, &leader->child_list, child_list) {
                        ret = __perf_read_group_add(child, read_format, values);
                        if (ret)
                                break;
                }
        }

        mutex_unlock(&leader->child_mutex);

        /* Only publish the buffer when every read succeeded. */
        if (!ret) {
                ret = event->read_size;
                if (copy_to_user(buf, values, event->read_size))
                        ret = -EFAULT;
        }

        kfree(values);
        return ret;
}

/*
 * Read a single (non-group) event and copy the result to @buf.
 *
 * The output is the count followed by whichever optional fields
 * @read_format selects, in this order: time enabled, time running, id,
 * lost samples. That is at most five u64 values.
 *
 * Returns the number of bytes copied, or -EFAULT if the copy fails.
 */
static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
{
        u64 time_enabled, time_running;
        u64 out[5];
        int cnt = 0;

        out[cnt++] = __perf_event_read_value(event, &time_enabled, &time_running);

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                out[cnt++] = time_enabled;
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                out[cnt++] = time_running;
        if (read_format & PERF_FORMAT_ID)
                out[cnt++] = primary_event_id(event);
        if (read_format & PERF_FORMAT_LOST)
                out[cnt++] = atomic64_read(&event->lost_samples);

        if (copy_to_user(buf, out, cnt * sizeof(u64)))
                return -EFAULT;

        return cnt * sizeof(u64);
}

/*
 * An event counts as hung up when its state is PERF_EVENT_STATE_EXIT or
 * lower and it has no children left on its child_list. The list is checked
 * under event->child_mutex.
 */
static bool is_event_hup(struct perf_event *event)
{
        bool childless;

        if (event->state > PERF_EVENT_STATE_EXIT)
                return false;

        mutex_lock(&event->child_mutex);
        childless = list_empty(&event->child_list);
        mutex_unlock(&event->child_mutex);

        return childless;
}

/*
 * Read the performance event - simple non blocking version for now.
 *
 * Returns 0 (end-of-file) for an event in error state, -ENOSPC when
 * @count is smaller than event->read_size, and otherwise the result of
 * the group or single-event read selected by PERF_FORMAT_GROUP.
 */
static ssize_t
__perf_read(struct perf_event *event, char __user *buf, size_t count)
{
        u64 read_format = event->attr.read_format;

        /*
         * An event in error state (i.e. one that was pinned but could not
         * be scheduled on to the CPU at some point) reads as end-of-file.
         */
        if (event->state == PERF_EVENT_STATE_ERROR)
                return 0;

        if (count < event->read_size)
                return -ENOSPC;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        if (read_format & PERF_FORMAT_GROUP)
                return perf_read_group(event, read_format, buf);

        return perf_read_one(event, read_format, buf);
}

/*
 * read() file operation for perf event files. After the security hook
 * approves the read, __perf_read() is run with the event's context locked.
 * @ppos is unused.
 *
 * Returns the security hook's error, or whatever __perf_read() returns.
 */
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *ctx;
        int ret;

        ret = security_perf_event_read(event);
        if (!ret) {
                ctx = perf_event_ctx_lock(event);
                ret = __perf_read(event, buf, count);
                perf_event_ctx_unlock(event, ctx);
        }

        return ret;
}

/*
 * poll() file operation for perf event files.
 *
 * Returns EPOLLHUP when the event is hung up (see is_event_hup()) or has
 * no ring buffer attached; otherwise returns the poll bits stored in the
 * ring buffer and resets them to zero.
 */
static __poll_t perf_poll(struct file *file, poll_table *wait)
{
        struct perf_event *event = file->private_data;
        __poll_t mask = EPOLLHUP;
        struct perf_buffer *rb;

        poll_wait(file, &event->waitq, wait);

        if (!is_event_hup(event)) {
                /*
                 * Pin the event->rb by taking event->mmap_mutex; otherwise
                 * perf_event_set_output() can swizzle our rb and make us
                 * miss wakeups.
                 */
                mutex_lock(&event->mmap_mutex);
                rb = event->rb;
                if (rb)
                        mask = atomic_xchg(&rb->poll, 0);
                mutex_unlock(&event->mmap_mutex);
        }

        return mask;
}

/*
 * Reset @event's count to zero. The event is read first (result ignored),
 * then event->count is cleared and the user page is updated.
 */
static void _perf_event_reset(struct perf_event *event)
{
        (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
}

/*
 * Disable @event and return its current count, optionally zeroing the
 * count when @reset is true. Runs with the event's context locked.
 *
 * Assumes the event does not have inherit set; warns once if it does.
 */
u64 perf_event_pause(struct perf_event *event, bool reset)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        u64 snapshot;

        WARN_ON_ONCE(event->attr.inherit);
        _perf_event_disable(event);

        snapshot = local64_read(&event->count);
        if (reset)
                local64_set(&event->count, 0);

        perf_event_ctx_unlock(event, ctx);

        return snapshot;
}
EXPORT_SYMBOL_GPL(perf_event_pause);

/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in perf_event_exit_event() if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
static void perf_event_for_each_child(struct perf_event *event,
                                        void (*func)(struct perf_event *))
{
        struct perf_event *child;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        mutex_lock(&event->child_mutex);
        func(event);
        list_for_each_entry(child, &event->child_list, child_list)
                func(child);
        mutex_unlock(&event->child_mutex);
}

static void perf_event_for_each(struct perf_event *event,
                                  void (*func)(struct perf_event *))
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *sibling;

        lockdep_assert_held(&ctx->mutex);

        event = event->group_leader;

        perf_event_for_each_child(event, func);
        for_each_sibling_event(sibling, event)
                perf_event_for_each_child(sibling, func);
}

/*
 * Install a new sample period (or frequency) on @event.
 *
 * @info points to the u64 value. Passed to event_function_call() as the
 * callback by _perf_event_period(). @cpuctx is unused here.
 *
 * If the event is currently active it is stopped and restarted around the
 * update, with the PMU disabled for the duration.
 */
static void __perf_event_period(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        u64 value = *((u64 *)info);
        bool active;

        /* Frequency mode stores a rate; otherwise store the period. */
        if (event->attr.freq) {
                event->attr.sample_freq = value;
        } else {
                event->attr.sample_period = value;
                event->hw.sample_period = value;
        }

        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
                perf_pmu_disable(ctx->pmu);
                /*
                 * We could be throttled; unthrottle now to avoid the tick
                 * trying to unthrottle while we already re-started the event.
                 */
                if (event->hw.interrupts == MAX_INTERRUPTS) {
                        event->hw.interrupts = 0;
                        perf_log_throttle(event, 1);
                }
                event->pmu->stop(event, PERF_EF_UPDATE);
        }

        /* Discard whatever was left of the previous period. */
        local64_set(&event->hw.period_left, 0);

        if (active) {
                event->pmu->start(event, PERF_EF_RELOAD);
                perf_pmu_enable(ctx->pmu);
        }
}

/*
 * Ask the event's PMU whether @value is acceptable, via its
 * ->check_period() callback. The caller in this file treats any non-zero
 * return as a rejection.
 */
static int perf_event_check_period(struct perf_event *event, u64 value)
{
        return event->pmu->check_period(event, value);
}

/*
 * Validate @value and apply it as the new sample period/frequency of
 * @event through __perf_event_period().
 *
 * Returns 0 on success, or -EINVAL when the event is not a sampling
 * event, @value is zero, a frequency exceeds
 * sysctl_perf_event_sample_rate, the PMU rejects the value, or a period
 * has bit 63 set.
 */
static int _perf_event_period(struct perf_event *event, u64 value)
{
        if (!is_sampling_event(event) || !value)
                return -EINVAL;

        if (event->attr.freq) {
                if (value > sysctl_perf_event_sample_rate)
                        return -EINVAL;
        }

        if (perf_event_check_period(event, value))
                return -EINVAL;

        /* A plain period must not have the top bit set. */
        if (!event->attr.freq) {
                if (value & (1ULL << 63))
                        return -EINVAL;
        }

        event_function_call(event, __perf_event_period, &value);

        return 0;
}

/*
 * Locked wrapper around _perf_event_period(): takes the event's context
 * lock while the new period is validated and applied.
 *
 * Returns 0 on success or -EINVAL, as _perf_event_period() does.
 */
int perf_event_period(struct perf_event *event, u64 value)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        int err = _perf_event_period(event, value);

        perf_event_ctx_unlock(event, ctx);

        return err;
}
EXPORT_SYMBOL_GPL(perf_event_period);

static const struct file_operations perf_fops;

/*
 * Look up @fd and check that it refers to a perf event file (its f_op is
 * perf_fops).
 *
 * On success the struct fd is stored in *@p and 0 is returned; the caller
 * is then responsible for the matching fdput(). On failure -EBADF is
 * returned and no reference is left held.
 */
static inline int perf_fget_light(int fd, struct fd *p)
{
        struct fd f = fdget(fd);

        if (!f.file)
                return -EBADF;

        if (f.file->f_op == &perf_fops) {
                *p = f;
                return 0;
        }

        fdput(f);
        return -EBADF;
}

static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr);

static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
        void (*func)(struct perf_event *);
        u32 flags = arg;

        switch (cmd) {
        case PERF_EVENT_IOC_ENABLE:
                func = _perf_event_enable;
                break;
        case PERF_EVENT_IOC_DISABLE:
                func = _perf_event_disable;
                break;
        case PERF_EVENT_IOC_RESET:
                func = _perf_event_reset;
                break;

        case PERF_EVENT_IOC_REFRESH:
                return _perf_event_refresh(event, arg);

        case PERF_EVENT_IOC_PERIOD:
        {
                u64 value;

                if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
                        return -EFAULT;

                return _perf_event_period(event, value);
        }
        case PERF_EVENT_IOC_ID:
        {
                u64 id = primary_event_id(event);

                if (copy_to_user((void __user *)arg, &id, sizeof(id)))
                        return -EFAULT;
                return 0;
        }

        case PERF_EVENT_IOC_SET_OUTPUT:
        {
                int ret;
                if (arg != -1) {
                        struct perf_event *output_event;
                        struct fd output;
                        ret = perf_fget_light(arg, &output);
                        if (ret)
                                return ret;
                        output_event = output.file->private_data;
                        ret = perf_event_set_output(event, output_event);
                        fdput(output);
                } else {
                        ret = perf_event_set_output(event, NULL);
                }
                return ret;
        }

        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);

        case PERF_EVENT_IOC_SET_BPF:
                return perf_event_set_bpf_prog(event, arg);

        case PERF_EVENT_IOC_PAUSE_OUTPUT: {
                struct perf_buffer *rb;

                rcu_read_lock();
                rb = rcu_dereference(event->rb);
                if (!rb || !rb->nr_pages) {
                        rcu_read_unlock();
                        return -EINVAL;
                }
                rb_toggle_paused(rb, !!arg);
                rcu_read_unlock();
                return 0;
        }

        case PERF_EVENT_IOC_QUERY_BPF:
                return perf_event_query_prog_array(event, (void __user *)arg);

        case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
                struct perf_event_attr new_attr;
                int err = perf_copy_attr((struct perf_event_attr __user *)arg,
                                         &new_attr);

                if (err)
                        return err;

                return perf_event_modify_attr(event,  &new_attr);
        }
        default:
                return -ENOTTY;
        }

        if (flags & PERF_IOC_FLAG_GROUP)
                perf_event_for_each(event, func);
        else
                perf_event_for_each_child(event, func);

        return 0;
}

/*
 * ->unlocked_ioctl handler for perf event files.
 *
 * Runs the security hook first, then performs the command via
 * _perf_ioctl() with the event's context locked.
 *
 * Returns the security hook's error if it refuses, otherwise the result of
 * _perf_ioctl().
 */
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *locked_ctx;
        long rc;

        /* Treat ioctl like writes as it is likely a mutating operation. */
        rc = security_perf_event_write(event);
        if (rc == 0) {
                locked_ctx = perf_event_ctx_lock(event);
                rc = _perf_ioctl(event, cmd, arg);
                perf_event_ctx_unlock(event, locked_ctx);
        }

        return rc;
}

#ifdef CONFIG_COMPAT
/*
 * ->compat_ioctl handler: rewrite the size field of pointer-carrying
 * commands issued by a compat task so that they match the native command
 * numbers, then hand over to perf_ioctl().
 */
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
{
        const unsigned int nr = _IOC_NR(cmd);
        const bool takes_pointer =
                nr == _IOC_NR(PERF_EVENT_IOC_SET_FILTER) ||
                nr == _IOC_NR(PERF_EVENT_IOC_ID) ||
                nr == _IOC_NR(PERF_EVENT_IOC_QUERY_BPF) ||
                nr == _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES);

        /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case). */
        if (takes_pointer && _IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
                cmd &= ~IOCSIZE_MASK;
                cmd |= sizeof(void *) << IOCSIZE_SHIFT;
        }

        return perf_ioctl(file, cmd, arg);
}
#else
# define perf_compat_ioctl NULL
#endif

/*
 * Enable every event on the current task's perf_event_list, together with
 * each event's children.  Always returns 0.
 */
int perf_event_task_enable(void)
{
        struct perf_event_context *locked_ctx;
        struct perf_event *owned;

        mutex_lock(&current->perf_event_mutex);

        list_for_each_entry(owned, &current->perf_event_list, owner_entry) {
                locked_ctx = perf_event_ctx_lock(owned);
                perf_event_for_each_child(owned, _perf_event_enable);
                perf_event_ctx_unlock(owned, locked_ctx);
        }

        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

/*
 * Disable every event on the current task's perf_event_list, together with
 * each event's children.  Always returns 0.
 */
int perf_event_task_disable(void)
{
        struct perf_event_context *locked_ctx;
        struct perf_event *owned;

        mutex_lock(&current->perf_event_mutex);

        list_for_each_entry(owned, &current->perf_event_list, owner_entry) {
                locked_ctx = perf_event_ctx_lock(owned);
                perf_event_for_each_child(owned, _perf_event_disable);
                perf_event_ctx_unlock(owned, locked_ctx);
        }

        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

static int perf_event_index(struct perf_event *event)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 0;

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;

        return event->pmu->event_idx(event);
}

/*
 * Fill in the static fields of the user page belonging to the event's
 * ring buffer.  Does nothing when no buffer is attached.
 */
static void perf_event_init_userpage(struct perf_event *event)
{
        struct perf_buffer *rb;

        rcu_read_lock();

        rb = rcu_dereference(event->rb);
        if (rb) {
                struct perf_event_mmap_page *page = rb->user_page;

                /* Allow new userspace to detect that bit 0 is deprecated */
                page->cap_bit0_is_deprecated = 1;
                page->size = offsetof(struct perf_event_mmap_page, __reserved);
                /* The data area starts one page in and spans the data size. */
                page->data_offset = PAGE_SIZE;
                page->data_size = perf_data_size(rb);
        }

        rcu_read_unlock();
}

/*
 * Weak default for the architecture hook called by
 * perf_event_update_userpage() while the user page is being updated.
 * This generic version does nothing.
 */
void __weak arch_perf_update_userpage(
        struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}

/*
 * Refresh the counter index, offset and enabled/running times stored in
 * the user page of the event's ring buffer.  Does nothing when no buffer
 * is attached.
 *
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
        struct perf_event_mmap_page *userpg;
        struct perf_buffer *rb;
        u64 enabled, running, now;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        /*
         * Compute total_time_enabled and total_time_running
         * based on snapshot values taken when the event
         * was last scheduled in.
         *
         * We cannot simply call update_context_time()
         * because of locking issues, as we can be called in
         * NMI context.
         */
        calc_timer_values(event, &now, &enabled, &running);

        userpg = rb->user_page;
        /*
         * Disable preemption to guarantee consistent time stamps are stored to
         * the user page.
         */
        preempt_disable();
        /*
         * userpg->lock is incremented once before and once after the field
         * updates; the barrier() calls keep the stores between the two
         * increments.
         */
        ++userpg->lock;
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event);
        /* A non-zero index means the offset excludes hw.prev_count. */
        if (userpg->index)
                userpg->offset -= local64_read(&event->hw.prev_count);

        /* Times include what the children have accumulated. */
        userpg->time_enabled = enabled +
                        atomic64_read(&event->child_total_time_enabled);

        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);

        arch_perf_update_userpage(event, userpg, now);

        barrier();
        ++userpg->lock;
        preempt_enable();
unlock:
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);

static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
        struct perf_event *event = vmf->vma->vm_file->private_data;
        struct perf_buffer *rb;
        vm_fault_t ret = VM_FAULT_SIGBUS;

        if (vmf->flags & FAULT_FLAG_MKWRITE) {
                if (vmf->pgoff == 0)
                        ret = 0;
                return ret;
        }

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
                goto unlock;

        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
        if (!vmf->page)
                goto unlock;

        get_page(vmf->page);
        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
        vmf->page->index   = vmf->pgoff;

        ret = 0;
unlock:
        rcu_read_unlock();

        return ret;
}

/*
 * Switch @event over to ring buffer @rb; @rb may be NULL to just detach.
 *
 * The event is unlinked from the event list of its current buffer (if
 * any), linked onto @rb's list (if any), and then event->rb is published
 * with rcu_assign_pointer().  The reference the event held on the old
 * buffer is dropped at the end.
 *
 * Must not be called on an inherited (child) event.  The callers visible
 * in this file invoke it with event->mmap_mutex held.
 */
static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb)
{
        struct perf_buffer *old_rb = NULL;
        unsigned long flags;

        WARN_ON_ONCE(event->parent);

        if (event->rb) {
                /*
                 * Should be impossible, we set this when removing
                 * event->rb_entry and wait/clear when adding event->rb_entry.
                 */
                WARN_ON_ONCE(event->rcu_pending);

                old_rb = event->rb;
                spin_lock_irqsave(&old_rb->event_lock, flags);
                list_del_rcu(&event->rb_entry);
                spin_unlock_irqrestore(&old_rb->event_lock, flags);

                /*
                 * Record the RCU state at removal time so that a later
                 * re-add of rb_entry can wait for it (see below).
                 */
                event->rcu_batches = get_state_synchronize_rcu();
                event->rcu_pending = 1;
        }

        if (rb) {
                /*
                 * rb_entry was removed from a list earlier: wait for the
                 * recorded RCU state to elapse before linking it again.
                 */
                if (event->rcu_pending) {
                        cond_synchronize_rcu(event->rcu_batches);
                        event->rcu_pending = 0;
                }

                spin_lock_irqsave(&rb->event_lock, flags);
                list_add_rcu(&event->rb_entry, &rb->event_list);
                spin_unlock_irqrestore(&rb->event_lock, flags);
        }

        /*
         * Avoid racing with perf_mmap_close(AUX): stop the event
         * before swizzling the event::rb pointer; if it's getting
         * unmapped, its aux_mmap_count will be 0 and it won't
         * restart. See the comment in __perf_pmu_output_stop().
         *
         * Data will inevitably be lost when set_output is done in
         * mid-air, but then again, whoever does it like this is
         * not in for the data anyway.
         */
        if (has_aux(event))
                perf_event_stop(event, 0);

        rcu_assign_pointer(event->rb, rb);

        if (old_rb) {
                ring_buffer_put(old_rb);
                /*
                 * Since we detached before setting the new rb, so that we
                 * could attach the new rb, we could have missed a wakeup.
                 * Provide it now.
                 */
                wake_up_all(&event->waitq);
        }
}

/*
 * Wake everybody waiting on any event attached to the same ring buffer as
 * @event.  For an inherited event the parent's buffer is used.
 */
static void ring_buffer_wakeup(struct perf_event *event)
{
        struct perf_event *waiter;
        struct perf_buffer *rb;

        /* Children share their parent's buffer. */
        if (event->parent)
                event = event->parent;

        rcu_read_lock();

        rb = rcu_dereference(event->rb);
        if (rb) {
                list_for_each_entry_rcu(waiter, &rb->event_list, rb_entry)
                        wake_up_all(&waiter->waitq);
        }

        rcu_read_unlock();
}

/*
 * Take a reference on the ring buffer used by @event (its parent's buffer
 * for an inherited event).
 *
 * Returns the buffer with its refcount raised, or NULL when no buffer is
 * attached or its refcount has already dropped to zero.  A non-NULL result
 * must be released with ring_buffer_put().
 */
struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
        struct perf_buffer *rb;

        if (event->parent)
                event = event->parent;

        rcu_read_lock();

        rb = rcu_dereference(event->rb);
        /* A buffer whose refcount hit zero is on its way out: ignore it. */
        if (rb && !refcount_inc_not_zero(&rb->refcount))
                rb = NULL;

        rcu_read_unlock();

        return rb;
}

/*
 * Drop a reference on @rb.  When the last reference goes away the buffer
 * is queued for freeing via call_rcu(rb_free_rcu); at that point no event
 * is expected to still be on its event list.
 */
void ring_buffer_put(struct perf_buffer *rb)
{
        if (refcount_dec_and_test(&rb->refcount)) {
                WARN_ON_ONCE(!list_empty(&rb->event_list));

                call_rcu(&rb->rcu_head, rb_free_rcu);
        }
}

static void perf_mmap_open(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;

        atomic_inc(&event->mmap_count);
        atomic_inc(&event->rb->mmap_count);

        if (vma->vm_pgoff)
                atomic_inc(&event->rb->aux_mmap_count);

        if (event->pmu->event_mapped)
                event->pmu->event_mapped(event, vma->vm_mm);
}

static void perf_pmu_output_stop(struct perf_event *event);

/*
 * vm_ops->close handler for perf mmap()s.
 *
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 * the buffer here, where we still have a VM context. This means we need
 * to detach all events redirecting to us.
 */
static void perf_mmap_close(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;
        /*
         * NOTE(review): ring_buffer_get() can return NULL, yet rb is
         * dereferenced unconditionally below -- presumably a live mapping
         * guarantees an attached buffer; confirm.
         */
        struct perf_buffer *rb = ring_buffer_get(event);
        /* Snapshot the accounting data up front, before anything is detached. */
        struct user_struct *mmap_user = rb->mmap_user;
        int mmap_locked = rb->mmap_locked;
        unsigned long size = perf_data_size(rb);
        bool detach_rest = false;

        if (event->pmu->event_unmapped)
                event->pmu->event_unmapped(event, vma->vm_mm);

        /*
         * rb->aux_mmap_count will always drop before rb->mmap_count and
         * event->mmap_count, so it is ok to use event->mmap_mutex to
         * serialize with perf_mmap here.
         */
        if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
            atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
                /*
                 * Stop all AUX events that are writing to this buffer,
                 * so that we can free its AUX pages and corresponding PMU
                 * data. Note that after rb::aux_mmap_count dropped to zero,
                 * they won't start any more (see perf_aux_output_begin()).
                 */
                perf_pmu_output_stop(event);

                /* now it's safe to free the pages */
                atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
                atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);

                /* this has to be the last one */
                rb_free_aux(rb);
                WARN_ON_ONCE(refcount_read(&rb->aux_refcount));

                mutex_unlock(&event->mmap_mutex);
        }

        /* Remember whether this was the buffer's last mapping. */
        if (atomic_dec_and_test(&rb->mmap_count))
                detach_rest = true;

        /* Not the event's last mapping: only drop our rb reference. */
        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
                goto out_put;

        ring_buffer_attach(event, NULL);
        mutex_unlock(&event->mmap_mutex);

        /* If there's still other mmap()s of this buffer, we're done. */
        if (!detach_rest)
                goto out_put;

        /*
         * No other mmap()s, detach from all other events that might redirect
         * into the now unreachable buffer. Somewhat complicated by the
         * fact that rb::event_lock otherwise nests inside mmap_mutex.
         */
again:
        rcu_read_lock();
        /* From here on 'event' is reused as the list iterator. */
        list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
                if (!atomic_long_inc_not_zero(&event->refcount)) {
                        /*
                         * This event is en-route to free_event() which will
                         * detach it and remove it from the list.
                         */
                        continue;
                }
                rcu_read_unlock();

                mutex_lock(&event->mmap_mutex);
                /*
                 * Check we didn't race with perf_event_set_output() which can
                 * swizzle the rb from under us while we were waiting to
                 * acquire mmap_mutex.
                 *
                 * If we find a different rb; ignore this event, a next
                 * iteration will no longer find it on the list. We have to
                 * still restart the iteration to make sure we're not now
                 * iterating the wrong list.
                 */
                if (event->rb == rb)
                        ring_buffer_attach(event, NULL);

                mutex_unlock(&event->mmap_mutex);
                put_event(event);

                /*
                 * Restart the iteration; either we're on the wrong list or
                 * destroyed its integrity by doing a deletion.
                 */
                goto again;
        }
        rcu_read_unlock();

        /*
         * It could be there's still a few 0-ref events on the list; they'll
         * get cleaned up by free_event() -- they'll also still have their
         * ref on the rb and will free it whenever they are done with it.
         *
         * Aside from that, this buffer is 'fully' detached and unmapped,
         * undo the VM accounting.
         */

        /* Data pages plus the user page, minus what was charged as pinned. */
        atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
                        &mmap_user->locked_vm);
        atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
        free_uid(mmap_user);

out_put:
        ring_buffer_put(rb); /* could be last */
}

/*
 * VM operations installed on perf mmap()s by perf_mmap().  Faults and
 * page_mkwrite requests share one handler.
 */
static const struct vm_operations_struct perf_mmap_vmops = {
        .fault                = perf_mmap_fault,
        .page_mkwrite        = perf_mmap_fault,
        .open                = perf_mmap_open,
        .close                = perf_mmap_close, /* non mergeable */
};

/*
 * ->mmap handler for perf event files.
 *
 * A mapping at page offset 0 covers the user page plus a power-of-two
 * number of data pages; it either allocates a new ring buffer for the
 * event or takes another mapping reference on the existing one.  A mapping
 * at a non-zero page offset is an AUX area mapping and requires that a
 * ring buffer is already attached and that the offset/size match what the
 * user page advertises.
 *
 * Newly allocated pages are charged to the user's locked_vm up to the
 * perf mlock limit; anything beyond that is charged to the mm's pinned_vm
 * and checked against RLIMIT_MEMLOCK.
 *
 * Returns 0 on success or a negative errno.  On failure the vma is left
 * untouched: no vm_ops are installed and the PMU's ->event_mapped()
 * callback is not invoked, since no ->close() would follow to undo it.
 */
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct perf_event *event = file->private_data;
        unsigned long user_locked, user_lock_limit;
        struct user_struct *user = current_user();
        struct perf_buffer *rb = NULL;
        unsigned long locked, lock_limit;
        unsigned long vma_size;
        unsigned long nr_pages;
        long user_extra = 0, extra = 0;
        int ret = 0, flags = 0;

        /*
         * Don't allow mmap() of inherited per-task counters. This would
         * create a performance issue due to all children writing to the
         * same rb.
         */
        if (event->cpu == -1 && event->attr.inherit)
                return -EINVAL;

        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;

        ret = security_perf_event_read(event);
        if (ret)
                return ret;

        vma_size = vma->vm_end - vma->vm_start;

        if (vma->vm_pgoff == 0) {
                /* One page is the user page; the rest are data pages. */
                nr_pages = (vma_size / PAGE_SIZE) - 1;
        } else {
                /*
                 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
                 * mapped, all subsequent mappings should have the same size
                 * and offset. Must be above the normal perf buffer.
                 */
                u64 aux_offset, aux_size;

                if (!event->rb)
                        return -EINVAL;

                nr_pages = vma_size / PAGE_SIZE;

                mutex_lock(&event->mmap_mutex);
                ret = -EINVAL;

                /* Re-check under the mutex; the buffer may be gone by now. */
                rb = event->rb;
                if (!rb)
                        goto aux_unlock;

                aux_offset = READ_ONCE(rb->user_page->aux_offset);
                aux_size = READ_ONCE(rb->user_page->aux_size);

                if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
                        goto aux_unlock;

                if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
                        goto aux_unlock;

                /* already mapped with a different offset */
                if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
                        goto aux_unlock;

                if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
                        goto aux_unlock;

                /* already mapped with a different size */
                if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
                        goto aux_unlock;

                if (!is_power_of_2(nr_pages))
                        goto aux_unlock;

                if (!atomic_inc_not_zero(&rb->mmap_count))
                        goto aux_unlock;

                /* AUX area already set up: just count the extra mapping. */
                if (rb_has_aux(rb)) {
                        atomic_inc(&rb->aux_mmap_count);
                        ret = 0;
                        goto unlock;
                }

                atomic_set(&rb->aux_mmap_count, 1);
                user_extra = nr_pages;

                goto accounting;
        }

        /*
         * If we have rb pages ensure they're a power-of-two number, so we
         * can do bitmasks instead of modulo.
         */
        if (nr_pages != 0 && !is_power_of_2(nr_pages))
                return -EINVAL;

        if (vma_size != PAGE_SIZE * (1 + nr_pages))
                return -EINVAL;

        WARN_ON_ONCE(event->ctx->parent_ctx);
again:
        mutex_lock(&event->mmap_mutex);
        if (event->rb) {
                /* An existing buffer can only be re-mapped at its own size. */
                if (data_page_nr(event->rb) != nr_pages) {
                        ret = -EINVAL;
                        goto unlock;
                }

                if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
                        /*
                         * Raced against perf_mmap_close(); remove the
                         * event and try again.
                         */
                        ring_buffer_attach(event, NULL);
                        mutex_unlock(&event->mmap_mutex);
                        goto again;
                }

                goto unlock;
        }

        /* New buffer: charge the data pages plus the user page. */
        user_extra = nr_pages + 1;

accounting:
        user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

        /*
         * Increase the limit linearly with more CPUs:
         */
        user_lock_limit *= num_online_cpus();

        user_locked = atomic_long_read(&user->locked_vm);

        /*
         * sysctl_perf_event_mlock may have changed, so that
         *     user->locked_vm > user_lock_limit
         */
        if (user_locked > user_lock_limit)
                user_locked = user_lock_limit;
        user_locked += user_extra;

        if (user_locked > user_lock_limit) {
                /*
                 * charge locked_vm until it hits user_lock_limit;
                 * charge the rest from pinned_vm
                 */
                extra = user_locked - user_lock_limit;
                user_extra -= extra;
        }

        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;

        if ((locked > lock_limit) && perf_is_paranoid() &&
                !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }

        WARN_ON(!rb && event->rb);

        if (vma->vm_flags & VM_WRITE)
                flags |= RING_BUFFER_WRITABLE;

        if (!rb) {
                rb = rb_alloc(nr_pages,
                              event->attr.watermark ? event->attr.wakeup_watermark : 0,
                              event->cpu, flags);

                if (!rb) {
                        ret = -ENOMEM;
                        goto unlock;
                }

                atomic_set(&rb->mmap_count, 1);
                rb->mmap_user = get_current_user();
                rb->mmap_locked = extra;

                ring_buffer_attach(event, rb);

                perf_event_update_time(event);
                perf_event_init_userpage(event);
                perf_event_update_userpage(event);
        } else {
                ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
                                   event->attr.aux_watermark, flags);
                if (!ret)
                        rb->aux_mmap_locked = extra;
        }

unlock:
        if (!ret) {
                atomic_long_add(user_extra, &user->locked_vm);
                atomic64_add(extra, &vma->vm_mm->pinned_vm);

                atomic_inc(&event->mmap_count);
        } else if (rb) {
                /* Undo the buffer mapping reference taken above. */
                atomic_dec(&rb->mmap_count);
        }
aux_unlock:
        mutex_unlock(&event->mmap_mutex);

        /*
         * Bail out before touching the vma or notifying the PMU: a failed
         * mmap() is never followed by perf_mmap_close(), so whatever
         * ->event_mapped() sets up would never be undone by
         * ->event_unmapped().
         */
        if (ret)
                return ret;

        /*
         * Since pinned accounting is per vm we cannot allow fork() to copy our
         * vma.
         */
        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &perf_mmap_vmops;

        if (event->pmu->event_mapped)
                event->pmu->event_mapped(event, vma->vm_mm);

        return ret;
}

/*
 * ->fasync handler: add or remove @filp from the event's fasync list,
 * under the inode lock.
 *
 * Returns 0 on success or the negative value reported by fasync_helper().
 */
static int perf_fasync(int fd, struct file *filp, int on)
{
        struct perf_event *event = filp->private_data;
        struct inode *inode = file_inode(filp);
        int rc;

        inode_lock(inode);
        rc = fasync_helper(fd, filp, on, &event->fasync);
        inode_unlock(inode);

        /* Non-negative results from the helper all collapse to 0. */
        return rc < 0 ? rc : 0;
}

/*
 * File operations of a perf event file descriptor.  perf_fget_light()
 * compares a file's f_op against this table to recognise perf files.
 */
static const struct file_operations perf_fops = {
        .release                = perf_release,
        .llseek                        = no_llseek,
        .read                        = perf_read,
        .poll                        = perf_poll,
        .mmap                        = perf_mmap,
        .fasync                        = perf_fasync,
        .unlocked_ioctl                = perf_ioctl,
        .compat_ioctl                = perf_compat_ioctl,
};

/*
 * Perf event wakeup
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */

/*
 * Return the address of the fasync list to use for @event.  Only the
 * parent has fasync state, so an inherited event resolves to its parent's
 * list.
 */
static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;

        return &owner->fasync;
}

/*
 * Wake up the waiters on the event's ring buffer and, when a fasync
 * notification is pending (event->pending_kill), deliver SIGIO and clear
 * the pending state.
 */
void perf_event_wakeup(struct perf_event *event)
{
        ring_buffer_wakeup(event);

        if (!event->pending_kill)
                return;

        kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
        event->pending_kill = 0;
}

/*
 * Carry out a deferred disable request recorded in event->pending_disable.
 *
 * A negative value means nothing is pending.  When the recorded CPU is the
 * one we are running on, the request is cleared and the event is disabled
 * locally; otherwise the irq_work is re-queued on the recorded CPU.
 */
static void perf_pending_event_disable(struct perf_event *event)
{
        const int target_cpu = READ_ONCE(event->pending_disable);

        if (target_cpu < 0)
                return;

        if (target_cpu != smp_processor_id()) {
                /*
                 *  CPU-A                        CPU-B
                 *
                 *  perf_event_disable_inatomic()
                 *    @pending_disable = CPU-A;
                 *    irq_work_queue();
                 *
                 *  sched-out
                 *    @pending_disable = -1;
                 *
                 *                                sched-in
                 *                                perf_event_disable_inatomic()
                 *                                  @pending_disable = CPU-B;
                 *                                  irq_work_queue(); // FAILS
                 *
                 *  irq_work_run()
                 *    perf_pending_event()
                 *
                 * But the event runs on CPU-B and wants disabling there.
                 */
                irq_work_queue_on(&event->pending, target_cpu);
                return;
        }

        /* The request is for this CPU: consume it and disable right here. */
        WRITE_ONCE(event->pending_disable, -1);
        perf_event_disable_local(event);
}

/*
 * irq_work callback for event->pending: performs a deferred disable, if
 * one was requested, followed by a deferred wakeup, if one is pending.
 */
static void perf_pending_event(struct irq_work *entry)
{
        struct perf_event *event = container_of(entry, struct perf_event, pending);
        /*
         * If we 'fail' here, that's OK, it means recursion is already disabled
         * and we won't recurse 'further'.
         */
        int recursion_ctx = perf_swevent_get_recursion_context();

        perf_pending_event_disable(event);

        if (event->pending_wakeup) {
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }

        /* Only release a recursion context we actually obtained. */
        if (recursion_ctx < 0)
                return;

        perf_swevent_put_recursion_context(recursion_ctx);
}

/*
 * We assume there is only KVM supporting the callbacks.
 * Later on, we might change it to a list if there is
 * another virtualization implementation supporting the callbacks.
 *
 * The pointer is RCU-managed: perf_register_guest_info_callbacks()
 * publishes it with rcu_assign_pointer(), and
 * perf_unregister_guest_info_callbacks() clears it and then calls
 * synchronize_rcu().
 */
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

/*
 * Install @cbs as the guest info callbacks.
 *
 * Only one set of callbacks can be registered at a time: if some are
 * already installed this warns once and returns -EBUSY.  Returns 0 on
 * success.
 */
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        int err = -EBUSY;

        if (!WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) {
                rcu_assign_pointer(perf_guest_cbs, cbs);
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);

/*
 * Remove the guest info callbacks previously installed as @cbs.
 *
 * Warns once and returns -EINVAL when @cbs is not what is currently
 * registered.  Otherwise the pointer is cleared, synchronize_rcu() is
 * called before returning, and the result is 0.
 */
int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        int err = -EINVAL;

        if (!WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) {
                rcu_assign_pointer(perf_guest_cbs, NULL);
                synchronize_rcu();
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);

/*
 * Write one u64 to @handle for every bit set in @mask, in ascending bit
 * order.  Each value is obtained from perf_reg_value(@regs, bit).
 */
static void
perf_output_sample_regs(struct perf_output_handle *handle,
                        struct pt_regs *regs, u64 mask)
{
        DECLARE_BITMAP(wanted, 64);
        int reg;

        bitmap_from_u64(wanted, mask);

        for_each_set_bit(reg, wanted, sizeof(mask) * BITS_PER_BYTE) {
                u64 reg_val = perf_reg_value(regs, reg);

                perf_output_put(handle, reg_val);
        }
}

/*
 * Fill @regs_user with the user-space register state to sample.
 *
 * When @regs already describes user mode it is used directly.  Otherwise
 * a kernel thread (PF_KTHREAD) gets no registers at all, and any other
 * task has them looked up through perf_get_regs_user().
 */
static void perf_sample_regs_user(struct perf_regs *regs_user,
                                  struct pt_regs *regs)
{
        if (user_mode(regs)) {
                regs_user->abi = perf_reg_abi(current);
                regs_user->regs = regs;
                return;
        }

        if (current->flags & PF_KTHREAD) {
                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
                regs_user->regs = NULL;
                return;
        }

        perf_get_regs_user(regs_user, regs);
}

/*
 * Fill @regs_intr with the interrupted register state: @regs itself plus
 * the register ABI of the current task.
 */
static void perf_sample_regs_intr(struct perf_regs *regs_intr,
                                  struct pt_regs *regs)
{
        regs_intr->abi  = perf_reg_abi(current);
        regs_intr->regs = regs;
}


/*
 * Get remaining task size from user stack pointer.
 *
 * It'd be better to take stack vma map and limit this more
 * precisely, but there's no way to get it safely under interrupt,
 * so using TASK_SIZE as limit.
 *
 * Returns the number of bytes between the user stack pointer and
 * TASK_SIZE, or 0 when the stack pointer is NULL or not below TASK_SIZE.
 */
static u64 perf_ustack_task_size(struct pt_regs *regs)
{
        const unsigned long user_sp = perf_user_stack_pointer(regs);

        if (user_sp && user_sp < TASK_SIZE)
                return TASK_SIZE - user_sp;

        return 0;
}

/*
 * Work out how many bytes of user stack can be dumped into a sample.
 *
 * @stack_size:  dump size requested
 * @header_size: size of the sample so far
 * @regs:        user registers, or NULL when there is nothing to dump
 *
 * Returns the (possibly reduced) dump size; 0 when @regs is NULL.
 */
static u16
perf_sample_ustack_size(u16 stack_size, u16 header_size,
                        struct pt_regs *regs)
{
        u64 task_size;

        /* No regs, no stack pointer, no dump. */
        if (!regs)
                return 0;

        /*
         * Check if we fit in with the requested stack size into the:
         * - TASK_SIZE
         *   If we don't, we limit the size to the TASK_SIZE.
         *
         * - remaining sample size
         *   If we don't, we customize the stack size to
         *   fit in to the remaining sample size.
         */

        /* Clamp to USHRT_MAX first so the u16 cast below cannot truncate. */
        task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
        stack_size = min(stack_size, (u16) task_size);

        /* Current header size plus static size and dynamic size. */
        header_size += 2 * sizeof(u64);

        /*
         * Do we fit in with the current stack dump size?
         * The sum is truncated to u16, so a result smaller than header_size
         * means the 16-bit total wrapped around.
         */
        if ((u16) (header_size + stack_size) < header_size) {
                /*
                 * If we overflow the maximum size for the sample,
                 * we customize the stack dump size to fit in.
                 */
                stack_size = USHRT_MAX - header_size - sizeof(u64);
                stack_size = round_up(stack_size, sizeof(u64));
        }

        return stack_size;
}

/*
 * Emit the user stack portion of a sample.
 *
 * Without @regs only a zero u64 size is written.  Otherwise the output
 * consists of:
 *   - static size:  @dump_size, i.e. the size requested by the user or
 *                   the best one that fits into the sample max size
 *   - data:         @dump_size bytes reserved for the stack contents
 *   - dynamic size: the number of bytes that were actually copied
 */
static void
perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
                          struct pt_regs *regs)
{
        unsigned long sp;
        unsigned int uncopied;
        u64 dyn_size;
        mm_segment_t fs;

        /* Case of a kernel thread, nothing to dump. */
        if (!regs) {
                u64 size = 0;

                perf_output_put(handle, size);
                return;
        }

        /* Static size. */
        perf_output_put(handle, dump_size);

        /* Data: copy from the user stack pointer. */
        sp = perf_user_stack_pointer(regs);
        fs = force_uaccess_begin();
        uncopied = __output_copy_user(handle, (void *) sp, dump_size);
        force_uaccess_end(fs);
        dyn_size = dump_size - uncopied;

        /* Step over the part of the reserved area that was not filled. */
        perf_output_skip(handle, uncopied);

        /* Dynamic size. */
        perf_output_put(handle, dyn_size);
}

/*
 * Decide how many bytes of AUX data this sample will carry.
 *
 * The result is stored in data->aux_size and also returned.  It is 0
 * when the event has no aux_event, when that sampler is not active on
 * this CPU, when it has no ring buffer, or when AUX sampling is already
 * in progress on that buffer.  Otherwise it is @size clamped to
 * perf_aux_size() and aligned up to a u64 boundary.
 */
static unsigned long perf_prepare_sample_aux(struct perf_event *event,
                                          struct perf_sample_data *data,
                                          size_t size)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;

        data->aux_size = 0;

        if (!sampler)
                return 0;

        if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
                return 0;

        if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
                return 0;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return 0;

        /*
         * An NMI that hits inside the sampling code must not take a
         * sample; aux_size then stays 0.  See also
         * perf_aux_sample_output().
         */
        if (!READ_ONCE(rb->aux_in_sampling)) {
                size = min_t(size_t, size, perf_aux_size(rb));
                data->aux_size = ALIGN(size, sizeof(u64));
        }
        ring_buffer_put(rb);

        return data->aux_size;
}

/*
 * Call the PMU's ->snapshot_aux() callback for @event with interrupts
 * disabled and rb->aux_in_sampling set for the duration of the call.
 *
 * Returns whatever ->snapshot_aux() returned.
 */
long perf_pmu_snapshot_aux(struct perf_buffer *rb,
                           struct perf_event *event,
                           struct perf_output_handle *handle,
                           unsigned long size)
{
        unsigned long irq_flags;
        long copied;

        /*
         * The regular ->start()/->stop() callbacks run in IRQ mode in
         * scheduler paths.  Calling them from NMI context could race
         * with the IRQ ones (for example by re-starting an event that
         * has just been stopped), hence the dedicated callback that
         * leaves the event state alone.
         *
         * IRQs are disabled so that IPIs cannot race with us.
         */
        local_irq_save(irq_flags);

        /*
         * Mark the critical section so that an NMI landing inside it
         * can be detected; see also perf_prepare_sample_aux().
         */
        WRITE_ONCE(rb->aux_in_sampling, 1);
        barrier();

        copied = event->pmu->snapshot_aux(event, handle, size);

        barrier();
        WRITE_ONCE(rb->aux_in_sampling, 0);
        local_irq_restore(irq_flags);

        return copied;
}

/*
 * Write the AUX payload of a sample: snapshot data->aux_size bytes from
 * the sampler's buffer and zero-pad up to the size that was reserved.
 *
 * Warns and does nothing when there is no sampler or no AUX size; does
 * nothing when the sampler has no ring buffer.
 */
static void perf_aux_sample_output(struct perf_event *event,
                                   struct perf_output_handle *handle,
                                   struct perf_sample_data *data)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;
        unsigned long pad;
        long size;

        if (WARN_ON_ONCE(!sampler || !data->aux_size))
                return;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return;

        size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);

        /*
         * A negative size means that perf_output_copy() failed (it
         * returned a non-zero surplus that it did not copy).  That is
         * not expected to be possible with its current implementation;
         * warn if it ever happens and skip the padding.
         */
        if (!WARN_ON_ONCE(size < 0)) {
                /*
                 * The pad stems from ALIGN()ing data->aux_size up to
                 * u64 in perf_prepare_sample_aux(), so it should stay
                 * below sizeof(u64).
                 */
                pad = data->aux_size - size;
                if (WARN_ON_ONCE(pad >= sizeof(u64)))
                        pad = 8;

                if (pad) {
                        u64 zero = 0;

                        perf_output_copy(handle, &zero, pad);
                }
        }

        ring_buffer_put(rb);
}

/*
 * Collect the identification fields selected by event->attr.sample_type
 * into @data and grow header->size by event->id_header_size.
 *
 * Handled bits: TID, TIME, ID/IDENTIFIER, STREAM_ID and CPU.
 */
static void __perf_event_header__init_id(struct perf_event_header *header,
                                         struct perf_sample_data *data,
                                         struct perf_event *event)
{
        const u64 type = event->attr.sample_type;

        header->size += event->id_header_size;
        data->type = type;

        if (type & PERF_SAMPLE_TID) {
                /* namespace issues */
                data->tid_entry.pid = perf_event_pid(event, current);
                data->tid_entry.tid = perf_event_tid(event, current);
        }

        if (type & PERF_SAMPLE_TIME)
                data->time = perf_event_clock(event);

        /* Both ID flavours report the same primary event id. */
        if (type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
                data->id = primary_event_id(event);

        if (type & PERF_SAMPLE_STREAM_ID)
                data->stream_id = event->id;

        if (type & PERF_SAMPLE_CPU) {
                data->cpu_entry.cpu      = raw_smp_processor_id();
                data->cpu_entry.reserved = 0;
        }
}

/*
 * Initialise the sample-id fields for a record, but only for events
 * that have attr.sample_id_all set; otherwise @header and @data are
 * left untouched.
 */
void perf_event_header__init_id(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event)
{
        if (!event->attr.sample_id_all)
                return;

        __perf_event_header__init_id(header, data, event);
}

/*
 * Write the identification fields selected in data->type to @handle.
 *
 * The order is fixed: TID, TIME, ID, STREAM_ID, CPU, IDENTIFIER.
 */
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
                                           struct perf_sample_data *data)
{
        const u64 type = data->type;

        if (type & PERF_SAMPLE_TID)
                perf_output_put(handle, data->tid_entry);

        if (type & PERF_SAMPLE_TIME)
                perf_output_put(handle, data->time);

        if (type & PERF_SAMPLE_ID)
                perf_output_put(handle, data->id);

        if (type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(handle, data->stream_id);

        if (type & PERF_SAMPLE_CPU)
                perf_output_put(handle, data->cpu_entry);

        /* The identifier repeats data->id as the final field. */
        if (type & PERF_SAMPLE_IDENTIFIER)
                perf_output_put(handle, data->id);
}

/*
 * Append the sample-id fields to a record, but only for events that
 * have attr.sample_id_all set.
 */
void perf_event__output_id_sample(struct perf_event *event,
                                  struct perf_output_handle *handle,
                                  struct perf_sample_data *sample)
{
        if (!event->attr.sample_id_all)
                return;

        __perf_event__output_id_sample(handle, sample);
}

/*
 * Output the read values of a single event.
 *
 * The count always comes first; the enabled time, running time, id and
 * lost-sample count follow, each only when the matching read_format bit
 * is set.  The time values include the child totals.
 */
static void perf_output_read_one(struct perf_output_handle *handle,
                                 struct perf_event *event,
                                 u64 enabled, u64 running)
{
        const u64 fmt = event->attr.read_format;
        u64 out[5];
        int nr = 0;

        out[nr++] = perf_event_count(event);

        if (fmt & PERF_FORMAT_TOTAL_TIME_ENABLED)
                out[nr++] = enabled +
                        atomic64_read(&event->child_total_time_enabled);

        if (fmt & PERF_FORMAT_TOTAL_TIME_RUNNING)
                out[nr++] = running +
                        atomic64_read(&event->child_total_time_running);

        if (fmt & PERF_FORMAT_ID)
                out[nr++] = primary_event_id(event);

        if (fmt & PERF_FORMAT_LOST)
                out[nr++] = atomic64_read(&event->lost_samples);

        /* Only the slots that were filled in are copied out. */
        __output_copy(handle, out, nr * sizeof(u64));
}

/*
 * Output the read values of a whole event group.
 *
 * A first chunk holds the member count (leader plus siblings), the
 * optional enabled and running times, and the leader's count with its
 * optional id and lost-sample count.  One further chunk per sibling
 * holds that sibling's count with the same optional id and lost fields.
 *
 * Active members other than @event are refreshed through pmu->read()
 * before their count is taken.
 */
static void perf_output_read_group(struct perf_output_handle *handle,
                            struct perf_event *event,
                            u64 enabled, u64 running)
{
        struct perf_event *leader = event->group_leader, *sibling;
        const u64 fmt = event->attr.read_format;
        unsigned long irq_flags;
        u64 out[6];
        int nr = 0;

        /*
         * With interrupts disabled no counter scheduling can happen
         * (context switches, timer based rotation and IPIs).
         */
        local_irq_save(irq_flags);

        out[nr++] = 1 + leader->nr_siblings;

        if (fmt & PERF_FORMAT_TOTAL_TIME_ENABLED)
                out[nr++] = enabled;

        if (fmt & PERF_FORMAT_TOTAL_TIME_RUNNING)
                out[nr++] = running;

        if ((leader != event) &&
            (leader->state == PERF_EVENT_STATE_ACTIVE))
                leader->pmu->read(leader);

        out[nr++] = perf_event_count(leader);
        if (fmt & PERF_FORMAT_ID)
                out[nr++] = primary_event_id(leader);
        if (fmt & PERF_FORMAT_LOST)
                out[nr++] = atomic64_read(&leader->lost_samples);

        __output_copy(handle, out, nr * sizeof(u64));

        for_each_sibling_event(sibling, leader) {
                /* Each sibling chunk reuses the buffer from slot 0. */
                nr = 0;

                if ((sibling != event) &&
                    (sibling->state == PERF_EVENT_STATE_ACTIVE))
                        sibling->pmu->read(sibling);

                out[nr++] = perf_event_count(sibling);
                if (fmt & PERF_FORMAT_ID)
                        out[nr++] = primary_event_id(sibling);
                if (fmt & PERF_FORMAT_LOST)
                        out[nr++] = atomic64_read(&sibling->lost_samples);

                __output_copy(handle, out, nr * sizeof(u64));
        }

        local_irq_restore(irq_flags);
}

/*
 * Mask of both time-related read_format bits; perf_output_read() computes
 * the enabled/running times when either of them is requested.
 */
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
                                 PERF_FORMAT_TOTAL_TIME_RUNNING)

/*
 * Output the read values of @event, in group or single-event layout
 * depending on PERF_FORMAT_GROUP.
 *
 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
 *
 * The problem is that it is both hard and excessively expensive to
 * iterate the child list, not to mention that it is impossible to IPI
 * the children running on another CPU from interrupt/NMI context.
 */
static void perf_output_read(struct perf_output_handle *handle,
                             struct perf_event *event)
{
        u64 read_format = event->attr.read_format;
        u64 enabled = 0, running = 0, now;

        /*
         * Compute total_time_enabled and total_time_running from the
         * snapshot values taken when the event was last scheduled in.
         *
         * update_context_time() cannot simply be called here because of
         * locking issues: we are called in NMI context.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIMES)
                calc_timer_values(event, &now, &enabled, &running);

        if (read_format & PERF_FORMAT_GROUP) {
                perf_output_read_group(handle, event, enabled, running);
                return;
        }

        perf_output_read_one(handle, event, enabled, running);
}

static inline bool perf_sample_save_hw_index(struct perf_event *event)
{
        return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
}

/*
 * Write one sample record to @handle: the header first, then every field
 * selected in data->type, in exactly the order coded below.  The sizes
 * written here must agree with what perf_prepare_sample() accounted for in
 * header->size.
 *
 * After the payload, events that are not in watermark mode and have
 * attr.wakeup_events set count this record in rb->events and bump
 * rb->wakeup once that count reaches wakeup_events.
 */
void perf_output_sample(struct perf_output_handle *handle,
                        struct perf_event_header *header,
                        struct perf_sample_data *data,
                        struct perf_event *event)
{
        u64 sample_type = data->type;

        perf_output_put(handle, *header);

        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_IP)
                perf_output_put(handle, data->ip);

        if (sample_type & PERF_SAMPLE_TID)
                perf_output_put(handle, data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                perf_output_put(handle, data->time);

        if (sample_type & PERF_SAMPLE_ADDR)
                perf_output_put(handle, data->addr);

        if (sample_type & PERF_SAMPLE_ID)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(handle, data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                perf_output_put(handle, data->cpu_entry);

        if (sample_type & PERF_SAMPLE_PERIOD)
                perf_output_put(handle, data->period);

        if (sample_type & PERF_SAMPLE_READ)
                perf_output_read(handle, event);

        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                /* (1 + nr) u64 words, copied from the start of the entry. */
                int size = 1;

                size += data->callchain->nr;
                size *= sizeof(u64);
                __output_copy(handle, data->callchain, size);
        }

        if (sample_type & PERF_SAMPLE_RAW) {
                struct perf_raw_record *raw = data->raw;

                if (raw) {
                        struct perf_raw_frag *frag = &raw->frag;

                        /* Size word first, then every fragment in turn. */
                        perf_output_put(handle, raw->size);
                        do {
                                if (frag->copy) {
                                        __output_custom(handle, frag->copy,
                                                        frag->data, frag->size);
                                } else {
                                        __output_copy(handle, frag->data,
                                                      frag->size);
                                }
                                if (perf_raw_frag_last(frag))
                                        break;
                                frag = frag->next;
                        } while (1);
                        /* The last fragment carries the trailing pad. */
                        if (frag->pad)
                                __output_skip(handle, NULL, frag->pad);
                } else {
                        /*
                         * No raw record: emit a size of sizeof(u32) and
                         * a zero u32 of data, one u64 in total.
                         */
                        struct {
                                u32        size;
                                u32        data;
                        } raw = {
                                .size = sizeof(u32),
                                .data = 0,
                        };
                        perf_output_put(handle, raw);
                }
        }

        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
                if (data->br_stack) {
                        size_t size;

                        size = data->br_stack->nr
                             * sizeof(struct perf_branch_entry);

                        /* nr, optional hw_idx, then the entries. */
                        perf_output_put(handle, data->br_stack->nr);
                        if (perf_sample_save_hw_index(event))
                                perf_output_put(handle, data->br_stack->hw_idx);
                        perf_output_copy(handle, data->br_stack->entries, size);
                } else {
                        /*
                         * we always store at least the value of nr
                         */
                        u64 nr = 0;
                        perf_output_put(handle, nr);
                }
        }

        if (sample_type & PERF_SAMPLE_REGS_USER) {
                u64 abi = data->regs_user.abi;

                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_user;
                        perf_output_sample_regs(handle,
                                                data->regs_user.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_STACK_USER) {
                perf_output_sample_ustack(handle,
                                          data->stack_user_size,
                                          data->regs_user.regs);
        }

        if (sample_type & PERF_SAMPLE_WEIGHT)
                perf_output_put(handle, data->weight);

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                perf_output_put(handle, data->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                perf_output_put(handle, data->txn);

        if (sample_type & PERF_SAMPLE_REGS_INTR) {
                u64 abi = data->regs_intr.abi;
                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_intr;

                        perf_output_sample_regs(handle,
                                                data->regs_intr.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                perf_output_put(handle, data->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                perf_output_put(handle, data->cgroup);

        if (sample_type & PERF_SAMPLE_AUX) {
                /* The size is always written; the data only if non-zero. */
                perf_output_put(handle, data->aux_size);

                if (data->aux_size)
                        perf_aux_sample_output(event, handle, data);
        }

        /* Event-count based wakeup accounting (not used in watermark mode). */
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;

                if (wakeup_events) {
                        struct perf_buffer *rb = handle->rb;
                        int events = local_inc_return(&rb->events);

                        if (events >= wakeup_events) {
                                local_sub(wakeup_events, &rb->events);
                                local_inc(&rb->wakeup);
                        }
                }
        }
}

/*
 * Translate a sampled virtual address to a physical one.
 *
 * Returns 0 when no translation is available: a NULL address, an
 * address at or above TASK_SIZE that is not virt_addr_valid() or lies
 * in the vmalloc range, a task without an mm, or a user page that
 * get_user_page_fast_only() could not get.
 */
static u64 perf_virt_to_phys(u64 virt)
{
        struct page *page;
        u64 phys = 0;

        if (!virt)
                return 0;

        if (virt >= TASK_SIZE) {
                void *kaddr = (void *)(uintptr_t)virt;

                if (!virt_addr_valid(kaddr))
                        return 0;

                /* If it's vmalloc()d memory, report 0. */
                if (virt >= VMALLOC_START && virt < VMALLOC_END)
                        return 0;

                return (u64)virt_to_phys(kaddr);
        }

        if (current->mm == NULL)
                return 0;

        /*
         * Walk the page tables for a user address.  Interrupts are
         * disabled, which prevents any tear down of the page tables.
         * Only the IRQ-safe get_user_page_fast_only() is tried; if it
         * fails the result stays 0.
         */
        pagefault_disable();
        if (get_user_page_fast_only(virt, 0, &page)) {
                phys = page_to_phys(page) + virt % PAGE_SIZE;
                put_page(page);
        }
        pagefault_enable();

        return phys;
}

/*
 * Shared zero-length callchain; perf_callchain() returns it when no chain
 * is requested or none could be obtained.
 */
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

/*
 * Obtain the callchain for a sample of @event taken at @regs.
 *
 * Kernel and user parts are included unless excluded through the event
 * attributes.  Never returns NULL: when both parts are excluded, or
 * when get_perf_callchain() yields nothing, the empty callchain is
 * returned instead.
 */
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
        const u32 max_stack = event->attr.sample_max_stack;
        bool want_kernel = !event->attr.exclude_callchain_kernel;
        bool want_user   = !event->attr.exclude_callchain_user;
        /* Disallow cross-task user callchains. */
        bool crosstask = event->ctx->task && event->ctx->task != current;
        struct perf_callchain_entry *entry;

        if (!want_kernel && !want_user)
                return &__empty_callchain;

        entry = get_perf_callchain(regs, 0, want_kernel, want_user,
                                   max_stack, crosstask, true);
        if (!entry)
                return &__empty_callchain;

        return entry;
}

/*
 * Prepare a PERF_RECORD_SAMPLE for @event: initialise @header, gather the
 * values selected by event->attr.sample_type into @data, and accumulate in
 * header->size the space each selected field will occupy in the record.
 *
 * The per-field sizes computed here are the ones perf_output_sample() later
 * writes, so both functions have to stay in step.
 */
void perf_prepare_sample(struct perf_event_header *header,
                         struct perf_sample_data *data,
                         struct perf_event *event,
                         struct pt_regs *regs)
{
        u64 sample_type = event->attr.sample_type;

        header->type = PERF_RECORD_SAMPLE;
        header->size = sizeof(*header) + event->header_size;

        header->misc = 0;
        header->misc |= perf_misc_flags(regs);

        __perf_event_header__init_id(header, data, event);

        if (sample_type & PERF_SAMPLE_IP)
                data->ip = perf_instruction_pointer(regs);

        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                int size = 1;

                /* With the EARLY bit set, data->callchain is used as is. */
                if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
                        data->callchain = perf_callchain(event, regs);

                size += data->callchain->nr;

                header->size += size * sizeof(u64);
        }

        if (sample_type & PERF_SAMPLE_RAW) {
                struct perf_raw_record *raw = data->raw;
                int size;

                if (raw) {
                        struct perf_raw_frag *frag = &raw->frag;
                        u32 sum = 0;

                        /* Add up all fragments; frag ends on the last one. */
                        do {
                                sum += frag->size;
                                if (perf_raw_frag_last(frag))
                                        break;
                                frag = frag->next;
                        } while (1);

                        /*
                         * The u32 size word plus the payload is rounded up
                         * to a u64 multiple; the difference becomes the pad
                         * of the last fragment.
                         */
                        size = round_up(sum + sizeof(u32), sizeof(u64));
                        raw->size = size - sizeof(u32);
                        frag->pad = raw->size - sum;
                } else {
                        size = sizeof(u64);
                }

                header->size += size;
        }

        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
                int size = sizeof(u64); /* nr */
                if (data->br_stack) {
                        if (perf_sample_save_hw_index(event))
                                size += sizeof(u64);

                        size += data->br_stack->nr
                              * sizeof(struct perf_branch_entry);
                }
                header->size += size;
        }

        /* The user registers serve both the regs dump and the stack dump. */
        if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
                perf_sample_regs_user(&data->regs_user, regs);

        if (sample_type & PERF_SAMPLE_REGS_USER) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                if (data->regs_user.regs) {
                        u64 mask = event->attr.sample_regs_user;
                        size += hweight64(mask) * sizeof(u64);
                }

                header->size += size;
        }

        if (sample_type & PERF_SAMPLE_STACK_USER) {
                /*
                 * Either we need PERF_SAMPLE_STACK_USER bit to be always
                 * processed as the last one or have additional check added
                 * in case new sample type is added, because we could eat
                 * up the rest of the sample size.
                 */
                u16 stack_size = event->attr.sample_stack_user;
                u16 size = sizeof(u64);

                stack_size = perf_sample_ustack_size(stack_size, header->size,
                                                     data->regs_user.regs);

                /*
                 * If there is something to dump, add space for the dump
                 * itself and for the field that tells the dynamic size,
                 * which is how many have been actually dumped.
                 */
                if (stack_size)
                        size += sizeof(u64) + stack_size;

                data->stack_user_size = stack_size;
                header->size += size;
        }

        if (sample_type & PERF_SAMPLE_REGS_INTR) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                perf_sample_regs_intr(&data->regs_intr, regs);

                if (data->regs_intr.regs) {
                        u64 mask = event->attr.sample_regs_intr;

                        size += hweight64(mask) * sizeof(u64);
                }

                header->size += size;
        }

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                data->phys_addr = perf_virt_to_phys(data->addr);

#ifdef CONFIG_CGROUP_PERF
        if (sample_type & PERF_SAMPLE_CGROUP) {
                struct cgroup *cgrp;

                /* protected by RCU */
                cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
                data->cgroup = cgroup_id(cgrp);
        }
#endif

        if (sample_type & PERF_SAMPLE_AUX) {
                u64 size;

                header->size += sizeof(u64); /* size */

                /*
                 * Given the 16bit nature of header::size, an AUX sample can
                 * easily overflow it, what with all the preceding sample bits.
                 * Make sure this doesn't happen by using up to U16_MAX bytes
                 * per sample in total (rounded down to 8 byte boundary).
                 */
                size = min_t(size_t, U16_MAX - header->size,
                             event->attr.aux_sample_size);
                size = rounddown(size, 8);
                size = perf_prepare_sample_aux(event, data, size);

                WARN_ON_ONCE(size + header->size > U16_MAX);
                header->size += size;
        }
        /*
         * If you're adding more sample types here, you likely need to do
         * something about the overflowing header::size, like repurpose the
         * lowest 3 bits of size, which should be always zero at the moment.
         * This raises a more important question, do we really need 512k sized
         * samples and why, so good argumentation is in order for whatever you
         * do here next.
         */
        WARN_ON_ONCE(header->size & 7);
}

/*
 * Common body of the perf_event_output*() variants: prepare a sample,
 * reserve room for it through @output_begin and, if that succeeded,
 * write and commit it.
 *
 * Returns 0 on success or the error reported by @output_begin.
 */
static __always_inline int
__perf_event_output(struct perf_event *event,
                    struct perf_sample_data *data,
                    struct pt_regs *regs,
                    int (*output_begin)(struct perf_output_handle *,
                                        struct perf_sample_data *,
                                        struct perf_event *,
                                        unsigned int))
{
        struct perf_output_handle handle;
        struct perf_event_header header;
        int err;

        /* protect the callchain buffers */
        rcu_read_lock();

        perf_prepare_sample(&header, data, event, regs);

        err = output_begin(&handle, data, event, header.size);
        if (!err) {
                perf_output_sample(&handle, &header, data, event);
                perf_output_end(&handle);
        }

        rcu_read_unlock();
        return err;
}

/*
 * Output a sample for @event, reserving buffer space through
 * perf_output_begin_forward().  Any error is dropped.
 */
void perf_event_output_forward(struct perf_event *event,
                               struct perf_sample_data *data,
                               struct pt_regs *regs)
{
        __perf_event_output(event, data, regs, perf_output_begin_forward);
}

/*
 * Output a sample for @event, reserving buffer space through
 * perf_output_begin_backward().  Any error is dropped.
 */
void perf_event_output_backward(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        __perf_event_output(event, data, regs, perf_output_begin_backward);
}

/*
 * Output a sample for @event, reserving buffer space through
 * perf_output_begin().
 *
 * Returns 0 on success or the error from perf_output_begin().
 */
int perf_event_output(struct perf_event *event,
                      struct perf_sample_data *data,
                      struct pt_regs *regs)
{
        return __perf_event_output(event, data, regs, perf_output_begin);
}

/*
 * read event_id
 */

/*
 * Fixed leading part of the record built by perf_event_read_event(); the
 * read values and the optional sample-id fields are written after it.
 */
struct perf_read_event {
        struct perf_event_header        header;        /* type is PERF_RECORD_READ */

        u32                                pid;        /* from perf_event_pid() */
        u32                                tid;        /* from perf_event_tid() */
};

/*
 * Emit a PERF_RECORD_READ record for @event on behalf of @task: the
 * fixed perf_read_event part, the event's read values and, when
 * enabled, the trailing sample-id fields.
 *
 * Silently gives up when no output space can be reserved.
 */
static void
perf_event_read_event(struct perf_event *event,
                        struct task_struct *task)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_read_event read_event = {
                .header = {
                        .type = PERF_RECORD_READ,
                        .misc = 0,
                        .size = sizeof(read_event) + event->read_size,
                },
                .pid = perf_event_pid(event, task),
                .tid = perf_event_tid(event, task),
        };

        /* May grow header.size by the id header for sample_id_all events. */
        perf_event_header__init_id(&read_event.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, read_event.header.size))
                return;

        perf_output_put(&handle, read_event);
        perf_output_read(&handle, event);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Callback invoked for every event visited by the perf_iterate_*() helpers;
 * @data is the opaque pointer the caller handed to the iterator.
 */
typedef void (perf_iterate_f)(struct perf_event *event, void *data);

/*
 * Call @output for the events on @ctx's event list.
 *
 * With @all set every event is visited.  Otherwise only events whose
 * state is at least PERF_EVENT_STATE_INACTIVE and that pass
 * event_filter_match() are visited.
 */
static void
perf_iterate_ctx(struct perf_event_context *ctx,
                   perf_iterate_f output,
                   void *data, bool all)
{
        struct perf_event *event;

        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (all ||
                    (event->state >= PERF_EVENT_STATE_INACTIVE &&
                     event_filter_match(event)))
                        output(event, data);
        }
}

/*
 * Call @output for the events on this CPU's pmu_sb_events list that are
 * fully set up, at least PERF_EVENT_STATE_INACTIVE, and pass
 * event_filter_match().
 */
static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
{
        struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
        struct perf_event *event;

        list_for_each_entry_rcu(event, &pel->list, sb_list) {
                /*
                 * Events that are not fully formed yet are skipped.
                 * The acquire load makes sure that once event->ctx is
                 * observed, both event and ctx are complete enough.
                 * See perf_install_in_context().
                 */
                if (!smp_load_acquire(&event->ctx))
                        continue;

                if (event->state >= PERF_EVENT_STATE_INACTIVE &&
                    event_filter_match(event))
                        output(event, data);
        }
}

/*
 * Iterate all events that need to receive side-band events and call
 * @output on each of them, under RCU with preemption disabled.
 *
 * For new callers: make sure account_pmu_sb_event() includes your
 * event, otherwise it might not get delivered.
 */
static void
perf_iterate_sb(perf_iterate_f output, void *data,
               struct perf_event_context *task_ctx)
{
        struct perf_event_context *ctx;
        int ctxn;

        rcu_read_lock();
        preempt_disable();

        if (task_ctx) {
                /*
                 * A non-NULL task_ctx means that only the task context
                 * itself is notified.  It is set only for EXIT events,
                 * before the task context is released.
                 */
                perf_iterate_ctx(task_ctx, output, data, false);
        } else {
                perf_iterate_sb_cpu(output, data);

                for_each_task_context_nr(ctxn) {
                        ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
                        if (ctx)
                                perf_iterate_ctx(ctx, output, data, false);
                }
        }

        preempt_enable();
        rcu_read_unlock();
}

/*
 * Clear all file-based address filters of @event at exec; they have to
 * be re-instated when/if these objects are mmapped again.
 *
 * Every filter that has a path dentry gets its range zeroed.  If at
 * least one was cleared, the filter generation is bumped and the event
 * is stopped through perf_event_stop(event, 1).  @data is unused.
 */
static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct perf_addr_filter *filter;
        unsigned int cleared = 0, idx = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                /* idx follows the filter's position in the list. */
                if (filter->path.dentry) {
                        event->addr_filter_ranges[idx].start = 0;
                        event->addr_filter_ranges[idx].size = 0;
                        cleared++;
                }

                idx++;
        }

        if (cleared)
                event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (cleared)
                perf_event_stop(event, 1);
}

/*
 * exec() hook: for each of current's task contexts that exists, run
 * perf_event_enable_on_exec() and then clear the file-based address
 * filters of every event in the context.
 */
void perf_event_exec(void)
{
        int ctxn;

        rcu_read_lock();
        for_each_task_context_nr(ctxn) {
                struct perf_event_context *ctx = current->perf_event_ctxp[ctxn];

                if (ctx) {
                        perf_event_enable_on_exec(ctxn);

                        perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
                                         NULL, true);
                }
        }
        rcu_read_unlock();
}

/*
 * Argument bundle handed to __perf_event_output_stop() through
 * perf_iterate_ctx(): @rb is the ring buffer whose users are to be stopped,
 * @err receives the return value of __perf_event_stop().
 */
struct remote_output {
        struct perf_buffer        *rb;
        int                        err;
};

/*
 * perf_iterate_ctx() callback: stop @event if it is an AUX event whose
 * output goes to the ring buffer named in @data (a struct remote_output).
 * The result of __perf_event_stop() is stored in remote_output::err.
 */
static void __perf_event_output_stop(struct perf_event *event, void *data)
{
        struct remote_output *ro = data;
        struct stop_event_data sd = {
                .event        = event,
        };
        struct perf_event *rb_owner;

        if (!has_aux(event))
                return;

        /*
         * In case of inheritance, it will be the parent that links to the
         * ring-buffer, but it will be the child that's actually using it.
         */
        rb_owner = event->parent ? event->parent : event;

        /*
         * We are using event::rb to determine if the event should be stopped,
         * however this may race with ring_buffer_attach() (through set_output),
         * which will make us skip the event that actually needs to be stopped.
         * So ring_buffer_attach() has to stop an aux event before re-assigning
         * its rb pointer.
         */
        if (rcu_dereference(rb_owner->rb) != ro->rb)
                return;

        ro->err = __perf_event_stop(&sd);
}

/*
 * Cross-CPU callback (see perf_pmu_output_stop()): @info is the perf_event
 * whose ring buffer is being torn away. Runs __perf_event_output_stop() over
 * this CPU's context for the event's PMU and, if one is attached, over the
 * CPU context's current task context.
 *
 * Returns remote_output::err, i.e. 0 or whatever the last
 * __perf_event_stop() call stored there.
 */
static int __perf_pmu_output_stop(void *info)
{
        struct perf_event *event = info;
        struct perf_cpu_context *cpuctx;
        struct remote_output ro = {
                .rb        = event->rb,
        };

        cpuctx = this_cpu_ptr(event->ctx->pmu->pmu_cpu_context);

        rcu_read_lock();

        perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);

        if (cpuctx->task_ctx) {
                perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
                                 &ro, false);
        }

        rcu_read_unlock();

        return ro.err;
}

static void perf_pmu_output_stop(struct perf_event *event)
{
        struct perf_event *iter;
        int err, cpu;

restart:
        rcu_read_lock();
        list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
                /*
                 * For per-CPU events, we need to make sure that neither they
                 * nor their children are running; for cpu==-1 events it's
                 * sufficient to stop the event itself if it's active, since
                 * it can't have children.
                 */
                cpu = iter->cpu;
                if (cpu == -1)
                        cpu = READ_ONCE(iter->oncpu);

                if (cpu == -1)
                        continue;

                err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
                if (err == -EAGAIN) {
                        rcu_read_unlock();
                        goto restart;
                }
        }
        rcu_read_unlock();
}

/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 */

/*
 * Working state for one fork/exit notification.
 *
 * @task:     the task the record is about.
 * @task_ctx: forwarded to perf_iterate_sb(); when non-NULL only that
 *            context is notified.
 * @event_id: the record body that perf_event_task_output() copies into the
 *            output buffer; pid/ppid/tid/ptid/time are filled in per event.
 */
struct perf_task_event {
        struct task_struct                *task;
        struct perf_event_context        *task_ctx;

        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                ppid;
                u32                                tid;
                u32                                ptid;
                u64                                time;
        } event_id;
};

/*
 * An event receives fork/exit records when any of attr.comm, attr.mmap,
 * attr.mmap2, attr.mmap_data or attr.task is set. Returns 1 or 0.
 */
static int perf_event_task_match(struct perf_event *event)
{
        if (event->attr.comm || event->attr.mmap || event->attr.mmap2)
                return 1;

        return event->attr.mmap_data || event->attr.task;
}

/*
 * perf_iterate_sb() callback: write one FORK/EXIT record for @event.
 * @data is the struct perf_task_event prepared by perf_event_task().
 *
 * The shared header is handed to perf_event_header__init_id() for each
 * event, so header.size is saved on entry and put back before returning to
 * give the next event the same starting size.
 */
static void perf_event_task_output(struct perf_event *event,
                                   void *data)
{
        struct perf_task_event *te = data;
        struct task_struct *task = te->task;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int saved_size = te->event_id.header.size;

        if (!perf_event_task_match(event))
                return;

        perf_event_header__init_id(&te->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              te->event_id.header.size))
                goto restore;

        te->event_id.pid = perf_event_pid(event, task);
        te->event_id.tid = perf_event_tid(event, task);

        if (te->event_id.header.type != PERF_RECORD_EXIT) {
                /* PERF_RECORD_FORK: the parent is the forking task. */
                te->event_id.ppid = perf_event_pid(event, current);
                te->event_id.ptid = perf_event_tid(event, current);
        } else {
                /*
                 * NOTE(review): ptid is filled with perf_event_pid() here,
                 * whereas the FORK branch uses perf_event_tid(). Possibly a
                 * copy-paste slip, but it is user-visible record content,
                 * so it is preserved as-is -- confirm before changing.
                 */
                te->event_id.ppid = perf_event_pid(event, task->real_parent);
                te->event_id.ptid = perf_event_pid(event, task->real_parent);
        }

        te->event_id.time = perf_event_clock(event);

        perf_output_put(&handle, te->event_id);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);

restore:
        te->event_id.header.size = saved_size;
}

/*
 * Emit a task record for @task: PERF_RECORD_FORK when @new is non-zero,
 * PERF_RECORD_EXIT otherwise. Nothing is done unless at least one comm,
 * mmap or task event is accounted. @task_ctx is passed on to
 * perf_iterate_sb().
 */
static void perf_event_task(struct task_struct *task,
                              struct perf_event_context *task_ctx,
                              int new)
{
        struct perf_task_event task_event;

        if (!(atomic_read(&nr_comm_events) ||
              atomic_read(&nr_mmap_events) ||
              atomic_read(&nr_task_events)))
                return;

        /* pid/ppid/tid/ptid/time start at zero and are filled per event. */
        task_event = (struct perf_task_event){
                .task     = task,
                .task_ctx = task_ctx,
                .event_id = {
                        .header = {
                                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
                                .misc = 0,
                                .size = sizeof(task_event.event_id),
                        },
                },
        };

        perf_iterate_sb(perf_event_task_output, &task_event, task_ctx);
}

/*
 * Fork notification for @task: emits a task record with new=1
 * (PERF_RECORD_FORK) and no specific task context, followed by a
 * namespaces record for the same task.
 */
void perf_event_fork(struct task_struct *task)
{
        perf_event_task(task, NULL, 1);
        perf_event_namespaces(task);
}

/*
 * comm tracking
 */

/*
 * Working state for one comm notification.
 *
 * @task:      task whose comm is reported.
 * @comm:      padded copy of the comm string, set by perf_event_comm_event().
 * @comm_size: number of bytes of @comm copied after @event_id.
 * @event_id:  fixed part of the record written to the output buffer.
 */
struct perf_comm_event {
        struct task_struct        *task;
        char                        *comm;
        int                        comm_size;

        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
        } event_id;
};

/* An event receives comm records only if attr.comm is set. */
static int perf_event_comm_match(struct perf_event *event)
{
        return event->attr.comm;
}

/*
 * perf_iterate_sb() callback: write one comm record for @event. @data is the
 * struct perf_comm_event; the fixed event_id is followed by comm_size bytes
 * of the comm string. header.size is restored on the way out so the shared
 * record can be reused for the next event.
 */
static void perf_event_comm_output(struct perf_event *event,
                                   void *data)
{
        struct perf_comm_event *ce = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int base_size = ce->event_id.header.size;

        if (!perf_event_comm_match(event))
                return;

        perf_event_header__init_id(&ce->event_id.header, &sample, event);

        if (!perf_output_begin(&handle, &sample, event,
                               ce->event_id.header.size)) {
                ce->event_id.pid = perf_event_pid(event, ce->task);
                ce->event_id.tid = perf_event_tid(event, ce->task);

                perf_output_put(&handle, ce->event_id);
                __output_copy(&handle, ce->comm, ce->comm_size);

                perf_event__output_id_sample(event, &handle, &sample);

                perf_output_end(&handle);
        }

        ce->event_id.header.size = base_size;
}

/*
 * Snapshot the task's comm into a zeroed on-stack buffer, round its length
 * (including the NUL) up to a multiple of sizeof(u64), record the resulting
 * record size in the header and deliver the record to all side-band
 * listeners.
 */
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
        char comm[TASK_COMM_LEN];
        unsigned int size;

        /* Zero first so the alignment padding after the string is clean. */
        memset(comm, 0, sizeof(comm));
        /*
         * strscpy() bounds the read of the source by the destination size;
         * the deprecated strlcpy() would run strlen() over the whole source
         * first.
         */
        strscpy(comm, comm_event->task->comm, sizeof(comm));
        size = ALIGN(strlen(comm)+1, sizeof(u64));

        comm_event->comm = comm;
        comm_event->comm_size = size;

        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;

        perf_iterate_sb(perf_event_comm_output,
                       comm_event,
                       NULL);
}

/*
 * Comm-change notification for @task. @exec selects whether
 * PERF_RECORD_MISC_COMM_EXEC is set in the header's misc field. Does nothing
 * when no comm events are accounted.
 */
void perf_event_comm(struct task_struct *task, bool exec)
{
        if (atomic_read(&nr_comm_events)) {
                /*
                 * comm, comm_size, header.size, pid and tid are left zero
                 * here and filled in further down the call chain.
                 */
                struct perf_comm_event comm_event = {
                        .task     = task,
                        .event_id = {
                                .header = {
                                        .type = PERF_RECORD_COMM,
                                        .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
                                },
                        },
                };

                perf_event_comm_event(&comm_event);
        }
}

/*
 * namespaces tracking
 */

/*
 * Working state for one namespaces notification.
 *
 * @task:     task whose namespaces are reported.
 * @event_id: the record written to the output buffer; link_info[] has one
 *            slot per namespace index, filled by perf_event_namespaces().
 */
struct perf_namespaces_event {
        struct task_struct                *task;

        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
                u64                                nr_namespaces;
                struct perf_ns_link_info        link_info[NR_NAMESPACES];
        } event_id;
};

/* An event receives namespaces records only if attr.namespaces is set. */
static int perf_event_namespaces_match(struct perf_event *event)
{
        return event->attr.namespaces;
}

/*
 * perf_iterate_sb() callback: write one namespaces record for @event.
 * @data is the struct perf_namespaces_event; pid/tid are resolved per event.
 * header.size is restored before returning so the record can be reused.
 */
static void perf_event_namespaces_output(struct perf_event *event,
                                         void *data)
{
        struct perf_namespaces_event *ne = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u16 base_size = ne->event_id.header.size;

        if (!perf_event_namespaces_match(event))
                return;

        perf_event_header__init_id(&ne->event_id.header, &sample, event);

        if (!perf_output_begin(&handle, &sample, event,
                               ne->event_id.header.size)) {
                ne->event_id.pid = perf_event_pid(event, ne->task);
                ne->event_id.tid = perf_event_tid(event, ne->task);

                perf_output_put(&handle, ne->event_id);

                perf_event__output_id_sample(event, &handle, &sample);

                perf_output_end(&handle);
        }

        ne->event_id.header.size = base_size;
}

static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
                                   struct task_struct *task,
                                   const struct proc_ns_operations *ns_ops)
{
        struct path ns_path;
        struct inode *ns_inode;
        int error;

        error = ns_get_path(&ns_path, task, ns_ops);
        if (!error) {
                ns_inode = ns_path.dentry->d_inode;
                ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
                ns_link_info->ino = ns_inode->i_ino;
                path_put(&ns_path);
        }
}

void perf_event_namespaces(struct task_struct *task)
{
        struct perf_namespaces_event namespaces_event;
        struct perf_ns_link_info *ns_link_info;

        if (!atomic_read(&nr_namespaces_events))
                return;

        namespaces_event = (struct perf_namespaces_event){
                .task        = task,
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_NAMESPACES,
                                .misc = 0,
                                .size = sizeof(namespaces_event.event_id),
                        },
                        /* .pid */
                        /* .tid */
                        .nr_namespaces = NR_NAMESPACES,
                        /* .link_info[NR_NAMESPACES] */
                },
        };

        ns_link_info = namespaces_event.event_id.link_info;

        perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
                               task, &mntns_operations);

#ifdef CONFIG_USER_NS
        perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
                               task, &userns_operations);
#endif
#ifdef CONFIG_NET_NS
        perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
                               task, &netns_operations);
#endif
#ifdef CONFIG_UTS_NS
        perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
                               task, &utsns_operations);
#endif
#ifdef CONFIG_IPC_NS
        perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
                               task, &ipcns_operations);
#endif
#ifdef CONFIG_PID_NS
        perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
                               task, &pidns_operations);
#endif
#ifdef CONFIG_CGROUPS
        perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
                               task, &cgroupns_operations);
#endif

        perf_iterate_sb(perf_event_namespaces_output,
                        &namespaces_event,
                        NULL);
}

/*
 * cgroup tracking
 */
#ifdef CONFIG_CGROUP_PERF

/*
 * Working state for one cgroup notification.
 *
 * @path:      padded cgroup path string, written after @event_id.
 * @path_size: number of bytes of @path to write.
 * @event_id:  fixed part of the record (header + cgroup id); the trailing
 *             flexible path[] member contributes no storage here.
 */
struct perf_cgroup_event {
        char                                *path;
        int                                path_size;
        struct {
                struct perf_event_header        header;
                u64                                id;
                char                                path[];
        } event_id;
};

/* An event receives cgroup records only if attr.cgroup is set. */
static int perf_event_cgroup_match(struct perf_event *event)
{
        return event->attr.cgroup;
}

/*
 * perf_iterate_sb() callback: write one cgroup record for @event -- the
 * fixed event_id followed by path_size bytes of the path string.
 * header.size is restored before returning so the record can be reused.
 */
static void perf_event_cgroup_output(struct perf_event *event, void *data)
{
        struct perf_cgroup_event *ce = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u16 base_size = ce->event_id.header.size;

        if (!perf_event_cgroup_match(event))
                return;

        perf_event_header__init_id(&ce->event_id.header, &sample, event);

        if (!perf_output_begin(&handle, &sample, event,
                               ce->event_id.header.size)) {
                perf_output_put(&handle, ce->event_id);
                __output_copy(&handle, ce->path, ce->path_size);

                perf_event__output_id_sample(event, &handle, &sample);

                perf_output_end(&handle);
        }

        ce->event_id.header.size = base_size;
}

/*
 * Cgroup notification for @cgrp: builds a PERF_RECORD_CGROUP record carrying
 * the cgroup id and its path and delivers it to all side-band listeners.
 * If the path buffer cannot be allocated, the literal "//enomem" is
 * reported as the path instead. Does nothing when no cgroup events are
 * accounted.
 */
static void perf_event_cgroup(struct cgroup *cgrp)
{
        struct perf_cgroup_event cgroup_event;
        char path_enomem[16] = "//enomem";
        char *pathname;
        size_t len;

        if (!atomic_read(&nr_cgroup_events))
                return;

        cgroup_event = (struct perf_cgroup_event){
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_CGROUP,
                                .misc = 0,
                                .size = sizeof(cgroup_event.event_id),
                        },
                        .id = cgroup_id(cgrp),
                },
        };

        pathname = kmalloc(PATH_MAX, GFP_KERNEL);
        if (pathname) {
                /* just to be sure to have enough space for alignment */
                cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
                cgroup_event.path = pathname;
        } else {
                cgroup_event.path = path_enomem;
        }

        /*
         * Since our buffer works in 8 byte units we need to align our string
         * size to a multiple of 8. However, we must guarantee the tail end is
         * zero'd out to avoid leaking random bits to userspace.
         */
        for (len = strlen(cgroup_event.path) + 1;
             !IS_ALIGNED(len, sizeof(u64)); len++)
                cgroup_event.path[len] = '\0';

        cgroup_event.event_id.header.size += len;
        cgroup_event.path_size = len;

        perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL);

        /* kfree(NULL) is fine for the //enomem case. */
        kfree(pathname);
}

#endif

/*
 * mmap tracking
 */

/*
 * Working state for one mmap notification.
 *
 * @vma:       the mapping being reported.
 * @file_name: padded name string written after the fixed fields.
 * @file_size: number of bytes of @file_name to write.
 * @maj, @min, @ino, @ino_generation, @prot, @flags:
 *             extra fields emitted only for events with attr.mmap2 set
 *             (see perf_event_mmap_output()).
 * @event_id:  fixed part of the record written to the output buffer.
 */
struct perf_mmap_event {
        struct vm_area_struct        *vma;

        const char                *file_name;
        int                        file_size;
        int                        maj, min;
        u64                        ino;
        u64                        ino_generation;
        u32                        prot, flags;

        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
                u64                                start;
                u64                                len;
                u64                                pgoff;
        } event_id;
};

/*
 * Decide whether @event wants the mmap record described by @data (a struct
 * perf_mmap_event): executable mappings go to events with attr.mmap or
 * attr.mmap2, non-executable ones to events with attr.mmap_data.
 * Returns 1 or 0.
 */
static int perf_event_mmap_match(struct perf_event *event,
                                 void *data)
{
        struct perf_mmap_event *me = data;

        if (me->vma->vm_flags & VM_EXEC)
                return event->attr.mmap || event->attr.mmap2;

        return !!event->attr.mmap_data;
}

/*
 * perf_iterate_sb() callback: write one mmap record for @event.
 *
 * Events with attr.mmap2 get a PERF_RECORD_MMAP2 record, which carries the
 * additional maj/min/ino/ino_generation/prot/flags fields between the fixed
 * event_id and the file name. Because the shared header is modified for
 * such events, both header.size and header.type are restored before
 * returning.
 */
static void perf_event_mmap_output(struct perf_event *event,
                                   void *data)
{
        struct perf_mmap_event *me = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int saved_size = me->event_id.header.size;
        u32 saved_type = me->event_id.header.type;

        if (!perf_event_mmap_match(event, data))
                return;

        if (event->attr.mmap2) {
                me->event_id.header.type = PERF_RECORD_MMAP2;
                me->event_id.header.size += sizeof(me->maj) +
                                            sizeof(me->min) +
                                            sizeof(me->ino) +
                                            sizeof(me->ino_generation) +
                                            sizeof(me->prot) +
                                            sizeof(me->flags);
        }

        perf_event_header__init_id(&me->event_id.header, &sample, event);

        if (!perf_output_begin(&handle, &sample, event,
                               me->event_id.header.size)) {
                me->event_id.pid = perf_event_pid(event, current);
                me->event_id.tid = perf_event_tid(event, current);

                perf_output_put(&handle, me->event_id);

                if (event->attr.mmap2) {
                        perf_output_put(&handle, me->maj);
                        perf_output_put(&handle, me->min);
                        perf_output_put(&handle, me->ino);
                        perf_output_put(&handle, me->ino_generation);
                        perf_output_put(&handle, me->prot);
                        perf_output_put(&handle, me->flags);
                }

                __output_copy(&handle, me->file_name, me->file_size);

                perf_event__output_id_sample(event, &handle, &sample);

                perf_output_end(&handle);
        }

        me->event_id.header.size = saved_size;
        me->event_id.header.type = saved_type;
}

static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
        struct vm_area_struct *vma = mmap_event->vma;
        struct file *file = vma->vm_file;
        int maj = 0, min = 0;
        u64 ino = 0, gen = 0;
        u32 prot = 0, flags = 0;
        unsigned int size;
        char tmp[16];
        char *buf = NULL;
        char *name;

        if (vma->vm_flags & VM_READ)
                prot |= PROT_READ;
        if (vma->vm_flags & VM_WRITE)
                prot |= PROT_WRITE;
        if (vma->vm_flags & VM_EXEC)
                prot |= PROT_EXEC;

        if (vma->vm_flags & VM_MAYSHARE)
                flags = MAP_SHARED;
        else
                flags = MAP_PRIVATE;

        if (vma->vm_flags & VM_DENYWRITE)
                flags |= MAP_DENYWRITE;
        if (vma->vm_flags & VM_MAYEXEC)
                flags |= MAP_EXECUTABLE;
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;
        if (is_vm_hugetlb_page(vma))
                flags |= MAP_HUGETLB;

        if (file) {
                struct inode *inode;
                dev_t dev;

                buf = kmalloc(PATH_MAX, GFP_KERNEL);
                if (!buf) {
                        name = "//enomem";
                        goto cpy_name;
                }
                /*
                 * d_path() works from the end of the rb backwards, so we
                 * need to add enough zero bytes after the string to handle
                 * the 64bit alignment we do later.
                 */
                name = file_path(file, buf, PATH_MAX - sizeof(u64));
                if (IS_ERR(name)) {
                        name = "//toolong";
                        goto cpy_name;
                }
                inode = file_inode(vma->vm_file);
                dev = inode->i_sb->s_dev;
                ino = inode->i_ino;
                gen = inode->i_generation;
                maj = MAJOR(dev);
                min = MINOR(dev);

                goto got_name;
        } else {
                if (vma->vm_ops && vma->vm_ops->name) {
                        name = (char *) vma->vm_ops->name(vma);
                        if (name)
                                goto cpy_name;
                }

                name = (char *)arch_vma_name(vma);
                if (name)
                        goto cpy_name;

                if (vma->vm_start <= vma->vm_mm->start_brk &&
                                vma->vm_end >= vma->vm_mm->brk) {
                        name = "[heap]";
                        goto cpy_name;
                }
                if (vma->vm_start <= vma->vm_mm->start_stack &&
                                vma->vm_end >= vma->vm_mm->start_stack) {
                        name = "[stack]";
                        goto cpy_name;
                }

                name = "//anon";
                goto cpy_name;
        }

cpy_name:
        strlcpy(tmp, name, sizeof(tmp));
        name = tmp;
got_name:
        /*
         * Since our buffer works in 8 byte units we need to align our string
         * size to a multiple of 8. However, we must guarantee the tail end is
         * zero'd out to avoid leaking random bits to userspace.
         */
        size = strlen(name)+1;
        while (!IS_ALIGNED(size, sizeof(u64)))
                name[size++] = '\0';

        mmap_event->file_name = name;
        mmap_event->file_size = size;
        mmap_event->maj = maj;
        mmap_event->min = min;
        mmap_event->ino = ino;
        mmap_event->ino_generation = gen;
        mmap_event->prot = prot;
        mmap_event->flags = flags;

        if (!(vma->vm_flags & VM_EXEC))
                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;

        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;

        perf_iterate_sb(perf_event_mmap_output,
                       mmap_event,
                       NULL);

        kfree(buf);
}

/*
 * Check whether inode and address range match filter criteria.
 *
 * Returns true when @filter refers to the same inode as @file and the
 * filter's [offset, offset + size] range touches the range starting at
 * @offset and spanning @size bytes (both ends inclusive).
 */
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
                                     struct file *file, unsigned long offset,
                                     unsigned long size)
{
        /* d_inode(NULL) won't be equal to any mapped user-space file */
        if (!filter->path.dentry)
                return false;

        if (d_inode(filter->path.dentry) != file_inode(file))
                return false;

        /* Filter must start no later than the range ends, and end no earlier than it starts. */
        return filter->offset <= offset + size &&
               filter->offset + filter->size >= offset;
}

/*
 * If @filter matches @vma's file and file-offset range, store in @fr the
 * virtual address range inside @vma that the filter covers and return true.
 * Returns false, leaving @fr untouched, when the filter does not match.
 */
static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
                                        struct vm_area_struct *vma,
                                        struct perf_addr_filter_range *fr)
{
        unsigned long vma_size = vma->vm_end - vma->vm_start;
        unsigned long off = vma->vm_pgoff << PAGE_SHIFT;

        if (!perf_addr_filter_match(filter, vma->vm_file, off, vma_size))
                return false;

        if (filter->offset >= off) {
                /* Filter begins inside the mapping. */
                fr->start = vma->vm_start + filter->offset - off;
                fr->size = min(vma->vm_end - fr->start, filter->size);
        } else {
                /* Filter begins before the mapping: clip its head. */
                fr->start = vma->vm_start;
                fr->size = min(vma_size, filter->size - (off - filter->offset));
        }

        return true;
}

static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct vm_area_struct *vma = data;
        struct perf_addr_filter *filter;
        unsigned int restart = 0, count = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        if (!vma->vm_file)
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (perf_addr_filter_vma_adjust(filter, vma,
                                                &event->addr_filter_ranges[count]))
                        restart++;

                count++;
        }

        if (restart)
                event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (restart)
                perf_event_stop(event, 1);
}

/*
 * Adjust all task's events' filters to the new vma.
 *
 * Only executable mappings are considered; every event in each of current's
 * task contexts is visited.
 */
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
        int ctxn;

        /*
         * Data tracing isn't supported yet and as such there is no need
         * to keep track of anything that isn't related to executable code:
         */
        if (!(vma->vm_flags & VM_EXEC))
                return;

        rcu_read_lock();
        for_each_task_context_nr(ctxn) {
                struct perf_event_context *ctx =
                        rcu_dereference(current->perf_event_ctxp[ctxn]);

                if (ctx)
                        perf_iterate_ctx(ctx, __perf_addr_filters_adjust,
                                         vma, true);
        }
        rcu_read_unlock();
}

/*
 * mmap notification for @vma: re-evaluates the address filters of current's
 * events against the new mapping and emits an mmap record. Does nothing
 * when no mmap events are accounted.
 */
void perf_event_mmap(struct vm_area_struct *vma)
{
        struct perf_mmap_event mmap_event;

        if (!atomic_read(&nr_mmap_events))
                return;

        /*
         * file_name/file_size, header.size, pid/tid and the mmap2-only
         * fields (maj, min, ino, ino_generation, prot, flags) start out
         * zero and are filled in later.
         */
        mmap_event = (struct perf_mmap_event){
                .vma      = vma,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_MMAP,
                                .misc = PERF_RECORD_MISC_USER,
                        },
                        .start = vma->vm_start,
                        .len   = vma->vm_end - vma->vm_start,
                        .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
                },
        };

        perf_addr_filters_adjust(vma);
        perf_event_mmap_event(&mmap_event);
}

/*
 * Write a PERF_RECORD_AUX record for @event carrying @head (as the offset
 * field), @size and @flags. Nothing is written if the output buffer cannot
 * be reserved.
 */
void perf_event_aux_event(struct perf_event *event, unsigned long head,
                          unsigned long size, u64 flags)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u64                                offset;
                u64                                size;
                u64                                flags;
        } aux_rec = {
                .header = {
                        .type = PERF_RECORD_AUX,
                        .misc = 0,
                        .size = sizeof(aux_rec),
                },
                .offset = head,
                .size   = size,
                .flags  = flags,
        };

        perf_event_header__init_id(&aux_rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, aux_rec.header.size))
                return;

        perf_output_put(&handle, aux_rec);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Lost/dropped samples logging
 *
 * Write a PERF_RECORD_LOST_SAMPLES record for @event reporting @lost.
 * Nothing is written if the output buffer cannot be reserved.
 */
void perf_log_lost_samples(struct perf_event *event, u64 lost)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct {
                struct perf_event_header        header;
                u64                                lost;
        } rec = {
                .header = {
                        .type = PERF_RECORD_LOST_SAMPLES,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .lost = lost,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * context_switch tracking
 */

/*
 * Argument block handed to perf_event_switch_output() through
 * perf_iterate_sb().
 */
struct perf_switch_event {
        /* Task passed to perf_event_switch() as the one being switched. */
        struct task_struct        *task;
        /* The other task of the switch; source of next_prev_pid/tid. */
        struct task_struct        *next_prev;

        /*
         * Record as written to the buffer.  Per-task events get only
         * @header; CPU-wide events get the whole structure.
         */
        struct {
                struct perf_event_header        header;
                u32                                next_prev_pid;
                u32                                next_prev_tid;
        } event_id;
};

/* Return non-zero when @event requested context-switch records. */
static int perf_event_switch_match(struct perf_event *event)
{
        int wants_switch = event->attr.context_switch;

        return wants_switch;
}

/*
 * perf_iterate_sb() callback: emit one context-switch record for @event.
 * @data is the struct perf_switch_event prepared by perf_event_switch();
 * its header type/size (and pid/tid for CPU-wide events) are rewritten
 * for every event visited.
 */
static void perf_event_switch_output(struct perf_event *event, void *data)
{
        struct perf_switch_event *sw = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_switch_match(event))
                return;

        /* Only CPU-wide events are allowed to see next/prev pid/tid */
        if (!event->ctx->task) {
                sw->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
                sw->event_id.header.size = sizeof(sw->event_id);
                sw->event_id.next_prev_pid =
                                perf_event_pid(event, sw->next_prev);
                sw->event_id.next_prev_tid =
                                perf_event_tid(event, sw->next_prev);
        } else {
                sw->event_id.header.type = PERF_RECORD_SWITCH;
                sw->event_id.header.size = sizeof(sw->event_id.header);
        }

        perf_event_header__init_id(&sw->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              sw->event_id.header.size))
                return;

        /* Bare header for per-task events, full record for CPU-wide ones. */
        if (!event->ctx->task)
                perf_output_put(&handle, sw->event_id);
        else
                perf_output_put(&handle, sw->event_id.header);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Build a context-switch record template for @task and hand it to every
 * interested event via perf_iterate_sb().
 *
 * @task:      task being switched
 * @next_prev: the other task involved in the switch
 * @sched_in:  true for a switch-in, false for a switch-out
 */
static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in)
{
        /* N.B. caller checks nr_switch_events != 0 */
        struct perf_switch_event switch_event = {
                .task           = task,
                .next_prev      = next_prev,
                .event_id       = {
                        .header = {
                                /* .type and .size are set per event */
                                .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
                        },
                        /* .next_prev_pid and .next_prev_tid likewise */
                },
        };

        /* A task switched out while still runnable was preempted. */
        if (!sched_in && task->state == TASK_RUNNING)
                switch_event.event_id.header.misc |=
                                PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;

        perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
}

/*
 * IRQ throttle logging
 */

/*
 * Emit a PERF_RECORD_THROTTLE (@enable == 0) or PERF_RECORD_UNTHROTTLE
 * (@enable != 0) record for @event, stamped with the event clock, the
 * primary event id and the event's own id.
 */
static void perf_log_throttle(struct perf_event *event, int enable)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct {
                struct perf_event_header        header;
                u64                             time;
                u64                             id;
                u64                             stream_id;
        } rec = {
                .header = {
                        .type = enable ? PERF_RECORD_UNTHROTTLE
                                       : PERF_RECORD_THROTTLE,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .time           = perf_event_clock(event),
                .id             = primary_event_id(event),
                .stream_id      = event->id,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * ksymbol register/unregister tracking
 */

/*
 * Argument block handed to perf_event_ksymbol_output() through
 * perf_iterate_sb().
 */
struct perf_ksymbol_event {
        /* Symbol name, NUL-terminated and zero-padded by perf_event_ksymbol(). */
        const char        *name;
        /* Bytes of @name to copy out, including terminator and padding. */
        int                name_len;
        /* Fixed-size part of the record; @name is copied right after it. */
        struct {
                struct perf_event_header        header;
                u64                                addr;
                u32                                len;
                u16                                ksym_type;
                u16                                flags;
        } event_id;
};

/* Return non-zero when @event requested ksymbol records. */
static int perf_event_ksymbol_match(struct perf_event *event)
{
        int wants_ksymbol = event->attr.ksymbol;

        return wants_ksymbol;
}

/*
 * perf_iterate_sb() callback: write the ksymbol record described by @data
 * (a struct perf_ksymbol_event) to @event, followed by the padded name.
 */
static void perf_event_ksymbol_output(struct perf_event *event, void *data)
{
        struct perf_ksymbol_event *ksym = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_ksymbol_match(event))
                return;

        perf_event_header__init_id(&ksym->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              ksym->event_id.header.size))
                return;

        /* Fixed part first, then the variable-length name. */
        perf_output_put(&handle, ksym->event_id);
        __output_copy(&handle, ksym->name, ksym->name_len);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Report registration or unregistration of a kernel symbol.
 *
 * @ksym_type:  PERF_RECORD_KSYMBOL_TYPE_* value; UNKNOWN or anything at or
 *              above _MAX triggers a one-time warning and is dropped
 * @addr:       symbol address
 * @len:        symbol length
 * @unregister: true to set PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER
 * @sym:        symbol name, copied (truncated to KSYM_NAME_LEN) locally
 *
 * Does nothing while nr_ksymbol_events is zero.
 */
void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
                        const char *sym)
{
        struct perf_ksymbol_event ksymbol_event;
        char name[KSYM_NAME_LEN];
        u16 flags = unregister ? PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER : 0;
        int name_len;

        if (!atomic_read(&nr_ksymbol_events))
                return;

        if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
            ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) {
                WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
                return;
        }

        /* The padding loop below relies on the buffer being u64-sized. */
        BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));

        strlcpy(name, sym, KSYM_NAME_LEN);

        /* Zero-pad the terminated name up to the next u64 boundary. */
        for (name_len = strlen(name) + 1;
             !IS_ALIGNED(name_len, sizeof(u64));
             name_len++)
                name[name_len] = '\0';

        ksymbol_event = (struct perf_ksymbol_event){
                .name           = name,
                .name_len       = name_len,
                .event_id       = {
                        .header = {
                                .type = PERF_RECORD_KSYMBOL,
                                .size = sizeof(ksymbol_event.event_id) +
                                        name_len,
                        },
                        .addr           = addr,
                        .len            = len,
                        .ksym_type      = ksym_type,
                        .flags          = flags,
                },
        };

        perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
}

/*
 * bpf program load/unload tracking
 */

/*
 * Argument block handed to perf_event_bpf_output() through
 * perf_iterate_sb().
 */
struct perf_bpf_event {
        /* Program the event refers to, as passed to perf_event_bpf_event(). */
        struct bpf_prog        *prog;
        /* Record as written to the buffer. */
        struct {
                struct perf_event_header        header;
                u16                                type;
                u16                                flags;
                u32                                id;
                /* Copied from prog->tag by perf_event_bpf_event(). */
                u8                                tag[BPF_TAG_SIZE];
        } event_id;
};

/* Return non-zero when @event requested BPF event records. */
static int perf_event_bpf_match(struct perf_event *event)
{
        int wants_bpf = event->attr.bpf_event;

        return wants_bpf;
}

/*
 * perf_iterate_sb() callback: write the BPF record described by @data
 * (a struct perf_bpf_event) to @event.
 */
static void perf_event_bpf_output(struct perf_event *event, void *data)
{
        struct perf_bpf_event *be = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_bpf_match(event))
                return;

        perf_event_header__init_id(&be->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              be->event_id.header.size))
                return;

        perf_output_put(&handle, be->event_id);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
                                         enum perf_bpf_event_type type)
{
        bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
        int i;

        if (prog->aux->func_cnt == 0) {
                perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
                                   (u64)(unsigned long)prog->bpf_func,
                                   prog->jited_len, unregister,
                                   prog->aux->ksym.name);
        } else {
                for (i = 0; i < prog->aux->func_cnt; i++) {
                        struct bpf_prog *subprog = prog->aux->func[i];

                        perf_event_ksymbol(
                                PERF_RECORD_KSYMBOL_TYPE_BPF,
                                (u64)(unsigned long)subprog->bpf_func,
                                subprog->jited_len, unregister,
                                subprog->aux->ksym.name);
                }
        }
}

/*
 * Report a BPF program event.
 *
 * @prog:  program concerned
 * @type:  PERF_BPF_EVENT_* value; out-of-range values are ignored
 * @flags: copied verbatim into the record
 *
 * For program load/unload, ksymbol records are emitted first if anyone
 * listens for them.  The PERF_RECORD_BPF_EVENT record itself is only
 * generated while nr_bpf_events is non-zero.
 */
void perf_event_bpf_event(struct bpf_prog *prog,
                          enum perf_bpf_event_type type,
                          u16 flags)
{
        struct perf_bpf_event bpf_event;

        if (type <= PERF_BPF_EVENT_UNKNOWN ||
            type >= PERF_BPF_EVENT_MAX)
                return;

        if ((type == PERF_BPF_EVENT_PROG_LOAD ||
             type == PERF_BPF_EVENT_PROG_UNLOAD) &&
            atomic_read(&nr_ksymbol_events))
                perf_event_bpf_emit_ksymbols(prog, type);

        if (!atomic_read(&nr_bpf_events))
                return;

        /* The tag is emitted inline, so it must keep the record u64-sized. */
        BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));

        bpf_event = (struct perf_bpf_event){
                .prog           = prog,
                .event_id       = {
                        .header = {
                                .type = PERF_RECORD_BPF_EVENT,
                                .size = sizeof(bpf_event.event_id),
                        },
                        .type   = type,
                        .flags  = flags,
                        .id     = prog->aux->id,
                },
        };
        memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);

        perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}

/*
 * Argument block handed to perf_event_text_poke_output() through
 * perf_iterate_sb().  The record written is: event_id, old_len, new_len,
 * the old bytes, the new bytes, then @pad zero bytes.
 */
struct perf_text_poke_event {
        const void                *old_bytes;
        const void                *new_bytes;
        /* Zero bytes appended so the variable part ends u64-aligned. */
        size_t                        pad;
        u16                        old_len;
        u16                        new_len;

        /* Fixed-size leading part of the record. */
        struct {
                struct perf_event_header        header;

                u64                                addr;
        } event_id;
};

/* Return non-zero when @event requested text-poke records. */
static int perf_event_text_poke_match(struct perf_event *event)
{
        int wants_text_poke = event->attr.text_poke;

        return wants_text_poke;
}

/*
 * perf_iterate_sb() callback: write the text-poke record described by
 * @data (a struct perf_text_poke_event) to @event.
 */
static void perf_event_text_poke_output(struct perf_event *event, void *data)
{
        struct perf_text_poke_event *poke = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u64 zeroes = 0;

        if (!perf_event_text_poke_match(event))
                return;

        perf_event_header__init_id(&poke->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              poke->event_id.header.size))
                return;

        /* Fixed part, then both lengths, then both byte strings. */
        perf_output_put(&handle, poke->event_id);
        perf_output_put(&handle, poke->old_len);
        perf_output_put(&handle, poke->new_len);

        __output_copy(&handle, poke->old_bytes, poke->old_len);
        __output_copy(&handle, poke->new_bytes, poke->new_len);

        /* Trailing zero padding computed by perf_event_text_poke(). */
        if (poke->pad)
                __output_copy(&handle, &zeroes, poke->pad);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Report a modification of kernel text at @addr: @old_len bytes at
 * @old_bytes were replaced by @new_len bytes at @new_bytes.
 *
 * Does nothing while nr_text_poke_events is zero.
 */
void perf_event_text_poke(const void *addr, const void *old_bytes,
                          size_t old_len, const void *new_bytes, size_t new_len)
{
        struct perf_text_poke_event text_poke_event;
        size_t tot, pad;

        if (!atomic_read(&nr_text_poke_events))
                return;

        /* Variable part: two length fields plus both byte strings ... */
        tot = sizeof(text_poke_event.old_len) + old_len +
              sizeof(text_poke_event.new_len) + new_len;
        /* ... rounded up to a u64 boundary with zero padding. */
        pad = ALIGN(tot, sizeof(u64)) - tot;

        text_poke_event = (struct perf_text_poke_event){
                .old_bytes      = old_bytes,
                .new_bytes      = new_bytes,
                .pad            = pad,
                .old_len        = old_len,
                .new_len        = new_len,
                .event_id       = {
                        .header = {
                                .type = PERF_RECORD_TEXT_POKE,
                                .misc = PERF_RECORD_MISC_KERNEL,
                                .size = sizeof(text_poke_event.event_id) + tot + pad,
                        },
                        .addr = (unsigned long)addr,
                },
        };

        perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
}

/*
 * Mark @event as having started instruction tracing by setting
 * PERF_ATTACH_ITRACE in its attach_state.
 */
void perf_event_itrace_started(struct perf_event *event)
{
        event->attach_state = event->attach_state | PERF_ATTACH_ITRACE;
}

/*
 * Emit a PERF_RECORD_ITRACE_START record with the current task's pid/tid.
 *
 * The record is attributed to the parent event when there is one.  It is
 * skipped when the PMU lacks PERF_PMU_CAP_ITRACE or the event already has
 * PERF_ATTACH_ITRACE set.
 */
static void perf_log_itrace_start(struct perf_event *event)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct {
                struct perf_event_header        header;
                u32                             pid;
                u32                             tid;
        } rec;

        if (event->parent)
                event = event->parent;

        if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE))
                return;
        if (event->attach_state & PERF_ATTACH_ITRACE)
                return;

        rec.header.type = PERF_RECORD_ITRACE_START;
        rec.header.misc = 0;
        rec.header.size = sizeof(rec);
        rec.pid         = perf_event_pid(event, current);
        rec.tid         = perf_event_tid(event, current);

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Account one interrupt against @event and, when @throttle is set, throttle
 * the event once it exceeds max_samples_per_tick within the current
 * perf_throttled_seq window.  For frequency-based events the sampling period
 * is also re-evaluated.
 *
 * Returns 1 if the event was throttled by this call, 0 otherwise.
 */
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
        struct hw_perf_event *hwc = &event->hw;
        u64 seq = __this_cpu_read(perf_throttled_seq);
        int throttled = 0;

        if (seq == hwc->interrupts_seq) {
                hwc->interrupts++;
                if (unlikely(throttle &&
                             hwc->interrupts > max_samples_per_tick)) {
                        __this_cpu_inc(perf_throttled_count);
                        tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
                        hwc->interrupts = MAX_INTERRUPTS;
                        perf_log_throttle(event, 0);
                        throttled = 1;
                }
        } else {
                /* First interrupt of a new window: restart the count. */
                hwc->interrupts_seq = seq;
                hwc->interrupts = 1;
        }

        if (event->attr.freq) {
                u64 now = perf_clock();
                s64 elapsed = now - hwc->freq_time_stamp;

                hwc->freq_time_stamp = now;

                /* Only adjust on plausible, recent intervals. */
                if (elapsed > 0 && elapsed < 2*TICK_NSEC)
                        perf_adjust_period(event, elapsed, hwc->last_period, true);
        }

        return throttled;
}

/*
 * Account an interrupt for @event with throttling enabled.
 * Returns 1 if the event got throttled, 0 otherwise.
 */
int perf_event_account_interrupt(struct perf_event *event)
{
        int throttled = __perf_event_account_interrupt(event, 1);

        return throttled;
}

/*
 * Generic event overflow handling, sampling.
 */

/*
 * Common overflow path for a sampling event: account the interrupt
 * (throttling if @throttle is set), enforce event_limit, invoke the
 * event's overflow handler and queue the pending irq_work when needed.
 *
 * Returns 0 for non-sampling events.  Otherwise returns non-zero when the
 * event was throttled or its event_limit just reached zero.
 */
static int __perf_event_overflow(struct perf_event *event,
                                   int throttle, struct perf_sample_data *data,
                                   struct pt_regs *regs)
{
        /* Snapshot taken up front; zero means no limit is armed. */
        int events = atomic_read(&event->event_limit);
        int ret = 0;

        /*
         * Non-sampling counters might still use the PMI to fold short
         * hardware counters, ignore those.
         */
        if (unlikely(!is_sampling_event(event)))
                return 0;

        ret = __perf_event_account_interrupt(event, throttle);

        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
         */

        event->pending_kill = POLL_IN;
        /* Limit exhausted by this overflow: signal HUP and disable the event. */
        if (events && atomic_dec_and_test(&event->event_limit)) {
                ret = 1;
                event->pending_kill = POLL_HUP;

                perf_event_disable_inatomic(event);
        }

        /* The handler pointer is read once, then called. */
        READ_ONCE(event->overflow_handler)(event, data, regs);

        /*
         * NOTE(review): presumably checks whether an fasync listener is
         * registered - confirm against perf_event_fasync().
         */
        if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
                irq_work_queue(&event->pending);
        }

        return ret;
}

/*
 * Handle an overflow of @event with throttling enabled.
 * Returns the result of __perf_event_overflow().
 */
int perf_event_overflow(struct perf_event *event,
                          struct perf_sample_data *data,
                          struct pt_regs *regs)
{
        int ret = __perf_event_overflow(event, 1, data, regs);

        return ret;
}

/*
 * Generic software event infrastructure
 */

/* Per-CPU state for software events (see the swevent_htable per-CPU variable). */
struct swevent_htable {
        /* RCU-protected hash list; NULL until allocated by swevent_hlist_get_cpu(). */
        struct swevent_hlist                *swevent_hlist;
        /* Serializes allocation/release of @swevent_hlist and @hlist_refcount. */
        struct mutex                        hlist_mutex;
        /* Users of the hash list; it is released when this drops to zero. */
        int                                hlist_refcount;

        /* Recursion avoidance, one counter per context */
        int                                recursion[PERF_NR_CONTEXTS];
};

/* One software-event hash table (and recursion state) per CPU. */
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);

/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

/*
 * Re-arm the period counter of a software event.
 *
 * If period_left is still negative nothing has elapsed and 0 is returned.
 * Otherwise the number of whole periods that fit is subtracted from
 * period_left (retrying if it changed underneath us) and that number is
 * returned.  last_period is refreshed from sample_period on every call.
 */
u64 perf_swevent_set_period(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        u64 period = hwc->last_period;
        u64 nr;
        s64 old, val;

        hwc->last_period = hwc->sample_period;

        do {
                old = val = local64_read(&hwc->period_left);
                if (val < 0)
                        return 0;

                nr = div64_u64(period + val, period);
                val -= nr * period;
        } while (local64_cmpxchg(&hwc->period_left, old, val) != old);

        return nr;
}

/*
 * Deliver @overflow overflows for a software event.  When @overflow is 0
 * the count is obtained from perf_swevent_set_period().  Throttling is
 * only enabled from the second overflow onwards, and delivery stops as
 * soon as __perf_event_overflow() reports non-zero.
 */
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
{
        struct hw_perf_event *hwc = &event->hw;
        int throttle = 0;

        if (!overflow)
                overflow = perf_swevent_set_period(event);

        if (hwc->interrupts == MAX_INTERRUPTS)
                return;

        while (overflow) {
                /*
                 * We inhibit the overflow from happening when
                 * hwc->interrupts == MAX_INTERRUPTS.
                 */
                if (__perf_event_overflow(event, throttle, data, regs))
                        break;

                throttle = 1;
                overflow--;
        }
}

/*
 * Account @nr occurrences of a software event and, for sampling events
 * with @regs available, generate the resulting overflow(s).
 */
static void perf_swevent_event(struct perf_event *event, u64 nr,
                               struct perf_sample_data *data,
                               struct pt_regs *regs)
{
        struct hw_perf_event *hwc = &event->hw;

        local64_add(nr, &event->count);

        /* Counting only: no registers, or not a sampling event. */
        if (!regs || !is_sampling_event(event))
                return;

        /* Fixed-period events that sample the period report @nr directly. */
        if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
                data->period = nr;
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        data->period = hwc->last_period;

        /* Period of one: every single event is an overflow. */
        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) {
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        /* Still negative after adding @nr: the period has not elapsed. */
        if (local64_add_negative(nr, &hwc->period_left))
                return;

        perf_swevent_overflow(event, 0, data, regs);
}

/*
 * Return 1 when @event must not see this occurrence: it is stopped, or
 * @regs is given and falls into a mode (user/kernel) the event excludes.
 * Return 0 otherwise.
 */
static int perf_exclude_event(struct perf_event *event,
                              struct pt_regs *regs)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 1;

        /* Without registers the mode filters cannot be applied. */
        if (!regs)
                return 0;

        if (event->attr.exclude_user && user_mode(regs))
                return 1;

        return event->attr.exclude_kernel && !user_mode(regs);
}

/*
 * Return 1 when @event is of @type with config @event_id and is not
 * excluded for @regs; 0 otherwise.  @data is not consulted.
 */
static int perf_swevent_match(struct perf_event *event,
                                enum perf_type_id type,
                                u32 event_id,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        return event->attr.type == type &&
               event->attr.config == event_id &&
               !perf_exclude_event(event, regs);
}

/*
 * Hash a (type, event id) pair down to SWEVENT_HLIST_BITS bits; @type
 * occupies the upper 32 bits of the hashed value, @event_id the lower.
 */
static inline u64 swevent_hash(u64 type, u32 event_id)
{
        return hash_64((type << 32) | event_id, SWEVENT_HLIST_BITS);
}

/* Return the bucket of @hlist that (@type, @event_id) hashes to. */
static inline struct hlist_head *
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
{
        return &hlist->heads[swevent_hash(type, event_id)];
}

/*
 * For the read side: events when they trigger.
 *
 * Returns the bucket for (@type, @event_id), or NULL when this CPU has no
 * hash list.  Uses rcu_dereference() on the hash list pointer.
 */
static inline struct hlist_head *
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
{
        struct swevent_hlist *hlist = rcu_dereference(swhash->swevent_hlist);

        return hlist ? __find_swevent_head(hlist, type, event_id) : NULL;
}

/*
 * For the event head insertion and removal in the hlist.
 *
 * Returns the bucket @event hashes to, or NULL when there is no hash list.
 */
static inline struct hlist_head *
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
{
        u64 type = event->attr.type;
        u32 event_id = event->attr.config;
        struct swevent_hlist *hlist;

        /*
         * Event scheduling is always serialized against hlist allocation
         * and release. Which makes the protected version suitable here.
         * The context lock guarantees that.
         */
        hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                          lockdep_is_held(&event->ctx->lock));

        return hlist ? __find_swevent_head(hlist, type, event_id) : NULL;
}

/*
 * Deliver @nr occurrences of software event (@type, @event_id) to every
 * matching event hashed on the current CPU, inside an RCU read section.
 */
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                                    u64 nr,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hlist_head *bucket;
        struct perf_event *event;

        rcu_read_lock();

        bucket = find_swevent_head_rcu(swhash, type, event_id);
        if (bucket) {
                hlist_for_each_entry_rcu(event, bucket, hlist_entry) {
                        if (!perf_swevent_match(event, type, event_id, data, regs))
                                continue;
                        perf_swevent_event(event, nr, data, regs);
                }
        }

        rcu_read_unlock();
}

/*
 * Per-CPU array of four struct pt_regs.
 * NOTE(review): presumably one scratch slot per recursion context - confirm
 * against the users of __perf_regs.
 */
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * Claim a recursion slot in this CPU's software-event state.
 * Returns whatever get_recursion_context() returns; callers in this file
 * treat a negative value as "already recursing".
 */
int perf_swevent_get_recursion_context(void)
{
        return get_recursion_context(this_cpu_ptr(&swevent_htable)->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);

/*
 * Release recursion slot @rctx previously obtained from
 * perf_swevent_get_recursion_context() on this CPU.
 */
void perf_swevent_put_recursion_context(int rctx)
{
        put_recursion_context(this_cpu_ptr(&swevent_htable)->recursion, rctx);
}

/*
 * Deliver @nr occurrences of PERF_TYPE_SOFTWARE event @event_id with sample
 * address @addr.  Warns once and bails out when @regs is NULL.  This
 * function does not take the recursion context itself.
 */
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        struct perf_sample_data sample;

        if (WARN_ON_ONCE(!regs))
                return;

        perf_sample_data_init(&sample, addr, 0);
        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &sample, regs);
}

/*
 * Preemption- and recursion-safe wrapper around ___perf_sw_event(): the
 * event is silently dropped when no recursion context can be obtained.
 */
void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        int rctx;

        preempt_disable_notrace();

        rctx = perf_swevent_get_recursion_context();
        if (!unlikely(rctx < 0)) {
                ___perf_sw_event(event_id, nr, regs, addr);
                perf_swevent_put_recursion_context(rctx);
        }

        preempt_enable_notrace();
}

/*
 * pmu::read callback for software events.  Intentionally empty:
 * perf_swevent_event() adds to event->count directly.
 */
static void perf_swevent_read(struct perf_event *event)
{
}

/*
 * pmu::add callback: hash @event into this CPU's software-event table.
 *
 * Sampling events get their period re-armed first.  The event starts out
 * stopped unless PERF_EF_START is set in @flags.
 *
 * Returns 0 on success, -EINVAL (with a one-time warning) when the CPU has
 * no hash list.
 */
static int perf_swevent_add(struct perf_event *event, int flags)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hw_perf_event *hwc = &event->hw;
        struct hlist_head *bucket;

        if (is_sampling_event(event)) {
                hwc->last_period = hwc->sample_period;
                perf_swevent_set_period(event);
        }

        hwc->state = !(flags & PERF_EF_START);

        bucket = find_swevent_head(swhash, event);
        if (WARN_ON_ONCE(!bucket))
                return -EINVAL;

        hlist_add_head_rcu(&event->hlist_entry, bucket);
        perf_event_update_userpage(event);

        return 0;
}

/*
 * pmu::del callback: unhash @event from the software-event table using the
 * RCU-safe list removal.  @flags is unused.
 */
static void perf_swevent_del(struct perf_event *event, int flags)
{
        hlist_del_rcu(&event->hlist_entry);
}

/*
 * pmu::start callback: clear all hw.state bits (including
 * PERF_HES_STOPPED) so the event is delivered again.  @flags is unused.
 */
static void perf_swevent_start(struct perf_event *event, int flags)
{
        event->hw.state = 0;
}

/*
 * pmu::stop callback: mark the event PERF_HES_STOPPED so that
 * perf_exclude_event() / perf_tp_event_match() skip it.  @flags is unused.
 */
static void perf_swevent_stop(struct perf_event *event, int flags)
{
        event->hw.state = PERF_HES_STOPPED;
}

/*
 * Deref the hlist from the update side; lockdep checks that
 * swhash->hlist_mutex is held.
 */
static inline struct swevent_hlist *
swevent_hlist_deref(struct swevent_htable *swhash)
{
        struct swevent_hlist *hlist;

        hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                          lockdep_is_held(&swhash->hlist_mutex));

        return hlist;
}

/*
 * Detach @swhash's hash list, if any, and free it via kfree_rcu().
 * Goes through swevent_hlist_deref(), which expects hlist_mutex held.
 */
static void swevent_hlist_release(struct swevent_htable *swhash)
{
        struct swevent_hlist *old = swevent_hlist_deref(swhash);

        if (old) {
                RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
                kfree_rcu(old, rcu_head);
        }
}

/*
 * Drop one reference on @cpu's software-event hash list, releasing the
 * list when the last reference goes away.
 */
static void swevent_hlist_put_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

        mutex_lock(&swhash->hlist_mutex);

        swhash->hlist_refcount--;
        if (swhash->hlist_refcount == 0)
                swevent_hlist_release(swhash);

        mutex_unlock(&swhash->hlist_mutex);
}

static void swevent_hlist_put(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                swevent_hlist_put_cpu(cpu);
}

/*
 * Take one reference on @cpu's software-event hash list, allocating the
 * list first if it does not exist yet and @cpu is in perf_online_mask.
 *
 * Returns 0 on success or -ENOMEM if the allocation failed, in which case
 * no reference is taken.
 */
static int swevent_hlist_get_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        struct swevent_hlist *hlist;
        int err = 0;

        mutex_lock(&swhash->hlist_mutex);

        if (!swevent_hlist_deref(swhash) &&
            cpumask_test_cpu(cpu, perf_online_mask)) {
                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
                if (hlist) {
                        rcu_assign_pointer(swhash->swevent_hlist, hlist);
                } else {
                        err = -ENOMEM;
                }
        }

        if (!err)
                swhash->hlist_refcount++;

        mutex_unlock(&swhash->hlist_mutex);

        return err;
}

/*
 * Take one hash-list reference on every possible CPU, under pmus_lock.
 *
 * On failure the references already taken (on the CPUs iterated before the
 * failing one) are dropped again and the error is returned.
 */
static int swevent_hlist_get(void)
{
        int err = 0, cpu, failed_cpu;

        mutex_lock(&pmus_lock);

        for_each_possible_cpu(cpu) {
                err = swevent_hlist_get_cpu(cpu);
                if (err) {
                        failed_cpu = cpu;
                        break;
                }
        }

        if (err) {
                /* Unwind every CPU visited before the one that failed. */
                for_each_possible_cpu(cpu) {
                        if (cpu == failed_cpu)
                                break;
                        swevent_hlist_put_cpu(cpu);
                }
        }

        mutex_unlock(&pmus_lock);

        return err;
}

/*
 * One static key per software event id: incremented in perf_swevent_init()
 * and decremented in sw_perf_event_destroy().
 */
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

/*
 * event->destroy callback installed by perf_swevent_init(): undo the
 * static-key increment and the hash-list references.  Only expected on
 * top-level events (warns if @event has a parent).
 */
static void sw_perf_event_destroy(struct perf_event *event)
{
        u64 id = event->attr.config;

        WARN_ON(event->parent);

        static_key_slow_dec(&perf_swevent_enabled[id]);
        swevent_hlist_put();
}

/*
 * pmu::event_init callback for generic software events.
 *
 * Returns -ENOENT for events this PMU does not handle (wrong type, the
 * CPU/task clock ids, or an id at or beyond PERF_COUNT_SW_MAX) and
 * -EOPNOTSUPP when branch-stack sampling is requested.  For top-level
 * events the hash lists are referenced and the matching static key is
 * enabled; an error from swevent_hlist_get() is passed through.
 */
static int perf_swevent_init(struct perf_event *event)
{
        u64 event_id = event->attr.config;
        int err;

        if (event->attr.type != PERF_TYPE_SOFTWARE)
                return -ENOENT;

        /*
         * no branch sampling for software events
         */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        /* The clock events are not served by this PMU. */
        if (event_id == PERF_COUNT_SW_CPU_CLOCK ||
            event_id == PERF_COUNT_SW_TASK_CLOCK)
                return -ENOENT;

        if (event_id >= PERF_COUNT_SW_MAX)
                return -ENOENT;

        /* Nothing more to set up when the event has a parent. */
        if (event->parent)
                return 0;

        err = swevent_hlist_get();
        if (err)
                return err;

        static_key_slow_inc(&perf_swevent_enabled[event_id]);
        event->destroy = sw_perf_event_destroy;

        return 0;
}

/*
 * PMU descriptor for generic software events, wiring the perf_swevent_*
 * callbacks defined above.
 */
static struct pmu perf_swevent = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,

        .event_init        = perf_swevent_init,
        .add                = perf_swevent_add,
        .del                = perf_swevent_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        /* Empty callback: the count is updated directly in perf_swevent_event(). */
        .read                = perf_swevent_read,
};

#ifdef CONFIG_EVENT_TRACING

/*
 * Return 1 when the raw record in @data passes @event's filter (or no
 * filter is set), 0 otherwise.
 */
static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
{
        void *record = data->raw->frag.data;

        /* only top level events have filters set */
        if (event->parent)
                event = event->parent;

        return (likely(!event->filter) ||
                filter_match_preds(event->filter, record)) ? 1 : 0;
}

/*
 * Return 1 when tracepoint @event should receive this occurrence: it is
 * not stopped, its exclude_kernel setting allows @regs, and the record
 * passes its filter.  Return 0 otherwise.
 */
static int perf_tp_event_match(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        /*
         * If exclude_kernel, only trace user-space tracepoints (uprobes)
         */
        return !(event->hw.state & PERF_HES_STOPPED) &&
               (!event->attr.exclude_kernel || user_mode(regs)) &&
               perf_tp_filter_match(event, data);
}

/*
 * Run the BPF programs attached to @call (if any) and then submit the
 * tracepoint record to perf.
 *
 * When programs are attached, @regs is stored at the start of @raw_data
 * before they run.  If trace_call_bpf() returns zero, or @head is empty,
 * the record is dropped and recursion context @rctx is released here;
 * otherwise perf_tp_event() takes over (and releases @rctx itself).
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
                               struct trace_event_call *call, u64 count,
                               struct pt_regs *regs, struct hlist_head *head,
                               struct task_struct *task)
{
        if (bpf_prog_array_valid(call)) {
                struct pt_regs **regs_slot = raw_data;

                *regs_slot = regs;
                if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
                        perf_swevent_put_recursion_context(rctx);
                        return;
                }
        }

        perf_tp_event(call->event.type, count, raw_data, size, regs, head,
                      rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);

/*
 * Deliver one tracepoint hit to every matching perf event.
 *
 * @event_type: trace event type, passed to perf_trace_buf_update() together
 *              with @record
 * @count:      value forwarded to perf_swevent_event() for each match
 * @record:     raw trace record; attached to the sample as its raw fragment
 * @entry_size: stored as the size of that raw fragment
 * @regs:       register state used for matching and forwarded on delivery
 * @head:       hlist of perf events (linked through hlist_entry) to scan
 * @rctx:       recursion context; always released before returning
 * @task:       optional target task; when set and not current, the events
 *              in its software context are scanned as well
 */
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                   struct pt_regs *regs, struct hlist_head *head, int rctx,
                   struct task_struct *task)
{
        struct perf_sample_data data;
        struct perf_event *event;

        struct perf_raw_record raw = {
                .frag = {
                        .size = entry_size,
                        .data = record,
                },
        };

        perf_sample_data_init(&data, 0, 0);
        data.raw = &raw;

        perf_trace_buf_update(record, event_type);

        /* first pass: events hanging off the tracepoint's own list */
        hlist_for_each_entry_rcu(event, head, hlist_entry) {
                if (perf_tp_event_match(event, &data, regs))
                        perf_swevent_event(event, count, &data, regs);
        }

        /*
         * If a target task was specified, also iterate its context and
         * deliver this event there too.
         */
        if (task && task != current) {
                struct perf_event_context *ctx;
                struct trace_entry *entry = record;

                rcu_read_lock();
                ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
                if (!ctx)
                        goto unlock;

                /*
                 * Only tracepoint events bound to this CPU whose config
                 * equals the record's type are candidates.
                 */
                list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                        if (event->cpu != smp_processor_id())
                                continue;
                        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                                continue;
                        if (event->attr.config != entry->type)
                                continue;
                        if (perf_tp_event_match(event, &data, regs))
                                perf_swevent_event(event, count, &data, regs);
                }
unlock:
                rcu_read_unlock();
        }

        perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

/*
 * ->destroy callback installed by perf_tp_event_init(); forwards to
 * perf_trace_destroy().
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
        perf_trace_destroy(event);
}

/*
 * ->event_init for the tracepoint PMU.
 *
 * Returns -ENOENT when the event is not of type PERF_TYPE_TRACEPOINT,
 * -EOPNOTSUPP when branch stack sampling was requested, the error from
 * perf_trace_init() if that fails, and 0 on success (after installing the
 * destroy callback).
 */
static int perf_tp_event_init(struct perf_event *event)
{
        int err;

        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -ENOENT;

        /* no branch sampling for tracepoint events */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        err = perf_trace_init(event);
        if (!err)
                event->destroy = tp_perf_event_destroy;

        return err;
}

/*
 * PMU descriptor for tracepoint events. It lives in the software task
 * context and shares the start/stop/read callbacks with the generic
 * software event PMU; add/del go through the perf_trace_*() helpers.
 */
static struct pmu perf_tracepoint = {
        .task_ctx_nr        = perf_sw_context,

        .event_init        = perf_tp_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
 * Flags in config, used by the dynamic kprobe and uprobe PMUs.
 * The flags must match the PMU_FORMAT_ATTR() definitions that follow.
 *
 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 *                               if not set, create kprobe/uprobe
 *
 * The following values specify a reference counter (or semaphore in the
 * terminology of tools like dtrace, systemtap, etc.) for Userspace
 * Statically Defined Tracepoints (USDT). Currently, 32 bits of config are
 * used for the offset.
 *
 * PERF_UPROBE_REF_CTR_OFFSET_BITS        # of bits in config used for the offset
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT        # of bits the offset is shifted left
 *                                        within config
 */
enum perf_probe_config {
        PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
        PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
        PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};

/* "retprobe" format attribute: bit 0 of config, see PERF_PROBE_CONFIG_IS_RETPROBE */
PMU_FORMAT_ATTR(retprobe, "config:0");
#endif

#ifdef CONFIG_KPROBE_EVENTS
/* Format attributes exposed by the kprobe PMU: only "retprobe". */
static struct attribute *kprobe_attrs[] = {
        &format_attr_retprobe.attr,
        NULL,
};

/* The attributes above are grouped under the name "format". */
static struct attribute_group kprobe_format_group = {
        .name = "format",
        .attrs = kprobe_attrs,
};

/* NULL-terminated group list referenced by perf_kprobe.attr_groups. */
static const struct attribute_group *kprobe_attr_groups[] = {
        &kprobe_format_group,
        NULL,
};

/* Forward declaration: the PMU descriptor below refers to its init hook. */
static int perf_kprobe_event_init(struct perf_event *event);

/*
 * PMU descriptor for kprobe events. Apart from event_init and the attribute
 * groups it uses the same callbacks as the tracepoint PMU.
 */
static struct pmu perf_kprobe = {
        .task_ctx_nr        = perf_sw_context,
        .event_init        = perf_kprobe_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
        .attr_groups        = kprobe_attr_groups,
};

/*
 * ->event_init for the kprobe PMU.
 *
 * Returns -ENOENT when the event's type is not this PMU's type, -EACCES
 * when the caller is not perfmon_capable(), -EOPNOTSUPP when branch stack
 * sampling was requested, or the error from perf_kprobe_init(). On success
 * the destroy callback is installed and 0 is returned.
 */
static int perf_kprobe_event_init(struct perf_event *event)
{
        bool is_retprobe;
        int err;

        if (event->attr.type != perf_kprobe.type)
                return -ENOENT;

        if (!perfmon_capable())
                return -EACCES;

        /* no branch sampling for probe events */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        /* bit 0 of config selects kretprobe over kprobe */
        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;

        err = perf_kprobe_init(event, is_retprobe);
        if (!err)
                event->destroy = perf_kprobe_destroy;

        return err;
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
/* "ref_ctr_offset" format attribute: bits 32-63 of config. */
PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");

/* Format attributes exposed by the uprobe PMU. */
static struct attribute *uprobe_attrs[] = {
        &format_attr_retprobe.attr,
        &format_attr_ref_ctr_offset.attr,
        NULL,
};

/* The attributes above are grouped under the name "format". */
static struct attribute_group uprobe_format_group = {
        .name = "format",
        .attrs = uprobe_attrs,
};

/* NULL-terminated group list referenced by perf_uprobe.attr_groups. */
static const struct attribute_group *uprobe_attr_groups[] = {
        &uprobe_format_group,
        NULL,
};

/* Forward declaration: the PMU descriptor below refers to its init hook. */
static int perf_uprobe_event_init(struct perf_event *event);

/*
 * PMU descriptor for uprobe events. Apart from event_init and the attribute
 * groups it uses the same callbacks as the tracepoint PMU.
 */
static struct pmu perf_uprobe = {
        .task_ctx_nr        = perf_sw_context,
        .event_init        = perf_uprobe_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
        .attr_groups        = uprobe_attr_groups,
};

/*
 * ->event_init for the uprobe PMU.
 *
 * Returns -ENOENT when the event's type is not this PMU's type, -EACCES
 * when the caller is not perfmon_capable(), -EOPNOTSUPP when branch stack
 * sampling was requested, or the error from perf_uprobe_init(). On success
 * the destroy callback is installed and 0 is returned.
 */
static int perf_uprobe_event_init(struct perf_event *event)
{
        unsigned long ref_ctr_offset;
        bool is_retprobe;
        int err;

        if (event->attr.type != perf_uprobe.type)
                return -ENOENT;

        if (!perfmon_capable())
                return -EACCES;

        /* no branch sampling for probe events */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        /* decode config: bit 0 is the retprobe flag, the top bits the offset */
        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
        ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;

        err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
        if (!err)
                event->destroy = perf_uprobe_destroy;

        return err;
}
#endif /* CONFIG_UPROBE_EVENTS */

/*
 * Register the tracepoint PMU and, when configured in, the kprobe and
 * uprobe PMUs. The return values of perf_pmu_register() are not checked
 * here.
 */
static inline void perf_tp_register(void)
{
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
#ifdef CONFIG_KPROBE_EVENTS
        /* NOTE(review): -1 presumably requests a dynamically assigned type id - confirm */
        perf_pmu_register(&perf_kprobe, "kprobe", -1);
#endif
#ifdef CONFIG_UPROBE_EVENTS
        perf_pmu_register(&perf_uprobe, "uprobe", -1);
#endif
}

/* Release the event's tracepoint filter via ftrace_profile_free_filter(). */
static void perf_event_free_filter(struct perf_event *event)
{
        ftrace_profile_free_filter(event);
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Overflow handler installed by perf_event_set_bpf_handler().
 *
 * Runs event->prog on the sample and chains to the original overflow
 * handler only when the program returns non-zero. If the per-CPU
 * bpf_prog_active counter is already non-zero on entry, the program is not
 * run and the sample is dropped (ret stays 0).
 */
static void bpf_overflow_handler(struct perf_event *event,
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs)
{
        struct bpf_perf_event_data_kern ctx = {
                .data = data,
                .event = event,
        };
        int ret = 0;

        ctx.regs = perf_arch_bpf_user_pt_regs(regs);
        /* only run the program when this is the sole active user on this CPU */
        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
                goto out;
        rcu_read_lock();
        ret = BPF_PROG_RUN(event->prog, &ctx);
        rcu_read_unlock();
out:
        /* balances the increment above on both paths */
        __this_cpu_dec(bpf_prog_active);
        if (!ret)
                return;

        event->orig_overflow_handler(event, data, regs);
}

/*
 * Attach the BPF program referred to by @prog_fd as the event's overflow
 * handler.
 *
 * Returns -EINVAL when the event already has an overflow handler context,
 * -EEXIST when a program is already attached, the error from
 * bpf_prog_get_type() for a bad fd or program type, -EPROTO for the
 * precise_ip/callchain restriction described below, and 0 on success.
 */
static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
{
        struct bpf_prog *prog;

        /* hw breakpoint or kernel counter */
        if (event->overflow_handler_context)
                return -EINVAL;

        if (event->prog)
                return -EEXIST;

        prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        if (event->attr.precise_ip && prog->call_get_stack) {
                int full_callchain =
                        (event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) &&
                        !event->attr.exclude_callchain_kernel &&
                        !event->attr.exclude_callchain_user;

                /*
                 * On perf_event with precise_ip, calling bpf_get_stack()
                 * may trigger unwinder warnings and occasional crashes.
                 * bpf_get_[stack|stackid] works around this issue by using
                 * the callchain attached to perf_sample_data. If the
                 * perf_event does not have a full (kernel and user)
                 * callchain attached to perf_sample_data, do not allow
                 * attaching a BPF program that calls
                 * bpf_get_[stack|stackid].
                 */
                if (!full_callchain) {
                        bpf_prog_put(prog);
                        return -EPROTO;
                }
        }

        /* remember the previous handler so it can be chained and restored */
        event->prog = prog;
        event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
        WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);

        return 0;
}

/*
 * Undo perf_event_set_bpf_handler(): restore the saved overflow handler,
 * clear event->prog and drop the program reference. Does nothing when no
 * program is attached.
 */
static void perf_event_free_bpf_handler(struct perf_event *event)
{
        struct bpf_prog *attached = event->prog;

        if (attached) {
                WRITE_ONCE(event->overflow_handler,
                           event->orig_overflow_handler);
                event->prog = NULL;
                bpf_prog_put(attached);
        }
}
#else
/* !CONFIG_BPF_SYSCALL: attaching a BPF overflow handler is not supported. */
static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
{
        return -EOPNOTSUPP;
}
/* !CONFIG_BPF_SYSCALL: nothing can be attached, so nothing to free. */
static void perf_event_free_bpf_handler(struct perf_event *event)
{
}
#endif

/*
 * Returns true if the event belongs to the tracepoint PMU, or to the
 * kprobe/uprobe PMUs (when those are configured in), i.e. a kprobe/uprobe
 * created with perf_event_open().
 */
static inline bool perf_event_is_tracing(struct perf_event *event)
{
        struct pmu *owner = event->pmu;

        if (owner == &perf_tracepoint)
                return true;
#ifdef CONFIG_KPROBE_EVENTS
        if (owner == &perf_kprobe)
                return true;
#endif
#ifdef CONFIG_UPROBE_EVENTS
        if (owner == &perf_uprobe)
                return true;
#endif
        return false;
}

/*
 * Attach the BPF program referred to by @prog_fd to @event.
 *
 * Non-tracing events are delegated to perf_event_set_bpf_handler(). For
 * tracing events the program type has to fit the kind of trace event, a
 * kprobe-override program is only accepted on a kprobe, and for
 * tracepoints the program must not access context beyond the offset
 * reported by trace_event_get_offsets().
 *
 * Returns 0 on success or a negative errno; the program reference is
 * dropped on every failure after it was taken.
 */
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
        bool is_kprobe, is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
        int ret;

        if (!perf_event_is_tracing(event))
                return perf_event_set_bpf_handler(event, prog_fd);

        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
        is_syscall_tp = is_syscall_trace_event(event->tp_event);

        /* bpf programs can only be attached to u/kprobe or tracepoint */
        if (!(is_kprobe || is_tracepoint || is_syscall_tp))
                return -EINVAL;

        prog = bpf_prog_get(prog_fd);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        ret = -EINVAL;

        /* valid fd, but invalid bpf program type */
        if (is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE)
                goto put_prog;
        if ((is_tracepoint || is_syscall_tp) &&
            prog->type != BPF_PROG_TYPE_TRACEPOINT)
                goto put_prog;

        /* Kprobe override only works for kprobes, not uprobes. */
        if (prog->kprobe_override &&
            !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
                goto put_prog;

        if (is_tracepoint || is_syscall_tp) {
                int off = trace_event_get_offsets(event->tp_event);

                ret = -EACCES;
                if (prog->aux->max_ctx_offset > off)
                        goto put_prog;
        }

        ret = perf_event_attach_bpf_prog(event, prog);
        if (!ret)
                return 0;

put_prog:
        bpf_prog_put(prog);
        return ret;
}

/*
 * Detach whatever BPF program perf_event_set_bpf_prog() attached: tracing
 * events use the detach helper, all other events the overflow-handler path.
 */
static void perf_event_free_bpf_prog(struct perf_event *event)
{
        if (perf_event_is_tracing(event))
                perf_event_detach_bpf_prog(event);
        else
                perf_event_free_bpf_handler(event);
}

#else

/* !CONFIG_EVENT_TRACING: there are no tracing PMUs to register. */
static inline void perf_tp_register(void)
{
}

/* !CONFIG_EVENT_TRACING: no tracepoint filter exists, nothing to free. */
static void perf_event_free_filter(struct perf_event *event)
{
}

/* !CONFIG_EVENT_TRACING: attaching a BPF program always fails with -ENOENT. */
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
        return -ENOENT;
}

/* !CONFIG_EVENT_TRACING: no program can be attached, nothing to free. */
static void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */

#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
 * Report a hit on breakpoint event @bp. @data is the struct pt_regs of the
 * hit. The sample address is the breakpoint address; nothing is delivered
 * when bp->hw.state is non-zero or perf_exclude_event() rejects the
 * register state.
 */
void perf_bp_event(struct perf_event *bp, void *data)
{
        struct pt_regs *regs = data;
        struct perf_sample_data sample;

        perf_sample_data_init(&sample, bp->attr.bp_addr, 0);

        if (bp->hw.state)
                return;

        if (perf_exclude_event(bp, regs))
                return;

        perf_swevent_event(bp, 1, &sample, regs);
}
#endif

/*
 * Allocate a zeroed address filter and append it to @filters.
 *
 * The allocation is made on the memory node of the event's CPU; events not
 * bound to a CPU (cpu == -1) use CPU 0's node. Returns the new filter, or
 * NULL when the allocation fails (in which case @filters is untouched).
 */
static struct perf_addr_filter *
perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
{
        struct perf_addr_filter *filter;
        int cpu = event->cpu;
        int node;

        if (cpu == -1)
                cpu = 0;
        node = cpu_to_node(cpu);

        filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
        if (filter) {
                INIT_LIST_HEAD(&filter->entry);
                list_add_tail(&filter->entry, filters);
        }

        return filter;
}

/*
 * Tear down every filter on @filters: unlink it, drop its path reference
 * and free it. The list is empty on return.
 */
static void free_filters_list(struct list_head *filters)
{
        struct perf_addr_filter *cur, *next;

        list_for_each_entry_safe(cur, next, filters, entry) {
                list_del(&cur->entry);
                path_put(&cur->path);
                kfree(cur);
        }
}

/*
 * Free existing address filters and optionally install new ones
 */
static void perf_addr_filters_splice(struct perf_event *event,
                                     struct list_head *head)
{
        unsigned long flags;
        LIST_HEAD(list);

        if (!has_addr_filter(event))
                return;

        /* don't bother with children, they don't have their own filters */
        if (event->parent)
                return;

        raw_spin_lock_irqsave(&event->addr_filters.lock, flags);

        list_splice_init(&event->addr_filters.list, &list);
        if (head)
                list_splice(head, &event->addr_filters.list);

        raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);

        free_filters_list(&list);
}

/*
 * Walk the file-backed vmas of @mm and stop at the first one for which
 * perf_addr_filter_vma_adjust() reports a match with @filter; that helper
 * is handed @fr, the range to adjust.
 * Called with mm::mmap_lock down for reading.
 */
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
                                   struct mm_struct *mm,
                                   struct perf_addr_filter_range *fr)
{
        struct vm_area_struct *vma = mm->mmap;

        while (vma) {
                /* anonymous mappings cannot match a file-based filter */
                if (vma->vm_file &&
                    perf_addr_filter_vma_adjust(filter, vma, fr))
                        break;

                vma = vma->vm_next;
        }
}

/*
 * Update event's address range filters based on the
 * task's existing mappings, if any.
 *
 * The ranges in event->addr_filter_ranges[] are rebuilt, in list order, from
 * the filters returned by perf_event_addr_filters(): file-based filters are
 * resolved against the task's mm, all others copy the filter's offset and
 * size. addr_filters_gen is bumped under the filter lock, and
 * perf_event_stop() is called at the end (also when the task has no mm).
 */
static void perf_event_addr_filters_apply(struct perf_event *event)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct task_struct *task = READ_ONCE(event->ctx->task);
        struct perf_addr_filter *filter;
        struct mm_struct *mm = NULL;
        unsigned int count = 0;
        unsigned long flags;

        /*
         * We may observe TASK_TOMBSTONE, which means that the event tear-down
         * will stop on the parent's child_mutex that our caller is also holding
         */
        if (task == TASK_TOMBSTONE)
                return;

        /* the mm is only needed when at least one filter refers to a file */
        if (ifh->nr_file_filters) {
                mm = get_task_mm(task);
                if (!mm)
                        goto restart;

                mmap_read_lock(mm);
        }

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (filter->path.dentry) {
                        /*
                         * Adjust base offset if the filter is associated to a
                         * binary that needs to be mapped:
                         */
                        event->addr_filter_ranges[count].start = 0;
                        event->addr_filter_ranges[count].size = 0;

                        perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
                } else {
                        event->addr_filter_ranges[count].start = filter->offset;
                        event->addr_filter_ranges[count].size  = filter->size;
                }

                count++;
        }

        event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        /* release the mm in the reverse order it was acquired */
        if (ifh->nr_file_filters) {
                mmap_read_unlock(mm);

                mmput(mm);
        }

restart:
        /* NOTE(review): second argument presumably requests a restart - confirm */
        perf_event_stop(event, 1);
}

/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified or is zero, the range is treated as a single
 * address; not valid for ACTION=="filter".
 */
/*
 * Token ids returned by match_token() for the if_tokens table.
 * IF_ACT_* are the ACTION keywords (their values also index the actions[]
 * table in the parser); IF_SRC_* are the RANGE_SPEC forms. IF_ACT_NONE
 * marks the terminating table entry.
 */
enum {
        IF_ACT_NONE = -1,
        IF_ACT_FILTER,
        IF_ACT_START,
        IF_ACT_STOP,
        IF_SRC_FILE,
        IF_SRC_KERNEL,
        IF_SRC_FILEADDR,
        IF_SRC_KERNELADDR,
};

/*
 * Parser states: expecting an ACTION token, expecting a RANGE_SPEC token,
 * or a complete filter definition that is ready to be validated.
 */
enum {
        IF_STATE_ACTION = 0,
        IF_STATE_SOURCE,
        IF_STATE_END,
};

/*
 * Token patterns for the address filter parser.
 * NOTE(review): the more specific RANGE_SPEC patterns are listed before the
 * shorter ones, presumably because match_token() returns the first match -
 * confirm before reordering.
 */
static const match_table_t if_tokens = {
        { IF_ACT_FILTER,        "filter" },
        { IF_ACT_START,                "start" },
        { IF_ACT_STOP,                "stop" },
        { IF_SRC_FILE,                "%u/%u@%s" },
        { IF_SRC_KERNEL,        "%u/%u" },
        { IF_SRC_FILEADDR,        "%u@%s" },
        { IF_SRC_KERNELADDR,        "%u" },
        { IF_ACT_NONE,                NULL },
};

/*
 * Address filter string parser
 *
 * Parses @fstr (a private copy is made, the caller's string is not
 * modified) into perf_addr_filter objects appended to @filters. Tokens are
 * separated by spaces, commas or newlines; empty tokens are skipped. Each
 * filter is an ACTION token followed by a RANGE_SPEC token.
 *
 * For every file-based filter that validates,
 * event->addr_filters.nr_file_filters is incremented; this function does
 * not undo that on a later failure.
 *
 * Returns 0 on success. On failure every filter on @filters is freed and a
 * negative errno is returned: -ENOMEM for allocation failures, -EOPNOTSUPP
 * for a file-based filter on an event without a task, the error from
 * kstrtoul()/kern_path() where those fail, and -EINVAL for anything
 * malformed or contradictory.
 */
static int
perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                             struct list_head *filters)
{
        struct perf_addr_filter *filter = NULL;
        char *start, *orig, *filename = NULL;
        substring_t args[MAX_OPT_ARGS];
        int state = IF_STATE_ACTION, token;
        unsigned int kernel = 0;
        int ret = -EINVAL;

        /* strsep() and the NUL-termination below modify the buffer */
        orig = fstr = kstrdup(fstr, GFP_KERNEL);
        if (!fstr)
                return -ENOMEM;

        while ((start = strsep(&fstr, " ,\n")) != NULL) {
                static const enum perf_addr_filter_action_t actions[] = {
                        [IF_ACT_FILTER]        = PERF_ADDR_FILTER_ACTION_FILTER,
                        [IF_ACT_START]        = PERF_ADDR_FILTER_ACTION_START,
                        [IF_ACT_STOP]        = PERF_ADDR_FILTER_ACTION_STOP,
                };
                ret = -EINVAL;

                if (!*start)
                        continue;

                /* filter definition begins */
                if (state == IF_STATE_ACTION) {
                        filter = perf_addr_filter_new(event, filters);
                        if (!filter) {
                                /* allocation failure, not a malformed string */
                                ret = -ENOMEM;
                                goto fail;
                        }
                }

                token = match_token(start, if_tokens, args);
                switch (token) {
                case IF_ACT_FILTER:
                case IF_ACT_START:
                case IF_ACT_STOP:
                        if (state != IF_STATE_ACTION)
                                goto fail;

                        filter->action = actions[token];
                        state = IF_STATE_SOURCE;
                        break;

                case IF_SRC_KERNELADDR:
                case IF_SRC_KERNEL:
                        kernel = 1;
                        fallthrough;

                case IF_SRC_FILEADDR:
                case IF_SRC_FILE:
                        if (state != IF_STATE_SOURCE)
                                goto fail;

                        /* terminate the first number in place for kstrtoul() */
                        *args[0].to = 0;
                        ret = kstrtoul(args[0].from, 0, &filter->offset);
                        if (ret)
                                goto fail;

                        /* only the "<start>/<size>" forms carry a size */
                        if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
                                *args[1].to = 0;
                                ret = kstrtoul(args[1].from, 0, &filter->size);
                                if (ret)
                                        goto fail;
                        }

                        if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
                                /* the path is the argument after the number(s) */
                                int fpos = token == IF_SRC_FILE ? 2 : 1;

                                kfree(filename);
                                filename = match_strdup(&args[fpos]);
                                if (!filename) {
                                        ret = -ENOMEM;
                                        goto fail;
                                }
                        }

                        state = IF_STATE_END;
                        break;

                default:
                        goto fail;
                }

                /*
                 * Filter definition is fully parsed, validate and install it.
                 * Make sure that it doesn't contradict itself or the event's
                 * attribute.
                 */
                if (state == IF_STATE_END) {
                        ret = -EINVAL;
                        if (kernel && event->attr.exclude_kernel)
                                goto fail;

                        /*
                         * ACTION "filter" must have a non-zero length region
                         * specified.
                         */
                        if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
                            !filter->size)
                                goto fail;

                        if (!kernel) {
                                if (!filename)
                                        goto fail;

                                /*
                                 * For now, we only support file-based filters
                                 * in per-task events; doing so for CPU-wide
                                 * events requires additional context switching
                                 * trickery, since same object code will be
                                 * mapped at different virtual addresses in
                                 * different processes.
                                 */
                                ret = -EOPNOTSUPP;
                                if (!event->ctx->task)
                                        goto fail;

                                /* look up the path and grab its inode */
                                ret = kern_path(filename, LOOKUP_FOLLOW,
                                                &filter->path);
                                if (ret)
                                        goto fail;

                                ret = -EINVAL;
                                if (!filter->path.dentry ||
                                    !S_ISREG(d_inode(filter->path.dentry)
                                             ->i_mode))
                                        goto fail;

                                event->addr_filters.nr_file_filters++;
                        }

                        /* ready to consume more filters */
                        kfree(filename);
                        filename = NULL;
                        state = IF_STATE_ACTION;
                        filter = NULL;
                        kernel = 0;
                }
        }

        /* an ACTION without its RANGE_SPEC leaves the parser mid-definition */
        if (state != IF_STATE_ACTION)
                goto fail;

        kfree(filename);
        kfree(orig);

        return 0;

fail:
        kfree(filename);
        free_filters_list(filters);
        kfree(orig);

        return ret;
}

/*
 * Parse @filter_str, have the PMU validate the result and, if both succeed,
 * swap the new filters in and re-apply them to the event and its children.
 *
 * On any failure the event's existing filters are left in place, the
 * file-filter count is reset to zero and the error is returned.
 */
static int
perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
{
        LIST_HEAD(filters);
        int ret;

        /*
         * Since this is called in perf_ioctl() path, we're already holding
         * ctx::mutex.
         */
        lockdep_assert_held(&event->ctx->mutex);

        if (WARN_ON_ONCE(event->parent))
                return -EINVAL;

        ret = perf_event_parse_addr_filter(event, filter_str, &filters);
        if (ret)
                goto reset_file_count;

        ret = event->pmu->addr_filters_validate(&filters);
        if (ret) {
                /* the parser succeeded, so the new list is ours to free */
                free_filters_list(&filters);
                goto reset_file_count;
        }

        /* remove existing filters, if any */
        perf_addr_filters_splice(event, &filters);

        /* install new filters */
        perf_event_for_each_child(event, perf_event_addr_filters_apply);

        return 0;

reset_file_count:
        event->addr_filters.nr_file_filters = 0;

        return ret;
}

/*
 * Set a filter on @event from the user-space string at @arg.
 *
 * At most PAGE_SIZE bytes are copied in with strndup_user(). Tracing events
 * (when CONFIG_EVENT_TRACING is enabled) get a tracepoint filter; otherwise
 * events with address filter support get an address filter. Any other event
 * yields -EINVAL. The copied string is always freed before returning.
 */
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
        int ret = -EINVAL;
        char *filter_str;

        filter_str = strndup_user(arg, PAGE_SIZE);
        if (IS_ERR(filter_str))
                return PTR_ERR(filter_str);

#ifdef CONFIG_EVENT_TRACING
        if (perf_event_is_tracing(event)) {
                struct perf_event_context *ctx = event->ctx;

                /*
                 * Beware, here be dragons!!
                 *
                 * the tracepoint muck will deadlock against ctx->mutex, but
                 * the tracepoint stuff does not actually need it. So
                 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
                 * already have a reference on ctx.
                 *
                 * This can result in event getting moved to a different ctx,
                 * but that does not affect the tracepoint state.
                 */
                mutex_unlock(&ctx->mutex);
                ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
                mutex_lock(&ctx->mutex);
        } else
#endif
        /* note: with CONFIG_EVENT_TRACING this is the body of the else above */
        if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);

        kfree(filter_str);
        return ret;
}

/*
 * hrtimer based swevent callback
 */

/*
 * hrtimer callback driving sampling for hrtimer-based software events.
 *
 * Inactive events stop the timer. Otherwise the event is read, and an
 * overflow is raised if interrupt registers are available, the event does
 * not exclude them, and the event is not excluding idle while the idle task
 * runs. The timer is then forwarded by the sample period, clamped to at
 * least 10000 ns, and restarts unless the overflow path asked it to stop.
 */
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
        struct perf_event *event =
                container_of(hrtimer, struct perf_event, hw.hrtimer);
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_sample_data data;
        struct pt_regs *regs;
        u64 period;

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return HRTIMER_NORESTART;

        event->pmu->read(event);

        perf_sample_data_init(&data, 0, event->hw.last_period);
        regs = get_irq_regs();

        if (regs && !perf_exclude_event(event, regs) &&
            !(event->attr.exclude_idle && is_idle_task(current)) &&
            __perf_event_overflow(event, 1, &data, regs))
                ret = HRTIMER_NORESTART;

        period = max_t(u64, 10000, event->hw.sample_period);
        hrtimer_forward_now(hrtimer, ns_to_ktime(period));

        return ret;
}

/*
 * Arm the sampling hrtimer of @event; a no-op for non-sampling events.
 *
 * A non-zero period_left is consumed (reset to 0) and used as the first
 * expiry, with negative values replaced by 10000 ns. Otherwise the first
 * expiry is the sample period, clamped to at least 10000 ns.
 */
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 period;

        if (!is_sampling_event(event))
                return;

        period = local64_read(&hwc->period_left);
        if (!period) {
                period = max_t(u64, 10000, hwc->sample_period);
        } else {
                local64_set(&hwc->period_left, 0);
                if (period < 0)
                        period = 10000;
        }

        hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
                      HRTIMER_MODE_REL_PINNED_HARD);
}

/*
 * Stop the sampling hrtimer of @event; a no-op for non-sampling events.
 * The time remaining until expiry is saved in period_left before the timer
 * is cancelled.
 */
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        ktime_t remaining;

        if (!is_sampling_event(event))
                return;

        remaining = hrtimer_get_remaining(&hwc->hrtimer);
        local64_set(&hwc->period_left, ktime_to_ns(remaining));

        hrtimer_cancel(&hwc->hrtimer);
}

/*
 * Set up the sampling hrtimer of @event; a no-op for non-sampling events.
 *
 * For frequency-based events the frequency is converted once into a fixed
 * period (NSEC_PER_SEC / sample_freq) and the event is switched to period
 * mode by clearing attr.freq.
 */
static void perf_swevent_init_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        long freq;

        if (!is_sampling_event(event))
                return;

        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hwc->hrtimer.function = perf_swevent_hrtimer;

        if (!event->attr.freq)
                return;

        /*
         * Since hrtimers have a fixed rate, we can do a static freq->period
         * mapping and avoid the whole period adjust feedback stuff.
         */
        freq = event->attr.sample_freq;

        event->attr.sample_period = NSEC_PER_SEC / freq;
        hwc->sample_period = event->attr.sample_period;
        local64_set(&hwc->period_left, hwc->sample_period);
        hwc->last_period = hwc->sample_period;
        event->attr.freq = 0;
}

/*
 * Software event: cpu wall time clock
 */

/*
 * Fold the local_clock() time elapsed since the previous update into the
 * event count and remember the new timestamp in hw.prev_count.
 */
static void cpu_clock_event_update(struct perf_event *event)
{
        u64 now = local_clock();
        s64 prev = local64_xchg(&event->hw.prev_count, now);

        local64_add(now - prev, &event->count);
}

/*
 * Start counting: record the current local_clock() value as the baseline
 * and arm the sampling hrtimer. @flags is not used.
 */
static void cpu_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, local_clock());
        perf_swevent_start_hrtimer(event);
}

/*
 * Stop counting: cancel the sampling hrtimer, then account the time elapsed
 * since the last update. @flags is not used.
 */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
        perf_swevent_cancel_hrtimer(event);
        cpu_clock_event_update(event);
}

/*
 * ->add callback: start the event when PERF_EF_START is set in @flags, then
 * refresh the user page. Always returns 0.
 */
static int cpu_clock_event_add(struct perf_event *event, int flags)
{
        if (flags & PERF_EF_START)
                cpu_clock_event_start(event, flags);
        perf_event_update_userpage(event);

        return 0;
}

/* ->del callback: removing the event simply stops it. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
        cpu_clock_event_stop(event, flags);
}

/* ->read callback: bring the count up to the current local_clock() time. */
static void cpu_clock_event_read(struct perf_event *event)
{
        cpu_clock_event_update(event);
}

/*
 * ->event_init for the cpu-clock PMU.
 *
 * Returns -ENOENT unless the event is PERF_TYPE_SOFTWARE with config
 * PERF_COUNT_SW_CPU_CLOCK, -EOPNOTSUPP when branch stack sampling was
 * requested, and 0 after the sampling hrtimer has been initialised.
 */
static int cpu_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != PERF_TYPE_SOFTWARE ||
            event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
                return -ENOENT;

        /* no branch sampling for software events */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);

        return 0;
}

/*
 * PMU descriptor for the cpu-clock software event. It lives in the software
 * task context and advertises PERF_PMU_CAP_NO_NMI.
 */
static struct pmu perf_cpu_clock = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,

        .event_init        = cpu_clock_event_init,
        .add                = cpu_clock_event_add,
        .del                = cpu_clock_event_del,
        .start                = cpu_clock_event_start,
        .stop                = cpu_clock_event_stop,
        .read                = cpu_clock_event_read,
};

/*
 * Software event: task time clock
 */

/*
 * Advance the task-clock count to @now: store @now as the new
 * hw.prev_count and add the distance from the old value to event->count.
 */
static void task_clock_event_update(struct perf_event *event, u64 now)
{
        u64 prev = local64_xchg(&event->hw.prev_count, now);
        s64 delta = now - prev;

        local64_add(delta, &event->count);
}

/*
 * Start counting: use the context's current time (event->ctx->time) as
 * the reference point and start the event's software hrtimer.
 * @flags is not used here.
 */
static void task_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, event->ctx->time);
        perf_swevent_start_hrtimer(event);
}

/*
 * Stop counting: call perf_swevent_cancel_hrtimer() for the event and
 * account the time up to the context's current time (event->ctx->time).
 * @flags is not used here.
 */
static void task_clock_event_stop(struct perf_event *event, int flags)
{
        perf_swevent_cancel_hrtimer(event);
        task_clock_event_update(event, event->ctx->time);
}

/*
 * pmu::add callback for the task-clock event.
 *
 * Starts the event only when the caller passed PERF_EF_START, and always
 * refreshes the user page afterwards. Always returns 0.
 */
static int task_clock_event_add(struct perf_event *event, int flags)
{
        if (flags & PERF_EF_START)
                task_clock_event_start(event, flags);
        perf_event_update_userpage(event);

        return 0;
}

/*
 * pmu::del callback for the task-clock event: stops the event.
 * Note that the caller's @flags are ignored; PERF_EF_UPDATE is always
 * passed down to task_clock_event_stop().
 */
static void task_clock_event_del(struct perf_event *event, int flags)
{
        task_clock_event_stop(event, PERF_EF_UPDATE);
}

/*
 * pmu::read callback for the task-clock event.
 *
 * Extrapolates the context time to "now" by adding the perf_clock()
 * time that has passed since ctx->timestamp to ctx->time, and updates
 * the count against that value.
 */
static void task_clock_event_read(struct perf_event *event)
{
        u64 elapsed = perf_clock() - event->ctx->timestamp;
        u64 ctx_now = event->ctx->time + elapsed;

        task_clock_event_update(event, ctx_now);
}

/*
 * pmu::event_init callback for the task-clock event.
 *
 * Returns -ENOENT when the event is not a PERF_TYPE_SOFTWARE /
 * PERF_COUNT_SW_TASK_CLOCK event, -EOPNOTSUPP when branch sampling was
 * requested, and 0 once the event's hrtimer has been initialised.
 */
static int task_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != PERF_TYPE_SOFTWARE ||
            event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
                return -ENOENT;

        /* Software events cannot do branch sampling. */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);
        return 0;
}

/*
 * PMU descriptor for the task-clock software event. It lives in the
 * perf_sw_context task context and wires up the task_clock_event_*()
 * callbacks defined above.
 */
static struct pmu perf_task_clock = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,

        .event_init        = task_clock_event_init,
        .add                = task_clock_event_add,
        .del                = task_clock_event_del,
        .start                = task_clock_event_start,
        .stop                = task_clock_event_stop,
        .read                = task_clock_event_read,
};

/*
 * Do-nothing stub. perf_pmu_register() installs it as pmu_enable,
 * pmu_disable or cancel_txn when the PMU does not provide its own.
 */
static void perf_pmu_nop_void(struct pmu *pmu)
{
}

/*
 * Do-nothing start_txn stub, installed by perf_pmu_register() for PMUs
 * that provide neither start_txn nor pmu_enable.
 */
static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
{
}

/*
 * commit_txn stub that always reports success (0), installed by
 * perf_pmu_register() for PMUs that provide neither start_txn nor
 * pmu_enable.
 */
static int perf_pmu_nop_int(struct pmu *pmu)
{
        return 0;
}

/*
 * Default check_period callback: accepts any @value by returning 0.
 * Installed by perf_pmu_register() when the PMU has no check_period.
 */
static int perf_event_nop_int(struct perf_event *event, u64 value)
{
        return 0;
}

/* Flags of the transaction currently open on this CPU (0 when none). */
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);

/*
 * Default start_txn for PMUs that have pmu_enable/pmu_disable.
 *
 * Remembers @flags for the matching commit/cancel call and disables the
 * PMU, unless @flags has any bit other than PERF_PMU_TXN_ADD set, in
 * which case the PMU is left untouched.
 */
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
        __this_cpu_write(nop_txn_flags, flags);

        if (!(flags & ~PERF_PMU_TXN_ADD))
                perf_pmu_disable(pmu);
}

static int perf_pmu_commit_txn(struct pmu *pmu)
{
        unsigned int flags = __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (flags & ~PERF_PMU_TXN_ADD)
                return 0;

        perf_pmu_enable(pmu);
        return 0;
}

static void perf_pmu_cancel_txn(struct pmu *pmu)
{
        unsigned int flags =  __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (flags & ~PERF_PMU_TXN_ADD)
                return;

        perf_pmu_enable(pmu);
}

/*
 * Default event_idx callback: always reports index 0. Installed by
 * perf_pmu_register() when the PMU has no event_idx of its own.
 */
static int perf_event_idx_default(struct perf_event *event)
{
        return 0;
}

/*
 * Look up the per-CPU context already used by a registered PMU with the
 * same task_ctx_nr, so that all PMUs sharing a task context number also
 * share one pmu_cpu_context.
 *
 * Returns NULL for a negative @ctxn or when no PMU on the pmus list
 * uses @ctxn.
 */
static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
        struct perf_cpu_context __percpu *shared = NULL;
        struct pmu *pmu;

        if (ctxn >= 0) {
                list_for_each_entry(pmu, &pmus, entry) {
                        if (pmu->task_ctx_nr == ctxn) {
                                shared = pmu->pmu_cpu_context;
                                break;
                        }
                }
        }

        return shared;
}

/*
 * Release the per-CPU context of a PMU that is going away.
 *
 * Static contexts such as perf_sw_context have a global lifetime and
 * may be shared between different PMUs, so only contexts whose
 * task_ctx_nr is not above perf_invalid_context are actually freed.
 */
static void free_pmu_context(struct pmu *pmu)
{
        if (pmu->task_ctx_nr <= perf_invalid_context)
                free_percpu(pmu->pmu_cpu_context);
}

/*
 * Let userspace know that this PMU supports address range filtering:
 * sysfs show handler that prints pmu->nr_addr_filters as a decimal
 * number followed by a newline. The PMU is taken from the device's
 * drvdata.
 */
static ssize_t nr_addr_filters_show(struct device *dev,
                                    struct device_attribute *attr,
                                    char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);

/* IDR mapping PMU type numbers to their struct pmu. */
static struct idr pmu_idr;

/*
 * sysfs show handler for the "type" attribute: prints pmu->type as a
 * decimal number followed by a newline.
 */
static ssize_t
type_show(struct device *dev, struct device_attribute *attr, char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);

/*
 * sysfs show handler for "perf_event_mux_interval_ms": prints
 * pmu->hrtimer_interval_ms as a decimal number followed by a newline.
 */
static ssize_t
perf_event_mux_interval_ms_show(struct device *dev,
                                struct device_attribute *attr,
                                char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
}

/* Serialises writers of the multiplexing interval. */
static DEFINE_MUTEX(mux_interval_mutex);

/*
 * sysfs store handler for "perf_event_mux_interval_ms".
 *
 * Parses @buf as an integer number of milliseconds, stores it in
 * pmu->hrtimer_interval_ms and pushes the new interval to the PMU's
 * per-CPU context on every online CPU, restarting each CPU's mux
 * hrtimer via perf_mux_hrtimer_restart_ipi().
 *
 * Returns @count on success (including when the value is unchanged),
 * the kstrtoint() error for unparsable input, or -EINVAL for values
 * below 1.
 */
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf, size_t count)
{
        struct pmu *pmu = dev_get_drvdata(dev);
        int timer, cpu, ret;
        u64 interval_ns;

        ret = kstrtoint(buf, 0, &timer);
        if (ret)
                return ret;

        if (timer < 1)
                return -EINVAL;

        /* same value, nothing to do */
        if (timer == pmu->hrtimer_interval_ms)
                return count;

        /*
         * Widen before multiplying: NSEC_PER_MSEC is a long constant, so
         * on 32-bit builds NSEC_PER_MSEC * timer would overflow for any
         * interval above roughly 2147 ms.
         */
        interval_ns = (u64)NSEC_PER_MSEC * timer;

        mutex_lock(&mux_interval_mutex);
        pmu->hrtimer_interval_ms = timer;

        /* update all cpuctx for this PMU */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct perf_cpu_context *cpuctx;

                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                cpuctx->hrtimer_interval = ns_to_ktime(interval_ns);

                cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpuctx);
        }
        cpus_read_unlock();
        mutex_unlock(&mux_interval_mutex);

        return count;
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);

/*
 * Attributes attached to every PMU device on the event_source bus.
 * The position of each entry matters: pmu_dev_is_visible() identifies
 * nr_addr_filters by its index (2) in this array.
 */
static struct attribute *pmu_dev_attrs[] = {
        &dev_attr_type.attr,
        &dev_attr_perf_event_mux_interval_ms.attr,
        &dev_attr_nr_addr_filters.attr,
        NULL,
};

static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
{
        struct device *dev = kobj_to_dev(kobj);
        struct pmu *pmu = dev_get_drvdata(dev);

        if (n == 2 && !pmu->nr_addr_filters)
                return 0;

        return a->mode;
}

/* Attribute group for PMU devices; visibility is decided per attribute. */
static struct attribute_group pmu_dev_attr_group = {
        .is_visible = pmu_dev_is_visible,
        .attrs = pmu_dev_attrs,
};

/* NULL-terminated list of attribute groups used as pmu_bus.dev_groups. */
static const struct attribute_group *pmu_dev_groups[] = {
        &pmu_dev_attr_group,
        NULL,
};

/*
 * Non-zero once the event_source bus is usable; perf_pmu_register() and
 * perf_pmu_unregister() only touch PMU devices when this is set.
 */
static int pmu_bus_running;
/* The "event_source" bus that PMU devices are attached to. */
static struct bus_type pmu_bus = {
        .name                = "event_source",
        .dev_groups        = pmu_dev_groups,
};

/*
 * Device release callback installed by pmu_dev_alloc(): frees the
 * struct device that pmu_dev_alloc() allocated with kzalloc().
 */
static void pmu_dev_release(struct device *dev)
{
        kfree(dev);
}

static int pmu_dev_alloc(struct pmu *pmu)
{
        int ret = -ENOMEM;

        pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
        if (!pmu->dev)
                goto out;

        pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);

        dev_set_drvdata(pmu->dev, pmu);
        pmu->dev->bus = &pmu_bus;
        pmu->dev->release = pmu_dev_release;

        ret = dev_set_name(pmu->dev, "%s", pmu->name);
        if (ret)
                goto free_dev;

        ret = device_add(pmu->dev);
        if (ret)
                goto free_dev;

        if (pmu->attr_update) {
                ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
                if (ret)
                        goto del_dev;
        }

out:
        return ret;

del_dev:
        device_del(pmu->dev);

free_dev:
        put_device(pmu->dev);
        goto out;
}

static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;

int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
        int cpu, ret, max = PERF_TYPE_MAX;

        mutex_lock(&pmus_lock);
        ret = -ENOMEM;
        pmu->pmu_disable_count = alloc_percpu(int);
        if (!pmu->pmu_disable_count)
                goto unlock;

        pmu->type = -1;
        if (!name)
                goto skip_type;
        pmu->name = name;

        if (type != PERF_TYPE_SOFTWARE) {
                if (type >= 0)
                        max = type;

                ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
                if (ret < 0)
                        goto free_pdc;

                WARN_ON(type >= 0 && ret != type);

                type = ret;
        }
        pmu->type = type;

        if (pmu_bus_running) {
                ret = pmu_dev_alloc(pmu);
                if (ret)
                        goto free_idr;
        }

skip_type:
        if (pmu->task_ctx_nr == perf_hw_context) {
                static int hw_context_taken = 0;

                /*
                 * Other than systems with heterogeneous CPUs, it never makes
                 * sense for two PMUs to share perf_hw_context. PMUs which are
                 * uncore must use perf_invalid_context.
                 */
                if (WARN_ON_ONCE(hw_context_taken &&
                    !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
                        pmu->task_ctx_nr = perf_invalid_context;

                hw_context_taken = 1;
        }

        pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
        if (pmu->pmu_cpu_context)
                goto got_cpu_context;

        ret = -ENOMEM;
        pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
        if (!pmu->pmu_cpu_context)
                goto free_dev;

        for_each_possible_cpu(cpu) {
                struct perf_cpu_context *cpuctx;

                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                __perf_event_init_context(&cpuctx->ctx);
                lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->ctx.pmu = pmu;
                cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);

                __perf_mux_hrtimer_init(cpuctx, cpu);

                cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
                cpuctx->heap = cpuctx->heap_default;
        }

got_cpu_context:
        if (!pmu->start_txn) {
                if (pmu->pmu_enable) {
                        /*
                         * If we have pmu_enable/pmu_disable calls, install
                         * transaction stubs that use that to try and batch
                         * hardware accesses.
                         */
                        pmu->start_txn  = perf_pmu_start_txn;
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
                        pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
        }

        if (!pmu->pmu_enable) {
                pmu->pmu_enable  = perf_pmu_nop_void;
                pmu->pmu_disable = perf_pmu_nop_void;
        }

        if (!pmu->check_period)
                pmu->check_period = perf_event_nop_int;

        if (!pmu->event_idx)
                pmu->event_idx = perf_event_idx_default;

        /*
         * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
         * since these cannot be in the IDR. This way the linear search
         * is fast, provided a valid software event is provided.
         */
        if (type == PERF_TYPE_SOFTWARE || !name)
                list_add_rcu(&pmu->entry, &pmus);
        else
                list_add_tail_rcu(&pmu->entry, &pmus);

        atomic_set(&pmu->exclusive_cnt, 0);
        ret = 0;
unlock:
        mutex_unlock(&pmus_lock);

        return ret;

free_dev:
        device_del(pmu->dev);
        put_device(pmu->dev);

free_idr:
        if (pmu->type != PERF_TYPE_SOFTWARE)
                idr_remove(&pmu_idr, pmu->type);

free_pdc:
        free_percpu(pmu->pmu_disable_count);
        goto unlock;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);

void perf_pmu_unregister(struct pmu *pmu)
{
        mutex_lock(&pmus_lock);
        list_del_rcu(&pmu->entry);

        /*
         * We dereference the pmu list under both SRCU and regular RCU, so
         * synchronize against both of those.
         */
        synchronize_srcu(&pmus_srcu);
        synchronize_rcu();

        free_percpu(pmu->pmu_disable_count);
        if (pmu->type != PERF_TYPE_SOFTWARE)
                idr_remove(&pmu_idr, pmu->type);
        if (pmu_bus_running) {
                if (pmu->nr_addr_filters)
                        device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
                device_del(pmu->dev);
                put_device(pmu->dev);
        }
        free_pmu_context(pmu);
        mutex_unlock(&pmus_lock);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);

static inline bool has_extended_regs(struct perf_event *event)
{
        return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
               (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
}

/*
 * Try to initialise @event on @pmu.
 *
 * Takes a reference on the PMU's module, calls pmu->event_init() and,
 * if that succeeds, rejects events the PMU's capabilities do not allow
 * (extended registers without PERF_PMU_CAP_EXTENDED_REGS, exclude flags
 * with PERF_PMU_CAP_NO_EXCLUDE).
 *
 * Returns 0 on success with the module reference held, -ENODEV when the
 * module reference cannot be taken, or the error from event_init() or
 * the capability checks. On any failure after try_module_get() the
 * module reference is dropped again.
 */
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
        struct perf_event_context *ctx = NULL;
        int ret;

        if (!try_module_get(pmu->module))
                return -ENODEV;

        /*
         * A number of pmu->event_init() methods iterate the sibling_list to,
         * for example, validate if the group fits on the PMU. Therefore,
         * if this is a sibling event, acquire the ctx->mutex to protect
         * the sibling_list.
         */
        if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
                /*
                 * This ctx->mutex can nest when we're called through
                 * inheritance. See the perf_event_ctx_lock_nested() comment.
                 */
                ctx = perf_event_ctx_lock_nested(event->group_leader,
                                                 SINGLE_DEPTH_NESTING);
                BUG_ON(!ctx);
        }

        event->pmu = pmu;
        ret = pmu->event_init(event);

        if (ctx)
                perf_event_ctx_unlock(event->group_leader, ctx);

        if (!ret) {
                if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
                    has_extended_regs(event))
                        ret = -EOPNOTSUPP;

                if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
                    event_has_any_exclude_flag(event))
                        ret = -EINVAL;

                /*
                 * event_init() succeeded but the event is rejected after
                 * all: undo it. Clear the callback once it has run so a
                 * later attempt on another PMU, or a later teardown,
                 * cannot invoke the same destroy a second time.
                 */
                if (ret && event->destroy) {
                        event->destroy(event);
                        event->destroy = NULL;
                }
        }

        if (ret)
                module_put(pmu->module);

        return ret;
}

/*
 * Find a PMU that accepts @event and initialise the event on it.
 *
 * Lookup order, all under the pmus_srcu read lock:
 *  1. the parent event's PMU, if there is a parent;
 *  2. the PMU registered in pmu_idr for the event's type, where
 *     PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE are first looked up as
 *     PERF_TYPE_RAW;
 *  3. every PMU on the pmus list, stopping at the first one that does
 *     not answer -ENOENT.
 *
 * Returns the PMU that took the event, or an ERR_PTR() carrying the
 * error (-ENOENT when no PMU claimed the event).
 */
static struct pmu *perf_init_event(struct perf_event *event)
{
        int idx, type, ret;
        struct pmu *pmu;

        idx = srcu_read_lock(&pmus_srcu);

        /* Try parent's PMU first: */
        if (event->parent && event->parent->pmu) {
                pmu = event->parent->pmu;
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        goto unlock;
        }

        /*
         * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
         * are often aliases for PERF_TYPE_RAW.
         */
        type = event->attr.type;
        if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
                type = PERF_TYPE_RAW;

again:
        rcu_read_lock();
        pmu = idr_find(&pmu_idr, type);
        rcu_read_unlock();
        if (pmu) {
                ret = perf_try_init_event(pmu, event);
                /*
                 * The aliased lookup did not claim the event: retry with
                 * the type the user actually asked for. After the retry
                 * type == attr.type, so this loops at most once.
                 */
                if (ret == -ENOENT && event->attr.type != type) {
                        type = event->attr.type;
                        goto again;
                }

                if (ret)
                        pmu = ERR_PTR(ret);

                goto unlock;
        }

        /* No IDR entry for this type: fall back to a linear search. */
        list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        goto unlock;

                /* Any error other than -ENOENT ends the search. */
                if (ret != -ENOENT) {
                        pmu = ERR_PTR(ret);
                        goto unlock;
                }
        }
        pmu = ERR_PTR(-ENOENT);
unlock:
        srcu_read_unlock(&pmus_srcu, idx);

        return pmu;
}

/*
 * Add @event to the pmu_sb_events list of the CPU it is bound to
 * (event->cpu), under that list's raw spinlock.
 */
static void attach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&pel->lock);
        list_add_rcu(&event->sb_list, &pel->list);
        raw_spin_unlock(&pel->lock);
}

/*
 * All !task (and therefore per-cpu) events that need to receive
 * side-band records are kept on a dedicated per-CPU list, which avoids
 * scanning every PMU's per-cpu context to find them.
 *
 * Put @event on that list if it is such a side-band event.
 */
static void account_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        attach_sb_event(event);
}

/*
 * Per-CPU accounting for a new event: bump @cpu's perf_cgroup_events
 * counter for cgroup events. Inherited events (those with a parent) are
 * not accounted.
 */
static void account_event_cpu(struct perf_event *event, int cpu)
{
        if (!event->parent && is_cgroup_event(event))
                atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}

/*
 * Freq events need the tick to stay alive (see perf_event_task_tick).
 *
 * With CONFIG_NO_HZ_FULL, count the new freq event and set the
 * TICK_DEP_BIT_PERF_EVENTS tick dependency when it is the first one.
 * Without CONFIG_NO_HZ_FULL this function does nothing.
 */
static void account_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        /* Lock so we don't race with concurrent unaccount */
        spin_lock(&nr_freq_lock);
        if (atomic_inc_return(&nr_freq_events) == 1)
                tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Count a new freq event. When nohz-full is enabled the accounting goes
 * through account_freq_event_nohz(); otherwise a plain increment of
 * nr_freq_events is enough.
 */
static void account_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_inc(&nr_freq_events);
                return;
        }

        account_freq_event_nohz();
}


/*
 * Account a newly created event in the global and per-CPU counters.
 *
 * Bumps the nr_*_events counter for every side-band feature the event's
 * attr requests, and takes a perf_sched_count reference (enabling the
 * perf_sched_events static key on the first one) when the event needs
 * the scheduler hooks. Inherited events (those with a parent) are not
 * accounted at all.
 */
static void account_event(struct perf_event *event)
{
        /* Set when this event needs the perf scheduling hooks. */
        bool inc = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                inc = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_inc(&nr_mmap_events);
        if (event->attr.comm)
                atomic_inc(&nr_comm_events);
        if (event->attr.namespaces)
                atomic_inc(&nr_namespaces_events);
        if (event->attr.cgroup)
                atomic_inc(&nr_cgroup_events);
        if (event->attr.task)
                atomic_inc(&nr_task_events);
        if (event->attr.freq)
                account_freq_event();
        if (event->attr.context_switch) {
                atomic_inc(&nr_switch_events);
                inc = true;
        }
        if (has_branch_stack(event))
                inc = true;
        if (is_cgroup_event(event))
                inc = true;
        if (event->attr.ksymbol)
                atomic_inc(&nr_ksymbol_events);
        if (event->attr.bpf_event)
                atomic_inc(&nr_bpf_events);
        if (event->attr.text_poke)
                atomic_inc(&nr_text_poke_events);

        if (inc) {
                /*
                 * We need the mutex here because static_branch_enable()
                 * must complete *before* the perf_sched_count increment
                 * becomes visible.
                 */
                /* Fast path: the count is already non-zero, no mutex needed. */
                if (atomic_inc_not_zero(&perf_sched_count))
                        goto enabled;

                mutex_lock(&perf_sched_mutex);
                if (!atomic_read(&perf_sched_count)) {
                        static_branch_enable(&perf_sched_events);
                        /*
                         * Guarantee that all CPUs observe the key change and
                         * call the perf scheduling hooks before proceeding to
                         * install events that need them.
                         */
                        synchronize_rcu();
                }
                /*
                 * Now that we have waited for the sync_sched(), allow further
                 * increments to by-pass the mutex.
                 */
                atomic_inc(&perf_sched_count);
                mutex_unlock(&perf_sched_mutex);
        }
enabled:

        account_event_cpu(event, event->cpu);

        account_pmu_sb_event(event);
}

/*
 * Allocate and initialize an event structure
 *
 * @attr:             attributes; copied into event->attr.
 * @cpu:              CPU to bind to; -1 is accepted only together with
 *                    a @task.
 * @task:             target task, or NULL. A reference is taken and
 *                    stored in event->hw.target.
 * @group_leader:     leader of the group, or NULL to make the event its
 *                    own leader.
 * @parent_event:     event this one is inherited from, or NULL. Clock
 *                    and, absent @overflow_handler, the overflow handler
 *                    are taken from the parent.
 * @overflow_handler: overflow callback, or NULL to use the parent's or
 *                    the default ring-buffer output handler.
 * @context:          opaque pointer stored as overflow_handler_context.
 * @cgroup_fd:        cgroup file descriptor, or -1 for none.
 *
 * Returns the new event, or an ERR_PTR() on failure. On failure
 * everything acquired along the way is released via the err_* labels,
 * which unwind in the reverse order of acquisition.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
                 struct task_struct *task,
                 struct perf_event *group_leader,
                 struct perf_event *parent_event,
                 perf_overflow_handler_t overflow_handler,
                 void *context, int cgroup_fd)
{
        struct pmu *pmu;
        struct perf_event *event;
        struct hw_perf_event *hwc;
        long err = -EINVAL;

        /* An out-of-range cpu is only valid as "-1 with a task". */
        if ((unsigned)cpu >= nr_cpu_ids) {
                if (!task || cpu != -1)
                        return ERR_PTR(-EINVAL);
        }

        event = kzalloc(sizeof(*event), GFP_KERNEL);
        if (!event)
                return ERR_PTR(-ENOMEM);

        /*
         * Single events are their own group leaders, with an
         * empty sibling list:
         */
        if (!group_leader)
                group_leader = event;

        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        INIT_LIST_HEAD(&event->active_list);
        init_event_group(event);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
        INIT_LIST_HEAD(&event->addr_filters.list);
        INIT_HLIST_NODE(&event->hlist_entry);


        init_waitqueue_head(&event->waitq);
        event->pending_disable = -1;
        init_irq_work(&event->pending, perf_pending_event);

        mutex_init(&event->mmap_mutex);
        raw_spin_lock_init(&event->addr_filters.lock);

        atomic_long_set(&event->refcount, 1);
        event->cpu                = cpu;
        event->attr                = *attr;
        event->group_leader        = group_leader;
        event->pmu                = NULL;
        event->oncpu                = -1;

        event->parent                = parent_event;

        /* Reference dropped again at err_ns on failure. */
        event->ns                = get_pid_ns(task_active_pid_ns(current));
        event->id                = atomic64_inc_return(&perf_event_id);

        event->state                = PERF_EVENT_STATE_INACTIVE;

        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
                /*
                 * XXX pmu::event_init needs to know what task to account to
                 * and we cannot use the ctx information because we need the
                 * pmu before we get a ctx.
                 */
                event->hw.target = get_task_struct(task);
        }

        event->clock = &local_clock;
        if (parent_event)
                event->clock = parent_event->clock;

        /* Inherit the parent's overflow handler when none was given. */
        if (!overflow_handler && parent_event) {
                overflow_handler = parent_event->overflow_handler;
                context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
                if (overflow_handler == bpf_overflow_handler) {
                        struct bpf_prog *prog = parent_event->prog;

                        bpf_prog_inc(prog);
                        event->prog = prog;
                        event->orig_overflow_handler =
                                parent_event->orig_overflow_handler;
                }
#endif
        }

        if (overflow_handler) {
                event->overflow_handler        = overflow_handler;
                event->overflow_handler_context = context;
        } else if (is_write_backward(event)){
                event->overflow_handler = perf_event_output_backward;
                event->overflow_handler_context = NULL;
        } else {
                event->overflow_handler = perf_event_output_forward;
                event->overflow_handler_context = NULL;
        }

        perf_event__state_init(event);

        pmu = NULL;

        hwc = &event->hw;
        hwc->sample_period = attr->sample_period;
        /* Frequency mode starts out with a period of 1. */
        if (attr->freq && attr->sample_freq)
                hwc->sample_period = 1;
        hwc->last_period = hwc->sample_period;

        local64_set(&hwc->period_left, hwc->sample_period);

        /*
         * We currently do not support PERF_SAMPLE_READ on inherited events.
         * See perf_output_read().
         */
        if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
                goto err_ns;

        if (!has_branch_stack(event))
                event->attr.branch_sample_type = 0;

        pmu = perf_init_event(event);
        if (IS_ERR(pmu)) {
                err = PTR_ERR(pmu);
                goto err_ns;
        }

        /*
         * Disallow uncore-cgroup events, they don't make sense as the cgroup will
         * be different on other CPUs in the uncore mask.
         */
        if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
                err = -EINVAL;
                goto err_pmu;
        }

        if (event->attr.aux_output &&
            !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
                err = -EOPNOTSUPP;
                goto err_pmu;
        }

        if (cgroup_fd != -1) {
                err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
                if (err)
                        goto err_pmu;
        }

        err = exclusive_event_init(event);
        if (err)
                goto err_pmu;

        if (has_addr_filter(event)) {
                event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
                                                    sizeof(struct perf_addr_filter_range),
                                                    GFP_KERNEL);
                if (!event->addr_filter_ranges) {
                        err = -ENOMEM;
                        goto err_per_task;
                }

                /*
                 * Clone the parent's vma offsets: they are valid until exec()
                 * even if the mm is not shared with the parent.
                 */
                if (event->parent) {
                        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

                        raw_spin_lock_irq(&ifh->lock);
                        memcpy(event->addr_filter_ranges,
                               event->parent->addr_filter_ranges,
                               pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
                        raw_spin_unlock_irq(&ifh->lock);
                }

                /* force hw sync on the address filters */
                event->addr_filters_gen = 1;
        }

        /* Only top-level (non-inherited) events take callchain buffers. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
                        err = get_callchain_buffers(attr->sample_max_stack);
                        if (err)
                                goto err_addr_filters;
                }
        }

        err = security_perf_event_alloc(event);
        if (err)
                goto err_callchain_buffer;

        /* symmetric to unaccount_event() in _free_event() */
        account_event(event);

        return event;

err_callchain_buffer:
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
        }
err_addr_filters:
        kfree(event->addr_filter_ranges);

err_per_task:
        exclusive_event_destroy(event);

err_pmu:
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
        if (event->destroy)
                event->destroy(event);
        /* Drop the module reference held since perf_init_event() succeeded. */
        module_put(pmu->module);
err_ns:
        if (event->ns)
                put_pid_ns(event->ns);
        if (event->hw.target)
                put_task_struct(event->hw.target);
        kfree(event);

        return ERR_PTR(err);
}

/*
 * Copy a perf_event_attr from user space into @attr and validate it.
 *
 * @uattr: user-space attribute structure; its size field says how many
 *         bytes user space provided.
 * @attr:  kernel copy to fill in. It is zeroed first, so fields beyond
 *         a shorter user structure read as 0.
 *
 * Besides copying, this normalises a few fields: attr->size is set to
 * the effective size, branch_sample_type gets privilege-level bits
 * derived from the exclude_* flags when none were given, and a zero
 * sample_max_stack is replaced by sysctl_perf_event_max_stack.
 *
 * Returns 0 on success or a negative errno. For an unacceptable size
 * the kernel's sizeof(*attr) is written back to uattr->size and -E2BIG
 * is returned.
 */
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr)
{
        u32 size;
        int ret;

        /* Zero the full structure, so that a short copy will be nice. */
        memset(attr, 0, sizeof(*attr));

        ret = get_user(size, &uattr->size);
        if (ret)
                return ret;

        /* ABI compatibility quirk: */
        if (!size)
                size = PERF_ATTR_SIZE_VER0;
        if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
                goto err_size;

        ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
        if (ret) {
                if (ret == -E2BIG)
                        goto err_size;
                return ret;
        }

        attr->size = size;

        if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
                return -EINVAL;

        /* Reject bits outside the known sample_type / read_format sets. */
        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
                return -EINVAL;

        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
                return -EINVAL;

        if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
                u64 mask = attr->branch_sample_type;

                /* only using defined bits */
                if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
                        return -EINVAL;

                /* at least one branch bit must be set */
                if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
                        return -EINVAL;

                /* propagate priv level, when not set for branch */
                if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {

                        /* exclude_kernel checked on syscall entry */
                        if (!attr->exclude_kernel)
                                mask |= PERF_SAMPLE_BRANCH_KERNEL;

                        if (!attr->exclude_user)
                                mask |= PERF_SAMPLE_BRANCH_USER;

                        if (!attr->exclude_hv)
                                mask |= PERF_SAMPLE_BRANCH_HV;
                        /*
                         * adjust user setting (for HW filter setup)
                         */
                        attr->branch_sample_type = mask;
                }
                /* privileged levels capture (kernel, hv): check permissions */
                if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
                        ret = perf_allow_kernel(attr);
                        if (ret)
                                return ret;
                }
        }

        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
                ret = perf_reg_validate(attr->sample_regs_user);
                if (ret)
                        return ret;
        }

        if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
                if (!arch_perf_have_user_stack_dump())
                        return -ENOSYS;

                /*
                 * We have __u32 type for the size, but so far
                 * we can only use __u16 as maximum due to the
                 * __u16 sample size limit.
                 */
                if (attr->sample_stack_user >= USHRT_MAX)
                        return -EINVAL;
                else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
                        return -EINVAL;
        }

        if (!attr->sample_max_stack)
                attr->sample_max_stack = sysctl_perf_event_max_stack;

        /*
         * The result of this last validation is carried in ret to the
         * final return at "out"; ret is 0 here if the check is skipped.
         */
        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
                ret = perf_reg_validate(attr->sample_regs_intr);

#ifndef CONFIG_CGROUP_PERF
        if (attr->sample_type & PERF_SAMPLE_CGROUP)
                return -EINVAL;
#endif

out:
        return ret;

err_size:
        /* Tell user space which size this kernel expects (best effort). */
        put_user(sizeof(*attr), &uattr->size);
        ret = -E2BIG;
        goto out;
}

/*
 * Take two mutexes in a fixed order: the one at the lower address is always
 * acquired first, so every caller agrees on the order for a given pair
 * regardless of the order the arguments were passed in. The second
 * acquisition is annotated with SINGLE_DEPTH_NESTING.
 */
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
        struct mutex *first = a;
        struct mutex *second = b;

        if (b < a) {
                first = b;
                second = a;
        }

        mutex_lock(first);
        mutex_lock_nested(second, SINGLE_DEPTH_NESTING);
}

/*
 * Redirect the output of @event into the ring buffer of @output_event, or
 * detach @event from its current buffer when @output_event is NULL.
 *
 * Returns 0 on success and -EINVAL when the two events are incompatible,
 * when @event has an active mmap(), or when @output_event has no usable
 * buffer.
 */
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
        struct perf_buffer *rb = NULL;
        int ret = -EINVAL;

        if (output_event) {
                /* An event cannot be redirected into itself. */
                if (output_event == event)
                        return ret;

                /* Buffers are never shared across CPUs. */
                if (event->cpu != output_event->cpu)
                        return ret;

                /* A buffer that is not per-cpu must belong to the same task. */
                if (output_event->cpu == -1 &&
                    event->hw.target != output_event->hw.target)
                        return ret;

                /* Both events must timestamp with the same clock. */
                if (event->clock != output_event->clock)
                        return ret;

                /*
                 * The buffer is written either from the beginning or from
                 * the end; both events must agree on the direction.
                 */
                if (is_write_backward(event) != is_write_backward(output_event))
                        return ret;

                /* Two AUX-producing events must share one PMU. */
                if (has_aux(event) && has_aux(output_event) &&
                    event->pmu != output_event->pmu)
                        return ret;

                /*
                 * Both mmap_mutexes are held to serialize against
                 * perf_mmap_close(). output_event already sits on
                 * rb->event_list and the list walk restarts after each
                 * removal, so either the new event is observed there, or
                 * output_event is already gone and we are guaranteed to
                 * see !rb->mmap_count below.
                 */
                mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
        } else {
                mutex_lock(&event->mmap_mutex);
        }

        /* Redirecting is refused while this event has an active mmap(). */
        if (atomic_read(&event->mmap_count))
                goto unlock;

        if (output_event) {
                /* Grab the buffer we are redirecting into. */
                rb = ring_buffer_get(output_event);
                if (!rb)
                        goto unlock;

                /* Lost the race against perf_mmap_close()? */
                if (!atomic_read(&rb->mmap_count)) {
                        ring_buffer_put(rb);
                        goto unlock;
                }
        }

        ring_buffer_attach(event, rb);
        ret = 0;

unlock:
        mutex_unlock(&event->mmap_mutex);
        if (output_event)
                mutex_unlock(&output_event->mmap_mutex);

        return ret;
}

/*
 * Select the time source for @event according to @clk_id.
 *
 * Returns 0 on success. Returns -EINVAL for an unsupported clock id, or for
 * a clock that is not NMI safe when the event's PMU does not advertise
 * PERF_PMU_CAP_NO_NMI (event->clock has already been updated in that case).
 */
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
        switch (clk_id) {
        /* NMI-safe accessors: accepted without looking at the PMU. */
        case CLOCK_MONOTONIC:
                event->clock = &ktime_get_mono_fast_ns;
                return 0;

        case CLOCK_MONOTONIC_RAW:
                event->clock = &ktime_get_raw_fast_ns;
                return 0;

        /* Everything below is not NMI safe and needs the PMU check. */
        case CLOCK_REALTIME:
                event->clock = &ktime_get_real_ns;
                break;

        case CLOCK_BOOTTIME:
                event->clock = &ktime_get_boottime_ns;
                break;

        case CLOCK_TAI:
                event->clock = &ktime_get_clocktai_ns;
                break;

        default:
                return -EINVAL;
        }

        if (event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)
                return 0;

        return -EINVAL;
}

/*
 * Like perf_event_ctx_lock_nested(), but two context mutexes are taken:
 * the one of @group_leader's current context and the one of @ctx.
 *
 * Returns the group leader's context with a reference held and with both
 * mutexes locked. The whole sequence is retried until the context that was
 * sampled is still the leader's context once the locks are held.
 */
static struct perf_event_context *
__perf_event_ctx_lock_double(struct perf_event *group_leader,
                             struct perf_event_context *ctx)
{
        struct perf_event_context *gctx;

        for (;;) {
                /* Sample the leader's context and try to pin it under RCU. */
                rcu_read_lock();
                gctx = READ_ONCE(group_leader->ctx);
                if (!refcount_inc_not_zero(&gctx->refcount)) {
                        rcu_read_unlock();
                        continue;
                }
                rcu_read_unlock();

                mutex_lock_double(&gctx->mutex, &ctx->mutex);

                /* Still the leader's context? Then we are done. */
                if (group_leader->ctx == gctx)
                        return gctx;

                /* The leader moved meanwhile: drop everything and retry. */
                mutex_unlock(&ctx->mutex);
                mutex_unlock(&gctx->mutex);
                put_ctx(gctx);
        }
}

/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:        event_id type attributes for monitoring/sampling
 * @pid:                target pid
 * @cpu:                target cpu
 * @group_fd:                group leader event fd
 * @flags:                PERF_FLAG_* bits; any bit outside PERF_FLAG_ALL is
 *                        rejected with -EINVAL
 *
 * Return: the new event file descriptor on success, a negative errno value
 * on failure.
 */
SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_attr __user *, attr_uptr,
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
        struct perf_event *group_leader = NULL, *output_event = NULL;
        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
        struct perf_event_context *ctx, *gctx;
        struct file *event_file = NULL;
        struct fd group = {NULL, 0};
        struct task_struct *task = NULL;
        struct pmu *pmu;
        int event_fd;
        int move_group = 0;
        int err;
        int f_flags = O_RDWR;
        int cgroup_fd = -1;

        /* for future expandability... */
        if (flags & ~PERF_FLAG_ALL)
                return -EINVAL;

        /* Copy the attributes from user space into the local @attr. */
        err = perf_copy_attr(attr_uptr, &attr);
        if (err)
                return err;

        /* Do we allow access to perf_event_open(2) ? */
        err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
        if (err)
                return err;

        if (!attr.exclude_kernel) {
                err = perf_allow_kernel(&attr);
                if (err)
                        return err;
        }

        if (attr.namespaces) {
                if (!perfmon_capable())
                        return -EACCES;
        }

        /*
         * Frequency mode is bounded by the sysctl sample rate; in period
         * mode the top bit of the period must be clear.
         */
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
        } else {
                if (attr.sample_period & (1ULL << 63))
                        return -EINVAL;
        }

        /* Only privileged users can get physical addresses */
        if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
                err = perf_allow_kernel(&attr);
                if (err)
                        return err;
        }

        /* REGS_INTR can leak data, lockdown must prevent this */
        if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
                err = security_locked_down(LOCKDOWN_PERF);
                if (err)
                        return err;
        }

        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
         * designates the cpu on which to monitor threads from that
         * cgroup.
         */
        if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
                return -EINVAL;

        if (flags & PERF_FLAG_FD_CLOEXEC)
                f_flags |= O_CLOEXEC;

        /*
         * Reserve the fd number now. It is only published through
         * fd_install() at the very end and is given back with
         * put_unused_fd() on every failure path.
         */
        event_fd = get_unused_fd_flags(f_flags);
        if (event_fd < 0)
                return event_fd;

        /*
         * @group_fd names an existing event. Depending on @flags it serves
         * as the group leader, as the output redirection target, or both.
         */
        if (group_fd != -1) {
                err = perf_fget_light(group_fd, &group);
                if (err)
                        goto err_fd;
                group_leader = group.file->private_data;
                if (flags & PERF_FLAG_FD_OUTPUT)
                        output_event = group_leader;
                if (flags & PERF_FLAG_FD_NO_GROUP)
                        group_leader = NULL;
        }

        /* Outside cgroup mode a pid other than -1 selects the target task. */
        if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
                task = find_lively_task_by_vpid(pid);
                if (IS_ERR(task)) {
                        err = PTR_ERR(task);
                        goto err_group_fd;
                }
        }

        /* A per-task event must agree with its leader on attr.inherit. */
        if (task && group_leader &&
            group_leader->attr.inherit != attr.inherit) {
                err = -EINVAL;
                goto err_task;
        }

        if (flags & PERF_FLAG_PID_CGROUP)
                cgroup_fd = pid;

        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
                                 NULL, NULL, cgroup_fd);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_task;
        }

        /* Sampling is refused on PMUs flagged PERF_PMU_CAP_NO_INTERRUPT. */
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                        err = -EOPNOTSUPP;
                        goto err_alloc;
                }
        }

        /*
         * Special case software events and allow them to be part of
         * any hardware group.
         */
        pmu = event->pmu;

        if (attr.use_clockid) {
                err = perf_event_set_clock(event, attr.clockid);
                if (err)
                        goto err_alloc;
        }

        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;

        if (group_leader) {
                if (is_software_event(event) &&
                    !in_software_context(group_leader)) {
                        /*
                         * If the event is a sw event, but the group_leader
                         * is on hw context.
                         *
                         * Allow the addition of software events to hw
                         * groups, this is safe because software events
                         * never fail to schedule.
                         */
                        pmu = group_leader->ctx->pmu;
                } else if (!is_software_event(event) &&
                           is_software_event(group_leader) &&
                           (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
                        /*
                         * In case the group is a pure software group, and we
                         * try to add a hardware event, move the whole group to
                         * the hardware context.
                         */
                        move_group = 1;
                }
        }

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(pmu, task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_alloc;
        }

        /*
         * Look up the group leader (we will attach this event to it):
         */
        if (group_leader) {
                /* Every failed check below bails out with -EINVAL. */
                err = -EINVAL;

                /*
                 * Do not allow a recursive hierarchy (this new sibling
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
                        goto err_context;

                /* All events in a group should have the same clock */
                if (group_leader->clock != event->clock)
                        goto err_context;

                /*
                 * Make sure we're both events for the same CPU;
                 * grouping events for different CPUs is broken; since
                 * you can never concurrently schedule them anyhow.
                 */
                if (group_leader->cpu != event->cpu)
                        goto err_context;

                /*
                 * Make sure we're both on the same task, or both
                 * per-CPU events.
                 */
                if (group_leader->ctx->task != ctx->task)
                        goto err_context;

                /*
                 * Do not allow to attach to a group in a different task
                 * or CPU context. If we're moving SW events, we'll fix
                 * this up later, so allow that.
                 *
                 * Racy, not holding group_leader->ctx->mutex, see comment with
                 * perf_event_ctx_lock().
                 */
                if (!move_group && group_leader->ctx != ctx)
                        goto err_context;

                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
                        goto err_context;
        }

        if (output_event) {
                err = perf_event_set_output(event, output_event);
                if (err)
                        goto err_context;
        }

        /*
         * From here on the event is owned by event_file: once it is set,
         * the error paths release the event through fput() instead of
         * free_event() (see err_file / err_alloc below).
         */
        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
                                        f_flags);
        if (IS_ERR(event_file)) {
                err = PTR_ERR(event_file);
                event_file = NULL;
                goto err_context;
        }

        if (task) {
                err = down_read_interruptible(&task->signal->exec_update_lock);
                if (err)
                        goto err_file;

                /*
                 * Preserve ptrace permission check for backwards compatibility.
                 *
                 * We must hold exec_update_lock across this and any potential
                 * perf_install_in_context() call for this new event to
                 * serialize against exec() altering our credentials (and the
                 * perf_event_exit_task() that could imply).
                 */
                err = -EACCES;
                if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
                        goto err_cred;
        }

        if (move_group) {
                /* Takes a reference on gctx and locks gctx->mutex + ctx->mutex. */
                gctx = __perf_event_ctx_lock_double(group_leader, ctx);

                if (gctx->task == TASK_TOMBSTONE) {
                        err = -ESRCH;
                        goto err_locked;
                }

                /*
                 * Check if we raced against another sys_perf_event_open() call
                 * moving the software group underneath us.
                 */
                if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
                        /*
                         * If someone moved the group out from under us, check
                         * if this new event wound up on the same ctx, if so
                         * its the regular !move_group case, otherwise fail.
                         */
                        if (gctx != ctx) {
                                err = -EINVAL;
                                goto err_locked;
                        } else {
                                perf_event_ctx_unlock(group_leader, gctx);
                                move_group = 0;
                                goto not_move_group;
                        }
                }

                /*
                 * Failure to create exclusive events returns -EBUSY.
                 */
                err = -EBUSY;
                if (!exclusive_event_installable(group_leader, ctx))
                        goto err_locked;

                for_each_sibling_event(sibling, group_leader) {
                        if (!exclusive_event_installable(sibling, ctx))
                                goto err_locked;
                }
        } else {
                mutex_lock(&ctx->mutex);

                /*
                 * Now that we hold ctx->mutex, (re)validate group_leader->ctx == ctx,
                 * see the group_leader && !move_group test earlier.
                 */
                if (group_leader && group_leader->ctx != ctx) {
                        err = -EINVAL;
                        goto err_locked;
                }
        }
not_move_group:

        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
                goto err_locked;
        }

        if (!perf_event_validate_size(event)) {
                err = -E2BIG;
                goto err_locked;
        }

        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                 *
                 * We use the perf_cpu_context::ctx::mutex to serialize against
                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
                 */
                struct perf_cpu_context *cpuctx =
                        container_of(ctx, struct perf_cpu_context, ctx);

                if (!cpuctx->online) {
                        err = -ENODEV;
                        goto err_locked;
                }
        }

        if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
                err = -EINVAL;
                goto err_locked;
        }

        /*
         * Must be under the same ctx::mutex as perf_install_in_context(),
         * because we need to serialize with concurrent event creation.
         */
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_locked;
        }

        WARN_ON_ONCE(ctx->parent_ctx);

        /*
         * This is the point of no return; we cannot fail hereafter. This is
         * where we start modifying current state.
         */

        if (move_group) {
                /*
                 * See perf_event_ctx_lock() for comments on the details
                 * of swizzling perf_event::ctx.
                 */
                perf_remove_from_context(group_leader, 0);
                put_ctx(gctx);

                for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
                        put_ctx(gctx);
                }

                /*
                 * Wait for everybody to stop referencing the events through
                 * the old lists, before installing it on new lists.
                 */
                synchronize_rcu();

                /*
                 * Install the group siblings before the group leader.
                 *
                 * Because a group leader will try and install the entire group
                 * (through the sibling list, which is still intact), we can
                 * end up with siblings installed in the wrong context.
                 *
                 * By installing siblings first we NO-OP because they're not
                 * reachable through the group lists.
                 */
                for_each_sibling_event(sibling, group_leader) {
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
                        get_ctx(ctx);
                }

                /*
                 * Removing from the context ends up with disabled
                 * event. What we want here is event in the initial
                 * startup state, ready to be added into the new context.
                 */
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
                get_ctx(ctx);
        }

        /*
         * Precalculate sample_data sizes; do while holding ctx::mutex such
         * that we're serialized against further additions and before
         * perf_install_in_context() which is the point the event is active and
         * can use these values.
         */
        perf_event__header_size(event);
        perf_event__id_header_size(event);

        event->owner = current;

        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);

        if (move_group)
                perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);

        if (task) {
                up_read(&task->signal->exec_update_lock);
                put_task_struct(task);
        }

        /* Link the event into the list of events owned by the calling task. */
        mutex_lock(&current->perf_event_mutex);
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);

        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
         * of the group leader will find the pointer to itself in
         * perf_group_detach().
         */
        fdput(group);
        fd_install(event_fd, event_file);
        return event_fd;

        /*
         * Error unwinding: each label undoes what had been set up before the
         * corresponding goto and then falls through to the labels below it.
         */
err_locked:
        if (move_group)
                perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);
err_cred:
        if (task)
                up_read(&task->signal->exec_update_lock);
err_file:
        fput(event_file);
err_context:
        perf_unpin_context(ctx);
        put_ctx(ctx);
err_alloc:
        /*
         * If event_file is set, the fput() above will have called ->release()
         * and that will take care of freeing the event.
         */
        if (!event_file)
                free_event(event);
err_task:
        if (task)
                put_task_struct(task);
err_group_fd:
        fdput(group);
err_fd:
        put_unused_fd(event_fd);
        return err;
}

/**
 * perf_event_create_kernel_counter - create and install an in-kernel event
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu to which the counter is bound
 * @task: task to profile (NULL for percpu)
 * @overflow_handler: handed to perf_event_alloc() for the new event
 * @context: handed to perf_event_alloc() together with @overflow_handler
 *
 * Return: the installed event on success, otherwise an ERR_PTR():
 * -EINVAL when @attr requests aux_output, -ESRCH when the target context is
 * already dead, -ENODEV when a per-cpu event targets a cpu context that is
 * not online, -EBUSY when the event is not installable because of
 * exclusivity, or whatever perf_event_alloc() / find_get_context() failed
 * with.
 */
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
                                 struct task_struct *task,
                                 perf_overflow_handler_t overflow_handler,
                                 void *context)
{
        struct perf_event_context *ctx;
        struct perf_event *event;
        int err;

        /*
         * Kernel events support neither grouping nor 'AUX' output; refuse
         * callers that ask for the latter.
         */
        if (attr->aux_output)
                return ERR_PTR(-EINVAL);

        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
                                 overflow_handler, context, -1);
        if (IS_ERR(event))
                return ERR_PTR(PTR_ERR(event));

        /* The tombstone owner tells kernel events apart from user events. */
        event->owner = TASK_TOMBSTONE;

        /* Look up the target context (task or percpu). */
        ctx = find_get_context(event->pmu, task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto free_evt;
        }

        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);

        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
        } else if (!task &&
                   !container_of(ctx, struct perf_cpu_context, ctx)->online) {
                /*
                 * Per-cpu event on a cpu that is not online. The
                 * perf_cpu_context::ctx::mutex serializes this check
                 * against the hotplug notifiers, see
                 * perf_event_{init,exit}_cpu().
                 */
                err = -ENODEV;
        } else if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
        } else {
                perf_install_in_context(ctx, event, event->cpu);
                perf_unpin_context(ctx);
                mutex_unlock(&ctx->mutex);

                return event;
        }

        /* One of the checks above failed: undo the context lookup. */
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
free_evt:
        free_event(event);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
{
        struct perf_event_context *src_ctx;
        struct perf_event_context *dst_ctx;
        struct perf_event *event, *tmp;
        LIST_HEAD(events);

        src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
        dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;

        /*
         * See perf_event_ctx_lock() for comments on the details
         * of swizzling perf_event::ctx.
         */
        mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
                                 event_entry) {
                perf_remove_from_context(event, 0);
                unaccount_event_cpu(event, src_cpu);
                put_ctx(src_ctx);
                list_add(&event->migrate_entry, &events);
        }

        /*
         * Wait for the events to quiesce before re-instating them.
         */
        synchronize_rcu();

        /*
         * Re-instate events in 2 passes.
         *
         * Skip over group leaders and only install siblings on this first
         * pass, siblings will not get enabled without a leader, however a
         * leader will enable its siblings, even if those are still on the old
         * context.
         */
        list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
                if (event->group_leader == event)
                        continue;

                list_del(&event->migrate_entry);
                if (event->state >= PERF_EVENT_STATE_OFF)
                        event->state = PERF_EVENT_STATE_INACTIVE;
                account_event_cpu(event, dst_cpu);
                perf_install_in_context(dst_ctx, event, dst_cpu);
                get_ctx(dst_ctx);
        }

        /*
         * Once all the siblings are setup properly, install the group leaders
         * to make it go.
         */
        list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
                list_del(&event->migrate_entry);
                if (event->state >= PERF_EVENT_STATE_OFF)
                        event->state = PERF_EVENT_STATE_INACTIVE;
                account_event_cpu(event, dst_cpu);
                perf_install_in_context(dst_ctx, event, dst_cpu);
                get_ctx(dst_ctx);
        }
        mutex_unlock(&dst_ctx->mutex);
        mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

/*
 * Fold the count and the enabled/running times of @child_event into the
 * child_* accumulators of its parent event. With attr.inherit_stat set, the
 * child event is first read out via perf_event_read_event().
 */
static void sync_child_event(struct perf_event *child_event,
                               struct task_struct *child)
{
        struct perf_event *parent = child_event->parent;
        u64 count;

        if (child_event->attr.inherit_stat)
                perf_event_read_event(child_event, child);

        /* Hand the child's count back to the parent. */
        count = perf_event_count(child_event);
        atomic64_add(count, &parent->child_count);

        /* Same for the time accounting. */
        atomic64_add(child_event->total_time_enabled,
                     &parent->child_total_time_enabled);
        atomic64_add(child_event->total_time_running,
                     &parent->child_total_time_running);
}

/*
 * Detach @child_event from the exiting task's context @child_ctx and mark
 * it PERF_EVENT_STATE_EXIT. Inherited events (those with a parent) are
 * additionally synced back into the parent and freed; original events are
 * kept and only get a wakeup.
 */
static void
perf_event_exit_event(struct perf_event *child_event,
                      struct perf_event_context *child_ctx,
                      struct task_struct *child)
{
        struct perf_event *parent = child_event->parent;

        /*
         * The 'original' grouping must survive: due to the context switch
         * optimization the original events may have ended up in some random
         * child task, and tearing the group down here would break all group
         * related operations once that child dies.
         *
         * Inherited groups, on the other hand, are destroyed; nobody cares
         * about those and being thorough is better.
         */
        raw_spin_lock_irq(&child_ctx->lock);
        WARN_ON_ONCE(child_ctx->is_active);

        if (parent)
                perf_group_detach(child_event);
        list_del_event(child_event, child_ctx);
        /* The EXIT state is what is_event_hup() looks at. */
        perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT);
        raw_spin_unlock_irq(&child_ctx->lock);

        if (parent) {
                /* An inherited (child) event can be cleaned up completely. */
                sync_child_event(child_event, child);

                /* Unlink it from the parent's child list. */
                WARN_ON_ONCE(parent->ctx->parent_ctx);
                mutex_lock(&parent->child_mutex);
                list_del_init(&child_event->child_list);
                mutex_unlock(&parent->child_mutex);

                /* Kick perf_poll() so it re-evaluates is_event_hup(). */
                perf_event_wakeup(parent);
                free_event(child_event);
                put_event(parent);
        } else {
                /*
                 * An event without a parent is governed by its filedesc:
                 * retain it and just wake up its waiters.
                 */
                perf_event_wakeup(child_event);
        }
}

/*
 * Tear down the @ctxn'th perf event context of the exiting task @child:
 * de-schedule its events, detach the context from the task, mark it dead
 * (TASK_TOMBSTONE) and run perf_event_exit_event() on every event it holds.
 *
 * Expected to be called by the exiting task itself (warns when
 * @child != current). Does nothing when the task has no such context.
 */
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
        struct perf_event *child_event, *next;

        WARN_ON_ONCE(child != current);

        child_ctx = perf_pin_task_context(child, ctxn);
        if (!child_ctx)
                return;

        /*
         * In order to reduce the amount of tricky in ctx tear-down, we hold
         * ctx::mutex over the entire thing. This serializes against almost
         * everything that wants to access the ctx.
         *
         * The exception is sys_perf_event_open() /
         * perf_event_create_kernel_counter() which does find_get_context()
         * without ctx::mutex (it cannot because of the move_group double mutex
         * lock thing). See the comments in perf_install_in_context().
         */
        mutex_lock(&child_ctx->mutex);

        /*
         * In a single ctx::lock section, de-schedule the events and detach the
         * context from the task such that we cannot ever get it scheduled back
         * in.
         */
        raw_spin_lock_irq(&child_ctx->lock);
        task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);

        /*
         * Now that the context is inactive, destroy the task <-> ctx relation
         * and mark the context dead.
         */
        RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
        put_ctx(child_ctx); /* cannot be last */
        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
        put_task_struct(current); /* cannot be last */

        /* Any cloned-from context returned here is released after unlocking. */
        clone_ctx = unclone_ctx(child_ctx);
        raw_spin_unlock_irq(&child_ctx->lock);

        if (clone_ctx)
                put_ctx(clone_ctx);

        /*
         * Report the task dead after unscheduling the events so that we
         * won't get any samples after PERF_RECORD_EXIT. We can however still
         * get a few PERF_RECORD_READ events.
         */
        perf_event_task(child, child_ctx, 0);

        /* _safe iteration: perf_event_exit_event() unlinks each event. */
        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
                perf_event_exit_event(child_event, child_ctx, child);

        mutex_unlock(&child_ctx->mutex);

        /* NOTE(review): presumably pairs with perf_pin_task_context() above - confirm. */
        put_ctx(child_ctx);
}

/*
 * Called when a task exits: disown the events the task had opened, tear
 * down each of its perf contexts (feeding inherited event values back to
 * the parent events) and emit the EXIT records.
 *
 * May be entered with exec_update_lock held when the caller is
 * setup_new_exec().
 */
void perf_event_exit_task(struct task_struct *child)
{
        struct perf_event *owned, *next;
        int ctxn;

        mutex_lock(&child->perf_event_mutex);
        list_for_each_entry_safe(owned, next, &child->perf_event_list,
                                 owner_entry) {
                list_del_init(&owned->owner_entry);

                /*
                 * The list removal must be visible before the owner is
                 * cleared. This closes a race against perf_release(),
                 * which has to serialize on owner->perf_event_mutex.
                 */
                smp_store_release(&owned->owner, NULL);
        }
        mutex_unlock(&child->perf_event_mutex);

        for_each_task_context_nr(ctxn)
                perf_event_exit_task_context(child, ctxn);

        /*
         * perf_event_exit_task_context() has called perf_event_task() with
         * the child's task contexts, which generated the EXIT events for
         * those contexts, and has reset child->perf_event_ctxp[] to NULL.
         * What is left is sending the EXIT events to the cpu contexts.
         */
        perf_event_task(child, NULL, 0);
}

/*
 * Tear down one inherited @event living on @ctx: unlink it from its parent's
 * child_list, put the parent, detach it from its group and from @ctx, then
 * free it.
 */
static void perf_free_event(struct perf_event *event,
                            struct perf_event_context *ctx)
{
        struct perf_event *origin = event->parent;

        /* An event without a parent is unexpected here: warn once and bail. */
        if (WARN_ON_ONCE(!origin))
                return;

        /* The child_list is manipulated under the parent's child_mutex. */
        mutex_lock(&origin->child_mutex);
        list_del_init(&event->child_list);
        mutex_unlock(&origin->child_mutex);

        put_event(origin);

        /* Group and context membership are changed under ctx->lock. */
        raw_spin_lock_irq(&ctx->lock);
        perf_group_detach(event);
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);

        free_event(event);
}

/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of fail.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 *
 * For every populated per-task context slot of @task this:
 *  - severs the task <-> ctx link and marks the context dead,
 *  - frees every event on the context via perf_free_event(),
 *  - waits until the context refcount drops to 1, then drops that last
 *    reference.
 */
void perf_event_free_task(struct task_struct *task)
{
        struct perf_event_context *ctx;
        struct perf_event *event, *tmp;
        int ctxn;

        for_each_task_context_nr(ctxn) {
                ctx = task->perf_event_ctxp[ctxn];
                /* Nothing was inherited into this slot. */
                if (!ctx)
                        continue;

                /* ctx->mutex is held across the whole event teardown below. */
                mutex_lock(&ctx->mutex);
                raw_spin_lock_irq(&ctx->lock);
                /*
                 * Destroy the task <-> ctx relation and mark the context dead.
                 *
                 * This is important because even though the task hasn't been
                 * exposed yet the context has been (through child_list).
                 */
                RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
                WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
                put_task_struct(task); /* cannot be last */
                raw_spin_unlock_irq(&ctx->lock);

                /* _safe iteration: perf_free_event() unlinks and frees @event. */
                list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
                        perf_free_event(event, ctx);

                mutex_unlock(&ctx->mutex);

                /*
                 * perf_event_release_kernel() could've stolen some of our
                 * child events and still have them on its free_list. In that
                 * case we must wait for these events to have been freed (in
                 * particular all their references to this task must've been
                 * dropped).
                 *
                 * Without this copy_process() will unconditionally free this
                 * task (irrespective of its reference count) and
                 * _free_event()'s put_task_struct(event->hw.target) will be a
                 * use-after-free.
                 *
                 * Wait for all events to drop their context reference.
                 */
                wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
                put_ctx(ctx); /* must be last */
        }
}

void perf_event_delayed_put(struct task_struct *task)
{
        int ctxn;

        for_each_task_context_nr(ctxn)
                WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}

/*
 * Look up @fd and return its file, with the reference taken by fget(), if it
 * is a perf event file (f_op == &perf_fops).
 *
 * Returns ERR_PTR(-EBADF) when @fd is not open or is not a perf event file;
 * in the latter case the fget() reference is dropped again.
 */
struct file *perf_event_get(unsigned int fd)
{
        struct file *filp = fget(fd);

        if (filp && filp->f_op == &perf_fops)
                return filp;

        /* Open, but not one of ours: give the reference back. */
        if (filp)
                fput(filp);

        return ERR_PTR(-EBADF);
}

/*
 * Return the perf event stored in @file's private_data, or ERR_PTR(-EINVAL)
 * when @file is not a perf event file (f_op != &perf_fops).
 */
const struct perf_event *perf_get_event(struct file *file)
{
        if (file->f_op == &perf_fops)
                return file->private_data;

        return ERR_PTR(-EINVAL);
}

/*
 * Return a pointer to @event's attr, or ERR_PTR(-EINVAL) for a NULL @event.
 */
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        if (event)
                return &event->attr;

        return ERR_PTR(-EINVAL);
}

/*
 * Inherit an event from parent task to child task.
 *
 * @parent_event: event to inherit; if it is itself inherited, the new event
 *                is linked to its original parent instead
 * @parent:       parent task (not referenced in this function's body)
 * @parent_ctx:   parent context (not referenced in this function's body)
 * @child:        task the new event is allocated for
 * @group_leader: group leader passed to perf_event_alloc() for the new
 *                event; may be NULL
 * @child_ctx:    context the new event is added to
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event *group_leader,
              struct perf_event_context *child_ctx)
{
        /* Sampled before parent_event may be redirected to the original parent. */
        enum perf_event_state parent_state = parent_event->state;
        struct perf_event *child_event;
        unsigned long flags;

        /*
         * Instead of creating recursive hierarchies of events,
         * we link inherited events back to the original parent,
         * which has a filp for sure, which we use as the reference
         * count:
         */
        if (parent_event->parent)
                parent_event = parent_event->parent;

        child_event = perf_event_alloc(&parent_event->attr,
                                           parent_event->cpu,
                                           child,
                                           group_leader, parent_event,
                                           NULL, NULL, -1);
        if (IS_ERR(child_event))
                return child_event;


        /* Allocate the per-context task data once, on first need. */
        if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
            !child_ctx->task_ctx_data) {
                struct pmu *pmu = child_event->pmu;

                child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
                if (!child_ctx->task_ctx_data) {
                        free_event(child_event);
                        return ERR_PTR(-ENOMEM);
                }
        }

        /*
         * is_orphaned_event() and list_add_tail(&parent_event->child_list)
         * must be under the same lock in order to serialize against
         * perf_event_release_kernel(), such that either we must observe
         * is_orphaned_event() or they will observe us on the child_list.
         */
        mutex_lock(&parent_event->child_mutex);
        /*
         * Orphaned parent, or its refcount already hit zero: quietly drop
         * the child and report "nothing inherited" with NULL.
         */
        if (is_orphaned_event(parent_event) ||
            !atomic_long_inc_not_zero(&parent_event->refcount)) {
                mutex_unlock(&parent_event->child_mutex);
                /* task_ctx_data is freed with child_ctx */
                free_event(child_event);
                return NULL;
        }

        /* The child event holds a reference on its context. */
        get_ctx(child_ctx);

        /*
         * Make the child state follow the state of the parent event,
         * not its attr.disabled bit.  We hold the parent's mutex,
         * so we won't race with perf_event_{en, dis}able_family.
         */
        if (parent_state >= PERF_EVENT_STATE_INACTIVE)
                child_event->state = PERF_EVENT_STATE_INACTIVE;
        else
                child_event->state = PERF_EVENT_STATE_OFF;

        /* For freq-based events, start from the parent's current sample period. */
        if (parent_event->attr.freq) {
                u64 sample_period = parent_event->hw.sample_period;
                struct hw_perf_event *hwc = &child_event->hw;

                hwc->sample_period = sample_period;
                hwc->last_period   = sample_period;

                local64_set(&hwc->period_left, sample_period);
        }

        child_event->ctx = child_ctx;
        child_event->overflow_handler = parent_event->overflow_handler;
        child_event->overflow_handler_context
                = parent_event->overflow_handler_context;

        /*
         * Precalculate sample_data sizes
         */
        perf_event__header_size(child_event);
        perf_event__id_header_size(child_event);

        /*
         * Link it up in the child's context:
         */
        raw_spin_lock_irqsave(&child_ctx->lock, flags);
        add_event_to_ctx(child_event, child_ctx);
        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

        /*
         * Link this into the parent event's child list
         */
        list_add_tail(&child_event->child_list, &parent_event->child_list);
        mutex_unlock(&parent_event->child_mutex);

        return child_event;
}

/*
 * Inherit a whole event group: first the leader, then each of its siblings.
 *
 * Orphaned events are skipped silently; a NULL result from inherit_event()
 * is not treated as an error.  This mirrors perf_event_release_kernel()
 * removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event_context *child_ctx)
{
        struct perf_event *child_leader;
        struct perf_event *parent_sub;
        struct perf_event *child_sub;

        child_leader = inherit_event(parent_event, parent, parent_ctx,
                                     child, NULL, child_ctx);
        if (IS_ERR(child_leader))
                return PTR_ERR(child_leader);

        /*
         * child_leader may be NULL at this point (is_orphaned_event()).  The
         * siblings are then inherited as individual events, much like what
         * perf_group_detach() would end up doing anyway.
         */
        for_each_sibling_event(parent_sub, parent_event) {
                child_sub = inherit_event(parent_sub, parent, parent_ctx,
                                          child, child_leader, child_ctx);
                if (IS_ERR(child_sub))
                        return PTR_ERR(child_sub);

                /* Re-create the AUX link for siblings that had one to the leader. */
                if (child_sub && parent_sub->aux_event == parent_event) {
                        if (!perf_get_aux_event(child_sub, child_leader))
                                return -EINVAL;
                }
        }

        if (child_leader)
                child_leader->group_generation = parent_event->group_generation;

        return 0;
}

/*
 * Make sure the child has a task context for @ctxn (allocating it on first
 * use) and try to inherit the event group led by @event into it.
 *
 * @inherited_all is cleared when @event has !attr.inherit or when inheriting
 * fails.  It is left set when an orphaned event is merely skipped, which is
 * consistent with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
                   struct task_struct *child, int ctxn,
                   int *inherited_all)
{
        struct perf_event_context *child_ctx;
        int err;

        /* Events not marked for inheritance are skipped, not an error. */
        if (!event->attr.inherit) {
                *inherited_all = 0;
                return 0;
        }

        child_ctx = child->perf_event_ctxp[ctxn];
        if (!child_ctx) {
                /*
                 * We run in the parent task's context here.  The child has
                 * no context yet, so allocate and install one before
                 * inheriting the events marked for cloning.
                 */
                child_ctx = alloc_perf_context(parent_ctx->pmu, child);
                if (!child_ctx)
                        return -ENOMEM;

                child->perf_event_ctxp[ctxn] = child_ctx;
        }

        err = inherit_group(event, parent, parent_ctx, child, child_ctx);
        if (err)
                *inherited_all = 0;

        return err;
}

/*
 * Initialize the perf_event context number @ctxn in @child's task_struct by
 * inheriting from the parent (current).
 *
 * Walks the parent's pinned and flexible groups and inherits each group via
 * inherit_task_group().  If everything was inherited, the child context is
 * marked as a clone of the parent context (or of whatever the parent is a
 * clone of).
 *
 * Returns 0 on success, including when the parent has no context to inherit
 * from, or the negative error returned by inherit_task_group().
 */
static int perf_event_init_context(struct task_struct *child, int ctxn)
{
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        struct perf_event *event;
        struct task_struct *parent = current;
        int inherited_all = 1;
        unsigned long flags;
        int ret = 0;

        if (likely(!parent->perf_event_ctxp[ctxn]))
                return 0;

        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
        parent_ctx = perf_pin_task_context(parent, ctxn);
        if (!parent_ctx)
                return 0;

        /*
         * No need to check if parent_ctx != NULL here; since we saw
         * it non-NULL earlier, the only reason for it to become NULL
         * is if we exit, and since we're currently in the middle of
         * a fork we can't be exiting at the same time.
         */

        /*
         * Lock the parent list. No need to lock the child - not PID
         * hashed yet and not running, so nobody can access it.
         */
        mutex_lock(&parent_ctx->mutex);

        /*
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)
                        goto out_unlock;
        }

        /*
         * We can't hold ctx->lock when iterating the ->flexible_group list due
         * to allocations, but we need to prevent rotation because
         * rotate_ctx() will change the list from interrupt context.
         */
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret) {
                        /*
                         * Re-enable rotation before bailing out; otherwise
                         * the parent context would be left with
                         * rotate_disable set after a failed inherit.
                         */
                        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
                        parent_ctx->rotate_disable = 0;
                        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
                        goto out_unlock;
                }
        }

        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 0;

        child_ctx = child->perf_event_ctxp[ctxn];

        if (child_ctx && inherited_all) {
                /*
                 * Mark the child context as a clone of the parent
                 * context, or of whatever the parent is a clone of.
                 *
                 * Note that if the parent is a clone, the holding of
                 * parent_ctx->lock avoids it from being uncloned.
                 */
                cloned_ctx = parent_ctx->parent_ctx;
                if (cloned_ctx) {
                        child_ctx->parent_ctx = cloned_ctx;
                        child_ctx->parent_gen = parent_ctx->parent_gen;
                } else {
                        child_ctx->parent_ctx = parent_ctx;
                        child_ctx->parent_gen = parent_ctx->generation;
                }
                get_ctx(child_ctx->parent_ctx);
        }

        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
        mutex_unlock(&parent_ctx->mutex);

        /* Undo perf_pin_task_context(): unpin and drop the reference. */
        perf_unpin_context(parent_ctx);
        put_ctx(parent_ctx);

        return ret;
}

/*
 * Set up the perf_event state of a new task @child: reset its context
 * pointers, initialise its event mutex and owned-event list, then inherit
 * every task context from the parent.
 *
 * Returns 0 on success.  On failure everything inherited so far is torn down
 * with perf_event_free_task() and the error is returned.
 */
int perf_event_init_task(struct task_struct *child)
{
        int ctxn;
        int err;

        memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);

        for_each_task_context_nr(ctxn) {
                err = perf_event_init_context(child, ctxn);
                if (err) {
                        perf_event_free_task(child);
                        return err;
                }
        }

        return 0;
}

/*
 * Boot-time setup of the per-cpu perf state: allocate the (zeroed) online
 * mask and, for every possible cpu, initialise the swevent hash mutex and
 * the per-cpu lists and lock used by the core.
 */
static void __init perf_event_init_all_cpus(void)
{
        int cpu;

        zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);

        for_each_possible_cpu(cpu) {
                struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

                mutex_init(&swhash->hlist_mutex);
                INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));

                /* Side-band event list and the lock that protects it. */
                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

#ifdef CONFIG_CGROUP_PERF
                INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
        }
}

/*
 * Make sure @cpu has a software-event hlist if one is in use: when the hash
 * table's refcount is positive but no hlist is installed, allocate a zeroed
 * one on the cpu's node and publish it.  An allocation failure only WARNs;
 * the (NULL) pointer is still assigned.
 */
static void perf_swevent_init_cpu(unsigned int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

        mutex_lock(&swhash->hlist_mutex);
        if (swhash->hlist_refcount > 0) {
                if (!swevent_hlist_deref(swhash)) {
                        struct swevent_hlist *new_hlist;

                        new_hlist = kzalloc_node(sizeof(*new_hlist), GFP_KERNEL,
                                                 cpu_to_node(cpu));
                        WARN_ON(!new_hlist);
                        rcu_assign_pointer(swhash->swevent_hlist, new_hlist);
                }
        }
        mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
/*
 * Cross-call handler (see the smp_call_function_single() caller below):
 * @__info is the perf_event_context to empty.  Under ctx->lock, schedule the
 * context out and remove every event on its event_list, detaching groups.
 *
 * NOTE(review): events are removed while event_list is walked with the
 * non-_safe iterator; this presumably relies on how the removal helper
 * unlinks entries - confirm before changing the iteration.
 */
static void __perf_event_exit_context(void *__info)
{
        struct perf_event_context *ctx = __info;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *event;

        raw_spin_lock(&ctx->lock);
        ctx_sched_out(ctx, cpuctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);
}

/*
 * Take @cpu out of service for perf: for every registered pmu, empty that
 * pmu's cpu context on @cpu (by running __perf_event_exit_context() there
 * and waiting for it) and mark the context offline, then clear @cpu from
 * perf_online_mask.  Everything runs under pmus_lock.
 */
static void perf_event_exit_cpu_context(int cpu)
{
        struct pmu *pmu;

        mutex_lock(&pmus_lock);
        list_for_each_entry(pmu, &pmus, entry) {
                struct perf_cpu_context *cpuctx;
                struct perf_event_context *ctx;

                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                ctx = &cpuctx->ctx;

                mutex_lock(&ctx->mutex);
                smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
                cpuctx->online = 0;
                mutex_unlock(&ctx->mutex);
        }
        cpumask_clear_cpu(cpu, perf_online_mask);
        mutex_unlock(&pmus_lock);
}
#else

/* No-op variant, built when neither CONFIG_HOTPLUG_CPU nor CONFIG_KEXEC_CORE is defined. */
static void perf_event_exit_cpu_context(int cpu) { }

#endif

/*
 * Bring @cpu into service for perf: set up its software-event hlist, add it
 * to perf_online_mask and mark every pmu's cpu context on @cpu online.
 *
 * Always returns 0.
 */
int perf_event_init_cpu(unsigned int cpu)
{
        struct pmu *pmu;

        perf_swevent_init_cpu(cpu);

        mutex_lock(&pmus_lock);
        cpumask_set_cpu(cpu, perf_online_mask);
        list_for_each_entry(pmu, &pmus, entry) {
                struct perf_cpu_context *cpuctx;
                struct perf_event_context *ctx;

                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                ctx = &cpuctx->ctx;

                /* The online flag is flipped under the context mutex. */
                mutex_lock(&ctx->mutex);
                cpuctx->online = 1;
                mutex_unlock(&ctx->mutex);
        }
        mutex_unlock(&pmus_lock);

        return 0;
}

/*
 * Counterpart of perf_event_init_cpu(): tear down the perf cpu contexts of
 * @cpu via perf_event_exit_cpu_context().  Always returns 0.
 */
int perf_event_exit_cpu(unsigned int cpu)
{
        perf_event_exit_cpu_context(cpu);
        return 0;
}

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
        int cpu;

        for_each_online_cpu(cpu)
                perf_event_exit_cpu(cpu);

        return NOTIFY_OK;
}

/*
 * The perf reboot notifier uses the lowest priority (INT_MIN) so that it
 * runs at the very last possible moment, letting the generic watchdog code
 * run for as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
        .priority = INT_MIN,
        .notifier_call = perf_reboot,
};

/*
 * Boot-time initialisation of the perf core: set up the pmu idr and the
 * per-cpu state, register the built-in software, cpu-clock and task-clock
 * pmus and the tracepoint support, bring the boot cpu online for perf,
 * install the reboot notifier and initialise hardware breakpoints.
 */
void __init perf_event_init(void)
{
        int ret;

        idr_init(&pmu_idr);

        perf_event_init_all_cpus();
        init_srcu_struct(&pmus_srcu);
        /* Return values of the pmu registrations below are not checked. */
        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
        perf_pmu_register(&perf_cpu_clock, NULL, -1);
        perf_pmu_register(&perf_task_clock, NULL, -1);
        perf_tp_register();
        perf_event_init_cpu(smp_processor_id());
        register_reboot_notifier(&perf_reboot_notifier);

        /* A breakpoint init failure only warns; initialisation carries on. */
        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

        /*
         * Build time assertion that we keep the data_head at the intended
         * location.  IOW, validation we got the __reserved[] size right.
         */
        BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
                     != 1024);
}

/*
 * sysfs show handler for pmu event attributes: print the attribute's
 * event_str followed by a newline into @page.
 *
 * Returns the number of characters written, or 0 when the attribute has no
 * event_str.
 */
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page)
{
        struct perf_pmu_events_attr *pmu_attr;

        pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
        if (!pmu_attr->event_str)
                return 0;

        return sprintf(page, "%s\n", pmu_attr->event_str);
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

/*
 * Register the pmu bus and create a device for every already-registered pmu
 * that has a name and a non-negative type.  A failing pmu_dev_alloc() only
 * WARNs; once the bus is registered the function reports success.
 *
 * Returns 0 on success or the error from bus_register().
 */
static int __init perf_event_sysfs_init(void)
{
        struct pmu *pmu;
        int ret;

        mutex_lock(&pmus_lock);

        ret = bus_register(&pmu_bus);
        if (!ret) {
                list_for_each_entry(pmu, &pmus, entry) {
                        if (pmu->name && pmu->type >= 0) {
                                ret = pmu_dev_alloc(pmu);
                                WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
                        }
                }
                pmu_bus_running = 1;
                ret = 0;
        }

        mutex_unlock(&pmus_lock);

        return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
/*
 * css_alloc callback: allocate a zeroed perf_cgroup together with its
 * per-cpu info area.
 *
 * Returns the embedded css on success, ERR_PTR(-ENOMEM) if either
 * allocation fails.  @parent_css is not used.
 */
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct perf_cgroup *pcg;

        pcg = kzalloc(sizeof(*pcg), GFP_KERNEL);
        if (pcg) {
                pcg->info = alloc_percpu(struct perf_cgroup_info);
                if (pcg->info)
                        return &pcg->css;

                /* No per-cpu area: undo the first allocation. */
                kfree(pcg);
        }

        return ERR_PTR(-ENOMEM);
}

/*
 * css_free callback: release the per-cpu info area and then the perf_cgroup
 * that embeds @css.
 */
static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct perf_cgroup *pcg;

        pcg = container_of(css, struct perf_cgroup, css);

        free_percpu(pcg->info);
        kfree(pcg);
}

/*
 * css_online callback: pass the cgroup behind @css to perf_event_cgroup().
 * Always returns 0.
 */
static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
        perf_event_cgroup(css->cgroup);
        return 0;
}

/*
 * Callback handed to task_function_call() by perf_cgroup_attach(): @info is
 * the task.  Switches the task's perf cgroup state out and back in, inside
 * an RCU read-side critical section.  Always returns 0.
 */
static int __perf_cgroup_move(void *info)
{
        struct task_struct *task = info;
        rcu_read_lock();
        perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
        rcu_read_unlock();
        return 0;
}

/*
 * attach callback: for every task in @tset, run __perf_cgroup_move() on it
 * through task_function_call().  The call's return value is not checked.
 */
static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        /* Only used as the iteration cursor required by cgroup_taskset_for_each(). */
        struct cgroup_subsys_state *css;

        cgroup_taskset_for_each(task, css, tset)
                task_function_call(task, __perf_cgroup_move, task);
}

/*
 * cgroup subsystem descriptor for perf_event: wires the css alloc/free/online
 * and task attach callbacks defined above.
 */
struct cgroup_subsys perf_event_cgrp_subsys = {
        .css_alloc        = perf_cgroup_css_alloc,
        .css_free        = perf_cgroup_css_free,
        .css_online        = perf_cgroup_css_online,
        .attach                = perf_cgroup_attach,
        /*
         * Implicitly enable on dfl hierarchy so that perf events can
         * always be filtered by cgroup2 path as long as perf_event
         * controller is not mounted on a legacy hierarchy.
         */
        .implicit_on_dfl = true,
        .threaded        = true,
};
#endif /* CONFIG_CGROUP_PERF */



































































    1 
    1 







































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
// SPDX-License-Identifier: GPL-2.0
/*
 * IA-32 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <linux/compat.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/elf.h>

#if 0        /* This is just for testing */
/*
 * Test-only lookup of the page backing @address in a hugetlb VMA.  This sits
 * inside the "#if 0" branch and is never compiled.
 *
 * NOTE(review): as written the body would not build - it uses "addr", "pte"
 * and "vpfn", none of which are declared here, while the locals "start",
 * "length" and "nr" are never used.  Fix those before enabling the branch.
 *
 * Returns ERR_PTR(-EINVAL) when no hugetlb VMA is found for the address,
 * otherwise the page computed from the huge pte.
 */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        unsigned long start = address;
        int length = 1;
        int nr;
        struct page *page;
        struct vm_area_struct *vma;

        vma = find_vma(mm, addr);
        if (!vma || !is_vm_hugetlb_page(vma))
                return ERR_PTR(-EINVAL);

        pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));

        /* hugetlb should be locked, and hence, prefaulted */
        WARN_ON(!pte || pte_none(*pte));

        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

        WARN_ON(!PageHead(page));

        return page;
}

/* Test-only stub (inside the "#if 0" branch): no pmd is reported as huge. */
int pmd_huge(pmd_t pmd)
{
        return 0;
}

/* Test-only stub (inside the "#if 0" branch): no pud is reported as huge. */
int pud_huge(pud_t pud)
{
        return 0;
}

#else

/*
 * pmd_huge() tells whether @pmd is a hugetlb related entry: either a normal
 * hugetlb entry or a non-present (migration or hwpoisoned) hugetlb entry.
 *
 * Returns 1 in that case and 0 otherwise; an empty pmd is never huge.
 */
int pmd_huge(pmd_t pmd)
{
        if (pmd_none(pmd))
                return 0;

        /* Anything but "present without PSE" counts as a huge entry. */
        return (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
}

/* Returns 1 when @pud has the _PAGE_PSE bit set, 0 otherwise. */
int pud_huge(pud_t pud)
{
        return (pud_val(pud) & _PAGE_PSE) != 0;
}
#endif

#ifdef CONFIG_HUGETLB_PAGE
/*
 * Bottom-up search for a free, huge-page aligned range of @len bytes for the
 * hugetlb @file.  The search starts at get_mmap_base(1); @addr only serves
 * as a hint for choosing the upper limit.  @pgoff and @flags are unused.
 *
 * Returns whatever vm_unmapped_area() returns for the assembled request.
 */
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        struct hstate *hs = hstate_file(file);
        struct vm_unmapped_area_info info;

        info.flags = 0;
        info.length = len;
        info.low_limit = get_mmap_base(1);

        /*
         * A hint address above DEFAULT_MAP_WINDOW asks for a search over the
         * full address space.
         */
        if (in_32bit_syscall())
                info.high_limit = task_size_32bit();
        else
                info.high_limit = task_size_64bit(addr > DEFAULT_MAP_WINDOW);

        info.align_mask = PAGE_MASK & ~huge_page_mask(hs);
        info.align_offset = 0;

        return vm_unmapped_area(&info);
}

/*
 * Top-down search for a free, huge-page aligned range of @len bytes for the
 * hugetlb @file, searching downwards from get_mmap_base(0).  If that search
 * fails, retry bottom-up between TASK_UNMAPPED_BASE and TASK_SIZE_LOW.
 * @pgoff and @flags are unused.
 *
 * Returns the address found, or the result of the fallback search.
 */
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        struct hstate *hs = hstate_file(file);
        struct vm_unmapped_area_info info;

        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = get_mmap_base(0);

        /*
         * A hint address above DEFAULT_MAP_WINDOW asks for a search over the
         * full address space.
         */
        if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
                info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;

        info.align_mask = PAGE_MASK & ~huge_page_mask(hs);
        info.align_offset = 0;
        addr = vm_unmapped_area(&info);

        /* A page-aligned result is a real address: done. */
        if (!(addr & ~PAGE_MASK))
                return addr;

        /*
         * A failed mmap() very likely makes the application fail, so retry
         * with a bottom-up search.  This can happen with large stack limits
         * combined with large mmap() allocations.
         */
        VM_BUG_ON(addr != -ENOMEM);
        info.flags = 0;
        info.low_limit = TASK_UNMAPPED_BASE;
        info.high_limit = TASK_SIZE_LOW;

        return vm_unmapped_area(&info);
}

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;

        if (len & ~huge_page_mask(h))
                return -EINVAL;

        if (len > TASK_SIZE)
                return -ENOMEM;

        /* No address checking. See comment at mmap_address_hint_valid() */
        if (flags & MAP_FIXED) {
                if (prepare_hugepage_range(file, addr, len))
                        return -EINVAL;
                return addr;
        }

        if (addr) {
                addr &= huge_page_mask(h);
                if (!mmap_address_hint_valid(addr, len))
                        goto get_unmapped_area;

                vma = find_vma(mm, addr);
                if (!vma || addr + len <= vm_start_gap(vma))
                        return addr;
        }

get_unmapped_area:
        if (mm->get_unmapped_area == arch_get_unmapped_area)
                return hugetlb_get_unmapped_area_bottomup(file, addr, len,
                                pgoff, flags);
        else
                return hugetlb_get_unmapped_area_topdown(file, addr, len,
                                pgoff, flags);
}
#endif /* CONFIG_HUGETLB_PAGE */

#ifdef CONFIG_X86_64
/*
 * Tell whether @size is a supported huge page size: PMD_SIZE always is,
 * PUD_SIZE only when the boot cpu has X86_FEATURE_GBPAGES.
 */
bool __init arch_hugetlb_valid_size(unsigned long size)
{
        if (size == PMD_SIZE)
                return true;

        if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES))
                return true;

        return false;
}

#ifdef CONFIG_CONTIG_ALLOC
/*
 * Register the PUD-sized hstate when the boot cpu has X86_FEATURE_GBPAGES.
 * With compaction or CMA, gigantic pages can be allocated at runtime.
 * Always returns 0.
 */
static __init int gigantic_pages_init(void)
{
        if (!boot_cpu_has(X86_FEATURE_GBPAGES))
                return 0;

        hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
        return 0;
}
arch_initcall(gigantic_pages_init);
#endif
#endif



























    1 











































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/* Return @cgrp's per-cpu rstat state for @cpu. */
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
        return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Link it into the matching
 * rstat_cpu->updated_children list of its parent, and do the same for every
 * ancestor that is not linked yet.  See the comment on top of the
 * cgroup_rstat_cpu definition for details.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        struct cgroup *parent;
        unsigned long flags;

        /* The root has no parent list to be put on. */
        if (!cgroup_parent(cgrp))
                return;

        /*
         * Lockless "already linked?" check.  It can race and cause
         * temporary inaccuracies, which is acceptable.
         *
         * A parent's updated_children list is terminated with the parent
         * itself rather than NULL, so a NULL updated_next means @cgrp is
         * not on any list.
         */
        if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
                return;

        raw_spin_lock_irqsave(cpu_lock, flags);

        /* Walk towards the root, linking @cgrp and each unlinked ancestor. */
        parent = cgroup_parent(cgrp);
        while (parent) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
                struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);

                /*
                 * Additions and removals both happen bottom-up, so once a
                 * linked cgroup is found all of its ancestors are linked
                 * as well.
                 */
                if (rstatc->updated_next)
                        break;

                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = cgrp;

                cgrp = parent;
                parent = cgroup_parent(cgrp);
        }

        raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
 * the traversal and %NULL return indicates the end.  During traversal,
 * each returned cgroup is unlinked from the tree.  Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
                                                   struct cgroup *root, int cpu)
{
        struct cgroup_rstat_cpu *rstatc;

        /* @root was handed out by the previous call; the traversal is over */
        if (pos == root)
                return NULL;

        /*
         * We're gonna walk down to the first leaf and visit/remove it.  We
         * can pick whatever unvisited node as the starting point.
         */
        if (!pos)
                pos = root;
        else
                pos = cgroup_parent(pos);

        /*
         * walk down to the first leaf: an updated_children pointer that
         * points back at its owner means the list is empty
         */
        while (true) {
                rstatc = cgroup_rstat_cpu(pos, cpu);
                if (rstatc->updated_children == pos)
                        break;
                pos = rstatc->updated_children;
        }

        /*
         * Unlink @pos from the tree.  As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         * However, due to the way we traverse, @pos will be the first
         * child in most cases. The only exception is @root.
         */
        if (rstatc->updated_next) {
                struct cgroup *parent = cgroup_parent(pos);
                struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
                struct cgroup_rstat_cpu *nrstatc;
                struct cgroup **nextp;

                /* find the link (list head or a sibling's updated_next) that points at @pos */
                nextp = &prstatc->updated_children;
                while (true) {
                        nrstatc = cgroup_rstat_cpu(*nextp, cpu);
                        if (*nextp == pos)
                                break;

                        /* hitting the terminator means @pos is not on the list */
                        WARN_ON_ONCE(*nextp == parent);
                        nextp = &nrstatc->updated_next;
                }

                /* splice @pos out and mark it as no longer on any list */
                *nextp = rstatc->updated_next;
                rstatc->updated_next = NULL;

                return pos;
        }

        /* only happens for @root */
        return NULL;
}

/*
 * see cgroup_rstat_flush()
 *
 * Flushes the updated subtree of @cgrp one cpu at a time.  Must be entered
 * with cgroup_rstat_lock held and returns with it held.  When @may_sleep is
 * true the lock may be dropped and re-acquired between cpus.
 */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
        __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
        int cpu;

        lockdep_assert_held(&cgroup_rstat_lock);

        for_each_possible_cpu(cpu) {
                raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
                                                       cpu);
                struct cgroup *pos = NULL;

                /* the per-cpu lock covers popping from this cpu's updated tree */
                raw_spin_lock(cpu_lock);
                while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
                        struct cgroup_subsys_state *css;

                        /* flush the base stats first, then each css on rstat_css_list */
                        cgroup_base_stat_flush(pos, cpu);

                        rcu_read_lock();
                        list_for_each_entry_rcu(css, &pos->rstat_css_list,
                                                rstat_css_node)
                                css->ss->css_rstat_flush(css, cpu);
                        rcu_read_unlock();
                }
                raw_spin_unlock(cpu_lock);

                /* if @may_sleep, play nice and yield if necessary */
                if (may_sleep && (need_resched() ||
                                  spin_needbreak(&cgroup_rstat_lock))) {
                        spin_unlock_irq(&cgroup_rstat_lock);
                        /* if no reschedule happened, just relax the cpu briefly */
                        if (!cond_resched())
                                cpu_relax();
                        spin_lock_irq(&cgroup_rstat_lock);
                }
        }
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
void cgroup_rstat_flush(struct cgroup *cgrp)
{
        might_sleep();

        /* may_sleep == true: the flush may drop the lock and yield between cpus */
        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
        spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 * @cgrp: target cgroup
 *
 * This function can be called from any context.  The flush runs with
 * may_sleep == false, so cgroup_rstat_lock is never dropped part-way.
 */
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
        unsigned long flags;

        /* irqsave/irqrestore preserves the caller's interrupt state */
        spin_lock_irqsave(&cgroup_rstat_lock, flags);
        cgroup_rstat_flush_locked(cgrp, false);
        spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * Returns with cgroup_rstat_lock held.
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
        __acquires(&cgroup_rstat_lock)
{
        might_sleep();
        /* intentionally left locked; cgroup_rstat_flush_release() unlocks */
        spin_lock_irq(&cgroup_rstat_lock);
        cgroup_rstat_flush_locked(cgrp, true);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 *
 * Drops the cgroup_rstat_lock that cgroup_rstat_flush_hold() left held.
 */
void cgroup_rstat_flush_release(void)
        __releases(&cgroup_rstat_lock)
{
        spin_unlock_irq(&cgroup_rstat_lock);
}

/*
 * Set up @cgrp's per-cpu rstat state.  Allocates ->rstat_cpu unless it is
 * already present and makes every cpu's updated_children list empty.
 *
 * Returns 0 on success, -ENOMEM if the per-cpu allocation fails.
 */
int cgroup_rstat_init(struct cgroup *cgrp)
{
        struct cgroup_rstat_cpu *rstatc;
        int cpu;

        /* only the root cgrp arrives with rstat_cpu preallocated */
        if (!cgrp->rstat_cpu)
                cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
        if (!cgrp->rstat_cpu)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                rstatc = cgroup_rstat_cpu(cgrp, cpu);

                /* an empty ->updated_children list points back at its owner */
                rstatc->updated_children = cgrp;
                u64_stats_init(&rstatc->bsync);
        }

        return 0;
}

/*
 * Tear down @cgrp's rstat state: flush whatever is pending, verify that
 * @cgrp is off every per-cpu updated list and free ->rstat_cpu.  If the
 * verification fails, a warning is issued and the per-cpu area is left
 * allocated.
 */
void cgroup_rstat_exit(struct cgroup *cgrp)
{
        struct cgroup_rstat_cpu *rstatc;
        int cpu;

        cgroup_rstat_flush(cgrp);

        /* sanity check: after the flush nothing may still be linked */
        for_each_possible_cpu(cpu) {
                rstatc = cgroup_rstat_cpu(cgrp, cpu);

                if (WARN_ON_ONCE(rstatc->updated_children != cgrp))
                        return;
                if (WARN_ON_ONCE(rstatc->updated_next))
                        return;
        }

        free_percpu(cgrp->rstat_cpu);
        cgrp->rstat_cpu = NULL;
}

/*
 * Boot-time setup: initialize every possible cpu's cgroup_rstat_cpu_lock
 * and the rstat state of the default hierarchy's root cgroup.
 */
void __init cgroup_rstat_boot(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));

        /* a failure to set up the root cgroup's rstat state is fatal */
        BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime += src_bstat->cputime.utime;
        dst_bstat->cputime.stime += src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime -= src_bstat->cputime.utime;
        dst_bstat->cputime.stime -= src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
}

/*
 * Fold @cgrp's per-cpu base stats on @cpu into @cgrp->bstat and, if @cgrp
 * has a parent, forward the part of @cgrp->bstat that has not been
 * propagated yet to the parent's bstat.
 */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup_base_stat cur, delta;
        unsigned seq;

        /* fetch the current per-cpu values; retry if bsync reports a concurrent update */
        do {
                seq = __u64_stats_fetch_begin(&rstatc->bsync);
                cur.cputime = rstatc->bstat.cputime;
        } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

        /*
         * propagate percpu delta to global: delta is what accumulated on
         * this cpu since the snapshot kept in rstatc->last_bstat, which is
         * advanced by the same amount
         */
        delta = cur;
        cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        cgroup_base_stat_add(&rstatc->last_bstat, &delta);

        /*
         * propagate global delta to parent: same scheme, with
         * cgrp->last_bstat tracking what the parent has already received
         */
        if (parent) {
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);
        }
}

/*
 * Start a cputime update on the current cpu's rstat_cpu of @cgrp.  Takes
 * the slot with get_cpu_ptr() and opens a bsync write section; the caller
 * must finish with cgroup_base_stat_cputime_account_end().
 */
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
{
        struct cgroup_rstat_cpu *rstatc = get_cpu_ptr(cgrp->rstat_cpu);

        u64_stats_update_begin(&rstatc->bsync);

        return rstatc;
}

/*
 * Finish an update started by cgroup_base_stat_cputime_account_begin():
 * close the bsync write section, mark @cgrp as updated on the current cpu
 * and release @rstatc with put_cpu_ptr().
 */
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc)
{
        u64_stats_update_end(&rstatc->bsync);
        /* done before put_cpu_ptr() so the cpu id still matches @rstatc */
        cgroup_rstat_updated(cgrp, smp_processor_id());
        put_cpu_ptr(rstatc);
}

/* Charge @delta_exec to @cgrp's sum_exec_runtime on the current cpu. */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc =
                cgroup_base_stat_cputime_account_begin(cgrp);

        rstatc->bstat.cputime.sum_exec_runtime += delta_exec;

        cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

/*
 * Charge @delta_exec to @cgrp's per-cpu utime or stime depending on
 * @index: user and nice time count as utime; system, irq and softirq time
 * count as stime.  Any other @index is not accounted, but @cgrp is still
 * marked as updated.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc =
                cgroup_base_stat_cputime_account_begin(cgrp);

        if (index == CPUTIME_USER || index == CPUTIME_NICE)
                rstatc->bstat.cputime.utime += delta_exec;
        else if (index == CPUTIME_SYSTEM || index == CPUTIME_IRQ ||
                 index == CPUTIME_SOFTIRQ)
                rstatc->bstat.cputime.stime += delta_exec;

        cgroup_base_stat_cputime_account_end(cgrp, rstatc);
}

/*
 * Build the root cgroup's cputime from the per-cpu kernel cpustat of every
 * possible cpu, bucketing the fields the same way
 * __cgroup_account_cputime_field() does for cpu time attributed to a
 * cgroup.
 */
static void root_cgroup_cputime(struct task_cputime *cputime)
{
        int cpu;

        cputime->stime = 0;
        cputime->utime = 0;
        cputime->sum_exec_runtime = 0;

        for_each_possible_cpu(cpu) {
                struct kernel_cpustat kcpustat;
                u64 user, sys;

                kcpustat_cpu_fetch(&kcpustat, cpu);

                user = kcpustat.cpustat[CPUTIME_USER] +
                       kcpustat.cpustat[CPUTIME_NICE];
                sys = kcpustat.cpustat[CPUTIME_SYSTEM] +
                      kcpustat.cpustat[CPUTIME_IRQ] +
                      kcpustat.cpustat[CPUTIME_SOFTIRQ];

                cputime->utime += user;
                cputime->stime += sys;

                /* the total additionally includes steal time */
                cputime->sum_exec_runtime += user + sys +
                                             kcpustat.cpustat[CPUTIME_STEAL];
        }
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        u64 usage, utime, stime;
        struct task_cputime cputime;

        if (cgroup_parent(cgrp)) {
                cgroup_rstat_flush_hold(cgrp);
                usage = cgrp->bstat.cputime.sum_exec_runtime;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &utime, &stime);
                cgroup_rstat_flush_release();
        } else {
                root_cgroup_cputime(&cputime);
                usage = cputime.sum_exec_runtime;
                utime = cputime.utime;
                stime = cputime.stime;
        }

        do_div(usage, NSEC_PER_USEC);
        do_div(utime, NSEC_PER_USEC);
        do_div(stime, NSEC_PER_USEC);

        seq_printf(seq, "usage_usec %llu\n"
                   "user_usec %llu\n"
                   "system_usec %llu\n",
                   usage, utime, stime);
}




























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_PGALLOC_H
#define __ASM_GENERIC_PGALLOC_H

#ifdef CONFIG_MMU

#define GFP_PGTABLE_KERNEL        (GFP_KERNEL | __GFP_ZERO)
#define GFP_PGTABLE_USER        (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

/**
 * __pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
{
        return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL);
}

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
 * pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * Generic implementation, compiled only when the architecture does not
 * define %__HAVE_ARCH_PTE_ALLOC_ONE_KERNEL.  Simply forwards to
 * __pte_alloc_one_kernel().
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
        return __pte_alloc_one_kernel(mm);
}
#endif

/**
 * pte_free_kernel - free PTE-level kernel page table page
 * @mm: the mm_struct of the current context (not used by this implementation)
 * @pte: pointer to the memory containing the page table
 */
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
        free_page((unsigned long)pte);
}

/**
 * __pte_alloc_one - allocate a page for PTE-level user page table
 * @mm: the mm_struct of the current context
 * @gfp: GFP flags to use for the allocation
 *
 * Allocates a page and runs the pgtable_pte_page_ctor() on it.  If the
 * constructor fails, the page is freed again.
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation or must have custom GFP flags.
 *
 * Return: `struct page` initialized as page table or %NULL on error
 */
static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
{
        struct page *page = alloc_page(gfp);

        if (page && !pgtable_pte_page_ctor(page)) {
                __free_page(page);
                page = NULL;
        }

        return page;
}

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE
/**
 * pte_alloc_one - allocate a page for PTE-level user page table
 * @mm: the mm_struct of the current context
 *
 * Allocates a page and runs the pgtable_pte_page_ctor().  Generic
 * implementation, compiled only when the architecture does not define
 * %__HAVE_ARCH_PTE_ALLOC_ONE; it calls __pte_alloc_one() with
 * %GFP_PGTABLE_USER.
 *
 * Return: `struct page` initialized as page table or %NULL on error
 */
static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}
#endif

/*
 * Should really implement gc for free page table pages. This could be
 * done with a reference count in struct page.
 */

/**
 * pte_free - free PTE-level user page table page
 * @mm: the mm_struct of the current context
 * @pte_page: the `struct page` representing the page table
 *
 * Runs pgtable_pte_page_dtor() on @pte_page and then frees the page.
 */
static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
{
        /* undo the page table constructor before the page is released */
        pgtable_pte_page_dtor(pte_page);
        __free_page(pte_page);
}


#if CONFIG_PGTABLE_LEVELS > 2

#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
/**
 * pmd_alloc_one - allocate a page for PMD-level page table
 * @mm: the mm_struct of the current context
 * @addr: not used by this generic implementation
 *
 * Allocates a page and runs the pgtable_pmd_page_ctor() on it; the page is
 * freed again if the constructor fails.
 * Allocations use %GFP_PGTABLE_KERNEL when @mm is &init_mm and
 * %GFP_PGTABLE_USER otherwise.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
        struct page *page = alloc_pages(gfp, 0);

        if (!page)
                return NULL;

        if (pgtable_pmd_page_ctor(page))
                return (pmd_t *)page_address(page);

        /* constructor failed; give the page back */
        __free_pages(page, 0);
        return NULL;
}
#endif

#ifndef __HAVE_ARCH_PMD_FREE
/**
 * pmd_free - free PMD-level page table page
 * @mm: the mm_struct of the current context (not used by this implementation)
 * @pmd: pointer to the memory containing the page table
 *
 * Runs pgtable_pmd_page_dtor() on the backing page and then frees it.
 * Triggers BUG() if @pmd is not page aligned.
 */
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
        BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
        pgtable_pmd_page_dtor(virt_to_page(pmd));
        free_page((unsigned long)pmd);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3

#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
/**
 * pud_alloc_one - allocate a page for PUD-level page table
 * @mm: the mm_struct of the current context
 * @addr: not used by this generic implementation
 *
 * Allocates a zeroed page using %GFP_PGTABLE_KERNEL when @mm is &init_mm
 * and %GFP_PGTABLE_USER otherwise.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;

        return (pud_t *)get_zeroed_page(gfp);
}
#endif

/**
 * pud_free - free PUD-level page table page
 * @mm: the mm_struct of the current context (not used by this implementation)
 * @pud: pointer to the memory containing the page table
 *
 * Triggers BUG() if @pud is not page aligned.
 */
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
        BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
        free_page((unsigned long)pud);
}

#endif /* CONFIG_PGTABLE_LEVELS > 3 */

#ifndef __HAVE_ARCH_PGD_FREE
/**
 * pgd_free - free PGD-level page table page
 * @mm: the mm_struct of the current context (not used by this implementation)
 * @pgd: pointer to the memory containing the page table
 */
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        free_page((unsigned long)pgd);
}
#endif

#endif /* CONFIG_MMU */

#endif /* __ASM_GENERIC_PGALLOC_H */

































    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif

#ifdef CONFIG_HIGHPTE
#define PGTABLE_HIGHMEM __GFP_HIGHMEM
#else
#define PGTABLE_HIGHMEM 0
#endif

#ifndef CONFIG_PARAVIRT
/*
 * Fallback used when CONFIG_PARAVIRT is not set: hand @table to
 * tlb_remove_page().
 */
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        tlb_remove_page(tlb, table);
}
#endif

gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;

/*
 * Allocate a user PTE page with the flags in __userpte_alloc_gfp, which
 * the "userpte=" early parameter can adjust at boot (see setup_userpte()).
 */
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        return __pte_alloc_one(mm, __userpte_alloc_gfp);
}

/*
 * Handler for the "userpte=" early parameter.  The only accepted value is
 * "nohigh", which disables allocation of user pagetables in high memory.
 *
 * Returns 0 on success, -EINVAL for a missing or unknown argument.
 */
static int __init setup_userpte(char *arg)
{
        if (!arg || strcmp(arg, "nohigh") != 0)
                return -EINVAL;

        __userpte_alloc_gfp &= ~__GFP_HIGHMEM;

        return 0;
}
early_param("userpte", setup_userpte);

/*
 * Release a PTE page table page through the mmu_gather @tlb: run the page
 * table destructor, notify paravirt of the release and queue the page via
 * paravirt_tlb_remove_table().
 */
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        pgtable_pte_page_dtor(pte);
        paravirt_release_pte(page_to_pfn(pte));
        paravirt_tlb_remove_table(tlb, pte);
}

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * Release a PMD page table page through the mmu_gather @tlb: notify
 * paravirt of the release, run the pmd page destructor and queue the page
 * via paravirt_tlb_remove_table().
 */
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        struct page *page = virt_to_page(pmd);
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        /*
         * NOTE! For PAE, any changes to the top page-directory-pointer-table
         * entries need a full cr3 reload to flush.
         */
#ifdef CONFIG_X86_PAE
        tlb->need_flush_all = 1;
#endif
        pgtable_pmd_page_dtor(page);
        paravirt_tlb_remove_table(tlb, page);
}

#if CONFIG_PGTABLE_LEVELS > 3
/*
 * Release a PUD page table page through the mmu_gather @tlb: notify
 * paravirt of the release and queue the backing page via
 * paravirt_tlb_remove_table().
 */
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        struct page *page = virt_to_page(pud);

        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, page);
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * Release a P4D page table page through the mmu_gather @tlb: notify
 * paravirt of the release and queue the backing page via
 * paravirt_tlb_remove_table().
 */
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
        struct page *page = virt_to_page(p4d);

        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, page);
}
#endif        /* CONFIG_PGTABLE_LEVELS > 4 */
#endif        /* CONFIG_PGTABLE_LEVELS > 3 */
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

/* Put the page backing @pgd on pgd_list, linked through page->lru. */
static inline void pgd_list_add(pgd_t *pgd)
{
        list_add(&virt_to_page(pgd)->lru, &pgd_list);
}

/* Take the page backing @pgd off the list it is linked on through page->lru. */
static inline void pgd_list_del(pgd_t *pgd)
{
        list_del(&virt_to_page(pgd)->lru);
}

#define UNSHARED_PTRS_PER_PGD                                \
        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD                        \
        max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)


/* Record @mm as the owner in the struct page backing @pgd (page->pt_mm). */
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        struct page *page = virt_to_page(pgd);

        page->pt_mm = mm;
}

/* Return the mm stored in @page->pt_mm (set by pgd_set_mm() for pgd pages). */
struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return page->pt_mm;
}

/*
 * Initialize a freshly allocated @pgd for @mm: clone the kernel part of
 * swapper_pg_dir where applicable and, when the kernel pmd is not shared,
 * record the owning mm and put the pgd on pgd_list.
 *
 * NOTE(review): pgd_list_add() is called here without taking pgd_lock,
 * unlike pgd_dtor(); presumably the caller holds it - confirm.
 */
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        /* If the pgd points to a shared pagetable level (either the
           ptes in non-PAE, or shared PMD in PAE), then just copy the
           references from swapper_pg_dir. */
        if (CONFIG_PGTABLE_LEVELS == 2 ||
            (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
            CONFIG_PGTABLE_LEVELS >= 4) {
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);
        }

        /* list required to sync kernel mapping updates */
        if (!SHARED_KERNEL_PMD) {
                pgd_set_mm(pgd, mm);
                pgd_list_add(pgd);
        }
}

/*
 * Undo pgd_ctor()'s list registration: when the kernel pmd is not shared,
 * remove @pgd from pgd_list under pgd_lock.  Nothing to do otherwise.
 */
static void pgd_dtor(pgd_t *pgd)
{
        if (!SHARED_KERNEL_PMD) {
                spin_lock(&pgd_lock);
                pgd_list_del(pgd);
                spin_unlock(&pgd_lock);
        }
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
#define PREALLOCATED_PMDS        UNSHARED_PTRS_PER_PGD
#define MAX_PREALLOCATED_PMDS        MAX_UNSHARED_PTRS_PER_PGD

/*
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS         (boot_cpu_has(X86_FEATURE_PTI) ? \
                                        KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

/*
 * PAE only: install @pmd into the top-level entry @pudp of @mm, notify
 * paravirt of the new pmd page and flush @mm's TLB afterwards.
 */
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS        0
#define MAX_PREALLOCATED_PMDS        0
#define PREALLOCATED_USER_PMDS         0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif        /* CONFIG_X86_PAE */

/*
 * Free every non-NULL pmd page in @pmds[0..@count): run the pmd page
 * destructor, free the page and drop @mm's pmd count.
 */
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int i;

        for (i = 0; i < count; i++) {
                pmd_t *pmd = pmds[i];

                if (!pmd)
                        continue;

                pgtable_pmd_page_dtor(virt_to_page(pmd));
                free_page((unsigned long)pmd);
                mm_dec_nr_pmds(mm);
        }
}

/*
 * Allocate @count pmd pages for @mm into @pmds.  Every slot of @pmds is
 * written (NULL for a failed allocation) so that a partial failure can be
 * unwound with free_pmds().  __GFP_ACCOUNT is not used for init_mm.
 *
 * Returns 0 on success, or -ENOMEM after freeing whatever was allocated.
 */
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        gfp_t gfp = (mm == &init_mm) ? (GFP_PGTABLE_USER & ~__GFP_ACCOUNT)
                                     : GFP_PGTABLE_USER;
        bool failed = false;
        int i;

        for (i = 0; i < count; i++) {
                pmd_t *pmd = (pmd_t *)__get_free_page(gfp);

                /* a page whose constructor fails counts as a failed allocation */
                if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
                        free_page((unsigned long)pmd);
                        pmd = NULL;
                }

                if (pmd)
                        mm_inc_nr_pmds(mm);
                else
                        failed = true;

                pmds[i] = pmd;
        }

        if (!failed)
                return 0;

        free_pmds(mm, pmds, count);
        return -ENOMEM;
}

/*
 * Free the pmd page still attached to the pgd entry @pgdp, if any.
 * Normally pmds are freed by munmap/exit_mmap, but a preallocated pmd
 * that never got a corresponding vma has to be freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
        pgd_t entry = *pgdp;
        pmd_t *pmd;

        /* nothing attached to this slot */
        if (pgd_val(entry) == 0)
                return;

        pmd = (pmd_t *)pgd_page_vaddr(entry);

        pgd_clear(pgdp);

        paravirt_release_pmd(pgd_val(entry) >> PAGE_SHIFT);
        pmd_free(mm, pmd);
        mm_dec_nr_pmds(mm);
}

/*
 * Mop up the first PREALLOCATED_PMDS entries of the pgd at @pgdp.
 * With CONFIG_PAGE_TABLE_ISOLATION and X86_FEATURE_PTI present, also mop
 * up PREALLOCATED_USER_PMDS entries of the user copy of the pgd, starting
 * at index KERNEL_PGD_BOUNDARY.
 */
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int idx;

        for (idx = 0; idx < PREALLOCATED_PMDS; idx++)
                mop_up_one_pmd(mm, pgdp + idx);

#ifdef CONFIG_PAGE_TABLE_ISOLATION

        if (boot_cpu_has(X86_FEATURE_PTI)) {
                pgd_t *user_pgdp = kernel_to_user_pgdp(pgdp);

                for (idx = 0; idx < PREALLOCATED_USER_PMDS; idx++)
                        mop_up_one_pmd(mm,
                                       user_pgdp + KERNEL_PGD_BOUNDARY + idx);
        }
#endif
}

/*
 * Install the preallocated pmd pages from @pmds into the first
 * PREALLOCATED_PMDS pud slots reached through @pgd.  For indices at or
 * above KERNEL_PGD_BOUNDARY the pmd page is first filled with a copy of
 * the pmd page referenced by the matching swapper_pg_dir entry.
 */
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        pud_t *pud_base;
        int idx;

        if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
                return;

        pud_base = pud_offset(p4d_offset(pgd, 0), 0);

        for (idx = 0; idx < PREALLOCATED_PMDS; idx++) {
                pmd_t *pmd = pmds[idx];

                if (idx >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd,
                               (pmd_t *)pgd_page_vaddr(swapper_pg_dir[idx]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, &pud_base[idx], pmd);
        }
}

#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
 * Populate the user copy of @k_pgd with the preallocated pmd pages in
 * @pmds.  Each of the PREALLOCATED_USER_PMDS pages is filled with a copy
 * of the pmd page referenced by the user copy of swapper_pg_dir and then
 * installed, both starting at index KERNEL_PGD_BOUNDARY.
 */
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
        pgd_t *src_pgd = kernel_to_user_pgdp(swapper_pg_dir);
        pgd_t *user_pgd = kernel_to_user_pgdp(k_pgd);
        pud_t *user_pud;
        int idx;

        user_pud = pud_offset(p4d_offset(user_pgd, 0), 0);

        src_pgd += KERNEL_PGD_BOUNDARY;
        user_pud += KERNEL_PGD_BOUNDARY;

        for (idx = 0; idx < PREALLOCATED_USER_PMDS; idx++) {
                pmd_t *pmd = pmds[idx];

                memcpy(pmd, (pmd_t *)pgd_page_vaddr(src_pgd[idx]),
                       sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, &user_pud[idx], pmd);
        }
}
#else
/* Without CONFIG_PAGE_TABLE_ISOLATION this is a no-op. */
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
 * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
 * assumes that pgd should be in one page.
 *
 * But kernel with PAE paging that is not running as a Xen domain
 * only needs to allocate 32 bytes for pgd instead of one page.
 */
#ifdef CONFIG_X86_PAE

#include <linux/slab.h>

/* Size in bytes of a pgd table holding PTRS_PER_PGD entries. */
#define PGD_SIZE        (PTRS_PER_PGD * sizeof(pgd_t))
/* Alignment passed to kmem_cache_create() for pgd_cache. */
#define PGD_ALIGN        32

/*
 * Slab cache used for pgd allocation when SHARED_KERNEL_PMD is set.
 * Created by pgtable_cache_init(); not created otherwise.
 */
static struct kmem_cache *pgd_cache;

/*
 * Create the slab cache used for PAE pgd tables.
 *
 * A PAE kernel running as a Xen domain does not use a shared kernel pmd
 * and needs a whole page per pgd, so no cache is created in that case.
 * Otherwise the shared kernel pmd means a pgd needs only PGD_SIZE bytes,
 * and a dedicated slab is set up at boot for those allocations.
 */
void __init pgtable_cache_init(void)
{
        if (SHARED_KERNEL_PMD)
                pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE,
                                              PGD_ALIGN, SLAB_PANIC, NULL);
}

static inline pgd_t *_pgd_alloc(void)
{
        /*
         * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
         * We allocate one page for pgd.
         */
        if (!SHARED_KERNEL_PMD)
                return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
                                                 PGD_ALLOCATION_ORDER);

        /*
         * Now PAE kernel is not running as a Xen domain. We can allocate
         * a 32-byte slab for pgd to save memory space.
         */
        return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
}

/*
 * Free a pgd obtained from _pgd_alloc() (PAE variant), using the release
 * routine that matches how it was allocated.
 */
static inline void _pgd_free(pgd_t *pgd)
{
        if (SHARED_KERNEL_PMD) {
                kmem_cache_free(pgd_cache, pgd);
                return;
        }

        free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#else

static inline pgd_t *_pgd_alloc(void)
{
        return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
                                         PGD_ALLOCATION_ORDER);
}

/* Free the PGD_ALLOCATION_ORDER pages backing @pgd (non-PAE variant). */
static inline void _pgd_free(pgd_t *pgd)
{
        free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */

/*
 * Allocate and initialise a pgd for @mm.
 *
 * Allocates the pgd itself, then the kernel and user preallocated pmd
 * pages, then calls the paravirt hook.  Only once all of these have
 * succeeded is the pgd constructed and populated, under pgd_lock.
 *
 * Returns the new pgd, or NULL on failure, in which case everything
 * allocated here has been freed again.
 *
 * NOTE(review): mm->pgd is assigned before the later steps and is not
 * reset on the failure paths; presumably callers ignore it when NULL is
 * returned -- confirm.
 */
pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
        pmd_t *pmds[MAX_PREALLOCATED_PMDS];

        pgd = _pgd_alloc();

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        /* On failure preallocate_pmds() has already freed its own pages. */
        if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
                goto out_free_pgd;

        if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
                goto out_free_pmds;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_user_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

        pgd_ctor(mm, pgd);
        pgd_prepopulate_pmd(mm, pgd, pmds);
        pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

        spin_unlock(&pgd_lock);

        return pgd;

        /* Unwind in the reverse order of allocation. */
out_free_user_pmds:
        free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
        free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
        _pgd_free(pgd);
out:
        return NULL;
}

/*
 * Tear down and free @pgd: mop up any preallocated pmd pages still
 * attached to it, run the pgd destructor and the paravirt free hook,
 * then release the pgd memory itself.
 */
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
        _pgd_free(pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 *
 * Returns 1 if *ptep differed from @entry, 0 otherwise.  The PTE is
 * written only when it differed and @dirty is non-zero.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        if (pte_same(*ptep, entry))
                return 0;

        if (dirty)
                set_pte(ptep, entry);

        return 1;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PMD counterpart of ptep_set_access_flags().  @address must be aligned
 * to a huge PMD boundary.
 *
 * Returns 1 if *pmdp differed from @entry, 0 otherwise.  The PMD is
 * written only when it differed and @dirty is non-zero.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (pmd_same(*pmdp, entry))
                return 0;

        /*
         * We had a write-protection fault here and changed the pmd
         * to be more permissive. No need to flush the TLB for that,
         * #PF is architecturally guaranteed to do that and in the
         * worst-case we'll generate a spurious fault.
         */
        if (dirty)
                set_pmd(pmdp, entry);

        return 1;
}

/*
 * PUD counterpart of ptep_set_access_flags().  @address must be aligned
 * to a huge PUD boundary.
 *
 * Returns 1 if *pudp differed from @entry, 0 otherwise.  The PUD is
 * written only when it differed and @dirty is non-zero.
 */
int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pud_t *pudp, pud_t entry, int dirty)
{
        VM_BUG_ON(address & ~HPAGE_PUD_MASK);

        if (pud_same(*pudp, entry))
                return 0;

        /*
         * We had a write-protection fault here and changed the pud
         * to be more permissive. No need to flush the TLB for that,
         * #PF is architecturally guaranteed to do that and in the
         * worst-case we'll generate a spurious fault.
         */
        if (dirty)
                set_pud(pudp, entry);

        return 1;
}
#endif

/*
 * Clear the accessed bit of *ptep if it is set.
 *
 * Returns 0 when the PTE was not young; otherwise the result of the
 * atomic test-and-clear of _PAGE_BIT_ACCESSED.
 */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        if (!pte_young(*ptep))
                return 0;

        return test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                  (unsigned long *) &ptep->pte);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Clear the accessed bit of *pmdp if it is set.
 *
 * Returns 0 when the PMD was not young; otherwise the result of the
 * atomic test-and-clear of _PAGE_BIT_ACCESSED.
 */
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        if (!pmd_young(*pmdp))
                return 0;

        return test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                  (unsigned long *)pmdp);
}
/*
 * Clear the accessed bit of *pudp if it is set.
 *
 * Returns 0 when the PUD was not young; otherwise the result of the
 * atomic test-and-clear of _PAGE_BIT_ACCESSED.
 */
int pudp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pud_t *pudp)
{
        if (!pud_young(*pudp))
                return 0;

        return test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                  (unsigned long *)pudp);
}
#endif

/*
 * Clear the accessed bit of *ptep without flushing the TLB.
 *
 * Returns whatever ptep_test_and_clear_young() returns.
 */
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        int young;

        /*
         * On x86 CPUs, clearing the accessed bit without a TLB flush
         * doesn't cause data corruption. [ It could cause incorrect
         * page aging and the (mistaken) reclaim of hot pages, but the
         * chance of that should be relatively low. ]
         *
         * So as a performance optimization don't flush the TLB when
         * clearing the accessed bit, it will eventually be flushed by
         * a context switch or a VM operation anyway. [ In the rare
         * event of it not getting flushed for a long time the delay
         * shouldn't really matter because there's no real memory
         * pressure for swapout to react to. ]
         */
        young = ptep_test_and_clear_young(vma, address, ptep);

        return young;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Clear the accessed bit of *pmdp and, if it was set, flush the TLB for
 * the HPAGE_PMD_SIZE range starting at @address.  @address must be
 * aligned to a huge PMD boundary.
 *
 * Returns the value reported by pmdp_test_and_clear_young().
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (!young)
                return young;

        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return young;
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 *
 * Only has an effect on CONFIG_X86_32, where it must be called before
 * any fixmap has been set (it BUG()s otherwise).
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        /* Start of the hole, rounded down to a PMD boundary. */
        unsigned long hole_start = round_down(-reserve, 1 << PMD_SHIFT);

        BUG_ON(fixmaps_set > 0);
        __FIXADDR_TOP = hole_start - PAGE_SIZE;
        printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
               -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

/*
 * Incremented by __native_set_fixmap() for every fixmap entry it sets;
 * reserve_top_address() BUG()s on 32-bit if it is already non-zero.
 */
int fixmaps_set;

/*
 * Install @pte at the virtual address of fixmap slot @idx and bump
 * fixmaps_set.  An @idx at or beyond __end_of_fixed_addresses is a BUG().
 */
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
        unsigned long vaddr;

#ifdef CONFIG_X86_64
       /*
        * Ensure that the static initial page tables are covering the
        * fixmap completely.
        */
        BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
                     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }

        vaddr = __fix_to_virt(idx);
        set_pte_vaddr(vaddr, pte);
        fixmaps_set++;
}

/*
 * Map physical address @phys at fixmap slot @idx with protection @flags,
 * after masking @flags with __default_kernel_pte_mask.
 */
void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
                       phys_addr_t phys, pgprot_t flags)
{
        pte_t pte;

        /* Sanitize 'flags' against any unsupported bits: */
        pgprot_val(flags) &= __default_kernel_pte_mask;

        pte = pfn_pte(phys >> PAGE_SHIFT, flags);
        __native_set_fixmap(idx, pte);
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
 * p4d_set_huge - setup kernel P4D mapping
 * @p4d: unused
 * @addr: unused
 * @prot: unused
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}

/**
 * p4d_clear_huge - clear kernel P4D mapping when it is set
 * @p4d: unused
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_clear_huge(p4d_t *p4d)
{
        return 0;
}
#endif

/**
 * pud_set_huge - setup kernel PUD mapping
 * @pud: PUD entry to write
 * @addr: physical address to map
 * @prot: 4K-style protection bits; converted to the large-page form
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if any of the following conditions are met:
 *
 * - MTRRs are disabled, or
 *
 * - MTRRs are enabled and the range is completely covered by a single MTRR, or
 *
 * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
 *   has no effect on the requested PAT memory type.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        u8 type, uniform;
        pgprot_t large_prot;

        type = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
        if (type != MTRR_TYPE_INVALID && type != MTRR_TYPE_WRBACK && !uniform)
                return 0;

        /* Bail out if we are on a populated non-leaf entry: */
        if (pud_present(*pud) && !pud_huge(*pud))
                return 0;

        large_prot = __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE);
        set_pte((pte_t *)pud, pfn_pte((u64)addr >> PAGE_SHIFT, large_prot));

        return 1;
}

/**
 * pmd_set_huge - setup kernel PMD mapping
 * @pmd: PMD entry to write
 * @addr: physical address to map
 * @prot: 4K-style protection bits; converted to the large-page form
 *
 * See text over pud_set_huge() above.  Unlike pud_set_huge(), a refusal
 * caused by the MTRR check is reported once via pr_warn_once().
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        u8 type, uniform;
        pgprot_t large_prot;

        type = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
        if (type != MTRR_TYPE_INVALID && type != MTRR_TYPE_WRBACK && !uniform) {
                pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
                             __func__, addr, addr + PMD_SIZE);
                return 0;
        }

        /* Bail out if we are on a populated non-leaf entry: */
        if (pmd_present(*pmd) && !pmd_huge(*pmd))
                return 0;

        large_prot = __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE);
        set_pte((pte_t *)pmd, pfn_pte((u64)addr >> PAGE_SHIFT, large_prot));

        return 1;
}

/**
 * pud_clear_huge - clear kernel PUD mapping when it is set
 * @pud: PUD entry to examine
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
        if (!pud_large(*pud))
                return 0;

        pud_clear(pud);
        return 1;
}

/**
 * pmd_clear_huge - clear kernel PMD mapping when it is set
 * @pmd: PMD entry to examine
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
        if (!pmd_large(*pmd))
                return 0;

        pmd_clear(pmd);
        return 1;
}

/*
 * Until we support 512GB pages, skip them in the vmap area.
 *
 * Always returns 0; neither @p4d nor @addr is used.
 */
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
        return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 *
 * The allocation is a scratch page holding a copy of the pmd entries;
 * if it cannot be obtained nothing is modified and 0 is returned.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        pmd_t *pmd, *pmd_sv;
        pte_t *pte;
        int i;

        pmd = pud_pgtable(*pud);
        pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
        if (!pmd_sv)
                return 0;

        /*
         * Save every pmd entry before clearing it, so the pte pages can
         * still be located and freed after the flush below.
         */
        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd_sv[i] = pmd[i];
                if (!pmd_none(pmd[i]))
                        pmd_clear(&pmd[i]);
        }

        pud_clear(pud);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        /* Free the pte pages referenced by the saved entries. */
        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(pmd_sv[i])) {
                        pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
                        free_page((unsigned long)pte);
                }
        }

        free_page((unsigned long)pmd_sv);

        /* Finally release the pmd page itself. */
        pgtable_pmd_page_dtor(virt_to_page(pmd));
        free_page((unsigned long)pmd);

        return 1;
}

/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *         This implementation always returns 1.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        pte_t *pte;

        /* Look up the pte page before the entry that points to it is cleared. */
        pte = (pte_t *)pmd_page_vaddr(*pmd);
        pmd_clear(pmd);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        /* The pte page is freed only after the flush. */
        free_page((unsigned long)pte);

        return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * 32-bit variant: nothing is cleared or freed.  Reports success (non-zero)
 * only when the pud entry is already empty.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        int empty = pud_none(*pud);

        return empty;
}

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd pmd entries. See vmalloc_sync_one().
 *
 * Nothing is cleared or freed; success (non-zero) is reported only when
 * the pmd entry is already empty.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        int empty = pmd_none(*pmd);

        return empty;
}

#endif /* CONFIG_X86_64 */
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */





















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMEKEEPING_H
#define _LINUX_TIMEKEEPING_H

#include <linux/errno.h>

/* Included from linux/ktime.h */

void timekeeping_init(void);
extern int timekeeping_suspended;

/* Architecture timer tick functions: */
extern void update_process_times(int user);
extern void xtime_update(unsigned long ticks);

/*
 * Get and set timeofday
 */
extern int do_settimeofday64(const struct timespec64 *ts);
extern int do_sys_settimeofday64(const struct timespec64 *tv,
                                 const struct timezone *tz);

/*
 * ktime_get() family: read the current time in a multitude of ways,
 *
 * The default time reference is CLOCK_MONOTONIC, starting at
 * boot time but not counting the time spent in suspend.
 * For other references, use the functions with "real", "clocktai",
 * "boottime" and "raw" suffixes.
 *
 * To get the time in a different format, use the ones with
 * "ns", "ts64" and "seconds" suffix.
 *
 * See Documentation/core-api/timekeeping.rst for more details.
 */


/*
 * timespec64 based interfaces
 */
extern void ktime_get_raw_ts64(struct timespec64 *ts);
extern void ktime_get_ts64(struct timespec64 *ts);
extern void ktime_get_real_ts64(struct timespec64 *tv);
extern void ktime_get_coarse_ts64(struct timespec64 *ts);
extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);

void getboottime64(struct timespec64 *ts);

/*
 * time64_t base interfaces
 */
extern time64_t ktime_get_seconds(void);
extern time64_t __ktime_get_real_seconds(void);
extern time64_t ktime_get_real_seconds(void);

/*
 * ktime_t based interfaces
 */

enum tk_offsets {
        TK_OFFS_REAL,
        TK_OFFS_BOOT,
        TK_OFFS_TAI,
        TK_OFFS_MAX,
};

extern ktime_t ktime_get(void);
extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs);
extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
extern ktime_t ktime_get_raw(void);
extern u32 ktime_get_resolution_ns(void);

/**
 * ktime_get_real - get the real (wall-) time in ktime_t format
 *
 * Return: ktime_get_with_offset() evaluated for TK_OFFS_REAL.
 */
static inline ktime_t ktime_get_real(void)
{
        ktime_t now = ktime_get_with_offset(TK_OFFS_REAL);

        return now;
}

/* Coarse variant of ktime_get_real(): uses ktime_get_coarse_with_offset(). */
static inline ktime_t ktime_get_coarse_real(void)
{
        ktime_t now = ktime_get_coarse_with_offset(TK_OFFS_REAL);

        return now;
}

/**
 * ktime_get_boottime - Returns monotonic time since boot in ktime_t format
 *
 * This is similar to CLOCK_MONOTONIC/ktime_get, but also includes the
 * time spent in suspend.
 *
 * Return: ktime_get_with_offset() evaluated for TK_OFFS_BOOT.
 */
static inline ktime_t ktime_get_boottime(void)
{
        ktime_t now = ktime_get_with_offset(TK_OFFS_BOOT);

        return now;
}

/* Coarse variant of ktime_get_boottime(): uses ktime_get_coarse_with_offset(). */
static inline ktime_t ktime_get_coarse_boottime(void)
{
        ktime_t now = ktime_get_coarse_with_offset(TK_OFFS_BOOT);

        return now;
}

/**
 * ktime_get_clocktai - Returns the TAI time of day in ktime_t format
 *
 * Return: ktime_get_with_offset() evaluated for TK_OFFS_TAI.
 */
static inline ktime_t ktime_get_clocktai(void)
{
        ktime_t now = ktime_get_with_offset(TK_OFFS_TAI);

        return now;
}

/* Coarse variant of ktime_get_clocktai(): uses ktime_get_coarse_with_offset(). */
static inline ktime_t ktime_get_coarse_clocktai(void)
{
        ktime_t now = ktime_get_coarse_with_offset(TK_OFFS_TAI);

        return now;
}

static inline ktime_t ktime_get_coarse(void)
{
        struct timespec64 ts;

        ktime_get_coarse_ts64(&ts);
        return timespec64_to_ktime(ts);
}

static inline u64 ktime_get_coarse_ns(void)
{
        return ktime_to_ns(ktime_get_coarse());
}

static inline u64 ktime_get_coarse_real_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_real());
}

static inline u64 ktime_get_coarse_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_boottime());
}

static inline u64 ktime_get_coarse_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_clocktai());
}

/**
 * ktime_mono_to_real - Convert monotonic time to clock realtime
 * @mono: monotonic timestamp to convert
 *
 * Return: ktime_mono_to_any() applied to @mono with TK_OFFS_REAL.
 */
static inline ktime_t ktime_mono_to_real(ktime_t mono)
{
        ktime_t real = ktime_mono_to_any(mono, TK_OFFS_REAL);

        return real;
}

static inline u64 ktime_get_ns(void)
{
        return ktime_to_ns(ktime_get());
}

static inline u64 ktime_get_real_ns(void)
{
        return ktime_to_ns(ktime_get_real());
}

static inline u64 ktime_get_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_boottime());
}

static inline u64 ktime_get_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_clocktai());
}

static inline u64 ktime_get_raw_ns(void)
{
        return ktime_to_ns(ktime_get_raw());
}

extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
extern u64 ktime_get_boot_fast_ns(void);
extern u64 ktime_get_real_fast_ns(void);

/*
 * timespec64/time64_t interfaces utilizing the ktime based ones
 * for API completeness, these could be implemented more efficiently
 * if needed.
 */
static inline void ktime_get_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_boottime());
}

static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_boottime());
}

static inline time64_t ktime_get_boottime_seconds(void)
{
        return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC);
}

static inline void ktime_get_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_clocktai());
}

static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_clocktai());
}

static inline time64_t ktime_get_clocktai_seconds(void)
{
        return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC);
}

/*
 * RTC specific
 */
extern bool timekeeping_rtc_skipsuspend(void);
extern bool timekeeping_rtc_skipresume(void);

extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta);

/**
 * struct ktime_timestamps - Simultaneous mono/boot/real timestamps
 * @mono:        Monotonic timestamp
 * @boot:        Boottime timestamp
 * @real:        Realtime timestamp
 *
 * This is the argument type of ktime_get_fast_timestamps().
 */
struct ktime_timestamps {
        u64                mono;
        u64                boot;
        u64                real;
};

/**
 * struct system_time_snapshot - simultaneous raw/real time capture with
 *                                 counter value
 * @cycles:        Clocksource counter value to produce the system times
 * @real:        Realtime system time
 * @raw:        Monotonic raw system time
 * @clock_was_set_seq:        The sequence number of clock was set events
 * @cs_was_changed_seq:        The sequence number of clocksource change events
 *
 * This is the argument type of ktime_get_snapshot() and the type of the
 * @history argument of get_device_system_crosststamp().
 */
struct system_time_snapshot {
        u64                cycles;
        ktime_t                real;
        ktime_t                raw;
        unsigned int        clock_was_set_seq;
        u8                cs_was_changed_seq;
};

/**
 * struct system_device_crosststamp - system/device cross-timestamp
 *                                      (synchronized capture)
 * @device:                Device time
 * @sys_realtime:        Realtime simultaneous with device time
 * @sys_monoraw:        Monotonic raw simultaneous with device time
 *
 * This is the type of the @xtstamp argument of
 * get_device_system_crosststamp().
 */
struct system_device_crosststamp {
        ktime_t device;
        ktime_t sys_realtime;
        ktime_t sys_monoraw;
};

/**
 * struct system_counterval_t - system counter value with the pointer to the
 *                                corresponding clocksource
 * @cycles:        System counter value
 * @cs:                Clocksource corresponding to system counter value. Used by
 *                timekeeping code to verify comparability of two cycle values
 *
 * This is the type handed to the get_time_fn callback of
 * get_device_system_crosststamp().
 */
struct system_counterval_t {
        u64                        cycles;
        struct clocksource        *cs;
};

/*
 * Get cross timestamp between system clock and device clock
 */
extern int get_device_system_crosststamp(
                        int (*get_time_fn)(ktime_t *device_time,
                                struct system_counterval_t *system_counterval,
                                void *ctx),
                        void *ctx,
                        struct system_time_snapshot *history,
                        struct system_device_crosststamp *xtstamp);

/*
 * Simultaneously snapshot realtime and monotonic raw clocks
 */
extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);

/* NMI safe mono/boot/realtime timestamps */
extern void ktime_get_fast_timestamps(struct ktime_timestamps *snap);

/*
 * Persistent clock related interfaces
 */
extern int persistent_clock_is_local;

extern void read_persistent_clock64(struct timespec64 *ts);
void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
                                          struct timespec64 *boot_offset);
extern int update_persistent_clock64(struct timespec64 now);

#endif

















































































































    1 
    1 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/exec.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types  of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */

#include <linux/kernel_read_file.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/signal.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>

#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>

#include <trace/events/task.h>
#include "internal.h"

#include <trace/events/sched.h>

/* Forward declaration. */
static int bprm_creds_from_file(struct linux_binprm *bprm);

int suid_dumpable = 0;

/*
 * List of registered binary format handlers, linked through
 * linux_binfmt::lh.  binfmt_lock is taken for writing when the list is
 * modified and for reading when it is walked.
 */
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);

/*
 * Add a binary format handler to the list of registered formats.
 *
 * @fmt:    handler to register.  A NULL handler is a BUG; a handler without
 *          a load_binary callback triggers a warning and is not registered.
 * @insert: non-zero places the handler at the head of the list, zero appends
 *          it at the tail.
 *
 * The list is updated with binfmt_lock held for writing.
 */
void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
        BUG_ON(!fmt);
        if (WARN_ON(!fmt->load_binary))
                return;

        write_lock(&binfmt_lock);
        if (insert)
                list_add(&fmt->lh, &formats);
        else
                list_add_tail(&fmt->lh, &formats);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(__register_binfmt);

/*
 * Unlink @fmt from the list of registered binary formats.  The list is
 * modified with binfmt_lock held for writing.
 */
void unregister_binfmt(struct linux_binfmt * fmt)
{
        write_lock(&binfmt_lock);
        list_del(&fmt->lh);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(unregister_binfmt);

/*
 * Drop the reference on the module recorded in @fmt->module.  Used as the
 * counterpart of try_module_get(fmt->module) by callers that invoke a
 * handler callback.
 */
static inline void put_binfmt(struct linux_binfmt * fmt)
{
        module_put(fmt->module);
}

bool path_noexec(const struct path *path)
{
        return (path->mnt->mnt_flags & MNT_NOEXEC) ||
               (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}

#ifdef CONFIG_USELIB
/*
 * Note that a shared library must be both readable and executable due to
 * security reasons.
 *
 * Also note that the address to load at is taken from the file itself.
 */
/*
 * uselib(2): open @library for read+exec and offer it to each registered
 * binary format that provides a load_shlib callback, stopping at the first
 * handler that returns something other than -ENOEXEC.
 *
 * Returns the result of the last load_shlib call that was made, -ENOEXEC if
 * no handler accepted the file, -EACCES if the file is not a regular file
 * or lives on a noexec path, or the error from getname()/do_filp_open().
 */
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
        struct linux_binfmt *fmt;
        struct file *file;
        struct filename *tmp = getname(library);
        int error = PTR_ERR(tmp);
        static const struct open_flags uselib_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                .acc_mode = MAY_READ | MAY_EXEC,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };

        /* 'error' already holds PTR_ERR(tmp) for the failure case. */
        if (IS_ERR(tmp))
                goto out;

        file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
        /* The name is released right after the open attempt, success or not. */
        putname(tmp);
        error = PTR_ERR(file);
        if (IS_ERR(file))
                goto out;

        /*
         * may_open() has already checked for this, so it should be
         * impossible to trip now. But we need to be extra cautious
         * and check again at the very end too.
         */
        error = -EACCES;
        if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
                         path_noexec(&file->f_path)))
                goto exit;

        fsnotify_open(file);

        /* Result if no handler below claims the file. */
        error = -ENOEXEC;

        read_lock(&binfmt_lock);
        list_for_each_entry(fmt, &formats, lh) {
                if (!fmt->load_shlib)
                        continue;
                /* Take a module reference before calling into the handler. */
                if (!try_module_get(fmt->module))
                        continue;
                /*
                 * The handler is called with binfmt_lock dropped; the lock
                 * is retaken before the module reference is released and
                 * the list walk continues.
                 */
                read_unlock(&binfmt_lock);
                error = fmt->load_shlib(file);
                read_lock(&binfmt_lock);
                put_binfmt(fmt);
                /* Anything but -ENOEXEC ends the search. */
                if (error != -ENOEXEC)
                        break;
        }
        read_unlock(&binfmt_lock);
exit:
        fput(file);
out:
          return error;
}
#endif /* #ifdef CONFIG_USELIB */

#ifdef CONFIG_MMU
/*
 * The nascent bprm->mm is not visible until exec_mmap() but it can
 * use a lot of memory, so account these pages in current->mm temporarily
 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
 * change the counter back via acct_arg_size(0).
 */
/*
 * Bring the argument-page accounting in current->mm up to @pages.
 *
 * The difference between @pages and the count previously recorded in
 * bprm->vma_pages is added to the MM_ANONPAGES counter of current->mm, and
 * bprm->vma_pages is updated.  Nothing happens when current has no mm or
 * when the count did not change.
 */
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
        struct mm_struct *mm = current->mm;
        long delta;

        if (!mm)
                return;

        delta = (long)(pages - bprm->vma_pages);
        if (delta == 0)
                return;

        bprm->vma_pages = pages;
        add_mm_counter(mm, MM_ANONPAGES, delta);
}

/*
 * Look up (and pin) the page backing address @pos in the new mm (bprm->mm).
 *
 * @write: non-zero requests write access (FOLL_WRITE); in that case the
 *         argument-page accounting is refreshed from the current size of
 *         bprm->vma after a successful lookup.
 *
 * Returns the page, or NULL on failure.  A page obtained here is released
 * with put_arg_page().
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        unsigned int gup_flags = write ? (FOLL_FORCE | FOLL_WRITE) : FOLL_FORCE;
        struct page *page;
        int ret;

#ifdef CONFIG_STACK_GROWSUP
        /* Only writers extend the vma down to cover @pos. */
        if (write) {
                ret = expand_downwards(bprm->vma, pos);
                if (ret < 0)
                        return NULL;
        }
#endif

        /*
         * We are doing an exec().  'current' is the process
         * doing the exec and bprm->mm is the new process's mm.
         */
        ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
                        &page, NULL, NULL);
        if (ret <= 0)
                return NULL;

        if (write)
                acct_arg_size(bprm, vma_pages(bprm->vma));

        return page;
}

/*
 * Release a page obtained from get_arg_page() by dropping its reference
 * with put_page().
 */
static void put_arg_page(struct page *page)
{
        put_page(page);
}

/*
 * Intentionally empty in the CONFIG_MMU build: page references taken by
 * get_arg_page() are dropped one at a time through put_arg_page().
 */
static void free_arg_pages(struct linux_binprm *bprm)
{
}

/*
 * Flush the cache for @page as mapped at user address @pos in bprm->vma,
 * via flush_cache_page().
 */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
        flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}

/*
 * Create the temporary one-page stack vma in bprm->mm.
 *
 * The vma is anonymous, ends at STACK_TOP_MAX and is inserted into the mm
 * with mmap_lock held for writing.  On success bprm->vma points at it and
 * bprm->p is set to just below its end (leaving room for one pointer).
 *
 * Returns 0 on success, -ENOMEM if the vma cannot be allocated, -EINTR if
 * taking the mmap lock is interrupted, or the error from insert_vm_struct().
 * On failure bprm->vma is reset to NULL and the vma is freed.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
        int err;
        struct vm_area_struct *vma = NULL;
        struct mm_struct *mm = bprm->mm;

        bprm->vma = vma = vm_area_alloc(mm);
        if (!vma)
                return -ENOMEM;
        vma_set_anonymous(vma);

        if (mmap_write_lock_killable(mm)) {
                err = -EINTR;
                goto err_free;
        }

        /*
         * Place the stack at the largest stack address the architecture
         * supports. Later, we'll move this to an appropriate place. We don't
         * use STACK_TOP because that can depend on attributes which aren't
         * configured yet.
         */
        BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        err = insert_vm_struct(mm, vma);
        if (err)
                goto err;

        /* The single stack page is the only mapping in the new mm so far. */
        mm->stack_vm = mm->total_vm = 1;
        mmap_write_unlock(mm);
        bprm->p = vma->vm_end - sizeof(void *);
        return 0;
err:
        mmap_write_unlock(mm);
err_free:
        bprm->vma = NULL;
        vm_area_free(vma);
        return err;
}

/*
 * CONFIG_MMU variant: a single argument string is acceptable when its
 * length does not exceed MAX_ARG_STRLEN.  @bprm is not consulted here.
 */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= MAX_ARG_STRLEN;
}

#else

/* !CONFIG_MMU variant: no accounting is performed. */
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}

/*
 * !CONFIG_MMU variant: return the holding page that covers offset @pos in
 * bprm->page[].
 *
 * When the slot is still empty and @write is non-zero, a zeroed page is
 * allocated and stored in the slot first.  Returns NULL if the slot is
 * empty and either @write is zero or the allocation fails.
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        const unsigned long slot = pos / PAGE_SIZE;
        struct page *page = bprm->page[slot];

        /* Existing page, or a read of a slot that was never populated. */
        if (page || !write)
                return page;

        page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
        if (page)
                bprm->page[slot] = page;

        return page;
}

/*
 * !CONFIG_MMU variant: nothing to release per use; the pages stay in
 * bprm->page[] until free_arg_page()/free_arg_pages() frees them.
 */
static void put_arg_page(struct page *page)
{
}

/*
 * Free the holding page in slot @i of bprm->page[], if there is one, and
 * clear the slot.
 */
static void free_arg_page(struct linux_binprm *bprm, int i)
{
        struct page *page = bprm->page[i];

        if (!page)
                return;

        __free_page(page);
        bprm->page[i] = NULL;
}

/*
 * !CONFIG_MMU variant: release every holding page recorded in
 * bprm->page[0 .. MAX_ARG_PAGES - 1].
 */
static void free_arg_pages(struct linux_binprm *bprm)
{
        int slot = 0;

        while (slot < MAX_ARG_PAGES) {
                free_arg_page(bprm, slot);
                slot++;
        }
}

/* !CONFIG_MMU variant: no cache flush is performed. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
}

/*
 * !CONFIG_MMU variant: no vma is created.  bprm->p is set to the top of the
 * MAX_ARG_PAGES holding area, less room for one pointer.  Always returns 0.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
        bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
        return 0;
}

/*
 * !CONFIG_MMU variant: a string is acceptable when its length does not
 * exceed bprm->p, i.e. the space still available below the current
 * position in the holding pages.
 */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= bprm->p;
}

#endif /* CONFIG_MMU */

/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 */
/*
 * Allocate the mm for the new program image and set up its temporary stack.
 *
 * Also snapshots the RLIMIT_STACK value of the current process into
 * bprm->rlim_stack.
 *
 * Returns 0 on success, -ENOMEM if the mm cannot be allocated, or the error
 * from __bprm_mm_init().  On failure bprm->mm is left NULL and any mm that
 * was allocated has been dropped.
 */
static int bprm_mm_init(struct linux_binprm *bprm)
{
        struct mm_struct *mm;
        int err;

        mm = mm_alloc();
        bprm->mm = mm;
        if (!mm)
                return -ENOMEM;

        /* Save current stack limit for all calculations made during exec. */
        task_lock(current->group_leader);
        bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
        task_unlock(current->group_leader);

        err = __bprm_mm_init(bprm);
        if (!err)
                return 0;

        /* Undo the allocation; the caller must not see a half-built mm. */
        bprm->mm = NULL;
        mmdrop(mm);

        return err;
}

/*
 * A user-space pointer to an array of string pointers (argv or envp), in
 * either native or compat layout.  get_user_arg_ptr() reads one element.
 */
struct user_arg_ptr {
#ifdef CONFIG_COMPAT
        /* Selects which member of 'ptr' is valid. */
        bool is_compat;
#endif
        union {
                /* Array of native-width user pointers. */
                const char __user *const __user *native;
#ifdef CONFIG_COMPAT
                /* Array of compat_uptr_t entries. */
                const compat_uptr_t __user *compat;
#endif
        } ptr;
};

/*
 * Fetch element @nr of the user-space pointer array described by @argv.
 *
 * For a compat array the entry is read as a compat_uptr_t and converted
 * with compat_ptr().  Returns the user pointer that was read (which may be
 * NULL), or ERR_PTR(-EFAULT) if the entry cannot be read.
 */
static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
{
        const char __user *uptr;

#ifdef CONFIG_COMPAT
        if (unlikely(argv.is_compat)) {
                compat_uptr_t raw;

                if (get_user(raw, &argv.ptr.compat[nr]))
                        return ERR_PTR(-EFAULT);

                return compat_ptr(raw);
        }
#endif

        if (get_user(uptr, &argv.ptr.native[nr]))
                return ERR_PTR(-EFAULT);

        return uptr;
}

/*
 * count() counts the number of strings in array ARGV.
 */
/*
 * Count the entries of the user-space array @argv up to its NULL
 * terminator.
 *
 * Returns the number of entries (0 for a NULL array), -EFAULT if an entry
 * cannot be read, -E2BIG if more than @max entries are present, or
 * -ERESTARTNOHAND if a fatal signal is pending.
 */
static int count(struct user_arg_ptr argv, int max)
{
        const char __user *p;
        int nr = 0;

        if (argv.ptr.native == NULL)
                return 0;

        while ((p = get_user_arg_ptr(argv, nr)) != NULL) {
                if (IS_ERR(p))
                        return -EFAULT;

                if (nr >= max)
                        return -E2BIG;
                nr++;

                /* The array is user-controlled: stay killable and preemptible. */
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
        }

        return nr;
}

/*
 * Count the entries of the NULL-terminated kernel array @argv.
 *
 * Returns the number of entries (0 for a NULL array), -E2BIG once
 * MAX_ARG_STRINGS entries have been seen without reaching the terminator,
 * or -ERESTARTNOHAND if a fatal signal is pending.
 */
static int count_strings_kernel(const char *const *argv)
{
        int nr = 0;

        if (!argv)
                return 0;

        while (argv[nr]) {
                if (nr >= MAX_ARG_STRINGS)
                        return -E2BIG;
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
                nr++;
        }

        return nr;
}

/*
 * Compute bprm->argmin, the lowest value bprm->p may reach while the
 * argument and environment strings are copied.
 *
 * Uses bprm->rlim_stack, bprm->argc, bprm->envc and bprm->p.  Returns 0 on
 * success, or -E2BIG when the argv/envp pointer arrays alone would use up
 * the whole limit.
 */
static int bprm_stack_limits(struct linux_binprm *bprm)
{
        unsigned long limit, ptr_size;

        /*
         * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
         * (whichever is smaller) for the argv+env strings.
         * This ensures that:
         *  - the remaining binfmt code will not run out of stack space,
         *  - the program will have a reasonable amount of stack left
         *    to work from.
         */
        limit = _STK_LIM / 4 * 3;
        limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
        /*
         * We've historically supported up to 32 pages (ARG_MAX)
         * of argument strings even with small stacks
         */
        limit = max_t(unsigned long, limit, ARG_MAX);
        /*
         * We must account for the size of all the argv and envp pointers to
         * the argv and envp strings, since they will also take up space in
         * the stack. They aren't stored until much later when we can't
         * signal to the parent that the child has run out of stack space.
         * Instead, calculate it here so it's possible to fail gracefully.
         *
         * In the case of argc = 0, make sure there is space for adding an
         * empty string (which will bump argc to 1), to ensure confused
         * userspace programs don't start processing from argv[1], thinking
         * argc can never be 0, to keep them from walking envp by accident.
         * See do_execveat_common().
         */
        ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
        if (limit <= ptr_size)
                return -E2BIG;
        limit -= ptr_size;

        /* Whatever remains of the limit is available for the strings. */
        bprm->argmin = bprm->p - limit;
        return 0;
}

/*
 * 'copy_strings()' copies argument/environment strings from the old
 * process's memory to the new process's stack.  The call to get_user_pages()
 * ensures the destination page is created and not swapped out.
 */
/*
 * @argc: number of strings to copy from @argv.
 * @argv: user-space array of string pointers (native or compat).
 * @bprm: exec state; bprm->p is moved down by the length of every string
 *        copied.
 *
 * Strings are taken from the last array entry to the first, and each string
 * is written from its end towards its start, one destination page at a
 * time.
 *
 * Returns 0 on success, -EFAULT if a pointer or string cannot be read,
 * -E2BIG if a string is too long, the argument area is exhausted or a
 * destination page cannot be obtained, or -ERESTARTNOHAND if a fatal signal
 * is pending.
 */
static int copy_strings(int argc, struct user_arg_ptr argv,
                        struct linux_binprm *bprm)
{
        /*
         * Destination page currently mapped, its kernel address, and the
         * page-aligned stack position it corresponds to.  The mapping is
         * kept across iterations while writes stay within the same page.
         */
        struct page *kmapped_page = NULL;
        char *kaddr = NULL;
        unsigned long kpos = 0;
        int ret;

        while (argc-- > 0) {
                const char __user *str;
                int len;
                unsigned long pos;

                ret = -EFAULT;
                str = get_user_arg_ptr(argv, argc);
                if (IS_ERR(str))
                        goto out;

                /* A zero length is treated as a fault (ret is -EFAULT). */
                len = strnlen_user(str, MAX_ARG_STRLEN);
                if (!len)
                        goto out;

                ret = -E2BIG;
                if (!valid_arg_len(bprm, len))
                        goto out;

                /* We're going to work our way backwards. */
                pos = bprm->p;
                str += len;
                bprm->p -= len;
#ifdef CONFIG_MMU
                /* Refuse to go below the floor set by bprm_stack_limits(). */
                if (bprm->p < bprm->argmin)
                        goto out;
#endif

                while (len > 0) {
                        int offset, bytes_to_copy;

                        if (fatal_signal_pending(current)) {
                                ret = -ERESTARTNOHAND;
                                goto out;
                        }
                        cond_resched();

                        /*
                         * Bytes available in the current page below 'pos';
                         * a page-aligned 'pos' means the whole page below it.
                         */
                        offset = pos % PAGE_SIZE;
                        if (offset == 0)
                                offset = PAGE_SIZE;

                        bytes_to_copy = offset;
                        if (bytes_to_copy > len)
                                bytes_to_copy = len;

                        /* Step source and destination back by this chunk. */
                        offset -= bytes_to_copy;
                        pos -= bytes_to_copy;
                        str -= bytes_to_copy;
                        len -= bytes_to_copy;

                        /* Switch mappings only when crossing into a new page. */
                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                                struct page *page;

                                page = get_arg_page(bprm, pos, 1);
                                if (!page) {
                                        ret = -E2BIG;
                                        goto out;
                                }

                                if (kmapped_page) {
                                        flush_kernel_dcache_page(kmapped_page);
                                        kunmap(kmapped_page);
                                        put_arg_page(kmapped_page);
                                }
                                kmapped_page = page;
                                kaddr = kmap(kmapped_page);
                                kpos = pos & PAGE_MASK;
                                flush_arg_page(bprm, kpos, kmapped_page);
                        }
                        if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
                                ret = -EFAULT;
                                goto out;
                        }
                }
        }
        ret = 0;
out:
        /* Release the last mapped page on both success and failure. */
        if (kmapped_page) {
                flush_kernel_dcache_page(kmapped_page);
                kunmap(kmapped_page);
                put_arg_page(kmapped_page);
        }
        return ret;
}

/*
 * Copy an argument/environment string from the kernel to the process's stack.
 */
/*
 * @arg:  NUL-terminated kernel string to copy.
 * @bprm: exec state; bprm->p is moved down by the string length including
 *        the terminating NUL.
 *
 * The string is written from its end towards its start, one destination
 * page at a time.
 *
 * Returns 0 on success, -EFAULT if the computed length is 0, or -E2BIG if
 * the string is too long, the argument area is exhausted (CONFIG_MMU) or a
 * destination page cannot be obtained.  Note that bprm->p has already been
 * lowered when the later -E2BIG cases are hit.
 */
int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
{
        int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
        unsigned long pos = bprm->p;

        if (len == 0)
                return -EFAULT;
        if (!valid_arg_len(bprm, len))
                return -E2BIG;

        /* We're going to work our way backwards. */
        arg += len;
        bprm->p -= len;
        if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
                return -E2BIG;

        while (len > 0) {
                /*
                 * Copy at most up to the start of the page below 'pos'; a
                 * page-aligned 'pos' allows a full page.
                 */
                unsigned int bytes_to_copy = min_t(unsigned int, len,
                                min_not_zero(offset_in_page(pos), PAGE_SIZE));
                struct page *page;
                char *kaddr;

                pos -= bytes_to_copy;
                arg -= bytes_to_copy;
                len -= bytes_to_copy;

                page = get_arg_page(bprm, pos, 1);
                if (!page)
                        return -E2BIG;
                /* Each chunk is mapped, written and released on its own. */
                kaddr = kmap_atomic(page);
                flush_arg_page(bprm, pos & PAGE_MASK, page);
                memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
                flush_kernel_dcache_page(page);
                kunmap_atomic(kaddr);
                put_arg_page(page);
        }

        return 0;
}
EXPORT_SYMBOL(copy_string_kernel);

/*
 * Copy @argc kernel strings from @argv onto the new stack, last entry
 * first, using copy_string_kernel().
 *
 * Returns 0 on success, the negative error from copy_string_kernel(), or
 * -ERESTARTNOHAND if a fatal signal is pending after a string was copied.
 */
static int copy_strings_kernel(int argc, const char *const *argv,
                               struct linux_binprm *bprm)
{
        int remaining;

        for (remaining = argc; remaining > 0; remaining--) {
                int err = copy_string_kernel(argv[remaining - 1], bprm);

                if (err < 0)
                        return err;
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
        }

        return 0;
}

#ifdef CONFIG_MMU

/*
 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 * the binfmt code determines where the new stack should reside, we shift it to
 * its final location.  The process proceeds as follows:
 *
 * 1) Use shift to calculate the new vma endpoints.
 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 *    arguments passed to subsequent functions are consistent.
 * 3) Move vma's page tables to the new range.
 * 4) Free up any cleared pgd range.
 * 5) Shrink the vma to cover only the new range.
 */
/*
 * @vma:   the temporary stack vma.
 * @shift: number of bytes to move the vma towards lower addresses.
 *
 * Returns 0 on success, -EFAULT if another vma lies between the new and the
 * old location, or -ENOMEM if the vma cannot be extended or the page tables
 * cannot be moved in full.
 */
static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_start = vma->vm_start;
        unsigned long old_end = vma->vm_end;
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
        struct mmu_gather tlb;

        BUG_ON(new_start > new_end);

        /*
         * ensure there are no vmas between where we want to go
         * and where we are
         */
        if (vma != find_vma(mm, new_start))
                return -EFAULT;

        /*
         * cover the whole range: [new_start, old_end)
         */
        if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
                return -ENOMEM;

        /*
         * move the page tables downwards, on failure we rely on
         * process cleanup to remove whatever mess we made.
         */
        if (length != move_page_tables(vma, old_start,
                                       vma, new_start, length, false))
                return -ENOMEM;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, old_start, old_end);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
                free_pgd_range(&tlb, new_end, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
        } else {
                /*
                 * otherwise, clean from old_start; this is done to not touch
                 * the address space in [new_end, old_start) some architectures
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others it's just a little faster.
                 */
                free_pgd_range(&tlb, old_start, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
        }
        tlb_finish_mmu(&tlb, old_start, old_end);

        /*
         * Shrink the vma to just the new range.  Always succeeds.
         */
        vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);

        return 0;
}

/*
 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 * the stack is optionally relocated, and some extra space is added.
 */
/*
 * @bprm:             exec state; bprm->vma is the temporary stack vma.
 * @stack_top:        requested top of the final stack.
 * @executable_stack: EXSTACK_ENABLE_X, EXSTACK_DISABLE_X, or anything else
 *                    to keep the default from VM_STACK_FLAGS.
 *
 * Adjusts bprm->p, bprm->exec and (if set) bprm->loader by the distance the
 * stack is moved, and records mm->arg_start and mm->start_stack.
 *
 * Returns 0 on success; -ENOMEM if the argument area does not fit at the
 * requested location; -EINTR if taking the mmap lock is interrupted; the
 * error from mprotect_fixup() or shift_arg_pages(); or -EFAULT if
 * expand_stack() fails.
 */
int setup_arg_pages(struct linux_binprm *bprm,
                    unsigned long stack_top,
                    int executable_stack)
{
        unsigned long ret;
        unsigned long stack_shift;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = bprm->vma;
        struct vm_area_struct *prev = NULL;
        unsigned long vm_flags;
        unsigned long stack_base;
        unsigned long stack_size;
        unsigned long stack_expand;
        unsigned long rlim_stack;

#ifdef CONFIG_STACK_GROWSUP
        /* Limit stack size */
        stack_base = bprm->rlim_stack.rlim_max;
        if (stack_base > STACK_SIZE_MAX)
                stack_base = STACK_SIZE_MAX;

        /* Add space for stack randomization. */
        stack_base += (STACK_RND_MASK << PAGE_SHIFT);

        /* Make sure we didn't let the argument array grow too large. */
        if (vma->vm_end - vma->vm_start > stack_base)
                return -ENOMEM;

        stack_base = PAGE_ALIGN(stack_top - stack_base);

        stack_shift = vma->vm_start - stack_base;
        mm->arg_start = bprm->p - stack_shift;
        bprm->p = vma->vm_end - stack_shift;
#else
        stack_top = arch_align_stack(stack_top);
        stack_top = PAGE_ALIGN(stack_top);

        /* The argument area must fit between mmap_min_addr and stack_top. */
        if (unlikely(stack_top < mmap_min_addr) ||
            unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
                return -ENOMEM;

        stack_shift = vma->vm_end - stack_top;

        bprm->p -= stack_shift;
        mm->arg_start = bprm->p;
#endif

        /* Addresses recorded against the temporary stack move with it. */
        if (bprm->loader)
                bprm->loader -= stack_shift;
        bprm->exec -= stack_shift;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        vm_flags = VM_STACK_FLAGS;

        /*
         * Adjust stack execute permissions; explicitly enable for
         * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
         * (arch default) otherwise.
         */
        if (unlikely(executable_stack == EXSTACK_ENABLE_X))
                vm_flags |= VM_EXEC;
        else if (executable_stack == EXSTACK_DISABLE_X)
                vm_flags &= ~VM_EXEC;
        vm_flags |= mm->def_flags;
        /* Keep the incomplete-setup marker until the stack has been moved. */
        vm_flags |= VM_STACK_INCOMPLETE_SETUP;

        ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
                        vm_flags);
        if (ret)
                goto out_unlock;
        BUG_ON(prev != vma);

        if (unlikely(vm_flags & VM_EXEC)) {
                pr_warn_once("process '%pD4' started with executable stack\n",
                             bprm->file);
        }

        /* Move stack pages down in memory. */
        if (stack_shift) {
                ret = shift_arg_pages(vma, stack_shift);
                if (ret)
                        goto out_unlock;
        }

        /* mprotect_fixup is overkill to remove the temporary stack flags */
        vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;

        stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
        stack_size = vma->vm_end - vma->vm_start;
        /*
         * Align this down to a page boundary as expand_stack
         * will align it up.
         */
        rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
        /*
         * Pick the address to expand the stack to: stack_expand bytes beyond
         * the current vma, capped so the total does not exceed rlim_stack.
         */
#ifdef CONFIG_STACK_GROWSUP
        if (stack_size + stack_expand > rlim_stack)
                stack_base = vma->vm_start + rlim_stack;
        else
                stack_base = vma->vm_end + stack_expand;
#else
        if (stack_size + stack_expand > rlim_stack)
                stack_base = vma->vm_end - rlim_stack;
        else
                stack_base = vma->vm_start - stack_expand;
#endif
        current->mm->start_stack = bprm->p;
        ret = expand_stack(vma, stack_base);
        /* Any expand_stack() failure is reported as -EFAULT. */
        if (ret)
                ret = -EFAULT;

out_unlock:
        mmap_write_unlock(mm);
        return ret;
}
EXPORT_SYMBOL(setup_arg_pages);

#else

/*
 * Transfer the program arguments and environment from the holding pages
 * onto the stack. The provided stack pointer is adjusted accordingly.
 *
 * @bprm:        binprm whose page[] array holds the argument data; bprm->p
 *               marks where the data begins within that array.
 * @sp_location: in: stack pointer to copy below; out: the new, lower stack
 *               pointer (only written on success).
 *
 * Pages are copied from the highest index down to the page containing
 * bprm->p, each chunk being placed directly below the previous one.
 *
 * Returns 0 on success or -EFAULT if a copy to user space fails.
 */
int transfer_args_to_stack(struct linux_binprm *bprm,
                           unsigned long *sp_location)
{
        unsigned long index, stop, sp;
        int ret = 0;

        stop = bprm->p >> PAGE_SHIFT;
        sp = *sp_location;

        /*
         * Count down with a post-decrement test: for an unsigned index,
         * "index >= stop" can never become false when stop is 0, so the
         * index would wrap and run off the front of bprm->page[].
         */
        for (index = MAX_ARG_PAGES; index-- > stop; ) {
                /* Only the lowest page starts part-way in. */
                unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
                char *src = kmap(bprm->page[index]) + offset;

                sp -= PAGE_SIZE - offset;
                if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
                        ret = -EFAULT;
                kunmap(bprm->page[index]);
                if (ret)
                        goto out;
        }

        bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE;
        *sp_location = sp;

out:
        return ret;
}
EXPORT_SYMBOL(transfer_args_to_stack);

#endif /* CONFIG_MMU */

static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
        struct file *file;
        int err;
        struct open_flags open_exec_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                .acc_mode = MAY_EXEC,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };

        if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
                return ERR_PTR(-EINVAL);
        if (flags & AT_SYMLINK_NOFOLLOW)
                open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                open_exec_flags.lookup_flags |= LOOKUP_EMPTY;

        file = do_filp_open(fd, name, &open_exec_flags);
        if (IS_ERR(file))
                goto out;

        /*
         * may_open() has already checked for this, so it should be
         * impossible to trip now. But we need to be extra cautious
         * and check again at the very end too.
         */
        err = -EACCES;
        if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
                         path_noexec(&file->f_path)))
                goto exit;

        err = deny_write_access(file);
        if (err)
                goto exit;

        if (name->name[0] != '\0')
                fsnotify_open(file);

out:
        return file;

exit:
        fput(file);
        return ERR_PTR(err);
}

/*
 * Open the kernel-supplied path @name for execution, relative to the
 * current working directory. Returns the file or an ERR_PTR().
 */
struct file *open_exec(const char *name)
{
        struct filename *filename = getname_kernel(name);
        struct file *file;

        if (IS_ERR(filename))
                return ERR_CAST(filename);

        file = do_open_execat(AT_FDCWD, filename, 0);
        putname(filename);
        return file;
}
EXPORT_SYMBOL(open_exec);

#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
    defined(CONFIG_BINFMT_ELF_FDPIC)
/*
 * Read @len bytes of @file starting at @pos into the user address @addr.
 * When anything was read, the instruction cache is flushed for the whole
 * requested range [addr, addr + len). Returns the vfs_read() result.
 */
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
        ssize_t nread;

        nread = vfs_read(file, (void __user *)addr, len, &pos);
        if (nread <= 0)
                return nread;

        flush_icache_user_range(addr, addr + len);
        return nread;
}
EXPORT_SYMBOL(read_code);
#endif

/*
 * Maps the mm_struct mm into the current task struct.
 * On success, this function returns with exec_update_lock
 * held for writing.
 *
 * Returns 0 on success, the down_write_killable() error if
 * exec_update_lock could not be taken, or -EINTR if the old mm has a
 * core dump in progress (exec_update_lock is released again in that case).
 */
static int exec_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk;
        struct mm_struct *old_mm, *active_mm;
        int ret;

        /* Notify parent that we're no longer interested in the old VM */
        tsk = current;
        old_mm = current->mm;
        exec_mm_release(tsk, old_mm);
        if (old_mm)
                sync_mm_rss(old_mm);

        ret = down_write_killable(&tsk->signal->exec_update_lock);
        if (ret)
                return ret;

        if (old_mm) {
                /*
                 * Make sure that if there is a core dump in progress
                 * for the old mm, we get out and die instead of going
                 * through with the exec.  We must hold mmap_lock around
                 * checking core_state and changing tsk->mm.
                 */
                mmap_read_lock(old_mm);
                if (unlikely(old_mm->core_state)) {
                        mmap_read_unlock(old_mm);
                        up_write(&tsk->signal->exec_update_lock);
                        return -EINTR;
                }
        }

        task_lock(tsk);
        membarrier_exec_mmap(mm);

        local_irq_disable();
        /* Remember the outgoing active_mm; it is dropped or checked below. */
        active_mm = tsk->active_mm;
        tsk->active_mm = mm;
        tsk->mm = mm;
        /*
         * This prevents preemption while active_mm is being loaded and
         * it and mm are being updated, which could cause problems for
         * lazy tlb mm refcounting when these are updated by context
         * switches. Not all architectures can handle irqs off over
         * activate_mm yet.
         */
        if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        activate_mm(active_mm, mm);
        if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        tsk->mm->vmacache_seqnum = 0;
        vmacache_flush(tsk);
        task_unlock(tsk);
        if (old_mm) {
                /* Pairs with the mmap_read_lock() taken before the switch. */
                mmap_read_unlock(old_mm);
                BUG_ON(active_mm != old_mm);
                setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
                mm_update_next_owner(old_mm);
                mmput(old_mm);
                return 0;
        }
        /* The task had no mm of its own: only the old active_mm is dropped. */
        mmdrop(active_mm);
        return 0;
}

/*
 * Make @tsk the only thread, and the leader, of its thread group.
 *
 * All other threads are zapped and waited for. If @tsk is not the group
 * leader it then waits for the old leader to exit, takes over its
 * identity (pids, start time, list position) and releases it.
 *
 * Returns 0 on success, or -EAGAIN if another group exit is already in
 * progress or @tsk gets a fatal signal while waiting.
 */
static int de_thread(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        struct sighand_struct *oldsighand = tsk->sighand;
        spinlock_t *lock = &oldsighand->siglock;

        if (thread_group_empty(tsk))
                goto no_thread_group;

        /*
         * Kill all other threads in the thread group.
         */
        spin_lock_irq(lock);
        if (signal_group_exit(sig)) {
                /*
                 * Another group action in progress, just
                 * return so that the signal is processed.
                 */
                spin_unlock_irq(lock);
                return -EAGAIN;
        }

        sig->group_exit_task = tsk;
        sig->notify_count = zap_other_threads(tsk);
        /*
         * NOTE(review): presumably this excludes the old leader from the
         * count, since it is waited for separately below - confirm against
         * zap_other_threads().
         */
        if (!thread_group_leader(tsk))
                sig->notify_count--;

        /* Sleep (killable) until notify_count drops to zero. */
        while (sig->notify_count) {
                __set_current_state(TASK_KILLABLE);
                spin_unlock_irq(lock);
                schedule();
                if (__fatal_signal_pending(tsk))
                        goto killed;
                spin_lock_irq(lock);
        }
        spin_unlock_irq(lock);

        /*
         * At this point all other threads have exited, all we have to
         * do is to wait for the thread group leader to become inactive,
         * and to assume its PID:
         */
        if (!thread_group_leader(tsk)) {
                struct task_struct *leader = tsk->group_leader;

                /*
                 * The loop is left via "break" with tasklist_lock
                 * write-held and the cgroup threadgroup change begun;
                 * both are released after the identity switch below.
                 */
                for (;;) {
                        cgroup_threadgroup_change_begin(tsk);
                        write_lock_irq(&tasklist_lock);
                        /*
                         * Do this under tasklist_lock to ensure that
                         * exit_notify() can't miss ->group_exit_task
                         */
                        sig->notify_count = -1;
                        if (likely(leader->exit_state))
                                break;
                        __set_current_state(TASK_KILLABLE);
                        write_unlock_irq(&tasklist_lock);
                        cgroup_threadgroup_change_end(tsk);
                        schedule();
                        if (__fatal_signal_pending(tsk))
                                goto killed;
                }

                /*
                 * The only record we have of the real-time age of a
                 * process, regardless of execs it's done, is start_time.
                 * All the past CPU time is accumulated in signal_struct
                 * from sister threads now dead.  But in this non-leader
                 * exec, nothing survives from the original leader thread,
                 * whose birth marks the true age of this process now.
                 * When we take on its identity by switching to its PID, we
                 * also take its birthdate (always earlier than our own).
                 */
                tsk->start_time = leader->start_time;
                tsk->start_boottime = leader->start_boottime;

                BUG_ON(!same_thread_group(leader, tsk));
                /*
                 * An exec() starts a new thread group with the
                 * TGID of the previous thread group. Rehash the
                 * two threads with a switched PID, and release
                 * the former thread group leader:
                 */

                /* Become a process group leader with the old leader's pid.
                 * The old leader becomes a thread of this thread group.
                 */
                exchange_tids(tsk, leader);
                transfer_pid(leader, tsk, PIDTYPE_TGID);
                transfer_pid(leader, tsk, PIDTYPE_PGID);
                transfer_pid(leader, tsk, PIDTYPE_SID);

                /* Take the old leader's place on the task and sibling lists. */
                list_replace_rcu(&leader->tasks, &tsk->tasks);
                list_replace_init(&leader->sibling, &tsk->sibling);

                tsk->group_leader = tsk;
                leader->group_leader = tsk;

                tsk->exit_signal = SIGCHLD;
                leader->exit_signal = -1;

                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                leader->exit_state = EXIT_DEAD;

                /*
                 * We are going to release_task()->ptrace_unlink() silently,
                 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
                 * the tracer won't block again waiting for this thread.
                 */
                if (unlikely(leader->ptrace))
                        __wake_up_parent(leader, leader->parent);
                write_unlock_irq(&tasklist_lock);
                cgroup_threadgroup_change_end(tsk);

                release_task(leader);
        }

        sig->group_exit_task = NULL;
        sig->notify_count = 0;

no_thread_group:
        /* we have changed execution domain */
        tsk->exit_signal = SIGCHLD;

        BUG_ON(!thread_group_leader(tsk));
        return 0;

killed:
        /* protects against exit_notify() and __exit_signal() */
        read_lock(&tasklist_lock);
        sig->group_exit_task = NULL;
        sig->notify_count = 0;
        read_unlock(&tasklist_lock);
        return -EAGAIN;
}


/*
 * Give @me a private signal handler table.
 *
 * The table can be shared with other processes through the CLONE_SIGHAND
 * option to clone(); flush_signal_handlers() must later be able to reset
 * the handlers without disturbing those processes. If the table is
 * shared, a copy is installed in @me and the reference on the old table
 * is dropped.
 *
 * Returns 0 on success or -ENOMEM if the copy cannot be allocated.
 */
static int unshare_sighand(struct task_struct *me)
{
        struct sighand_struct *oldsighand = me->sighand;
        struct sighand_struct *newsighand;

        /* Only one reference: the table is already private. */
        if (refcount_read(&oldsighand->count) == 1)
                return 0;

        /*
         * This ->sighand is shared with the CLONE_SIGHAND
         * but not CLONE_THREAD task, switch to the new one.
         */
        newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        if (!newsighand)
                return -ENOMEM;

        refcount_set(&newsighand->count, 1);

        /* Copy the actions and switch pointers under both locks. */
        write_lock_irq(&tasklist_lock);
        spin_lock(&oldsighand->siglock);
        memcpy(newsighand->action, oldsighand->action,
               sizeof(newsighand->action));
        rcu_assign_pointer(me->sighand, newsighand);
        spin_unlock(&oldsighand->siglock);
        write_unlock_irq(&tasklist_lock);

        __cleanup_sighand(oldsighand);
        return 0;
}

/*
 * Copy the command name of @tsk into @buf under task_lock().
 *
 * @buf:      destination buffer
 * @buf_size: size of @buf in bytes
 * @tsk:      task whose ->comm is read
 *
 * Returns @buf.
 */
char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
        task_lock(tsk);
        /*
         * strncpy() leaves @buf unterminated when the name does not fit;
         * strscpy_pad() always NUL-terminates and zero-fills the tail.
         */
        strscpy_pad(buf, tsk->comm, buf_size);
        task_unlock(tsk);
        return buf;
}
EXPORT_SYMBOL_GPL(__get_task_comm);

/*
 * These functions flush out all traces of the currently running executable
 * so that a new one can be started.
 */

/*
 * Set the command name of @tsk to @buf (truncated to fit ->comm).
 *
 * @tsk:  task to rename
 * @buf:  new name
 * @exec: passed through to perf_event_comm()
 *
 * The rename tracepoint fires before the copy; both happen under
 * task_lock(). perf is notified after the lock is dropped.
 */
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
        task_lock(tsk);
        trace_task_rename(tsk, buf);
        /*
         * Zero-fill the tail as well, so bytes of a previous, longer name
         * do not remain after the terminator.
         */
        strscpy_pad(tsk->comm, buf, sizeof(tsk->comm));
        task_unlock(tsk);
        perf_event_comm(tsk, exec);
}

/*
 * Calling this is the point of no return. None of the failures will be
 * seen by userspace since either the process is already taking a fatal
 * signal (via de_thread() or coredump), or will have SEGV raised
 * (after exec_mmap()) by search_binary_handler (see below).
 *
 * Returns 0 on success. On success exec_update_lock is still held for
 * writing (taken by exec_mmap()); it is released on the error paths that
 * follow exec_mmap().
 */
int begin_new_exec(struct linux_binprm * bprm)
{
        struct task_struct *me = current;
        int retval;

        /* Once we are committed compute the creds */
        retval = bprm_creds_from_file(bprm);
        if (retval)
                return retval;

        /*
         * Ensure all future errors are fatal.
         */
        bprm->point_of_no_return = true;

        /*
         * Make this the only thread in the thread group.
         */
        retval = de_thread(me);
        if (retval)
                goto out;

        /*
         * Must be called _before_ exec_mmap() as bprm->mm is
         * not visible until then. This also enables the update
         * to be lockless.
         */
        set_mm_exe_file(bprm->mm, bprm->file);

        /* If the binary is not readable then enforce mm->dumpable=0 */
        would_dump(bprm, bprm->file);
        if (bprm->have_execfd)
                would_dump(bprm, bprm->executable);

        /*
         * Release all of the old mmap stuff
         */
        acct_arg_size(bprm, 0);
        retval = exec_mmap(bprm->mm);
        if (retval)
                goto out;

        /* The mm now belongs to the task; stop free_bprm() from dropping it. */
        bprm->mm = NULL;

#ifdef CONFIG_POSIX_TIMERS
        spin_lock_irq(&me->sighand->siglock);
        posix_cpu_timers_exit(me);
        spin_unlock_irq(&me->sighand->siglock);
        exit_itimers(me);
        flush_itimer_signals();
#endif

        /*
         * Make the signal table private.
         */
        retval = unshare_sighand(me);
        if (retval)
                goto out_unlock;

        /*
         * Ensure that the uaccess routines can actually operate on userspace
         * pointers:
         */
        force_uaccess_begin();

        me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
                                        PF_NOFREEZE | PF_NO_SETAFFINITY);
        flush_thread();
        me->personality &= ~bprm->per_clear;

        /*
         * We have to apply CLOEXEC before we change whether the process is
         * dumpable (in setup_new_exec) to avoid a race with a process in userspace
         * trying to access the should-be-closed file descriptors of a process
         * undergoing exec(2).
         */
        do_close_on_exec(me->files);

        if (bprm->secureexec) {
                /* Make sure parent cannot signal privileged process. */
                me->pdeath_signal = 0;

                /*
                 * For secureexec, reset the stack limit to sane default to
                 * avoid bad behavior from the prior rlimits. This has to
                 * happen before arch_pick_mmap_layout(), which examines
                 * RLIMIT_STACK, but after the point of no return to avoid
                 * needing to clean up the change on failure.
                 */
                if (bprm->rlim_stack.rlim_cur > _STK_LIM)
                        bprm->rlim_stack.rlim_cur = _STK_LIM;
        }

        /* Clear the alternate signal stack settings. */
        me->sas_ss_sp = me->sas_ss_size = 0;

        /*
         * Figure out dumpability. Note that this checking only of current
         * is wrong, but userspace depends on it. This should be testing
         * bprm->secureexec instead.
         */
        if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
            !(uid_eq(current_euid(), current_uid()) &&
              gid_eq(current_egid(), current_gid())))
                set_dumpable(current->mm, suid_dumpable);
        else
                set_dumpable(current->mm, SUID_DUMP_USER);

        perf_event_exec();
        __set_task_comm(me, kbasename(bprm->filename), true);

        /* An exec changes our domain. We are no longer part of the thread
           group */
        WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
        flush_signal_handlers(me, 0);

        /*
         * install the new credentials for this executable
         */
        security_bprm_committing_creds(bprm);

        commit_creds(bprm->cred);
        /* Committed: free_bprm() must not abort these creds any more. */
        bprm->cred = NULL;

        /*
         * Disable monitoring for regular users
         * when executing setuid binaries. Must
         * wait until new credentials are committed
         * by commit_creds() above
         */
        if (get_dumpable(me->mm) != SUID_DUMP_USER)
                perf_event_exit_task(me);
        /*
         * cred_guard_mutex must be held at least to this point to prevent
         * ptrace_attach() from altering our determination of the task's
         * credentials; any time after this it may be unlocked.
         */
        security_bprm_committed_creds(bprm);

        /* Pass the opened binary to the interpreter. */
        if (bprm->have_execfd) {
                retval = get_unused_fd_flags(0);
                if (retval < 0)
                        goto out_unlock;
                fd_install(retval, bprm->executable);
                /* The fd table owns the file reference now. */
                bprm->executable = NULL;
                bprm->execfd = retval;
        }
        return 0;

out_unlock:
        /*
         * Failure after exec_mmap(): drop exec_update_lock. If the creds
         * were already committed (bprm->cred == NULL), free_bprm() will not
         * unlock cred_guard_mutex, so do it here.
         */
        up_write(&me->signal->exec_update_lock);
        if (!bprm->cred)
                mutex_unlock(&me->signal->cred_guard_mutex);

out:
        return retval;
}
EXPORT_SYMBOL(begin_new_exec);

/*
 * If the current credentials may not read @file, flag @bprm so that the
 * new image is treated as non-dumpable, and move bprm->mm->user_ns up the
 * namespace hierarchy until it is privileged with respect to the file's
 * inode (or init_user_ns is reached).
 */
void would_dump(struct linux_binprm *bprm, struct file *file)
{
        struct inode *inode = file_inode(file);
        struct user_namespace *orig_ns, *ns;

        if (inode_permission(inode, MAY_READ) >= 0)
                return;

        bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;

        /* Ensure mm->user_ns contains the executable */
        orig_ns = bprm->mm->user_ns;
        for (ns = orig_ns; ns != &init_user_ns; ns = ns->parent) {
                if (privileged_wrt_inode_uidgid(ns, inode))
                        break;
        }

        /* Swap namespace references only if the walk actually moved. */
        if (ns != orig_ns) {
                bprm->mm->user_ns = get_user_ns(ns);
                put_user_ns(orig_ns);
        }
}
EXPORT_SYMBOL(would_dump);

/*
 * Finish the personality-dependent setup of the new image: pick the mmap
 * layout using bprm->rlim_stack, run the arch hook and set the mm's task
 * size. Releases exec_update_lock and cred_guard_mutex before returning.
 */
void setup_new_exec(struct linux_binprm * bprm)
{
        /* Setup things that can depend upon the personality */
        struct task_struct *me = current;

        arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);

        arch_setup_new_exec();

        /* Set the new mm task size. We have to do that late because it may
         * depend on TIF_32BIT which is only updated in flush_thread() on
         * some architectures like powerpc
         */
        me->mm->task_size = TASK_SIZE;
        /* From here on the exec no longer holds either lock. */
        up_write(&me->signal->exec_update_lock);
        mutex_unlock(&me->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(setup_new_exec);

/*
 * Runs immediately before start_thread() takes over: publish the stack
 * rlimit carried in @bprm to the signal struct, under the group leader's
 * task lock.
 */
void finalize_exec(struct linux_binprm *bprm)
{
        struct task_struct *leader = current->group_leader;

        /* Store any stack rlimit changes before starting thread. */
        task_lock(leader);
        current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
        task_unlock(leader);
}
EXPORT_SYMBOL(finalize_exec);

/*
 * Lock ->cred_guard_mutex and prepare the exec credentials in bprm->cred.
 *
 * On success (0) the mutex stays locked. begin_new_exec() commits the
 * creds and setup_new_exec() drops the lock; if exec fails while
 * bprm->cred is still set, free_bprm() aborts the creds and unlocks.
 *
 * Returns -ERESTARTNOINTR if the lock attempt is interrupted, or -ENOMEM
 * (with the mutex released) if the creds cannot be prepared.
 */
static int prepare_bprm_creds(struct linux_binprm *bprm)
{
        struct mutex *guard = &current->signal->cred_guard_mutex;

        if (mutex_lock_interruptible(guard))
                return -ERESTARTNOINTR;

        bprm->cred = prepare_exec_creds();
        if (unlikely(!bprm->cred)) {
                mutex_unlock(guard);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Release everything @bprm still owns and free the structure itself.
 *
 * Each resource is released only if its field is still set, so this can
 * be called on a partially initialised bprm. If bprm->cred is still set,
 * cred_guard_mutex is unlocked and the creds are aborted.
 */
static void free_bprm(struct linux_binprm *bprm)
{
        if (bprm->mm) {
                acct_arg_size(bprm, 0);
                mmput(bprm->mm);
        }
        free_arg_pages(bprm);
        if (bprm->cred) {
                mutex_unlock(&current->signal->cred_guard_mutex);
                abort_creds(bprm->cred);
        }
        if (bprm->file) {
                /* Undo the write denial before dropping the file. */
                allow_write_access(bprm->file);
                fput(bprm->file);
        }
        if (bprm->executable)
                fput(bprm->executable);
        /* If a binfmt changed the interp, free it. */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);
        kfree(bprm->fdpath);
        kfree(bprm);
}

/*
 * Allocate a zeroed binprm for executing @filename relative to @fd.
 *
 * bprm->filename is the given name when @fd is AT_FDCWD or the name is
 * absolute; otherwise it is a "/dev/fd/..." path built in bprm->fdpath.
 * bprm->interp starts out equal to bprm->filename.
 *
 * Returns the new bprm, or an ERR_PTR() (-ENOMEM, or the bprm_mm_init()
 * error) with nothing left allocated.
 */
static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
{
        struct linux_binprm *bprm;
        int retval;

        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
        if (!bprm)
                return ERR_PTR(-ENOMEM);

        if (fd == AT_FDCWD || filename->name[0] == '/') {
                bprm->filename = filename->name;
        } else {
                if (filename->name[0] == '\0')
                        bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
                else
                        bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
                                                  fd, filename->name);
                if (!bprm->fdpath) {
                        retval = -ENOMEM;
                        goto out_free;
                }

                bprm->filename = bprm->fdpath;
        }
        bprm->interp = bprm->filename;

        retval = bprm_mm_init(bprm);
        if (retval)
                goto out_free;

        return bprm;

out_free:
        free_bprm(bprm);
        return ERR_PTR(retval);
}

/*
 * Replace bprm->interp with a private copy of @interp.
 *
 * A previously installed copy (anything other than bprm->filename) is
 * freed first. Returns 0, or -ENOMEM if the copy cannot be allocated, in
 * which case bprm->interp is left NULL.
 */
int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
{
        /* If a binfmt changed the interp, free it first. */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);

        bprm->interp = kstrdup(interp, GFP_KERNEL);
        return bprm->interp ? 0 : -ENOMEM;
}
EXPORT_SYMBOL(bprm_change_interp);

/*
 * determine how safe it is to execute the proposed program
 * - the caller must hold ->cred_guard_mutex to protect against
 *   PTRACE_ATTACH or seccomp thread-sync
 *
 * Sets LSM_UNSAFE_* bits in bprm->unsafe. If the fs_struct is not shared
 * outside the thread group, fs->in_exec is set instead of
 * LSM_UNSAFE_SHARE.
 */
static void check_unsafe_exec(struct linux_binprm *bprm)
{
        struct task_struct *p = current, *t;
        unsigned n_fs;

        if (p->ptrace)
                bprm->unsafe |= LSM_UNSAFE_PTRACE;

        /*
         * This isn't strictly necessary, but it makes it harder for LSMs to
         * mess up.
         */
        if (task_no_new_privs(current))
                bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;

        /* Count the threads of this group (including p) that use p->fs. */
        t = p;
        n_fs = 1;
        spin_lock(&p->fs->lock);
        rcu_read_lock();
        while_each_thread(p, t) {
                if (t->fs == p->fs)
                        n_fs++;
        }
        rcu_read_unlock();

        /* More users than threads counted: someone else shares the fs. */
        if (p->fs->users > n_fs)
                bprm->unsafe |= LSM_UNSAFE_SHARE;
        else
                p->fs->in_exec = 1;
        spin_unlock(&p->fs->lock);
}

/*
 * Apply the set-user-ID / set-group-ID bits of @file to bprm->cred.
 *
 * Nothing is changed when the mount does not allow suid, when the task
 * has no_new_privs set, when neither bit is set, or when the file's uid
 * or gid has no mapping in the creds' user namespace.
 */
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
{
        struct inode *inode;
        unsigned int mode;
        kuid_t uid;
        kgid_t gid;

        if (!mnt_may_suid(file->f_path.mnt) || task_no_new_privs(current))
                return;

        /* Cheap unlocked peek first; most files have neither bit set. */
        inode = file->f_path.dentry->d_inode;
        mode = READ_ONCE(inode->i_mode);
        if ((mode & (S_ISUID|S_ISGID)) == 0)
                return;

        /* Re-read mode, uid and gid as a consistent set under the lock. */
        inode_lock(inode);
        mode = inode->i_mode;
        uid = inode->i_uid;
        gid = inode->i_gid;
        inode_unlock(inode);

        /* We ignore suid/sgid if there are no mappings for them in the ns */
        if (!(kuid_has_mapping(bprm->cred->user_ns, uid) &&
              kgid_has_mapping(bprm->cred->user_ns, gid)))
                return;

        if (mode & S_ISUID) {
                bprm->per_clear |= PER_CLEAR_ON_SETID;
                bprm->cred->euid = uid;
        }

        /* The group id is taken only if group-execute is set as well. */
        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
                bprm->per_clear |= PER_CLEAR_ON_SETID;
                bprm->cred->egid = gid;
        }
}

/*
 * Compute bprm->cred based upon the final binary.
 *
 * The creds come from bprm->executable when bprm->execfd_creds is set,
 * otherwise from bprm->file. Returns the result of the LSM hook.
 */
static int bprm_creds_from_file(struct linux_binprm *bprm)
{
        struct file *file;

        if (bprm->execfd_creds)
                file = bprm->executable;
        else
                file = bprm->file;

        bprm_fill_uid(bprm, file);
        return security_bprm_creds_from_file(bprm, file);
}

/*
 * Load the start of bprm->file into bprm->buf.
 *
 * The buffer is zeroed first, then up to BINPRM_BUF_SIZE bytes are read
 * from offset 0. Returns the kernel_read() result; the caller treats a
 * negative value as an error.
 *
 * This may be called multiple times for binary chains (scripts for example).
 */
static int prepare_binprm(struct linux_binprm *bprm)
{
        loff_t start = 0;

        memset(bprm->buf, 0, BINPRM_BUF_SIZE);

        return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &start);
}

/*
 * Drop the first argument string.
 *
 * Arguments are '\0' separated strings starting at bprm->p; advance
 * bprm->p to just past the first '\0' and decrement bprm->argc.
 *
 * Returns 0 on success (also when there are no arguments), or -EFAULT if
 * an argument page cannot be obtained.
 */
int remove_arg_zero(struct linux_binprm *bprm)
{
        unsigned long offset;
        struct page *page;
        char *kaddr;

        if (!bprm->argc)
                return 0;

        do {
                offset = bprm->p & ~PAGE_MASK;
                page = get_arg_page(bprm, bprm->p, 0);
                if (!page)
                        return -EFAULT;

                kaddr = kmap_atomic(page);

                /* Scan to the terminator or to the end of this page. */
                while (offset < PAGE_SIZE && kaddr[offset]) {
                        offset++;
                        bprm->p++;
                }

                kunmap_atomic(kaddr);
                put_arg_page(page);
        } while (offset == PAGE_SIZE);  /* string continues on next page */

        /* Step over the terminating '\0' itself. */
        bprm->p++;
        bprm->argc--;

        return 0;
}
EXPORT_SYMBOL(remove_arg_zero);

/* Non-zero for tab, newline and the ASCII range 0x20..0x7e. */
#define printable(c) \
        ((c) == '\t' || (c) == '\n' || ((c) >= 0x20 && (c) <= 0x7e))
/*
 * cycle the list of binary formats handler, until one recognizes the image
 *
 * Reads the file header into bprm->buf and runs the LSM check first.
 * Returns the result of the first handler that either passes the point
 * of no return or returns something other than -ENOEXEC; otherwise the
 * last handler result, or -ENOENT if no handler ran.
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
        bool need_retry = IS_ENABLED(CONFIG_MODULES);
        struct linux_binfmt *fmt;
        int retval;

        retval = prepare_binprm(bprm);
        if (retval < 0)
                return retval;

        retval = security_bprm_check(bprm);
        if (retval)
                return retval;

        retval = -ENOENT;
 retry:
        read_lock(&binfmt_lock);
        list_for_each_entry(fmt, &formats, lh) {
                if (!try_module_get(fmt->module))
                        continue;
                /* binfmt_lock is dropped while the handler runs. */
                read_unlock(&binfmt_lock);

                retval = fmt->load_binary(bprm);

                read_lock(&binfmt_lock);
                put_binfmt(fmt);
                if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
                        read_unlock(&binfmt_lock);
                        return retval;
                }
        }
        read_unlock(&binfmt_lock);

        if (need_retry) {
                /* No module request if the first four bytes are printable. */
                if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
                    printable(bprm->buf[2]) && printable(bprm->buf[3]))
                        return retval;
                /* Module name is built from the 16-bit value at buf + 2. */
                if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
                        return retval;
                /* Walk the format list once more, and only once. */
                need_retry = false;
                goto retry;
        }

        return retval;
}

/*
 * Run the binary format handlers on @bprm, following interpreter
 * hand-offs (a handler sets bprm->interpreter to ask for another pass on
 * that file), then emit the audit/trace/ptrace/connector notifications.
 *
 * Returns 0 on success, a negative error from search_binary_handler(),
 * -ELOOP if the hand-off chain is too long, or -ENOEXEC on a second
 * hand-off while have_execfd is set.
 */
static int exec_binprm(struct linux_binprm *bprm)
{
        pid_t old_pid, old_vpid;
        int ret, depth;

        /* Need to fetch pid before load_binary changes it */
        old_pid = current->pid;
        rcu_read_lock();
        old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
        rcu_read_unlock();

        /*
         * This allows 4 levels of binfmt rewrites before failing hard.
         * NOTE(review): the check below lets passes 0..5 run and fails at
         * depth 6, which does not match the "4 levels" above - confirm
         * which of the two is intended.
         */
        for (depth = 0;; depth++) {
                struct file *exec;
                if (depth > 5)
                        return -ELOOP;

                ret = search_binary_handler(bprm);
                if (ret < 0)
                        return ret;
                if (!bprm->interpreter)
                        break;

                /* Continue with the interpreter in place of the old file. */
                exec = bprm->file;
                bprm->file = bprm->interpreter;
                bprm->interpreter = NULL;

                allow_write_access(exec);
                if (unlikely(bprm->have_execfd)) {
                        /* Only one replaced file can be kept as executable. */
                        if (bprm->executable) {
                                fput(exec);
                                return -ENOEXEC;
                        }
                        bprm->executable = exec;
                } else
                        fput(exec);
        }

        audit_bprm(bprm);
        trace_sched_process_exec(current, old_pid, bprm);
        ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
        proc_exec_connector(current);
        return 0;
}

/*
 * sys_execve() executes a new program.
 *
 * Opens the file named by @fd/@filename/@flags and runs exec_binprm() on
 * the prepared @bprm. Returns the exec_binprm() result on success, or a
 * negative error. On failure the displaced files_struct (if any) is put
 * back; past the point of no return the task is additionally sent
 * SIGSEGV unless a fatal signal is already pending.
 */
static int bprm_execve(struct linux_binprm *bprm,
                       int fd, struct filename *filename, int flags)
{
        struct file *file;
        struct files_struct *displaced;
        int retval;

        /*
         * Cancel any io_uring activity across execve
         */
        io_uring_task_cancel();

        retval = unshare_files(&displaced);
        if (retval)
                return retval;

        /* Takes cred_guard_mutex on success. */
        retval = prepare_bprm_creds(bprm);
        if (retval)
                goto out_files;

        check_unsafe_exec(bprm);
        current->in_execve = 1;

        file = do_open_execat(fd, filename, flags);
        retval = PTR_ERR(file);
        if (IS_ERR(file))
                goto out_unmark;

        sched_exec();

        bprm->file = file;
        /*
         * Record that a name derived from an O_CLOEXEC fd will be
         * inaccessible after exec. Relies on having exclusive access to
         * current->files (due to unshare_files above).
         */
        if (bprm->fdpath &&
            close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
                bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;

        /* Set the unchanging part of bprm->cred */
        retval = security_bprm_creds_for_exec(bprm);
        if (retval)
                goto out;

        retval = exec_binprm(bprm);
        if (retval < 0)
                goto out;

        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
        rseq_execve(current);
        acct_update_integrals(current);
        task_numa_free(current, false);
        if (displaced)
                put_files_struct(displaced);
        return retval;

out:
        /*
         * If past the point of no return ensure the code never
         * returns to the userspace process.  Use an existing fatal
         * signal if present otherwise terminate the process with
         * SIGSEGV.
         */
        if (bprm->point_of_no_return && !fatal_signal_pending(current))
                force_sigsegv(SIGSEGV);

out_unmark:
        /* Undo the markers set by check_unsafe_exec() and above. */
        current->fs->in_exec = 0;
        current->in_execve = 0;

out_files:
        if (displaced)
                reset_files_struct(displaced);

        return retval;
}

/*
 * Common implementation behind the native and compat execve()/execveat()
 * helpers in this file.
 *
 * @fd, @filename and @flags are handed to alloc_bprm() and bprm_execve();
 * @argv and @envp describe the caller's string vectors.
 *
 * Strings are copied into @bprm in a fixed order: the filename first,
 * then the environment, then the arguments.  If the caller supplied no
 * arguments, one empty string is added and argc becomes 1.
 *
 * Returns the result of bprm_execve(), or a negative errno from any
 * earlier step.  An error-pointer @filename is converted to its errno
 * and returned immediately; on every other path @filename is released
 * with putname() before returning.
 */
static int do_execveat_common(int fd, struct filename *filename,
                              struct user_arg_ptr argv,
                              struct user_arg_ptr envp,
                              int flags)
{
        struct linux_binprm *bprm;
        int retval;

        if (IS_ERR(filename))
                return PTR_ERR(filename);

        /*
         * We move the actual failure in case of RLIMIT_NPROC excess from
         * set*uid() to execve() because too many poorly written programs
         * don't check setuid() return code.  Here we additionally recheck
         * whether NPROC limit is still exceeded.
         */
        if ((current->flags & PF_NPROC_EXCEEDED) &&
            atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
                retval = -EAGAIN;
                goto out_ret;
        }

        /* We're below the limit (still or again), so we don't want to make
         * further execve() calls fail. */
        current->flags &= ~PF_NPROC_EXCEEDED;

        bprm = alloc_bprm(fd, filename);
        if (IS_ERR(bprm)) {
                retval = PTR_ERR(bprm);
                goto out_ret;
        }

        /* An empty argv is only warned about here; it is patched up below. */
        retval = count(argv, MAX_ARG_STRINGS);
        if (retval == 0)
                pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
                             current->comm, bprm->filename);
        if (retval < 0)
                goto out_free;
        bprm->argc = retval;

        retval = count(envp, MAX_ARG_STRINGS);
        if (retval < 0)
                goto out_free;
        bprm->envc = retval;

        retval = bprm_stack_limits(bprm);
        if (retval < 0)
                goto out_free;

        /* Copy order from here on: filename, then envp, then argv. */
        retval = copy_string_kernel(bprm->filename, bprm);
        if (retval < 0)
                goto out_free;
        bprm->exec = bprm->p;

        retval = copy_strings(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out_free;

        retval = copy_strings(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out_free;

        /*
         * When argv is empty, add an empty string ("") as argv[0] to
         * ensure confused userspace programs that start processing
         * from argv[1] won't end up walking envp. See also
         * bprm_stack_limits().
         */
        if (bprm->argc == 0) {
                retval = copy_string_kernel("", bprm);
                if (retval < 0)
                        goto out_free;
                bprm->argc = 1;
        }

        retval = bprm_execve(bprm, fd, filename, flags);
        /* Success falls through: bprm is freed on both outcomes. */
out_free:
        free_bprm(bprm);

out_ret:
        putname(filename);
        return retval;
}

/*
 * Execute @kernel_filename using kernel-space @argv and @envp vectors.
 *
 * Follows the same sequence as do_execveat_common() but uses the
 * *_kernel string helpers, always passes AT_FDCWD as the directory fd
 * and passes 0 for the flags.  Unlike that path, an empty @argv is
 * rejected: it raises a one-time warning and fails with -EINVAL.
 *
 * Returns the result of bprm_execve(), or a negative errno from any
 * earlier step.
 */
int kernel_execve(const char *kernel_filename,
                  const char *const *argv, const char *const *envp)
{
        struct filename *filename;
        struct linux_binprm *bprm;
        int fd = AT_FDCWD;
        int retval;

        filename = getname_kernel(kernel_filename);
        if (IS_ERR(filename))
                return PTR_ERR(filename);

        bprm = alloc_bprm(fd, filename);
        if (IS_ERR(bprm)) {
                retval = PTR_ERR(bprm);
                goto out_ret;
        }

        /* Zero arguments is treated as a caller bug, not patched up. */
        retval = count_strings_kernel(argv);
        if (WARN_ON_ONCE(retval == 0))
                retval = -EINVAL;
        if (retval < 0)
                goto out_free;
        bprm->argc = retval;

        retval = count_strings_kernel(envp);
        if (retval < 0)
                goto out_free;
        bprm->envc = retval;

        retval = bprm_stack_limits(bprm);
        if (retval < 0)
                goto out_free;

        /* Copy order from here on: filename, then envp, then argv. */
        retval = copy_string_kernel(bprm->filename, bprm);
        if (retval < 0)
                goto out_free;
        bprm->exec = bprm->p;

        retval = copy_strings_kernel(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out_free;

        retval = copy_strings_kernel(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out_free;

        retval = bprm_execve(bprm, fd, filename, 0);
        /* Success falls through: bprm is freed on both outcomes. */
out_free:
        free_bprm(bprm);
out_ret:
        putname(filename);
        return retval;
}

/*
 * Native execve() helper: wrap the user vectors as native user_arg_ptr
 * descriptors and run the common path with AT_FDCWD and no flags.
 */
static int do_execve(struct filename *filename,
        const char __user *const __user *__argv,
        const char __user *const __user *__envp)
{
        struct user_arg_ptr env = { .ptr.native = __envp };
        struct user_arg_ptr args = { .ptr.native = __argv };

        return do_execveat_common(AT_FDCWD, filename, args, env, 0);
}

/*
 * Native execveat() helper: wrap the user vectors as native user_arg_ptr
 * descriptors and forward @fd and @flags to the common path unchanged.
 */
static int do_execveat(int fd, struct filename *filename,
                const char __user *const __user *__argv,
                const char __user *const __user *__envp,
                int flags)
{
        struct user_arg_ptr env = { .ptr.native = __envp };
        struct user_arg_ptr args = { .ptr.native = __argv };

        return do_execveat_common(fd, filename, args, env, flags);
}

#ifdef CONFIG_COMPAT
/*
 * Compat execve() helper: mark both vectors as compat pointers and run
 * the common path with AT_FDCWD and no flags.
 */
static int compat_do_execve(struct filename *filename,
        const compat_uptr_t __user *__argv,
        const compat_uptr_t __user *__envp)
{
        struct user_arg_ptr env = { .is_compat = true, .ptr.compat = __envp };
        struct user_arg_ptr args = { .is_compat = true, .ptr.compat = __argv };

        return do_execveat_common(AT_FDCWD, filename, args, env, 0);
}

/*
 * Compat execveat() helper: mark both vectors as compat pointers and
 * forward @fd and @flags to the common path unchanged.
 */
static int compat_do_execveat(int fd, struct filename *filename,
                              const compat_uptr_t __user *__argv,
                              const compat_uptr_t __user *__envp,
                              int flags)
{
        struct user_arg_ptr env = { .is_compat = true, .ptr.compat = __envp };
        struct user_arg_ptr args = { .is_compat = true, .ptr.compat = __argv };

        return do_execveat_common(fd, filename, args, env, flags);
}
#endif

/*
 * Switch current->mm to binary format @new (which may be NULL), dropping
 * the module reference of the previous format and taking one on the new.
 */
void set_binfmt(struct linux_binfmt *new)
{
        struct mm_struct *mm = current->mm;
        struct linux_binfmt *old = mm->binfmt;

        if (old)
                module_put(old->module);

        mm->binfmt = new;
        if (!new)
                return;

        __module_get(new->module);
}
EXPORT_SYMBOL(set_binfmt);

/*
 * set_dumpable stores a three-value SUID_DUMP_* setting into mm->flags.
 * Values above SUID_DUMP_ROOT trigger a warning and are ignored.
 */
void set_dumpable(struct mm_struct *mm, int value)
{
        if (!WARN_ON((unsigned)value > SUID_DUMP_ROOT))
                set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}

/* execve(2): look up the user-supplied path, then hand off to do_execve(). */
SYSCALL_DEFINE3(execve,
                const char __user *, filename,
                const char __user *const __user *, argv,
                const char __user *const __user *, envp)
{
        struct filename *name = getname(filename);

        return do_execve(name, argv, envp);
}

/*
 * execveat(2): AT_EMPTY_PATH in @flags selects LOOKUP_EMPTY for the name
 * lookup; the rest of the work is done by do_execveat().
 */
SYSCALL_DEFINE5(execveat,
                int, fd, const char __user *, filename,
                const char __user *const __user *, argv,
                const char __user *const __user *, envp,
                int, flags)
{
        struct filename *name;
        int lookup_flags = 0;

        if (flags & AT_EMPTY_PATH)
                lookup_flags = LOOKUP_EMPTY;

        name = getname_flags(filename, lookup_flags, NULL);
        return do_execveat(fd, name, argv, envp, flags);
}

#ifdef CONFIG_COMPAT
/* Compat execve(2): look up the path, then hand off to compat_do_execve(). */
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
        const compat_uptr_t __user *, argv,
        const compat_uptr_t __user *, envp)
{
        struct filename *name = getname(filename);

        return compat_do_execve(name, argv, envp);
}

/*
 * Compat execveat(2): AT_EMPTY_PATH in @flags selects LOOKUP_EMPTY for the
 * name lookup; the rest of the work is done by compat_do_execveat().
 */
COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
                       const char __user *, filename,
                       const compat_uptr_t __user *, argv,
                       const compat_uptr_t __user *, envp,
                       int,  flags)
{
        struct filename *name;
        int lookup_flags = 0;

        if (flags & AT_EMPTY_PATH)
                lookup_flags = LOOKUP_EMPTY;

        name = getname_flags(filename, lookup_flags, NULL);
        return compat_do_execveat(fd, name, argv, envp, flags);
}
#endif


























































    1 




    1 








































































































    1 



















    1 

    1 







    1 




    1 


    1 


























    1 










    1 
























    1 



























































    1 

















    1 












    1 





    1 




    1 


    1 






    1 
    1 






    1 





    1 










































































































































































    1 

    1 
    1 
    1 





































































    1 




    1 







    1 

    1 

















    1 
    1 

    1 








































































    1 


    1 




    1 



















    1 







    1 
    1 







    1 





    1 
    1 

    1 



    1 










    1 


    1 

    1 


    1 



















































































































































































































































































































































































































































































































































































































































    1 




    1 











    1 


































































    1 








    1 


    1 















































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
// SPDX-License-Identifier: GPL-2.0+
/*
 * XArray implementation
 * Copyright (c) 2017-2018 Microsoft Corporation
 * Copyright (c) 2018-2020 Oracle
 * Author: Matthew Wilcox <willy@infradead.org>
 */

#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/xarray.h>

/*
 * Coding conventions in this file:
 *
 * @xa is used to refer to the entire xarray.
 * @xas is the 'xarray operation state'.  It may be either a pointer to
 * an xa_state, or an xa_state stored on the stack.  This is an unfortunate
 * ambiguity.
 * @index is the index of the entry being operated on
 * @mark is an xa_mark_t; a small number indicating one of the mark bits.
 * @node refers to an xa_node; usually the primary one being operated on by
 * this function.
 * @offset is the index into the slots array inside an xa_node.
 * @parent refers to the @xa_node closer to the head than @node.
 * @entry refers to something stored in a slot in the xarray
 */

static inline unsigned int xa_lock_type(const struct xarray *xa)
{
        return (__force unsigned int)xa->xa_flags & 3;
}

/* Take the array lock using the locking flavour named by @lock_type. */
static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
{
        switch (lock_type) {
        case XA_LOCK_IRQ:
                xas_lock_irq(xas);
                break;
        case XA_LOCK_BH:
                xas_lock_bh(xas);
                break;
        default:
                xas_lock(xas);
                break;
        }
}

/* Release the array lock using the locking flavour named by @lock_type. */
static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
{
        switch (lock_type) {
        case XA_LOCK_IRQ:
                xas_unlock_irq(xas);
                break;
        case XA_LOCK_BH:
                xas_unlock_bh(xas);
                break;
        default:
                xas_unlock(xas);
                break;
        }
}

/* True if the array was set up with XA_FLAGS_TRACK_FREE. */
static inline bool xa_track_free(const struct xarray *xa)
{
        return !!(xa->xa_flags & XA_FLAGS_TRACK_FREE);
}

/* True if the array was set up with XA_FLAGS_ZERO_BUSY. */
static inline bool xa_zero_busy(const struct xarray *xa)
{
        return !!(xa->xa_flags & XA_FLAGS_ZERO_BUSY);
}

/* Set the array-level flag for @mark; xa_flags is not written if already set. */
static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
{
        if (xa->xa_flags & XA_FLAGS_MARK(mark))
                return;

        xa->xa_flags |= XA_FLAGS_MARK(mark);
}

/* Clear the array-level flag for @mark; xa_flags is not written if already clear. */
static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
{
        if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
                return;

        xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
}

/* Return the bitmap in @node that records @mark for each slot. */
static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
{
        unsigned int which = (__force unsigned)mark;

        return node->marks[which];
}

/* Test whether slot @offset of @node carries @mark. */
static inline bool node_get_mark(struct xa_node *node,
                unsigned int offset, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return test_bit(offset, bits);
}

/* Set @mark on slot @offset; returns true if the bit was already set. */
static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
                                xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return __test_and_set_bit(offset, bits);
}

/* Clear @mark on slot @offset; returns true if the bit was set beforehand. */
static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
                                xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return __test_and_clear_bit(offset, bits);
}

/* True if at least one slot in @node carries @mark. */
static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark)
{
        if (bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE))
                return false;

        return true;
}

/* Set @mark on every slot of @node. */
static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        bitmap_fill(bits, XA_CHUNK_SIZE);
}

/*
 * Step @mark to the next mark value.  @mark is assigned to as well as
 * read, so it must be a modifiable lvalue; the __force casts convert
 * between xa_mark_t and plain unsigned around the addition.
 */
#define mark_inc(mark) do { \
        mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \
} while (0)

/*
 * xas_squash_marks() - Merge all marks to the first entry
 * @xas: Array operation state.
 *
 * Set a mark on the first entry if any entry has it set.  Clear marks on
 * all sibling entries.
 */
static void xas_squash_marks(const struct xa_state *xas)
{
        unsigned int mark = 0;
        unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;

        if (!xas->xa_sibs)
                return;

        do {
                unsigned long *marks = xas->xa_node->marks[mark];
                if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
                        continue;
                __set_bit(xas->xa_offset, marks);
                bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
        } while (mark++ != (__force unsigned)XA_MARK_MAX);
}

/* Pick out of @index the slot number that applies at @node's level. */
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
        unsigned long shifted = index >> node->shift;

        return shifted & XA_CHUNK_MASK;
}

/* Recompute xa_offset from the current xa_index and xa_node. */
static void xas_set_offset(struct xa_state *xas)
{
        unsigned int slot = get_offset(xas->xa_index, xas->xa_node);

        xas->xa_offset = slot;
}

/*
 * Move the index either forwards (find) or backwards (sibling slot):
 * the bits of xa_index at and below the current node's level are
 * dropped and replaced by @offset shifted to that level.
 */
static void xas_move_index(struct xa_state *xas, unsigned long offset)
{
        unsigned int shift = xas->xa_node->shift;
        unsigned long keep = ~XA_CHUNK_MASK << shift;

        xas->xa_index = (xas->xa_index & keep) + (offset << shift);
}

/* Step to the next slot in the current node and update xa_index to match. */
static void xas_advance(struct xa_state *xas)
{
        xas_move_index(xas, ++xas->xa_offset);
}

/*
 * Put @xas into the XAS_BOUNDS state.  Always returns NULL so that
 * callers can write "return set_bounds(xas);".
 */
static void *set_bounds(struct xa_state *xas)
{
        xas->xa_node = XAS_BOUNDS;
        return NULL;
}

/*
 * Starts a walk.  If the @xas is already valid, we assume that it's on
 * the right path and just return where we've got to.  If we're in an
 * error state, return NULL.  If the index is outside the current scope
 * of the xarray, return NULL without changing @xas->xa_node.  Otherwise
 * set @xas->xa_node to NULL and return the current head of the array.
 */
static void *xas_start(struct xa_state *xas)
{
        void *entry;

        if (xas_valid(xas))
                return xas_reload(xas);
        if (xas_error(xas))
                return NULL;

        entry = xa_head(xas->xa);
        if (!xa_is_node(entry)) {
                if (xas->xa_index)
                        return set_bounds(xas);
        } else {
                if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK)
                        return set_bounds(xas);
        }

        xas->xa_node = NULL;
        return entry;
}

/*
 * Step @xas down into @node: record the node, pick the slot for xa_index
 * (following a sibling entry back to the slot it refers to) and return
 * the entry found there.
 */
static void *xas_descend(struct xa_state *xas, struct xa_node *node)
{
        unsigned int slot = get_offset(xas->xa_index, node);
        void *found = xa_entry(xas->xa, node, slot);

        xas->xa_node = node;
        if (xa_is_sibling(found)) {
                slot = xa_to_sibling(found);
                found = xa_entry(xas->xa, node, slot);
        }
        xas->xa_offset = slot;

        return found;
}

/**
 * xas_load() - Load an entry from the XArray (advanced).
 * @xas: XArray operation state.
 *
 * Usually walks the @xas to the appropriate state to load the entry
 * stored at xa_index.  However, it will do nothing and return %NULL if
 * @xas is in an error state.  xas_load() will never expand the tree.
 *
 * If the xa_state is set up to operate on a multi-index entry, xas_load()
 * may return %NULL or an internal entry, even if there are entries
 * present within the range specified by @xas.
 *
 * Context: Any context.  The caller should hold the xa_lock or the RCU lock.
 * Return: Usually an entry in the XArray, but see description for exceptions.
 */
void *xas_load(struct xa_state *xas)
{
        void *entry;

        for (entry = xas_start(xas); xa_is_node(entry); ) {
                struct xa_node *node = xa_to_node(entry);

                /* Stop above nodes that are finer-grained than the requested shift. */
                if (node->shift < xas->xa_shift)
                        break;

                entry = xas_descend(xas, node);

                /* A node with shift 0 is the bottom level: nothing further to walk. */
                if (!node->shift)
                        break;
        }

        return entry;
}
EXPORT_SYMBOL_GPL(xas_load);

/* Move the radix tree node cache here */
/* Slab cache that xa_node allocations in this file are served from. */
extern struct kmem_cache *radix_tree_node_cachep;
/* Node release routine: passed to call_rcu() and also called directly. */
extern void radix_tree_node_rcu_free(struct rcu_head *head);

/* Value stored in node->array by xa_node_free() before the RCU callback is queued. */
#define XA_RCU_FREE        ((struct xarray *)1)

/*
 * Hand @node over for freeing via RCU.  The node's private_list is
 * expected to be empty (checked by XA_NODE_BUG_ON).
 */
static void xa_node_free(struct xa_node *node)
{
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        /* Tag the node with XA_RCU_FREE before queueing the callback. */
        node->array = XA_RCU_FREE;
        call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * xas_destroy() - Free any resources allocated during the XArray operation.
 * @xas: XArray operation state.
 *
 * This function is now internal-only.
 */
static void xas_destroy(struct xa_state *xas)
{
        struct xa_node *node, *next;

        /* Preallocated nodes are chained through their parent pointers. */
        for (node = xas->xa_alloc; node; node = next) {
                XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
                next = rcu_dereference_raw(node->parent);
                radix_tree_node_rcu_free(&node->rcu_head);
                xas->xa_alloc = next;
        }
}

/**
 * xas_nomem() - Allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * If we need to add new nodes to the XArray, we try to allocate memory
 * with GFP_NOWAIT while holding the lock, which will usually succeed.
 * If it fails, @xas is flagged as needing memory to continue.  The caller
 * should drop the lock and call xas_nomem().  If xas_nomem() succeeds,
 * the caller should retry the operation.
 *
 * Forward progress is guaranteed as one node is allocated here and
 * stored in the xa_state where it will be found by xas_alloc().  More
 * nodes will likely be found in the slab allocator, but we do not tie
 * them up here.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
bool xas_nomem(struct xa_state *xas, gfp_t gfp)
{
        struct xa_node *node;

        /* Not an out-of-memory state: release any spare node and report "no retry". */
        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        node = kmem_cache_alloc(radix_tree_node_cachep, gfp);
        xas->xa_alloc = node;
        if (!node)
                return false;

        node->parent = NULL;
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        xas->xa_node = XAS_RESTART;
        return true;
}
EXPORT_SYMBOL_GPL(xas_nomem);

/*
 * __xas_nomem() - Drop locks and allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * Internal variant of xas_nomem().
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
        __must_hold(xas->xa->xa_lock)
{
        unsigned int lock_type = xa_lock_type(xas->xa);
        bool may_block;

        /* Not an out-of-memory state: release any spare node and report "no retry". */
        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        /* The lock is dropped around the allocation only if @gfp allows blocking. */
        may_block = gfpflags_allow_blocking(gfp);
        if (may_block)
                xas_unlock_type(xas, lock_type);
        xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
        if (may_block)
                xas_lock_type(xas, lock_type);

        if (!xas->xa_alloc)
                return false;

        xas->xa_alloc->parent = NULL;
        XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
        xas->xa_node = XAS_RESTART;
        return true;
}

/*
 * Tell the user of @xas that @node has changed by invoking the xa_update
 * callback, if one is set.  Without a callback, check that the node is not
 * on anybody's private list.
 */
static void xas_update(struct xa_state *xas, struct xa_node *node)
{
        if (!xas->xa_update) {
                XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
                return;
        }

        xas->xa_update(node);
}

/*
 * Obtain and initialise a node with the given @shift as a child of
 * @xas->xa_node (which may be NULL for a new top-level node).
 *
 * A node preallocated into @xas->xa_alloc is used if there is one;
 * otherwise a non-blocking allocation is attempted.  On allocation failure
 * @xas is put into the -ENOMEM error state and NULL is returned.  NULL is
 * also returned, without allocating, if @xas is already invalid.
 */
static void *xas_alloc(struct xa_state *xas, unsigned int shift)
{
        struct xa_node *parent = xas->xa_node;
        struct xa_node *new_node = xas->xa_alloc;

        if (xas_invalid(xas))
                return NULL;

        if (!new_node) {
                /* Nothing preallocated: try without sleeping or warning. */
                gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;

                if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                        gfp |= __GFP_ACCOUNT;

                new_node = kmem_cache_alloc(radix_tree_node_cachep, gfp);
                if (!new_node) {
                        xas_set_err(xas, -ENOMEM);
                        return NULL;
                }
        } else {
                /* Consume the node stashed in the xa_state. */
                xas->xa_alloc = NULL;
        }

        if (parent) {
                /* The new node occupies one more slot of its parent. */
                new_node->offset = xas->xa_offset;
                parent->count++;
                XA_NODE_BUG_ON(new_node, parent->count > XA_CHUNK_SIZE);
                xas_update(xas, parent);
        }

        XA_NODE_BUG_ON(new_node, shift > BITS_PER_LONG);
        XA_NODE_BUG_ON(new_node, !list_empty(&new_node->private_list));
        new_node->shift = shift;
        new_node->count = 0;
        new_node->nr_values = 0;
        RCU_INIT_POINTER(new_node->parent, xas->xa_node);
        new_node->array = xas->xa;

        return new_node;
}

#ifdef CONFIG_XARRAY_MULTI
/*
 * Number of indices covered by the entry described by @xas: one slot plus
 * its siblings, each slot spanning 2^xa_shift indices.
 */
static unsigned long xas_size(const struct xa_state *xas)
{
        unsigned long nr_slots = xas->xa_sibs + 1UL;

        return nr_slots << xas->xa_shift;
}
#endif

/*
 * Compute the largest index the tree must be able to hold in order to add
 * the entry described by @xas.  For a single-index entry that is simply
 * xa_index.  For a multi-index entry it is the last index of the range,
 * except that a range starting at index 0 asks for one index more, because
 * a multi-index entry cannot be stored at index 0.
 */
static unsigned long xas_max(struct xa_state *xas)
{
        unsigned long last = xas->xa_index;

#ifdef CONFIG_XARRAY_MULTI
        if (xas->xa_shift != 0 || xas->xa_sibs != 0) {
                unsigned long low_bits = xas_size(xas) - 1;

                /* Round up to the final index covered by the entry. */
                last |= low_bits;
                /* The range begins at 0: require one more index. */
                if (last == low_bits)
                        last++;
        }
#endif

        return last;
}

/* The maximum index that can be contained in the array without expanding it */
static unsigned long max_index(void *entry)
{
        if (!xa_is_node(entry))
                return 0;
        return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1;
}

/*
 * xas_shrink() - Reduce the height of the tree where possible.
 * @xas: XArray operation state; @xas->xa_node is the node to start from.
 *
 * For as long as the node being examined holds exactly one entry and that
 * entry sits in slot 0, the entry is installed as the new head of the array
 * and the node is freed.  If the promoted entry is itself a node, the loop
 * continues with it.  Whenever a node is removed, @xas is moved to the
 * XAS_BOUNDS state.
 */
static void xas_shrink(struct xa_state *xas)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = xas->xa_node;

        for (;;) {
                void *entry;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                /* Only a node with a single entry can be collapsed ... */
                if (node->count != 1)
                        break;
                /* ... and that entry has to be the one in slot 0. */
                entry = xa_entry_locked(xa, node, 0);
                if (!entry)
                        break;
                /*
                 * A non-node entry in a node with a non-zero shift is
                 * left where it is; it cannot be moved to the head.
                 */
                if (!xa_is_node(entry) && node->shift)
                        break;
                /* A zero entry is not kept as the head when zero is busy. */
                if (xa_is_zero(entry) && xa_zero_busy(xa))
                        entry = NULL;
                xas->xa_node = XAS_BOUNDS;

                RCU_INIT_POINTER(xa->xa_head, entry);
                /* Carry slot 0's "free" state over to the array-wide mark. */
                if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK))
                        xa_mark_clear(xa, XA_FREE_MARK);

                node->count = 0;
                node->nr_values = 0;
                /*
                 * Leave a retry entry in the old slot when the promoted
                 * entry is not a node (presumably for lockless walkers
                 * still looking at this node - confirm against the RCU
                 * read side).
                 */
                if (!xa_is_node(entry))
                        RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY);
                xas_update(xas, node);
                xa_node_free(node);
                if (!xa_is_node(entry))
                        break;
                /* The promoted node is now the top of the tree. */
                node = xa_to_node(entry);
                node->parent = NULL;
        }
}

/*
 * xas_delete_node() - Attempt to delete an xa_node
 * @xas: Array operation state.
 *
 * Attempts to delete the @xas->xa_node.  This does nothing if the node
 * still has a non-zero entry count.  Otherwise the node is freed and its
 * slot in the parent is cleared; the parent is then examined in turn, so
 * a chain of nodes that have become empty is removed.  If the top node is
 * removed, the array head is set to NULL and @xas is moved to XAS_BOUNDS.
 * If the walk stops at a node without a parent, xas_shrink() is called to
 * try to reduce the height of the tree.
 */
static void xas_delete_node(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        for (;;) {
                struct xa_node *parent;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                /* Stop at the first node which is still in use. */
                if (node->count)
                        break;

                /* Point @xas at the slot which referred to this node. */
                parent = xa_parent_locked(xas->xa, node);
                xas->xa_node = parent;
                xas->xa_offset = node->offset;
                xa_node_free(node);

                if (!parent) {
                        /* That was the last node; the array is now empty. */
                        xas->xa->xa_head = NULL;
                        xas->xa_node = XAS_BOUNDS;
                        return;
                }

                parent->slots[xas->xa_offset] = NULL;
                parent->count--;
                XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE);
                node = parent;
                xas_update(xas, node);
        }

        /* The surviving node is the top of the tree; try to shrink it. */
        if (!node->parent)
                xas_shrink(xas);
}

/**
 * xas_free_nodes() - Free this node and all nodes that it references
 * @xas: Array operation state.
 * @top: Node to free
 *
 * This node has been removed from the tree.  We must now free it and all
 * of its subnodes.  There may be RCU walkers with references into the tree,
 * so we must replace all entries with retry markers.
 *
 * The subtree is walked iteratively, depth first: child nodes are entered
 * as they are found, and a node is freed once its last slot has been
 * visited.  The function returns after @top itself has been freed.
 */
static void xas_free_nodes(struct xa_state *xas, struct xa_node *top)
{
        unsigned int offset = 0;
        struct xa_node *node = top;

        for (;;) {
                void *entry = xa_entry_locked(xas->xa, node, offset);

                /* Descend into a child node before handling this slot. */
                if (node->shift && xa_is_node(entry)) {
                        node = xa_to_node(entry);
                        offset = 0;
                        continue;
                }
                /* Overwrite every non-NULL entry with a retry marker. */
                if (entry)
                        RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
                offset++;
                /*
                 * All slots of this node have been visited: free it and
                 * resume in the parent at the slot after this node's own.
                 * This repeats while that also finishes the parent.
                 */
                while (offset == XA_CHUNK_SIZE) {
                        struct xa_node *parent;

                        parent = xa_parent_locked(xas->xa, node);
                        offset = node->offset + 1;
                        node->count = 0;
                        node->nr_values = 0;
                        xas_update(xas, node);
                        xa_node_free(node);
                        if (node == top)
                                return;
                        node = parent;
                }
        }
}

/*
 * xas_expand adds nodes to the head of the tree until it has reached
 * sufficient height to be able to contain @xas->xa_index
 *
 * @head is the current head entry of the array.  If it is NULL, no nodes
 * are added; the function only reports the shift a tree able to hold the
 * required index would need (0 if only index 0 is required).  Otherwise,
 * new top-level nodes are allocated with xas_alloc() and the old head is
 * placed in slot 0 of each new node, until the required maximum index (see
 * xas_max()) fits.
 *
 * Return: a negative errno (-ENOMEM) if a node could not be allocated,
 * otherwise the shift of the top node plus XA_CHUNK_SHIFT (0 if the head
 * is not a node and no expansion was needed).  On success with a non-NULL
 * @head, @xas->xa_node is the last node added, or the existing head node
 * if none was added, or NULL if the head is not a node.
 */
static int xas_expand(struct xa_state *xas, void *head)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = NULL;
        unsigned int shift = 0;
        unsigned long max = xas_max(xas);

        if (!head) {
                /* Empty array: only work out how tall the tree must be. */
                if (max == 0)
                        return 0;
                while ((max >> shift) >= XA_CHUNK_SIZE)
                        shift += XA_CHUNK_SHIFT;
                return shift + XA_CHUNK_SHIFT;
        } else if (xa_is_node(head)) {
                node = xa_to_node(head);
                shift = node->shift + XA_CHUNK_SHIFT;
        }
        /* New nodes are allocated at the top, so they have no parent. */
        xas->xa_node = NULL;

        while (max > max_index(head)) {
                xa_mark_t mark = 0;

                XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
                node = xas_alloc(xas, shift);
                if (!node)
                        return -ENOMEM;

                /* The old head becomes the only entry, in slot 0. */
                node->count = 1;
                if (xa_is_value(head))
                        node->nr_values = 1;
                RCU_INIT_POINTER(node->slots[0], head);

                /* Propagate the aggregated mark info to the new child */
                for (;;) {
                        if (xa_track_free(xa) && mark == XA_FREE_MARK) {
                                /*
                                 * Every slot of the new node is free except
                                 * slot 0, which is free only if the array
                                 * was marked as having free entries.
                                 */
                                node_mark_all(node, XA_FREE_MARK);
                                if (!xa_marked(xa, XA_FREE_MARK)) {
                                        node_clear_mark(node, 0, XA_FREE_MARK);
                                        xa_mark_set(xa, XA_FREE_MARK);
                                }
                        } else if (xa_marked(xa, mark)) {
                                node_set_mark(node, 0, mark);
                        }
                        if (mark == XA_MARK_MAX)
                                break;
                        mark_inc(mark);
                }

                /*
                 * Now that the new node is fully initialised, we can add
                 * it to the tree
                 */
                if (xa_is_node(head)) {
                        xa_to_node(head)->offset = 0;
                        rcu_assign_pointer(xa_to_node(head)->parent, node);
                }
                head = xa_mk_node(node);
                rcu_assign_pointer(xa->xa_head, head);
                xas_update(xas, node);

                shift += XA_CHUNK_SHIFT;
        }

        xas->xa_node = node;
        return shift;
}

/*
 * xas_create() - Create a slot to store an entry in.
 * @xas: XArray operation state.
 * @allow_root: %true if we can store the entry in the root directly
 *
 * Most users will not need to call this function directly, as it is called
 * by xas_store().  It is useful for doing conditional store operations
 * (see the xa_cmpxchg() implementation for an example).
 *
 * If @xas has not been walked yet, the tree is first expanded with
 * xas_expand() so that it can hold the index.  The function then descends
 * towards the index, allocating any missing intermediate nodes, until it
 * reaches the level given by @xas->xa_shift or meets an entry which is
 * neither NULL nor a node.
 *
 * Return: If the slot already existed, returns the contents of this slot.
 * If the slot was newly created, returns %NULL.  If it failed to create the
 * slot, returns %NULL and indicates the error in @xas.
 */
static void *xas_create(struct xa_state *xas, bool allow_root)
{
        struct xarray *xa = xas->xa;
        void *entry;
        void __rcu **slot;
        struct xa_node *node = xas->xa_node;
        int shift;
        unsigned int order = xas->xa_shift;

        if (xas_top(node)) {
                /* Not walked yet: start at the head, growing the tree. */
                entry = xa_head_locked(xa);
                xas->xa_node = NULL;
                if (!entry && xa_zero_busy(xa))
                        entry = XA_ZERO_ENTRY;
                shift = xas_expand(xas, entry);
                if (shift < 0)
                        return NULL;
                /* Force a node to be created if the root may not be used. */
                if (!shift && !allow_root)
                        shift = XA_CHUNK_SHIFT;
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        } else if (xas_error(xas)) {
                return NULL;
        } else if (node) {
                /* Already positioned in a node: continue from its slot. */
                unsigned int offset = xas->xa_offset;

                shift = node->shift;
                entry = xa_entry_locked(xa, node, offset);
                slot = &node->slots[offset];
        } else {
                /* Positioned at the head entry itself. */
                shift = 0;
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        }

        while (shift > order) {
                shift -= XA_CHUNK_SHIFT;
                if (!entry) {
                        /* Nothing here yet: insert a new, empty node. */
                        node = xas_alloc(xas, shift);
                        if (!node)
                                break;
                        if (xa_track_free(xa))
                                node_mark_all(node, XA_FREE_MARK);
                        rcu_assign_pointer(*slot, xa_mk_node(node));
                } else if (xa_is_node(entry)) {
                        node = xa_to_node(entry);
                } else {
                        /* Met an entry above the requested level. */
                        break;
                }
                entry = xas_descend(xas, node);
                slot = &node->slots[xas->xa_offset];
        }

        return entry;
}

/**
 * xas_create_range() - Ensure that stores to this range will succeed
 * @xas: XArray operation state.
 *
 * Creates all of the slots in the range covered by @xas.  Sets @xas to
 * create single-index entries and positions it at the beginning of the
 * range.  This is for the benefit of users which have not yet been
 * converted to use multi-index entries.
 *
 * The range is processed from its last index downwards, one chunk of
 * XA_CHUNK_SIZE indices at a time.  If an error occurs, the original
 * xa_shift, xa_sibs and xa_index are restored and the error is left in
 * @xas for the caller to check.
 */
void xas_create_range(struct xa_state *xas)
{
        unsigned long index = xas->xa_index;
        unsigned char shift = xas->xa_shift;
        unsigned char sibs = xas->xa_sibs;

        /* Start at the last index of the range, as a single-index state. */
        xas->xa_index |= ((sibs + 1UL) << shift) - 1;
        if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift)
                xas->xa_offset |= sibs;
        xas->xa_shift = 0;
        xas->xa_sibs = 0;

        for (;;) {
                xas_create(xas, true);
                if (xas_error(xas))
                        goto restore;
                /* Done once the chunk containing the first index exists. */
                if (xas->xa_index <= (index | XA_CHUNK_MASK))
                        goto success;
                xas->xa_index -= XA_CHUNK_SIZE;

                /*
                 * Step back to the previous slot, climbing towards the
                 * root while the current node was in slot 0 of its parent,
                 * but no higher than the level of the original range.
                 */
                for (;;) {
                        struct xa_node *node = xas->xa_node;
                        if (node->shift >= shift)
                                break;
                        xas->xa_node = xa_parent_locked(xas->xa, node);
                        xas->xa_offset = node->offset - 1;
                        if (node->offset != 0)
                                break;
                }
        }

restore:
        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
        xas->xa_index = index;
        return;
success:
        xas->xa_index = index;
        if (xas->xa_node)
                xas_set_offset(xas);
}
EXPORT_SYMBOL_GPL(xas_create_range);

/*
 * Apply the deltas @count and @values to the entry and value counters of
 * @node, report the change through xas_update() and, if entries were
 * removed, try to delete the node.  Nothing happens when @node is NULL or
 * both deltas are zero.
 */
static void update_node(struct xa_state *xas, struct xa_node *node,
                int count, int values)
{
        if (!node)
                return;
        if (count == 0 && values == 0)
                return;

        node->count += count;
        node->nr_values += values;
        XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
        XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE);
        xas_update(xas, node);

        /* The node may have become empty. */
        if (count < 0)
                xas_delete_node(xas);
}

/**
 * xas_store() - Store this entry in the XArray.
 * @xas: XArray operation state.
 * @entry: New entry.
 *
 * If @xas is operating on a multi-index entry, the entry returned by this
 * function is essentially meaningless (it may be an internal entry or it
 * may be %NULL, even if there are non-NULL entries at some of the indices
 * covered by the range).  This is not a problem for any current users,
 * and can be changed if needed.
 *
 * Storing a non-NULL @entry creates the slot with xas_create() if needed;
 * storing %NULL only looks the slot up with xas_load().  If @xas is in an
 * invalid state after that step, nothing is stored.
 *
 * Return: The old entry at this index.
 */
void *xas_store(struct xa_state *xas, void *entry)
{
        struct xa_node *node;
        void __rcu **slot = &xas->xa->xa_head;
        unsigned int offset, max;
        int count = 0;
        int values = 0;
        void *first, *next;
        bool value = xa_is_value(entry);

        if (entry) {
                /* Node and zero entries may not be stored in the root. */
                bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry);
                first = xas_create(xas, allow_root);
        } else {
                first = xas_load(xas);
        }

        if (xas_invalid(xas))
                return first;
        node = xas->xa_node;
        /* The walk stopped above the requested level: no sibling slots. */
        if (node && (xas->xa_shift < node->shift))
                xas->xa_sibs = 0;
        /* Storing the same single entry again is a no-op. */
        if ((first == entry) && !xas->xa_sibs)
                return first;

        next = first;
        offset = xas->xa_offset;
        max = xas->xa_offset + xas->xa_sibs;
        if (node) {
                slot = &node->slots[offset];
                if (xas->xa_sibs)
                        xas_squash_marks(xas);
        }
        if (!entry)
                xas_init_marks(xas);

        for (;;) {
                /*
                 * Must clear the marks before setting the entry to NULL,
                 * otherwise xas_for_each_marked may find a NULL entry and
                 * stop early.  rcu_assign_pointer contains a release barrier
                 * so the mark clearing will appear to happen before the
                 * entry is set to NULL.
                 */
                rcu_assign_pointer(*slot, entry);
                /* A whole subtree was overwritten: free all of its nodes. */
                if (xa_is_node(next) && (!node || node->shift))
                        xas_free_nodes(xas, xa_to_node(next));
                /* The entry was stored in the head; there are no counters. */
                if (!node)
                        break;
                /* Track the change in occupied slots and value entries. */
                count += !next - !entry;
                values += !xa_is_value(first) - !value;
                if (entry) {
                        if (offset == max)
                                break;
                        /* The remaining slots get sibling entries. */
                        if (!xa_is_sibling(entry))
                                entry = xa_mk_sibling(xas->xa_offset);
                } else {
                        if (offset == XA_CHUNK_MASK)
                                break;
                }
                next = xa_entry_locked(xas->xa, node, ++offset);
                if (!xa_is_sibling(next)) {
                        /*
                         * When erasing, stop at the first slot beyond the
                         * range which is not a sibling of the old entry.
                         */
                        if (!entry && (offset > max))
                                break;
                        first = next;
                }
                slot++;
        }

        update_node(xas, node, count, values);
        return first;
}
EXPORT_SYMBOL_GPL(xas_store);

/**
 * xas_get_mark() - Returns the state of this mark.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * When @xas refers to the head of the array rather than to a slot in a
 * node, the array-wide mark is reported.
 *
 * Return: true if the mark is set, false if the mark is clear or @xas
 * is in an error state.
 */
bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *node = xas->xa_node;

        if (xas_invalid(xas))
                return false;

        if (node)
                return node_get_mark(node, xas->xa_offset, mark);

        return xa_marked(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_get_mark);

/**
 * xas_set_mark() - Sets the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Sets the specified mark on this entry, and walks up the tree setting it
 * on all the ancestor entries.  The walk ends early at the first slot on
 * which the mark was already set.  Does nothing if @xas has not been walked
 * to an entry, or is in an error state.
 */
void xas_set_mark(const struct xa_state *xas, xa_mark_t mark)
{
        unsigned int offset = xas->xa_offset;
        struct xa_node *node;

        if (xas_invalid(xas))
                return;

        for (node = xas->xa_node; node;
             node = xa_parent_locked(xas->xa, node)) {
                /* Already set here, so it is set on every ancestor too. */
                if (node_set_mark(node, offset, mark))
                        return;
                offset = node->offset;
        }

        /* Reached the top of the tree: record the mark on the array. */
        if (!xa_marked(xas->xa, mark))
                xa_mark_set(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_set_mark);

/**
 * xas_clear_mark() - Clears the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Clears the specified mark on this entry, and walks back to the head
 * attempting to clear it on all the ancestor entries.  The walk ends as
 * soon as a slot did not have the mark set, or a node still has the mark
 * set on another of its slots.  Does nothing if @xas has not been walked
 * to an entry, or is in an error state.
 */
void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark)
{
        unsigned int offset = xas->xa_offset;
        struct xa_node *node;

        if (xas_invalid(xas))
                return;

        for (node = xas->xa_node; node;
             node = xa_parent_locked(xas->xa, node)) {
                /*
                 * Stop if the mark was not set on this slot, or if some
                 * other slot of this node still carries it.
                 */
                if (!node_clear_mark(node, offset, mark) ||
                    node_any_mark(node, mark))
                        return;

                offset = node->offset;
        }

        /* No marked entry is left anywhere below the head. */
        if (xa_marked(xas->xa, mark))
                xa_mark_clear(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_clear_mark);

/**
 * xas_init_marks() - Initialise all marks for the entry
 * @xas: Array operations state.
 *
 * Initialise all marks for the entry specified by @xas.  If we're tracking
 * free entries with a mark, we need to set it on all entries.  All other
 * marks are cleared.
 *
 * This implementation is not as efficient as it could be; we may walk
 * up the tree multiple times.
 */
void xas_init_marks(const struct xa_state *xas)
{
        xa_mark_t mark = 0;

        for (;;) {
                if (xa_track_free(xas->xa) && mark == XA_FREE_MARK)
                        xas_set_mark(xas, mark);
                else
                        xas_clear_mark(xas, mark);
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }
}
EXPORT_SYMBOL_GPL(xas_init_marks);

#ifdef CONFIG_XARRAY_MULTI
/*
 * Collect the marks set on slot @offset of @node into a bitmap, with bit N
 * set when mark N is set.
 */
static unsigned int node_get_marks(struct xa_node *node, unsigned int offset)
{
        unsigned int bitmap = 0;
        xa_mark_t mark = XA_MARK_0;

        for (;;) {
                if (node_get_mark(node, offset, mark))
                        bitmap |= 1 << (__force unsigned int)mark;

                if (mark == XA_MARK_MAX)
                        return bitmap;
                mark_inc(mark);
        }
}

static void node_set_marks(struct xa_node *node, unsigned int offset,
                        struct xa_node *child, unsigned int marks)
{
        xa_mark_t mark = XA_MARK_0;

        for (;;) {
                if (marks & (1 << (__force unsigned int)mark)) {
                        node_set_mark(node, offset, mark);
                        if (child)
                                node_mark_all(child, mark);
                }
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }
}

/**
 * xas_split_alloc() - Allocate memory for splitting an entry.
 * @xas: XArray operation state.
 * @entry: New entry which will be stored in the array.
 * @order: Current order of the entry which is to be split.
 * @gfp: Memory allocation flags.
 *
 * This function should be called before calling xas_split().
 * If necessary, it will allocate new nodes (and fill them with @entry)
 * to prepare for the upcoming split of an entry of @order size into
 * entries of the order stored in the @xas.
 *
 * Only one new level of nodes is ever allocated.  A split which would
 * need two or more new levels (@order >= @xas->xa_shift + 2 *
 * XA_CHUNK_SHIFT) is not supported: it triggers a warning and puts @xas
 * into the -ENOMEM error state.  The same error state is set if a node
 * cannot be allocated; nodes allocated so far are released through
 * xas_destroy().
 *
 * Context: May sleep if @gfp flags permit.
 */
void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
                gfp_t gfp)
{
        unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int mask = xas->xa_sibs;

        /*
         * XXX: no support for splitting really large entries yet.
         * The boundary case order == xa_shift + 2 * XA_CHUNK_SHIFT must be
         * rejected as well: the single level of children allocated below
         * would have a non-zero shift relative to the target order, so the
         * resulting entries would not have the requested order.
         */
        if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT <= order))
                goto nomem;
        /* The split stays within one node: nothing to allocate. */
        if (xas->xa_shift + XA_CHUNK_SHIFT > order)
                return;

        /* One child node for each slot the old entry occupies. */
        do {
                unsigned int i;
                void *sibling = NULL;
                struct xa_node *node;

                node = kmem_cache_alloc(radix_tree_node_cachep, gfp);
                if (!node)
                        goto nomem;
                node->array = xas->xa;
                /*
                 * Pre-fill the node: every (xa_sibs + 1)th slot holds
                 * @entry, the slots in between are its siblings.
                 */
                for (i = 0; i < XA_CHUNK_SIZE; i++) {
                        if ((i & mask) == 0) {
                                RCU_INIT_POINTER(node->slots[i], entry);
                                sibling = xa_mk_sibling(i);
                        } else {
                                RCU_INIT_POINTER(node->slots[i], sibling);
                        }
                }
                /* Chain the nodes through ->parent until xas_split(). */
                RCU_INIT_POINTER(node->parent, xas->xa_alloc);
                xas->xa_alloc = node;
        } while (sibs-- > 0);

        return;
nomem:
        xas_destroy(xas);
        xas_set_err(xas, -ENOMEM);
}
EXPORT_SYMBOL_GPL(xas_split_alloc);

/**
 * xas_split() - Split a multi-index entry into smaller entries.
 * @xas: XArray operation state.
 * @entry: New entry to store in the array.
 * @order: Current order of the entry which is being split.
 *
 * The value in the entry is copied to all the replacement entries.
 * The marks set on the old entry are copied to the replacements as well.
 * If the new entries are of a lower level than the node holding the old
 * entry, the nodes prepared by xas_split_alloc() are taken from
 * @xas->xa_alloc and inserted; otherwise the slots of the existing node
 * are rewritten in place.  Nothing is done if the walk ends at the top of
 * the array rather than in a node.
 *
 * Context: Any context.  The caller should hold the xa_lock.
 */
void xas_split(struct xa_state *xas, void *entry, unsigned int order)
{
        unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int offset, marks;
        struct xa_node *node;
        void *curr = xas_load(xas);
        int values = 0;

        node = xas->xa_node;
        if (xas_top(node))
                return;

        marks = node_get_marks(node, xas->xa_offset);

        /* Work from the last slot of the old entry down to the first. */
        offset = xas->xa_offset + sibs;
        do {
                if (xas->xa_shift < node->shift) {
                        /* Replace this slot with a preallocated child. */
                        struct xa_node *child = xas->xa_alloc;

                        /* Pop the child off the list chained via ->parent. */
                        xas->xa_alloc = rcu_dereference_raw(child->parent);
                        child->shift = node->shift - XA_CHUNK_SHIFT;
                        child->offset = offset;
                        child->count = XA_CHUNK_SIZE;
                        child->nr_values = xa_is_value(entry) ?
                                        XA_CHUNK_SIZE : 0;
                        RCU_INIT_POINTER(child->parent, node);
                        node_set_marks(node, offset, child, marks);
                        rcu_assign_pointer(node->slots[offset],
                                        xa_mk_node(child));
                        /* This slot now holds a node, not a value. */
                        if (xa_is_value(curr))
                                values--;
                        xas_update(xas, child);
                } else {
                        /*
                         * Same level: write @entry at the canonical slot
                         * and siblings in the slots that follow it.
                         */
                        unsigned int canon = offset - xas->xa_sibs;

                        node_set_marks(node, canon, NULL, marks);
                        rcu_assign_pointer(node->slots[canon], entry);
                        while (offset > canon)
                                rcu_assign_pointer(node->slots[offset--],
                                                xa_mk_sibling(canon));
                        values += (xa_is_value(entry) - xa_is_value(curr)) *
                                        (xas->xa_sibs + 1);
                }
        } while (offset-- > xas->xa_offset);

        node->nr_values += values;
        xas_update(xas, node);
}
EXPORT_SYMBOL_GPL(xas_split);
#endif

/**
 * xas_pause() - Pause a walk to drop a lock.
 * @xas: XArray operation state.
 *
 * Some users need to pause a walk and drop the lock they're holding in
 * order to yield to a higher priority thread or carry out an operation
 * on an entry.  Those users should call this function before they drop
 * the lock.  It resets the @xas to be suitable for the next iteration
 * of the loop after the user has reacquired the lock.  If most entries
 * found during a walk require you to call xas_pause(), the xa_for_each()
 * iterator may be more appropriate.
 *
 * After this call @xas is in the restart state and @xas->xa_index is the
 * first index after the entry the walk was paused on.  If that index
 * wraps around to 0, @xas is put in the bounds state instead.  Nothing is
 * changed if @xas is in an invalid state.
 *
 * Note that xas_pause() only works for forward iteration.  If a user needs
 * to pause a reverse iteration, we will need a xas_pause_rev().
 */
void xas_pause(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        if (xas_invalid(xas))
                return;

        xas->xa_node = XAS_RESTART;
        if (node) {
                unsigned long offset = xas->xa_offset;

                /* Skip the sibling slots belonging to the current entry. */
                while (++offset < XA_CHUNK_SIZE) {
                        if (!xa_is_sibling(xa_entry(xas->xa, node, offset)))
                                break;
                }
                /*
                 * xa_index may lie anywhere inside the slot when the node
                 * has a non-zero shift.  Round it down to the start of the
                 * slot first; otherwise the addition below would overshoot
                 * and the walk would skip the first indices of the next
                 * slot.
                 */
                xas->xa_index &= ~0UL << node->shift;
                xas->xa_index += (offset - xas->xa_offset) << node->shift;
                /* Wrapped past the largest index: nothing left to visit. */
                if (xas->xa_index == 0)
                        xas->xa_node = XAS_BOUNDS;
        } else {
                /* The entry was the head of the array. */
                xas->xa_index++;
        }
}
EXPORT_SYMBOL_GPL(xas_pause);

/*
 * __xas_prev() - Find the previous entry in the XArray.
 * @xas: XArray operation state.
 *
 * Helper function for xas_prev() which handles all the complex cases
 * out of line.
 *
 * Unless @xas is in a frozen state, xa_index is decremented.  The walk
 * then climbs towards the root while it has stepped off the front of the
 * current node, and finally descends to the entry at the new index.
 *
 * Return: the entry at the new position; the result of set_bounds() if
 * the walk leaves the tree.
 */
void *__xas_prev(struct xa_state *xas)
{
        void *entry;

        if (!xas_frozen(xas->xa_node))
                xas->xa_index--;
        /* We were at the head entry; there is nothing before it. */
        if (!xas->xa_node)
                return set_bounds(xas);
        /* Not positioned in a node: do a full walk to the new index. */
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
                xas->xa_offset--;

        /*
         * An offset of 255 means the decrement wrapped below slot 0
         * (assumes xa_offset is an 8-bit unsigned field - confirm against
         * struct xa_state): continue in the parent, one slot earlier.
         */
        while (xas->xa_offset == 255) {
                xas->xa_offset = xas->xa_node->offset - 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        /* Descend through child nodes until a non-node entry is found. */
        for (;;) {
                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (!xa_is_node(entry))
                        return entry;

                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }
}
EXPORT_SYMBOL_GPL(__xas_prev);

/*
 * __xas_next() - Find the next entry in the XArray.
 * @xas: XArray operation state.
 *
 * Helper function for xas_next() which handles all the complex cases
 * out of line.
 *
 * Unless @xas is in a frozen state, xa_index is incremented.  The walk
 * then climbs towards the root while it has stepped off the end of the
 * current node, and finally descends to the entry at the new index.
 *
 * Return: the entry at the new position; the result of set_bounds() if
 * the walk leaves the tree.
 */
void *__xas_next(struct xa_state *xas)
{
        void *entry;

        if (!xas_frozen(xas->xa_node))
                xas->xa_index++;
        /* We were at the head entry; nothing follows it in the tree. */
        if (!xas->xa_node)
                return set_bounds(xas);
        /* Not positioned in a node: do a full walk to the new index. */
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
                xas->xa_offset++;

        /* Stepped past the last slot: continue in the parent's next slot. */
        while (xas->xa_offset == XA_CHUNK_SIZE) {
                xas->xa_offset = xas->xa_node->offset + 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        /* Descend through child nodes until a non-node entry is found. */
        for (;;) {
                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (!xa_is_node(entry))
                        return entry;

                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }
}
EXPORT_SYMBOL_GPL(__xas_next);

/**
 * xas_find() - Find the next present entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * If the @xas has not yet been walked to an entry, return the entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we move to the
 * next entry.
 *
 * If no entry is found and the array is smaller than @max, the iterator
 * is set to the smallest index not yet in the array.  This allows @xas
 * to be immediately passed to xas_store().
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find(struct xa_state *xas, unsigned long max)
{
        void *entry;

        if (xas_error(xas) || xas->xa_node == XAS_BOUNDS)
                return NULL;
        if (xas->xa_index > max)
                return set_bounds(xas);

        if (!xas->xa_node) {
                /* The head entry was already returned; index 1 is next. */
                xas->xa_index = 1;
                return set_bounds(xas);
        } else if (xas->xa_node == XAS_RESTART) {
                /* First call: look at the entry at xa_index itself. */
                entry = xas_load(xas);
                if (entry || xas_not_node(xas->xa_node))
                        return entry;
        } else if (!xas->xa_node->shift &&
                    xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) {
                /*
                 * xa_index and xa_offset disagree in a bottom-level node:
                 * recompute the offset from the index.
                 */
                xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1;
        }

        xas_advance(xas);

        while (xas->xa_node && (xas->xa_index <= max)) {
                /* End of this node: move to the parent's next slot. */
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                /* Descend into a child node, starting at its first slot. */
                if (xa_is_node(entry)) {
                        xas->xa_node = xa_to_node(entry);
                        xas->xa_offset = 0;
                        continue;
                }
                /* Sibling slots belong to an entry already seen. */
                if (entry && !xa_is_sibling(entry))
                        return entry;

                xas_advance(xas);
        }

        /* Walked off the top of the tree without finding anything. */
        if (!xas->xa_node)
                xas->xa_node = XAS_BOUNDS;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find);

/**
 * xas_find_marked() - Find the next marked entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark number to search for.
 *
 * If the @xas has not yet been walked to an entry, return the marked entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we return the
 * first marked entry with an index > xas.xa_index.
 *
 * If no marked entry is found and the array is smaller than @max, @xas is
 * set to the bounds state and xas->xa_index is set to the smallest index
 * not yet in the array.  This allows @xas to be immediately passed to
 * xas_store().
 *
 * If no entry is found before @max is reached, @xas is set to the restart
 * state.
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
{
        /*
         * Passed to xas_find_chunk().  Cleared whenever the slot at the
         * current offset has not been examined yet and so must itself be
         * considered (see the ">=" vs ">" wording in the kerneldoc above).
         */
        bool advance = true;
        unsigned int offset;
        void *entry;

        /* A state that already carries an error yields nothing. */
        if (xas_error(xas))
                return NULL;
        /* Starting index is already past @max: finish in the restart state. */
        if (xas->xa_index > max)
                goto max;

        if (!xas->xa_node) {
                /* No node to continue from: index becomes 1, then common exit. */
                xas->xa_index = 1;
                goto out;
        } else if (xas_top(xas->xa_node)) {
                /* Begin the search from the head of the array. */
                advance = false;
                entry = xa_head(xas->xa);
                xas->xa_node = NULL;
                /* Requested index lies beyond what the head entry can hold. */
                if (xas->xa_index > max_index(entry))
                        goto out;
                if (!xa_is_node(entry)) {
                        /*
                         * The head is a single entry rather than a node:
                         * return it if the array has @mark set, otherwise
                         * set the index to 1 and take the common exit.
                         */
                        if (xa_marked(xas->xa, mark))
                                return entry;
                        xas->xa_index = 1;
                        goto out;
                }
                /* Head is a node: start at the slot that covers xa_index. */
                xas->xa_node = xa_to_node(entry);
                xas->xa_offset = xas->xa_index >> xas->xa_node->shift;
        }

        while (xas->xa_index <= max) {
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        /*
                         * Ran off the end of this node: resume in the parent
                         * at the slot following this node's own slot.  No
                         * parent means the whole tree has been searched.
                         */
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        advance = false;
                        continue;
                }

                if (!advance) {
                        /*
                         * If the current slot holds a sibling entry, jump to
                         * the slot it refers to and adjust the index to match.
                         */
                        entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                        if (xa_is_sibling(entry)) {
                                xas->xa_offset = xa_to_sibling(entry);
                                xas_move_index(xas, xas->xa_offset);
                        }
                }

                offset = xas_find_chunk(xas, advance, mark);
                if (offset > xas->xa_offset) {
                        /* The search moved forward within this node. */
                        advance = false;
                        xas_move_index(xas, offset);
                        /* Mind the wrap */
                        if ((xas->xa_index - 1) >= max)
                                goto max;
                        xas->xa_offset = offset;
                        /* Nothing found in the rest of this chunk: go up. */
                        if (offset == XA_CHUNK_SIZE)
                                continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                /*
                 * A NULL entry is only an acceptable result when searching
                 * for XA_FREE_MARK in an array that tracks free entries.
                 */
                if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
                        continue;
                if (!xa_is_node(entry))
                        return entry;
                /* Marked slot holds a child node: descend and keep searching. */
                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }

out:
        /* Past @max means restart state; otherwise report the bounds state. */
        if (xas->xa_index > max)
                goto max;
        return set_bounds(xas);
max:
        xas->xa_node = XAS_RESTART;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_marked);

/**
 * xas_find_conflict() - Find the next present entry in a range.
 * @xas: XArray operation state.
 *
 * The @xas describes both a range and a position within that range.
 *
 * Context: Any context.  Expects xa_lock to be held.
 * Return: The next entry in the range covered by @xas or %NULL.
 */
void *xas_find_conflict(struct xa_state *xas)
{
        void *curr;

        /* An errored state, or one without a node, has nothing to report. */
        if (xas_error(xas))
                return NULL;

        if (!xas->xa_node)
                return NULL;

        if (xas_top(xas->xa_node)) {
                /*
                 * Not walked yet: start from the head and descend through
                 * nodes until a non-node entry (or NULL) is reached.
                 */
                curr = xas_start(xas);
                if (!curr)
                        return NULL;
                while (xa_is_node(curr)) {
                        struct xa_node *node = xa_to_node(curr);
                        curr = xas_descend(xas, node);
                }
                if (curr)
                        return curr;
        }

        /* The current node sits above the level described by xa_shift. */
        if (xas->xa_node->shift > xas->xa_shift)
                return NULL;

        for (;;) {
                if (xas->xa_node->shift == xas->xa_shift) {
                        /*
                         * At the level of the range itself: stop once every
                         * bit of xa_sibs is set in the current offset.
                         */
                        if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs)
                                break;
                } else if (xas->xa_offset == XA_CHUNK_MASK) {
                        /*
                         * Last slot of a node at a different level: go back
                         * up to the parent and continue from this node's slot.
                         */
                        xas->xa_offset = xas->xa_node->offset;
                        xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        continue;
                }
                /* Step to the next slot; sibling entries are skipped. */
                curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset);
                if (xa_is_sibling(curr))
                        continue;
                /* Descend through child nodes, always taking slot 0. */
                while (xa_is_node(curr)) {
                        xas->xa_node = xa_to_node(curr);
                        xas->xa_offset = 0;
                        curr = xa_entry_locked(xas->xa, xas->xa_node, 0);
                }
                if (curr)
                        return curr;
        }
        /* Nothing further found: move the offset back by xa_sibs. */
        xas->xa_offset -= xas->xa_sibs;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_conflict);

/**
 * xa_load() - Load an entry from an XArray.
 * @xa: XArray.
 * @index: index into array.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry at @index in @xa.
 */
void *xa_load(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        void *entry;

        rcu_read_lock();
        for (;;) {
                entry = xas_load(&xas);
                /* Zero entries are reported to the caller as NULL. */
                if (xa_is_zero(entry))
                        entry = NULL;
                /* Repeat the lookup for as long as xas_retry() asks for it. */
                if (!xas_retry(&xas, entry))
                        break;
        }
        rcu_read_unlock();

        return entry;
}
EXPORT_SYMBOL(xa_load);

/*
 * Convert the raw result of a store/load into what the public API returns:
 * a zero entry becomes NULL, and when @xas holds an error the value kept in
 * xas->xa_node is returned in place of @curr.
 */
static void *xas_result(struct xa_state *xas, void *curr)
{
        if (xa_is_zero(curr))
                return NULL;

        return xas_error(xas) ? (void *)xas->xa_node : curr;
}

/**
 * __xa_erase() - Erase this entry from the XArray while locked.
 * @xa: XArray.
 * @index: Index into array.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 * Return: The entry which used to be at this index.
 */
void *__xa_erase(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        /* Erasing is a store of NULL; keep what was there before. */
        void *old = xas_store(&xas, NULL);

        return xas_result(&xas, old);
}
EXPORT_SYMBOL(__xa_erase);

/**
 * xa_erase() - Erase this entry from the XArray.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * Return: The entry which used to be at this index.
 */
void *xa_erase(struct xarray *xa, unsigned long index)
{
        void *old;

        /* Locked wrapper around __xa_erase(). */
        xa_lock(xa);
        old = __xa_erase(xa, index);
        xa_unlock(xa);

        return old;
}
EXPORT_SYMBOL(xa_erase);

/**
 * __xa_store() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * You must already be holding the xa_lock when calling this function.
 * It will drop the lock if needed to allocate memory, and then reacquire
 * it afterwards.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *old;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);
        /* In a free-tracking array a NULL store is recorded as a zero entry. */
        if (!entry && xa_track_free(xa))
                entry = XA_ZERO_ENTRY;

        for (;;) {
                old = xas_store(&xas, entry);
                if (xa_track_free(xa))
                        xas_clear_mark(&xas, XA_FREE_MARK);
                /* Try again only while __xas_nomem() says a retry is due. */
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_result(&xas, old);
}
EXPORT_SYMBOL(__xa_store);

/**
 * xa_store() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * After this function returns, loads from this index will return @entry.
 * Storing into an existing multi-index entry updates the entry of every index.
 * The marks associated with @index are unaffected unless @entry is %NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry
 * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation
 * failed.
 */
void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        void *old;

        /* Locked wrapper around __xa_store(). */
        xa_lock(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock(xa);

        return old;
}
EXPORT_SYMBOL(xa_store);

/**
 * __xa_cmpxchg() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * You must already be holding the xa_lock when calling this function.
 * It will drop the lock if needed to allocate memory, and then reacquire
 * it afterwards.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *found;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);

        for (;;) {
                found = xas_load(&xas);
                /* Only replace the slot when it still holds @old. */
                if (found == old) {
                        xas_store(&xas, entry);
                        /* A NULL slot that became non-NULL is no longer free. */
                        if (xa_track_free(xa) && entry && !found)
                                xas_clear_mark(&xas, XA_FREE_MARK);
                }
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_result(&xas, found);
}
EXPORT_SYMBOL(__xa_cmpxchg);

/**
 * __xa_insert() - Store this entry in the XArray if no entry is present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;
        /* Inserting NULL stores a zero entry instead. */
        if (!entry)
                entry = XA_ZERO_ENTRY;

        for (;;) {
                if (xas_load(&xas)) {
                        /* Something is already present at this index. */
                        xas_set_err(&xas, -EBUSY);
                } else {
                        xas_store(&xas, entry);
                        if (xa_track_free(xa))
                                xas_clear_mark(&xas, XA_FREE_MARK);
                }
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_insert);

#ifdef CONFIG_XARRAY_MULTI
/*
 * Point @xas at @first and choose xa_shift/xa_sibs for a store that starts
 * there.  xa_store_range() calls this repeatedly, advancing @first by
 * xas_size() after each store, so each call presumably describes the next
 * piece of [@first, @last].  The sizing rules below are subtle; verify
 * carefully before changing them.
 */
static void xas_set_range(struct xa_state *xas, unsigned long first,
                unsigned long last)
{
        unsigned int shift = 0;
        /* Number of indices in the range beyond @first. */
        unsigned long sibs = last - first;
        unsigned int offset = XA_CHUNK_MASK;

        xas_set(xas, first);

        /*
         * While @first has its low chunk bits clear at the current level,
         * consider moving up one level: each pass raises shift by
         * XA_CHUNK_SHIFT and rescales first and sibs to that level.
         */
        while ((first & XA_CHUNK_MASK) == 0) {
                /* Fewer than a full chunk of slots remain: stay here. */
                if (sibs < XA_CHUNK_MASK)
                        break;
                if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK))
                        break;
                shift += XA_CHUNK_SHIFT;
                /* Record the low chunk bits of sibs while offset is still at its initial value. */
                if (offset == XA_CHUNK_MASK)
                        offset = sibs & XA_CHUNK_MASK;
                sibs >>= XA_CHUNK_SHIFT;
                first >>= XA_CHUNK_SHIFT;
        }

        /* Slot of @first at the chosen level; clamp sibs to the last slot. */
        offset = first & XA_CHUNK_MASK;
        if (offset + sibs > XA_CHUNK_MASK)
                sibs = XA_CHUNK_MASK - offset;
        /* Drop one sibling if the covered span would extend beyond @last. */
        if ((((first + sibs + 1) << shift) - 1) > last)
                sibs -= 1;

        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
}

/**
 * xa_store_range() - Store this entry at a range of indices in the XArray.
 * @xa: XArray.
 * @first: First index to affect.
 * @last: Last index to affect.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * After this function returns, loads from any index between @first and @last,
 * inclusive will return @entry.
 * Storing into an existing multi-index entry updates the entry of every index.
 * The marks associated with @index are unaffected unless @entry is %NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in
 * an XArray, or xa_err(-ENOMEM) if memory allocation failed.
 */
void *xa_store_range(struct xarray *xa, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        if (WARN_ON_ONCE(xa_is_internal(entry)))
                return XA_ERROR(-EINVAL);
        /* An inverted range is rejected outright. */
        if (last < first)
                return XA_ERROR(-EINVAL);

        do {
                xas_lock(&xas);
                if (entry) {
                        /*
                         * Position the state at @last, with an order taken
                         * from the lowest set bit of last + 1 (BITS_PER_LONG
                         * when last + 1 wraps to zero), and call xas_create()
                         * before storing anything.  Presumably this builds
                         * the nodes for the end of the range up front -
                         * confirm against xas_create().
                         */
                        unsigned int order = BITS_PER_LONG;
                        if (last + 1)
                                order = __ffs(last + 1);
                        xas_set_order(&xas, last, order);
                        xas_create(&xas, true);
                        if (xas_error(&xas))
                                goto unlock;
                }
                do {
                        /* Store one piece, then step past what it covered. */
                        xas_set_range(&xas, first, last);
                        xas_store(&xas, entry);
                        if (xas_error(&xas))
                                goto unlock;
                        first += xas_size(&xas);
                        /*
                         * Test "first - 1 < last", not "first <= last": when
                         * @last is ULONG_MAX, first wraps to 0 after the
                         * final piece and "first <= last" would stay true
                         * forever.  With the subtraction, the wrapped value
                         * becomes ULONG_MAX and the loop ends.
                         */
                } while (first - 1 < last);
unlock:
                xas_unlock(&xas);
        } while (xas_nomem(&xas, gfp));

        return xas_result(&xas, NULL);
}
EXPORT_SYMBOL(xa_store_range);

/**
 * xa_get_order() - Get the order of an entry.
 * @xa: XArray.
 * @index: Index of the entry.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
 */
int xa_get_order(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        void *entry;
        int order = 0;

        rcu_read_lock();
        entry = xas_load(&xas);

        /* An absent entry, or one not held in a node, has order 0. */
        if (entry && xas.xa_node) {
                /*
                 * Probe slots at power-of-two distances from the entry's
                 * slot; every sibling entry found there raises the order.
                 */
                for (;;) {
                        unsigned int slot = xas.xa_offset + (1 << order);

                        if (slot >= XA_CHUNK_SIZE ||
                            !xa_is_sibling(xas.xa_node->slots[slot]))
                                break;
                        order++;
                }

                /* Add the shift of the node the entry lives in. */
                order += xas.xa_node->shift;
        }
        rcu_read_unlock();

        return order;
}
EXPORT_SYMBOL(xa_get_order);
#endif /* CONFIG_XARRAY_MULTI */

/**
 * __xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @limit: Range for allocated ID.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;
        if (WARN_ON_ONCE(!xa_track_free(xa)))
                return -EINVAL;

        /* A NULL entry is stored as a zero entry. */
        if (!entry)
                entry = XA_ZERO_ENTRY;

        for (;;) {
                /* Each attempt searches again from the bottom of the limit. */
                xas.xa_index = limit.min;
                xas_find_marked(&xas, limit.max, XA_FREE_MARK);
                if (xas.xa_node != XAS_RESTART)
                        *id = xas.xa_index;
                else
                        xas_set_err(&xas, -EBUSY);
                xas_store(&xas, entry);
                xas_clear_mark(&xas, XA_FREE_MARK);
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_alloc);

/**
 * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        u32 lowest = limit.min;
        int ret;

        /* First attempt: search from *next, but never below the limit. */
        limit.min = max(lowest, *next);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        if (ret == 0 && (xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED)) {
                /* A wrap recorded by an earlier call is reported now. */
                xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED;
                ret = 1;
        }

        /* Second attempt: wrap around to the bottom of the limit. */
        if (ret < 0 && limit.min > lowest) {
                limit.min = lowest;
                ret = __xa_alloc(xa, id, entry, limit, gfp);
                if (ret == 0)
                        ret = 1;
        }

        if (ret < 0)
                return ret;

        /* Remember where to start next time; note when the counter wraps. */
        *next = *id + 1;
        if (*next == 0)
                xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED;
        return ret;
}
EXPORT_SYMBOL(__xa_alloc_cyclic);

/**
 * __xa_set_mark() - Set this mark on this entry while locked.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Attempting to set a mark on a %NULL entry does not succeed.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        /* Marks are only set on indices that hold a non-NULL entry. */
        if (xas_load(&xas))
                xas_set_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_set_mark);

/**
 * __xa_clear_mark() - Clear this mark on this entry while locked.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        /* Nothing to clear unless the index holds a non-NULL entry. */
        if (xas_load(&xas))
                xas_clear_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_clear_mark);

/**
 * xa_get_mark() - Inquire whether this mark is set on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * This function uses the RCU read lock, so the result may be out of date
 * by the time it returns.  If you need the result to be stable, use a lock.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: True if the entry at @index has this mark set, false if it doesn't.
 */
bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);
        bool found = false;
        void *entry;

        rcu_read_lock();
        entry = xas_start(&xas);
        /* Follow the mark downwards; stop as soon as it is missing. */
        while (xas_get_mark(&xas, mark)) {
                if (!xa_is_node(entry)) {
                        /* Mark still set and no further node to descend into. */
                        found = true;
                        break;
                }
                entry = xas_descend(&xas, xa_to_node(entry));
        }
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL(xa_get_mark);

/**
 * xa_set_mark() - Set this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Attempting to set a mark on a %NULL entry does not succeed.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        /* Locked wrapper: holds xa_lock around __xa_set_mark(). */
        xa_lock(xa);
        __xa_set_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_set_mark);

/**
 * xa_clear_mark() - Clear this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Clearing a mark always succeeds.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        /* Locked wrapper: holds xa_lock around __xa_clear_mark(). */
        xa_lock(xa);
        __xa_clear_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_clear_mark);

/**
 * xa_find() - Search the XArray for an entry.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Finds the entry in @xa which matches the @filter, and has the lowest
 * index that is at least @indexp and no more than @max.
 * If an entry is found, @indexp is updated to be the index of the entry.
 * This function is protected by the RCU read lock, so it may not find
 * entries which are being simultaneously added.  It will not return an
 * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry, if found, otherwise %NULL.
 */
void *xa_find(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp);
        /* Filters below XA_MAX_MARKS select by mark; others by presence. */
        const bool by_mark = (__force unsigned int)filter < XA_MAX_MARKS;
        void *entry;

        rcu_read_lock();
        for (;;) {
                if (by_mark)
                        entry = xas_find_marked(&xas, max, filter);
                else
                        entry = xas_find(&xas, max);
                if (!xas_retry(&xas, entry))
                        break;
        }
        rcu_read_unlock();

        /* Only report an index back when something was found. */
        if (entry)
                *indexp = xas.xa_index;
        return entry;
}
EXPORT_SYMBOL(xa_find);

/*
 * Report whether xa_index, taken relative to the range covered by the
 * current node, lies above the first index of the slot at xa_offset.
 * Always false without CONFIG_XARRAY_MULTI or when there is no node.
 */
static bool xas_sibling(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        unsigned long span_mask;
        unsigned long slot_start;

        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!node)
                return false;

        span_mask = (XA_CHUNK_SIZE << node->shift) - 1;
        slot_start = (unsigned long)xas->xa_offset << node->shift;

        return (xas->xa_index & span_mask) > slot_start;
}

/**
 * xa_find_after() - Search the XArray for a present entry.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Finds the entry in @xa which matches the @filter and has the lowest
 * index that is above @indexp and no more than @max.
 * If an entry is found, @indexp is updated to be the index of the entry.
 * This function is protected by the RCU read lock, so it may miss entries
 * which are being simultaneously added.  It will not return an
 * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The pointer, if found, otherwise %NULL.
 */
void *xa_find_after(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp + 1);
        const bool by_mark = (__force unsigned int)filter < XA_MAX_MARKS;
        void *entry;

        /* *indexp + 1 wrapped to zero: there is nothing after it. */
        if (xas.xa_index == 0)
                return NULL;

        rcu_read_lock();
        /*
         * Search again while the state is still valid and the result was
         * either reached through a sibling slot or needs a retry.
         */
        do {
                if (by_mark)
                        entry = xas_find_marked(&xas, max, filter);
                else
                        entry = xas_find(&xas, max);
        } while (!xas_invalid(&xas) &&
                 (xas_sibling(&xas) || xas_retry(&xas, entry)));
        rcu_read_unlock();

        if (entry)
                *indexp = xas.xa_index;
        return entry;
}
EXPORT_SYMBOL(xa_find_after);

/*
 * Copy up to @n entries found by xas_for_each() (up to index @max) into
 * @dst, skipping anything xas_retry() rejects.  Returns the number copied.
 */
static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n)
{
        unsigned int copied = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each(xas, entry, max) {
                if (!xas_retry(xas, entry)) {
                        dst[copied++] = entry;
                        /* Destination buffer is full. */
                        if (copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/*
 * Copy up to @n entries found by xas_for_each_marked() for @mark (up to
 * index @max) into @dst, skipping anything xas_retry() rejects.  Returns
 * the number copied.
 */
static unsigned int xas_extract_marked(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n, xa_mark_t mark)
{
        unsigned int copied = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each_marked(xas, entry, max, mark) {
                if (!xas_retry(xas, entry)) {
                        dst[copied++] = entry;
                        /* Destination buffer is full. */
                        if (copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/**
 * xa_extract() - Copy selected entries from the XArray into a normal array.
 * @xa: The source XArray to copy from.
 * @dst: The buffer to copy entries into.
 * @start: The first index in the XArray eligible to be selected.
 * @max: The last index in the XArray eligible to be selected.
 * @n: The maximum number of entries to copy.
 * @filter: Selection criterion.
 *
 * Copies up to @n entries that match @filter from the XArray.  The
 * copied entries will have indices between @start and @max, inclusive.
 *
 * The @filter may be an XArray mark value, in which case entries which are
 * marked with that mark will be copied.  It may also be %XA_PRESENT, in
 * which case all entries which are not %NULL will be copied.
 *
 * The entries returned may not represent a snapshot of the XArray at a
 * moment in time.  For example, if another thread stores to index 5, then
 * index 10, calling xa_extract() may return the old contents of index 5
 * and the new contents of index 10.  Indices not modified while this
 * function is running will not be skipped.
 *
 * If you need stronger guarantees, holding the xa_lock across calls to this
 * function will prevent concurrent modification.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The number of entries copied.
 */
unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
                        unsigned long max, unsigned int n, xa_mark_t filter)
{
        XA_STATE(xas, xa, start);

        /* No room in @dst means no work. */
        if (n == 0)
                return 0;

        /* Filters at or above XA_MAX_MARKS are not marks: copy present entries. */
        if ((__force unsigned int)filter >= XA_MAX_MARKS)
                return xas_extract_present(&xas, dst, max, n);

        return xas_extract_marked(&xas, dst, max, n, filter);
}
EXPORT_SYMBOL(xa_extract);

/**
 * xa_delete_node() - Private interface for workingset code.
 * @node: Node to be removed from the tree.
 * @update: Function to call to update ancestor nodes.
 *
 * Context: xa_lock must be held on entry and will not be released.
 */
void xa_delete_node(struct xa_node *node, xa_update_node_t update)
{
        struct xarray *xa = node->array;
        /* Shift of the level above @node, i.e. of the slot that holds it. */
        unsigned int parent_shift = node->shift + XA_CHUNK_SHIFT;
        struct xa_state xas = {
                .xa = xa,
                .xa_index = (unsigned long)node->offset << parent_shift,
                .xa_shift = parent_shift,
                .xa_offset = node->offset,
                .xa_node = xa_parent_locked(xa, node),
                .xa_update = update,
        };

        /* Storing NULL over the slot described above removes the node. */
        xas_store(&xas, NULL);
}
EXPORT_SYMBOL_GPL(xa_delete_node);        /* For the benefit of the test suite */

/**
 * xa_destroy() - Free all internal data structures.
 * @xa: XArray.
 *
 * After calling this function, the XArray is empty and has freed all memory
 * allocated for its internal data structures.  You are responsible for
 * freeing the objects referenced by the XArray.
 *
 * Context: Any context.  Takes and releases the xa_lock, interrupt-safe.
 */
void xa_destroy(struct xarray *xa)
{
        XA_STATE(xas, xa, 0);
        unsigned long flags;
        void *head;

        xas.xa_node = NULL;
        xas_lock_irqsave(&xas, flags);
        /* Take the head out of the array before freeing what hangs off it. */
        head = xa_head_locked(xa);
        RCU_INIT_POINTER(xa->xa_head, NULL);
        xas_init_marks(&xas);
        if (xa_zero_busy(xa))
                xa_mark_clear(xa, XA_FREE_MARK);
        /* lockdep checks we're still holding the lock in xas_free_nodes() */
        if (xa_is_node(head))
                xas_free_nodes(&xas, xa_to_node(head));
        xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(xa_destroy);

#ifdef XA_DEBUG
/* Debug helper: print one node's header fields followed by its mark words. */
void xa_dump_node(const struct xa_node *node)
{
        unsigned int mark, word;

        if (!node)
                return;
        /* A pointer with either of its low two bits set is printed bare. */
        if ((unsigned long)node & 3) {
                pr_cont("node %px\n", node);
                return;
        }

        pr_cont("node %px %s %d parent %px shift %d count %d values %d "
                "array %px list %px %px marks",
                node, node->parent ? "offset" : "max", node->offset,
                node->parent, node->shift, node->count, node->nr_values,
                node->array, node->private_list.prev, node->private_list.next);
        for (mark = 0; mark < XA_MAX_MARKS; mark++) {
                for (word = 0; word < XA_MARK_LONGS; word++)
                        pr_cont(" %lx", node->marks[mark][word]);
        }
        pr_cont("\n");
}

/* Debug helper: print the index, or index range, covered at @shift. */
void xa_dump_index(unsigned long index, unsigned int shift)
{
        /* Shift 0: a single index. */
        if (shift == 0) {
                pr_info("%lu: ", index);
                return;
        }

        /* Shift covers the whole word: the full index space. */
        if (shift >= BITS_PER_LONG) {
                pr_info("0-%lu: ", ~0UL);
                return;
        }

        pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1));
}

/*
 * Debug helper: print one entry, recursing through every slot when the
 * entry is a node.  NULL entries print nothing.
 */
void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift)
{
        if (!entry)
                return;

        xa_dump_index(index, shift);

        if (xa_is_node(entry)) {
                struct xa_node *node;
                unsigned long slot;

                /* Node-encoded value at shift 0: print the raw pointer only. */
                if (shift == 0) {
                        pr_cont("%px\n", entry);
                        return;
                }

                node = xa_to_node(entry);
                xa_dump_node(node);
                for (slot = 0; slot < XA_CHUNK_SIZE; slot++)
                        xa_dump_entry(node->slots[slot],
                                      index + (slot << node->shift), node->shift);
                return;
        }

        /* Not a node: classify it; the order of these checks matters. */
        if (xa_is_value(entry))
                pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry),
                                                xa_to_value(entry), entry);
        else if (!xa_is_internal(entry))
                pr_cont("%px\n", entry);
        else if (xa_is_retry(entry))
                pr_cont("retry (%ld)\n", xa_to_internal(entry));
        else if (xa_is_sibling(entry))
                pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry));
        else if (xa_is_zero(entry))
                pr_cont("zero (%ld)\n", xa_to_internal(entry));
        else
                pr_cont("UNKNOWN ENTRY (%px)\n", entry);
}

/* Debug helper: print the array header, then dump everything from the head. */
void xa_dump(const struct xarray *xa)
{
        void *head = xa->xa_head;
        unsigned int shift;

        pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, head,
                        xa->xa_flags, xa_marked(xa, XA_MARK_0),
                        xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2));
        /* A node head is dumped at the level above that node; else shift 0. */
        shift = xa_is_node(head) ? xa_to_node(head)->shift + XA_CHUNK_SHIFT : 0;
        xa_dump_entry(head, 0, shift);
}
#endif




















































































































































    1 



































































































































































































































































































































































































































    1 
    1 



























































































































































































































































































































































































































































































































































































    1 



    1 











    1 



















































    1 




    1 






    1 

    1 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>                /* init_rootfs */
#include <linux/fs_struct.h>        /* get_fs_root et.al. */
#include <linux/fsnotify.h>        /* fsnotify_vfsmount_delete */
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
unsigned int sysctl_mount_max __read_mostly = 100000;

/* Index mask and fold shift that m_hash() applies when indexing mount_hashtable. */
static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
/* The equivalent pair that mp_hash() applies when indexing mountpoint_hashtable. */
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

/* Value supplied on the kernel command line via "mhash_entries=". */
static __initdata unsigned long mhash_entries;

/*
 * Handler for the "mhash_entries=" boot parameter: parses @str with
 * automatic base detection and stores the result in mhash_entries.
 * Returns 1 when a value string was supplied, 0 when @str is NULL.
 */
static int __init set_mhash_entries(char *str)
{
        if (str) {
                mhash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("mhash_entries=", set_mhash_entries);

/* Value supplied on the kernel command line via "mphash_entries=". */
static __initdata unsigned long mphash_entries;

/*
 * Handler for the "mphash_entries=" boot parameter: parses @str with
 * automatic base detection and stores the result in mphash_entries.
 * Returns 1 when a value string was supplied, 0 when @str is NULL.
 */
static int __init set_mphash_entries(char *str)
{
        if (str) {
                mphash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("mphash_entries=", set_mphash_entries);

static u64 event;
/* ID allocators: mnt_id_ida backs mnt_alloc_id(), mnt_group_ida backs mnt_alloc_group_id(). */
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);

/* Hash tables indexed by m_hash() and mp_hash() respectively. */
static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
/* Slab cache from which alloc_vfsmnt() allocates struct mount. */
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);        /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

/*
 * Pick the mount_hashtable bucket for a (mount, dentry) pair.  Both pointers
 * are scaled down by L1_CACHE_BYTES and summed, the high bits are folded in
 * with m_hash_shift, and the result is masked with m_hash_mask.
 */
static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
        unsigned long key;

        key = (unsigned long)mnt / L1_CACHE_BYTES;
        key += (unsigned long)dentry / L1_CACHE_BYTES;
        key += key >> m_hash_shift;
        return mount_hashtable + (key & m_hash_mask);
}

/*
 * Pick the mountpoint_hashtable bucket for @dentry: the pointer is scaled
 * down by L1_CACHE_BYTES, the high bits are folded in with mp_hash_shift,
 * and the result is masked with mp_hash_mask.
 */
static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
        unsigned long key = (unsigned long)dentry / L1_CACHE_BYTES;

        key += key >> mp_hash_shift;
        return mountpoint_hashtable + (key & mp_hash_mask);
}

/*
 * Allocate a mount ID from mnt_id_ida and record it in @mnt->mnt_id.
 * Returns 0 on success, or the negative value reported by ida_alloc().
 */
static int mnt_alloc_id(struct mount *mnt)
{
        int id = ida_alloc(&mnt_id_ida, GFP_KERNEL);

        if (id >= 0) {
                mnt->mnt_id = id;
                return 0;
        }
        return id;
}

/* Hand @mnt's mnt_id back to mnt_id_ida, the allocator used by mnt_alloc_id(). */
static void mnt_free_id(struct mount *mnt)
{
        ida_free(&mnt_id_ida, mnt->mnt_id);
}

/*
 * Allocate a new peer group ID
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
        /* IDs are handed out starting at 1, so 0 is never a valid group ID. */
        int id = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);

        if (id >= 0) {
                mnt->mnt_group_id = id;
                return 0;
        }
        /* Allocation failed: pass the negative error straight through. */
        return id;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
        /* Return the ID to mnt_group_ida so it can be reused. */
        ida_free(&mnt_group_ida, mnt->mnt_group_id);
        /*
         * mnt_alloc_group_id() never hands out 0 (it allocates from 1), so 0
         * presumably denotes "no peer group" - confirm against callers.
         */
        mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
/* Add @n to @mnt's reference count. */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
        /* SMP: adjust only this CPU's slot; mnt_get_count() sums all slots. */
        this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
        /* !SMP: a single counter, updated with preemption disabled. */
        preempt_disable();
        mnt->mnt_count += n;
        preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
/*
 * Return @mnt's reference count.  On SMP this is the sum of the per-CPU
 * mnt_count slots over every possible CPU; otherwise it is the plain field.
 */
int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
        int total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;

        return total;
#else
        return mnt->mnt_count;
#endif
}

/*
 * Allocate and initialise a struct mount.
 * @name: optional device name; when non-NULL a copy is stored in mnt_devname.
 *
 * The new mount gets a fresh mount ID, a reference count of 1 and empty
 * list/hash linkage.  Returns the mount, or NULL if any allocation fails,
 * in which case everything acquired so far is released again.
 */
static struct mount *alloc_vfsmnt(const char *name)
{
        struct mount *mnt;

        mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
        if (!mnt)
                return NULL;

        if (mnt_alloc_id(mnt))
                goto out_free_cache;

        if (name) {
                mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
                if (!mnt->mnt_devname)
                        goto out_free_id;
        }

#ifdef CONFIG_SMP
        mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
        if (!mnt->mnt_pcp)
                goto out_free_devname;

        /* Start life with a single reference. */
        this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
        mnt->mnt_count = 1;
        mnt->mnt_writers = 0;
#endif

        INIT_HLIST_NODE(&mnt->mnt_hash);
        INIT_LIST_HEAD(&mnt->mnt_child);
        INIT_LIST_HEAD(&mnt->mnt_mounts);
        INIT_LIST_HEAD(&mnt->mnt_list);
        INIT_LIST_HEAD(&mnt->mnt_expire);
        INIT_LIST_HEAD(&mnt->mnt_share);
        INIT_LIST_HEAD(&mnt->mnt_slave_list);
        INIT_LIST_HEAD(&mnt->mnt_slave);
        INIT_HLIST_NODE(&mnt->mnt_mp_list);
        INIT_LIST_HEAD(&mnt->mnt_umounting);
        INIT_HLIST_HEAD(&mnt->mnt_stuck_children);

        return mnt;

        /* Error unwinding: release resources in reverse order of acquisition. */
#ifdef CONFIG_SMP
out_free_devname:
        kfree_const(mnt->mnt_devname);
#endif
out_free_id:
        mnt_free_id(mnt);
out_free_cache:
        kmem_cache_free(mnt_cache, mnt);
        return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
bool __mnt_is_readonly(struct vfsmount *mnt)
{
        return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

/* Increment @mnt's writer count: this CPU's slot on SMP, the plain field otherwise. */
static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
        mnt->mnt_writers++;
#endif
}

/* Decrement @mnt's writer count: this CPU's slot on SMP, the plain field otherwise. */
static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
        mnt->mnt_writers--;
#endif
}

/*
 * Return @mnt's writer count.  On SMP this is the sum of the per-CPU
 * mnt_writers slots over every possible CPU; otherwise it is the plain field.
 */
static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        unsigned int total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;

        return total;
#else
        return mnt->mnt_writers;
#endif
}

/*
 * Report whether @mnt must currently be treated as read-only.  Returns 1
 * while the superblock has s_readonly_remount set; otherwise returns the
 * result of __mnt_is_readonly().
 */
static int mnt_is_readonly(struct vfsmount *mnt)
{
        if (mnt->mnt_sb->s_readonly_remount)
                return 1;
        /* Order wrt setting s_flags/s_readonly_remount in do_remount() */
        smp_rmb();
        return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, __mnt_drop_write() must be
 * called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
        struct mount *mnt = real_mount(m);
        int ret = 0;

        preempt_disable();
        mnt_inc_writers(mnt);
        /*
         * The store to mnt_inc_writers must be visible before we pass
         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
         * incremented count after it has set MNT_WRITE_HOLD.
         */
        smp_mb();
        while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
                cpu_relax();
        /*
         * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
         * be set to match its requirements. So we must not load that until
         * MNT_WRITE_HOLD is cleared.
         */
        smp_rmb();
        if (mnt_is_readonly(m)) {
                mnt_dec_writers(mnt);
                ret = -EROFS;
        }
        preempt_enable();

        return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
        int err;

        /* Take freeze protection first, then the mount write count. */
        sb_start_write(m->mnt_sb);
        err = __mnt_want_write(m);
        if (!err)
                return 0;

        /* No write access was granted: release freeze protection again. */
        sb_end_write(m->mnt_sb);
        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * When finished, mnt_drop_write() must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
        struct mount *m;

        /* superblock may be r/o */
        if (__mnt_is_readonly(mnt))
                return -EROFS;

        m = real_mount(mnt);
        preempt_disable();
        mnt_inc_writers(m);
        preempt_enable();

        return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
        struct vfsmount *mnt = file->f_path.mnt;

        /* A file flagged FMODE_WRITER takes the cheaper clone path. */
        if (file->f_mode & FMODE_WRITER)
                return mnt_clone_write(mnt);

        return __mnt_want_write(mnt);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
        int err;

        /* Freeze protection is taken on the superblock of the file's inode. */
        sb_start_write(file_inode(file)->i_sb);
        err = __mnt_want_write_file(file);
        if (!err)
                return 0;

        /* No write access was granted: release freeze protection again. */
        sb_end_write(file_inode(file)->i_sb);
        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        preempt_disable();
        mnt_dec_writers(m);
        preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
        struct super_block *sb = mnt->mnt_sb;

        /* Release the mount write count, then the freeze protection. */
        __mnt_drop_write(mnt);
        sb_end_write(sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

/* Give up the write access on the mount that @file lives on. */
void __mnt_drop_write_file(struct file *file)
{
        struct vfsmount *mnt = file->f_path.mnt;

        __mnt_drop_write(mnt);
}

/*
 * Give up the write access on @file's mount and end the superblock write
 * section of the file's inode.
 */
void mnt_drop_write_file(struct file *file)
{
        struct super_block *sb = file_inode(file)->i_sb;

        __mnt_drop_write_file(file);
        sb_end_write(sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

/*
 * Try to flip @mnt to read-only.
 *
 * Under the mount hash lock, MNT_WRITE_HOLD is raised so that new writers in
 * __mnt_want_write() spin, the writer counters are summed, and MNT_READONLY
 * is set only if that sum is zero.  MNT_WRITE_HOLD is always cleared again
 * before returning.
 *
 * Returns 0 if the mount was made read-only, -EBUSY if writers are active.
 */
static int mnt_make_readonly(struct mount *mnt)
{
        int ret = 0;

        lock_mount_hash();
        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
         * should be visible before we do.
         */
        smp_mb();

        /*
         * With writers on hold, if this value is zero, then there are
         * definitely no active writers (although held writers may subsequently
         * increment the count, they'll have to wait, and decrement it after
         * seeing MNT_READONLY).
         *
         * It is OK to have counter incremented on one CPU and decremented on
         * another: the sum will add up correctly. The danger would be when we
         * sum up each counter, if we read a counter before it is incremented,
         * but then read another CPU's count which it has been subsequently
         * decremented from -- we would see more decrements than we should.
         * MNT_WRITE_HOLD protects against this scenario, because
         * mnt_want_write first increments count, then smp_mb, then spins on
         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
         * we're counting up here.
         */
        if (mnt_get_writers(mnt) > 0)
                ret = -EBUSY;
        else
                mnt->mnt.mnt_flags |= MNT_READONLY;
        /*
         * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
         * that become unheld will see MNT_READONLY.
         */
        smp_wmb();
        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
        unlock_mount_hash();
        return ret;
}

/*
 * Clear MNT_READONLY on @mnt under the mount hash lock.
 *
 * Always returns 0.
 */
static int __mnt_unmake_readonly(struct mount *mnt)
{
        lock_mount_hash();
        mnt->mnt.mnt_flags &= ~MNT_READONLY;
        unlock_mount_hash();
        return 0;
}

/*
 * Prepare superblock @sb for a read-only remount.
 *
 * Walks the mounts of @sb, raising MNT_WRITE_HOLD on each one that is not
 * already MNT_READONLY, and stops at the first one with active writers.  If
 * no mount is busy and sb->s_remove_count is zero, sb->s_readonly_remount is
 * set, which makes mnt_is_readonly() report read-only.  MNT_WRITE_HOLD is
 * dropped from every mount before returning, whatever the outcome.
 *
 * Returns 0 on success, -EBUSY otherwise.
 */
int sb_prepare_remount_readonly(struct super_block *sb)
{
        struct mount *mnt;
        int err = 0;

        /* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
        if (atomic_long_read(&sb->s_remove_count))
                return -EBUSY;

        lock_mount_hash();
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
                        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
                        /* Same store-then-read-counters ordering as mnt_make_readonly(). */
                        smp_mb();
                        if (mnt_get_writers(mnt) > 0) {
                                err = -EBUSY;
                                break;
                        }
                }
        }
        /* The recheck promised above, now that writers are held. */
        if (!err && atomic_long_read(&sb->s_remove_count))
                err = -EBUSY;

        if (!err) {
                sb->s_readonly_remount = 1;
                /* Publish the flag before MNT_WRITE_HOLD is cleared below. */
                smp_wmb();
        }
        /* Release every mount that was put on hold, on success and on failure. */
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
        }
        unlock_mount_hash();

        return err;
}

/* Release the memory owned by @mnt and then the mount structure itself. */
static void free_vfsmnt(struct mount *mnt)
{
#ifdef CONFIG_SMP
        /* The per-cpu counters exist only on SMP builds. */
        free_percpu(mnt->mnt_pcp);
#endif
        kfree_const(mnt->mnt_devname);
        kmem_cache_free(mnt_cache, mnt);
}

/* RCU callback: free the mount that embeds @head as its mnt_rcu member. */
static void delayed_free_vfsmnt(struct rcu_head *head)
{
        struct mount *mnt = container_of(head, struct mount, mnt_rcu);

        free_vfsmnt(mnt);
}

/*
 * Try to take a reference on @bastard, validated against mount_lock
 * sequence @seq.  Call under rcu_read_lock.
 *
 * Returns:
 *   0  - success (reference taken, or @bastard was NULL)
 *   1  - failure, no reference is held
 *  -1  - failure, but a reference is still held and the caller must mntput()
 */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        struct mount *mnt;
        if (read_seqretry(&mount_lock, seq))
                return 1;
        if (bastard == NULL)
                return 0;
        mnt = real_mount(bastard);
        mnt_add_count(mnt, 1);
        smp_mb();                        // see mntput_no_expire()
        /* Sequence unchanged since the caller sampled it: the reference is good. */
        if (likely(!read_seqretry(&mount_lock, seq)))
                return 0;
        /* Sequence changed: give the reference back directly if that is allowed. */
        if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
                mnt_add_count(mnt, -1);
                return 1;
        }
        lock_mount_hash();
        if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
                mnt_add_count(mnt, -1);
                unlock_mount_hash();
                return 1;
        }
        unlock_mount_hash();
        /* caller will mntput() */
        return -1;
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        int rc = __legitimize_mnt(bastard, seq);

        if (likely(rc == 0))
                return true;

        /*
         * A negative result means a reference is still held; drop it with
         * mntput() outside the RCU read-side section.
         */
        if (unlikely(rc < 0)) {
                rcu_read_unlock();
                mntput(bastard);
                rcu_read_lock();
        }

        return false;
}

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
        struct hlist_head *head = m_hash(mnt, dentry);
        struct mount *p;

        hlist_for_each_entry_rcu(p, head, mnt_hash)
                if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
                        return p;
        return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
        struct vfsmount *found;
        struct mount *child;
        unsigned seq;

        rcu_read_lock();
        for (;;) {
                seq = read_seqbegin(&mount_lock);
                child = __lookup_mnt(path->mnt, path->dentry);

                found = NULL;
                if (child)
                        found = &child->mnt;

                /* Retry the lookup until the result can be legitimized. */
                if (legitimize_mnt(found, seq))
                        break;
        }
        rcu_read_unlock();

        return found;
}

/* Take the spinlock (ns->ns_lock) of mount namespace @ns. */
static inline void lock_ns_list(struct mnt_namespace *ns)
{
        spin_lock(&ns->ns_lock);
}

/* Release the spinlock (ns->ns_lock) taken by lock_ns_list(). */
static inline void unlock_ns_list(struct mnt_namespace *ns)
{
        spin_unlock(&ns->ns_lock);
}

/* True if @mnt carries MNT_CURSOR; list walkers in this file skip such entries. */
static inline bool mnt_is_cursor(struct mount *mnt)
{
        return (mnt->mnt.mnt_flags & MNT_CURSOR) != 0;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mount *m;
        bool found = false;

        down_read(&namespace_sem);
        lock_ns_list(ns);
        list_for_each_entry(m, &ns->list, mnt_list) {
                /* Cursor entries are skipped; stop at the first real match. */
                if (!mnt_is_cursor(m) && m->mnt_mountpoint == dentry) {
                        found = true;
                        break;
                }
        }
        unlock_ns_list(ns);
        up_read(&namespace_sem);

        return found;
}

/*
 * Look up the mountpoint hashed for @dentry.  On a hit its m_count is
 * incremented and it is returned; otherwise NULL is returned.
 */
static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
        struct hlist_head *bucket = mp_hash(dentry);
        struct mountpoint *mp;

        hlist_for_each_entry(mp, bucket, m_hash) {
                if (mp->m_dentry != dentry)
                        continue;
                mp->m_count++;
                return mp;
        }

        return NULL;
}

/*
 * Find or create the struct mountpoint for @dentry and take a reference on it.
 *
 * Returns the mountpoint with m_count raised (existing entry) or set to 1
 * (newly created entry), or an ERR_PTR(): -ENOENT if @dentry is already a
 * mountpoint but has been unlinked, -ENOMEM if allocation fails, or the error
 * returned by d_set_mounted().
 */
static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
        struct mountpoint *mp, *new = NULL;
        int ret;

        if (d_mountpoint(dentry)) {
                /* might be worth a WARN_ON() */
                if (d_unlinked(dentry))
                        return ERR_PTR(-ENOENT);
mountpoint:
                /* Also the retry target when d_set_mounted() reports -EBUSY. */
                read_seqlock_excl(&mount_lock);
                mp = lookup_mountpoint(dentry);
                read_sequnlock_excl(&mount_lock);
                if (mp)
                        goto done;
        }

        /* On a retry the allocation from the previous pass is reused. */
        if (!new)
                new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);


        /* Exactly one process may set d_mounted */
        ret = d_set_mounted(dentry);

        /* Someone else set d_mounted? */
        if (ret == -EBUSY)
                goto mountpoint;

        /* The dentry is not available as a mountpoint? */
        mp = ERR_PTR(ret);
        if (ret)
                goto done;

        /* Add the new mountpoint to the hash table */
        read_seqlock_excl(&mount_lock);
        new->m_dentry = dget(dentry);
        new->m_count = 1;
        hlist_add_head(&new->m_hash, mp_hash(dentry));
        INIT_HLIST_HEAD(&new->m_list);
        read_sequnlock_excl(&mount_lock);

        mp = new;
        /* Ownership moved to the hash table; keep kfree() below from freeing it. */
        new = NULL;
done:
        /* Frees an allocation that ended up unused; kfree(NULL) is a no-op. */
        kfree(new);
        return mp;
}

/*
 * vfsmount lock must be held.  Additionally, the caller is responsible
 * for serializing calls for given disposal list.
 */
static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
{
        struct dentry *dentry;

        /* Nothing more to do unless this was the last reference. */
        if (--mp->m_count)
                return;

        dentry = mp->m_dentry;
        BUG_ON(!hlist_empty(&mp->m_list));

        /* The dentry stops being a mountpoint: clear the flag under d_lock. */
        spin_lock(&dentry->d_lock);
        dentry->d_flags &= ~DCACHE_MOUNTED;
        spin_unlock(&dentry->d_lock);

        /* Hand the dentry reference to @list, then unhash and free @mp. */
        dput_to_list(dentry, list);
        hlist_del(&mp->m_hash);
        kfree(mp);
}

/*
 * Drop a reference on @mp, queueing any dentry to be released on the global
 * ex_mountpoints list (drained by namespace_unlock()).
 *
 * called with namespace_lock and vfsmount lock
 */
static void put_mountpoint(struct mountpoint *mp)
{
        __put_mountpoint(mp, &ex_mountpoints);
}

/* Non-zero if @mnt belongs to the current task's mount namespace. */
static inline int check_mnt(struct mount *mnt)
{
        struct mnt_namespace *cur_ns = current->nsproxy->mnt_ns;

        return mnt->mnt_ns == cur_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
        if (!ns)
                return;

        /* Advance the global event counter and stamp @ns with the new value. */
        ns->event = ++event;
        wake_up_interruptible(&ns->poll);
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
        /* Nothing to do without a namespace or when it is already current. */
        if (!ns || ns->event == event)
                return;

        /* Catch up with the global event counter without advancing it. */
        ns->event = event;
        wake_up_interruptible(&ns->poll);
}

/*
 * vfsmount lock must be held for write
 */
static struct mountpoint *unhash_mnt(struct mount *mnt)
{
        struct mountpoint *old_mp = mnt->mnt_mp;

        /* Make @mnt its own parent, mounted on its own root. */
        mnt->mnt_parent = mnt;
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;

        /* Unlink from the child list, the mount hash and the mountpoint list. */
        list_del_init(&mnt->mnt_child);
        hlist_del_init_rcu(&mnt->mnt_hash);
        hlist_del_init(&mnt->mnt_mp_list);

        /* The caller inherits the mountpoint reference @mnt was holding. */
        mnt->mnt_mp = NULL;
        return old_mp;
}

/*
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
        struct mountpoint *mp = unhash_mnt(mnt);

        put_mountpoint(mp);
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp,
                        struct mount *child_mnt)
{
        /* Take a reference on the mountpoint and on the parent mount. */
        mp->m_count++;
        mnt_add_count(mnt, 1);        /* essentially, that's mntget */

        /* Record where the child is mounted and who its parent is. */
        child_mnt->mnt_mountpoint = mp->m_dentry;
        child_mnt->mnt_parent = mnt;
        child_mnt->mnt_mp = mp;

        /* Put the child on the mountpoint's list of mounts. */
        hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/* Hash @mnt under (@parent, its mountpoint) and add it to @parent's children. */
static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
        struct hlist_head *bucket = m_hash(&parent->mnt, mnt->mnt_mountpoint);

        hlist_add_head_rcu(&mnt->mnt_hash, bucket);
        list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt, struct mount *parent,
                       struct mountpoint *mp)
{
        /* First record parent and mountpoint in @mnt ... */
        mnt_set_mountpoint(parent, mp, mnt);
        /* ... then make it reachable through the hash and the child list. */
        __attach_mnt(mnt, parent);
}

/*
 * Move @mnt so that it is mounted on @mp under @parent, then release the
 * references it held on its previous mountpoint and previous parent.
 */
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
        struct mount *prev_parent = mnt->mnt_parent;
        struct mountpoint *prev_mp = mnt->mnt_mp;

        /* Unlink from the old location. */
        list_del_init(&mnt->mnt_child);
        hlist_del_init(&mnt->mnt_mp_list);
        hlist_del_init_rcu(&mnt->mnt_hash);

        /* Link in at the new location. */
        attach_mnt(mnt, parent, mp);

        /* Drop what the old location was pinning. */
        put_mountpoint(prev_mp);
        mnt_add_count(prev_parent, -1);
}

/*
 * Make the tree rooted at @mnt part of its parent's mount namespace: assign
 * the namespace to every mount on @mnt's mnt_list ring, append them to the
 * namespace list, fold pending_mounts into the mount count, and attach @mnt
 * to its parent.
 *
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;
        struct mount *m;
        LIST_HEAD(head);
        struct mnt_namespace *n = parent->mnt_ns;

        /* A mount that is its own parent has not been given a mountpoint. */
        BUG_ON(parent == mnt);

        /* Link a local head into @mnt's mnt_list ring so it can be iterated. */
        list_add_tail(&head, &mnt->mnt_list);
        list_for_each_entry(m, &head, mnt_list)
                m->mnt_ns = n;

        /* Append the whole ring at the tail of the namespace's list. */
        list_splice(&head, n->list.prev);

        n->mounts += n->pending_mounts;
        n->pending_mounts = 0;

        __attach_mnt(mnt, parent);
        touch_mnt_namespace(n);
}

/*
 * Return the mount that follows @p in a walk of the tree rooted at @root:
 * @p's first child if it has one, otherwise the next sibling of @p or of its
 * closest ancestor that has one.  Returns NULL once the walk is back at @root.
 */
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
        struct list_head *next = p->mnt_mounts.next;

        /* Descend into the first child when there is one. */
        if (next != &p->mnt_mounts)
                return list_entry(next, struct mount, mnt_child);

        /* No children: climb until a next sibling exists or @root is reached. */
        for (;;) {
                if (p == root)
                        return NULL;
                next = p->mnt_child.next;
                if (next != &p->mnt_parent->mnt_mounts)
                        return list_entry(next, struct mount, mnt_child);
                p = p->mnt_parent;
        }
}

/*
 * Starting at @p, keep stepping to the last child until a mount without
 * children is reached, and return that mount.
 */
static struct mount *skip_mnt_tree(struct mount *p)
{
        for (;;) {
                struct list_head *last = p->mnt_mounts.prev;

                /* An empty child list points back at itself. */
                if (last == &p->mnt_mounts)
                        return p;
                p = list_entry(last, struct mount, mnt_child);
        }
}

/**
 * vfs_create_mount - Create a mount for a configured superblock
 * @fc: The configuration context with the superblock attached
 *
 * Create a mount to an already configured superblock.  If necessary, the
 * caller should invoke vfs_get_tree() before calling this.
 *
 * Note that this does not attach the mount to anything.
 */
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
        struct mount *mnt;

        if (!fc->root)
                return ERR_PTR(-EINVAL);

        mnt = alloc_vfsmnt(fc->source ?: "none");
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (fc->sb_flags & SB_KERNMOUNT)
                mnt->mnt.mnt_flags = MNT_INTERNAL;

        atomic_inc(&fc->root->d_sb->s_active);
        mnt->mnt.mnt_sb                = fc->root->d_sb;
        mnt->mnt.mnt_root        = dget(fc->root);
        mnt->mnt_mountpoint        = mnt->mnt.mnt_root;
        mnt->mnt_parent                = mnt;

        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
        unlock_mount_hash();
        return &mnt->mnt;
}
EXPORT_SYMBOL(vfs_create_mount);

/*
 * Get the tree for @fc and create a mount for it.  Returns the new mount, or
 * an ERR_PTR() carrying the vfs_get_tree() error.
 */
struct vfsmount *fc_mount(struct fs_context *fc)
{
        int err = vfs_get_tree(fc);

        if (err)
                return ERR_PTR(err);

        /* Release s_umount on the new root's superblock before mounting. */
        up_write(&fc->root->d_sb->s_umount);
        return vfs_create_mount(fc);
}
EXPORT_SYMBOL(fc_mount);

/*
 * Create a mount of filesystem @type: build a context, feed it @name as the
 * "source" parameter (when non-NULL) and @data as monolithic mount data, then
 * mount it.  Returns the mount or an ERR_PTR().
 */
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
                                int flags, const char *name,
                                void *data)
{
        struct fs_context *fc;
        struct vfsmount *mnt;
        int err = 0;

        if (!type)
                return ERR_PTR(-EINVAL);

        fc = fs_context_for_mount(type, flags);
        if (IS_ERR(fc))
                return ERR_CAST(fc);

        /* Each step runs only if every earlier one succeeded. */
        if (name)
                err = vfs_parse_fs_string(fc, "source",
                                          name, strlen(name));
        if (err == 0)
                err = parse_monolithic_mount_data(fc, data);

        if (err)
                mnt = ERR_PTR(err);
        else
                mnt = fc_mount(fc);

        /* The context is released on success and on failure alike. */
        put_fs_context(fc);
        return mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

struct vfsmount *
vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
             const char *name, void *data)
{
        /* Until it is worked out how to pass the user namespace
         * through from the parent mount to the submount don't support
         * unprivileged mounts with submounts.
         */
        if (mountpoint->d_sb->s_user_ns == &init_user_ns)
                return vfs_kern_mount(type, SB_SUBMOUNT, name, data);

        return ERR_PTR(-EPERM);
}
EXPORT_SYMBOL_GPL(vfs_submount);

/*
 * Create a new mount of @old's superblock, rooted at @root.
 *
 * @flag is a mask of CL_* bits that selects how the clone relates to @old
 * (peer, slave or private) and whether it joins @old's expiry list.
 *
 * Returns the new mount, or an ERR_PTR(): -ENOMEM if allocation fails, or the
 * error from mnt_alloc_group_id().
 */
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                                        int flag)
{
        struct super_block *sb = old->mnt.mnt_sb;
        struct mount *mnt;
        int err;

        mnt = alloc_vfsmnt(old->mnt_devname);
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
                mnt->mnt_group_id = 0; /* not a peer of original */
        else
                mnt->mnt_group_id = old->mnt_group_id;

        /* A clone that must be shared but has no group yet needs a fresh id. */
        if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
                err = mnt_alloc_group_id(mnt);
                if (err)
                        goto out_free;
        }

        /* Copy the flags, minus the ones that must not carry over to a clone. */
        mnt->mnt.mnt_flags = old->mnt.mnt_flags;
        mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);

        atomic_inc(&sb->s_active);
        mnt->mnt.mnt_sb = sb;
        mnt->mnt.mnt_root = dget(root);
        /* Not attached anywhere yet: own parent, mounted on its own root. */
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        mnt->mnt_parent = mnt;
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
        unlock_mount_hash();

        if ((flag & CL_SLAVE) ||
            ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
                /* The clone becomes a slave of @old. */
                list_add(&mnt->mnt_slave, &old->mnt_slave_list);
                mnt->mnt_master = old;
                CLEAR_MNT_SHARED(mnt);
        } else if (!(flag & CL_PRIVATE)) {
                /* The clone takes the same peer/slave relationships as @old. */
                if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
                        list_add(&mnt->mnt_share, &old->mnt_share);
                if (IS_MNT_SLAVE(old))
                        list_add(&mnt->mnt_slave, &old->mnt_slave);
                mnt->mnt_master = old->mnt_master;
        } else {
                CLEAR_MNT_SHARED(mnt);
        }
        if (flag & CL_MAKE_SHARED)
                set_mnt_shared(mnt);

        /* stick the duplicate mount on the same expiry list
         * as the original if that was on one */
        if (flag & CL_EXPIRE) {
                if (!list_empty(&old->mnt_expire))
                        list_add(&mnt->mnt_expire, &old->mnt_expire);
        }

        return mnt;

 out_free:
        mnt_free_id(mnt);
        free_vfsmnt(mnt);
        return ERR_PTR(err);
}

/*
 * Final teardown of @mnt: kill its pins, put any stuck children, release the
 * root dentry and the superblock reference, free the mount id, and schedule
 * the structure itself to be freed by delayed_free_vfsmnt() through call_rcu().
 */
static void cleanup_mnt(struct mount *mnt)
{
        struct hlist_node *p;
        struct mount *m;
        /*
         * The warning here probably indicates that somebody messed
         * up a mnt_want/drop_write() pair.  If this happens, the
         * filesystem was probably unable to make r/w->r/o transitions.
         * The locking used to deal with mnt_count decrement provides barriers,
         * so mnt_get_writers() below is safe.
         */
        WARN_ON(mnt_get_writers(mnt));
        if (unlikely(mnt->mnt_pins.first))
                mnt_pin_kill(mnt);
        /* Children parked on mnt_stuck_children by mntput_no_expire(). */
        hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
        fsnotify_vfsmount_delete(&mnt->mnt);
        dput(mnt->mnt.mnt_root);
        deactivate_super(mnt->mnt.mnt_sb);
        mnt_free_id(mnt);
        call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}

/* Callback form of cleanup_mnt(): @head is the mount's embedded mnt_rcu. */
static void __cleanup_mnt(struct rcu_head *head)
{
        struct mount *mnt = container_of(head, struct mount, mnt_rcu);

        cleanup_mnt(mnt);
}

/* Mounts whose cleanup_mnt() has been deferred to delayed_mntput_work. */
static LLIST_HEAD(delayed_mntput_list);
/* Work handler: detach the whole deferred list and clean up each mount on it. */
static void delayed_mntput(struct work_struct *unused)
{
        struct llist_node *node = llist_del_all(&delayed_mntput_list);
        struct mount *m, *t;

        llist_for_each_entry_safe(m, t, node, mnt_llist)
                cleanup_mnt(m);
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);

/*
 * Drop one reference on @mnt without touching its expiry mark.
 *
 * While ->mnt_ns is non-NULL this is just a counter decrement.  Otherwise
 * the decrement is done under the mount hash lock, and if the count reaches
 * zero the mount is marked MNT_DOOMED, unlinked, and handed to cleanup_mnt()
 * -- directly for MNT_INTERNAL mounts, otherwise via task work or the delayed
 * workqueue list.
 */
static void mntput_no_expire(struct mount *mnt)
{
        LIST_HEAD(list);
        int count;

        rcu_read_lock();
        if (likely(READ_ONCE(mnt->mnt_ns))) {
                /*
                 * Since we don't do lock_mount_hash() here,
                 * ->mnt_ns can change under us.  However, if it's
                 * non-NULL, then there's a reference that won't
                 * be dropped until after an RCU delay done after
                 * turning ->mnt_ns NULL.  So if we observe it
                 * non-NULL under rcu_read_lock(), the reference
                 * we are dropping is not the final one.
                 */
                mnt_add_count(mnt, -1);
                rcu_read_unlock();
                return;
        }
        lock_mount_hash();
        /*
         * make sure that if __legitimize_mnt() has not seen us grab
         * mount_lock, we'll see their refcount increment here.
         */
        smp_mb();
        mnt_add_count(mnt, -1);
        count = mnt_get_count(mnt);
        if (count != 0) {
                /* Other references remain; a negative count is a bug. */
                WARN_ON(count < 0);
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        /* Already marked MNT_DOOMED: do not run the teardown again. */
        if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        mnt->mnt.mnt_flags |= MNT_DOOMED;
        rcu_read_unlock();

        list_del(&mnt->mnt_instance);

        /* Unhash remaining children and park them for cleanup_mnt() to put. */
        if (unlikely(!list_empty(&mnt->mnt_mounts))) {
                struct mount *p, *tmp;
                list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
                        __put_mountpoint(unhash_mnt(p), &list);
                        hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
                }
        }
        unlock_mount_hash();
        /* Release the dentries collected by __put_mountpoint() above. */
        shrink_dentry_list(&list);

        if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
                struct task_struct *task = current;
                /* Non-kthread tasks try to run the cleanup as task work first. */
                if (likely(!(task->flags & PF_KTHREAD))) {
                        init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
                        if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
                                return;
                }
                /* Fallback: queue for delayed_mntput(); schedule the work if llist_add() says so. */
                if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
                        schedule_delayed_work(&delayed_mntput_work, 1);
                return;
        }
        /* MNT_INTERNAL mounts are cleaned up synchronously. */
        cleanup_mnt(mnt);
}

/* Drop a reference on @mnt, clearing its expiry mark first; NULL is a no-op. */
void mntput(struct vfsmount *mnt)
{
        struct mount *m;

        if (!mnt)
                return;

        m = real_mount(mnt);
        /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
        if (unlikely(m->mnt_expiry_mark))
                m->mnt_expiry_mark = 0;
        mntput_no_expire(m);
}
EXPORT_SYMBOL(mntput);

struct vfsmount *mntget(struct vfsmount *mnt)
{
        if (mnt)
                mnt_add_count(real_mount(mnt), 1);
        return mnt;
}
EXPORT_SYMBOL(mntget);

/* path_is_mountpoint() - Check if path is a mount in the current
 *                          namespace.
 *
 *  d_mountpoint() can only be used reliably to establish if a dentry is
 *  not mounted in any namespace and that common case is handled inline.
 *  d_mountpoint() isn't aware of the possibility there may be multiple
 *  mounts using a given dentry in a different namespace. This function
 *  checks if the passed in path is a mountpoint rather than the dentry
 *  alone.
 */
bool path_is_mountpoint(const struct path *path)
{
        bool covered;
        unsigned seq;

        /* Cheap filter: the dentry is not a mountpoint anywhere. */
        if (!d_mountpoint(path->dentry))
                return false;

        /* Repeat the check until it ran without mount_lock changing. */
        rcu_read_lock();
        do {
                seq = read_seqbegin(&mount_lock);
                covered = __path_is_mountpoint(path);
        } while (read_seqretry(&mount_lock, seq));
        rcu_read_unlock();

        return covered;
}
EXPORT_SYMBOL(path_is_mountpoint);

/*
 * Make a private (CL_PRIVATE) clone of the mount at @path, rooted at
 * @path's dentry, and mark it MNT_INTERNAL.  Returns the clone or an
 * ERR_PTR() from clone_mnt().
 */
struct vfsmount *mnt_clone_internal(const struct path *path)
{
        struct mount *clone;

        clone = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
        if (IS_ERR(clone))
                return ERR_CAST(clone);

        clone->mnt.mnt_flags |= MNT_INTERNAL;
        return &clone->mnt;
}

#ifdef CONFIG_PROC_FS
/*
 * Return the first non-cursor mount after @p on @ns's list, or NULL when the
 * end of the list is reached.
 */
static struct mount *mnt_list_next(struct mnt_namespace *ns,
                                   struct list_head *p)
{
        struct mount *found = NULL;

        lock_ns_list(ns);
        list_for_each_continue(p, &ns->list) {
                struct mount *cand = list_entry(p, struct mount, mnt_list);

                if (mnt_is_cursor(cand))
                        continue;
                found = cand;
                break;
        }
        unlock_ns_list(ns);

        return found;
}

/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
        struct proc_mounts *p = m->private;
        struct list_head *from;

        /* Taken here, released in m_stop(). */
        down_read(&namespace_sem);

        /* Position zero: start from the head of the namespace list. */
        if (*pos == 0)
                return mnt_list_next(p->ns, &p->ns->list);

        /* Otherwise resume after the cursor left behind by m_stop(). */
        from = &p->cursor.mnt_list;

        /* Read after we'd reached the end? */
        if (list_empty(from))
                return NULL;

        return mnt_list_next(p->ns, from);
}

/* seq_file ->next: bump the position and return the mount after @v. */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct proc_mounts *p = m->private;
        struct mount *cur = v;

        *pos += 1;
        return mnt_list_next(p->ns, &cur->mnt_list);
}

/*
 * seq_file ->stop: park the cursor just before @v (or unlink it when @v is
 * NULL) and release namespace_sem taken in m_start().
 */
static void m_stop(struct seq_file *m, void *v)
{
        struct proc_mounts *p = m->private;
        struct list_head *cursor = &p->cursor.mnt_list;
        struct mount *last = v;

        lock_ns_list(p->ns);
        if (!last)
                list_del_init(cursor);
        else
                list_move_tail(cursor, &last->mnt_list);
        unlock_ns_list(p->ns);

        up_read(&namespace_sem);
}

/* seq_file ->show: pass the mount @v to the show callback stored in m->private. */
static int m_show(struct seq_file *m, void *v)
{
        struct proc_mounts *p = m->private;
        struct mount *entry = v;
        struct vfsmount *vfsmnt = &entry->mnt;

        return p->show(m, vfsmnt);
}

/* seq_file operations built from the m_start/m_next/m_stop/m_show iterator above. */
const struct seq_operations mounts_op = {
        .start        = m_start,
        .next        = m_next,
        .stop        = m_stop,
        .show        = m_show,
};

/* Unlink @cursor from the mount list of @ns under namespace_sem and the ns list lock. */
void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
{
        struct list_head *node = &cursor->mnt_list;

        down_read(&namespace_sem);
        lock_ns_list(ns);
        list_del(node);
        unlock_ns_list(ns);
        up_read(&namespace_sem);
}
#endif  /* CONFIG_PROC_FS */

/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 */
int may_umount_tree(struct vfsmount *m)
{
        struct mount *root = real_mount(m);
        struct mount *cur;
        int refs_seen = 0;
        int refs_floor = 0;

        BUG_ON(!m);

        /* write lock needed for mnt_get_count */
        lock_mount_hash();
        for (cur = root; cur; cur = next_mnt(cur, root)) {
                refs_seen += mnt_get_count(cur);
                /* Each mount in the tree contributes two to the threshold. */
                refs_floor += 2;
        }
        unlock_mount_hash();

        /* 1 when the tree is at or below the threshold, 0 when it is busy. */
        return refs_seen <= refs_floor;
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 *
 * Return: 1 if propagate_mount_busy() reports the mount as not busy,
 * 0 otherwise.
 */
int may_umount(struct vfsmount *mnt)
{
        int idle;

        down_read(&namespace_sem);
        lock_mount_hash();
        idle = !propagate_mount_busy(real_mount(mnt), 2);
        unlock_mount_hash();
        up_read(&namespace_sem);

        return idle;
}

EXPORT_SYMBOL(may_umount);

/*
 * Release namespace_sem (held for write) and then finish the work that
 * was queued on the 'unmounted' and 'ex_mountpoints' lists.
 */
static void namespace_unlock(void)
{
        struct hlist_head head;
        struct hlist_node *p;
        struct mount *m;
        LIST_HEAD(list);

        /* Move both pending lists onto local heads before dropping the lock */
        hlist_move_list(&unmounted, &head);
        list_splice_init(&ex_mountpoints, &list);

        up_write(&namespace_sem);

        shrink_dentry_list(&list);

        /* Common case: nothing was unmounted, so there is nothing to release */
        if (likely(hlist_empty(&head)))
                return;

        /*
         * NOTE(review): presumably this waits for RCU readers that may still
         * be looking at the unmounted mounts before they are put - confirm.
         */
        synchronize_rcu_expedited();

        /* Unlink each queued mount and drop a reference on it */
        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
}

/* Take namespace_sem for write; paired with namespace_unlock(). */
static inline void namespace_lock(void)
{
        down_write(&namespace_sem);
}

/* Bit flags controlling how umount_tree() tears a tree down. */
enum umount_tree_flags {
        /* mounts are additionally marked MNT_SYNC_UMOUNT and always disconnected */
        UMOUNT_SYNC = 1 << 0,
        /* propagate_mount_unlock() and propagate_umount() are run as well */
        UMOUNT_PROPAGATE = 1 << 1,
        /* request that mounts stay connected to an unmounted parent */
        UMOUNT_CONNECTED = 1 << 2,
};

/*
 * Decide whether @mnt has to be detached from its parent while it is
 * being unmounted with the flags in @how.
 *
 * Returns true when the mount must be disconnected, false when it is
 * to stay attached to its (already unmounted) parent.
 */
static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
{
        /*
         * Always disconnect when:
         *  - the umount is synchronous (staying connected is only valid
         *    for lazy umounts),
         *  - there is no parent to stay connected to, or
         *  - the parent is still mounted: because the reference counting
         *    rules change when mounts are unmounted and connected,
         *    umounted mounts may not be connected to mounted mounts.
         */
        if ((how & UMOUNT_SYNC) ||
            !mnt_has_parent(mnt) ||
            !(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
                return true;

        /*
         * Stay connected if that was requested explicitly or if the mount
         * is locked to its parent; disconnect by default.
         */
        return !((how & UMOUNT_CONNECTED) || IS_MNT_LOCKED(mnt));
}

/*
 * Unmount the tree rooted at @mnt according to the flags in @how.
 *
 * Every mount in the tree is marked MNT_UMOUNT, removed from its
 * namespace and made private. Mounts that are disconnected from their
 * parent are queued on 'unmounted'.
 *
 * mount_lock must be held
 * namespace_sem must be held for write
 */
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
        LIST_HEAD(tmp_list);
        struct mount *p;

        if (how & UMOUNT_PROPAGATE)
                propagate_mount_unlock(mnt);

        /* Gather the mounts to umount */
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                p->mnt.mnt_flags |= MNT_UMOUNT;
                list_move(&p->mnt_list, &tmp_list);
        }

        /* Hide the mounts from mnt_mounts */
        list_for_each_entry(p, &tmp_list, mnt_list) {
                list_del_init(&p->mnt_child);
        }

        /* Add propagated mounts to the tmp_list */
        if (how & UMOUNT_PROPAGATE)
                propagate_umount(&tmp_list);

        /* Process the gathered mounts one at a time until none are left */
        while (!list_empty(&tmp_list)) {
                struct mnt_namespace *ns;
                bool disconnect;
                p = list_first_entry(&tmp_list, struct mount, mnt_list);
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
                /* Detach p from its namespace and fix up the mount count */
                ns = p->mnt_ns;
                if (ns) {
                        ns->mounts--;
                        __touch_mnt_namespace(ns);
                }
                p->mnt_ns = NULL;
                if (how & UMOUNT_SYNC)
                        p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;

                disconnect = disconnect_mount(p, how);
                if (mnt_has_parent(p)) {
                        /* Drop one count on the parent */
                        mnt_add_count(p->mnt_parent, -1);
                        if (!disconnect) {
                                /* Don't forget about p */
                                list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
                        } else {
                                umount_mnt(p);
                        }
                }
                change_mnt_propagation(p, MS_PRIVATE);
                /* Only disconnected mounts are queued on 'unmounted' */
                if (disconnect)
                        hlist_add_head(&p->mnt_umount, &unmounted);
        }
}

static void shrink_submounts(struct mount *mnt);

/*
 * "Unmount" the root filesystem by reconfiguring its superblock
 * read-only, under sb->s_umount held for write.
 *
 * Returns 0 if @sb is already read-only or the reconfigure succeeded,
 * otherwise the error from creating, parsing or applying the context.
 */
static int do_umount_root(struct super_block *sb)
{
        struct fs_context *fc;
        int err = 0;

        down_write(&sb->s_umount);
        if (sb_rdonly(sb))
                goto unlock;

        fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY);
        if (IS_ERR(fc)) {
                err = PTR_ERR(fc);
                goto unlock;
        }

        err = parse_monolithic_mount_data(fc, NULL);
        if (err == 0)
                err = reconfigure_super(fc);
        put_fs_context(fc);

unlock:
        up_write(&sb->s_umount);
        return err;
}

/*
 * Perform the unmount of @mnt as requested by @flags (MNT_EXPIRE,
 * MNT_FORCE, MNT_DETACH).
 *
 * Returns 0 on success or a negative errno: the result of the security
 * hook, -EINVAL for an invalid MNT_EXPIRE combination or a locked mount,
 * -EBUSY if the mount is in use, -EAGAIN when the expiry mark was only
 * just set, -EPERM when remounting the root read-only is not permitted.
 */
static int do_umount(struct mount *mnt, int flags)
{
        struct super_block *sb = mnt->mnt.mnt_sb;
        int retval;

        retval = security_sb_umount(&mnt->mnt, flags);
        if (retval)
                return retval;

        /*
         * Allow userspace to request a mountpoint be expired rather than
         * unmounting unconditionally. Unmount only happens if:
         *  (1) the mark is already set (the mark is cleared by mntput())
         *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
         */
        if (flags & MNT_EXPIRE) {
                /* Expiry cannot target the caller's root or be combined with force/detach */
                if (&mnt->mnt == current->fs->root.mnt ||
                    flags & (MNT_FORCE | MNT_DETACH))
                        return -EINVAL;

                /*
                 * probably don't strictly need the lock here if we examined
                 * all race cases, but it's a slowpath.
                 */
                lock_mount_hash();
                if (mnt_get_count(mnt) != 2) {
                        unlock_mount_hash();
                        return -EBUSY;
                }
                unlock_mount_hash();

                /* First request only sets the mark; a second one proceeds */
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
        }

        /*
         * If we may have to abort operations to get out of this
         * mount, and they will themselves hold resources we must
         * allow the fs to do things. In the Unix tradition of
         * 'Gee that's tricky, let's do it in userspace' the umount_begin
         * might fail to complete on the first run through as other tasks
         * must return, and the like. That's for the mount program to worry
         * about for the moment.
         */

        if (flags & MNT_FORCE && sb->s_op->umount_begin) {
                sb->s_op->umount_begin(sb);
        }

        /*
         * No sense to grab the lock for this test, but test itself looks
         * somewhat bogus. Suggestions for better replacement?
         * Ho-hum... In principle, we might treat that as umount + switch
         * to rootfs. GC would eventually take care of the old vfsmount.
         * Actually it makes sense, especially if rootfs would contain a
         * /reboot - static binary that would close all descriptors and
         * call reboot(9). Then init(8) could umount root and exec /reboot.
         */
        if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
                /*
                 * Special case for "unmounting" root ...
                 * we just try to remount it readonly.
                 */
                if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                        return -EPERM;
                return do_umount_root(sb);
        }

        namespace_lock();
        lock_mount_hash();

        /* Recheck MNT_LOCKED with the locks held */
        retval = -EINVAL;
        if (mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        event++;
        if (flags & MNT_DETACH) {
                /* Lazy umount: no busy check, succeeds even if already off the list */
                if (!list_empty(&mnt->mnt_list))
                        umount_tree(mnt, UMOUNT_PROPAGATE);
                retval = 0;
        } else {
                /* Regular umount: only proceeds if the mount is not busy */
                shrink_submounts(mnt);
                retval = -EBUSY;
                if (!propagate_mount_busy(mnt, 2)) {
                        if (!list_empty(&mnt->mnt_list))
                                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                        retval = 0;
                }
        }
out:
        unlock_mount_hash();
        namespace_unlock();
        return retval;
}

/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to lose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry)
{
        struct mountpoint *mp;

        namespace_lock();
        lock_mount_hash();

        mp = lookup_mountpoint(dentry);
        if (mp) {
                event++;
                /* Keep taking the first mount until the mountpoint is empty */
                while (!hlist_empty(&mp->m_list)) {
                        struct mount *victim = hlist_entry(mp->m_list.first,
                                                           struct mount,
                                                           mnt_mp_list);

                        if (!(victim->mnt.mnt_flags & MNT_UMOUNT)) {
                                umount_tree(victim, UMOUNT_CONNECTED);
                                continue;
                        }

                        /* Already unmounted: just detach it and queue it */
                        umount_mnt(victim);
                        hlist_add_head(&victim->mnt_umount, &unmounted);
                }
                put_mountpoint(mp);
        }

        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Is the caller allowed to modify his namespace?
 *
 * True when the caller has CAP_SYS_ADMIN in the user namespace of its
 * current mount namespace.
 */
static inline bool may_mount(void)
{
        struct user_namespace *owner = current->nsproxy->mnt_ns->user_ns;

        return ns_capable(owner, CAP_SYS_ADMIN);
}

/*
 * may_mandlock - may the caller use the "mand" mount option?
 *
 * With CONFIG_MANDATORY_FILE_LOCKING a one-time deprecation banner is
 * printed and the answer is whether the caller has CAP_SYS_ADMIN.
 * Without it a warning is logged and the option is always refused.
 */
#ifdef        CONFIG_MANDATORY_FILE_LOCKING
static bool may_mandlock(void)
{
        pr_warn_once("======================================================\n"
                     "WARNING: the mand mount option is being deprecated and\n"
                     "         will be removed in v5.15!\n"
                     "======================================================\n");
        return capable(CAP_SYS_ADMIN);
}
#else
static inline bool may_mandlock(void)
{
        /*
         * Terminate the message with a newline so the log record is
         * complete on its own instead of being left open for continuation.
         */
        pr_warn("VFS: \"mand\" mount option not supported\n");
        return false;
}
#endif

/*
 * Permission and sanity checks performed before an unmount of @path.
 *
 * Returns 0 if the unmount may proceed, -EPERM if the caller lacks the
 * required capability, -EINVAL if @path is not the root of a mount the
 * check_mnt() test accepts or the mount is locked.
 */
static int can_umount(const struct path *path, int flags)
{
        struct mount *target = real_mount(path->mnt);

        if (!may_mount())
                return -EPERM;

        /* The MNT_LOCKED test here is optimistic: no locks are held yet */
        if (path->dentry != path->mnt->mnt_root ||
            !check_mnt(target) ||
            (target->mnt.mnt_flags & MNT_LOCKED))
                return -EINVAL;

        if ((flags & MNT_FORCE) && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

/*
 * Unmount the mount that @path refers to and release the references held
 * by @path. The caller is responsible for @flags being sane.
 *
 * Returns the result of can_umount() if that fails, otherwise the result
 * of do_umount().
 */
int path_umount(struct path *path, int flags)
{
        struct mount *target = real_mount(path->mnt);
        int err = can_umount(path, flags);

        if (err == 0)
                err = do_umount(target, flags);

        /*
         * Release the two halves of @path by hand: path_put() must not be
         * used here as that would clear mnt_expiry_mark.
         */
        dput(path->dentry);
        mntput_no_expire(target);

        return err;
}

/*
 * Common implementation of the umount syscalls: validate @flags, resolve
 * the user-supplied @name to a path and unmount it.
 */
static int ksys_umount(char __user *name, int flags)
{
        const int accepted = MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW;
        struct path target;
        int lookup_flags;
        int err;

        /* basic validity checks done first */
        if (flags & ~accepted)
                return -EINVAL;

        /* Follow a trailing symlink unless the caller asked not to */
        if (flags & UMOUNT_NOFOLLOW)
                lookup_flags = LOOKUP_MOUNTPOINT;
        else
                lookup_flags = LOOKUP_MOUNTPOINT | LOOKUP_FOLLOW;

        err = user_path_at(AT_FDCWD, name, lookup_flags, &target);
        if (err)
                return err;

        return path_umount(&target, flags);
}

/* umount syscall entry point; all work is delegated to ksys_umount(). */
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
        return ksys_umount(name, flags);
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *        The 2.0 compatible umount. No flags.
 *
 *        Only built when the architecture defines __ARCH_WANT_SYS_OLDUMOUNT;
 *        equivalent to calling ksys_umount() with flags == 0.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
        return ksys_umount(name, 0);
}

#endif

static bool is_mnt_ns_file(struct dentry *dentry)
{
        /* Is this a proxy for a mount namespace? */
        return dentry->d_op == &ns_dentry_operations &&
               dentry->d_fsdata == &mntns_operations;
}

/* Map an embedded ns_common back to the mnt_namespace that contains it. */
static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
        return container_of(ns, struct mnt_namespace, ns);
}

/* Return the ns_common embedded in the mount namespace @mnt. */
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
        return &mnt->ns;
}

/*
 * Could bind mounting the mount namespace inode behind @dentry cause a
 * mount namespace loop?
 *
 * Only mount namespace files can; for those the answer is yes when the
 * caller's mount namespace has a sequence number greater than or equal
 * to that of the namespace the file refers to.
 */
static bool mnt_ns_loop(struct dentry *dentry)
{
        struct mnt_namespace *target;
        struct mnt_namespace *ours;

        if (!is_mnt_ns_file(dentry))
                return false;

        target = to_mnt_ns(get_proc_ns(dentry->d_inode));
        ours = current->nsproxy->mnt_ns;

        return ours->seq >= target->seq;
}

/*
 * Clone the mount tree rooted at @mnt, taking only child mounts whose
 * mountpoint lies at or below @dentry.  @flag carries the CL_* clone flags.
 *
 * Returns the root of the new tree, or an ERR_PTR: -EINVAL when the root
 * is unbindable (without CL_COPY_UNBINDABLE) or @dentry is a mount
 * namespace file (without CL_COPY_MNT_NS_FILE), -EPERM when a mount that
 * would have to be skipped as unbindable is also locked, or the error
 * from clone_mnt().  On failure the partial copy is unmounted again.
 */
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
                                        int flag)
{
        struct mount *res, *p, *q, *r, *parent;

        if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
                return ERR_PTR(-EINVAL);

        if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
                return ERR_PTR(-EINVAL);

        /* Clone the root first; q tracks the clone that corresponds to p */
        res = q = clone_mnt(mnt, dentry, flag);
        if (IS_ERR(q))
                return q;

        q->mnt_mountpoint = mnt->mnt_mountpoint;

        p = mnt;
        list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
                struct mount *s;
                /* Skip children mounted outside the subtree below @dentry */
                if (!is_subdir(r->mnt_mountpoint, dentry))
                        continue;

                for (s = r; s; s = next_mnt(s, r)) {
                        if (!(flag & CL_COPY_UNBINDABLE) &&
                            IS_MNT_UNBINDABLE(s)) {
                                if (s->mnt.mnt_flags & MNT_LOCKED) {
                                        /* Both unbindable and locked. */
                                        q = ERR_PTR(-EPERM);
                                        goto out;
                                } else {
                                        s = skip_mnt_tree(s);
                                        continue;
                                }
                        }
                        if (!(flag & CL_COPY_MNT_NS_FILE) &&
                            is_mnt_ns_file(s->mnt.mnt_root)) {
                                s = skip_mnt_tree(s);
                                continue;
                        }
                        /* Walk p and q up in step until p is the parent of s */
                        while (p != s->mnt_parent) {
                                p = p->mnt_parent;
                                q = q->mnt_parent;
                        }
                        p = s;
                        parent = q;
                        q = clone_mnt(p, p->mnt.mnt_root, flag);
                        if (IS_ERR(q))
                                goto out;
                        /* Link the clone into the new tree under the cloned parent */
                        lock_mount_hash();
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, parent, p->mnt_mp);
                        unlock_mount_hash();
                }
        }
        return res;
out:
        /* Tear down what was built so far; q holds the error to return */
        if (res) {
                lock_mount_hash();
                umount_tree(res, UMOUNT_SYNC);
                unlock_mount_hash();
        }
        return q;
}

/*
 * Make a private copy (CL_COPY_ALL | CL_PRIVATE) of the mount tree at
 * @path, under the namespace lock.
 *
 * Returns the root vfsmount of the copy, or an ERR_PTR: -EINVAL if the
 * mount fails check_mnt(), otherwise whatever copy_tree() reported.
 * Caller should check returned pointer for errors.
 */
struct vfsmount *collect_mounts(const struct path *path)
{
        struct mount *copy;

        namespace_lock();
        if (check_mnt(real_mount(path->mnt)))
                copy = copy_tree(real_mount(path->mnt), path->dentry,
                                 CL_COPY_ALL | CL_PRIVATE);
        else
                copy = ERR_PTR(-EINVAL);
        namespace_unlock();

        if (IS_ERR(copy))
                return ERR_CAST(copy);

        return &copy->mnt;
}

static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);

/*
 * If @mnt still belongs to an anonymous mount namespace, unmount its
 * tree (leaving it connected) and free that namespace.  Mounts with no
 * namespace or in a non-anonymous one are left alone.
 */
void dissolve_on_fput(struct vfsmount *mnt)
{
        struct mnt_namespace *anon;

        namespace_lock();
        lock_mount_hash();

        anon = real_mount(mnt)->mnt_ns;
        if (anon && !is_anon_ns(anon))
                anon = NULL;
        if (anon)
                umount_tree(real_mount(mnt), UMOUNT_CONNECTED);

        unlock_mount_hash();
        namespace_unlock();

        /* The namespace itself is freed only after both locks are dropped */
        if (anon)
                free_mnt_ns(anon);
}

/*
 * Unmount the tree rooted at @mnt with no umount_tree flags set, holding
 * the namespace lock and the mount hash lock around the call.
 * NOTE(review): by its name this is the counterpart of collect_mounts() -
 * confirm against callers.
 */
void drop_collected_mounts(struct vfsmount *mnt)
{
        namespace_lock();
        lock_mount_hash();
        umount_tree(real_mount(mnt), 0);
        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Does @mnt have a direct child mount that is mounted at or below
 * @dentry and carries MNT_LOCKED?
 */
static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
        struct mount *kid;

        list_for_each_entry(kid, &mnt->mnt_mounts, mnt_child) {
                if (is_subdir(kid->mnt_mountpoint, dentry) &&
                    (kid->mnt.mnt_flags & MNT_LOCKED))
                        return true;
        }

        return false;
}

/**
 * clone_private_mount - create a private clone of a path
 *
 * This creates a new vfsmount, which will be the clone of @path.  The new will
 * not be attached anywhere in the namespace and will be private (i.e. changes
 * to the originating mount won't be propagated into this).
 *
 * Release with mntput().
 */
struct vfsmount *clone_private_mount(const struct path *path)
{
        struct mount *old_mnt = real_mount(path->mnt);
        struct mount *new_mnt;

        down_read(&namespace_sem);
        if (IS_MNT_UNBINDABLE(old_mnt))
                goto invalid;

        if (!check_mnt(old_mnt))
                goto invalid;

        if (has_locked_children(old_mnt, path->dentry))
                goto invalid;

        new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
        up_read(&namespace_sem);

        if (IS_ERR(new_mnt))
                return ERR_CAST(new_mnt);

        /* Longterm mount to be removed by kern_unmount*() */
        new_mnt->mnt_ns = MNT_NS_INTERNAL;

        return &new_mnt->mnt;

invalid:
        up_read(&namespace_sem);
        return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);

/*
 * Call @f(mount, @arg) for @root and then for every mount linked on
 * @root's mnt_list.  Iteration stops at the first non-zero result, which
 * is returned; 0 is returned if every call returned 0.
 */
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
                   struct vfsmount *root)
{
        struct mount *pos;
        int rc;

        rc = f(root, arg);
        if (rc)
                return rc;

        list_for_each_entry(pos, &real_mount(root)->mnt_list, mnt_list) {
                rc = f(&pos->mnt, arg);
                if (rc)
                        break;
        }

        return rc;
}

/*
 * Lock down every mount in the tree rooted at @mnt: atime settings are
 * always locked, each of the read-only/nodev/nosuid/noexec flags that is
 * currently set gets its MNT_LOCK_* counterpart, and mounts that are not
 * on an expiry list are marked MNT_LOCKED.
 */
static void lock_mnt_tree(struct mount *mnt)
{
        struct mount *m;

        for (m = mnt; m; m = next_mnt(m, mnt)) {
                /* Don't allow unprivileged users to change mount flags */
                int locked = m->mnt.mnt_flags | MNT_LOCK_ATIME;

                if (locked & MNT_READONLY)
                        locked |= MNT_LOCK_READONLY;
                if (locked & MNT_NODEV)
                        locked |= MNT_LOCK_NODEV;
                if (locked & MNT_NOSUID)
                        locked |= MNT_LOCK_NOSUID;
                if (locked & MNT_NOEXEC)
                        locked |= MNT_LOCK_NOEXEC;

                /* Don't allow unprivileged users to reveal what is under a mount */
                if (list_empty(&m->mnt_expire))
                        locked |= MNT_LOCKED;

                m->mnt.mnt_flags = locked;
        }
}

/*
 * Walk the tree rooted at @mnt up to (but not including) @end and release
 * the group id of every mount that has one but is not shared.  Passing
 * NULL as @end walks the whole tree.
 */
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
        struct mount *cur = mnt;

        while (cur != end) {
                if (cur->mnt_group_id && !IS_MNT_SHARED(cur))
                        mnt_release_group_id(cur);
                cur = next_mnt(cur, mnt);
        }
}

/*
 * Allocate a group id for @mnt - and, if @recurse is set, for every mount
 * in its tree - wherever a mount has none yet and is not shared.
 *
 * Returns 0 on success.  If an allocation fails, the ids handed out to
 * the mounts visited before the failing one are cleaned up again and the
 * error is returned.
 */
static int invent_group_ids(struct mount *mnt, bool recurse)
{
        struct mount *cur;

        for (cur = mnt; cur; cur = recurse ? next_mnt(cur, mnt) : NULL) {
                int err;

                if (cur->mnt_group_id || IS_MNT_SHARED(cur))
                        continue;

                err = mnt_alloc_group_id(cur);
                if (!err)
                        continue;

                cleanup_group_ids(mnt, cur);
                return err;
        }

        return 0;
}

/*
 * Reserve room in @ns for all mounts of the tree rooted at @mnt.
 *
 * Returns 0 and adds the size of the tree to ns->pending_mounts, or
 * -ENOSPC if the counters would overflow or the total would exceed
 * sysctl_mount_max.
 */
int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
{
        unsigned int limit = READ_ONCE(sysctl_mount_max);
        unsigned int incoming = 0;
        unsigned int committed, reserved, in_use;
        struct mount *m;

        for (m = mnt; m; m = next_mnt(m, mnt))
                incoming++;

        committed = ns->mounts;
        reserved = ns->pending_mounts;
        in_use = committed + reserved;

        /* A sum smaller than either operand means the addition wrapped */
        if (committed > in_use || reserved > in_use)
                return -ENOSPC;

        /* Already over the limit, or not enough headroom for the new tree */
        if (limit < in_use || incoming > limit - in_use)
                return -ENOSPC;

        ns->pending_mounts = reserved + incoming;
        return 0;
}

/*
 *  @source_mnt : mount tree to be attached
 *  @dest_mnt   : mount under which the mount tree @source_mnt is attached
 *  @dest_mp    : mountpoint on @dest_mnt where @source_mnt is placed
 *  @moving     : if true, detach the source_mnt from its current parent
 *                     and attach it at the new location.
 *                     (done when source_mnt is moved)
 *
 *  NOTE: the tables below explain the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *          tree of the destination mount and the cloned mount is added to
 *          the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *          source mount.
 *
 * ---------------------------------------------------------------------------
 * |                         MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)  the mount is moved to the destination. And is then propagated to
 *         all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++)  the mount is moved to the destination and is then propagated to
 *         all the mounts belonging to the destination mount's propagation tree.
 *         the mount is marked as 'shared and slave'.
 * (*)        the mount continues to be a slave at the new location.
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
                        struct mount *dest_mnt,
                        struct mountpoint *dest_mp,
                        bool moving)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        HLIST_HEAD(tree_list);
        struct mnt_namespace *ns = dest_mnt->mnt_ns;
        struct mountpoint *smp;
        struct mount *child, *p;
        struct hlist_node *n;
        int err;

        /* Preallocate a mountpoint in case the new mounts need
         * to be tucked under other mounts.
         */
        smp = get_mountpoint(source_mnt->mnt.mnt_root);
        if (IS_ERR(smp))
                return PTR_ERR(smp);

        /* Is there space to add these mounts to the mount namespace? */
        if (!moving) {
                err = count_mounts(ns, source_mnt);
                if (err)
                        goto out;
        }

        if (IS_MNT_SHARED(dest_mnt)) {
                /*
                 * Shared destination: give the source tree group ids and
                 * let propagate_mnt() fill tree_list.
                 */
                err = invent_group_ids(source_mnt, true);
                if (err)
                        goto out;
                err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
                /* Lock before testing err: out_cleanup_ids unlocks it again */
                lock_mount_hash();
                if (err)
                        goto out_cleanup_ids;
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                        set_mnt_shared(p);
        } else {
                lock_mount_hash();
        }
        if (moving) {
                /* Move: unhash from the old location, attach at the new one */
                unhash_mnt(source_mnt);
                attach_mnt(source_mnt, dest_mnt, dest_mp);
                touch_mnt_namespace(source_mnt->mnt_ns);
        } else {
                if (source_mnt->mnt_ns) {
                        /* move from anon - the caller will destroy */
                        list_del_init(&source_mnt->mnt_ns->list);
                }
                mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
                commit_tree(source_mnt);
        }

        /* Commit every mount that propagate_mnt() queued on tree_list */
        hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
                struct mount *q;
                hlist_del_init(&child->mnt_hash);
                /* If something is already mounted there, re-mount it on smp */
                q = __lookup_mnt(&child->mnt_parent->mnt,
                                 child->mnt_mountpoint);
                if (q)
                        mnt_change_mountpoint(child, smp, q);
                /* Notice when we are propagating across user namespaces */
                if (child->mnt_parent->mnt_ns->user_ns != user_ns)
                        lock_mnt_tree(child);
                /* MNT_LOCKED is cleared again on the root of each copy */
                child->mnt.mnt_flags &= ~MNT_LOCKED;
                commit_tree(child);
        }
        put_mountpoint(smp);
        unlock_mount_hash();

        return 0;

 out_cleanup_ids:
        /* Entered with the mount hash lock held: undo the propagated copies */
        while (!hlist_empty(&tree_list)) {
                child = hlist_entry(tree_list.first, struct mount, mnt_hash);
                child->mnt_parent->mnt_ns->pending_mounts = 0;
                umount_tree(child, UMOUNT_SYNC);
        }
        unlock_mount_hash();
        cleanup_group_ids(source_mnt, NULL);
 out:
        /* Drop the reservation made by count_mounts() */
        ns->pending_mounts = 0;

        /* The mount hash lock is not held here, so take mount_lock for the put */
        read_seqlock_excl(&mount_lock);
        put_mountpoint(smp);
        read_sequnlock_excl(&mount_lock);

        return err;
}

/*
 * Lock the location described by @path for mounting and return its
 * mountpoint.
 *
 * On success the inode lock of path->dentry and the namespace lock are
 * both held on return.  If something is already mounted at @path, @path
 * is advanced to the root of that mount and the lookup is retried, so
 * @path may be modified.
 *
 * On failure an ERR_PTR is returned with both locks released: -ENOENT if
 * cant_mount() rejects the dentry, or the error from get_mountpoint().
 */
static struct mountpoint *lock_mount(struct path *path)
{
        struct vfsmount *mnt;
        struct dentry *dentry = path->dentry;
retry:
        inode_lock(dentry->d_inode);
        if (unlikely(cant_mount(dentry))) {
                inode_unlock(dentry->d_inode);
                return ERR_PTR(-ENOENT);
        }
        namespace_lock();
        mnt = lookup_mnt(path);
        if (likely(!mnt)) {
                /* Nothing mounted here yet: this is the place to mount on */
                struct mountpoint *mp = get_mountpoint(dentry);
                if (IS_ERR(mp)) {
                        namespace_unlock();
                        inode_unlock(dentry->d_inode);
                        return mp;
                }
                return mp;
        }
        /* Something is mounted on top: drop the locks and step into it */
        namespace_unlock();
        inode_unlock(path->dentry->d_inode);
        path_put(path);
        path->mnt = mnt;
        dentry = path->dentry = dget(mnt->mnt_root);
        goto retry;
}

/*
 * Undo lock_mount(): drop the reference on the mountpoint @where, then
 * release the namespace lock and the inode lock of its dentry.
 */
static void unlock_mount(struct mountpoint *where)
{
        /*
         * Cache the dentry first; it is needed after the mountpoint has
         * been put.  NOTE(review): presumably @where may be freed by the
         * put - confirm.
         */
        struct dentry *dentry = where->m_dentry;

        read_seqlock_excl(&mount_lock);
        put_mountpoint(where);
        read_sequnlock_excl(&mount_lock);

        namespace_unlock();
        inode_unlock(dentry->d_inode);
}

static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
{
        if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
                return -EINVAL;

        if (d_is_dir(mp->m_dentry) !=
              d_is_dir(mnt->mnt.mnt_root))
                return -ENOTDIR;

        return attach_recursive_mnt(mnt, p, mp, false);
}

/*
 * Sanity check the flags to change_mnt_propagation.
 *
 * MS_REC and MS_SILENT are ignored.  Returns the single propagation flag
 * left in @ms_flags, or 0 if anything else is set or if the number of
 * propagation flags is not exactly one.
 */

static int flags_to_propagation_type(int ms_flags)
{
        const int modifiers = MS_REC | MS_SILENT;
        const int prop_types = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE;
        int type = ms_flags & ~modifiers;

        /* Reject non-propagation flags, then require exactly one flag */
        if ((type & ~prop_types) || !is_power_of_2(type))
                return 0;

        return type;
}

/*
 * Change the propagation type of the mount at @path, recursively over its
 * whole tree when MS_REC is set in @ms_flags.
 *
 * Returns 0 on success, -EINVAL if @path is not the root of its mount or
 * @ms_flags does not name exactly one propagation type, or the error from
 * invent_group_ids() when making mounts shared.
 */
static int do_change_type(struct path *path, int ms_flags)
{
        struct mount *top = real_mount(path->mnt);
        int recurse = ms_flags & MS_REC;
        struct mount *cur;
        int type;
        int err = 0;

        if (path->dentry != path->mnt->mnt_root)
                return -EINVAL;

        type = flags_to_propagation_type(ms_flags);
        if (!type)
                return -EINVAL;

        namespace_lock();

        /* Shared mounts need group ids before the type is switched */
        if (type == MS_SHARED)
                err = invent_group_ids(top, recurse);

        if (!err) {
                lock_mount_hash();
                cur = top;
                while (cur) {
                        change_mnt_propagation(cur, type);
                        cur = recurse ? next_mnt(cur, top) : NULL;
                }
                unlock_mount_hash();
        }

        namespace_unlock();
        return err;
}

static struct mount *__do_loopback(struct path *old_path, int recurse)
{
        struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);

        if (IS_MNT_UNBINDABLE(old))
                return mnt;

        if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
                return mnt;

        if (!recurse && has_locked_children(old, old_path->dentry))
                return mnt;

        if (recurse)
                mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
        else
                mnt = clone_mnt(old, old_path->dentry, 0);

        if (!IS_ERR(mnt))
                mnt->mnt.mnt_flags &= ~MNT_LOCKED;

        return mnt;
}

/*
 * do loopback mount.
 *
 * Bind-mount the object named by @old_name onto @path; @recurse selects
 * whether the whole subtree is copied.
 *
 * Returns 0 on success or a negative errno: -EINVAL for an empty name, a
 * mount namespace loop or a parent that fails check_mnt(), or the error
 * from kern_path(), lock_mount(), __do_loopback() or graft_tree().
 */
static int do_loopback(struct path *path, const char *old_name,
                                int recurse)
{
        struct path old_path;
        struct mount *mnt = NULL, *parent;
        struct mountpoint *mp;
        int err;
        if (!old_name || !*old_name)
                return -EINVAL;
        err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
        if (err)
                return err;

        /* Default error for the checks below that jump out without setting err */
        err = -EINVAL;
        if (mnt_ns_loop(old_path.dentry))
                goto out;

        mp = lock_mount(path);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto out;
        }

        parent = real_mount(path->mnt);
        if (!check_mnt(parent))
                goto out2;

        mnt = __do_loopback(&old_path, recurse);
        if (IS_ERR(mnt)) {
                err = PTR_ERR(mnt);
                goto out2;
        }

        err = graft_tree(mnt, parent, mp);
        if (err) {
                /* Grafting failed: throw the freshly made copy away again */
                lock_mount_hash();
                umount_tree(mnt, UMOUNT_SYNC);
                unlock_mount_hash();
        }
out2:
        unlock_mount(mp);
out:
        path_put(&old_path);
        return err;
}

/*
 * Clone the mount at @path (the whole tree if @recursive) into a freshly
 * allocated anonymous mount namespace and open an O_PATH file on the clone.
 *
 * On success @path->mnt has been switched to the clone and the returned
 * file is flagged FMODE_NEED_UNMOUNT.  If opening the file fails the clone
 * is handed to dissolve_on_fput() and the error pointer is returned.
 */
static struct file *open_detached_copy(struct path *path, bool recursive)
{
        struct mnt_namespace *anon_ns;
        struct mount *copy, *m;
        struct file *file;

        anon_ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
        if (IS_ERR(anon_ns))
                return ERR_CAST(anon_ns);

        namespace_lock();
        copy = __do_loopback(path, recursive);
        if (IS_ERR(copy)) {
                namespace_unlock();
                free_mnt_ns(anon_ns);
                return ERR_CAST(copy);
        }

        /* Hand every mount of the copy over to the anonymous namespace. */
        lock_mount_hash();
        for (m = copy; m; m = next_mnt(m, copy)) {
                m->mnt_ns = anon_ns;
                anon_ns->mounts++;
        }
        anon_ns->root = copy;
        list_add_tail(&anon_ns->list, &copy->mnt_list);
        mntget(&copy->mnt);
        unlock_mount_hash();
        namespace_unlock();

        /* Swap the caller's mount reference for the clone. */
        mntput(path->mnt);
        path->mnt = &copy->mnt;

        file = dentry_open(path, O_PATH, current_cred());
        if (IS_ERR(file)) {
                dissolve_on_fput(path->mnt);
                return file;
        }

        file->f_mode |= FMODE_NEED_UNMOUNT;
        return file;
}

/*
 * open_tree(2): open an O_PATH file descriptor on the object named by
 * @dfd/@filename.  With OPEN_TREE_CLONE a detached copy of the mount is made
 * first (recursively with AT_RECURSIVE), which requires may_mount().
 *
 * Returns the new file descriptor, -EINVAL for unknown flags or for
 * AT_RECURSIVE without OPEN_TREE_CLONE, -EPERM when cloning is not
 * permitted, or the error from fd allocation, path lookup or open.
 */
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
        bool detached = flags & OPEN_TREE_CLONE;
        int lookup_flags = 0;
        struct path path;
        struct file *file;
        int error;
        int fd;

        BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);

        if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
                      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
                      OPEN_TREE_CLOEXEC))
                return -EINVAL;

        /* AT_RECURSIVE only makes sense together with OPEN_TREE_CLONE. */
        if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
                return -EINVAL;

        /* Automount and symlink following are on unless opted out of. */
        if (!(flags & AT_NO_AUTOMOUNT))
                lookup_flags |= LOOKUP_AUTOMOUNT;
        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        if (detached && !may_mount())
                return -EPERM;

        fd = get_unused_fd_flags(flags & O_CLOEXEC);
        if (fd < 0)
                return fd;

        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (unlikely(error)) {
                file = ERR_PTR(error);
        } else {
                file = detached ? open_detached_copy(&path, flags & AT_RECURSIVE)
                                : dentry_open(&path, O_PATH, current_cred());
                path_put(&path);
        }

        if (!IS_ERR(file)) {
                fd_install(fd, file);
                return fd;
        }

        put_unused_fd(fd);
        return PTR_ERR(file);
}

/*
 * Check that the requested @mnt_flags do not clear any flag that is locked
 * on @mnt.
 *
 * A locked read-only/nodev/nosuid/noexec flag must still be set in
 * @mnt_flags, and with MNT_LOCK_ATIME the atime bits must be unchanged.
 *
 * No locks need to be held here while testing the various MNT_LOCK
 * flags because those flags can never be cleared once they are set.
 */
static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
{
        const unsigned int cur = mnt->mnt.mnt_flags;

        bool ro_ok     = !(cur & MNT_LOCK_READONLY) || (mnt_flags & MNT_READONLY);
        bool nodev_ok  = !(cur & MNT_LOCK_NODEV)    || (mnt_flags & MNT_NODEV);
        bool nosuid_ok = !(cur & MNT_LOCK_NOSUID)   || (mnt_flags & MNT_NOSUID);
        bool noexec_ok = !(cur & MNT_LOCK_NOEXEC)   || (mnt_flags & MNT_NOEXEC);
        bool atime_ok  = !(cur & MNT_LOCK_ATIME) ||
                         ((cur & MNT_ATIME_MASK) == (mnt_flags & MNT_ATIME_MASK));

        return ro_ok && nodev_ok && nosuid_ok && noexec_ok && atime_ok;
}

/*
 * Bring the read-only state of @mnt in line with MNT_READONLY in
 * @mnt_flags.  Nothing is done (and 0 is returned) when the state already
 * matches; otherwise the result of mnt_make_readonly() or
 * __mnt_unmake_readonly() is returned.
 */
static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
{
        bool want_ro = (mnt_flags & MNT_READONLY);

        if (want_ro == __mnt_is_readonly(&mnt->mnt))
                return 0;

        return want_ro ? mnt_make_readonly(mnt) : __mnt_unmake_readonly(mnt);
}

/*
 * Install new user-settable attributes on a mount.  Bits of the current
 * flags outside MNT_USER_SETTABLE_MASK are carried over unchanged, and the
 * mount's namespace is touched afterwards.
 *
 * The caller must hold sb->s_umount for writing.
 */
static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
{
        unsigned int preserved;

        lock_mount_hash();
        preserved = mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
        mnt->mnt.mnt_flags = mnt_flags | preserved;
        touch_mnt_namespace(mnt->mnt_ns);
        unlock_mount_hash();
}

/*
 * Warn when a writable mount sits on a filesystem whose maximum supported
 * timestamp (sb->s_time_max) is less than TIME_UPTIME_SEC_MAX seconds away
 * from the current wall-clock time.
 *
 * @mountpoint: location used only to print the mount path in the message.
 * @mnt:        the mount being (re)mounted.
 *
 * The path is resolved into a temporary page.  If that page cannot be
 * allocated, or d_path() fails, the message is still emitted with the
 * placeholder "(unknown)" instead of passing an error pointer to "%s".
 */
static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
{
        struct super_block *sb = mnt->mnt_sb;
        char *buf, *mntpath;
        struct tm tm;

        /* Read-only mounts never write timestamps; nothing to warn about. */
        if (__mnt_is_readonly(mnt))
                return;

        if (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX <= sb->s_time_max)
                return;

        buf = (char *)__get_free_page(GFP_KERNEL);
        mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
        if (IS_ERR(mntpath))
                mntpath = "(unknown)";

        time64_to_tm(sb->s_time_max, 0, &tm);

        pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n",
                sb->s_type->name,
                is_mounted(mnt) ? "remounted" : "mounted",
                mntpath,
                tm.tm_year+1900, (unsigned long long)sb->s_time_max);

        /* buf may be NULL here, in which case this frees page address 0. */
        free_page((unsigned long)buf);
}

/*
 * Reconfigure only the mount at @path, leaving the superblock it refers to
 * untouched.  This is what MS_REMOUNT|MS_BIND to mount(2) ends up doing.
 *
 * Returns -EINVAL if the mount fails check_mnt() or @path is not the root
 * of the mount, -EPERM if a locked flag would be cleared, and otherwise
 * the result of change_mount_ro_state().
 */
static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->mnt->mnt_sb;
        int ret;

        if (!check_mnt(mnt) || path->dentry != mnt->mnt.mnt_root)
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        down_write(&sb->s_umount);
        ret = change_mount_ro_state(mnt, mnt_flags);
        if (!ret)
                set_mount_attributes(mnt, mnt_flags);
        up_write(&sb->s_umount);

        /* Issued whether or not the reconfiguration succeeded. */
        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        return ret;
}

/*
 * Change filesystem flags.  @path has to be the physical root of the
 * filesystem: if you've mounted a non-root directory somewhere and want to
 * remount it - tough luck.
 *
 * The superblock is reconfigured through a legacy-API fs_context built from
 * @sb_flags and @data; on success the mount attributes are updated from
 * @mnt_flags as well.  @ms_flags is not used by this function.
 *
 * Returns -EINVAL if the mount fails check_mnt() or @path is not the mount
 * root, -EPERM if a locked flag would be cleared or the caller lacks
 * CAP_SYS_ADMIN in the superblock's user namespace, otherwise the error
 * from context creation, option parsing or reconfigure_super().
 */
static int do_remount(struct path *path, int ms_flags, int sb_flags,
                      int mnt_flags, void *data)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->mnt->mnt_sb;
        struct fs_context *fc;
        int err;

        if (!check_mnt(mnt) || path->dentry != path->mnt->mnt_root)
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Indicate to the filesystem that the remount request is coming
         * from the legacy mount system call.
         */
        fc->oldapi = true;

        err = parse_monolithic_mount_data(fc, data);
        if (!err) {
                down_write(&sb->s_umount);
                if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
                        err = -EPERM;
                } else {
                        err = reconfigure_super(fc);
                        if (!err)
                                set_mount_attributes(mnt, mnt_flags);
                }
                up_write(&sb->s_umount);
        }

        /* Issued whether or not the remount succeeded. */
        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        put_fs_context(fc);
        return err;
}

static inline int tree_contains_unbindable(struct mount *mnt)
{
        struct mount *p;
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                if (IS_MNT_UNBINDABLE(p))
                        return 1;
        }
        return 0;
}

/*
 * Check that there aren't references to earlier/same mount namespaces in the
 * specified subtree.  Such references can act as pins for mount namespaces
 * that aren't checked by the mount-cycle checking code, thereby allowing
 * cycles to be made.
 *
 * Returns true when no mount in @subtree has a root for which mnt_ns_loop()
 * reports a loop, false as soon as one is found.
 */
static bool check_for_nsfs_mounts(struct mount *subtree)
{
        struct mount *m;
        bool clean = true;

        lock_mount_hash();
        for (m = subtree; m; m = next_mnt(m, subtree)) {
                if (mnt_ns_loop(m->mnt.mnt_root)) {
                        clean = false;
                        break;
                }
        }
        unlock_mount_hash();

        return clean;
}

/*
 * Move the mount at @old_path so that it becomes attached at @new_path.
 *
 * The destination is locked with lock_mount() and released with
 * unlock_mount() before returning.  Returns 0 on success, the error from
 * lock_mount() or attach_recursive_mnt(), -EINVAL if one of the sanity
 * checks below fails, or -ELOOP if check_for_nsfs_mounts() rejects the tree
 * or @old is found while walking up the parents of the destination mount.
 */
static int do_move_mount(struct path *old_path, struct path *new_path)
{
        struct mnt_namespace *ns;
        struct mount *p;
        struct mount *old;
        struct mount *parent;
        struct mountpoint *mp, *old_mp;
        int err;
        bool attached;

        mp = lock_mount(new_path);
        if (IS_ERR(mp))
                return PTR_ERR(mp);

        /*
         * Sample the state of the mount being moved before it is attached
         * elsewhere; parent, old_mp and ns are needed for the cleanup that
         * follows a successful move.
         */
        old = real_mount(old_path->mnt);
        p = real_mount(new_path->mnt);
        parent = old->mnt_parent;
        attached = mnt_has_parent(old);
        old_mp = old->mnt_mp;
        ns = old->mnt_ns;

        err = -EINVAL;
        /* The mountpoint must be in our namespace. */
        if (!check_mnt(p))
                goto out;

        /* The thing moved must be mounted... */
        if (!is_mounted(&old->mnt))
                goto out;

        /* ... and either ours or the root of anon namespace */
        if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
                goto out;

        /* A locked mount may not be moved. */
        if (old->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        /* Only the root of a mount can be moved. */
        if (old_path->dentry != old_path->mnt->mnt_root)
                goto out;

        /* Source and destination must both be directories or both not. */
        if (d_is_dir(new_path->dentry) !=
            d_is_dir(old_path->dentry))
                goto out;
        /*
         * Don't move a mount residing in a shared parent.
         */
        if (attached && IS_MNT_SHARED(parent))
                goto out;
        /*
         * Don't move a mount tree containing unbindable mounts to a destination
         * mount which is shared.
         */
        if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
                goto out;
        err = -ELOOP;
        if (!check_for_nsfs_mounts(old))
                goto out;
        /* Refuse to move a mount to a place beneath itself. */
        for (; mnt_has_parent(p); p = p->mnt_parent)
                if (p == old)
                        goto out;

        err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
                                   attached);
        if (err)
                goto out;

        /* once the mount has been moved, it should no longer be expired
         * automatically */
        list_del_init(&old->mnt_expire);
        if (attached)
                put_mountpoint(old_mp);
out:
        unlock_mount(mp);
        if (!err) {
                /*
                 * NOTE(review): presumably this drops what the mount held
                 * on its previous location - a reference on the old parent
                 * when it was attached, or the anonymous namespace it was
                 * the root of otherwise - confirm against
                 * attach_recursive_mnt().
                 */
                if (attached)
                        mntput_no_expire(parent);
                else
                        free_mnt_ns(ns);
        }
        return err;
}

static int do_move_mount_old(struct path *path, const char *old_name)
{
        struct path old_path;
        int err;

        if (!old_name || !*old_name)
                return -EINVAL;

        err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
        if (err)
                return err;

        err = do_move_mount(&old_path, path);
        path_put(&old_path);
        return err;
}

/*
 * Add the mount @newmnt to a namespace's mount tree at @mp / @path.
 *
 * MNT_INTERNAL_FLAGS are stripped from @mnt_flags before they are stored on
 * the new mount.  Returns the result of graft_tree(), -EBUSY when the same
 * filesystem is already mounted on this very mount point, or -EINVAL when
 * the parent is not acceptable or the new mount's root is a symlink.
 */
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
                        struct path *path, int mnt_flags)
{
        struct mount *parent = real_mount(path->mnt);

        mnt_flags &= ~MNT_INTERNAL_FLAGS;

        if (unlikely(!check_mnt(parent))) {
                /*
                 * A parent that fails check_mnt() is acceptable only for
                 * automounts done in a private ns, and even then the
                 * mountpoint had better still be alive.
                 */
                if (!(mnt_flags & MNT_SHRINKABLE) || !parent->mnt_ns)
                        return -EINVAL;
        }

        /* Refuse the same filesystem on the same mount point */
        if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
            path->mnt->mnt_root == path->dentry)
                return -EBUSY;

        if (d_is_symlink(newmnt->mnt.mnt_root))
                return -EINVAL;

        newmnt->mnt.mnt_flags = mnt_flags;

        return graft_tree(newmnt, parent, mp);
}

static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);

/*
 * Create a new mount using a superblock configuration and request it
 * be added to the namespace tree.
 */
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
                           unsigned int mnt_flags)
{
        struct vfsmount *mnt;
        struct mountpoint *mp;
        struct super_block *sb = fc->root->d_sb;
        int error;

        error = security_sb_kern_mount(sb);
        if (!error && mount_too_revealing(sb, &mnt_flags))
                error = -EPERM;

        if (unlikely(error)) {
                fc_drop_locked(fc);
                return error;
        }

        up_write(&sb->s_umount);

        mnt = vfs_create_mount(fc);
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);

        mnt_warn_timestamp_expiry(mountpoint, mnt);

        mp = lock_mount(mountpoint);
        if (IS_ERR(mp)) {
                mntput(mnt);
                return PTR_ERR(mp);
        }
        error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
        unlock_mount(mp);
        if (error < 0)
                mntput(mnt);
        return error;
}

/*
 * Create a new mount for userspace and request that it be added into the
 * namespace's tree at @path.
 *
 * @fstype selects the filesystem; for types with FS_HAS_SUBTYPE the text
 * after the first '.' is passed on as the "subtype" option and must not be
 * empty.  @name, when given, is passed on as the "source" option and @data
 * is parsed as monolithic mount data.
 *
 * Returns -EINVAL for a NULL @fstype or an empty subtype, -ENODEV for an
 * unknown filesystem type, -EPERM when mount_capable() says no, otherwise
 * the first error from the context/option/tree helpers or the result of
 * do_new_mount_fc().
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
                        int mnt_flags, const char *name, void *data)
{
        struct file_system_type *type;
        struct fs_context *fc;
        const char *subtype = NULL;
        int err;

        if (!fstype)
                return -EINVAL;

        type = get_fs_type(fstype);
        if (!type)
                return -ENODEV;

        if (type->fs_flags & FS_HAS_SUBTYPE) {
                const char *dot = strchr(fstype, '.');

                if (dot) {
                        /* "type." with nothing after the dot is invalid. */
                        if (!dot[1]) {
                                put_filesystem(type);
                                return -EINVAL;
                        }
                        subtype = dot + 1;
                }
        }

        fc = fs_context_for_mount(type, sb_flags);
        put_filesystem(type);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Indicate to the filesystem that the mount request is coming
         * from the legacy mount system call.
         */
        fc->oldapi = true;

        if (subtype) {
                err = vfs_parse_fs_string(fc, "subtype",
                                          subtype, strlen(subtype));
                if (err)
                        goto out;
        }

        if (name) {
                err = vfs_parse_fs_string(fc, "source", name, strlen(name));
                if (err)
                        goto out;
        }

        err = parse_monolithic_mount_data(fc, data);
        if (err)
                goto out;

        if (!mount_capable(fc)) {
                err = -EPERM;
                goto out;
        }

        err = vfs_get_tree(fc);
        if (err)
                goto out;

        err = do_new_mount_fc(fc, path, mnt_flags);
out:
        put_fs_context(fc);
        return err;
}

/*
 * Attach the automounted mount @m at @path.
 *
 * A NULL @m means there is nothing to do (returns 0); an ERR_PTR @m is
 * returned as its error code.  Otherwise @m is added at @path through
 * do_add_mount() with MNT_SHRINKABLE set in addition to the flags of
 * path->mnt.
 *
 * On success one reference to @m is dropped and 0 is returned.  On every
 * other exit - including the case where something is already mounted on
 * @path, which also returns 0 - @m is removed from any expiry list it is on
 * and two references are dropped.
 */
int finish_automount(struct vfsmount *m, struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct mountpoint *mp;
        struct mount *mnt;
        int err;

        if (!m)
                return 0;
        if (IS_ERR(m))
                return PTR_ERR(m);

        mnt = real_mount(m);
        /* The new mount record should have at least 2 refs to prevent it being
         * expired before we get a chance to add it
         */
        BUG_ON(mnt_get_count(mnt) < 2);

        /* Mounting something onto its own root would create a loop. */
        if (m->mnt_sb == path->mnt->mnt_sb &&
            m->mnt_root == dentry) {
                err = -ELOOP;
                goto discard;
        }

        /*
         * we don't want to use lock_mount() - in this case finding that
         * something already overmounts our mountpoint means "quietly drop
         * what we've got", not "try to mount it on top".
         */
        inode_lock(dentry->d_inode);
        namespace_lock();
        if (unlikely(cant_mount(dentry))) {
                err = -ENOENT;
                goto discard_locked;
        }
        rcu_read_lock();
        if (unlikely(__lookup_mnt(path->mnt, dentry))) {
                /* Something is already mounted here: not an error. */
                rcu_read_unlock();
                err = 0;
                goto discard_locked;
        }
        rcu_read_unlock();
        mp = get_mountpoint(dentry);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto discard_locked;
        }

        err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
        /*
         * NOTE(review): neither namespace_unlock() nor inode_unlock() is
         * called explicitly on this path, so unlock_mount() is presumably
         * what releases both locks taken above - confirm.
         */
        unlock_mount(mp);
        if (unlikely(err))
                goto discard;
        mntput(m);
        return 0;

discard_locked:
        namespace_unlock();
        inode_unlock(dentry->d_inode);
discard:
        /* remove m from any expiration list it may be on */
        if (!list_empty(&mnt->mnt_expire)) {
                namespace_lock();
                list_del_init(&mnt->mnt_expire);
                namespace_unlock();
        }
        /* Drop both of the references checked for by the BUG_ON() above. */
        mntput(m);
        mntput(m);
        return err;
}

/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
        namespace_lock();

        list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

        namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);

/*
 * Process a list of expirable mountpoints, discarding those that are not in
 * use and have not been touched since the previous call.
 *
 * Every mount on @mounts gets its expiry mark set.  Mounts whose mark was
 * already set and which propagate_mount_busy() does not report as busy are
 * collected on a private list and unmounted.
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
        struct mount *mnt, *next;
        LIST_HEAD(graveyard);

        if (list_empty(mounts))
                return;

        namespace_lock();
        lock_mount_hash();

        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
         * - only referenced by its parent vfsmount
         * - still marked for expiry (marked on the last call here; marks are
         *   cleared by mntput())
         */
        list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
                if (xchg(&mnt->mnt_expiry_mark, 1) &&
                    !propagate_mount_busy(mnt, 1))
                        list_move(&mnt->mnt_expire, &graveyard);
        }

        /* Unmount the candidates one by one until the list is empty. */
        while (!list_empty(&graveyard)) {
                mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
                touch_mnt_namespace(mnt->mnt_ns);
                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
        }

        unlock_mount_hash();
        namespace_unlock();
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 *
 * The walk is iterative: it descends through children that are
 * MNT_SHRINKABLE and have children of their own, and moves shrinkable leaf
 * mounts that propagate_mount_busy() does not report as busy onto the tail
 * of @graveyard (linked through mnt_expire).  Children without
 * MNT_SHRINKABLE are skipped and not descended into.
 *
 * Returns the number of mounts moved to @graveyard.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
        struct mount *this_parent = parent;
        struct list_head *next;
        int found = 0;

repeat:
        /* Start on the first child of the current level. */
        next = this_parent->mnt_mounts.next;
resume:
        while (next != &this_parent->mnt_mounts) {
                struct list_head *tmp = next;
                struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

                /* Advance first, so that 'continue' moves on to the sibling. */
                next = tmp->next;
                if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
                        continue;
                /*
                 * Descend a level if the mnt_mounts list is non-empty.
                 */
                if (!list_empty(&mnt->mnt_mounts)) {
                        this_parent = mnt;
                        goto repeat;
                }

                if (!propagate_mount_busy(mnt, 1)) {
                        list_move_tail(&mnt->mnt_expire, graveyard);
                        found++;
                }
        }
        /*
         * All done at this level ... ascend and resume the search
         */
        if (this_parent != parent) {
                /* Continue with the sibling following the level just left. */
                next = this_parent->mnt_child.next;
                this_parent = this_parent->mnt_parent;
                goto resume;
        }
        return found;
}

/*
 * Discard the shrinkable submounts of @mnt.
 *
 * select_submounts() is called repeatedly; each batch it collects is
 * unmounted before the next pass, and the loop ends once a pass finds
 * nothing.
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
        LIST_HEAD(graveyard);
        struct mount *victim;

        for (;;) {
                /* extract submounts of 'mountpoint' from the expiration list */
                if (!select_submounts(mnt, &graveyard))
                        break;

                while (!list_empty(&graveyard)) {
                        victim = list_first_entry(&graveyard, struct mount,
                                                  mnt_expire);
                        touch_mnt_namespace(victim->mnt_ns);
                        umount_tree(victim, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                }
        }
}

/*
 * Copy up to PAGE_SIZE bytes of mount options from user space.
 *
 * Returns NULL when @data is NULL, ERR_PTR(-ENOMEM) when the buffer cannot
 * be allocated, ERR_PTR(-EFAULT) when not a single byte could be copied,
 * and otherwise a kmalloc()ed PAGE_SIZE buffer which the caller must
 * kfree().  A partial copy is not an error.
 */
static void *copy_mount_options(const void __user * data)
{
        unsigned left, offset;
        char *copy;

        if (!data)
                return NULL;

        copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!copy)
                return ERR_PTR(-ENOMEM);

        left = copy_from_user(copy, data, PAGE_SIZE);

        /*
         * Not all architectures have an exact copy_from_user(). Resort to
         * byte at a time for whatever it reported as not copied, stopping
         * at the first byte that cannot be read.
         */
        for (offset = PAGE_SIZE - left; left; left--, offset++) {
                char c;

                if (get_user(c, (const char __user *)data + offset))
                        break;
                copy[offset] = c;
        }

        /* Nothing at all could be read from the user pointer. */
        if (left == PAGE_SIZE) {
                kfree(copy);
                return ERR_PTR(-EFAULT);
        }

        return copy;
}

/*
 * Duplicate a user-space string of at most PATH_MAX bytes with
 * strndup_user().  A NULL @data yields NULL rather than an error.
 */
static char *copy_mount_string(const void __user *data)
{
        if (!data)
                return NULL;

        return strndup_user(data, PATH_MAX);
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 *
 * After the permission checks the MS_* flags are split into per-mount
 * MNT_* flags and per-superblock SB_* flags, and the request is handed to
 * the helper for the operation being asked for: bind-remount, remount,
 * bind, propagation change, move or - if none of those - a new mount.
 */
int path_mount(const char *dev_name, struct path *path,
                const char *type_page, unsigned long flags, void *data_page)
{
        unsigned int mnt_flags = 0, sb_flags;
        int ret;

        /* Discard magic */
        if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
                flags &= ~MS_MGC_MSK;

        /* Basic sanity checks: force termination of the option page */
        if (data_page)
                ((char *)data_page)[PAGE_SIZE - 1] = 0;

        if (flags & MS_NOUSER)
                return -EINVAL;

        ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
        if (ret)
                return ret;
        if (!may_mount())
                return -EPERM;
        if ((flags & SB_MANDLOCK) && !may_mandlock())
                return -EPERM;

        /*
         * atime handling: relatime is the default unless noatime was asked
         * for, and strictatime overrides both of them.
         */
        if (flags & MS_NOATIME)
                mnt_flags |= MNT_NOATIME;
        else
                mnt_flags |= MNT_RELATIME;
        if (flags & MS_NODIRATIME)
                mnt_flags |= MNT_NODIRATIME;
        if (flags & MS_STRICTATIME)
                mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);

        /* Separate the remaining per-mountpoint flags */
        if (flags & MS_NOSUID)
                mnt_flags |= MNT_NOSUID;
        if (flags & MS_NODEV)
                mnt_flags |= MNT_NODEV;
        if (flags & MS_NOEXEC)
                mnt_flags |= MNT_NOEXEC;
        if (flags & MS_RDONLY)
                mnt_flags |= MNT_READONLY;
        if (flags & MS_NOSYMFOLLOW)
                mnt_flags |= MNT_NOSYMFOLLOW;

        /* The default atime for remount is preservation */
        if ((flags & MS_REMOUNT) &&
            ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
                       MS_STRICTATIME)) == 0)) {
                mnt_flags &= ~MNT_ATIME_MASK;
                mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
        }

        /* Everything that concerns the superblock rather than the mount */
        sb_flags = flags & (SB_RDONLY |
                            SB_SYNCHRONOUS |
                            SB_MANDLOCK |
                            SB_DIRSYNC |
                            SB_SILENT |
                            SB_POSIXACL |
                            SB_LAZYTIME |
                            SB_I_VERSION);

        /* Dispatch; the order of these tests decides which request wins. */
        if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
                return do_reconfigure_mnt(path, mnt_flags);

        if (flags & MS_REMOUNT)
                return do_remount(path, flags, sb_flags, mnt_flags, data_page);

        if (flags & MS_BIND)
                return do_loopback(path, dev_name, flags & MS_REC);

        if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
                return do_change_type(path, flags);

        if (flags & MS_MOVE)
                return do_move_mount_old(path, dev_name);

        return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
                            data_page);
}

/*
 * Resolve the user-space path @dir_name (following symlinks, relative to
 * the current working directory) and perform the mount operation described
 * by the remaining arguments on it via path_mount().
 *
 * Returns the lookup error or the result of path_mount().
 */
long do_mount(const char *dev_name, const char __user *dir_name,
                const char *type_page, unsigned long flags, void *data_page)
{
        struct path path;
        int ret;

        ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
        if (!ret) {
                ret = path_mount(dev_name, &path, type_page, flags, data_page);
                path_put(&path);
        }
        return ret;
}

/*
 * Charge one mount namespace (UCOUNT_MNT_NAMESPACES) to the current
 * effective uid in user namespace @ns.  Returns whatever inc_ucount()
 * returns; alloc_mnt_ns() treats a NULL result as -ENOSPC.
 */
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
{
        return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
}

/*
 * Undo inc_mnt_namespaces(): give back one UCOUNT_MNT_NAMESPACES charge on
 * @ucounts.
 */
static void dec_mnt_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
}

/*
 * Release everything alloc_mnt_ns() set up for @ns and free the structure:
 * the ns inode number (which anonymous namespaces never got), the ucount
 * charge and the reference on the owning user namespace.
 */
static void free_mnt_ns(struct mnt_namespace *ns)
{
        /* Only non-anonymous namespaces were given an inode number. */
        if (!is_anon_ns(ns))
                ns_free_inum(&ns->ns);
        dec_mnt_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        kfree(ns);
}

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  Even a 64bit
 * number incrementing at 10GHz would take roughly 58 years to wrap, and
 * mount namespaces are created nowhere near that fast, so we can ignore
 * the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

/*
 * Allocate and initialise a mount namespace owned by @user_ns.
 *
 * The namespace is charged to the caller through inc_mnt_namespaces().
 * Unless @anon is true it also gets an ns inode number and the next value
 * of mnt_ns_seq.  The reference count starts at 1.
 *
 * Returns the new namespace, ERR_PTR(-ENOSPC) when the ucount charge fails,
 * ERR_PTR(-ENOMEM) when the allocation fails, or the error from
 * ns_alloc_inum().
 */
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
        struct mnt_namespace *new_ns;
        struct ucounts *ucounts;
        int ret;

        ucounts = inc_mnt_namespaces(user_ns);
        if (!ucounts)
                return ERR_PTR(-ENOSPC);

        ret = -ENOMEM;
        new_ns = kzalloc(sizeof(*new_ns), GFP_KERNEL);
        if (!new_ns)
                goto uncharge;

        if (!anon) {
                ret = ns_alloc_inum(&new_ns->ns);
                if (ret)
                        goto free_ns;
                new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
        }

        new_ns->ns.ops = &mntns_operations;
        atomic_set(&new_ns->count, 1);
        INIT_LIST_HEAD(&new_ns->list);
        init_waitqueue_head(&new_ns->poll);
        spin_lock_init(&new_ns->ns_lock);
        new_ns->user_ns = get_user_ns(user_ns);
        new_ns->ucounts = ucounts;
        return new_ns;

free_ns:
        kfree(new_ns);
uncharge:
        dec_mnt_namespaces(ucounts);
        return ERR_PTR(ret);
}

/*
 * Return the mount namespace to use for a new task.
 *
 * Without CLONE_NEWNS in @flags, get_mnt_ns() is called on @ns and @ns
 * itself is returned.  Otherwise a new namespace owned by @user_ns is
 * allocated, the mount tree rooted at ns->root is copied into it and, when
 * @new_fs is given, its root and pwd mounts are switched over to the
 * corresponding mounts of the copy.
 *
 * Returns the namespace, or the ERR_PTR from alloc_mnt_ns() or copy_tree().
 */
__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                struct user_namespace *user_ns, struct fs_struct *new_fs)
{
        struct mnt_namespace *new_ns;
        struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
        struct mount *p, *q;
        struct mount *old;
        struct mount *new;
        int copy_flags;

        BUG_ON(!ns);

        if (likely(!(flags & CLONE_NEWNS))) {
                get_mnt_ns(ns);
                return ns;
        }

        old = ns->root;

        new_ns = alloc_mnt_ns(user_ns, false);
        if (IS_ERR(new_ns))
                return new_ns;

        namespace_lock();
        /* First pass: copy the tree topology */
        copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
        /* A copy made for a different user namespace gets extra treatment. */
        if (user_ns != ns->user_ns)
                copy_flags |= CL_SHARED_TO_SLAVE;
        new = copy_tree(old, old->mnt.mnt_root, copy_flags);
        if (IS_ERR(new)) {
                namespace_unlock();
                free_mnt_ns(new_ns);
                return ERR_CAST(new);
        }
        if (user_ns != ns->user_ns) {
                lock_mount_hash();
                lock_mnt_tree(new);
                unlock_mount_hash();
        }
        new_ns->root = new;
        list_add_tail(&new_ns->list, &new->mnt_list);

        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
         * as belonging to new namespace.  We have already acquired a private
         * fs_struct, so tsk->fs->lock is not needed.
         */
        p = old;
        q = new;
        while (p) {
                q->mnt_ns = new_ns;
                new_ns->mounts++;
                if (new_fs) {
                        /*
                         * Take a reference on the copy now; the reference on
                         * the original is dropped after the lock is released.
                         */
                        if (&p->mnt == new_fs->root.mnt) {
                                new_fs->root.mnt = mntget(&q->mnt);
                                rootmnt = &p->mnt;
                        }
                        if (&p->mnt == new_fs->pwd.mnt) {
                                new_fs->pwd.mnt = mntget(&q->mnt);
                                pwdmnt = &p->mnt;
                        }
                }
                p = next_mnt(p, old);
                q = next_mnt(q, new);
                if (!q)
                        break;
                /*
                 * Advance p until it lines up with q again.
                 * NOTE(review): presumably needed because copy_tree() can
                 * leave some mounts of the original out of the copy -
                 * confirm.
                 */
                while (p->mnt.mnt_root != q->mnt.mnt_root)
                        p = next_mnt(p, old);
        }
        namespace_unlock();

        if (rootmnt)
                mntput(rootmnt);
        if (pwdmnt)
                mntput(pwdmnt);

        return new_ns;
}

/*
 * mount_subtree - look up @name starting from the root of mount @m
 * @m:    mount to resolve @name on
 * @name: path handed to vfs_path_lookup(), relative to @m's root
 *
 * @m is made the only mount of a freshly allocated mount namespace for the
 * duration of the lookup, and our reference on that namespace is dropped
 * again as soon as the lookup returns.
 *
 * Returns the dentry found, with an s_active reference taken on its
 * superblock and s_umount held for writing, or an ERR_PTR() on failure.
 * If the namespace cannot be allocated, mntput() is called on @m before
 * returning.
 */
struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
        struct mount *mnt = real_mount(m);
        struct mnt_namespace *ns;
        struct super_block *s;
        struct path path;
        int err;

        ns = alloc_mnt_ns(&init_user_ns, true);
        if (IS_ERR(ns)) {
                /* nothing else will release @m on this path */
                mntput(m);
                return ERR_CAST(ns);
        }
        /* make @m the root (and only member) of the new namespace */
        mnt->mnt_ns = ns;
        ns->root = mnt;
        ns->mounts++;
        list_add(&mnt->mnt_list, &ns->list);

        err = vfs_path_lookup(m->mnt_root, m,
                        name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

        /* done with the namespace whether or not the lookup succeeded */
        put_mnt_ns(ns);

        if (err)
                return ERR_PTR(err);

        /* trade a vfsmount reference for active sb one */
        s = path.mnt->mnt_sb;
        atomic_inc(&s->s_active);
        mntput(path.mnt);
        /* lock the sucker */
        down_write(&s->s_umount);
        /* ... and return the root of (sub)tree on it */
        return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);

/*
 * mount(2) entry point.
 *
 * Copies the filesystem type, device name and option data in from user
 * space, passes them to do_mount() together with the (still user-space)
 * @dir_name pointer and @flags, and frees the kernel copies on the way out.
 * Returns do_mount()'s result, or the error from whichever copy failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
                char __user *, type, unsigned long, flags, void __user *, data)
{
        char *kernel_type;
        char *kernel_dev;
        void *options;
        int ret;

        kernel_type = copy_mount_string(type);
        if (IS_ERR(kernel_type)) {
                ret = PTR_ERR(kernel_type);
                goto out_type;
        }

        kernel_dev = copy_mount_string(dev_name);
        if (IS_ERR(kernel_dev)) {
                ret = PTR_ERR(kernel_dev);
                goto out_dev;
        }

        options = copy_mount_options(data);
        if (IS_ERR(options)) {
                ret = PTR_ERR(options);
                goto out_data;
        }

        ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);

        /* unwind in reverse order of acquisition */
        kfree(options);
out_data:
        kfree(kernel_dev);
out_dev:
        kfree(kernel_type);
out_type:
        return ret;
}

/*
 * Create a kernel mount representation for a new, prepared superblock
 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
 *
 * @fs_fd:      fd whose file must use fscontext_fops, otherwise -EINVAL
 * @flags:      only FSMOUNT_CLOEXEC is accepted, anything else is -EINVAL
 * @attr_flags: MOUNT_ATTR_* bits, translated into MNT_* mount flags below
 *
 * Returns the new file descriptor on success or a negative errno.
 */
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
                unsigned int, attr_flags)
{
        struct mnt_namespace *ns;
        struct fs_context *fc;
        struct file *file;
        struct path newmount;
        struct mount *mnt;
        struct fd f;
        unsigned int mnt_flags = 0;
        long ret;

        if (!may_mount())
                return -EPERM;

        if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
                return -EINVAL;

        /* reject any attribute bit we do not know how to translate */
        if (attr_flags & ~(MOUNT_ATTR_RDONLY |
                           MOUNT_ATTR_NOSUID |
                           MOUNT_ATTR_NODEV |
                           MOUNT_ATTR_NOEXEC |
                           MOUNT_ATTR__ATIME |
                           MOUNT_ATTR_NODIRATIME))
                return -EINVAL;

        /* one-to-one translation of the simple attribute bits */
        if (attr_flags & MOUNT_ATTR_RDONLY)
                mnt_flags |= MNT_READONLY;
        if (attr_flags & MOUNT_ATTR_NOSUID)
                mnt_flags |= MNT_NOSUID;
        if (attr_flags & MOUNT_ATTR_NODEV)
                mnt_flags |= MNT_NODEV;
        if (attr_flags & MOUNT_ATTR_NOEXEC)
                mnt_flags |= MNT_NOEXEC;
        if (attr_flags & MOUNT_ATTR_NODIRATIME)
                mnt_flags |= MNT_NODIRATIME;

        /* the atime setting is a multi-bit field, not independent bits */
        switch (attr_flags & MOUNT_ATTR__ATIME) {
        case MOUNT_ATTR_STRICTATIME:
                break;
        case MOUNT_ATTR_NOATIME:
                mnt_flags |= MNT_NOATIME;
                break;
        case MOUNT_ATTR_RELATIME:
                mnt_flags |= MNT_RELATIME;
                break;
        default:
                return -EINVAL;
        }

        f = fdget(fs_fd);
        if (!f.file)
                return -EBADF;

        ret = -EINVAL;
        if (f.file->f_op != &fscontext_fops)
                goto err_fsfd;

        fc = f.file->private_data;

        ret = mutex_lock_interruptible(&fc->uapi_mutex);
        if (ret < 0)
                goto err_fsfd;

        /* There must be a valid superblock or we can't mount it */
        ret = -EINVAL;
        if (!fc->root)
                goto err_unlock;

        ret = -EPERM;
        if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
                pr_warn("VFS: Mount too revealing\n");
                goto err_unlock;
        }

        ret = -EBUSY;
        if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
                goto err_unlock;

        ret = -EPERM;
        if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
                goto err_unlock;

        newmount.mnt = vfs_create_mount(fc);
        if (IS_ERR(newmount.mnt)) {
                ret = PTR_ERR(newmount.mnt);
                goto err_unlock;
        }
        /* from here on newmount holds a mount and a dentry reference */
        newmount.dentry = dget(fc->root);
        newmount.mnt->mnt_flags = mnt_flags;

        /* We've done the mount bit - now move the file context into more or
         * less the same state as if we'd done an fspick().  We don't want to
         * do any memory allocation or anything like that at this point as we
         * don't want to have to handle any errors incurred.
         */
        vfs_clean_context(fc);

        ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
        if (IS_ERR(ns)) {
                ret = PTR_ERR(ns);
                goto err_path;
        }
        /* the new mount becomes the root and sole member of the new ns */
        mnt = real_mount(newmount.mnt);
        mnt->mnt_ns = ns;
        ns->root = mnt;
        ns->mounts = 1;
        list_add(&mnt->mnt_list, &ns->list);
        /*
         * Extra mount reference on top of the one held via newmount.
         * NOTE(review): presumably the one released when the mount is
         * dissolved on fput - confirm against dissolve_on_fput().
         */
        mntget(newmount.mnt);

        /* Attach to an apparent O_PATH fd with a note that we need to unmount
         * it, not just simply put it.
         */
        file = dentry_open(&newmount, O_PATH, fc->cred);
        if (IS_ERR(file)) {
                dissolve_on_fput(newmount.mnt);
                ret = PTR_ERR(file);
                goto err_path;
        }
        file->f_mode |= FMODE_NEED_UNMOUNT;

        ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
        if (ret >= 0)
                fd_install(ret, file);
        else
                fput(file);

        /*
         * The success path falls through the labels below as well: the
         * references held in newmount are dropped in either case.
         */
err_path:
        path_put(&newmount);
err_unlock:
        mutex_unlock(&fc->uapi_mutex);
err_fsfd:
        fdput(f);
        return ret;
}

/*
 * move_mount(2): relocate a mount from one location to another.
 *
 * Together with fsopen()/fsmount() this installs a freshly created mount;
 * together with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used
 * to copy a mount subtree.
 *
 * @flags is a combination of MOVE_MOUNT_* values; the MOVE_MOUNT_F_* bits
 * control how the source path is looked up and the MOVE_MOUNT_T_* bits how
 * the destination path is looked up.
 */
SYSCALL_DEFINE5(move_mount,
                int, from_dfd, const char __user *, from_pathname,
                int, to_dfd, const char __user *, to_pathname,
                unsigned int, flags)
{
        struct path from_path, to_path;
        unsigned int from_lookup = 0;
        unsigned int to_lookup = 0;
        int ret;

        if (!may_mount())
                return -EPERM;

        if (flags & ~MOVE_MOUNT__MASK)
                return -EINVAL;

        /* If someone gives a pathname, they aren't permitted to move
         * from an fd that requires unmount as we can't get at the flag
         * to clear it afterwards.
         */
        if (flags & MOVE_MOUNT_F_SYMLINKS)
                from_lookup |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
                from_lookup |= LOOKUP_AUTOMOUNT;
        if (flags & MOVE_MOUNT_F_EMPTY_PATH)
                from_lookup |= LOOKUP_EMPTY;

        /* Same translation for the destination side */
        if (flags & MOVE_MOUNT_T_SYMLINKS)
                to_lookup |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
                to_lookup |= LOOKUP_AUTOMOUNT;
        if (flags & MOVE_MOUNT_T_EMPTY_PATH)
                to_lookup |= LOOKUP_EMPTY;

        ret = user_path_at(from_dfd, from_pathname, from_lookup, &from_path);
        if (ret < 0)
                return ret;

        ret = user_path_at(to_dfd, to_pathname, to_lookup, &to_path);
        if (ret < 0)
                goto out_from;

        /* Only attempt the move once the security hook has agreed to it */
        ret = security_move_mount(&from_path, &to_path);
        if (ret >= 0)
                ret = do_move_mount(&from_path, &to_path);

        path_put(&to_path);
out_from:
        path_put(&from_path);
        return ret;
}

/*
 * Return true if the location (@mnt, @dentry) is reachable from @root.
 *
 * Climbs from @mnt towards its ancestors, replacing @dentry with the
 * mountpoint at each step, until it reaches root->mnt or a mount without a
 * parent.  Only if root->mnt was reached is the dentry checked against
 * root->dentry with is_subdir().
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
                         const struct path *root)
{
        for (;;) {
                if (&mnt->mnt == root->mnt)
                        break;
                /* ran out of ancestors without meeting root's mount */
                if (!mnt_has_parent(mnt))
                        return false;
                dentry = mnt->mnt_mountpoint;
                mnt = mnt->mnt_parent;
        }
        return is_subdir(dentry, root->dentry);
}

/*
 * Return true if @path1 is reachable from @path2.  The check is done by
 * is_path_reachable() with mount_lock taken via read_seqlock_excl().
 */
bool path_is_under(const struct path *path1, const struct path *path2)
{
        struct mount *start = real_mount(path1->mnt);
        bool reachable;

        read_seqlock_excl(&mount_lock);
        reachable = is_path_reachable(start, path1->dentry, path2);
        read_sequnlock_excl(&mount_lock);

        return reachable;
}
EXPORT_SYMBOL(path_is_under);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                const char __user *, put_old)
{
        struct path new, old, root;
        struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
        struct mountpoint *old_mp, *root_mp;
        int error;

        if (!may_mount())
                return -EPERM;

        /* both arguments must resolve to directories; symlinks are followed */
        error = user_path_at(AT_FDCWD, new_root,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
        if (error)
                goto out0;

        error = user_path_at(AT_FDCWD, put_old,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
        if (error)
                goto out1;

        error = security_sb_pivotroot(&old, &new);
        if (error)
                goto out2;

        get_fs_root(current->fs, &root);
        old_mp = lock_mount(&old);
        error = PTR_ERR(old_mp);
        if (IS_ERR(old_mp))
                goto out3;

        /* all checks up to the -ENOENT one below fail with -EINVAL */
        error = -EINVAL;
        new_mnt = real_mount(new.mnt);
        root_mnt = real_mount(root.mnt);
        old_mnt = real_mount(old.mnt);
        /* remember both parents now; the tree is rearranged further down */
        ex_parent = new_mnt->mnt_parent;
        root_parent = root_mnt->mnt_parent;
        if (IS_MNT_SHARED(old_mnt) ||
                IS_MNT_SHARED(ex_parent) ||
                IS_MNT_SHARED(root_parent))
                goto out4;
        if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
                goto out4;
        if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out4;
        error = -ENOENT;
        if (d_unlinked(new.dentry))
                goto out4;
        error = -EBUSY;
        if (new_mnt == root_mnt || old_mnt == root_mnt)
                goto out4; /* loop, on the same file system  */
        error = -EINVAL;
        if (root.mnt->mnt_root != root.dentry)
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(root_mnt))
                goto out4; /* not attached */
        if (new.mnt->mnt_root != new.dentry)
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(new_mnt))
                goto out4; /* not attached */
        /* make sure we can reach put_old from new_root */
        if (!is_path_reachable(old_mnt, old.dentry, &new))
                goto out4;
        /* make certain new is below the root */
        if (!is_path_reachable(new_mnt, new.dentry, &root))
                goto out4;
        /* all checks passed: rearrange the tree under the mount hash lock */
        lock_mount_hash();
        umount_mnt(new_mnt);
        root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
        /* a MNT_LOCKED flag on the old root moves over to the new root */
        if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
                new_mnt->mnt.mnt_flags |= MNT_LOCKED;
                root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
        }
        /* mount old root on put_old */
        attach_mnt(root_mnt, old_mnt, old_mp);
        /* mount new_root on / */
        attach_mnt(new_mnt, root_parent, root_mp);
        mnt_add_count(root_parent, -1);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        /* A moved mount should not expire automatically */
        list_del_init(&new_mnt->mnt_expire);
        put_mountpoint(root_mp);
        unlock_mount_hash();
        /* switch root/cwd references from the old root over to the new one */
        chroot_fs_refs(&root, &new);
        error = 0;
out4:
        unlock_mount(old_mp);
        /* on success only: drop a reference on new_root's former parent */
        if (!error)
                mntput_no_expire(ex_parent);
out3:
        path_put(&root);
out2:
        path_put(&old);
out1:
        path_put(&new);
out0:
        return error;
}

/*
 * Boot-time setup of the initial mount tree: mount rootfs, wrap it in a
 * newly allocated mount namespace, install that namespace in init_task and
 * point the current task's pwd and root at the rootfs root.  Panics if
 * either rootfs or the namespace cannot be created.
 */
static void __init init_mount_tree(void)
{
        struct vfsmount *mnt;
        struct mount *m;
        struct mnt_namespace *ns;
        struct path root;

        mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");

        ns = alloc_mnt_ns(&init_user_ns, false);
        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");
        /* rootfs is the root and sole member of the initial namespace */
        m = real_mount(mnt);
        m->mnt_ns = ns;
        ns->root = m;
        ns->mounts = 1;
        list_add(&m->mnt_list, &ns->list);
        init_task.nsproxy->mnt_ns = ns;
        /* NOTE(review): extra reference, presumably for init_task's nsproxy */
        get_mnt_ns(ns);

        root.mnt = mnt;
        root.dentry = mnt->mnt_root;
        mnt->mnt_flags |= MNT_LOCKED;

        set_fs_pwd(current->fs, &root);
        set_fs_root(current->fs, &root);
}

/*
 * Boot-time initialisation of the mount layer: creates the slab cache for
 * struct mount and the mount/mountpoint hash tables, initialises kernfs,
 * sysfs, the "fs" kobject, shmem and rootfs, and finally builds the initial
 * mount tree.  A hash table allocation failure panics; sysfs and kobject
 * failures are only logged.
 */
void __init mnt_init(void)
{
        int err;

        /* SLAB_PANIC: no NULL check is made on the cache here */
        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
                        0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

        mount_hashtable = alloc_large_system_hash("Mount-cache",
                                sizeof(struct hlist_head),
                                mhash_entries, 19,
                                HASH_ZERO,
                                &m_hash_shift, &m_hash_mask, 0, 0);
        mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
                                sizeof(struct hlist_head),
                                mphash_entries, 19,
                                HASH_ZERO,
                                &mp_hash_shift, &mp_hash_mask, 0, 0);

        if (!mount_hashtable || !mountpoint_hashtable)
                panic("Failed to allocate mount hash table\n");

        kernfs_init();

        /* failures from here to shmem_init() are warned about, not fatal */
        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
                        __func__, err);
        fs_kobj = kobject_create_and_add("fs", NULL);
        if (!fs_kobj)
                printk(KERN_WARNING "%s: kobj create error\n", __func__);
        shmem_init();
        init_rootfs();
        init_mount_tree();
}

/*
 * Drop one reference on @ns.  When ns->count reaches zero, the mounts
 * hanging off ns->root are dropped and the namespace itself is freed.
 */
void put_mnt_ns(struct mnt_namespace *ns)
{
        if (atomic_dec_and_test(&ns->count)) {
                /* last reference gone: tear down the tree, then the ns */
                drop_collected_mounts(&ns->root->mnt);
                free_mnt_ns(ns);
        }
}

/*
 * Mount @type with SB_KERNMOUNT, using the type's own name as the device
 * name.  On success the mount is tagged with MNT_NS_INTERNAL; on failure
 * the ERR_PTR() from vfs_kern_mount() is returned unchanged.
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
        struct vfsmount *kmnt;

        kmnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
        if (IS_ERR(kmnt))
                return kmnt;

        /*
         * it is a longterm mount, don't release mnt until
         * we unmount before file sys is unregistered
         */
        real_mount(kmnt)->mnt_ns = MNT_NS_INTERNAL;
        return kmnt;
}
EXPORT_SYMBOL_GPL(kern_mount);

/*
 * Undo kern_mount(): clear the mount's namespace marker, wait for an RCU
 * grace period and drop the mount reference.  A NULL or ERR_PTR() @mnt is
 * silently ignored.
 */
void kern_unmount(struct vfsmount *mnt)
{
        if (IS_ERR_OR_NULL(mnt))
                return;

        /* release long term mount so mount point can be released */
        real_mount(mnt)->mnt_ns = NULL;
        synchronize_rcu();        /* yecchhh... */
        mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);

/*
 * Batched form of kern_unmount() for the @num entries of @mnt[].
 *
 * The namespace marker of every non-NULL entry is cleared first, then a
 * single expedited RCU grace period is waited for, and finally mntput() is
 * called on every entry.
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
        unsigned int idx;

        for (idx = 0; idx < num; idx++) {
                struct vfsmount *entry = mnt[idx];

                if (entry)
                        real_mount(entry)->mnt_ns = NULL;
        }

        /* one grace period covers the whole array */
        synchronize_rcu_expedited();

        for (idx = 0; idx < num; idx++) {
                mntput(mnt[idx]);
        }
}
EXPORT_SYMBOL(kern_unmount_array);

/* Return the result of check_mnt() for the struct mount behind @mnt. */
bool our_mnt(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        return check_mnt(m);
}

/*
 * Does the current process have a non-standard root?
 *
 * Starts at the root of the current mount namespace, follows mounts stacked
 * on it downwards, and compares the resulting path with the task's fs root.
 * Returns true when the two differ.
 */
bool current_chrooted(void)
{
        struct path ns_root;
        struct path fs_root;
        bool same_root;

        /* Find the namespace root */
        ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
        ns_root.dentry = ns_root.mnt->mnt_root;
        path_get(&ns_root);

        /* descend while something is mounted here and we can step into it */
        while (d_mountpoint(ns_root.dentry)) {
                if (!follow_down_one(&ns_root))
                        break;
        }

        get_fs_root(current->fs, &fs_root);
        same_root = path_equal(&fs_root, &ns_root);

        path_put(&fs_root);
        path_put(&ns_root);

        return !same_root;
}

/*
 * Check whether namespace @ns already contains a fully visible mount of the
 * same filesystem type as @sb whose locked flags are compatible with
 * *@new_mnt_flags.
 *
 * On a match, the MNT_LOCK_READONLY / MNT_LOCK_ATIME bits of the matching
 * mount are OR-ed into *@new_mnt_flags and true is returned; otherwise
 * *@new_mnt_flags is left untouched and false is returned.  The namespace
 * list is walked with namespace_sem held for reading and the ns list locked.
 */
static bool mnt_already_visible(struct mnt_namespace *ns,
                                const struct super_block *sb,
                                int *new_mnt_flags)
{
        int new_flags = *new_mnt_flags;
        struct mount *mnt;
        bool visible = false;

        down_read(&namespace_sem);
        lock_ns_list(ns);
        list_for_each_entry(mnt, &ns->list, mnt_list) {
                struct mount *child;
                int mnt_flags;

                /* cursors are list placeholders, not real mounts */
                if (mnt_is_cursor(mnt))
                        continue;

                if (mnt->mnt.mnt_sb->s_type != sb->s_type)
                        continue;

                /* This mount is not fully visible if its root directory
                 * is not the root directory of the filesystem.
                 */
                if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
                        continue;

                /* A local view of the mount flags */
                mnt_flags = mnt->mnt.mnt_flags;

                /* Don't miss readonly hidden in the superblock flags */
                if (sb_rdonly(mnt->mnt.mnt_sb))
                        mnt_flags |= MNT_LOCK_READONLY;

                /* Verify the mount flags are equal to or more permissive
                 * than the proposed new mount.
                 */
                if ((mnt_flags & MNT_LOCK_READONLY) &&
                    !(new_flags & MNT_READONLY))
                        continue;
                if ((mnt_flags & MNT_LOCK_ATIME) &&
                    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
                        continue;

                /* This mount is not fully visible if there are any
                 * locked child mounts that cover anything except for
                 * empty directories.
                 */
                list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
                        struct inode *inode = child->mnt_mountpoint->d_inode;
                        /* Only worry about locked mounts */
                        if (!(child->mnt.mnt_flags & MNT_LOCKED))
                                continue;
                        /* Is the directory permanently empty? */
                        if (!is_empty_dir_inode(inode))
                                goto next;
                }
                /* Preserve the locked attributes */
                *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
                                               MNT_LOCK_ATIME);
                visible = true;
                goto found;
                /* "next" continues the outer loop from inside the inner one */
        next:        ;
        }
found:
        unlock_ns_list(ns);
        up_read(&namespace_sem);
        return visible;
}

/*
 * Decide whether mounting @sb in the current mount namespace would expose
 * more than is already visible there.
 *
 * Never the case when the namespace is owned by init_user_ns or when the
 * superblock lacks SB_I_USERNS_VISIBLE.  A superblock that has
 * SB_I_USERNS_VISIBLE but not both SB_I_NOEXEC and SB_I_NODEV triggers a
 * one-time warning and is treated as too revealing.  Otherwise the answer is
 * the negation of mnt_already_visible(), which may also add locked flag
 * bits to *@new_mnt_flags.
 */
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
        const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
        struct mnt_namespace *cur_ns = current->nsproxy->mnt_ns;
        unsigned long iflags;
        bool already_visible;

        if (cur_ns->user_ns == &init_user_ns)
                return false;

        /* Can this filesystem be too revealing? */
        iflags = sb->s_iflags;
        if (!(iflags & SB_I_USERNS_VISIBLE))
                return false;

        /* any required bit missing from iflags? */
        if (~iflags & required_iflags) {
                WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
                          required_iflags);
                return true;
        }

        already_visible = mnt_already_visible(cur_ns, sb, new_mnt_flags);
        return !already_visible;
}

/*
 * May suid/sgid bits, file caps or security labels on @mnt be honoured?
 *
 * Foreign mounts (accessed via fchdir or through /proc
 * symlinks) are always treated as if they are nosuid.  This
 * prevents namespaces from trusting potentially unsafe
 * suid/sgid bits, file caps, or security labels that originate
 * in other namespaces.
 */
bool mnt_may_suid(struct vfsmount *mnt)
{
        /* explicitly nosuid */
        if (mnt->mnt_flags & MNT_NOSUID)
                return false;

        /* fails check_mnt(), i.e. a foreign mount */
        if (!check_mnt(real_mount(mnt)))
                return false;

        return current_in_userns(mnt->mnt_sb->s_user_ns);
}

/*
 * proc_ns_operations ->get: return @task's mount namespace as an ns_common
 * with a reference taken, or NULL if the task has no nsproxy.  The lookup is
 * done under task_lock().
 */
static struct ns_common *mntns_get(struct task_struct *task)
{
        struct nsproxy *nsproxy;
        struct ns_common *ns;

        task_lock(task);
        nsproxy = task->nsproxy;
        if (!nsproxy) {
                ns = NULL;
        } else {
                ns = &nsproxy->mnt_ns->ns;
                get_mnt_ns(to_mnt_ns(ns));
        }
        task_unlock(task);

        return ns;
}

/* proc_ns_operations ->put: drop a reference on the mount namespace behind @ns. */
static void mntns_put(struct ns_common *ns)
{
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

        put_mnt_ns(mnt_ns);
}

/*
 * proc_ns_operations ->install: switch @nsset over to the mount namespace
 * behind @ns and reset its fs_struct's pwd and root to that namespace's
 * root.
 *
 * Returns 0 on success, -EPERM if the required capabilities are missing,
 * -EINVAL for an anonymous namespace or a shared fs_struct, or the error
 * from looking up the new root (in which case the old namespace is put
 * back).
 */
static int mntns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct nsproxy *nsproxy = nsset->nsproxy;
        struct fs_struct *fs = nsset->fs;
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
        struct user_namespace *user_ns = nsset->cred->user_ns;
        struct path root;
        int err;

        /*
         * Needs CAP_SYS_ADMIN over the target namespace's owner, plus
         * CAP_SYS_CHROOT and CAP_SYS_ADMIN in the nsset's own user namespace.
         */
        if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(user_ns, CAP_SYS_CHROOT) ||
            !ns_capable(user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (is_anon_ns(mnt_ns))
                return -EINVAL;

        /* pwd/root are rewritten below, so the fs_struct must not be shared */
        if (fs->users != 1)
                return -EINVAL;

        /* install the new namespace first; the lookup below runs with it */
        get_mnt_ns(mnt_ns);
        old_mnt_ns = nsproxy->mnt_ns;
        nsproxy->mnt_ns = mnt_ns;

        /* Find the root */
        err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
                                "/", LOOKUP_DOWN, &root);
        if (err) {
                /* revert to old namespace */
                nsproxy->mnt_ns = old_mnt_ns;
                put_mnt_ns(mnt_ns);
                return err;
        }

        /* committed: the reference held on the old namespace can go */
        put_mnt_ns(old_mnt_ns);

        /* Update the pwd and root */
        set_fs_pwd(fs, &root);
        set_fs_root(fs, &root);

        path_put(&root);
        return 0;
}

static struct user_namespace *mntns_owner(struct ns_common *ns)
{
        return to_mnt_ns(ns)->user_ns;
}

/*
 * proc_ns_operations table for the mount namespace ("mnt", CLONE_NEWNS),
 * wiring up the get/put/install/owner callbacks defined above.
 */
const struct proc_ns_operations mntns_operations = {
        .name                = "mnt",
        .type                = CLONE_NEWNS,
        .get                = mntns_get,
        .put                = mntns_put,
        .install        = mntns_install,
        .owner                = mntns_owner,
};

































































    1 


    1 
















    1 

    1 


    1 














    1 



































































































































































































    1 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_NOTIFY_H
#define _LINUX_FS_NOTIFY_H

/*
 * include/linux/fsnotify.h - generic hooks for filesystem notification, to
 * reduce in-source duplication from both dnotify and inotify.
 *
 * We don't compile any of this away in some complicated menagerie of ifdefs.
 * Instead, we rely on the code inside to optimize away as needed.
 *
 * (C) Copyright 2005 Robert Love
 */

#include <linux/fsnotify_backend.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/bug.h>

/*
 * Notify this @dir inode about a change in a child directory entry.
 * The directory entry may have turned positive or negative or its inode may
 * have changed (i.e. renamed over).
 *
 * Unlike fsnotify_parent(), the event will be reported regardless of the
 * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only
 * the child is interested and not the parent.
 *
 * @dir:    directory the event is reported on
 * @mask:   event mask passed through to fsnotify()
 * @child:  inode passed as the event data (FSNOTIFY_EVENT_INODE)
 * @name:   name of the entry within @dir
 * @cookie: passed through to fsnotify(); callers in this header use 0
 *          except for the two halves of a move, which share one cookie
 */
static inline void fsnotify_name(struct inode *dir, __u32 mask,
                                 struct inode *child,
                                 const struct qstr *name, u32 cookie)
{
        fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie);
}

/*
 * Report @mask on directory @dir for the entry @dentry, using the dentry's
 * inode and name and a zero cookie.
 */
static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
                                   __u32 mask)
{
        struct inode *child = d_inode(dentry);
        const struct qstr *name = &dentry->d_name;

        fsnotify_name(dir, mask, child, name, 0);
}

/*
 * Report @mask on @inode itself (no directory, no name).  FS_ISDIR is added
 * to the mask when @inode is a directory.
 */
static inline void fsnotify_inode(struct inode *inode, __u32 mask)
{
        const __u32 dir_flag = S_ISDIR(inode->i_mode) ? FS_ISDIR : 0;

        fsnotify(mask | dir_flag, inode, FSNOTIFY_EVENT_INODE, NULL, NULL,
                 inode, 0);
}

/*
 * Notify this dentry's parent about a child's events.
 *
 * The event goes to the child inode alone, bypassing the parent, when
 * @dentry is a directory that is not flagged
 * DCACHE_FSNOTIFY_PARENT_WATCHED, or when @dentry is IS_ROOT().  In all
 * other cases __fsnotify_parent() handles the event.  FS_ISDIR is added to
 * @mask for directories.  Returns whatever the chosen callee returns.
 */
static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        struct inode *inode = d_inode(dentry);

        if (S_ISDIR(inode->i_mode)) {
                mask |= FS_ISDIR;

                /* sb/mount marks are not interested in name of directory */
                if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
                        return fsnotify(mask, data, data_type, NULL, NULL,
                                        inode, 0);
        }

        /* disconnected dentry cannot notify parent */
        if (IS_ROOT(dentry))
                return fsnotify(mask, data, data_type, NULL, NULL, inode, 0);

        return __fsnotify_parent(dentry, mask, data, data_type);
}

/*
 * Convenience wrapper around fsnotify_parent() for events on a dentry: the
 * dentry's own inode is passed as the event data.  The return value of
 * fsnotify_parent() is discarded.
 */
static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
{
        struct inode *inode = d_inode(dentry);

        fsnotify_parent(dentry, mask, inode, FSNOTIFY_EVENT_INODE);
}

/*
 * Convenience wrapper around fsnotify_parent() for events on an open file:
 * the file's path is passed as the event data.  Files opened with
 * FMODE_NONOTIFY generate no event and 0 is returned for them.
 */
static inline int fsnotify_file(struct file *file, __u32 mask)
{
        const struct path *path;

        if (file->f_mode & FMODE_NONOTIFY)
                return 0;

        path = &file->f_path;
        return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
}

/*
 * Simple call site for access decisions.
 *
 * @mask is a MAY_* mask; only MAY_OPEN and MAY_READ are acted on, and
 * MAY_OPEN takes precedence when both are set.  An open of a file with
 * __FMODE_EXEC set first raises FS_OPEN_EXEC_PERM and returns early if that
 * is refused, then raises FS_OPEN_PERM.  A read raises FS_ACCESS_PERM.
 * Returns 0 when neither bit is set, otherwise the fsnotify_file() result.
 */
static inline int fsnotify_perm(struct file *file, int mask)
{
        if (mask & MAY_OPEN) {
                if (file->f_flags & __FMODE_EXEC) {
                        int err = fsnotify_file(file, FS_OPEN_EXEC_PERM);

                        if (err)
                                return err;
                }
                return fsnotify_file(file, FS_OPEN_PERM);
        }

        if (mask & MAY_READ)
                return fsnotify_file(file, FS_ACCESS_PERM);

        return 0;
}

/*
 * fsnotify_link_count - inode's link count changed
 *
 * Reported as an FS_ATTRIB event on @inode itself via fsnotify_inode().
 */
static inline void fsnotify_link_count(struct inode *inode)
{
        fsnotify_inode(inode, FS_ATTRIB);
}

/*
 * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
 *
 * Emits FS_MOVED_FROM on @old_dir and FS_MOVED_TO on @new_dir, both carrying
 * the same freshly obtained cookie, with FS_ISDIR added when @isdir is set
 * and FS_DN_RENAME added to the "from" event when both directories are the
 * same.  If @target is non-NULL its link count change is reported.  Finally
 * FS_MOVE_SELF is reported on the moved inode and the audit subsystem is
 * told about the new child.  The new name is taken from @moved.
 */
static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
                                 const struct qstr *old_name,
                                 int isdir, struct inode *target,
                                 struct dentry *moved)
{
        struct inode *source = moved->d_inode;
        u32 fs_cookie = fsnotify_get_cookie();
        const struct qstr *new_name = &moved->d_name;
        const __u32 dir_flag = isdir ? FS_ISDIR : 0;
        __u32 from_mask = FS_MOVED_FROM | dir_flag;
        __u32 to_mask = FS_MOVED_TO | dir_flag;

        /* a rename within one directory is flagged on the "from" side */
        if (old_dir == new_dir)
                from_mask |= FS_DN_RENAME;

        fsnotify_name(old_dir, from_mask, source, old_name, fs_cookie);
        fsnotify_name(new_dir, to_mask, source, new_name, fs_cookie);

        if (target)
                fsnotify_link_count(target);
        fsnotify_inode(source, FS_MOVE_SELF);
        audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
}

/*
 * fsnotify_inode_delete - an inode is being evicted from cache, clean up is
 * needed
 *
 * Thin wrapper around __fsnotify_inode_delete().
 */
static inline void fsnotify_inode_delete(struct inode *inode)
{
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed
 *
 * Thin wrapper around __fsnotify_vfsmount_delete().
 */
static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        __fsnotify_vfsmount_delete(mnt);
}

/*
 * fsnotify_inoderemove - an inode is going away
 *
 * Reports FS_DELETE_SELF on @inode first, then performs the same clean up
 * as fsnotify_inode_delete().
 */
static inline void fsnotify_inoderemove(struct inode *inode)
{
        fsnotify_inode(inode, FS_DELETE_SELF);
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_create - 'name' was linked in
 */
static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
{
        audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(inode, dentry, FS_CREATE);
}

/*
 * fsnotify_link - new hardlink in 'inode' directory
 * Note: We have to pass also the linked inode ptr as some filesystems leave
 *   new_dentry->d_inode NULL and instantiate inode pointer later
 *
 * @dir is the directory receiving the link, @inode the inode being linked
 * and @new_dentry the new name.  Reports the link count change on @inode,
 * informs audit, then reports FS_CREATE on @dir.
 */
static inline void fsnotify_link(struct inode *dir, struct inode *inode,
                                 struct dentry *new_dentry)
{
        fsnotify_link_count(inode);
        audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_name(dir, FS_CREATE, inode, &new_dentry->d_name, 0);
}

/*
 * fsnotify_delete - @dentry was unlinked and unhashed
 *
 * Caller must make sure that dentry->d_name is stable.
 *
 * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode
 * as this may be called after d_delete() and old_dentry may be negative.
 *
 * Reports FS_DELETE on @dir, with FS_ISDIR added when @inode is a directory.
 */
static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
                                   struct dentry *dentry)
{
        const __u32 dir_flag = S_ISDIR(inode->i_mode) ? FS_ISDIR : 0;

        fsnotify_name(dir, FS_DELETE | dir_flag, inode, &dentry->d_name, 0);
}

/**
 * d_delete_notify - delete a dentry and call fsnotify_delete()
 * @dir: The directory inode handed to fsnotify_delete() for the event
 * @dentry: The dentry to delete
 *
 * This helper is used to guarantee that the unlinked inode cannot be found
 * by lookup of this name after fsnotify_delete() event has been delivered.
 */
static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        /*
         * Hold a reference across d_delete() so the inode fetched above can
         * still be passed to fsnotify_delete(); dropped by iput() below.
         */
        ihold(inode);
        d_delete(dentry);
        fsnotify_delete(dir, inode, dentry);
        iput(inode);
}

/*
 * fsnotify_unlink - report that 'name' was unlinked
 *
 * The caller is responsible for keeping dentry->d_name stable.
 */
static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
{
        /*
         * Only report while the dentry still has an inode; a negative
         * dentry triggers a one-time warning and nothing is reported.
         */
        if (!WARN_ON_ONCE(d_is_negative(dentry)))
                fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_mkdir - directory 'name' was created
 *
 * Records an AUDIT_TYPE_CHILD_CREATE audit entry for @dentry under @inode,
 * then reports FS_CREATE | FS_ISDIR through fsnotify_dirent().
 *
 * NOTE(review): @inode looks like the parent directory of @dentry - confirm
 * against callers.
 */
static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
{
        audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR);
}

/*
 * fsnotify_rmdir - report that directory 'name' was removed
 *
 * The caller is responsible for keeping dentry->d_name stable.
 */
static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
{
        /*
         * Only report while the dentry still has an inode; a negative
         * dentry triggers a one-time warning and nothing is reported.
         */
        if (!WARN_ON_ONCE(d_is_negative(dentry)))
                fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_access - file was read
 *
 * Reports FS_ACCESS for @file through fsnotify_file().
 */
static inline void fsnotify_access(struct file *file)
{
        fsnotify_file(file, FS_ACCESS);
}

/*
 * fsnotify_modify - file was modified
 *
 * Reports FS_MODIFY for @file through fsnotify_file().
 */
static inline void fsnotify_modify(struct file *file)
{
        fsnotify_file(file, FS_MODIFY);
}

/*
 * fsnotify_open - report that a file was opened
 *
 * Always reports FS_OPEN; FS_OPEN_EXEC is added when __FMODE_EXEC is set
 * in file->f_flags.
 */
static inline void fsnotify_open(struct file *file)
{
        const __u32 event = (file->f_flags & __FMODE_EXEC) ?
                                (FS_OPEN | FS_OPEN_EXEC) : FS_OPEN;

        fsnotify_file(file, event);
}

/*
 * fsnotify_close - report that a file was closed
 *
 * Reports FS_CLOSE_WRITE when FMODE_WRITE is set in file->f_mode,
 * FS_CLOSE_NOWRITE otherwise.
 */
static inline void fsnotify_close(struct file *file)
{
        __u32 event = FS_CLOSE_NOWRITE;

        if (file->f_mode & FMODE_WRITE)
                event = FS_CLOSE_WRITE;

        fsnotify_file(file, event);
}

/*
 * fsnotify_xattr - extended attributes were changed
 *
 * Reports FS_ATTRIB for @dentry through fsnotify_dentry().
 */
static inline void fsnotify_xattr(struct dentry *dentry)
{
        fsnotify_dentry(dentry, FS_ATTRIB);
}

/*
 * fsnotify_change - notify_change event: translate the ATTR_* bits in
 * @ia_valid into an fsnotify event mask and report it on @dentry.
 *
 *   ATTR_UID, ATTR_GID, ATTR_MODE   -> FS_ATTRIB
 *   ATTR_SIZE                       -> FS_MODIFY
 *   ATTR_ATIME and ATTR_MTIME both  -> FS_ATTRIB
 *   ATTR_ATIME alone                -> FS_ACCESS
 *   ATTR_MTIME alone                -> FS_MODIFY
 *
 * Nothing is reported when none of these bits are set.
 */
static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
{
        __u32 events = 0;

        if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
                events |= FS_ATTRIB;
        if (ia_valid & ATTR_SIZE)
                events |= FS_MODIFY;

        switch (ia_valid & (ATTR_ATIME | ATTR_MTIME)) {
        case ATTR_ATIME | ATTR_MTIME:
                /* both times implies a utime(s) call */
                events |= FS_ATTRIB;
                break;
        case ATTR_ATIME:
                events |= FS_ACCESS;
                break;
        case ATTR_MTIME:
                events |= FS_MODIFY;
                break;
        default:
                break;
        }

        if (!events)
                return;

        fsnotify_dentry(dentry, events);
}

#endif        /* _LINUX_FS_NOTIFY_H */
























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGALLOC_H
#define _ASM_X86_PGALLOC_H

#include <linux/threads.h>
#include <linux/mm.h>                /* for struct page */
#include <linux/pagemap.h>

#define __HAVE_ARCH_PTE_ALLOC_ONE
#define __HAVE_ARCH_PGD_FREE
#include <asm-generic/pgalloc.h>

/* Default pgd-alloc hook: does nothing with @mm and always returns 0. */
static inline int  __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
/*
 * Without CONFIG_PARAVIRT_XXL, paravirt_pgd_alloc() maps to
 * __paravirt_pgd_alloc() and every other paravirt page-table hook below is
 * an empty inline stub, so callers can invoke them unconditionally.
 */
#define paravirt_pgd_alloc(mm)        __paravirt_pgd_alloc(mm)
static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)        {}
static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)        {}
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
                                            unsigned long start, unsigned long count) {}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)        {}
static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)        {}
static inline void paravirt_release_pte(unsigned long pfn) {}
static inline void paravirt_release_pmd(unsigned long pfn) {}
static inline void paravirt_release_pud(unsigned long pfn) {}
static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif

/*
 * Flags to use when allocating a user page table page.
 */
extern gfp_t __userpte_alloc_gfp;

#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
 * Instead of one PGD, we acquire two PGDs.  Being order-1, it is
 * both 8k in size and 8k-aligned.  That lets us just flip bit 12
 * in a pointer to swap between the two 4k halves.
 */
#define PGD_ALLOCATION_ORDER 1
#else
#define PGD_ALLOCATION_ORDER 0
#endif

/*
 * Allocate and free page tables.
 */
extern pgd_t *pgd_alloc(struct mm_struct *);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);

extern pgtable_t pte_alloc_one(struct mm_struct *);

extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);

/*
 * Forwards @tlb and @pte to ___pte_free_tlb(). @address is part of the
 * signature but is not used here.
 */
static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
                                  unsigned long address)
{
        ___pte_free_tlb(tlb, pte);
}

/*
 * Point the PMD entry @pmd at the PTE table @pte.
 *
 * paravirt_alloc_pte() is told the table's page frame number first; the
 * entry is then written with set_pmd() using the _PAGE_TABLE bits.
 */
static inline void pmd_populate_kernel(struct mm_struct *mm,
                                       pmd_t *pmd, pte_t *pte)
{
        unsigned long pte_pa = __pa(pte);

        paravirt_alloc_pte(mm, pte_pa >> PAGE_SHIFT);
        set_pmd(pmd, __pmd(pte_pa | _PAGE_TABLE));
}

/*
 * Same as pmd_populate_kernel(), except that the entry is written with
 * set_pmd_safe() instead of set_pmd().
 */
static inline void pmd_populate_kernel_safe(struct mm_struct *mm,
                                            pmd_t *pmd, pte_t *pte)
{
        unsigned long pte_pa = __pa(pte);

        paravirt_alloc_pte(mm, pte_pa >> PAGE_SHIFT);
        set_pmd_safe(pmd, __pmd(pte_pa | _PAGE_TABLE));
}

/*
 * Point the PMD entry @pmd at the PTE table held in the page @pte.
 *
 * The page frame number comes from page_to_pfn(); it is passed to
 * paravirt_alloc_pte() and then shifted up by PAGE_SHIFT and combined with
 * _PAGE_TABLE to form the entry written by set_pmd().
 */
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
                                struct page *pte)
{
        unsigned long pfn = page_to_pfn(pte);
        pteval_t entry;

        paravirt_alloc_pte(mm, pfn);

        /* widen before shifting so the frame number is not truncated */
        entry = ((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE;
        set_pmd(pmd, __pmd(entry));
}

#define pmd_pgtable(pmd) pmd_page(pmd)

#if CONFIG_PGTABLE_LEVELS > 2
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);

/*
 * Forwards @tlb and @pmd to ___pmd_free_tlb(). @address is part of the
 * signature but is not used here.
 */
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
                                  unsigned long address)
{
        ___pmd_free_tlb(tlb, pmd);
}

#ifdef CONFIG_X86_PAE
extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
#else        /* !CONFIG_X86_PAE */
/*
 * Point the PUD entry @pud at the PMD table @pmd (non-PAE variant).
 *
 * paravirt_alloc_pmd() is told the table's page frame number first; the
 * entry is then written with set_pud() using the _PAGE_TABLE bits.
 */
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
        unsigned long pmd_pa = __pa(pmd);

        paravirt_alloc_pmd(mm, pmd_pa >> PAGE_SHIFT);
        set_pud(pud, __pud(_PAGE_TABLE | pmd_pa));
}

/*
 * Same as pud_populate(), except that the entry is written with
 * set_pud_safe() instead of set_pud().
 */
static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
        unsigned long pmd_pa = __pa(pmd);

        paravirt_alloc_pmd(mm, pmd_pa >> PAGE_SHIFT);
        set_pud_safe(pud, __pud(_PAGE_TABLE | pmd_pa));
}
#endif        /* CONFIG_X86_PAE */

#if CONFIG_PGTABLE_LEVELS > 3
/*
 * Point the P4D entry @p4d at the PUD table @pud.
 *
 * paravirt_alloc_pud() is told the table's page frame number first; the
 * entry is then written with set_p4d() using the _PAGE_TABLE bits.
 */
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
        unsigned long pud_pa = __pa(pud);

        paravirt_alloc_pud(mm, pud_pa >> PAGE_SHIFT);
        set_p4d(p4d, __p4d(_PAGE_TABLE | pud_pa));
}

/*
 * Same as p4d_populate(), except that the entry is written with
 * set_p4d_safe() instead of set_p4d().
 */
static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
        unsigned long pud_pa = __pa(pud);

        paravirt_alloc_pud(mm, pud_pa >> PAGE_SHIFT);
        set_p4d_safe(p4d, __p4d(_PAGE_TABLE | pud_pa));
}

extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);

/*
 * Forwards @tlb and @pud to ___pud_free_tlb(). @address is part of the
 * signature but is not used here.
 */
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
                                  unsigned long address)
{
        ___pud_free_tlb(tlb, pud);
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * Point the PGD entry @pgd at the P4D table @p4d.
 *
 * Does nothing unless pgtable_l5_enabled() is true. Otherwise
 * paravirt_alloc_p4d() is told the table's page frame number and the entry
 * is written with set_pgd() using the _PAGE_TABLE bits.
 */
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
        if (pgtable_l5_enabled()) {
                unsigned long p4d_pa = __pa(p4d);

                paravirt_alloc_p4d(mm, p4d_pa >> PAGE_SHIFT);
                set_pgd(pgd, __pgd(_PAGE_TABLE | p4d_pa));
        }
}

/*
 * Same as pgd_populate(), except that the entry is written with
 * set_pgd_safe() instead of set_pgd(). Likewise a no-op unless
 * pgtable_l5_enabled() is true.
 */
static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
        if (pgtable_l5_enabled()) {
                unsigned long p4d_pa = __pa(p4d);

                paravirt_alloc_p4d(mm, p4d_pa >> PAGE_SHIFT);
                set_pgd_safe(pgd, __pgd(_PAGE_TABLE | p4d_pa));
        }
}

/*
 * Allocate one zeroed page for use as a P4D table.
 *
 * The allocation uses GFP_KERNEL_ACCOUNT, with __GFP_ACCOUNT cleared when
 * @mm is init_mm. @addr is part of the signature but is not used here.
 *
 * Returns whatever get_zeroed_page() returned, cast to p4d_t *.
 */
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
{
        const gfp_t flags = (mm == &init_mm) ?
                                (GFP_KERNEL_ACCOUNT & ~__GFP_ACCOUNT) :
                                GFP_KERNEL_ACCOUNT;

        return (p4d_t *)get_zeroed_page(flags);
}

/*
 * Release a P4D table page.
 *
 * Does nothing unless pgtable_l5_enabled() is true. Otherwise @p4d must be
 * page-aligned (BUG_ON() fires if it is not) and is handed to free_page().
 * @mm is part of the signature but is not used here.
 */
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
        if (pgtable_l5_enabled()) {
                unsigned long table = (unsigned long)p4d;

                BUG_ON(table & (PAGE_SIZE-1));
                free_page(table);
        }
}

extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);

/*
 * Forwards @tlb and @p4d to ___p4d_free_tlb(), but only when
 * pgtable_l5_enabled() is true. @address is part of the signature but is
 * not used here.
 */
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
                                  unsigned long address)
{
        if (!pgtable_l5_enabled())
                return;

        ___p4d_free_tlb(tlb, p4d);
}

#endif        /* CONFIG_PGTABLE_LEVELS > 4 */
#endif        /* CONFIG_PGTABLE_LEVELS > 3 */
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#endif /* _ASM_X86_PGALLOC_H */

openat(0xffffffffffffff9c, &(0x7f0000000080)='./file0\x00', 0x105142, 0x0)
openat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
io_uring_register$IORING_UNREGISTER_FILES(0xffffffffffffffff, 0x3, 0x0, 0x0)
perf_event_open(&(0x7f0000000480)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4307, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
r0 = syz_io_uring_setup(0x3875, &(0x7f00000001c0), &(0x7f0000ffb000/0x3000)=nil, &(0x7f0000ffc000/0x4000)=nil, &(0x7f0000000040)=<r1=>0x0, &(0x7f0000000100)=<r2=>0x0)
syz_io_uring_submit(r1, r2, &(0x7f0000000000)=@IORING_OP_OPENAT2={0x1c, 0x0, 0x0, 0xffffffffffffff9c, &(0x7f0000000080), &(0x7f00000000c0)='./file0\x00', 0x18}, 0x0)
mmap$IORING_OFF_SQ_RING(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x100000e, 0x13, r0, 0x0)
r3 = mmap$IORING_OFF_CQ_RING(&(0x7f0000ffe000/0x2000)=nil, 0x2000, 0xf, 0x13, r0, 0x8000000)
syz_io_uring_submit(r3, r2, &(0x7f00000002c0)=@IORING_OP_READ=@pass_buffer={0x16, 0x4, 0x0, @fd, 0x0, 0x0}, 0x80000001)
io_uring_enter(r0, 0x58ab, 0x0, 0x0, 0x0, 0x0)

__activate_page---of 26
__page_cache_release---of 26
__pagevec_lru_add---of 1
__pagevec_lru_add_fn49%of 54
__pagevec_release---of 15
__put_compound_page---of 5
__put_page---of 4
__traceiter_mm_lru_activate---of 4
__traceiter_mm_lru_insertion---of 4
deactivate_file_page---of 23
deactivate_page---of 30
get_kernel_page---of 8
get_kernel_pages---of 12
lru_add_drain54%of 13
lru_add_drain_all---of 15
lru_add_drain_cpu25%of 28
lru_add_drain_cpu_zone---of 13
lru_add_drain_per_cpu---of 13
lru_cache_add---of 32
lru_cache_add_inactive_or_unevictable---of 10
lru_deactivate_file_fn---of 31
lru_deactivate_fn---of 20
lru_lazyfree_fn---of 27
lru_note_cost---of 28
lru_note_cost_page---of 3
mark_page_accessed---of 71
mark_page_lazyfree---of 37
pagevec_lookup_entries---of 1
pagevec_lookup_range---of 1
pagevec_lookup_range_nr_tag---of 1
pagevec_lookup_range_tag---of 1
pagevec_lru_move_fn89%of 9
pagevec_move_tail---of 1
pagevec_move_tail_fn---of 30
pagevec_remove_exceptionals---of 6
perf_trace_mm_lru_activate---of 6
perf_trace_mm_lru_insertion---of 24
put_pages_list---of 7
release_pages18%of 52
rotate_reclaimable_page---of 37
trace_event_raw_event_mm_lru_activate---of 10
trace_event_raw_event_mm_lru_insertion---of 28
trace_raw_output_mm_lru_activate---of 4
trace_raw_output_mm_lru_insertion---of 10

__check_sticky---of 5
__ia32_sys_link---of 1
__ia32_sys_linkat---of 1
__ia32_sys_mkdir---of 1
__ia32_sys_mkdirat---of 1
__ia32_sys_mknod---of 1
__ia32_sys_mknodat---of 1
__ia32_sys_rename---of 1
__ia32_sys_renameat---of 1
__ia32_sys_renameat2---of 1
__ia32_sys_rmdir---of 5
__ia32_sys_symlink---of 1
__ia32_sys_symlinkat---of 1
__ia32_sys_unlink---of 5
__ia32_sys_unlinkat---of 6
__legitimize_path50%of 8
__lookup_hash---of 6
__lookup_slow---of 13
__page_symlink---of 7
__traverse_mounts---of 31
__x64_sys_link---of 1
__x64_sys_linkat---of 1
__x64_sys_mkdir---of 1
__x64_sys_mkdirat---of 1
__x64_sys_mknod---of 1
__x64_sys_mknodat---of 1
__x64_sys_rename---of 1
__x64_sys_renameat---of 1
__x64_sys_renameat2---of 1
__x64_sys_rmdir---of 5
__x64_sys_symlink---of 1
__x64_sys_symlinkat---of 1
__x64_sys_unlink---of 5
__x64_sys_unlinkat---of 6
choose_mountpoint_rcu---of 11
complete_walk62%of 13
do_file_open_root---of 15
do_filp_open50%of 12
do_linkat---of 25
do_mkdirat---of 8
do_mknodat---of 15
do_renameat2---of 57
do_rmdir---of 19
do_symlinkat---of 9
do_unlinkat---of 29
done_path_create---of 1
filename_create---of 14
filename_lookup---of 20
filename_parentat---of 17
follow_down---of 6
follow_down_one---of 6
follow_up---of 7
full_name_hash---of 6
generic_permission20%of 30
getname60%of 5
getname_flags---of 5
getname_flags.part.032%of 19
getname_kernel---of 12
handle_dots.part.0---of 83
hashlen_string---of 4
inode_permission40%of 15
kern_path---of 1
kern_path_create---of 1
kern_path_locked---of 7
legitimize_links20%of 15
legitimize_root60%of 5
link_path_walk.part.056%of 38
lock_rename---of 7
lookup_dcache---of 7
lookup_fast34%of 21
lookup_one_len---of 7
lookup_one_len_common---of 13
lookup_one_len_unlocked---of 5
lookup_open.isra.0---of 64
lookup_positive_unlocked---of 5
may_delete---of 26
may_linkat---of 10
may_open39%of 18
may_open_dev---of 3
nd_alloc_stack---of 5
nd_jump_link---of 7
nd_jump_root---of 15
page_get_link---of 17
page_put_link---of 6
page_readlink---of 3
page_symlink---of 1
path_get100%of 2
path_init29%of 71
path_lookupat---of 27
path_openat22%of 146
path_parentat---of 7
path_pts---of 7
path_put---of 1
putname67%of 6
readlink_copy---of 8
set_root---of 16
step_into9%of 99
terminate_walk30%of 17
try_lookup_one_len---of 6
try_to_unlazy41%of 22
try_to_unlazy_next---of 32
unlock_rename---of 2
user_path_at_empty---of 5
user_path_create---of 5
vfs_create---of 18
vfs_get_link---of 5
vfs_link---of 40
vfs_mkdir---of 20
vfs_mknod---of 23
vfs_mkobj---of 15
vfs_path_lookup---of 1
vfs_readlink---of 11
vfs_rename---of 111
vfs_rmdir---of 4
vfs_rmdir.part.0---of 14
vfs_symlink---of 16
vfs_tmpfile---of 13
vfs_unlink---of 27
walk_component20%of 21

_find_next_bit.constprop.084%of 12
find_first_bit80%of 5
find_first_zero_bit---of 5
find_last_bit84%of 6
find_next_and_bit---of 1
find_next_bit100%of 1
find_next_clump8---of 3
find_next_zero_bit100%of 1

__get_user_pages---of 124
__get_user_pages_remote---of 59
__mm_populate56%of 18
fixup_user_fault---of 22
follow_page---of 19
follow_page_mask---of 130
get_dump_page---of 52
get_user_pages---of 7
get_user_pages_fast---of 6
get_user_pages_fast_only---of 1
get_user_pages_locked---of 52
get_user_pages_remote---of 3
get_user_pages_unlocked---of 50
hpage_pincount_add---of 11
hpage_pincount_sub---of 11
internal_get_user_pages_fast---of 191
is_valid_gup_flags---of 6
pin_user_pages---of 4
pin_user_pages_fast---of 4
pin_user_pages_fast_only---of 4
pin_user_pages_locked---of 50
pin_user_pages_remote---of 4
pin_user_pages_unlocked---of 4
populate_vma_page_range---of 19
put_compound_head---of 10
put_page_refs---of 11
try_grab_compound_head---of 39
try_grab_page---of 22
unpin_user_page---of 10
unpin_user_pages---of 13
unpin_user_pages_dirty_lock---of 12

__do_sys_io_uring_enter11%of 95
__do_sys_io_uring_register---of 214
__ia32_sys_io_uring_enter---of 1
__ia32_sys_io_uring_register---of 1
__ia32_sys_io_uring_setup---of 1
__io_arm_poll_handler---of 19
__io_compat_recvmsg_copy_hdr---of 13
__io_cqring_overflow_flush---of 25
__io_file_supports_nowait---of 17
__io_free_req---of 10
__io_openat_prep84%of 12
__io_poll_execute---of 11
__io_prep_linked_timeout---of 8
__io_queue_proc---of 11
__io_queue_sqe12%of 42
__io_recvmsg_copy_hdr---of 7
__io_req_find_next---of 10
__io_sqe_files_unregister---of 5
__io_sqe_files_update---of 27
__io_uring_add_tctx_node60%of 15
__io_uring_cancel---of 1
__io_uring_free---of 7
__traceiter_io_uring_complete---of 4
__traceiter_io_uring_cqring_wait---of 4
__traceiter_io_uring_create---of 4
__traceiter_io_uring_defer---of 4
__traceiter_io_uring_fail_link---of 4
__traceiter_io_uring_file_get---of 4
__traceiter_io_uring_link---of 4
__traceiter_io_uring_poll_arm---of 4
__traceiter_io_uring_poll_wake---of 4
__traceiter_io_uring_queue_async_work---of 4
__traceiter_io_uring_register---of 4
__traceiter_io_uring_submit_sqe---of 4
__traceiter_io_uring_task_add---of 4
__traceiter_io_uring_task_run---of 4
__x64_sys_io_uring_enter100%of 1
__x64_sys_io_uring_register---of 1
__x64_sys_io_uring_setup100%of 1
get_order100%of 1
io_apoll_task_func---of 10
io_async_buf_func---of 4
io_async_queue_proc---of 1
io_buffer_select.part.0---of 15
io_buffer_unmap---of 10
io_cancel_cb---of 4
io_cancel_ctx_cb---of 1
io_cancel_task_cb---of 1
io_clean_op---of 23
io_complete_rw---of 12
io_complete_rw_iopoll---of 8
io_connect---of 22
io_copy_iov---of 6
io_cqring_ev_posted29%of 14
io_cqring_event_overflow---of 6
io_cqring_overflow_flush---of 7
io_disarm_next---of 42
io_dismantle_req56%of 9
io_do_iopoll---of 58
io_drain_req---of 36
io_fallback_req_func---of 22
io_file_get_normal---of 14
io_fill_cqe_aux---of 14
io_fill_cqe_req.constprop.0.isra.0---of 13
io_fixed_file_set---of 7
io_flush_timeouts---of 7
io_free_req_work---of 4
io_import_iovec---of 59
io_install_fixed_file.isra.0---of 27
io_iopoll_try_reap_events.part.0---of 7
io_is_uring_fops---of 1
io_issue_sqe2%of 416
io_kill_timeout.part.0---of 7
io_kill_timeouts---of 21
io_link_timeout_fn---of 7
io_match_task_safe---of 13
io_mem_free.part.0---of 9
io_openat262%of 34
io_poll_add---of 9
io_poll_check_events---of 26
io_poll_find.isra.0---of 8
io_poll_get_ownership_slowpath---of 5
io_poll_queue_proc---of 1
io_poll_remove_all---of 17
io_poll_remove_entries---of 21
io_poll_task_func---of 9
io_poll_wake---of 15
io_prep_async_link---of 6
io_prep_async_work---of 15
io_prep_rw---of 64
io_put_sq_data---of 13
io_queue_async_work---of 20
io_queue_deferred---of 7
io_queue_linked_timeout---of 13
io_queue_rsrc_removal.isra.0---of 3
io_read---of 55
io_recv---of 42
io_recvmsg---of 50
io_register_rsrc---of 9
io_register_rsrc_update---of 33
io_req_complete_post---of 41
io_req_free_batch35%of 20
io_req_io_end---of 13
io_req_prep_async---of 26
io_req_prep_async.cold---of 1
io_req_rw_complete---of 23
io_req_task_cancel---of 3
io_req_task_link_timeout---of 13
io_req_task_submit---of 5
io_req_task_timeout---of 1
io_req_task_work_add---of 14
io_ring_ctx_ref_free---of 1
io_ring_ctx_wait_and_kill---of 11
io_ring_ctx_wait_and_kill.cold---of 1
io_ring_exit_work---of 76
io_rsrc_buf_put---of 1
io_rsrc_data_alloc---of 17
io_rsrc_data_free---of 5
io_rsrc_file_put---of 1
io_rsrc_node_ref_zero---of 7
io_rsrc_node_switch50%of 8
io_rsrc_node_switch_start.part.058%of 7
io_rsrc_put_work---of 17
io_rsrc_ref_quiesce.part.0.constprop.0---of 8
io_run_task_work40%of 10
io_run_task_work_sig---of 7
io_rw_should_reissue---of 10
io_send---of 30
io_sendmsg---of 34
io_setup_async_msg---of 10
io_setup_async_rw---of 16
io_sq_thread---of 57
io_sq_thread_finish---of 4
io_sq_thread_park---of 4
io_sq_thread_unpark---of 5
io_sqd_handle_event---of 9
io_sqe_buffer_register---of 63
io_sqe_buffers_register---of 28
io_sqe_files_register---of 24
io_submit_flush_completions48%of 42
io_submit_sqes14%of 361
io_submit_sqes.cold---of 1
io_tctx_exit_cb---of 4
io_timeout_extract---of 6
io_timeout_fn---of 1
io_timeout_prep---of 24
io_try_cancel_userdata---of 26
io_uring_alloc_task_context67%of 12
io_uring_cancel_generic---of 8
io_uring_cancel_generic.cold---of 27
io_uring_del_tctx_node---of 9
io_uring_drop_tctx_refs---of 6
io_uring_mmap74%of 19
io_uring_poll---of 9
io_uring_release---of 1
io_uring_setup28%of 120
io_uring_show_fdinfo---of 38
io_uring_try_cancel_requests---of 41
io_wake_function---of 5
io_wq_free_work---of 11
io_wq_submit_work---of 17
io_write---of 55
kiocb_done---of 45
kiocb_end_write.part.0.isra.0---of 10
loop_rw_iter---of 18
percpu_ref_put_many53%of 17
percpu_ref_tryget_many48%of 19
perf_trace_io_uring_complete---of 6
perf_trace_io_uring_cqring_wait---of 6
perf_trace_io_uring_create---of 6
perf_trace_io_uring_defer---of 6
perf_trace_io_uring_fail_link---of 6
perf_trace_io_uring_file_get---of 6
perf_trace_io_uring_link---of 6
perf_trace_io_uring_poll_arm---of 6
perf_trace_io_uring_poll_wake---of 6
perf_trace_io_uring_queue_async_work---of 6
perf_trace_io_uring_register---of 6
perf_trace_io_uring_submit_sqe---of 6
perf_trace_io_uring_task_add---of 6
perf_trace_io_uring_task_run---of 6
tctx_task_work---of 47
tctx_task_work.cold---of 1
trace_event_raw_event_io_uring_complete---of 10
trace_event_raw_event_io_uring_cqring_wait---of 10
trace_event_raw_event_io_uring_create---of 10
trace_event_raw_event_io_uring_defer---of 10
trace_event_raw_event_io_uring_fail_link---of 10
trace_event_raw_event_io_uring_file_get---of 10
trace_event_raw_event_io_uring_link---of 10
trace_event_raw_event_io_uring_poll_arm---of 10
trace_event_raw_event_io_uring_poll_wake---of 10
trace_event_raw_event_io_uring_queue_async_work---of 10
trace_event_raw_event_io_uring_register---of 10
trace_event_raw_event_io_uring_submit_sqe---of 10
trace_event_raw_event_io_uring_task_add---of 10
trace_event_raw_event_io_uring_task_run---of 10
trace_raw_output_io_uring_complete---of 4
trace_raw_output_io_uring_cqring_wait---of 4
trace_raw_output_io_uring_create---of 4
trace_raw_output_io_uring_defer---of 4
trace_raw_output_io_uring_fail_link---of 4
trace_raw_output_io_uring_file_get---of 4
trace_raw_output_io_uring_link---of 4
trace_raw_output_io_uring_poll_arm---of 4
trace_raw_output_io_uring_poll_wake---of 4
trace_raw_output_io_uring_queue_async_work---of 5
trace_raw_output_io_uring_register---of 4
trace_raw_output_io_uring_submit_sqe---of 4
trace_raw_output_io_uring_task_add---of 4
trace_raw_output_io_uring_task_run---of 4

call_blocking_lsm_notifier---of 1
get_order---of 1
inode_free_by_rcu---of 1
lsm_append.constprop.0---of 10
lsm_inode_alloc---of 5
register_blocking_lsm_notifier---of 1
security_add_mnt_opt---of 4
security_audit_rule_free---of 2
security_audit_rule_init---of 4
security_audit_rule_known---of 4
security_audit_rule_match---of 4
security_binder_set_context_mgr---of 4
security_binder_transaction---of 4
security_binder_transfer_binder---of 4
security_binder_transfer_file---of 4
security_bprm_check---of 5
security_bprm_committed_creds---of 2
security_bprm_committing_creds---of 2
security_bprm_creds_for_exec---of 4
security_bprm_creds_from_file---of 4
security_capable100%of 4
security_capget---of 4
security_capset---of 4
security_cred_alloc_blank---of 9
security_cred_free---of 5
security_cred_getsecid---of 2
security_d_instantiate50%of 4
security_dentry_create_files_as---of 4
security_dentry_init_security---of 4
security_file_alloc78%of 9
security_file_fcntl---of 4
security_file_free---of 4
security_file_ioctl---of 4
security_file_ioctl_compat---of 4
security_file_lock---of 4
security_file_mprotect---of 5
security_file_open50%of 20
security_file_permission---of 23
security_file_receive---of 4
security_file_send_sigiotask---of 4
security_file_set_fowner---of 2
security_free_mnt_opts---of 5
security_fs_context_dup---of 4
security_fs_context_parse_param---of 5
security_getprocattr---of 6
security_inet_conn_established---of 2
security_inet_conn_request---of 4
security_inet_csk_clone---of 2
security_inode_alloc---of 9
security_inode_copy_up---of 4
security_inode_copy_up_xattr---of 5
security_inode_create---of 6
security_inode_follow_link---of 6
security_inode_free---of 4
security_inode_getattr---of 5
security_inode_getsecctx---of 4
security_inode_getsecid---of 2
security_inode_getsecurity---of 6
security_inode_getxattr---of 5
security_inode_init_security---of 15
security_inode_invalidate_secctx---of 2
security_inode_killpriv---of 4
security_inode_link---of 5
security_inode_listsecurity---of 5
security_inode_listxattr---of 5
security_inode_mkdir---of 6
security_inode_mknod---of 6
security_inode_need_killpriv---of 4
security_inode_notifysecctx---of 4
security_inode_permission100%of 5
security_inode_post_setxattr---of 3
security_inode_readlink---of 5
security_inode_removexattr---of 8
security_inode_rename---of 12
security_inode_rmdir---of 5
security_inode_setattr---of 6
security_inode_setsecctx---of 4
security_inode_setsecurity---of 5
security_inode_setxattr---of 8
security_inode_symlink---of 5
security_inode_unlink---of 5
security_ipc_getsecid---of 2
security_ipc_permission---of 5
security_ismaclabel---of 4
security_kernel_act_as---of 4
security_kernel_create_files_as---of 4
security_kernel_load_data---of 5
security_kernel_module_request---of 5
security_kernel_post_load_data---of 5
security_kernel_post_read_file---of 5
security_kernel_read_file---of 5
security_kernfs_init_security---of 4
security_key_alloc---of 4
security_key_free---of 2
security_key_getsecurity---of 4
security_key_permission---of 4
security_locked_down---of 4
security_mmap_addr100%of 4
security_mmap_file70%of 10
security_move_mount---of 4
security_msg_msg_alloc---of 11
security_msg_msg_free---of 3
security_msg_queue_alloc---of 11
security_msg_queue_associate---of 4
security_msg_queue_free---of 3
security_msg_queue_msgctl---of 4
security_msg_queue_msgrcv---of 4
security_msg_queue_msgsnd---of 4
security_netlink_send---of 4
security_old_inode_init_security---of 5
security_path_notify---of 4
security_perf_event_alloc---of 4
security_perf_event_free---of 2
security_perf_event_open---of 4
security_perf_event_read---of 4
security_perf_event_write---of 4
security_prepare_creds---of 9
security_ptrace_access_check---of 4
security_ptrace_traceme---of 4
security_quota_on---of 4
security_quotactl---of 4
security_release_secctx---of 2
security_req_classify_flow---of 2
security_sb_alloc---of 4
security_sb_clone_mnt_opts---of 4
security_sb_eat_lsm_opts---of 4
security_sb_free---of 2
security_sb_kern_mount---of 4
security_sb_mount---of 4
security_sb_pivotroot---of 4
security_sb_remount---of 4
security_sb_set_mnt_opts---of 5
security_sb_show_options---of 4
security_sb_statfs---of 4
security_sb_umount---of 4
security_sctp_assoc_request---of 4
security_sctp_bind_connect---of 4
security_sctp_sk_clone---of 2
security_secctx_to_secid---of 4
security_secid_to_secctx---of 4
security_secmark_refcount_dec---of 2
security_secmark_refcount_inc---of 2
security_secmark_relabel_packet---of 4
security_sem_alloc---of 11
security_sem_associate---of 4
security_sem_free---of 3
security_sem_semctl---of 4
security_sem_semop---of 4
security_setprocattr---of 6
security_settime64---of 4
security_shm_alloc---of 11
security_shm_associate---of 4
security_shm_free---of 3
security_shm_shmat---of 4
security_shm_shmctl---of 4
security_sk_alloc---of 4
security_sk_classify_flow---of 3
security_sk_clone---of 2
security_sk_free---of 2
security_sock_graft---of 2
security_sock_rcv_skb---of 4
security_socket_accept---of 4
security_socket_bind---of 4
security_socket_connect---of 4
security_socket_create---of 4
security_socket_getpeername---of 4
security_socket_getpeersec_dgram---of 4
security_socket_getpeersec_stream---of 4
security_socket_getsockname---of 4
security_socket_getsockopt---of 4
security_socket_listen---of 4
security_socket_post_create---of 4
security_socket_recvmsg---of 4
security_socket_sendmsg---of 4
security_socket_setsockopt---of 4
security_socket_shutdown---of 4
security_socket_socketpair---of 4
security_syslog---of 4
security_task_alloc---of 11
security_task_fix_setgid---of 4
security_task_fix_setuid---of 4
security_task_free---of 3
security_task_getioprio---of 4
security_task_getpgid---of 4
security_task_getscheduler---of 4
security_task_getsecid---of 2
security_task_getsid---of 4
security_task_kill---of 4
security_task_movememory---of 4
security_task_prctl---of 5
security_task_prlimit---of 4
security_task_setioprio---of 4
security_task_setnice---of 4
security_task_setpgid---of 4
security_task_setrlimit---of 4
security_task_setscheduler---of 4
security_task_to_inode---of 2
security_transfer_creds---of 2
security_tun_dev_alloc_security---of 4
security_tun_dev_attach---of 4
security_tun_dev_attach_queue---of 4
security_tun_dev_create---of 4
security_tun_dev_free_security---of 2
security_tun_dev_open---of 4
security_unix_may_send---of 4
security_unix_stream_connect---of 4
security_vm_enough_memory_mm---of 4
unregister_blocking_lsm_notifier---of 1

__devm_release_region---of 3
__devm_request_region---of 5
__find_resource---of 31
__insert_resource---of 27
__release_child_resources---of 2
__release_region---of 13
__release_resource---of 12
__request_region---of 12
__request_region.cold---of 1
__request_resource---of 9
adjust_resource---of 14
alloc_resource---of 5
allocate_resource---of 7
devm_region_match---of 4
devm_region_release---of 1
devm_release_resource---of 2
devm_request_resource---of 4
devm_request_resource.cold---of 1
devm_resource_match---of 1
devm_resource_release---of 1
find_next_iomem_res67%of 15
free_resource---of 11
insert_resource---of 2
insert_resource_conflict---of 1
insert_resource_expand_to_fit---of 5
insert_resource_expand_to_fit.cold---of 5
iomem_is_exclusive---of 11
iomem_map_sanity_check---of 11
iomem_map_sanity_check.cold---of 1
lookup_resource---of 4
page_is_ram---of 4
r_next---of 5
r_show---of 8
r_start---of 7
r_stop---of 1
reallocate_resource---of 16
region_intersects---of 18
release_child_resources---of 3
release_resource---of 1
remove_resource---of 1
request_resource---of 11
request_resource_conflict---of 9
resource_alignment---of 5
resource_list_create_entry---of 5
resource_list_free---of 2
simple_align_resource---of 1
walk_iomem_res_desc---of 5
walk_mem_res---of 5
walk_system_ram_range100%of 7
walk_system_ram_res---of 5

__ia32_compat_sys_old_shmctl---of 1
__ia32_compat_sys_shmat---of 3
__ia32_compat_sys_shmctl---of 1
__ia32_sys_shmat---of 3
__ia32_sys_shmctl---of 1
__ia32_sys_shmdt---of 1
__ia32_sys_shmget---of 1
__shm_open.isra.0---of 41
__x64_sys_shmat---of 3
__x64_sys_shmctl---of 1
__x64_sys_shmdt---of 1
__x64_sys_shmget---of 1
compat_ksys_old_shmctl---of 1
compat_ksys_shmctl---of 26
copy_compat_shmid_to_user---of 4
do_shm_rmid---of 11
do_shmat---of 91
exit_shm---of 44
is_file_shm_hugepages100%of 1
ksys_shmctl.constprop.0---of 18
ksys_shmdt---of 16
ksys_shmget---of 1
newseg---of 45
shm_add_rss_swap.isra.0---of 4
shm_close---of 37
shm_destroy---of 31
shm_destroy_orphaned---of 3
shm_exit_ns---of 1
shm_fallocate---of 3
shm_fault---of 1
shm_fsync---of 3
shm_get_policy---of 5
shm_get_unmapped_area---of 1
shm_init_ns---of 1
shm_mmap---of 7
shm_more_checks---of 2
shm_open---of 2
shm_pagesize---of 3
shm_rcu_free---of 1
shm_release---of 1
shm_set_policy---of 3
shm_split---of 3
shm_try_destroy_orphaned---of 13
shmctl_do_lock---of 41
shmctl_down---of 22
shmctl_ipc_info---of 7
shmctl_shm_info.part.0---of 10
shmctl_stat---of 29
sysvipc_shm_proc_show---of 1

__traceiter_selinux_audited---of 4
avc_alloc_node---of 34
avc_audit_post_callback---of 30
avc_audit_pre_callback---of 11
avc_compute_av.isra.0---of 28
avc_compute_av.isra.0.cold---of 1
avc_copy_xperms_decision---of 6
avc_denied---of 6
avc_disable---of 2
avc_flush---of 18
avc_get_cache_threshold---of 1
avc_get_hash_stats---of 19
avc_has_extended_perms---of 64
avc_has_perm72%of 7
avc_has_perm_flags---of 8
avc_has_perm_noaudit58%of 21
avc_lookup89%of 9
avc_node_delete---of 3
avc_node_free---of 1
avc_node_kill---of 1
avc_node_replace---of 3
avc_policy_seqno100%of 1
avc_set_cache_threshold---of 1
avc_ss_reset---of 7
avc_update_node.part.0.isra.0---of 26
avc_xperms_decision_alloc---of 9
avc_xperms_decision_free---of 7
avc_xperms_free---of 4
avc_xperms_populate.part.0---of 7
perf_trace_selinux_audited---of 9
selinux_avc_init---of 3
slow_avc_audit---of 6
trace_event_get_offsets_selinux_audited.constprop.0---of 4
trace_event_raw_event_selinux_audited---of 13
trace_raw_output_selinux_audited---of 4

_copy_from_user65%of 14
_copy_to_user67%of 12
check_zeroed_user---of 21

__create_xol_area---of 19
__find_uprobe---of 14
__replace_page---of 64
__update_ref_ctr---of 13
__update_ref_ctr.cold---of 1
__uprobe_register---of 33
__uprobe_register.cold---of 1
__uprobe_unregister---of 12
arch_uprobe_copy_ixol---of 1
arch_uprobe_ignore---of 1
copy_from_page---of 1
copy_to_page---of 1
delayed_uprobe_delete---of 3
dup_xol_work---of 4
dup_xol_work.cold---of 1
filter_chain---of 5
install_breakpoint.isra.0---of 31
is_swbp_insn---of 1
is_trap_insn---of 1
put_uprobe---of 10
register_for_each_vma---of 48
set_orig_insn---of 1
set_swbp---of 1
update_ref_ctr---of 31
update_ref_ctr.cold---of 2
uprobe_apply---of 10
uprobe_clear_state---of 13
uprobe_copy_process---of 16
uprobe_copy_process.cold---of 2
uprobe_deny_signal---of 9
uprobe_dup_mmap---of 2
uprobe_end_dup_mmap---of 9
uprobe_free_utask---of 6
uprobe_get_swbp_addr---of 1
uprobe_get_trap_addr---of 5
uprobe_mmap4%of 56
uprobe_mmap.cold---of 1
uprobe_munmap6%of 18
uprobe_notify_resume---of 138
uprobe_notify_resume.cold---of 4
uprobe_post_sstep_notifier---of 6
uprobe_pre_sstep_notifier---of 7
uprobe_register---of 1
uprobe_register_refctr---of 1
uprobe_start_dup_mmap---of 9
uprobe_unregister---of 3
uprobe_warn.constprop.0---of 1
uprobe_write_opcode---of 77
xol_free_insn_slot---of 10

__ia32_sys_readahead---of 1
__x64_sys_readahead---of 1
do_page_cache_ra---of 5
file_ra_state_init80%of 5
force_page_cache_ra---of 18
ksys_readahead---of 9
ondemand_readahead---of 80
page_cache_async_ra---of 10
page_cache_ra_unbounded---of 22
page_cache_sync_ra---of 7
read_cache_pages---of 15
read_cache_pages_invalidate_page---of 12
read_pages---of 43

__virt_addr_valid54%of 39

arch_mmap_rnd---of 6
arch_pick_mmap_layout---of 11
arch_vma_name---of 1
get_mmap_base---of 11
mmap_address_hint_valid---of 7
mmap_base.isra.0---of 6
pfn_modify_allowed6%of 39
task_size_32bit---of 2
task_size_64bit---of 6
valid_mmap_phys_addr_range---of 1
valid_phys_addr_range---of 4

__get_vma_policy---of 10
__ia32_compat_sys_get_mempolicy---of 5
__ia32_compat_sys_mbind---of 5
__ia32_compat_sys_migrate_pages---of 14
__ia32_compat_sys_set_mempolicy---of 5
__ia32_sys_get_mempolicy---of 1
__ia32_sys_mbind---of 1
__ia32_sys_migrate_pages---of 1
__ia32_sys_set_mempolicy---of 1
__mpol_dup---of 12
__mpol_equal---of 12
__mpol_put---of 2
__nodes_weight.constprop.0---of 1
__x64_sys_get_mempolicy---of 9
__x64_sys_mbind---of 7
__x64_sys_migrate_pages---of 1
__x64_sys_set_mempolicy---of 7
alloc_page_interleave---of 5
alloc_pages_current73%of 11
alloc_pages_vma---of 18
copy_nodes_to_user---of 5
do_get_mempolicy---of 57
do_mbind---of 62
do_migrate_pages---of 3
do_migrate_pages.part.0---of 16
do_set_mempolicy---of 11
get_nodes---of 16
get_task_policy---of 7
huge_node---of 16
init_nodemask_of_mempolicy---of 12
kernel_get_mempolicy---of 8
kernel_mbind---of 7
kernel_migrate_pages---of 58
kernel_set_mempolicy---of 7
mempolicy_nodemask_intersects---of 8
mempolicy_slab_node22%of 14
migrate_to_node---of 6
mpol_free_shared_policy---of 6
mpol_misplaced---of 24
mpol_new---of 16
mpol_new_bind---of 3
mpol_new_interleave---of 3
mpol_new_preferred---of 5
mpol_parse_str---of 56
mpol_put_task_policy---of 2
mpol_rebind_mm---of 8
mpol_rebind_nodemask---of 8
mpol_rebind_preferred---of 9
mpol_rebind_task---of 5
mpol_set_nodemask.part.0---of 13
mpol_set_shared_policy---of 39
mpol_shared_policy_init---of 13
mpol_shared_policy_lookup---of 6
mpol_to_str---of 17
new_page---of 13
numa_default_policy---of 1
numa_map_to_online_node---of 6
offset_il_node---of 5
policy_node50%of 8
policy_nodemask23%of 9
queue_pages_hugetlb---of 30
queue_pages_pte_range---of 30
queue_pages_test_walk---of 16
set_bit---of 1
sp_insert---of 10
sp_lookup.isra.0---of 10
vma_dup_policy50%of 6
vma_migratable---of 12
vma_policy_mof---of 13

__io_wq_cpu_online---of 37
create_io_worker---of 13
create_worker_cb---of 5
create_worker_cont---of 15
io_acct_cancel_pending_work.isra.0---of 19
io_flush_signals---of 5
io_init_new_worker---of 3
io_queue_worker_create---of 27
io_task_work_match---of 5
io_task_worker_match---of 3
io_worker_cancel_cb---of 3
io_worker_handle_work---of 57
io_worker_release---of 5
io_workqueue_create---of 2
io_wq_cancel_cb---of 24
io_wq_cpu_affinity---of 18
io_wq_cpu_offline---of 3
io_wq_cpu_online---of 3
io_wq_create50%of 34
io_wq_enqueue---of 1
io_wq_exit_start---of 1
io_wq_for_each_worker.isra.0---of 16
io_wq_hash_work---of 1
io_wq_max_workers---of 27
io_wq_put_and_exit---of 39
io_wq_work_match_all---of 1
io_wq_work_match_item---of 1
io_wq_worker_cancel---of 9
io_wq_worker_running---of 4
io_wq_worker_sleeping---of 3
io_wq_worker_stopped---of 7
io_wq_worker_wake---of 4
io_wqe_activate_free_worker---of 17
io_wqe_cancel_pending_work---of 5
io_wqe_dec_running---of 5
io_wqe_enqueue---of 38
io_wqe_enqueue.cold---of 1
io_wqe_hash_wake---of 17
io_wqe_worker---of 39

__kthread_bind_mask---of 3
__kthread_cancel_work_sync---of 11
__kthread_create_on_node---of 8
__kthread_create_worker---of 8
__kthread_init_worker---of 1
__kthread_parkme---of 5
__kthread_queue_delayed_work---of 7
__kthread_should_park67%of 3
free_kthread_struct---of 3
kthread---of 8
kthread_bind---of 1
kthread_bind_mask---of 1
kthread_cancel_delayed_work_sync---of 1
kthread_cancel_work_sync---of 1
kthread_create_on_cpu---of 5
kthread_create_on_node---of 1
kthread_create_worker---of 1
kthread_create_worker_on_cpu---of 1
kthread_data---of 3
kthread_delayed_work_timer_fn---of 11
kthread_destroy_worker---of 5
kthread_flush_work---of 15
kthread_flush_work_fn---of 1
kthread_flush_worker---of 1
kthread_freezable_should_stop---of 8
kthread_func---of 5
kthread_insert_work_sanity_check---of 8
kthread_is_per_cpu---of 5
kthread_mod_delayed_work---of 8
kthread_park---of 11
kthread_parkme---of 3
kthread_probe_data---of 5
kthread_queue_delayed_work---of 7
kthread_queue_work---of 9
kthread_set_per_cpu---of 8
kthread_should_park---of 3
kthread_should_stop---of 3
kthread_stop---of 31
kthread_unpark---of 5
kthread_unuse_mm---of 8
kthread_use_mm---of 13
kthread_worker_fn---of 23
kthreadd---of 17
tsk_fork_get_node---of 3

__put_user_ns---of 1
cmp_extents_forward---of 3
cmp_extents_reverse---of 3
cmp_map_id---of 8
create_user_ns---of 15
current_in_userns---of 3
free_user_ns---of 8
from_kgid100%of 1
from_kgid_munged---of 3
from_kprojid---of 1
from_kprojid_munged---of 2
from_kuid100%of 1
from_kuid_munged---of 3
gid_m_show---of 3
gid_m_start---of 5
in_userns---of 3
m_next---of 1
make_kgid---of 1
make_kprojid---of 1
make_kuid---of 1
map_id_range_down---of 12
map_id_up80%of 10
map_write---of 98
map_write.cold---of 1
ns_get_owner---of 7
proc_gid_map_write---of 5
proc_projid_map_write---of 5
proc_setgroups_show---of 2
proc_setgroups_write---of 13
proc_uid_map_write---of 5
projid_m_show---of 3
projid_m_start---of 5
uid_m_show---of 3
uid_m_start---of 7
unshare_userns---of 8
userns_get---of 22
userns_install---of 13
userns_may_setgroups---of 3
userns_owner---of 1
userns_put---of 3

____fput---of 1
__alloc_file75%of 8
__fput---of 34
__fput_sync---of 4
alloc_empty_file58%of 7
alloc_empty_file.cold---of 1
alloc_empty_file_noaccount---of 3
alloc_file75%of 12
alloc_file_clone---of 3
alloc_file_pseudo67%of 6
delayed_fput---of 2
file_free_rcu---of 4
flush_delayed_fput---of 2
fput100%of 1
fput_many17%of 6
get_max_files---of 1
proc_nr_files---of 1

cpumask_any_and_distribute---of 7
cpumask_any_but100%of 4
cpumask_local_spread---of 11
cpumask_next100%of 1
cpumask_next_and---of 1
cpumask_next_wrap---of 6

__tlb_remove_page_size---of 12
tlb_finish_mmu68%of 28
tlb_flush_mmu55%of 24
tlb_gather_mmu63%of 8
tlb_remove_table10%of 41
tlb_remove_table_rcu---of 3
tlb_remove_table_sync_one---of 1

__traceiter_tlb_flush---of 4
adjust_range_page_size_mask---of 10
alloc_low_pages---of 22
cachemode2protval---of 4
cr4_set_bits---of 5
cr4_set_bits_and_update_boot---of 3
devmem_is_allowed---of 6
free_init_pages---of 5
free_initmem---of 1
free_kernel_image_pages---of 3
init_memory_mapping---of 21
max_swapfile_size---of 4
perf_trace_tlb_flush---of 6
pfn_range_is_mapped---of 5
pgprot2cachemode67%of 3
trace_event_raw_event_tlb_flush---of 10
trace_raw_output_tlb_flush---of 4
update_cache_mode_entry---of 4
x86_has_pat_wp---of 1

should_fail_usercopy100%of 1

__ia32_compat_sys_ftruncate---of 1
__ia32_compat_sys_open---of 5
__ia32_compat_sys_openat---of 5
__ia32_compat_sys_truncate---of 3
__ia32_sys_access---of 1
__ia32_sys_chdir---of 8
__ia32_sys_chmod---of 1
__ia32_sys_chown---of 1
__ia32_sys_chroot---of 10
__ia32_sys_close---of 5
__ia32_sys_close_range---of 1
__ia32_sys_creat---of 1
__ia32_sys_faccessat---of 1
__ia32_sys_faccessat2---of 1
__ia32_sys_fallocate---of 5
__ia32_sys_fchdir---of 8
__ia32_sys_fchmod---of 9
__ia32_sys_fchmodat---of 1
__ia32_sys_fchown---of 1
__ia32_sys_fchownat---of 1
__ia32_sys_ftruncate---of 1
__ia32_sys_lchown---of 1
__ia32_sys_open---of 5
__ia32_sys_openat---of 5
__ia32_sys_openat2---of 10
__ia32_sys_truncate---of 3
__x64_sys_access---of 1
__x64_sys_chdir---of 8
__x64_sys_chmod---of 1
__x64_sys_chown---of 1
__x64_sys_chroot---of 10
__x64_sys_close---of 5
__x64_sys_close_range---of 1
__x64_sys_creat---of 1
__x64_sys_faccessat---of 1
__x64_sys_faccessat2---of 1
__x64_sys_fallocate---of 5
__x64_sys_fchdir---of 8
__x64_sys_fchmod---of 9
__x64_sys_fchmodat---of 1
__x64_sys_fchown---of 1
__x64_sys_fchownat---of 1
__x64_sys_ftruncate---of 1
__x64_sys_lchown---of 1
__x64_sys_open---of 5
__x64_sys_openat---of 5
__x64_sys_openat2---of 10
__x64_sys_truncate---of 3
__x64_sys_vhangup---of 3
build_open_flags56%of 47
build_open_how---of 5
chmod_common---of 10
chown_common---of 16
dentry_open---of 7
do_dentry_open44%of 50
do_faccessat---of 34
do_fchmodat---of 4
do_fchownat---of 10
do_sys_ftruncate---of 36
do_sys_open---of 5
do_sys_openat2---of 17
do_sys_truncate---of 3
do_sys_truncate.part.0---of 5
do_truncate---of 8
file_open_name---of 32
file_open_root---of 32
file_path---of 1
filp_close---of 7
filp_close.cold---of 1
filp_open---of 4
finish_no_open---of 1
finish_open---of 3
generic_file_open67%of 3
ksys_fallocate---of 4
ksys_fchown---of 9
nonseekable_open---of 1
open_with_fake_path---of 4
stream_open---of 1
vfs_fallocate---of 54
vfs_fchmod---of 4
vfs_fchown---of 7
vfs_open100%of 1
vfs_truncate---of 23

__account_locked_vm---of 13
__page_mapcount---of 9
__vcalloc---of 4
__vm_enough_memory---of 13
__vma_link_list80%of 5
__vma_unlink_list---of 5
__vmalloc_array---of 4
account_locked_vm---of 5
get_cmdline---of 13
get_cmdline.cold---of 1
kfree_const---of 3
kmemdup---of 3
kmemdup_nul---of 5
kstrdup---of 4
kstrdup_const---of 5
kstrndup---of 8
kstrndup.cold---of 1
kvfree---of 3
kvfree_sensitive---of 4
kvmalloc_node---of 13
kvrealloc---of 8
memcmp_pages---of 1
memdup_user---of 8
memdup_user_nul---of 9
overcommit_kbytes_handler---of 4
overcommit_policy_handler---of 8
overcommit_ratio_handler---of 4
page_anon_vma---of 5
page_mapped---of 14
page_mapping62%of 13
page_mapping_file---of 7
page_rmapping---of 3
randomize_page---of 7
randomize_stack_top---of 4
strndup_user---of 6
sync_overcommit_as---of 1
vcalloc---of 4
vm_commit_limit---of 4
vm_memory_committed---of 1
vm_mmap---of 4
vm_mmap_pgoff100%of 5
vma_is_stack_for_current---of 3
vmalloc_array---of 4
vmemdup_user---of 9

__alloc_percpu---of 1
__alloc_percpu_gfp100%of 1
__alloc_reserved_percpu---of 1
__is_kernel_percpu_address---of 7
__pcpu_chunk_move---of 4
__traceiter_percpu_alloc_percpu---of 4
__traceiter_percpu_alloc_percpu_fail---of 4
__traceiter_percpu_create_chunk---of 4
__traceiter_percpu_destroy_chunk---of 4
__traceiter_percpu_free_percpu---of 4
cpumask_weight.constprop.0---of 1
free_percpu---of 44
is_kernel_percpu_address---of 5
pcpu_alloc32%of 105
pcpu_alloc.cold---of 2
pcpu_alloc_area80%of 29
pcpu_balance_workfn---of 70
pcpu_block_refresh_hint80%of 5
pcpu_block_update27%of 26
pcpu_block_update_hint_alloc43%of 33
pcpu_chunk_populated---of 5
pcpu_chunk_refresh_hint---of 7
pcpu_chunk_relocate72%of 7
pcpu_create_chunk---of 22
pcpu_dump_alloc_info---of 22
pcpu_find_block_fit58%of 14
pcpu_free_area---of 36
pcpu_free_pages.constprop.0---of 6
pcpu_get_pages---of 7
pcpu_init_md_blocks---of 2
pcpu_mem_zalloc---of 8
pcpu_next_fit_region.constprop.077%of 17
pcpu_next_md_free_region---of 11
pcpu_nr_pages---of 1
pcpu_populate_chunk---of 30
per_cpu_ptr_to_phys---of 13
percpu_ref_put_many.constprop.0---of 17
perf_trace_percpu_alloc_percpu---of 6
perf_trace_percpu_alloc_percpu_fail---of 6
perf_trace_percpu_create_chunk---of 6
perf_trace_percpu_destroy_chunk---of 6
perf_trace_percpu_free_percpu---of 6
set_bit---of 1
trace_event_raw_event_percpu_alloc_percpu---of 10
trace_event_raw_event_percpu_alloc_percpu_fail---of 10
trace_event_raw_event_percpu_create_chunk---of 10
trace_event_raw_event_percpu_destroy_chunk---of 10
trace_event_raw_event_percpu_free_percpu---of 10
trace_raw_output_percpu_alloc_percpu---of 4
trace_raw_output_percpu_alloc_percpu_fail---of 4
trace_raw_output_percpu_create_chunk---of 5
trace_raw_output_percpu_destroy_chunk---of 5
trace_raw_output_percpu_free_percpu---of 4

__cleanup_sighand---of 5
__delayed_free_task---of 1
__do_sys_clone---of 1
__do_sys_clone3---of 19
__ia32_sys_clone---of 1
__ia32_sys_clone3---of 1
__ia32_sys_set_tid_address---of 1
__ia32_sys_unshare---of 1
__ia32_sys_vfork---of 1
__mmdrop---of 16
__mmdrop.cold---of 1
__put_task_struct---of 21
__put_task_struct_rcu_cb---of 1
__refcount_add.constprop.0---of 5
__traceiter_task_newtask---of 4
__traceiter_task_rename---of 4
__x64_sys_clone---of 1
__x64_sys_clone3---of 1
__x64_sys_fork---of 1
__x64_sys_set_tid_address---of 1
__x64_sys_unshare---of 1
copy_clone_args_from_user---of 21
copy_process---of 283
create_io_thread---of 1
exec_mm_release---of 1
exit_mm_release---of 1
free_task---of 7
get_mm_exe_file---of 22
get_task_exe_file---of 4
get_task_mm---of 4
kernel_clone---of 45
kernel_thread---of 1
ksys_unshare---of 52
lockdep_tasklist_lock_is_held---of 1
mm_access---of 8
mm_alloc---of 3
mm_init---of 12
mm_release---of 12
mmdrop_async---of 2
mmdrop_async_fn---of 1
mmput---of 11
mmput_async---of 2
mmput_async_fn---of 10
nr_processes---of 3
perf_trace_task_newtask---of 6
perf_trace_task_rename---of 6
pidfd_pid---of 4
pidfd_poll---of 6
pidfd_release---of 1
pidfd_show_fdinfo---of 6
ptrace_event_pid---of 24
put_task_stack---of 13
set_mm_exe_file---of 4
set_task_stack_end_magic---of 1
sighand_ctor---of 1
sysctl_max_threads---of 4
trace_event_raw_event_task_newtask---of 10
trace_event_raw_event_task_rename---of 10
trace_raw_output_task_newtask---of 4
trace_raw_output_task_rename---of 4
unshare_fd---of 6
unshare_files---of 8
vm_area_alloc100%of 3
vm_area_dup100%of 3
vm_area_free100%of 1
walk_process_tree---of 12

__inode_security_revalidate78%of 9
audit_inode_permission---of 3
backing_inode_security---of 8
check_nnp_nosuid.isra.0---of 10
cred_has_capability.isra.073%of 11
cred_has_capability.isra.0.cold---of 1
delayed_superblock_init---of 1
file_has_perm80%of 5
file_map_prot_check42%of 24
get_order---of 1
has_cap_mac_admin---of 5
inode_doinit_use_xattr---of 13
inode_doinit_use_xattr.cold---of 3
inode_doinit_with_dentry---of 58
inode_has_perm100%of 5
inode_security88%of 8
ioctl_has_perm.constprop.0.isra.0---of 5
ipc_has_perm---of 1
match_file---of 8
may_context_mount_inode_relabel.isra.0---of 3
may_context_mount_sb_relabel.isra.0---of 4
may_create---of 5
may_link---of 6
ptrace_parent_sid---of 21
sb_finish_set_opts---of 23
sb_finish_set_opts.cold---of 4
selinux_add_mnt_opt---of 11
selinux_add_opt---of 19
selinux_add_opt.cold---of 1
selinux_binder_set_context_mgr---of 1
selinux_binder_transaction---of 4
selinux_binder_transfer_binder---of 1
selinux_binder_transfer_file---of 10
selinux_bprm_committed_creds---of 7
selinux_bprm_committing_creds---of 16
selinux_bprm_creds_for_exec---of 17
selinux_capable100%of 1
selinux_capget---of 1
selinux_capset---of 1
selinux_complete_init---of 1
selinux_cred_getsecid---of 1
selinux_cred_prepare---of 1
selinux_cred_transfer---of 1
selinux_d_instantiate---of 2
selinux_dentry_create_files_as---of 4
selinux_dentry_init_security---of 4
selinux_determine_inode_label---of 8
selinux_disable---of 3
selinux_disable.cold---of 5
selinux_file_alloc_security100%of 1
selinux_file_fcntl---of 7
selinux_file_ioctl---of 9
selinux_file_ioctl_compat---of 2
selinux_file_lock---of 4
selinux_file_mprotect---of 24
selinux_file_open67%of 9
selinux_file_permission---of 22
selinux_file_receive---of 6
selinux_file_send_sigiotask---of 4
selinux_file_set_fowner---of 1
selinux_free_mnt_opts---of 1
selinux_fs_context_dup---of 11
selinux_fs_context_parse_param---of 4
selinux_getprocattr---of 42
selinux_inet_conn_established---of 4
selinux_inet_conn_request---of 7
selinux_inet_csk_clone---of 1
selinux_inet_sys_rcv_skb---of 5
selinux_inode_alloc_security---of 3
selinux_inode_copy_up---of 6
selinux_inode_copy_up_xattr---of 2
selinux_inode_create---of 1
selinux_inode_follow_link---of 9
selinux_inode_free_security---of 5
selinux_inode_getattr---of 1
selinux_inode_getsecctx---of 3
selinux_inode_getsecid---of 3
selinux_inode_getsecurity---of 10
selinux_inode_getxattr---of 1
selinux_inode_init_security---of 16
selinux_inode_invalidate_secctx---of 3
selinux_inode_link---of 1
selinux_inode_listsecurity---of 5
selinux_inode_listxattr---of 1
selinux_inode_mkdir---of 1
selinux_inode_mknod---of 2
selinux_inode_notifysecctx---of 2
selinux_inode_permission65%of 34
selinux_inode_post_setxattr---of 6
selinux_inode_post_setxattr.cold---of 1
selinux_inode_readlink---of 1
selinux_inode_removexattr---of 5
selinux_inode_rename---of 11
selinux_inode_rmdir---of 1
selinux_inode_setattr---of 9
selinux_inode_setsecctx---of 1
selinux_inode_setsecurity---of 10
selinux_inode_setxattr---of 24
selinux_inode_symlink---of 1
selinux_inode_unlink---of 1
selinux_ip_forward---of 16
selinux_ip_postroute---of 50
selinux_ip_postroute_compat---of 9
selinux_ipc_getsecid---of 1
selinux_ipc_permission---of 6
selinux_ipv4_forward---of 1
selinux_ipv4_output---of 7
selinux_ipv4_postroute---of 1
selinux_ipv6_forward---of 1
selinux_ipv6_output---of 7
selinux_ipv6_postroute---of 1
selinux_ismaclabel---of 1
selinux_kernel_act_as---of 3
selinux_kernel_create_files_as---of 5
selinux_kernel_load_data---of 4
selinux_kernel_module_request---of 1
selinux_kernel_read_file---of 10
selinux_kernfs_init_security---of 15
selinux_key_alloc---of 5
selinux_key_free---of 1
selinux_key_getsecurity---of 3
selinux_key_permission---of 4
selinux_lockdown---of 7
selinux_lsm_notifier_avc_callback---of 4
selinux_mmap_addr50%of 4
selinux_mmap_file100%of 5
selinux_mount---of 4
selinux_move_mount---of 1
selinux_msg_msg_alloc_security---of 1
selinux_msg_queue_alloc_security---of 1
selinux_msg_queue_associate---of 1
selinux_msg_queue_msgctl---of 6
selinux_msg_queue_msgrcv---of 3
selinux_msg_queue_msgsnd---of 6
selinux_netcache_avc_callback---of 4
selinux_netlink_send---of 13
selinux_netlink_send.cold---of 1
selinux_nf_register---of 1
selinux_nf_unregister---of 1
selinux_parse_skb.constprop.0---of 62
selinux_path_notify---of 8
selinux_perf_event_alloc---of 3
selinux_perf_event_free---of 1
selinux_perf_event_open---of 6
selinux_perf_event_read---of 1
selinux_perf_event_write---of 1
selinux_ptrace_access_check---of 4
selinux_ptrace_traceme---of 1
selinux_quota_on---of 1
selinux_quotactl---of 5
selinux_release_secctx---of 1
selinux_req_classify_flow---of 1
selinux_sb_alloc_security---of 3
selinux_sb_clone_mnt_opts---of 38
selinux_sb_clone_mnt_opts.cold---of 1
selinux_sb_eat_lsm_opts---of 37
selinux_sb_free_security---of 1
selinux_sb_kern_mount---of 1
selinux_sb_remount---of 27
selinux_sb_remount.cold---of 5
selinux_sb_show_options---of 17
selinux_sb_statfs---of 1
selinux_sctp_assoc_request---of 19
selinux_sctp_bind_connect---of 13
selinux_sctp_sk_clone---of 3
selinux_secctx_to_secid---of 1
selinux_secid_to_secctx---of 1
selinux_secmark_refcount_dec---of 1
selinux_secmark_refcount_inc---of 1
selinux_secmark_relabel_packet---of 1
selinux_sem_alloc_security---of 1
selinux_sem_associate---of 1
selinux_sem_semctl---of 6
selinux_sem_semop---of 2
selinux_set_mnt_opts---of 92
selinux_set_mnt_opts.cold---of 8
selinux_setprocattr---of 46
selinux_shm_alloc_security---of 1
selinux_shm_associate---of 1
selinux_shm_shmat---of 2
selinux_shm_shmctl---of 6
selinux_sk_alloc_security---of 5
selinux_sk_clone_security---of 1
selinux_sk_free_security---of 1
selinux_sk_getsecid---of 3
selinux_skb_peerlbl_sid---of 3
selinux_skb_peerlbl_sid.cold---of 1
selinux_sock_graft---of 6
selinux_sock_rcv_skb_compat---of 6
selinux_socket_accept---of 7
selinux_socket_bind---of 31
selinux_socket_connect---of 3
selinux_socket_connect_helper.isra.0---of 17
selinux_socket_create---of 24
selinux_socket_getpeername---of 1
selinux_socket_getpeersec_dgram---of 14
selinux_socket_getpeersec_stream---of 15
selinux_socket_getsockname---of 1
selinux_socket_getsockopt---of 1
selinux_socket_listen---of 1
selinux_socket_post_create---of 29
selinux_socket_recvmsg---of 1
selinux_socket_sendmsg---of 1
selinux_socket_setsockopt---of 3
selinux_socket_shutdown---of 1
selinux_socket_sock_rcv_skb---of 21
selinux_socket_socketpair---of 1
selinux_socket_unix_may_send---of 1
selinux_socket_unix_stream_connect---of 4
selinux_syslog---of 5
selinux_task_alloc---of 1
selinux_task_getioprio---of 1
selinux_task_getpgid---of 1
selinux_task_getscheduler---of 1
selinux_task_getsecid---of 1
selinux_task_getsid---of 1
selinux_task_kill---of 8
selinux_task_movememory---of 1
selinux_task_prlimit---of 6
selinux_task_setioprio---of 1
selinux_task_setnice---of 1
selinux_task_setpgid---of 1
selinux_task_setrlimit---of 4
selinux_task_setscheduler---of 1
selinux_task_to_inode---of 4
selinux_tun_dev_alloc_security---of 3
selinux_tun_dev_attach---of 1
selinux_tun_dev_attach_queue---of 1
selinux_tun_dev_create---of 1
selinux_tun_dev_free_security---of 1
selinux_tun_dev_open---of 5
selinux_umount---of 1
selinux_vm_enough_memory---of 1
show_sid---of 6
sock_has_perm---of 3
task_sid---of 18

lockref_get100%of 1
lockref_get_not_dead100%of 3
lockref_get_not_zero---of 3
lockref_get_or_lock---of 3
lockref_mark_dead---of 3
lockref_put_not_zero---of 3
lockref_put_or_lock---of 3
lockref_put_return100%of 1

strncpy_from_user52%of 29

__anon_vma_prepare---of 13
__page_set_anon_rmap---of 11
__put_anon_vma---of 11
anon_vma_clone52%of 25
anon_vma_ctor---of 1
anon_vma_fork---of 10
do_page_add_anon_rmap---of 32
flush_tlb_batched_pending50%of 2
hugepage_add_anon_rmap---of 8
hugepage_add_new_anon_rmap---of 11
invalid_migration_vma---of 3
invalid_mkclean_vma---of 1
invalid_page_referenced_vma---of 24
mm_find_pmd---of 12
page_add_anon_rmap---of 2
page_add_file_rmap---of 14
page_add_new_anon_rmap---of 18
page_address_in_vma---of 22
page_get_anon_vma---of 28
page_lock_anon_vma_read---of 39
page_mkclean---of 13
page_mkclean_one---of 21
page_move_anon_rmap---of 9
page_not_mapped---of 1
page_referenced---of 24
page_referenced_one---of 20
page_remove_rmap---of 41
page_unlock_anon_vma_read---of 1
rmap_walk---of 5
rmap_walk_anon---of 32
rmap_walk_file---of 33
rmap_walk_locked---of 5
try_to_munlock---of 15
try_to_unmap---of 21
try_to_unmap_flush---of 2
try_to_unmap_flush_dirty---of 3
try_to_unmap_one---of 118
unlink_anon_vmas54%of 26

ext4_buffered_write_iter---of 13
ext4_dio_write_end_io---of 7
ext4_file_mmap---of 7
ext4_file_open22%of 33
ext4_file_read_iter---of 14
ext4_file_write_iter---of 98
ext4_generic_write_checks---of 12
ext4_llseek---of 10
ext4_release_file---of 11

page_counter_cancel50%of 2
page_counter_charge---of 5
page_counter_memparse---of 5
page_counter_set_low---of 2
page_counter_set_max---of 5
page_counter_set_min---of 2
page_counter_try_charge67%of 9
page_counter_uncharge100%of 2
propagate_protected_usage78%of 9

__percpu_ref_exit---of 5
__percpu_ref_switch_mode---of 40
percpu_ref_exit---of 2
percpu_ref_init67%of 12
percpu_ref_is_zero---of 7
percpu_ref_kill_and_confirm---of 4
percpu_ref_put_many.constprop.0---of 17
percpu_ref_reinit---of 7
percpu_ref_resurrect---of 20
percpu_ref_switch_to_atomic---of 1
percpu_ref_switch_to_atomic_rcu---of 9
percpu_ref_switch_to_atomic_sync---of 6
percpu_ref_switch_to_percpu---of 1

cap_bprm_creds_from_file---of 93
cap_bprm_creds_from_file.cold---of 2
cap_capable34%of 9
cap_capget---of 18
cap_capset---of 24
cap_convert_nscap---of 14
cap_inode_getsecurity---of 34
cap_inode_killpriv---of 2
cap_inode_need_killpriv---of 1
cap_inode_removexattr---of 7
cap_inode_setxattr---of 5
cap_mmap_addr19%of 11
cap_mmap_file100%of 1
cap_ptrace_access_check---of 27
cap_ptrace_traceme---of 24
cap_safe_nice---of 28
cap_settime---of 1
cap_task_fix_setuid---of 30
cap_task_prctl---of 45
cap_task_setioprio---of 1
cap_task_setnice---of 1
cap_task_setscheduler---of 1
cap_vm_enough_memory---of 9
get_vfs_caps_from_disk---of 20

errseq_check---of 4
errseq_check_and_advance---of 5
errseq_sample100%of 2
errseq_set---of 9

__cpu_hotplug_enable---of 4
__cpuhp_remove_state---of 17
__cpuhp_remove_state_cpuslocked---of 11
__cpuhp_setup_state---of 17
__cpuhp_setup_state_cpuslocked---of 26
__cpuhp_state_add_instance53%of 17
__cpuhp_state_add_instance_cpuslocked54%of 13
__cpuhp_state_remove_instance---of 28
__traceiter_cpuhp_enter---of 4
__traceiter_cpuhp_exit---of 4
__traceiter_cpuhp_multi_enter---of 4
_cpu_down---of 27
_cpu_up---of 17
add_cpu---of 1
bringup_cpu---of 9
bringup_hibernate_cpu---of 2
bringup_hibernate_cpu.cold---of 2
bringup_nonboot_cpus---of 4
clear_bit---of 1
clear_tasks_mm_cpumask---of 19
cpu_device_down---of 3
cpu_device_up---of 1
cpu_hotplug_disable---of 1
cpu_hotplug_enable---of 1
cpu_hotplug_pm_callback---of 6
cpu_maps_update_begin---of 1
cpu_maps_update_done---of 1
cpu_mitigations_auto_nosmt---of 1
cpu_mitigations_off---of 1
cpu_smt_possible---of 1
cpu_up---of 8
cpu_up.cold---of 1
cpuhp_complete_idle_dead---of 1
cpuhp_create---of 1
cpuhp_invoke_callback---of 107
cpuhp_issue_call---of 27
cpuhp_kick_ap---of 17
cpuhp_kick_ap_work---of 23
cpuhp_online_idle---of 2
cpuhp_report_idle_dead---of 3
cpuhp_rollback_install---of 4
cpuhp_should_run---of 1
cpuhp_smt_disable---of 8
cpuhp_smt_enable---of 7
cpuhp_thread_fun---of 30
cpus_read_lock---of 9
cpus_read_trylock---of 10
cpus_read_unlock---of 9
cpus_write_lock---of 1
cpus_write_unlock---of 1
finish_cpu---of 6
freeze_secondary_cpus---of 34
freeze_secondary_cpus.cold---of 3
init_cpu_online---of 1
init_cpu_possible---of 1
init_cpu_present---of 1
lockdep_assert_cpus_held75%of 4
notify_cpu_starting---of 4
perf_trace_cpuhp_enter---of 6
perf_trace_cpuhp_exit---of 6
perf_trace_cpuhp_multi_enter---of 6
remove_cpu---of 1
set_bit---of 1
set_cpu_online---of 5
show_cpuhp_fail---of 1
show_cpuhp_state---of 1
show_cpuhp_states---of 5
show_cpuhp_target---of 1
show_smt_active---of 3
show_smt_control---of 1
smp_shutdown_nonboot_cpus---of 10
store_smt_control---of 18
take_cpu_down---of 9
takedown_cpu---of 8
thaw_secondary_cpus---of 2
thaw_secondary_cpus.cold---of 6
trace_event_raw_event_cpuhp_enter---of 10
trace_event_raw_event_cpuhp_exit---of 10
trace_event_raw_event_cpuhp_multi_enter---of 10
trace_raw_output_cpuhp_enter---of 4
trace_raw_output_cpuhp_exit---of 4
trace_raw_output_cpuhp_multi_enter---of 4
trace_suspend_resume---of 9
write_cpuhp_fail---of 10
write_cpuhp_target---of 21

__alloc_fd79%of 23
__alloc_fd.cold---of 1
__close_fd---of 3
__close_fd_get_file---of 11
__close_range---of 33
__f_unlock_pos---of 1
__fd_install30%of 34
__fdget100%of 1
__fdget_pos---of 7
__fdget_raw---of 1
__fget_files63%of 24
__fget_light40%of 10
__get_unused_fd_flags100%of 1
__ia32_sys_dup---of 5
__ia32_sys_dup2---of 26
__ia32_sys_dup3---of 1
__put_unused_fd---of 8
__receive_fd---of 13
__x64_sys_dup---of 5
__x64_sys_dup2---of 26
__x64_sys_dup3---of 1
alloc_fdtable---of 9
close_fd_get_file---of 1
copy_fd_bitmaps---of 1
do_close_on_exec---of 16
do_dup2---of 19
dup_fd---of 39
exit_files---of 2
expand_files12%of 35
f_dupfd---of 4
fd_install100%of 1
fget100%of 1
fget_many---of 1
fget_raw---of 1
fget_task---of 3
free_fdtable_rcu---of 1
get_close_on_exec---of 19
get_files_struct---of 3
get_unused_fd_flags100%of 1
iterate_fd---of 19
ksys_dup3---of 16
pick_file---of 10
put_files_struct---of 11
put_unused_fd---of 1
replace_fd---of 8
reset_files_struct---of 1
set_close_on_exec---of 11

__percpu_counter_compare---of 5
__percpu_counter_init100%of 3
__percpu_counter_sum---of 4
compute_batch_value---of 1
percpu_counter_add_batch100%of 4
percpu_counter_cpu_dead---of 3
percpu_counter_destroy---of 2
percpu_counter_set---of 4
percpu_counter_sync---of 1

__mod_timer---of 58
__next_timer_interrupt---of 10
__round_jiffies---of 6
__round_jiffies_relative---of 6
__round_jiffies_up---of 3
__round_jiffies_up_relative---of 3
__run_timers.part.0---of 35
__traceiter_hrtimer_cancel---of 4
__traceiter_hrtimer_expire_entry---of 4
__traceiter_hrtimer_expire_exit---of 4
__traceiter_hrtimer_init---of 4
__traceiter_hrtimer_start---of 4
__traceiter_itimer_expire---of 4
__traceiter_itimer_state---of 4
__traceiter_tick_stop---of 4
__traceiter_timer_cancel---of 4
__traceiter_timer_expire_entry---of 4
__traceiter_timer_expire_exit---of 4
__traceiter_timer_init---of 4
__traceiter_timer_start---of 4
add_timer---of 3
add_timer_on---of 15
calc_wheel_index---of 24
call_timer_fn---of 25
del_timer---of 3
detach_if_pending---of 19
enqueue_timer---of 17
get_next_timer_interrupt---of 20
init_timer_key50%of 12
lock_timer_base---of 8
mod_timer---of 1
mod_timer_pending---of 1
msleep---of 4
msleep_interruptible---of 6
perf_trace_hrtimer_class---of 6
perf_trace_hrtimer_expire_entry---of 6
perf_trace_hrtimer_init---of 6
perf_trace_hrtimer_start---of 6
perf_trace_itimer_expire---of 8
perf_trace_itimer_state---of 6
perf_trace_tick_stop---of 6
perf_trace_timer_class---of 6
perf_trace_timer_expire_entry---of 6
perf_trace_timer_start---of 6
process_timeout---of 1
round_jiffies---of 6
round_jiffies_relative---of 6
round_jiffies_up---of 3
round_jiffies_up_relative---of 3
run_local_timers---of 3
run_timer_softirq---of 4
schedule_timeout---of 15
schedule_timeout_idle---of 1
schedule_timeout_interruptible---of 1
schedule_timeout_killable---of 1
schedule_timeout_uninterruptible---of 1
timer_clear_idle---of 1
timer_delete_sync---of 13
timer_migration_handler---of 7
timer_reduce---of 1
timer_update_keys---of 5
timers_dead_cpu---of 27
timers_prepare_cpu---of 1
timers_update_nohz---of 1
trace_event_raw_event_hrtimer_class---of 10
trace_event_raw_event_hrtimer_expire_entry---of 10
trace_event_raw_event_hrtimer_init---of 10
trace_event_raw_event_hrtimer_start---of 10
trace_event_raw_event_itimer_expire---of 12
trace_event_raw_event_itimer_state---of 10
trace_event_raw_event_tick_stop---of 10
trace_event_raw_event_timer_class---of 10
trace_event_raw_event_timer_expire_entry---of 10
trace_event_raw_event_timer_start---of 10
trace_raw_output_hrtimer_class---of 5
trace_raw_output_hrtimer_expire_entry---of 4
trace_raw_output_hrtimer_init---of 4
trace_raw_output_hrtimer_start---of 4
trace_raw_output_itimer_expire---of 4
trace_raw_output_itimer_state---of 4
trace_raw_output_tick_stop---of 4
trace_raw_output_timer_class---of 5
trace_raw_output_timer_expire_entry---of 4
trace_raw_output_timer_start---of 4
try_to_del_timer_sync---of 3
update_process_times---of 6
usleep_range---of 3

__do_sys_capget---of 34
__do_sys_capset---of 20
__ia32_sys_capget---of 1
__ia32_sys_capset---of 1
__x64_sys_capget---of 1
__x64_sys_capset---of 1
cap_validate_magic---of 11
capable---of 5
capable.cold---of 1
capable_wrt_inode_uidgid84%of 6
capable_wrt_inode_uidgid.cold---of 1
file_ns_capable---of 5
has_capability---of 1
has_capability_noaudit---of 1
has_ns_capability---of 18
has_ns_capability_noaudit---of 18
ns_capable---of 5
ns_capable.cold---of 1
ns_capable_noaudit80%of 5
ns_capable_noaudit.cold---of 1
ns_capable_setid---of 5
ns_capable_setid.cold---of 1
privileged_wrt_inode_uidgid---of 4
ptracer_capable---of 20

__check_object_size85%of 20
__check_object_size.cold---of 5
check_stack_object50%of 6
usercopy_abort---of 4
usercopy_warn---of 5

__cpuset_memory_pressure_bump---of 22
__cpuset_node_allowed10%of 32
alloc_trial_cpuset---of 3
bitmap_fill.constprop.0---of 1
cpumask_weight---of 1
cpuset_attach---of 28
cpuset_bind---of 6
cpuset_can_attach---of 28
cpuset_cancel_attach---of 5
cpuset_change_task_nodemask---of 6
cpuset_common_seq_show---of 8
cpuset_cpus_allowed---of 27
cpuset_cpus_allowed_fallback---of 26
cpuset_css_alloc---of 4
cpuset_css_free---of 1
cpuset_css_offline---of 9
cpuset_css_online---of 39
cpuset_force_rebuild---of 1
cpuset_fork---of 6
cpuset_hotplug_workfn---of 165
cpuset_hotplug_workfn.cold---of 1
cpuset_lock---of 1
cpuset_mem_spread_node---of 3
cpuset_mems_allowed---of 25
cpuset_mems_allowed_intersects---of 1
cpuset_migrate_mm---of 3
cpuset_migrate_mm_workfn---of 1
cpuset_nodemask_valid_mems_allowed---of 1
cpuset_post_attach---of 1
cpuset_print_current_mems_allowed---of 15
cpuset_print_current_mems_allowed.cold---of 8
cpuset_read_s64---of 3
cpuset_read_u64---of 20
cpuset_slab_spread_node---of 3
cpuset_task_status_allowed---of 2
cpuset_unlock---of 1
cpuset_update_active_cpus---of 1
cpuset_update_task_spread_flag---of 6
cpuset_wait_for_hotplug---of 1
cpuset_write_resmask---of 153
cpuset_write_s64---of 11
cpuset_write_u64---of 13
current_cpuset_is_being_rebound---of 22
dec_dl_tasks_cs---of 10
dl_update_tasks_root_domain---of 7
fmeter_update---of 4
get_order---of 1
inc_dl_tasks_cs---of 10
is_cpuset_subset---of 5
proc_cpuset_show---of 66
rebuild_sched_domains---of 1
rebuild_sched_domains_locked---of 179
rebuild_sched_domains_locked.cold---of 1
sched_partition_show---of 8
sched_partition_write---of 45
update_cpumasks_hier---of 111
update_domain_attr_tree---of 23
update_flag---of 16
update_parent_subparts_cpumask---of 48
update_prstate---of 18
update_sibling_cpumasks---of 74
update_tasks_cpumask---of 7
update_tasks_flags---of 4
update_tasks_nodemask---of 10
validate_change---of 49

__traceiter_irq_disable---of 4
__traceiter_irq_enable---of 4
perf_trace_preemptirq_template---of 6
trace_event_raw_event_preemptirq_template---of 10
trace_hardirqs_off40%of 10
trace_hardirqs_off_caller---of 10
trace_hardirqs_off_finish55%of 11
trace_hardirqs_on50%of 12
trace_hardirqs_on_caller---of 12
trace_hardirqs_on_prepare59%of 12
trace_raw_output_preemptirq_template---of 4

__init_cache_modes---of 4
__init_cache_modes.cold---of 1
arch_io_free_memtype_wc---of 2
arch_io_reserve_memtype_wc---of 1
get_pat_info40%of 10
init_cache_modes---of 7
lookup_memtype37%of 11
memtype_free---of 3
memtype_free.part.0---of 10
memtype_free.part.0.cold---of 2
memtype_free_io---of 2
memtype_get_idx---of 4
memtype_kernel_map_sync---of 15
memtype_kernel_map_sync.cold---of 3
memtype_reserve---of 43
memtype_reserve.cold---of 17
memtype_reserve_io---of 15
memtype_seq_next---of 1
memtype_seq_open---of 1
memtype_seq_show---of 3
memtype_seq_start---of 3
memtype_seq_stop---of 1
pagerange_is_ram_callback100%of 3
pat_disable---of 4
pat_disable.cold---of 1
pat_enabled---of 1
pat_init---of 23
pat_init.cold---of 1
pat_pagerange_is_ram100%of 4
pat_pfn_immune_to_uc_mtrr---of 1
pgprot_writecombine---of 1
pgprot_writethrough---of 1
phys_mem_access_prot---of 1
phys_mem_access_prot_allowed---of 2
reserve_pfn_range20%of 20
reserve_pfn_range.cold---of 11
track_pfn_copy---of 4
track_pfn_insert---of 3
track_pfn_remap50%of 12
untrack_pfn75%of 12
untrack_pfn_moved---of 1

vmacache_find100%of 11
vmacache_update100%of 3

anon_inode_getfd---of 5
anon_inode_getfile63%of 8
anon_inodefs_dname---of 1
anon_inodefs_init_fs_context---of 3

arch_get_random_long---of 5
prandom_bytes---of 6
prandom_bytes_state---of 5
prandom_reseed---of 13
prandom_seed---of 9
prandom_seed_full_state---of 11
prandom_timer_start---of 1
prandom_u32100%of 1
prandom_u32_state---of 1

___d_drop---of 13
__d_alloc54%of 15
__d_drop---of 2
__d_free---of 1
__d_free_external---of 1
__d_instantiate62%of 13
__d_instantiate_anon---of 20
__d_lookup---of 34
__d_lookup_done---of 10
__d_lookup_rcu60%of 25
__d_move---of 76
__d_obtain_alias---of 11
__d_rehash---of 10
__dentry_kill---of 24
__dput_to_list---of 8
__lock_parent---of 18
d_add---of 23
d_add_ci---of 11
d_alloc---of 3
d_alloc_anon---of 1
d_alloc_cursor---of 5
d_alloc_name---of 3
d_alloc_parallel---of 128
d_alloc_pseudo100%of 3
d_ancestor---of 4
d_delete---of 5
d_drop---of 3
d_exact_alias---of 20
d_exchange---of 9
d_find_alias---of 13
d_find_any_alias---of 3
d_flags_for_inode47%of 13
d_genocide---of 1
d_genocide_kill---of 8
d_hash_and_lookup---of 5
d_instantiate75%of 4
d_instantiate_anon---of 1
d_instantiate_new---of 7
d_invalidate---of 12
d_lookup---of 8
d_lru_add---of 6
d_lru_del---of 6
d_lru_shrink_move---of 5
d_make_root---of 6
d_mark_dontcache---of 5
d_move---of 1
d_obtain_alias---of 1
d_obtain_root---of 1
d_prune_aliases---of 13
d_rehash---of 1
d_set_d_op53%of 19
d_set_fallthru---of 1
d_set_mounted---of 10
d_shrink_del---of 3
d_splice_alias---of 52
d_splice_alias.cold---of 1
d_tmpfile---of 6
d_walk---of 60
dentry_free---of 8
dentry_lru_isolate---of 11
dentry_lru_isolate_shrink---of 3
dentry_unlink_inode---of 13
dget_parent---of 40
dput19%of 79
dput_to_list---of 46
find_submount---of 4
is_subdir---of 24
path_check_mount---of 5
path_has_submounts---of 1
proc_nr_dentry---of 9
prune_dcache_sb---of 1
read_word_at_a_time100%of 1
release_dentry_name_snapshot---of 4
select_collect---of 12
select_collect2---of 20
shrink_dcache_for_umount---of 10
shrink_dcache_parent---of 27
shrink_dcache_sb---of 8
shrink_dentry_list---of 30
shrink_lock_dentry.part.0---of 14
take_dentry_name_snapshot---of 4
umount_check---of 4
umount_check.cold---of 3

__address_space_init_once---of 1
__destroy_inode---of 21
__iget---of 1
__insert_inode_hash---of 3
__remove_inode_hash---of 5
__wait_on_freeing_inode---of 1
address_space_init_once---of 1
alloc_inode---of 13
atime_needs_update---of 21
bmap---of 3
clear_inode---of 13
clear_nlink---of 2
current_time---of 11
dentry_needs_remove_privs---of 7
destroy_inode---of 6
discard_new_inode---of 8
dispose_list---of 2
drop_nlink---of 4
evict---of 27
evict_inodes---of 11
file_modified---of 4
file_remove_privs---of 17
file_update_time---of 17
find_inode---of 11
find_inode_by_ino_rcu---of 13
find_inode_fast---of 11
find_inode_nowait---of 8
find_inode_rcu---of 13
free_inode_nonrcu---of 1
generic_delete_inode---of 1
generic_update_time---of 14
get_next_ino---of 3
get_nr_dirty_inodes---of 6
i_callback---of 3
iget5_locked---of 11
iget_locked---of 21
igrab---of 5
ihold50%of 2
ilookup---of 10
ilookup5---of 8
ilookup5_nowait---of 2
in_group_or_capable---of 3
inc_nlink---of 5
init_once---of 1
init_special_inode---of 7
init_special_inode.cold---of 1
inode_add_lru---of 6
inode_dio_wait---of 7
inode_init_always---of 7
inode_init_once---of 1
inode_init_owner---of 6
inode_insert5---of 17
inode_lru_isolate---of 23
inode_needs_sync---of 7
inode_nohighmem---of 1
inode_owner_or_capable---of 4
inode_sb_list_add---of 1
inode_set_ctime_current---of 1
inode_set_flags---of 4
inode_update_time---of 5
insert_inode_locked---of 23
insert_inode_locked4---of 6
invalidate_inodes---of 14
iput---of 4
iput.part.0---of 33
iunique---of 22
lock_two_inodes---of 11
lock_two_nondirectories---of 8
lockdep_annotate_inode_mutex_key---of 3
lockdep_annotate_inode_mutex_key.part.0---of 2
mode_strip_sgid---of 11
new_inode---of 3
new_inode_pseudo---of 3
no_open---of 1
proc_nr_inodes---of 6
prune_icache_sb---of 1
set_nlink---of 6
timestamp_truncate---of 8
touch_atime---of 26
unlock_new_inode---of 5
unlock_two_nondirectories---of 7
vfs_ioc_fssetxattr_check---of 22
vfs_ioc_setflags_prepare---of 4

__disable_kprobe---of 16
__disarm_kprobe---of 5
__free_insn_slot---of 27
__get_insn_slot---of 34
__get_valid_kprobe---of 13
__is_insn_slot_addr59%of 17
__kretprobe_trampoline_handler---of 20
__kretprobe_trampoline_handler.cold---of 2
__unregister_kprobe_bottom---of 5
__unregister_kprobe_top---of 26
aggr_fault_handler---of 5
aggr_post_handler---of 5
aggr_pre_handler---of 6
alloc_aggr_kprobe---of 5
arch_check_ftrace_location---of 1
arch_deref_entry_point---of 1
arch_kprobe_get_kallsym---of 1
arch_kprobe_on_func_entry---of 1
arch_within_kprobe_blacklist---of 3
arm_kprobe---of 6
cleanup_rp_inst---of 11
collect_garbage_slots---of 9
collect_one_slot.part.0---of 4
disable_kprobe---of 3
dump_kprobe---of 1
enable_kprobe---of 10
get_kprobe---of 4
get_optimized_kprobe---of 10
init_aggr_kprobe---of 6
is_cfi_preamble_symbol---of 10
is_cfi_preamble_symbol.cold---of 1
kill_kprobe---of 20
kprobe_add_area_blacklist---of 7
kprobe_add_ksym_blacklist---of 5
kprobe_blacklist_open---of 4
kprobe_blacklist_seq_next---of 1
kprobe_blacklist_seq_show---of 5
kprobe_blacklist_seq_start---of 1
kprobe_blacklist_seq_stop---of 1
kprobe_busy_begin---of 1
kprobe_busy_end---of 1
kprobe_cache_get_kallsym---of 18
kprobe_disarmed---of 6
kprobe_exceptions_notify---of 1
kprobe_flush_task---of 7
kprobe_free_init_mem---of 8
kprobe_get_kallsym---of 4
kprobe_lookup_name---of 1
kprobe_on_func_entry---of 10
kprobe_optimizer---of 36
kprobe_remove_area_blacklist---of 5
kprobe_seq_next---of 2
kprobe_seq_start---of 2
kprobes_inc_nmissed_count---of 4
kprobes_module_callback---of 41
kprobes_open---of 4
kretprobe_hash_lock---of 1
kretprobe_hash_unlock---of 1
kretprobe_table_lock---of 1
kretprobe_table_unlock---of 1
opt_pre_handler---of 5
optimize_all_kprobes---of 9
optimize_kprobe---of 16
optprobe_queued_unopt---of 6
pre_handler_kretprobe---of 13
proc_kprobes_optimization_handler---of 12
read_enabled_file_bool---of 2
recycle_rp_inst---of 8
register_kprobe---of 76
register_kprobes---of 6
register_kretprobe---of 34
register_kretprobes---of 6
report_probe---of 13
show_kprobe_addr---of 7
unoptimize_kprobe---of 15
unregister_kprobe---of 4
unregister_kprobes---of 2
unregister_kprobes.part.0---of 9
unregister_kretprobe---of 1
unregister_kretprobes---of 2
unregister_kretprobes.part.0---of 9
wait_for_kprobe_optimizer---of 5
within_kprobe_blacklist---of 8
within_kprobe_blacklist.part.0---of 9
write_enabled_file_bool---of 25
write_enabled_file_bool.cold---of 10

__anon_vma_interval_tree_augment_rotate---of 5
__anon_vma_interval_tree_subtree_search---of 9
anon_vma_interval_tree_insert82%of 11
anon_vma_interval_tree_iter_first---of 5
anon_vma_interval_tree_iter_next---of 9
anon_vma_interval_tree_remove16%of 59
vma_interval_tree_augment_rotate60%of 5
vma_interval_tree_insert91%of 11
vma_interval_tree_insert_after---of 14
vma_interval_tree_iter_first---of 5
vma_interval_tree_iter_next---of 9
vma_interval_tree_remove67%of 59
vma_interval_tree_subtree_search---of 9

__do_munmap61%of 76
__do_sys_brk---of 25
__do_sys_remap_file_pages---of 27
__do_sys_remap_file_pages.cold---of 1
__ia32_sys_brk---of 1
__ia32_sys_mmap_pgoff---of 1
__ia32_sys_munmap---of 1
__ia32_sys_remap_file_pages---of 1
__install_special_mapping---of 13
__split_vma54%of 28
__traceiter_vm_unmapped_area---of 4
__vm_munmap---of 5
__vma_adjust36%of 185
__vma_link_file84%of 6
__vma_link_rb---of 30
__vma_rb_erase31%of 69
__x64_sys_brk---of 1
__x64_sys_mmap_pgoff---of 1
__x64_sys_munmap---of 1
__x64_sys_remap_file_pages---of 1
_install_special_mapping---of 1
anon_vma_compatible---of 7
can_vma_merge_before---of 12
copy_vma---of 32
do_brk_flags---of 32
do_mmap53%of 71
do_munmap---of 1
exit_mmap---of 17
expand_downwards---of 48
expand_stack---of 1
find_extend_vma---of 7
find_mergeable_anon_vma---of 11
find_vma100%of 10
find_vma_prev---of 14
get_unmapped_area71%of 17
init_admin_reserve---of 1
init_user_reserve---of 1
insert_vm_struct---of 17
install_special_mapping---of 3
ksys_mmap_pgoff43%of 21
may_expand_vm---of 4
may_expand_vm.part.0---of 5
may_expand_vm.part.0.cold---of 2
mm_drop_all_locks---of 19
mm_take_all_locks---of 30
mmap_region53%of 84
perf_trace_vm_unmapped_area---of 6
remove_vma75%of 8
special_mapping_fault---of 14
special_mapping_mremap---of 6
special_mapping_name---of 1
split_vma---of 3
trace_event_raw_event_vm_unmapped_area---of 10
trace_raw_output_vm_unmapped_area---of 5
unlink_file_vma84%of 6
unmap_region86%of 14
vm_brk---of 7
vm_brk_flags---of 7
vm_get_page_prot---of 3
vm_lock_mapping---of 4
vm_munmap---of 1
vm_stat_account---of 6
vm_unmapped_area---of 72
vma_gap_callbacks_rotate80%of 10
vma_is_special_mapping---of 6
vma_link74%of 34
vma_merge29%of 67
vma_set_page_prot58%of 7
vma_wants_writenotify57%of 16

__dquot_alloc_space---of 62
__dquot_drop---of 14
__dquot_free_space---of 51
__dquot_initialize---of 47
__dquot_transfer---of 87
__quota_error---of 2
__quota_error.cold---of 1
do_get_dqblk---of 1
do_proc_dqstats---of 4
dqcache_shrink_count---of 1
dqcache_shrink_scan---of 13
dqget---of 33
dqput---of 2
dqput.part.0---of 7
dquot_acquire---of 10
dquot_add_inodes---of 24
dquot_add_space---of 29
dquot_alloc---of 1
dquot_alloc_inode---of 43
dquot_claim_space_nodirty---of 36
dquot_commit---of 8
dquot_commit_info---of 1
dquot_decr_inodes---of 6
dquot_decr_space---of 6
dquot_destroy---of 1
dquot_disable---of 74
dquot_drop---of 5
dquot_file_open60%of 5
dquot_free_inode---of 34
dquot_get_dqblk---of 6
dquot_get_next_dqblk---of 8
dquot_get_next_id---of 5
dquot_get_state---of 14
dquot_initialize---of 1
dquot_initialize_needed---of 8
dquot_load_quota_inode---of 4
dquot_load_quota_sb---of 49
dquot_mark_dquot_dirty---of 8
dquot_quota_disable---of 15
dquot_quota_enable---of 14
dquot_quota_off---of 1
dquot_quota_on---of 6
dquot_quota_on_mount---of 8
dquot_quota_sync---of 12
dquot_reclaim_space_nodirty---of 35
dquot_release---of 8
dquot_resume---of 9
dquot_scan_active---of 16
dquot_set_dqblk---of 54
dquot_set_dqinfo---of 16
dquot_transfer---of 19
dquot_writeback_dquots---of 37
info_bdq_free---of 6
info_idq_free---of 7
mark_info_dirty---of 1
prepare_warning---of 4
quota_release_workfn---of 19
register_quota_format---of 1
unregister_quota_format---of 7
vfs_cleanup_quota_inode---of 4
vfs_setup_quota_inode---of 9

__access_remote_vm---of 25
__apply_to_page_range---of 102
__do_fault---of 15
__get_locked_pte---of 11
__might_fault100%of 4
__p4d_alloc---of 21
__pmd_alloc---of 25
__pte_alloc60%of 10
__pte_alloc_kernel---of 9
__pud_alloc---of 18
__vm_insert_mixed---of 9
__vm_map_pages---of 6
access_process_vm---of 3
access_remote_vm---of 1
add_mm_counter_fast---of 11
alloc_set_pte---of 39
apply_to_existing_page_range---of 1
apply_to_page_range---of 1
clear_huge_page---of 15
clear_subpage---of 1
copy_huge_page_from_user---of 11
copy_page_range---of 237
copy_subpage---of 1
copy_user_huge_page---of 18
count_memcg_event_mm.part.0---of 23
do_page_mkwrite---of 16
do_swap_page---of 106
do_wp_page---of 72
fault_around_bytes_fops_open---of 1
fault_around_bytes_get---of 1
fault_around_bytes_set---of 6
fault_dirty_shared_page---of 15
finish_fault---of 10
finish_mkwrite_fault---of 16
follow_invalidate_pte36%of 51
follow_pfn---of 7
follow_phys82%of 11
follow_pte---of 1
free_pgd_range42%of 82
free_pgtables65%of 14
generic_access_phys---of 7
handle_mm_fault---of 200
insert_page_into_pte_locked.constprop.0---of 16
insert_pfn---of 34
mm_trace_rss_stat---of 9
pfn_valid---of 35
print_bad_pte---of 18
print_bad_pte.cold---of 10
print_vma_addr---of 6
print_vma_addr.cold---of 4
ptlock_alloc100%of 3
ptlock_free100%of 1
remap_pfn_range63%of 62
sync_mm_rss24%of 13
unmap_mapping_page---of 10
unmap_mapping_pages---of 8
unmap_mapping_range---of 1
unmap_page_range35%of 146
unmap_single_vma82%of 11
unmap_vmas73%of 11
validate_page_before_insert---of 8
vm_insert_page---of 16
vm_insert_pages---of 34
vm_iomap_memory---of 6
vm_map_pages---of 1
vm_map_pages_zero---of 1
vm_normal_page54%of 13
vmf_insert_mixed---of 1
vmf_insert_mixed_mkwrite---of 1
vmf_insert_mixed_prot---of 1
vmf_insert_pfn---of 1
vmf_insert_pfn_prot---of 15
walk_to_pmd---of 18
wp_page_copy---of 103
zap_page_range---of 13
zap_page_range_single---of 10
zap_vma_ptes---of 4

__traceiter_x86_fpu_after_restore---of 4
__traceiter_x86_fpu_after_save---of 4
__traceiter_x86_fpu_before_restore---of 4
__traceiter_x86_fpu_before_save---of 4
__traceiter_x86_fpu_copy_dst---of 4
__traceiter_x86_fpu_copy_src---of 4
__traceiter_x86_fpu_dropped---of 4
__traceiter_x86_fpu_init_state---of 4
__traceiter_x86_fpu_regs_activated---of 4
__traceiter_x86_fpu_regs_deactivated---of 4
__traceiter_x86_fpu_xstate_check_failed---of 4
copy_fpregs_to_fpstate---of 10
copy_init_fpstate_to_fpregs---of 6
copy_kernel_to_fpregs63%of 8
fpregs_assert_state_consistent75%of 4
fpregs_mark_activate---of 10
fpstate_init---of 4
fpu__clear---of 12
fpu__clear_all---of 1
fpu__clear_user_states---of 1
fpu__copy---of 26
fpu__drop---of 20
fpu__exception_code---of 9
fpu__prepare_read---of 2
fpu__prepare_write---of 3
fpu__save---of 25
irq_fpu_usable---of 7
kernel_fpu_begin_mask---of 12
kernel_fpu_end---of 3
perf_trace_x86_fpu---of 8
switch_fpu_return54%of 15
trace_event_raw_event_x86_fpu---of 12
trace_raw_output_x86_fpu---of 4

__fsnotify_inode_delete---of 1
__fsnotify_parent21%of 44
__fsnotify_update_child_dentry_flags---of 2
__fsnotify_update_child_dentry_flags.part.0---of 14
__fsnotify_vfsmount_delete---of 1
fsnotify---of 92
fsnotify_first_mark---of 17
fsnotify_handle_inode_event.isra.0---of 12
fsnotify_sb_delete---of 18

__bitmap_and---of 5
__bitmap_andnot---of 5
__bitmap_clear75%of 4
__bitmap_complement---of 2
__bitmap_equal---of 6
__bitmap_intersects---of 6
__bitmap_or---of 2
__bitmap_or_equal---of 6
__bitmap_replace---of 2
__bitmap_set75%of 4
__bitmap_shift_left---of 8
__bitmap_shift_right---of 12
__bitmap_subset---of 6
__bitmap_weight---of 5
__bitmap_xor---of 2
bitmap_alloc---of 1
bitmap_allocate_region---of 5
bitmap_bitremap---of 9
bitmap_cut---of 11
bitmap_find_free_region---of 9
bitmap_find_next_zero_area_off---of 5
bitmap_fold---of 3
bitmap_free---of 1
bitmap_from_arr32---of 6
bitmap_onto---of 5
bitmap_ord_to_pos---of 4
bitmap_parse---of 21
bitmap_parse_user---of 5
bitmap_parselist---of 59
bitmap_parselist_user---of 5
bitmap_print_to_pagebuf---of 4
bitmap_release_region---of 3
bitmap_remap---of 11
bitmap_to_arr32---of 6
bitmap_zalloc---of 1
devm_bitmap_alloc---of 4
devm_bitmap_free---of 1
devm_bitmap_zalloc---of 1
get_order---of 1

___perf_sw_event---of 31
__do_sys_perf_event_open---of 216
__ia32_sys_perf_event_open---of 1
__perf_addr_filters_adjust---of 18
__perf_event__output_id_sample---of 12
__perf_event_disable---of 9
__perf_event_enable---of 21
__perf_event_exit_context---of 11
__perf_event_header__init_id---of 18
__perf_event_init_context---of 1
__perf_event_output_stop---of 11
__perf_event_overflow---of 17
__perf_event_period---of 11
__perf_event_read---of 20
__perf_event_read_size---of 9
__perf_event_read_value---of 3
__perf_event_stop---of 5
__perf_event_task_sched_in---of 26
__perf_event_task_sched_out---of 111
__perf_install_in_context---of 15
__perf_pmu_output_stop---of 15
__perf_pmu_sched_task---of 11
__perf_read_group_add---of 25
__perf_remove_from_context---of 10
__perf_sw_event---of 3
__refcount_add.constprop.0---of 5
__x64_sys_perf_event_open---of 1
_free_event---of 48
_perf_event_disable---of 3
_perf_event_enable---of 7
_perf_event_period---of 9
_perf_event_reset---of 1
_perf_ioctl---of 134
calc_timer_values---of 12
cpu_clock_event_add---of 4
cpu_clock_event_del---of 3
cpu_clock_event_init---of 7
cpu_clock_event_read---of 1
cpu_clock_event_start---of 2
cpu_clock_event_stop---of 3
ctx_resched---of 15
ctx_sched_in.constprop.0---of 18
ctx_sched_out---of 36
event_function---of 18
event_function_call---of 15
event_function_local.constprop.0---of 25
event_sched_in---of 34
event_sched_out---of 30
exclusive_event_destroy---of 4
exclusive_event_installable---of 12
find_get_context---of 47
free_ctx---of 4
free_event---of 3
free_event_rcu---of 3
get_event_type---of 8
get_order---of 1
group_sched_out.part.0---of 7
inherit_event.constprop.0---of 24
inherit_task_group.isra.0---of 27
ktime_get_boottime_ns---of 1
ktime_get_clocktai_ns---of 1
ktime_get_real_ns---of 1
list_add_event---of 15
list_del_event---of 19
local_clock---of 1
merge_sched_in---of 51
nr_addr_filters_show---of 1
perf_addr_filters_splice---of 11
perf_adjust_period---of 31
perf_bp_event---of 4
perf_callchain---of 5
perf_compat_ioctl---of 6
perf_copy_attr---of 45
perf_cpu_time_max_percent_handler---of 8
perf_cpu_time_max_percent_handler.cold---of 1
perf_duration_warn---of 1
perf_duration_warn.cold---of 1
perf_event__header_size---of 18
perf_event__id_header_size---of 12
perf_event__output_id_sample---of 2
perf_event_account_interrupt---of 8
perf_event_addr_filters_apply---of 24
perf_event_addr_filters_exec---of 11
perf_event_addr_filters_sync---of 7
perf_event_alloc.part.0---of 151
perf_event_attrs---of 4
perf_event_aux_event---of 8
perf_event_bpf_event---of 9
perf_event_bpf_output---of 9
perf_event_comm---of 8
perf_event_comm.cold---of 1
perf_event_comm_output---of 20
perf_event_create_kernel_counter---of 18
perf_event_ctx_lock_nested---of 33
perf_event_delayed_put---of 4
perf_event_disable---of 4
perf_event_disable_inatomic---of 1
perf_event_disable_local---of 1
perf_event_enable---of 1
perf_event_exec---of 35
perf_event_exit_cpu---of 1
perf_event_exit_cpu_context---of 3
perf_event_exit_task---of 44
perf_event_for_each_child---of 5
perf_event_fork---of 2
perf_event_free_task---of 21
perf_event_get---of 5
perf_event_groups_insert---of 9
perf_event_header__init_id---of 2
perf_event_idx_default---of 1
perf_event_init_cpu---of 13
perf_event_init_task---of 28
perf_event_itrace_started---of 1
perf_event_ksymbol---of 12
perf_event_ksymbol.cold---of 1
perf_event_ksymbol_output---of 13
perf_event_mmap6%of 58
perf_event_mmap_output---of 27
perf_event_mux_interval_ms_show---of 1
perf_event_mux_interval_ms_store---of 9
perf_event_namespaces---of 2
perf_event_namespaces.part.0---of 1
perf_event_namespaces_output---of 16
perf_event_nop_int---of 1
perf_event_output---of 15
perf_event_output_backward---of 15
perf_event_output_forward---of 15
perf_event_overflow---of 1
perf_event_pause---of 8
perf_event_period---of 1
perf_event_read---of 23
perf_event_read_event---of 14
perf_event_read_local---of 19
perf_event_read_value---of 1
perf_event_refresh---of 4
perf_event_release_kernel---of 50
perf_event_sched_in.isra.0---of 3
perf_event_set_output---of 22
perf_event_set_state.part.0---of 11
perf_event_stop.isra.0---of 4
perf_event_switch_output---of 21
perf_event_sysfs_show---of 3
perf_event_task---of 7
perf_event_task_disable---of 13
perf_event_task_enable---of 7
perf_event_task_output---of 31
perf_event_task_tick---of 36
perf_event_text_poke---of 4
perf_event_text_poke_output---of 22
perf_event_update_time---of 7
perf_event_update_userpage---of 25
perf_event_wakeup---of 3
perf_exclude_event---of 8
perf_fasync---of 1
perf_fill_ns_link_info---of 3
perf_get_aux_event---of 14
perf_get_event---of 4
perf_group_attach---of 10
perf_group_detach---of 40
perf_install_in_context---of 26
perf_ioctl---of 3
perf_iterate_ctx---of 14
perf_iterate_sb---of 37
perf_kprobe_event_init---of 8
perf_lock_task_context---of 63
perf_log_itrace_start---of 17
perf_log_lost_samples---of 8
perf_log_throttle---of 12
perf_mmap---of 83
perf_mmap_close---of 67
perf_mmap_fault---of 30
perf_mmap_open---of 4
perf_mux_hrtimer_handler---of 55
perf_mux_hrtimer_restart_ipi---of 5
perf_output_read---of 57
perf_output_sample---of 96
perf_output_sample_regs---of 3
perf_pending_event---of 12
perf_pmu_cancel_txn---of 3
perf_pmu_commit_txn---of 5
perf_pmu_disable---of 2
perf_pmu_enable---of 2
perf_pmu_migrate_context---of 24
perf_pmu_name---of 1
perf_pmu_nop_int---of 1
perf_pmu_register---of 52
perf_pmu_resched---of 4
perf_pmu_snapshot_aux---of 5
perf_pmu_start_txn---of 3
perf_pmu_unregister---of 9
perf_poll---of 10
perf_prepare_sample---of 93
perf_proc_update_handler---of 9
perf_read---of 35
perf_reboot---of 3
perf_register_guest_info_callbacks---of 5
perf_release---of 1
perf_remove_from_context---of 8
perf_sample_event_took---of 6
perf_sched_cb_dec---of 2
perf_sched_cb_inc---of 3
perf_sched_delayed---of 3
perf_swevent_add---of 14
perf_swevent_del---of 3
perf_swevent_event---of 18
perf_swevent_get_recursion_context---of 3
perf_swevent_hrtimer---of 8
perf_swevent_init---of 23
perf_swevent_put_recursion_context---of 1
perf_swevent_set_period---of 4
perf_swevent_start---of 1
perf_swevent_start_hrtimer.part.0---of 5
perf_swevent_stop---of 1
perf_tp_event---of 45
perf_tp_event_init---of 5
perf_trace_run_bpf_submit---of 4
perf_try_init_event---of 21
perf_unregister_guest_info_callbacks---of 5
perf_uprobe_event_init---of 8
pmu_dev_alloc---of 8
pmu_dev_is_visible---of 5
pmu_dev_release---of 1
put_ctx---of 13
rb_free_rcu---of 1
ref_ctr_offset_show---of 1
remote_function---of 4
retprobe_show---of 1
ring_buffer_attach---of 15
ring_buffer_get---of 30
ring_buffer_put---of 7
ring_buffer_wakeup---of 22
sw_perf_event_destroy---of 5
swevent_hlist_put_cpu---of 9
task_clock_event_add---of 4
task_clock_event_del---of 3
task_clock_event_init---of 7
task_clock_event_read---of 1
task_clock_event_start---of 2
task_clock_event_stop---of 3
task_function_call---of 6
tp_perf_event_destroy---of 1
type_show---of 1
unclone_ctx---of 6
visit_groups_merge.constprop.0.isra.0---of 52

hugetlb_get_unmapped_area---of 35
pmd_huge100%of 3
pud_huge---of 1

__cgroup_account_cputime100%of 1
__cgroup_account_cputime_field---of 4
cgroup_base_stat_cputime_show---of 4
cgroup_rstat_exit---of 7
cgroup_rstat_flush---of 1
cgroup_rstat_flush_hold---of 1
cgroup_rstat_flush_irqsafe---of 1
cgroup_rstat_flush_locked---of 39
cgroup_rstat_flush_release---of 1
cgroup_rstat_init---of 5
cgroup_rstat_updated34%of 6
root_cgroup_cputime---of 3

___p4d_free_tlb---of 4
___pmd_free_tlb---of 6
___pte_free_tlb67%of 3
___pud_free_tlb---of 4
__native_set_fixmap---of 3
__native_set_fixmap.part.0---of 1
native_set_fixmap---of 9
p4d_clear_huge---of 1
p4d_free_pud_page---of 1
p4d_set_huge---of 1
pgd_alloc---of 16
pgd_free---of 4
pgd_page_get_mm---of 1
pmd_clear_huge---of 5
pmd_free_pte_page---of 2
pmd_set_huge---of 13
pmd_set_huge.cold---of 1
preallocate_pmds.constprop.0---of 1
pte_alloc_one60%of 10
ptep_clear_flush_young---of 4
ptep_set_access_flags---of 4
ptep_test_and_clear_young---of 4
pud_clear_huge---of 4
pud_free_pmd_page---of 18
pud_set_huge---of 12

__do_sys_uselib---of 24
__get_task_comm---of 1
__ia32_compat_sys_execve---of 1
__ia32_compat_sys_execveat---of 1
__ia32_sys_execve---of 1
__ia32_sys_execveat---of 1
__ia32_sys_uselib---of 1
__register_binfmt---of 8
__set_task_comm---of 10
__x64_sys_execve---of 1
__x64_sys_execveat---of 1
__x64_sys_uselib---of 1
acct_arg_size---of 3
alloc_bprm---of 23
begin_new_exec---of 98
bprm_change_interp---of 4
bprm_execve---of 109
copy_string_kernel---of 16
copy_string_kernel.cold---of 1
copy_strings.isra.0---of 30
copy_strings_kernel---of 6
count.constprop.0---of 9
count_strings_kernel.part.0---of 6
do_execveat_common---of 21
do_execveat_common.cold---of 1
do_open_execat---of 23
finalize_exec---of 1
free_bprm---of 11
get_arg_page---of 6
get_user_arg_ptr.isra.0---of 6
kernel_execve---of 19
open_exec---of 4
path_noexec100%of 3
remove_arg_zero---of 16
set_binfmt---of 4
set_dumpable---of 4
setup_arg_pages---of 21
setup_arg_pages.cold---of 1
setup_new_exec---of 5
shift_arg_pages---of 14
unregister_binfmt---of 1
would_dump---of 10

__xa_alloc---of 15
__xa_alloc_cyclic---of 10
__xa_clear_mark---of 3
__xa_cmpxchg---of 14
__xa_erase---of 5
__xa_insert---of 13
__xa_set_mark---of 3
__xa_store54%of 13
__xas_next---of 27
__xas_nomem19%of 16
__xas_prev---of 27
xa_clear_mark---of 3
xa_delete_node---of 6
xa_destroy---of 17
xa_erase---of 1
xa_extract---of 74
xa_find---of 21
xa_find_after---of 23
xa_get_mark---of 35
xa_load57%of 16
xa_set_mark---of 3
xa_store100%of 1
xas_alloc73%of 11
xas_clear_mark---of 13
xas_create29%of 87
xas_create_range---of 20
xas_find---of 40
xas_find_conflict---of 47
xas_find_marked---of 70
xas_free_nodes---of 22
xas_get_mark---of 5
xas_init_marks---of 6
xas_load25%of 12
xas_nomem---of 8
xas_pause---of 17
xas_set_mark---of 12
xas_start30%of 30
xas_store18%of 112

__attach_mnt---of 3
__cleanup_mnt---of 1
__detach_mounts---of 13
__do_loopback---of 14
__do_sys_fsmount---of 42
__do_sys_fsmount.cold---of 2
__do_sys_pivot_root---of 52
__ia32_sys_fsmount---of 1
__ia32_sys_mount---of 15
__ia32_sys_move_mount---of 18
__ia32_sys_oldumount---of 3
__ia32_sys_open_tree---of 21
__ia32_sys_pivot_root---of 1
__ia32_sys_umount---of 6
__is_local_mountpoint---of 5
__legitimize_mnt40%of 10
__lookup_mnt---of 5
__mnt_drop_write---of 1
__mnt_drop_write_file---of 1
__mnt_is_readonly---of 3
__mnt_want_write---of 7
__mnt_want_write_file---of 8
__put_mountpoint.part.0---of 5
__x64_sys_fsmount---of 1
__x64_sys_mount---of 15
__x64_sys_move_mount---of 18
__x64_sys_oldumount---of 3
__x64_sys_open_tree---of 21
__x64_sys_pivot_root---of 1
__x64_sys_umount---of 6
alloc_mnt_ns---of 12
alloc_vfsmnt---of 10
attach_mnt---of 5
attach_recursive_mnt---of 41
can_change_locked_flags.isra.0---of 13
cleanup_group_ids---of 11
cleanup_mnt---of 19
clone_mnt---of 28
clone_private_mount---of 11
collect_mounts---of 6
commit_tree---of 9
copy_mnt_ns---of 45
copy_mount_options---of 8
copy_tree---of 35
count_mounts---of 12
current_chrooted---of 6
delayed_free_vfsmnt---of 1
delayed_mntput---of 2
dissolve_on_fput---of 5
do_add_mount---of 8
do_mount---of 4
do_move_mount---of 50
drop_collected_mounts---of 1
fc_mount---of 5
finish_automount---of 46
free_mnt_ns---of 6
from_mnt_ns---of 1
get_mountpoint---of 17
graft_tree---of 7
invent_group_ids---of 13
is_path_reachable---of 5
iterate_mounts---of 5
kern_mount---of 4
kern_unmount---of 5
kern_unmount_array---of 11
legitimize_mnt---of 23
lock_mnt_tree---of 16
lock_mount---of 10
lookup_mnt---of 21
lookup_mountpoint---of 5
m_next---of 5
m_show---of 1
m_start---of 10
m_stop---of 4
mark_mounts_for_expiry---of 13
may_umount---of 1
may_umount_tree---of 11
mnt_change_mountpoint---of 13
mnt_clone_internal---of 3
mnt_clone_write---of 5
mnt_cursor_del---of 1
mnt_drop_write---of 9
mnt_drop_write_file---of 9
mnt_get_count---of 3
mnt_may_suid---of 4
mnt_release_group_id---of 1
mnt_set_expiry---of 1
mnt_set_mountpoint---of 3
mnt_want_write---of 19
mnt_want_write_file---of 23
mnt_warn_timestamp_expiry---of 7
mnt_warn_timestamp_expiry.cold---of 2
mntget100%of 3
mntns_get---of 3
mntns_install---of 9
mntns_owner---of 1
mntns_put---of 1
mntput75%of 4
mntput_no_expire18%of 57
mount_subtree---of 13
mount_too_revealing---of 23
namespace_unlock---of 18
open_detached_copy---of 19
our_mnt---of 1
path_is_mountpoint---of 23
path_is_under---of 5
path_mount---of 139
path_mount.cold---of 1
path_umount---of 58
put_mnt_ns---of 2
sb_prepare_remount_readonly---of 16
umount_tree---of 35
unhash_mnt---of 9
vfs_create_mount---of 9
vfs_kern_mount---of 3
vfs_kern_mount.part.0---of 9
vfs_submount---of 4