5 


    5 











































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
// SPDX-License-Identifier: GPL-2.0-or-later
/* Common capabilities, needed by capability.o.
 */

#include <linux/capability.h>
#include <linux/audit.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/lsm_hooks.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/ptrace.h>
#include <linux/xattr.h>
#include <linux/hugetlb.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/securebits.h>
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
#include <linux/personality.h>

/*
 * When root is privileged (!secure(SECURE_NOROOT)), a setuid-root binary
 * executed by a non-root user normally has its capabilities raised.  If the
 * file also carries the effective (fE) flag, the file capabilities are taken
 * as the complete statement of intent and the setuid-root bit is assumed to
 * be there only to switch uid, or to grant full privilege on kernels built
 * without file capability support.  Capabilities are therefore not raised.
 *
 * Log that situation, at most once per boot, naming the offending file.
 */
static void warn_setuid_and_fcaps_mixed(const char *fname)
{
        static int already_warned;

        if (already_warned)
                return;

        printk(KERN_INFO "warning: `%s' has both setuid-root and"
                " effective capabilities. Therefore not raising all"
                " capabilities.\n", fname);
        already_warned = 1;
}

/**
 * cap_capable - Determine whether a task has a particular effective capability
 * @cred: The credentials to use
 * @targ_ns: The user namespace in which we need the capability
 * @cap: The capability to check for
 * @opts: Bitmask of options defined in include/linux/security.h (not
 *        consulted by this implementation)
 *
 * Walk from @targ_ns up through its parents looking for a namespace in which
 * @cred holds @cap in its effective set, or of which @cred's euid is the
 * owner.  Returns 0 if the capability is held, -EPERM if it is not.
 *
 * NOTE WELL: cap_capable() cannot be used like the kernel's capable() and
 * has_capability() functions.  It has the reverse semantics: cap_capable()
 * returns 0 when the capability is held, whereas capable() and
 * has_capability() return 1 in that case.
 */
int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
                int cap, unsigned int opts)
{
        struct user_namespace *ns;

        /*
         * Climb from the target namespace towards the namespace the
         * credentials live in.  Holding a capability in a parent namespace
         * implies holding it in every descendant.
         */
        for (ns = targ_ns; ns != cred->user_ns; ns = ns->parent) {
                /*
                 * Already at or above the level of cred's namespace without
                 * having met it: it is not an ancestor, stop searching.
                 */
                if (ns->level <= cred->user_ns->level)
                        return -EPERM;

                /*
                 * The owner of a user namespace has all capabilities in it
                 * when acting from that namespace's parent.
                 */
                if (ns->parent == cred->user_ns &&
                    uid_eq(ns->owner, cred->euid))
                        return 0;
        }

        /* Reached cred's own namespace: the effective set decides. */
        return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
}

/**
 * cap_settime - Determine whether the current process may set the system clock
 * @ts: The time to set
 * @tz: The timezone to set
 *
 * Permission depends solely on the current process holding CAP_SYS_TIME;
 * neither @ts nor @tz is inspected.  Returns 0 if permission is granted,
 * -EPERM if it is denied.
 */
int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
{
        return capable(CAP_SYS_TIME) ? 0 : -EPERM;
}

/**
 * cap_ptrace_access_check - Determine whether the current process may access
 *                           another
 * @child: The process to be accessed
 * @mode: The mode of attachment.
 *
 * If we are in the same or an ancestor user_ns and have all the target
 * task's capabilities, then ptrace access is allowed.
 * If we have the ptrace capability to the target user_ns, then ptrace
 * access is allowed.
 * Else denied.
 *
 * Determine whether a process may access another, returning 0 if permission
 * granted, -ve if denied.
 */
int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        int ret = 0;
        const struct cred *cred, *child_cred;
        const kernel_cap_t *caller_caps;

        rcu_read_lock();
        cred = current_cred();
        child_cred = __task_cred(child);
        if (mode & PTRACE_MODE_FSCREDS)
                caller_caps = &cred->cap_effective;
        else
                caller_caps = &cred->cap_permitted;
        if (cred->user_ns == child_cred->user_ns &&
            cap_issubset(child_cred->cap_permitted, *caller_caps))
                goto out;
        if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
                goto out;
        ret = -EPERM;
out:
        rcu_read_unlock();
        return ret;
}

/**
 * cap_ptrace_traceme - Determine whether another process may trace the current
 * @parent: The task proposed to be the tracer
 *
 * If parent is in the same or an ancestor user_ns and has all current's
 * capabilities, then ptrace access is allowed.
 * If parent has the ptrace capability to current's user_ns, then ptrace
 * access is allowed.
 * Else denied.
 *
 * Determine whether the nominated task is permitted to trace the current
 * process, returning 0 if permission is granted, -ve if denied.
 */
int cap_ptrace_traceme(struct task_struct *parent)
{
        int ret = 0;
        const struct cred *cred, *child_cred;

        rcu_read_lock();
        cred = __task_cred(parent);
        child_cred = current_cred();
        if (cred->user_ns == child_cred->user_ns &&
            cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
                goto out;
        if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
                goto out;
        ret = -EPERM;
out:
        rcu_read_unlock();
        return ret;
}

/**
 * cap_capget - Retrieve a task's capability sets
 * @target: The task from which to retrieve the capability sets
 * @effective: The place to record the effective set
 * @inheritable: The place to record the inheritable set
 * @permitted: The place to record the permitted set
 *
 * Copy the three capability sets out of @target's credentials while holding
 * the RCU read lock.  Always returns 0.
 */
int cap_capget(struct task_struct *target, kernel_cap_t *effective,
               kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        const struct cred *tcred;

        /* Derived from kernel/capability.c:sys_capget. */
        rcu_read_lock();

        tcred = __task_cred(target);
        *effective = tcred->cap_effective;
        *inheritable = tcred->cap_inheritable;
        *permitted = tcred->cap_permitted;

        rcu_read_unlock();

        return 0;
}

/*
 * Determine whether the inheritable capabilities are limited to the old
 * permitted set.  Returns 1 if they are limited, 0 if they are not.
 */
static inline int cap_inh_is_capped(void)
{
        /* they are so limited unless the current task has the CAP_SETPCAP
         * capability
         */
        if (cap_capable(current_cred(), current_cred()->user_ns,
                        CAP_SETPCAP, CAP_OPT_NONE) == 0)
                return 0;
        return 1;
}

/**
 * cap_capset - Validate and apply proposed changes to current's capabilities
 * @new: The proposed new credentials; alterations should be made here
 * @old: The current task's current credentials
 * @effective: A pointer to the proposed new effective capabilities set
 * @inheritable: A pointer to the proposed new inheritable capabilities set
 * @permitted: A pointer to the proposed new permitted capabilities set
 *
 * Check the proposed sets against @old and, if every rule is satisfied,
 * store them in @new and trim @new's ambient set accordingly.
 *
 * Returns 0 on success, -EPERM if a rule is violated, or -EINVAL if the
 * ambient-set invariant does not hold on the result.
 */
int cap_capset(struct cred *new,
               const struct cred *old,
               const kernel_cap_t *effective,
               const kernel_cap_t *inheritable,
               const kernel_cap_t *permitted)
{
        kernel_cap_t ceiling;

        /* Without CAP_SETPCAP, pI' may only draw on old pI and old pP. */
        if (cap_inh_is_capped()) {
                ceiling = cap_combine(old->cap_inheritable,
                                      old->cap_permitted);
                if (!cap_issubset(*inheritable, ceiling))
                        return -EPERM;
        }

        /* No new pI capabilities outside the bounding set. */
        ceiling = cap_combine(old->cap_inheritable, old->cap_bset);
        if (!cap_issubset(*inheritable, ceiling))
                return -EPERM;

        /* The permitted set may only shrink. */
        if (!cap_issubset(*permitted, old->cap_permitted))
                return -EPERM;

        /* The new effective set must fit inside the new permitted set. */
        if (!cap_issubset(*effective, *permitted))
                return -EPERM;

        new->cap_effective = *effective;
        new->cap_inheritable = *inheritable;
        new->cap_permitted = *permitted;

        /*
         * Ambient bits survive only while they are both permitted and
         * inheritable.
         */
        new->cap_ambient = cap_intersect(new->cap_ambient,
                                         cap_intersect(*permitted,
                                                       *inheritable));

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EINVAL;

        return 0;
}

/**
 * cap_inode_need_killpriv - Determine if inode change affects privileges
 * @dentry: The inode/dentry being changed with change marked ATTR_KILL_PRIV
 *
 * Probe the security.capability xattr of @dentry's backing inode with a
 * zero-length buffer, i.e. ask only for its size.
 *
 * Returns 1 if the xattr has a value, meaning inode_killpriv() is required,
 * 0 otherwise (including when the probe fails).
 */
int cap_inode_need_killpriv(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);
        int len = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0);

        return len > 0;
}

/**
 * cap_inode_killpriv - Erase the security markings on an inode
 * @dentry: The inode/dentry to alter
 *
 * Remove the security.capability xattr from @dentry.  -EOPNOTSUPP from the
 * removal is treated as success.
 *
 * Returns 0 if successful, -ve on error.
 */
int cap_inode_killpriv(struct dentry *dentry)
{
        int rc = __vfs_removexattr(dentry, XATTR_NAME_CAPS);

        return rc == -EOPNOTSUPP ? 0 : rc;
}

/*
 * Return true if @kroot is valid and maps to uid 0 in the current user
 * namespace or in any of its ancestors up to and including init_user_ns.
 */
static bool rootid_owns_currentns(kuid_t kroot)
{
        struct user_namespace *ns;

        if (!uid_valid(kroot))
                return false;

        ns = current_user_ns();
        while (from_kuid(ns, kroot) != 0) {
                /* Top of the hierarchy reached without a match. */
                if (ns == &init_user_ns)
                        return false;
                ns = ns->parent;
        }

        return true;
}

/* Strip the VFS_CAP_FLAGS_EFFECTIVE bit from a host-order magic_etc word. */
static __u32 sansflags(__u32 m)
{
        __u32 without_effective = m & ~VFS_CAP_FLAGS_EFFECTIVE;

        return without_effective;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_2 and the header's revision
 * (ignoring the effective flag) is VFS_CAP_REVISION_2.  @cap is not read
 * unless the size matches.
 */
static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_2 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_3 and the header's revision
 * (ignoring the effective flag) is VFS_CAP_REVISION_3.  @cap is not read
 * unless the size matches.
 */
static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_3 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
}

/*
 * getsecurity: We are called for security.* before any attempt to read the
 * xattr from the inode itself.
 *
 * This gives us a chance to read the on-disk value and convert it.  If we
 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
 *
 * Note we are not called by vfs_getxattr_alloc(), but that is only called
 * by the integrity subsystem, which really wants the unconverted values -
 * so that's good.
 *
 * @inode:  inode whose security.capability xattr is being read
 * @name:   xattr name with the "security." prefix already removed; anything
 *          other than "capability" is rejected with -EOPNOTSUPP
 * @buffer: when @alloc is true, receives a pointer to the converted xattr
 *          value (NOTE(review): presumably the caller frees it - confirm)
 * @alloc:  if false only the resulting size is computed and *@buffer is
 *          left untouched
 *
 * Returns the size of the (converted) value on success, or a negative errno:
 * -EOPNOTSUPP for a foreign name, -EINVAL if no dentry alias exists or the
 * stored header is neither v2 nor v3, -ENOMEM if a conversion buffer cannot
 * be allocated, -EOVERFLOW if the stored root id does not own the current
 * user namespace, or whatever vfs_getxattr_alloc() failed with.
 */
int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
                          bool alloc)
{
        int size, ret;
        kuid_t kroot;
        u32 nsmagic, magic;
        uid_t root, mappedroot;
        char *tmpbuf = NULL;
        struct vfs_cap_data *cap;
        struct vfs_ns_cap_data *nscap = NULL;
        struct dentry *dentry;
        struct user_namespace *fs_ns;

        if (strcmp(name, "capability") != 0)
                return -EOPNOTSUPP;

        dentry = d_find_any_alias(inode);
        if (!dentry)
                return -EINVAL;

        /* Read the raw on-disk value; the dentry is only needed for this. */
        size = sizeof(struct vfs_ns_cap_data);
        ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
                                 &tmpbuf, size, GFP_NOFS);
        dput(dentry);

        if (ret < 0 || !tmpbuf) {
                size = ret;
                goto out_free;
        }

        /*
         * Classify the stored value.  A v2 value carries no root id, so the
         * root is uid 0 of the filesystem's user namespace; a v3 value names
         * its root id explicitly.  nscap stays NULL for v2, which is what the
         * conversion code below keys on.
         */
        fs_ns = inode->i_sb->s_user_ns;
        cap = (struct vfs_cap_data *) tmpbuf;
        if (is_v2header((size_t) ret, cap)) {
                root = 0;
        } else if (is_v3header((size_t) ret, cap)) {
                nscap = (struct vfs_ns_cap_data *) tmpbuf;
                root = le32_to_cpu(nscap->rootid);
        } else {
                size = -EINVAL;
                goto out_free;
        }

        kroot = make_kuid(fs_ns, root);

        /* If the root kuid maps to a valid uid in current ns, then return
         * this as a nscap. */
        mappedroot = from_kuid(current_user_ns(), kroot);
        if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
                size = sizeof(struct vfs_ns_cap_data);
                if (alloc) {
                        if (!nscap) {
                                /* v2 -> v3 conversion */
                                nscap = kzalloc(size, GFP_ATOMIC);
                                if (!nscap) {
                                        size = -ENOMEM;
                                        goto out_free;
                                }
                                nsmagic = VFS_CAP_REVISION_3;
                                magic = le32_to_cpu(cap->magic_etc);
                                if (magic & VFS_CAP_FLAGS_EFFECTIVE)
                                        nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
                                memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                                nscap->magic_etc = cpu_to_le32(nsmagic);
                        } else {
                                /* use allocated v3 buffer */
                                /*
                                 * nscap aliases tmpbuf here; clearing tmpbuf
                                 * makes the kfree() at out_free a no-op so
                                 * the buffer handed out below stays alive.
                                 */
                                tmpbuf = NULL;
                        }
                        /* Report the root id as seen from the current ns. */
                        nscap->rootid = cpu_to_le32(mappedroot);
                        *buffer = nscap;
                }
                goto out_free;
        }

        if (!rootid_owns_currentns(kroot)) {
                size = -EOVERFLOW;
                goto out_free;
        }

        /* This comes from a parent namespace.  Return as a v2 capability */
        size = sizeof(struct vfs_cap_data);
        if (alloc) {
                if (nscap) {
                        /* v3 -> v2 conversion */
                        cap = kzalloc(size, GFP_ATOMIC);
                        if (!cap) {
                                size = -ENOMEM;
                                goto out_free;
                        }
                        magic = VFS_CAP_REVISION_2;
                        nsmagic = le32_to_cpu(nscap->magic_etc);
                        if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
                                magic |= VFS_CAP_FLAGS_EFFECTIVE;
                        memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                        cap->magic_etc = cpu_to_le32(magic);
                } else {
                        /* use unconverted v2 */
                        /*
                         * cap aliases tmpbuf here; clearing tmpbuf keeps the
                         * kfree() at out_free from releasing the buffer that
                         * is handed out below.
                         */
                        tmpbuf = NULL;
                }
                *buffer = cap;
        }
out_free:
        /* Frees the raw read buffer unless it was handed out via *buffer. */
        kfree(tmpbuf);
        return size;
}

/*
 * Extract the root id from a security.capability xattr value and map it
 * into a kuid through @task_ns.  Only a value of size XATTR_CAPS_SZ_3
 * carries a root id; any other size is treated as root id 0.
 */
static kuid_t rootid_from_xattr(const void *value, size_t size,
                                struct user_namespace *task_ns)
{
        const struct vfs_ns_cap_data *v3 = value;

        if (size != XATTR_CAPS_SZ_3)
                return make_kuid(task_ns, 0);

        return make_kuid(task_ns, le32_to_cpu(v3->rootid));
}

/* Accept only xattr values whose size and revision are consistently v2 or v3. */
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
        if (is_v2header(size, cap))
                return true;

        return is_v3header(size, cap);
}

/*
 * User requested a write of security.capability.  If needed, update the
 * xattr to change from v2 to v3, or to fixup the v3 rootid.
 *
 * On conversion the buffer at *ivalue is freed and replaced by a newly
 * allocated v3 value whose rootid is expressed in the filesystem's user
 * namespace.  A v2 value written by a task holding CAP_SETFCAP in the
 * filesystem's user namespace is left untouched.
 *
 * If all is ok, we return the new size, on error return < 0.
 */
int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
{
        struct inode *inode = d_backing_inode(dentry);
        struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
        struct user_namespace *task_ns = current_user_ns();
        const struct vfs_cap_data *oldcap = *ivalue;
        struct vfs_ns_cap_data *newcap;
        kuid_t kroot;
        uid_t fsroot;
        __u32 newmagic;

        if (!oldcap)
                return -EINVAL;
        if (!validheader(size, oldcap))
                return -EINVAL;
        if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
                return -EPERM;

        /* user is privileged, just write the v2 */
        if (size == XATTR_CAPS_SZ_2 && ns_capable(fs_ns, CAP_SETFCAP))
                return size;

        /* Interpret the root id in the writer's namespace... */
        kroot = rootid_from_xattr(oldcap, size, task_ns);
        if (!uid_valid(kroot))
                return -EINVAL;

        /* ...and re-express it in the filesystem's namespace. */
        fsroot = from_kuid(fs_ns, kroot);
        if (fsroot == (uid_t)-1)
                return -EINVAL;

        newcap = kmalloc(sizeof(struct vfs_ns_cap_data), GFP_ATOMIC);
        if (!newcap)
                return -ENOMEM;

        /* Carry the effective flag over into the v3 header. */
        newmagic = VFS_CAP_REVISION_3;
        if (le32_to_cpu(oldcap->magic_etc) & VFS_CAP_FLAGS_EFFECTIVE)
                newmagic |= VFS_CAP_FLAGS_EFFECTIVE;

        newcap->magic_etc = cpu_to_le32(newmagic);
        newcap->rootid = cpu_to_le32(fsroot);
        memcpy(&newcap->data, &oldcap->data, sizeof(__le32) * 2 * VFS_CAP_U32);

        /* Swap the caller's buffer for the converted one. */
        kvfree(*ivalue);
        *ivalue = newcap;

        return sizeof(struct vfs_ns_cap_data);
}

/*
 * Calculate the new process capability sets from the capability sets attached
 * to a file.
 *
 * Sets *effective when the file has the effective flag and *has_fcap when it
 * has a non-zero revision; neither is ever cleared here.  The permitted set
 * of bprm->cred is overwritten word by word.
 *
 * Returns -EPERM when *effective is set and some file-permitted capability
 * could not be granted, 0 otherwise.
 */
static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
                                          struct linux_binprm *bprm,
                                          bool *effective,
                                          bool *has_fcap)
{
        struct cred *cred = bprm->cred;
        bool short_of_forced = false;
        unsigned word;

        if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
                *effective = true;

        if (caps->magic_etc & VFS_CAP_REVISION_MASK)
                *has_fcap = true;

        CAP_FOR_EACH_U32(word) {
                __u32 fP = caps->permitted.cap[word];
                __u32 fI = caps->inheritable.cap[word];
                /*
                 * pP' = (X & fP) | (pI & fI)
                 * The addition of pA' is handled later.
                 */
                __u32 pP = (cred->cap_bset.cap[word] & fP) |
                           (cred->cap_inheritable.cap[word] & fI);

                cred->cap_permitted.cap[word] = pP;

                /* A forced (file-permitted) bit was masked out. */
                if (fP & ~pP)
                        short_of_forced = true;
        }

        /*
         * Only legacy apps (effective flag set), which cannot notice missing
         * capabilities on their own, are refused when a forced capability is
         * unavailable.
         */
        if (!*effective)
                return 0;

        return short_of_forced ? -EPERM : 0;
}

/*
 * Extract the on-exec-apply capability sets for an executable file.
 */
int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
{
        struct inode *inode = d_backing_inode(dentry);
        __u32 magic_etc;
        unsigned tocopy, i;
        int size;
        struct vfs_ns_cap_data data, *nscaps = &data;
        struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
        kuid_t rootkuid;
        struct user_namespace *fs_ns;

        memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

        if (!inode)
                return -ENODATA;

        fs_ns = inode->i_sb->s_user_ns;
        size = __vfs_getxattr((struct dentry *)dentry, inode,
                              XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
        if (size == -ENODATA || size == -EOPNOTSUPP)
                /* no data, that's ok */
                return -ENODATA;

        if (size < 0)
                return size;

        if (size < sizeof(magic_etc))
                return -EINVAL;

        cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);

        rootkuid = make_kuid(fs_ns, 0);
        switch (magic_etc & VFS_CAP_REVISION_MASK) {
        case VFS_CAP_REVISION_1:
                if (size != XATTR_CAPS_SZ_1)
                        return -EINVAL;
                tocopy = VFS_CAP_U32_1;
                break;
        case VFS_CAP_REVISION_2:
                if (size != XATTR_CAPS_SZ_2)
                        return -EINVAL;
                tocopy = VFS_CAP_U32_2;
                break;
        case VFS_CAP_REVISION_3:
                if (size != XATTR_CAPS_SZ_3)
                        return -EINVAL;
                tocopy = VFS_CAP_U32_3;
                rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
                break;

        default:
                return -EINVAL;
        }
        /* Limit the caps to the mounter of the filesystem
         * or the more limited uid specified in the xattr.
         */
        if (!rootid_owns_currentns(rootkuid))
                return -ENODATA;

        CAP_FOR_EACH_U32(i) {
                if (i >= tocopy)
                        break;
                cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
                cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
        }

        cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
        cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;

        cpu_caps->rootid = rootkuid;

        return 0;
}

/*
 * Attempt to get the on-exec apply capability sets for an executable file from
 * its xattrs and, if present, apply them to the proposed credentials being
 * constructed by execve().
 *
 * The permitted set of bprm->cred is cleared up front and cleared again
 * whenever a non-zero result is returned.  A file without capability data
 * is not an error.
 */
static int get_file_caps(struct linux_binprm *bprm, struct file *file,
                         bool *effective, bool *has_fcap)
{
        struct cpu_vfs_cap_data vcaps;
        int rc;

        cap_clear(bprm->cred->cap_permitted);

        if (!file_caps_enabled)
                return 0;

        if (!mnt_may_suid(file->f_path.mnt))
                return 0;

        /*
         * This check is redundant with mnt_may_suid() but is kept to make
         * explicit that capability bits are limited to s_user_ns and its
         * descendants.
         */
        if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
                return 0;

        rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps);

        /* No capability xattr: nothing to apply, permitted stays clear. */
        if (rc == -ENODATA)
                return 0;

        if (rc == -EINVAL)
                printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
                                bprm->filename);

        if (rc >= 0)
                rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);

        if (rc)
                cap_clear(bprm->cred->cap_permitted);

        return rc;
}

static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); }

/* Does @cred's real uid equal @uid? */
static inline bool __is_real(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->uid, uid);
}

/* Does @cred's effective uid equal @uid? */
static inline bool __is_eff(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->euid, uid);
}

/* True when @uid is @cred's effective uid but not its real uid. */
static inline bool __is_suid(kuid_t uid, struct cred *cred)
{
        if (__is_real(uid, cred))
                return false;

        return __is_eff(uid, cred);
}

/*
 * handle_privileged_root - Handle case of privileged root
 * @bprm: The execution parameters, including the proposed creds
 * @has_fcap: Are any file capabilities set?
 * @effective: Do we have effective root privilege?
 * @root_uid: This namespace' root UID WRT initial USER namespace
 *
 * Does nothing when root has been neutered by SECURE_NOROOT.  Otherwise:
 * a set-UID-root binary that also carries file capabilities is left alone
 * (a warning is logged); if the new real or effective uid is root, the
 * permitted set becomes bounding | inheritable of the current creds; and if
 * the new effective uid is root, *@effective is set.
 */
static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
                                   bool *effective, kuid_t root_uid)
{
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool euid_is_root, ruid_is_root;

        if (!root_privileged())
                return;

        euid_is_root = __is_eff(root_uid, new);
        ruid_is_root = __is_real(root_uid, new);

        /*
         * If the legacy file capability is set, then don't set privs
         * for a setuid root binary run by a non-root user.  Do set it
         * for a root user just to cause least surprise to an admin.
         */
        if (has_fcap && euid_is_root && !ruid_is_root) {
                warn_setuid_and_fcaps_mixed(bprm->filename);
                return;
        }

        /*
         * To support inheritance of root-permissions and suid-root
         * executables under compatibility mode, we override the
         * capability sets for the file.
         *
         * pP' = (cap_bset & ~0) | (pI & ~0)
         */
        if (euid_is_root || ruid_is_root)
                new->cap_permitted = cap_combine(old->cap_bset,
                                                 old->cap_inheritable);

        /* If only the real uid is 0, we do not set the effective bit. */
        if (euid_is_root)
                *effective = true;
}

/*
 * Capability-set comparison helpers.  Each takes pointer expressions to a
 * cred and pastes the set name onto "cap_".
 *
 * __cap_gained(field, target, source): @target's cap_<field> holds a bit
 *	that @source's cap_<field> lacks.
 * __cap_grew(target, source, cred): within one @cred, cap_<target> holds a
 *	bit that cap_<source> lacks.
 * __cap_full(field, cred): @cred's cap_<field> contains CAP_FULL_SET.
 *
 * Pointer arguments and the negated expansions are parenthesized so that
 * arguments such as "&x" and surrounding operators bind as expected.
 */
#define __cap_gained(field, target, source) \
        (!cap_issubset((target)->cap_##field, (source)->cap_##field))
#define __cap_grew(target, source, cred) \
        (!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
#define __cap_full(field, cred) \
        cap_issubset(CAP_FULL_SET, (cred)->cap_##field)

/* Does the proposed effective UID differ from the old real UID? */
static inline bool __is_setuid(struct cred *new, const struct cred *old)
{
        return !uid_eq(new->euid, old->uid);
}

/* Does the proposed effective GID differ from the old real GID? */
static inline bool __is_setgid(struct cred *new, const struct cred *old)
{
        return !gid_eq(new->egid, old->gid);
}

/*
 * nonroot_raised_pE - should this exec be reported to audit?
 * @new: the proposed credentials
 * @old: the credentials currently held
 * @root: the root UID to test against
 * @has_fcap: whether file capabilities were found
 *
 * Returns true as soon as one of the following holds:
 *
 * 1) pE' contains a capability that pA' does not, unless this is just a
 *    normal privileged root exec, i.e. all of:
 *      - pE' has all caps,
 *      - the real or effective UID is @root,
 *      - root_privileged() (SECURE_NOROOT is not set).
 *    The "all caps" test may fail without a full bset; that is considered
 *    interesting enough to audit.
 * 2) root is privileged and the exec is setuid-@root, yet pE' is not full
 *    (something prevented setuid root getting all caps).
 * 3) the exec is not setuid and, with file caps present, pP' gained bits.
 * 4) the exec is not setuid and pA' gained bits.
 */
static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
                                     kuid_t root, bool has_fcap)
{
        bool plain_root;

        /* Case 1 */
        if (__cap_grew(effective, ambient, new)) {
                plain_root = __cap_full(effective, new) &&
                             (__is_eff(root, new) || __is_real(root, new)) &&
                             root_privileged();
                if (!plain_root)
                        return true;
        }

        /* Case 2 */
        if (root_privileged() && __is_suid(root, new) &&
            !__cap_full(effective, new))
                return true;

        /* Cases 3 and 4 only apply to a non-setuid exec */
        if (__is_setuid(new, old))
                return false;

        if (has_fcap && __cap_gained(permitted, new, old))
                return true;

        return __cap_gained(ambient, new, old);
}

/**
 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 * @bprm: The execution parameters, including the proposed creds
 * @file: The file to pull the credentials from
 *
 * Set up the proposed credentials for a new execution context being
 * constructed by execve().  The proposed creds in @bprm->cred are altered,
 * which won't take effect immediately.  Besides the creds, this may also
 * set bits in @bprm->per_clear and set @bprm->secureexec.
 *
 * Returns 0 if successful, -ve on error: -EPERM when an ambient-set
 * invariant check fails, or the negative result of get_file_caps() or
 * audit_log_bprm_fcaps().
 */
int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
{
        /* Process setpcap binaries and capabilities for uid 0 */
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool effective = false, has_fcap = false, is_setid;
        int ret;
        kuid_t root_uid;

        /* Refuse to start from credentials that already break the invariant */
        if (WARN_ON(!cap_ambient_invariant_ok(old)))
                return -EPERM;

        /* May update new->cap_permitted and set effective / has_fcap */
        ret = get_file_caps(bprm, file, &effective, &has_fcap);
        if (ret < 0)
                return ret;

        /* UID 0 as seen from the user namespace of the new creds */
        root_uid = make_kuid(new->user_ns, 0);

        handle_privileged_root(bprm, has_fcap, &effective, root_uid);

        /* if we have fs caps, clear dangerous personality flags */
        if (__cap_gained(permitted, new, old))
                bprm->per_clear |= PER_CLEAR_ON_SETID;

        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
         * credentials unless they have the appropriate permit.
         *
         * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
         */
        is_setid = __is_setuid(new, old) || __is_setgid(new, old);

        if ((is_setid || __cap_gained(permitted, new, old)) &&
            ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
             !ptracer_capable(current, new->user_ns))) {
                /* downgrade; they get no more than they had, and maybe less */
                if (!ns_capable(new->user_ns, CAP_SETUID) ||
                    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
                        new->euid = new->uid;
                        new->egid = new->gid;
                }
                new->cap_permitted = cap_intersect(new->cap_permitted,
                                                   old->cap_permitted);
        }

        /* Saved and filesystem IDs follow the (possibly downgraded) effective IDs */
        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;

        /*
         * File caps or setid cancels ambient.  Note that is_setid was
         * computed before the downgrade above and is not recomputed.
         */
        if (has_fcap || is_setid)
                cap_clear(new->cap_ambient);

        /*
         * Now that we've computed pA', update pP' to give:
         *   pP' = (X & fP) | (pI & fI) | pA'
         */
        new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);

        /*
         * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
         * this is the same as pE' = (fE ? pP' : 0) | pA'.
         */
        if (effective)
                new->cap_effective = new->cap_permitted;
        else
                new->cap_effective = new->cap_ambient;

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Emit an audit record for the cases nonroot_raised_pE() selects */
        if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
                ret = audit_log_bprm_fcaps(bprm, new, old);
                if (ret < 0)
                        return ret;
        }

        /* SECURE_KEEP_CAPS is always cleared in the new creds */
        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Check for privilege-elevated exec. */
        if (is_setid ||
            (!__is_real(root_uid, new) &&
             (effective ||
              __cap_grew(permitted, ambient, new))))
                bprm->secureexec = 1;

        return 0;
}

/**
 * cap_inode_setxattr - Determine whether an xattr may be altered
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 * @value: The value that the xattr will be changed to (not examined here)
 * @size: The size of value (not examined here)
 * @flags: The replacement flag (not examined here)
 *
 * Gatekeeper for setting xattrs in the security namespace so that they are
 * not updated or set by those who aren't privileged to do so.
 *
 * Returns 0 if permission is granted, -EPERM if denied.
 */
int cap_inode_setxattr(struct dentry *dentry, const char *name,
                       const void *value, size_t size, int flags)
{
        struct user_namespace *user_ns = dentry->d_sb->s_user_ns;

        /* Anything outside the security prefix is not our business */
        if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
                return 0;

        /*
         * XATTR_NAME_CAPS is let through: that check will be done in
         * cap_convert_nscap(), called by setxattr()
         */
        if (!strcmp(name, XATTR_NAME_CAPS))
                return 0;

        /* Every other security xattr needs CAP_SYS_ADMIN in the sb's userns */
        return ns_capable(user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;
}

/**
 * cap_inode_removexattr - Determine whether an xattr may be removed
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 *
 * Gatekeeper for removing xattrs in the security namespace so that they
 * don't get removed by those who aren't privileged to remove them.
 *
 * Returns 0 if permission is granted, -EPERM if denied, or -EINVAL when
 * XATTR_NAME_CAPS is being removed and @dentry has no backing inode.
 */
int cap_inode_removexattr(struct dentry *dentry, const char *name)
{
        struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
        struct inode *inode;

        /* Anything outside the security prefix is not our business */
        if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
                return 0;

        /* Ordinary security xattrs need CAP_SYS_ADMIN in the sb's userns */
        if (strcmp(name, XATTR_NAME_CAPS))
                return ns_capable(user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;

        /* security.capability gets namespaced */
        inode = d_backing_inode(dentry);
        if (!inode)
                return -EINVAL;

        return capable_wrt_inode_uidgid(inode, CAP_SETFCAP) ? 0 : -EPERM;
}

/*
 * cap_emulate_setxuid() adjusts the effective / permitted capabilities of
 * a process after a call to setuid, setreuid, or setresuid.
 *
 *  1) Going _from_ at least one of {r,e,s}uid == 0 _to_ all of
 *     {r,e,s}uid != 0 clears the permitted and effective sets, unless
 *     SECURE_KEEP_CAPS is set.  The ambient set is cleared in either case.
 *
 *  2) Going _from_ euid == 0 _to_ euid != 0 clears the effective set.
 *
 *  3) Going _from_ euid != 0 _to_ euid == 0 copies the permitted set into
 *     the effective set.
 *
 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid != 0 should
 *  never happen.
 *
 * History: rules 1-3 are by astor.  The SECURE_KEEP_CAPS exception (cevans,
 * Oct '99) lets a process elect, via prctl(), to keep its capabilities when
 * it switches away from uid 0; without it a daemon could not drop only some
 * of its privilege, and staying uid 0 is not an option because uid 0 owns
 * too many vital files.  Thanks to Olaf Kirch and Peter Benie for spotting
 * this.
 */
static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
{
        kuid_t root_uid = make_kuid(old->user_ns, 0);
        bool had_root_id = uid_eq(old->uid, root_uid) ||
                           uid_eq(old->euid, root_uid) ||
                           uid_eq(old->suid, root_uid);
        bool has_root_id = uid_eq(new->uid, root_uid) ||
                           uid_eq(new->euid, root_uid) ||
                           uid_eq(new->suid, root_uid);
        bool was_eff_root = uid_eq(old->euid, root_uid);
        bool is_eff_root = uid_eq(new->euid, root_uid);

        /* Rule 1: the last root id has just been given up */
        if (had_root_id && !has_root_id) {
                if (!issecure(SECURE_KEEP_CAPS)) {
                        cap_clear(new->cap_permitted);
                        cap_clear(new->cap_effective);
                }

                /*
                 * Pre-ambient programs expect setresuid to nonroot followed
                 * by exec to drop capabilities.  We should make sure that
                 * this remains the case.
                 */
                cap_clear(new->cap_ambient);
        }

        /* Rule 2 */
        if (was_eff_root && !is_eff_root)
                cap_clear(new->cap_effective);

        /* Rule 3 */
        if (!was_eff_root && is_eff_root)
                new->cap_effective = new->cap_permitted;
}

/**
 * cap_task_fix_setuid - Fix up the results of setuid() call
 * @new: The proposed credentials
 * @old: The current task's current credentials
 * @flags: Indications of what has changed
 *
 * Fix up the results of setuid() call before the credential changes are
 * actually applied.  Returns 0 to grant the changes, or -EINVAL when
 * @flags is not one of the LSM_SETID_* values handled here.
 */
int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
{
        kuid_t root_uid;
        bool was_fs_root, is_fs_root;

        switch (flags) {
        case LSM_SETID_RE:
        case LSM_SETID_ID:
        case LSM_SETID_RES:
                /*
                 * juggle the capabilities to follow [RES]UID changes unless
                 * otherwise suppressed
                 */
                if (issecure(SECURE_NO_SETUID_FIXUP))
                        return 0;

                cap_emulate_setxuid(new, old);
                return 0;

        case LSM_SETID_FS:
                /*
                 * juggle the capabilities to follow FSUID changes, unless
                 * otherwise suppressed
                 *
                 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
                 *          if not, we might be a bit too harsh here.
                 */
                if (issecure(SECURE_NO_SETUID_FIXUP))
                        return 0;

                root_uid = make_kuid(old->user_ns, 0);
                was_fs_root = uid_eq(old->fsuid, root_uid);
                is_fs_root = uid_eq(new->fsuid, root_uid);

                /* fsuid left root: drop the fs capabilities from pE */
                if (was_fs_root && !is_fs_root)
                        new->cap_effective =
                                cap_drop_fs_set(new->cap_effective);

                /* fsuid became root: raise the fs capabilities out of pP */
                if (!was_fs_root && is_fs_root)
                        new->cap_effective =
                                cap_raise_fs_set(new->cap_effective,
                                                 new->cap_permitted);
                return 0;

        default:
                return -EINVAL;
        }
}

/*
 * Rationale: code calling task_setscheduler, task_setioprio, and
 * task_setnice, assumes that
 *   . if capable(cap_sys_nice), then those actions should be allowed
 *   . if not capable(cap_sys_nice), but acting on your own processes,
 *           then those actions should be allowed
 * That is no longer sufficient, because code can run without suid yet
 * with increased caps.  So the target's permitted set is compared with
 * the caller's: if the target holds something the caller lacks, the caller
 * needs CAP_SYS_NICE in the target's user namespace.
 *
 * Returns 0 when allowed, -EPERM otherwise.
 */
static int cap_safe_nice(struct task_struct *p)
{
        int ret = 0;

        rcu_read_lock();
        /* ns_capable() is only consulted when the subset test fails */
        if (!cap_issubset(__task_cred(p)->cap_permitted,
                          current_cred()->cap_permitted) &&
            !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
                ret = -EPERM;
        rcu_read_unlock();

        return ret;
}

/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated entirely to cap_safe_nice().
 */
int cap_task_setscheduler(struct task_struct *p)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined here)
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated entirely to cap_safe_nice().
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined here)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated entirely to cap_safe_nice().
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
        return cap_safe_nice(p);
}

/*
 * Implement PR_CAPBSET_DROP: remove capability @cap from the current task's
 * bounding set.
 *
 * The privilege check comes first, so an unprivileged caller gets -EPERM
 * even for an invalid @cap.  Returns -EINVAL for an invalid capability,
 * -ENOMEM if new credentials cannot be prepared, otherwise the result of
 * commit_creds().
 */
static int cap_prctl_drop(unsigned long cap)
{
        struct cred *cred;

        if (!ns_capable(current_user_ns(), CAP_SETPCAP))
                return -EPERM;

        if (!cap_valid(cap))
                return -EINVAL;

        cred = prepare_creds();
        if (!cred)
                return -ENOMEM;

        cap_lower(cred->cap_bset, cap);

        return commit_creds(cred);
}

/**
 * cap_task_prctl - Implement process control functions for this security module
 * @option: The process control function requested
 * @arg2, @arg3, @arg4, @arg5: The argument data for this function
 *
 * Allow process control functions (sys_prctl()) to alter capabilities; may
 * also deny access to other functions not otherwise implemented here.
 *
 * Options handled: PR_CAPBSET_READ, PR_CAPBSET_DROP, PR_SET_SECUREBITS,
 * PR_GET_SECUREBITS, PR_GET_KEEPCAPS, PR_SET_KEEPCAPS and PR_CAP_AMBIENT.
 * Options that modify state do so on a copy from prepare_creds() that is
 * then passed to commit_creds().
 *
 * Returns 0 or +ve on success, -ENOSYS if this function is not implemented
 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
 * modules will consider performing the function.
 */
int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                   unsigned long arg4, unsigned long arg5)
{
        const struct cred *old = current_cred();
        struct cred *new;

        switch (option) {
        case PR_CAPBSET_READ:
                /* Report, as 0 or 1, whether capability arg2 is in the bounding set */
                if (!cap_valid(arg2))
                        return -EINVAL;
                return !!cap_raised(old->cap_bset, arg2);

        case PR_CAPBSET_DROP:
                return cap_prctl_drop(arg2);

        /*
         * The next four prctl's remain to assist with transitioning a
         * system from legacy UID=0 based privilege (when filesystem
         * capabilities are not in use) to a system using filesystem
         * capabilities only - as the POSIX.1e draft intended.
         *
         * Note:
         *
         *  PR_SET_SECUREBITS =
         *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
         *    | issecure_mask(SECURE_NOROOT)
         *    | issecure_mask(SECURE_NOROOT_LOCKED)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
         *
         * will ensure that the current process and all of its
         * children will be locked into a pure
         * capability-based-privilege environment.
         */
        case PR_SET_SECUREBITS:
                /*
                 * In [1] the lock bits are shifted right by one before
                 * being compared with the changed bits, which lines each
                 * lock bit up with the bit directly below it.
                 */
                if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
                     & (old->securebits ^ arg2))                        /*[1]*/
                    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))        /*[2]*/
                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))        /*[3]*/
                    || (cap_capable(current_cred(),
                                    current_cred()->user_ns,
                                    CAP_SETPCAP,
                                    CAP_OPT_NONE) != 0)                        /*[4]*/
                        /*
                         * [1] no changing of bits that are locked
                         * [2] no unlocking of locks
                         * [3] no setting of unsupported bits
                         * [4] doing anything requires privilege (go read about
                         *     the "sendmail capabilities bug")
                         */
                    )
                        /* any of [1]-[4] failing rejects the request */
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                new->securebits = arg2;
                return commit_creds(new);

        case PR_GET_SECUREBITS:
                return old->securebits;

        case PR_GET_KEEPCAPS:
                return !!issecure(SECURE_KEEP_CAPS);

        case PR_SET_KEEPCAPS:
                if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
                        return -EINVAL;
                if (issecure(SECURE_KEEP_CAPS_LOCKED))
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                if (arg2)
                        new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
                else
                        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
                return commit_creds(new);

        case PR_CAP_AMBIENT:
                /* arg2 selects the sub-operation; arg3 names the capability */
                if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
                        /* CLEAR_ALL takes no further arguments */
                        if (arg3 | arg4 | arg5)
                                return -EINVAL;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        cap_clear(new->cap_ambient);
                        return commit_creds(new);
                }

                /* Remaining sub-operations need a valid cap and zero arg4/arg5 */
                if (((!cap_valid(arg3)) | arg4 | arg5))
                        return -EINVAL;

                if (arg2 == PR_CAP_AMBIENT_IS_SET) {
                        return !!cap_raised(current_cred()->cap_ambient, arg3);
                } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
                           arg2 != PR_CAP_AMBIENT_LOWER) {
                        return -EINVAL;
                } else {
                        /*
                         * Raising requires the capability to be both permitted
                         * and inheritable, and SECURE_NO_CAP_AMBIENT_RAISE to
                         * be clear.  Lowering has no such precondition.
                         */
                        if (arg2 == PR_CAP_AMBIENT_RAISE &&
                            (!cap_raised(current_cred()->cap_permitted, arg3) ||
                             !cap_raised(current_cred()->cap_inheritable,
                                         arg3) ||
                             issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
                                return -EPERM;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        if (arg2 == PR_CAP_AMBIENT_RAISE)
                                cap_raise(new->cap_ambient, arg3);
                        else
                                cap_lower(new->cap_ambient, arg3);
                        return commit_creds(new);
                }

        /* Every PR_CAP_AMBIENT path above returns, so nothing falls through */
        default:
                /* No functionality available - continue with default */
                return -ENOSYS;
        }
}

/**
 * cap_vm_enough_memory - Report whether the caller has CAP_SYS_ADMIN
 * @mm: The VM space in which the new mapping is to be made (not examined)
 * @pages: The size of the mapping (not examined)
 *
 * Performs an unaudited CAP_SYS_ADMIN check against the initial user
 * namespace for the current task.  Returns 1 if the capability is held,
 * 0 if not.
 */
int cap_vm_enough_memory(struct mm_struct *mm, long pages)
{
        return cap_capable(current_cred(), &init_user_ns,
                           CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0;
}

/*
 * cap_mmap_addr - check if able to map given addr
 * @addr: address attempting to be mapped
 *
 * A process attempting to map memory below dac_mmap_min_addr needs
 * CAP_SYS_RAWIO in the initial user namespace.  Returns 0 if the mapping
 * should be allowed; otherwise the non-zero cap_capable() result is
 * returned.
 */
int cap_mmap_addr(unsigned long addr)
{
        int ret;

        /* Addresses at or above the threshold need no privilege */
        if (addr >= dac_mmap_min_addr)
                return 0;

        ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
                          CAP_OPT_NONE);

        /* set PF_SUPERPRIV if it turns out we allow the low mmap */
        if (ret == 0)
                current->flags |= PF_SUPERPRIV;

        return ret;
}

/*
 * cap_mmap_file - file mapping hook for the capability module
 * @file: not examined
 * @reqprot: not examined
 * @prot: not examined
 * @flags: not examined
 *
 * Imposes no restriction: always returns 0.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
                  unsigned long prot, unsigned long flags)
{
        return 0;
}

#ifdef CONFIG_SECURITY

static struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(capable, cap_capable),
        LSM_HOOK_INIT(settime, cap_settime),
        LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
        LSM_HOOK_INIT(capget, cap_capget),
        LSM_HOOK_INIT(capset, cap_capset),
        LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
        LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
        LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
        LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
        LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
        LSM_HOOK_INIT(mmap_file, cap_mmap_file),
        LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
        LSM_HOOK_INIT(task_prctl, cap_task_prctl),
        LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
        LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
        LSM_HOOK_INIT(task_setnice, cap_task_setnice),
        LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};

static int __init capability_init(void)
{
        security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
                                "capability");
        return 0;
}

DEFINE_LSM(capability) = {
        .name = "capability",
        .order = LSM_ORDER_FIRST,
        .init = capability_init,
};

#endif /* CONFIG_SECURITY */











































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1994 Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 */
#include <asm/fpu/internal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>

#include <linux/hardirq.h>
#include <linux/pkeys.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>

/*
 * Represents the initial FPU state. It's mostly (but not completely) zeroes,
 * depending on the FPU hardware format:
 */
union fpregs_state init_fpstate __read_mostly;

/*
 * Track in-kernel FPU usage: per-CPU flag set in kernel_fpu_begin_mask(),
 * cleared in kernel_fpu_end() and consulted by irq_fpu_usable().
 */
static DEFINE_PER_CPU(bool, in_kernel_fpu);

/*
 * Track which context is using the FPU on the CPU:
 */
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * Can we use the FPU in kernel mode with the
 * whole "kernel_fpu_begin/end()" sequence?
 *
 * Returns false (with a one-time warning) from NMI context and false
 * while in-kernel FPU usage is already active on this CPU.  Otherwise:
 *
 * - Outside hard interrupt context the answer is yes.  That covers task
 *   context except from within fpregs_lock()'ed critical regions, and
 *   soft interrupt processing context, which cannot happen while in a
 *   fpregs_lock()'ed critical region.
 *
 * - In hard interrupt context it's safe when soft interrupts are
 *   enabled, which means the interrupt did not hit in a
 *   fpregs_lock()'ed critical region.
 */
bool irq_fpu_usable(void)
{
        if (WARN_ON_ONCE(in_nmi()))
                return false;

        /* In kernel FPU usage already active? */
        if (this_cpu_read(in_kernel_fpu))
                return false;

        return !in_irq() || !softirq_count();
}
EXPORT_SYMBOL(irq_fpu_usable);

/*
 * Save the FPU registers into @fpu's state buffer using the best available
 * mechanism.  Must be called with preempt disabled.
 *
 * Returns 1 if the FPU state is still intact and the registers can be kept
 * active, 0 if saving destroyed them.  Only the legacy FNSAVE path returns
 * 0: that instruction clears all FPU state unconditionally, whereas the
 * modern save formats leave the registers usable.
 */
int copy_fpregs_to_fpstate(struct fpu *fpu)
{
        int regs_intact = 1;

        if (likely(use_xsave())) {
                copy_xregs_to_kernel(&fpu->state.xsave);

                /*
                 * AVX512 state is tracked here because its use is
                 * known to slow the max clock speed of the core.
                 */
                if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
                        fpu->avx512_timestamp = jiffies;
        } else if (likely(use_fxsr())) {
                copy_fxregs_to_kernel(fpu);
        } else {
                /*
                 * Legacy FPU register saving, FNSAVE always clears FPU
                 * registers, so we have to mark them inactive:
                 */
                asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
                regs_intact = 0;
        }

        return regs_intact;
}
EXPORT_SYMBOL(copy_fpregs_to_fpstate);

/*
 * kernel_fpu_begin_mask - start a section of in-kernel FPU usage
 * @kfpu_mask: KFPU_* bits selecting which control registers to initialize
 *
 * Disables preemption and marks this CPU as being in kernel FPU usage.
 * Preemption is not re-enabled here; kernel_fpu_end() does that.
 * Warns (via WARN_ON_FPU) if the FPU is not usable in this context or if
 * in-kernel FPU usage is already active.
 */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
        preempt_disable();

        WARN_ON_FPU(!irq_fpu_usable());
        WARN_ON_FPU(this_cpu_read(in_kernel_fpu));

        this_cpu_write(in_kernel_fpu, true);

        /*
         * For a non-kthread task whose TIF_NEED_FPU_LOAD is still clear,
         * set the flag and save the registers into the task's fpu state
         * before they get overwritten.
         */
        if (!(current->flags & PF_KTHREAD) &&
            !test_thread_flag(TIF_NEED_FPU_LOAD)) {
                set_thread_flag(TIF_NEED_FPU_LOAD);
                /*
                 * Ignore return value -- we don't care if reg state
                 * is clobbered.
                 */
                copy_fpregs_to_fpstate(&current->thread.fpu);
        }
        __cpu_invalidate_fpregs_state();

        /* Put sane initial values into the control registers. */
        if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
                ldmxcsr(MXCSR_DEFAULT);

        if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
                asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);

/*
 * kernel_fpu_end - finish a section of in-kernel FPU usage
 *
 * Clears this CPU's in_kernel_fpu flag and re-enables preemption, undoing
 * what kernel_fpu_begin_mask() set up.  Warns (via WARN_ON_FPU) if the flag
 * was not set.
 */
void kernel_fpu_end(void)
{
        WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

        this_cpu_write(in_kernel_fpu, false);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);

/*
 * Save the FPU state of the current task into @fpu.
 *
 * This only ever gets called for the current task; a mismatch triggers
 * WARN_ON_FPU.  The work is done under fpregs_lock() and bracketed by the
 * before/after save tracepoints.
 */
void fpu__save(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        fpregs_lock();
        trace_x86_fpu_before_save(fpu);

        /*
         * Save only when TIF_NEED_FPU_LOAD is clear.  A zero return from
         * the save means the registers were destroyed by it, so load them
         * back from the state that was just written.
         */
        if (!test_thread_flag(TIF_NEED_FPU_LOAD) &&
            !copy_fpregs_to_fpstate(fpu))
                copy_kernel_to_fpregs(&fpu->state);

        trace_x86_fpu_after_save(fpu);
        fpregs_unlock();
}

/*
 * Legacy x87 fpstate state init:
 *
 * Fills in the control, status, tag and fos words of @fp; no other field
 * is written here.
 * NOTE(review): the constants presumably mirror the x87 state after FNINIT
 * with the upper 16 bits set - confirm against the Intel SDM.
 */
static inline void fpstate_init_fstate(struct fregs_state *fp)
{
        fp->cwd = 0xffff037fu;
        fp->swd = 0xffff0000u;
        fp->twd = 0xffffffffu;
        fp->fos = 0xffff0000u;
}

/*
 * Initialise an fpstate buffer.
 *
 * Without a hardware FPU only the software-emulation state is set up.
 * Otherwise the first fpu_kernel_xstate_size bytes are zeroed, the XSAVE
 * header is prepared when XSAVES is available, and the legacy area is
 * filled in using either the FXSR or the plain x87 layout.
 */
void fpstate_init(union fpregs_state *state)
{
        if (static_cpu_has(X86_FEATURE_FPU)) {
                memset(state, 0, fpu_kernel_xstate_size);

                if (static_cpu_has(X86_FEATURE_XSAVES))
                        fpstate_init_xstate(&state->xsave);

                if (!static_cpu_has(X86_FEATURE_FXSR))
                        fpstate_init_fstate(&state->fsave);
                else
                        fpstate_init_fxstate(&state->fxsave);
        } else {
                fpstate_init_soft(&state->soft);
        }
}
EXPORT_SYMBOL_GPL(fpstate_init);

/*
 * Give @dst a copy of @src's FPU state. @src is expected to be the
 * current task. Always returns 0.
 */
int fpu__copy(struct task_struct *dst, struct task_struct *src)
{
        struct fpu *parent = &src->thread.fpu;
        struct fpu *child = &dst->thread.fpu;

        child->last_cpu = -1;

        if (!static_cpu_has(X86_FEATURE_FPU))
                return 0;

        WARN_ON_FPU(parent != &current->thread.fpu);

        /*
         * Start from an all-zero buffer so that 'init optimized' parts
         * of the XSAVE area cannot leak into the child task.
         */
        memset(&child->state.xsave, 0, fpu_kernel_xstate_size);

        /*
         * When the registers are live, save them straight into the
         * child's buffer (no memory-to-memory copy). A zero return from
         * the save is the FNSAVE case, which destroys the register
         * contents, so they are loaded back from the child's copy.
         * When the registers are not current, a plain memcpy() of the
         * in-memory state is enough.
         */
        fpregs_lock();
        if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
                if (!copy_fpregs_to_fpstate(child))
                        copy_kernel_to_fpregs(&child->state);
        } else {
                memcpy(&child->state, &parent->state, fpu_kernel_xstate_size);
        }
        fpregs_unlock();

        set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);

        trace_x86_fpu_copy_src(parent);
        trace_x86_fpu_copy_dst(child);

        return 0;
}

/*
 * Activate the current task's in-memory FPU context,
 * if it has not been used before:
 *
 * @fpu must be &current->thread.fpu. TIF_NEED_FPU_LOAD is set first and
 * the buffer is then reset through fpstate_init().
 */
static void fpu__initialize(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        set_thread_flag(TIF_NEED_FPU_LOAD);
        fpstate_init(&fpu->state);
        trace_x86_fpu_init_state(fpu);
}

/*
 * This function must be called before a task's fpstate is read.
 *
 * There are two cases in which it gets called:
 *
 * - for the current task (when coredumping): the latest FPU registers
 *   have to be saved into the fpstate first,
 *
 * - for stopped tasks (ptrace): the registers were already saved by the
 *   context-switch code when the task scheduled out, so there is
 *   nothing to do.
 */
void fpu__prepare_read(struct fpu *fpu)
{
        if (fpu != &current->thread.fpu)
                return;

        fpu__save(fpu);
}

/*
 * This function must be called before we write a task's fpstate.
 *
 * Invalidate any cached FPU registers.
 *
 * After this function call, after registers in the fpstate are
 * modified and the child task has woken up, the child task will
 * restore the modified FPU state from the modified context. If we
 * didn't clear its cached status here then the cached in-registers
 * state pending on its former CPU could be restored, corrupting
 * the modifications.
 *
 * @fpu is expected to belong to a task other than current.
 */
void fpu__prepare_write(struct fpu *fpu)
{
        /*
         * Only stopped child tasks can be used to modify the FPU
         * state in the fpstate buffer:
         */
        WARN_ON_FPU(fpu == &current->thread.fpu);

        /* Invalidate any cached state: */
        __fpu_invalidate_fpregs_state(fpu);
}

/*
 * Drops current FPU state: deactivates the fpregs and
 * the fpstate. NOTE: it still leaves previous contents
 * in the fpregs in the eager-FPU case.
 *
 * This function can be used in cases where we know that
 * a state-restore is coming: either an explicit one,
 * or a reschedule.
 *
 * The register-side work is only done when @fpu belongs to the current
 * task; for any other @fpu only the tracepoint fires. Runs with
 * preemption disabled.
 */
void fpu__drop(struct fpu *fpu)
{
        preempt_disable();

        if (fpu == &current->thread.fpu) {
                /* Ignore delayed exceptions from user space */
                /*
                 * The exception table entry maps a fault at label 1 (the
                 * FWAIT) to label 2, i.e. execution simply continues.
                 */
                asm volatile("1: fwait\n"
                             "2:\n"
                             _ASM_EXTABLE(1b, 2b));
                fpregs_deactivate(fpu);
        }

        trace_x86_fpu_dropped(fpu);

        preempt_enable();
}

/*
 * Reset the FPU registers by loading them from init_fpstate, using the
 * widest restore method available (XSAVE, then FXSR, then plain x87).
 * @features_mask is only used on the XSAVE path.
 *
 * The caller must hold fpregs_lock() across this call.
 */
static inline void copy_init_fpstate_to_fpregs(u64 features_mask)
{
        if (!use_xsave()) {
                if (static_cpu_has(X86_FEATURE_FXSR))
                        copy_kernel_to_fxregs(&init_fpstate.fxsave);
                else
                        copy_kernel_to_fregs(&init_fpstate.fsave);
        } else {
                copy_kernel_to_xregs(&init_fpstate.xsave, features_mask);
        }

        /* With OSPKE, PKRU is additionally reset to its init value. */
        if (boot_cpu_has(X86_FEATURE_OSPKE))
                copy_init_pkru_to_fpregs();
}

/*
 * Put the current task's FPU state back into its init state.
 *
 * @fpu:       must be &current->thread.fpu
 * @user_only: true  - reset only the user xfeatures; supervisor state is
 *                     first reloaded from @fpu if the registers on this
 *                     CPU are not valid for it
 *             false - reset every feature in xfeatures_mask_all
 *
 * Called by sys_execve(), by the signal handler code and by various
 * error paths.
 */
static void fpu__clear(struct fpu *fpu, bool user_only)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        if (static_cpu_has(X86_FEATURE_FPU)) {
                fpregs_lock();

                if (!user_only) {
                        copy_init_fpstate_to_fpregs(xfeatures_mask_all);
                } else {
                        if (!fpregs_state_valid(fpu, smp_processor_id()) &&
                            xfeatures_mask_supervisor())
                                copy_kernel_to_xregs(&fpu->state.xsave,
                                                     xfeatures_mask_supervisor());
                        copy_init_fpstate_to_fpregs(xfeatures_mask_user());
                }

                fpregs_mark_activate();
                fpregs_unlock();
                return;
        }

        /* No hardware FPU: drop and re-initialise the in-memory state. */
        fpu__drop(fpu);
        fpu__initialize(fpu);
}

/* Reset only the user-visible FPU state of the current task (@fpu). */
void fpu__clear_user_states(struct fpu *fpu)
{
        fpu__clear(fpu, true);
}

/* Reset the complete FPU state (all xfeatures) of the current task (@fpu). */
void fpu__clear_all(struct fpu *fpu)
{
        fpu__clear(fpu, false);
}

/*
 * Load the FPU context before returning to userspace.
 * Does nothing on CPUs without a hardware FPU.
 */
void switch_fpu_return(void)
{
        if (static_cpu_has(X86_FEATURE_FPU))
                __fpregs_load_activate();
}
EXPORT_SYMBOL_GPL(switch_fpu_return);

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Consistency check for the current task: unless TIF_NEED_FPU_LOAD is
 * set (in which case the context gets loaded on return to userland), the
 * FPU state tracked for this CPU has to be valid for the task.
 */
void fpregs_assert_state_consistent(void)
{
        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                WARN_ON_FPU(!fpregs_state_valid(&current->thread.fpu,
                                                smp_processor_id()));
}
EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
#endif

/*
 * Record that the registers of this CPU now hold the current task's FPU
 * state: activate the fpregs for the task, remember this CPU in
 * ->last_cpu and clear TIF_NEED_FPU_LOAD.
 */
void fpregs_mark_activate(void)
{
        struct fpu *fpu = &current->thread.fpu;

        fpregs_activate(fpu);
        fpu->last_cpu = smp_processor_id();
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}
EXPORT_SYMBOL_GPL(fpregs_mark_activate);

/*
 * x87 math exception handling:
 */

/*
 * Translate the pending, unmasked FPU exception recorded in @fpu into an
 * FPE_* code.
 *
 * @trap_nr: X86_TRAP_MF selects the x87 control/status words; any other
 *           value selects the SIMD path (MXCSR).
 *
 * Returns FPE_FLTINV, FPE_FLTDIV, FPE_FLTOVF, FPE_FLTUND or FPE_FLTRES,
 * checked in that order, or 0 when no unmasked exception bit is set.
 */
int fpu__exception_code(struct fpu *fpu, int trap_nr)
{
        int err;

        if (trap_nr != X86_TRAP_MF) {
                /*
                 * SIMD exceptions use a single status/control register.
                 * The mask bits live at 0x1f80; shifted down by 7 they
                 * line up with the exception flags at 0x3f, so the
                 * expression below keeps only the unmasked flags.
                 */
                unsigned short mxcsr = MXCSR_DEFAULT;

                if (boot_cpu_has(X86_FEATURE_XMM))
                        mxcsr = fpu->state.fxsave.mxcsr;

                err = ~(mxcsr >> 7) & mxcsr;
        } else {
                unsigned short cwd, swd;

                /*
                 * (~cwd & swd) drops the exceptions that are masked.
                 * 0x3f covers the exception bits in these registers,
                 * 0x200 is the C1 bit needed for a stack fault and 0x040
                 * is the stack fault bit. Only one exception should be
                 * pending at a time; a program that does not synchronise
                 * its FPU usage may produce a combination from which the
                 * exception context cannot be fully reproduced.
                 */
                if (boot_cpu_has(X86_FEATURE_FXSR)) {
                        cwd = fpu->state.fxsave.cwd;
                        swd = fpu->state.fxsave.swd;
                } else {
                        cwd = (unsigned short)fpu->state.fsave.cwd;
                        swd = (unsigned short)fpu->state.fsave.swd;
                }

                err = swd & ~cwd;
        }

        /*
         * Invalid op:
         * swd & 0x240 == 0x040: Stack Underflow
         * swd & 0x240 == 0x240: Stack Overflow
         * User must clear the SF bit (0x40) if set
         */
        if (err & 0x001)
                return FPE_FLTINV;

        /* Divide by Zero */
        if (err & 0x004)
                return FPE_FLTDIV;

        /* Overflow */
        if (err & 0x008)
                return FPE_FLTOVF;

        /* Denormal, Underflow */
        if (err & 0x012)
                return FPE_FLTUND;

        /* Precision */
        if (err & 0x020)
                return FPE_FLTRES;

        /*
         * If we're using IRQ 13, or supposedly even some trap
         * X86_TRAP_MF implementations, it's possible
         * we get a spurious trap, which is not an error.
         */
        return 0;
}


























    5 




























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_PREEMPT_H
#define __ASM_PREEMPT_H

#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>

DECLARE_PER_CPU(int, __preempt_count);

/* We use the MSB mostly because it's available */
#define PREEMPT_NEED_RESCHED        0x80000000

/*
 * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
 * that a decrement hitting 0 means we can and should reschedule.
 *
 * PREEMPT_ENABLED is therefore a zero count with the inverted bit set,
 * i.e. preemptible with no reschedule pending.
 */
#define PREEMPT_ENABLED        (0 + PREEMPT_NEED_RESCHED)

/*
 * Return this CPU's preempt count with the PREEMPT_NEED_RESCHED bit
 * masked out, so that users who treat any non-zero value as "cannot
 * preempt" are not confused by the folded flag.
 */
static __always_inline int preempt_count(void)
{
        int raw = raw_cpu_read_4(__preempt_count);

        return raw & ~PREEMPT_NEED_RESCHED;
}

/*
 * Set this CPU's preempt count to @pc while keeping the current state of
 * the PREEMPT_NEED_RESCHED bit. Retries the compare-and-exchange until
 * the stored value did not change between the read and the update.
 */
static __always_inline void preempt_count_set(int pc)
{
        for (;;) {
                int cur = raw_cpu_read_4(__preempt_count);
                int want = (cur & PREEMPT_NEED_RESCHED) |
                           (pc & ~PREEMPT_NEED_RESCHED);

                if (raw_cpu_cmpxchg_4(__preempt_count, cur, want) == cur)
                        break;
        }
}

/*
 * must be macros to avoid header recursion hell
 *
 * init_task_preempt_count(p) expands to an empty statement here.
 * init_idle_preempt_count(p, cpu) stores PREEMPT_DISABLED into the
 * __preempt_count of @cpu; @p is not used by either macro.
 */
#define init_task_preempt_count(p) do { } while (0)

#define init_idle_preempt_count(p, cpu) do { \
        per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)

/*
 * We fold the NEED_RESCHED bit into the preempt count such that
 * preempt_enable() can decrement and test for needing to reschedule with a
 * single instruction.
 *
 * We invert the actual bit, so that when the decrement hits 0 we know we both
 * need to resched (the bit is cleared) and can resched (no preempt count).
 */

/* Flag "need resched": the bit is stored inverted, so it is cleared here. */
static __always_inline void set_preempt_need_resched(void)
{
        raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
}

/* Drop "need resched": the bit is stored inverted, so it is set here. */
static __always_inline void clear_preempt_need_resched(void)
{
        raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
}

/* True when a reschedule is pending, i.e. the inverted bit is clear. */
static __always_inline bool test_preempt_need_resched(void)
{
        return (raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED) == 0;
}

/*
 * The various preempt_count add/sub methods
 */

/* Add @val to this CPU's __preempt_count. */
static __always_inline void __preempt_count_add(int val)
{
        raw_cpu_add_4(__preempt_count, val);
}

/* Subtract @val from this CPU's __preempt_count (implemented as adding -val). */
static __always_inline void __preempt_count_sub(int val)
{
        raw_cpu_add_4(__preempt_count, -val);
}

/*
 * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
 * a decrement which hits zero means we have no preempt_count and should
 * reschedule.
 *
 * Returns true when the decrement of __preempt_count produced zero
 * (NOTE(review): this reads the "e" argument as the equal/zero condition
 * of the "decl" result -- confirm against GEN_UNARY_RMWcc).
 */
static __always_inline bool __preempt_count_dec_and_test(void)
{
        return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
}

/*
 * Returns true when we need to resched and can (barring IRQ state):
 * the raw per-CPU count, including the inverted need-resched bit, has to
 * be exactly @preempt_offset.
 */
static __always_inline bool should_resched(int preempt_offset)
{
        int raw = raw_cpu_read_4(__preempt_count);

        return unlikely(raw == preempt_offset);
}

/*
 * Only with CONFIG_PREEMPTION: __preempt_schedule() and
 * __preempt_schedule_notrace() are emitted as a direct "call" to the
 * corresponding *_thunk symbol from inline asm.
 */
#ifdef CONFIG_PREEMPTION
  extern asmlinkage void preempt_schedule_thunk(void);
# define __preempt_schedule() \
        asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)

  extern asmlinkage void preempt_schedule(void);
  extern asmlinkage void preempt_schedule_notrace_thunk(void);
# define __preempt_schedule_notrace() \
        asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)

  extern asmlinkage void preempt_schedule_notrace(void);
#endif

#endif /* __ASM_PREEMPT_H */




































    5 



    5 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#ifdef CONFIG_PREEMPTIRQ_TRACEPOINTS

#undef TRACE_SYSTEM
#define TRACE_SYSTEM preemptirq

#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PREEMPTIRQ_H

#include <linux/ktime.h>
#include <linux/tracepoint.h>
#include <linux/string.h>
#include <asm/sections.h>

/*
 * Event class shared by the irq and preempt enable/disable tracepoints.
 *
 * Both addresses (@ip and @parent_ip) are recorded as signed 32-bit
 * offsets from _stext and are converted back to absolute addresses for
 * printing with %pS.
 */
DECLARE_EVENT_CLASS(preemptirq_template,

        TP_PROTO(unsigned long ip, unsigned long parent_ip),

        TP_ARGS(ip, parent_ip),

        TP_STRUCT__entry(
                __field(s32, caller_offs)
                __field(s32, parent_offs)
        ),

        TP_fast_assign(
                __entry->caller_offs = (s32)(ip - (unsigned long)_stext);
                __entry->parent_offs = (s32)(parent_ip - (unsigned long)_stext);
        ),

        TP_printk("caller=%pS parent=%pS",
                  (void *)((unsigned long)(_stext) + __entry->caller_offs),
                  (void *)((unsigned long)(_stext) + __entry->parent_offs))
);

/*
 * irq_disable / irq_enable events. Without CONFIG_TRACE_IRQFLAGS the
 * trace_irq_* calls (including the _rcuidle variants) expand to nothing.
 */
#ifdef CONFIG_TRACE_IRQFLAGS
DEFINE_EVENT(preemptirq_template, irq_disable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));

DEFINE_EVENT(preemptirq_template, irq_enable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));
#else
#define trace_irq_enable(...)
#define trace_irq_disable(...)
#define trace_irq_enable_rcuidle(...)
#define trace_irq_disable_rcuidle(...)
#endif

/*
 * preempt_disable / preempt_enable events. Without
 * CONFIG_TRACE_PREEMPT_TOGGLE the trace_preempt_* calls (including the
 * _rcuidle variants) expand to nothing.
 */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
DEFINE_EVENT(preemptirq_template, preempt_disable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));

DEFINE_EVENT(preemptirq_template, preempt_enable,
             TP_PROTO(unsigned long ip, unsigned long parent_ip),
             TP_ARGS(ip, parent_ip));
#else
#define trace_preempt_enable(...)
#define trace_preempt_disable(...)
#define trace_preempt_enable_rcuidle(...)
#define trace_preempt_disable_rcuidle(...)
#endif

#endif /* _TRACE_PREEMPTIRQ_H */

#include <trace/define_trace.h>

#else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */
/* Tracepoints compiled out: every trace_* hook is an empty macro. */
#define trace_irq_enable(...)
#define trace_irq_disable(...)
#define trace_irq_enable_rcuidle(...)
#define trace_irq_disable_rcuidle(...)
#define trace_preempt_enable(...)
#define trace_preempt_disable(...)
#define trace_preempt_enable_rcuidle(...)
#define trace_preempt_disable_rcuidle(...)
#endif


























































































































































































































































































































































































































































































    5 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 1994 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 * x86-64 work by Andi Kleen 2002
 */

#ifndef _ASM_X86_FPU_INTERNAL_H
#define _ASM_X86_FPU_INTERNAL_H

#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>

#include <asm/user.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>
#include <asm/cpufeature.h>
#include <asm/trace/fpu.h>

/*
 * High level FPU state handling functions:
 */
extern void fpu__prepare_read(struct fpu *fpu);
extern void fpu__prepare_write(struct fpu *fpu);
extern void fpu__save(struct fpu *fpu);
extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
extern void fpu__drop(struct fpu *fpu);
extern int  fpu__copy(struct task_struct *dst, struct task_struct *src);
extern void fpu__clear_user_states(struct fpu *fpu);
extern void fpu__clear_all(struct fpu *fpu);
extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);

/*
 * Boot time FPU initialization functions:
 */
extern void fpu__init_cpu(void);
extern void fpu__init_system_xstate(void);
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(void);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
extern u64 fpu__get_supported_xfeatures_mask(void);

/*
 * Debugging facility:
 *
 * With CONFIG_X86_DEBUG_FPU, WARN_ON_FPU(x) is WARN_ON_ONCE(x).
 * Otherwise @x is still evaluated, but the result is discarded and the
 * expression yields 0.
 */
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
# define WARN_ON_FPU(x) ({ (void)(x); 0; })
#endif

/*
 * FPU related CPU feature flag helper routines:
 */
/* True when the CPU has X86_FEATURE_XSAVEOPT. */
static __always_inline __pure bool use_xsaveopt(void)
{
        return static_cpu_has(X86_FEATURE_XSAVEOPT);
}

/* True when the CPU has X86_FEATURE_XSAVE. */
static __always_inline __pure bool use_xsave(void)
{
        return static_cpu_has(X86_FEATURE_XSAVE);
}

/* True when the CPU has X86_FEATURE_FXSR. */
static __always_inline __pure bool use_fxsr(void)
{
        return static_cpu_has(X86_FEATURE_FXSR);
}

/*
 * fpstate handling functions:
 */

extern union fpregs_state init_fpstate;

extern void fpstate_init(union fpregs_state *state);
/*
 * Software-FPU state init: a real implementation is only declared with
 * CONFIG_MATH_EMULATION; otherwise this is an empty inline stub.
 */
#ifdef CONFIG_MATH_EMULATION
extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif

/*
 * Prepare the header of an XSAVE area: xcomp_bv gets the
 * compacted-format flag plus every feature in xfeatures_mask_all.
 */
static inline void fpstate_init_xstate(struct xregs_state *xsave)
{
        /*
         * XRSTORS requires these bits set in xcomp_bv, or it will
         * trigger #GP:
         */
        xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
}

/*
 * Set the non-zero defaults of an FXSAVE image: MXCSR and the x87
 * control word. All other fields of @fx are left untouched.
 */
static inline void fpstate_init_fxstate(struct fxregs_state *fx)
{
        fx->mxcsr = MXCSR_DEFAULT;
        fx->cwd = 0x37f;
}
extern void fpstate_sanitize_xstate(struct fpu *fpu);

/*
 * Returns 0 or the negated trap number, which results in -EFAULT for #PF
 *
 * Runs @insn on a user-space operand, bracketed by ASM_STAC/ASM_CLAC.
 * The result variable is bound to EAX and preloaded with 0. On a fault
 * at label 1 the fixup code at label 3 negates EAX and jumps back to
 * label 2, so ASM_CLAC is still executed on the error path.
 * (NOTE(review): EAX holding the trap number on entry to the fixup is
 * presumably arranged by _ASM_EXTABLE_FAULT -- confirm.)
 */
#define user_insn(insn, output, input...)                                \
({                                                                        \
        int err;                                                        \
                                                                        \
        might_fault();                                                        \
                                                                        \
        asm volatile(ASM_STAC "\n"                                        \
                     "1: " #insn "\n"                                        \
                     "2: " ASM_CLAC "\n"                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "3:  negl %%eax\n"                                        \
                     "    jmp  2b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE_FAULT(1b, 3b)                                \
                     : [err] "=a" (err), output                                \
                     : "0"(0), input);                                        \
        err;                                                                \
})

/*
 * Run @insn on a kernel operand and report failure as a value: the
 * expression evaluates to 0 on success, or to -1 when the instruction at
 * label 1 faults (the fixup at label 3 stores -1 and resumes at label 2).
 */
#define kernel_insn_err(insn, output, input...)                                \
({                                                                        \
        int err;                                                        \
        asm volatile("1:" #insn "\n\t"                                        \
                     "2:\n"                                                \
                     ".section .fixup,\"ax\"\n"                                \
                     "3:  movl $-1,%[err]\n"                                \
                     "    jmp  2b\n"                                        \
                     ".previous\n"                                        \
                     _ASM_EXTABLE(1b, 3b)                                \
                     : [err] "=r" (err), output                                \
                     : "0"(0), input);                                        \
        err;                                                                \
})

/*
 * Run @insn on a kernel operand without producing an error value: a
 * fault at label 1 is routed to ex_handler_fprestore through the
 * exception table and execution continues at label 2.
 */
#define kernel_insn(insn, output, input...)                                \
        asm volatile("1:" #insn "\n\t"                                        \
                     "2:\n"                                                \
                     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore)        \
                     : output : input)

/*
 * Save the x87 registers to the user buffer @fx with FNSAVE followed by
 * FWAIT. Returns the user_insn() result (0 on success).
 */
static inline int copy_fregs_to_user(struct fregs_state __user *fx)
{
        return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
}

/*
 * Save the FPU registers to the user buffer @fx with FXSAVE (32-bit
 * kernels) or FXSAVEQ (all others). Returns the user_insn() result.
 */
static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));

        return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
}

/*
 * Load the FPU registers from the kernel buffer @fx with FXRSTOR (32-bit
 * kernels) or FXRSTORQ (all others). No error is reported to the caller.
 */
static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32)) {
                kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
                return;
        }

        kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Like copy_kernel_to_fxregs(), but a faulting restore is reported
 * through the return value (kernel_insn_err() result) instead.
 */
static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));

        return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the FPU registers from the user buffer @fx with FXRSTOR (32-bit
 * kernels) or FXRSTORQ (all others). Returns the user_insn() result.
 */
static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32))
                return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));

        return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/* Load the x87 registers from the kernel buffer @fx with FRSTOR. */
static inline void copy_kernel_to_fregs(struct fregs_state *fx)
{
        kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the x87 registers from the kernel buffer @fx with FRSTOR and
 * return the kernel_insn_err() result (0 on success, -1 on a fault).
 */
static inline int copy_kernel_to_fregs_err(struct fregs_state *fx)
{
        return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Load the x87 registers from the user buffer @fx with FRSTOR.
 * Returns the user_insn() result (0 on success).
 */
static inline int copy_user_to_fregs(struct fregs_state __user *fx)
{
        return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

/*
 * Save the FPU registers into fpu->state.fxsave with FXSAVE (32-bit
 * kernels) or FXSAVEQ (all others).
 */
static inline void copy_fxregs_to_kernel(struct fpu *fpu)
{
        if (!IS_ENABLED(CONFIG_X86_32)) {
                asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
                return;
        }

        asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
}

/*
 * Save the FPU registers into the caller-supplied buffer @fx with FXSAVE
 * (32-bit kernels) or FXSAVEQ (all others).
 */
static inline void fxsave(struct fxregs_state *fx)
{
        if (!IS_ENABLED(CONFIG_X86_32)) {
                asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
                return;
        }

        asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
}

/*
 * These macros all use (%edi)/(%rdi) as the single memory argument.
 *
 * Each one emits the instruction as raw bytes: REX_PREFIX followed by
 * the opcode bytes. Users therefore have to pass the buffer address in
 * the "D" register, as the XSTATE_* macros do.
 */
#define XSAVE                ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT        ".byte " REX_PREFIX "0x0f,0xae,0x37"
#define XSAVES                ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
#define XRSTOR                ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS                ".byte " REX_PREFIX "0x0f,0xc7,0x1f"

/*
 * After this @err contains 0 on success or the negated trap number when
 * the operation raises an exception. For faults this results in -EFAULT.
 *
 * @op:           one of the instruction byte strings (XSAVE, XRSTOR, ...)
 * @st:           buffer pointer, passed in the "D" register
 * @lmask/@hmask: low/high 32 bits of the feature mask, passed in EAX/EDX
 * @err:          int lvalue bound to EAX. It is zeroed by the "xor" only
 *                when @op completes; on a fault the fixup at label 3
 *                negates EAX and resumes at label 2.
 */
#define XSTATE_OP(op, st, lmask, hmask, err)                                \
        asm volatile("1:" op "\n\t"                                        \
                     "xor %[err], %[err]\n"                                \
                     "2:\n\t"                                                \
                     ".pushsection .fixup,\"ax\"\n\t"                        \
                     "3: negl %%eax\n\t"                                \
                     "jmp 2b\n\t"                                        \
                     ".popsection\n\t"                                        \
                     _ASM_EXTABLE_FAULT(1b, 3b)                                \
                     : [err] "=a" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
 * format and supervisor states in addition to modified optimization in
 * XSAVEOPT.
 *
 * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
 * supports modified optimization which is not supported by XSAVE.
 *
 * We use XSAVE as a fallback.
 *
 * The 661 label is defined in the ALTERNATIVE* macros as the address of the
 * original instruction which gets replaced. We need to use it here as the
 * address of the instruction where we might get an exception at.
 *
 * @err ends up 0 when the save instruction completes and -2 when it
 * faults (set by the fixup code at label 4).
 */
#define XSTATE_XSAVE(st, lmask, hmask, err)                                \
        asm volatile(ALTERNATIVE_2(XSAVE,                                \
                                   XSAVEOPT, X86_FEATURE_XSAVEOPT,        \
                                   XSAVES,   X86_FEATURE_XSAVES)        \
                     "\n"                                                \
                     "xor %[err], %[err]\n"                                \
                     "3:\n"                                                \
                     ".pushsection .fixup,\"ax\"\n"                        \
                     "4: movl $-2, %[err]\n"                                \
                     "jmp 3b\n"                                                \
                     ".popsection\n"                                        \
                     _ASM_EXTABLE(661b, 4b)                                \
                     : [err] "=r" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
 * XSAVE area format.
 *
 * No error value is produced: a fault on the restore instruction (label
 * 661) is handed to ex_handler_fprestore and execution continues at
 * label 3.
 */
#define XSTATE_XRESTORE(st, lmask, hmask)                                \
        asm volatile(ALTERNATIVE(XRSTOR,                                \
                                 XRSTORS, X86_FEATURE_XSAVES)                \
                     "\n"                                                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
                     :                                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Boot-time only restore of the xstate from @xstate, requesting every
 * feature bit. x86 caps are not set up yet at this point and
 * alternatives cannot be used, hence the explicit boot_cpu_has() test.
 */
static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
{
        const u64 every_feature = -1;
        const u32 lo = every_feature;
        const u32 hi = every_feature >> 32;
        int err;

        WARN_ON(system_state != SYSTEM_BOOTING);

        if (!boot_cpu_has(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTOR, xstate, lo, hi, err);
        else
                XSTATE_OP(XRSTORS, xstate, lo, hi, err);

        /*
         * Restoring from a kernel buffer should never fault, and the
         * state set up at boot time should be valid.
         */
        WARN_ON_FPU(err);
}

/*
 * Save the processor xstate (all features in xfeatures_mask_all) into
 * the kernel buffer @xstate.
 */
static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
{
        const u64 features = xfeatures_mask_all;
        const u32 lo = features;
        const u32 hi = features >> 32;
        int err;

        WARN_ON_FPU(!alternatives_patched);

        XSTATE_XSAVE(xstate, lo, hi, err);

        /* Saving into a kernel buffer should never fault. */
        WARN_ON_FPU(err);
}

/*
 * Restore the processor xstate selected by @mask from the kernel buffer
 * @xstate. The 64-bit mask is handed to the instruction as two halves.
 */
static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
{
        const u32 lo = mask;
        const u32 hi = mask >> 32;

        XSTATE_XRESTORE(xstate, lo, hi);
}

/*
 * Save xstate to a user space xsave area.
 *
 * The modified optimization is not used, because xrstor/xrstors might track
 * a different application.
 *
 * The compacted xsave area format is not used either: old applications do
 * not understand it, so the non-compacted format is kept for backward
 * compatibility.
 *
 * Returns -EFAULT when the header can not be cleared, otherwise the error
 * value produced by XSTATE_OP().
 */
static inline int copy_xregs_to_user(struct xregs_state __user *buf)
{
        u64 mask = xfeatures_mask_user();
        u32 hmask = mask >> 32;
        u32 lmask = mask;
        int err;

        /*
         * Zero the xsave header before saving, so that its reserved fields
         * are initialized to zero.
         */
        if (unlikely(__clear_user(&buf->header, sizeof(buf->header))))
                return -EFAULT;

        stac();
        XSTATE_OP(XSAVE, buf, lmask, hmask, err);
        clac();

        return err;
}

/*
 * Restore xstate from user space xsave area.
 *
 * buf:  user space xsave area to restore from
 * mask: 64-bit feature mask, split into two 32-bit halves for XSTATE_OP()
 *
 * Returns the error value produced by XSTATE_OP().
 */
static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
{
        /* Cast away the __user annotation so the pointer can be passed to XSTATE_OP(). */
        struct xregs_state *xstate = ((__force struct xregs_state *)buf);
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        /* NOTE(review): stac()/clac() presumably open and close the user access window - confirm. */
        stac();
        XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
        clac();

        return err;
}

/*
 * Restore xstate from a kernel space xsave area, reporting a failure through
 * the return value instead of an exception.
 *
 * XRSTORS is used when the CPU has X86_FEATURE_XSAVES, XRSTOR otherwise.
 */
static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
{
        /* XSTATE_OP() takes the 64-bit mask as two 32-bit halves. */
        u32 hmask = mask >> 32;
        u32 lmask = mask;
        int err;

        if (!static_cpu_has(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);

        return err;
}

extern int copy_fpregs_to_fpstate(struct fpu *fpu);

static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
{
        /*
         * Restore from the member of @fpstate that matches the mechanism in
         * use: xsave first, then fxsave, and fsave as the last resort.
         * @mask is only consumed by the xsave path.
         */
        if (use_xsave())
                copy_kernel_to_xregs(&fpstate->xsave, mask);
        else if (use_fxsr())
                copy_kernel_to_fxregs(&fpstate->fxsave);
        else
                copy_kernel_to_fregs(&fpstate->fsave);
}

/*
 * Restore the FPU registers from @fpstate with every feature mask bit set.
 *
 * On CPUs flagged with X86_BUG_FXSAVE_LEAK a short x87 sequence is executed
 * first; see the comment in the body.
 */
static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
{
        /*
         * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
         * pending. Clear the x87 state here by setting it to fixed values.
         * "m" is a random variable that should be in L1.
         */
        if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
                asm volatile(
                        "fnclex\n\t"
                        "emms\n\t"
                        "fildl %P[addr]"        /* set F?P to defined value */
                        : : [addr] "m" (fpstate));
        }

        /* -1 converts to an all-ones u64 mask. */
        __copy_kernel_to_fpregs(fpstate, -1);
}

extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);

/*
 * FPU context switch related helper methods:
 */

DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * The in-register FPU state for an FPU context on a CPU is assumed to be
 * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
 * matches the FPU.
 *
 * If the FPU register state is valid, the kernel can skip restoring the
 * FPU state from memory.
 *
 * Any code that clobbers the FPU registers or updates the in-memory
 * FPU state for a task MUST let the rest of the kernel know that the
 * FPU registers are no longer valid for this task.
 *
 * Either one of these invalidation functions is enough. Invalidate
 * a resource you control: CPU if using the CPU for something else
 * (with preemption disabled), FPU for the current task, or a task that
 * is prevented from running by the current task.
 */
static inline void __cpu_invalidate_fpregs_state(void)
{
        /*
         * Clear this CPU's fpu_fpregs_owner_ctx, so the owner comparison in
         * fpregs_state_valid() can no longer match an FPU context here.
         */
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}

static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
        /*
         * Make the last_cpu comparison in fpregs_state_valid() fail.
         * NOTE(review): relies on -1 never being a valid CPU number - confirm.
         */
        fpu->last_cpu = -1;
}

static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
        /* The current CPU's register owner has to be @fpu ... */
        if (this_cpu_read(fpu_fpregs_owner_ctx) != fpu)
                return 0;

        /* ... and @fpu->last_cpu has to match @cpu. */
        return fpu->last_cpu == cpu;
}

/*
 * These generally need preemption protection to work,
 * do try to avoid using these on their own:
 */
static inline void fpregs_deactivate(struct fpu *fpu)
{
        /* Record that no FPU context owns this CPU's registers, then trace @fpu. */
        this_cpu_write(fpu_fpregs_owner_ctx, NULL);
        trace_x86_fpu_regs_deactivated(fpu);
}

static inline void fpregs_activate(struct fpu *fpu)
{
        /* Record @fpu as the owner of this CPU's registers, then trace it. */
        this_cpu_write(fpu_fpregs_owner_ctx, fpu);
        trace_x86_fpu_regs_activated(fpu);
}

/*
 * Internal helper, do not use directly. Use switch_fpu_return() instead.
 *
 * Reloads the current task's FPU registers from memory unless they are
 * still valid on this CPU, then clears TIF_NEED_FPU_LOAD. Kernel threads
 * (PF_KTHREAD) trigger a one-time warning and are left untouched.
 */
static inline void __fpregs_load_activate(void)
{
        struct fpu *cur_fpu = &current->thread.fpu;
        int this_cpu = smp_processor_id();

        if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
                return;

        /* Registers are still valid here: only the flag needs clearing. */
        if (fpregs_state_valid(cur_fpu, this_cpu)) {
                clear_thread_flag(TIF_NEED_FPU_LOAD);
                return;
        }

        /* Reload from memory and record the new owner and CPU. */
        copy_kernel_to_fpregs(&cur_fpu->state);
        fpregs_activate(cur_fpu);
        cur_fpu->last_cpu = this_cpu;

        clear_thread_flag(TIF_NEED_FPU_LOAD);
}

/*
 * FPU state switching for scheduling.
 *
 * This is a two-stage process:
 *
 *  - switch_fpu_prepare() saves the old state.
 *    This is done within the context of the old process.
 *
 *  - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
 *    will get loaded on return to userspace, or when the kernel needs it.
 *
 * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
 * are saved in the current thread's FPU register state.
 *
 * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not
 * hold current()'s FPU registers. It is required to load the
 * registers before returning to userland or using the content
 * otherwise.
 *
 * The FPU context is only stored/restored for a user task and
 * PF_KTHREAD is used to distinguish between kernel and user threads.
 */
static inline void switch_fpu_prepare(struct task_struct *prev, int cpu)
{
        struct fpu *old_fpu = &prev->thread.fpu;

        /* Nothing to save without an FPU, or when @prev is a kernel thread. */
        if (!static_cpu_has(X86_FEATURE_FPU))
                return;
        if (prev->flags & PF_KTHREAD)
                return;

        /*
         * Remember @cpu only when copy_fpregs_to_fpstate() returns non-zero;
         * a zero return invalidates last_cpu with -1.
         */
        old_fpu->last_cpu = copy_fpregs_to_fpstate(old_fpu) ? cpu : -1;

        /* But leave fpu_fpregs_owner_ctx! */
        trace_x86_fpu_regs_deactivated(old_fpu);
}

/*
 * Misc helper functions:
 */

/*
 * Set TIF_NEED_FPU_LOAD and switch PKRU.
 *
 * Only PKRU is loaded here, from the FPU context if it is available there.
 * Loading of the complete FPU state is delayed until the return to userland.
 */
static inline void switch_fpu_finish(struct task_struct *next)
{
        struct fpu *next_fpu = &next->thread.fpu;
        u32 pkru_val = init_pkru_value;
        struct pkru_state *pk;

        if (!static_cpu_has(X86_FEATURE_FPU))
                return;

        set_thread_flag(TIF_NEED_FPU_LOAD);

        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return;

        /*
         * PKRU is switched eagerly, because it has to be valid before the
         * return to userland, e.g. for a copy_to_user() operation. Kernel
         * threads (PF_KTHREAD) keep init_pkru_value.
         */
        if (!(next->flags & PF_KTHREAD)) {
                /*
                 * A clear PKRU bit in xsave.header.xfeatures means the PKRU
                 * component was in init state, so XRSTOR would set PKRU to 0.
                 * In that case get_xsave_addr() returns NULL, because the
                 * PKRU value in memory is not valid, and pkru_val has to
                 * become 0 rather than init_pkru_value.
                 */
                pk = get_xsave_addr(&next_fpu->state.xsave, XFEATURE_PKRU);
                if (pk)
                        pkru_val = pk->pkru;
                else
                        pkru_val = 0;
        }

        __write_pkru(pkru_val);
}

#endif /* _ASM_X86_FPU_INTERNAL_H */






















































































    1 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This header provides generic wrappers for memory access instrumentation that
 * the compiler cannot emit for: KASAN, KCSAN.
 */
#ifndef _LINUX_INSTRUMENTED_H
#define _LINUX_INSTRUMENTED_H

#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>
#include <linux/types.h>

/**
 * instrument_read - instrument regular read access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular read access. The instrumentation should be inserted
 * before the actual read happens.
 */
static __always_inline void instrument_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_read(v, size);
}

/**
 * instrument_write - instrument regular write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular write access. The instrumentation should be inserted
 * before the actual write happens.
 */
static __always_inline void instrument_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_write(v, size);
}

/**
 * instrument_read_write - instrument regular read-write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular read-write access. The instrumentation should be
 * inserted before the actual write happens.
 */
static __always_inline void instrument_read_write(const volatile void *v, size_t size)
{
        /* The KASAN side checks the access as a write. */
        kasan_check_write(v, size);
        kcsan_check_read_write(v, size);
}

/**
 * instrument_atomic_read - instrument atomic read access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic read access. The instrumentation should be inserted
 * before the actual read happens.
 */
static __always_inline void instrument_atomic_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_atomic_read(v, size);
}

/**
 * instrument_atomic_write - instrument atomic write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic write access. The instrumentation should be inserted
 * before the actual write happens.
 */
static __always_inline void instrument_atomic_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_atomic_write(v, size);
}

/**
 * instrument_atomic_read_write - instrument atomic read-write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic read-write access. The instrumentation should be
 * inserted before the actual write happens.
 */
static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size)
{
        /* The KASAN side checks the access as a write. */
        kasan_check_write(v, size);
        kcsan_check_atomic_read_write(v, size);
}

/**
 * instrument_copy_to_user - instrument reads of copy_to_user
 * @to: destination address (not inspected by this helper)
 * @from: source address
 * @n: number of bytes to copy
 *
 * Instrument reads from kernel memory, that are due to copy_to_user (and
 * variants). The instrumentation must be inserted before the accesses.
 */
static __always_inline void
instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        kasan_check_read(from, n);
        kcsan_check_read(from, n);
}

/**
 * instrument_copy_from_user - instrument writes of copy_from_user
 * @to: destination address
 * @from: source address (not inspected by this helper)
 * @n: number of bytes to copy
 *
 * Instrument writes to kernel memory, that are due to copy_from_user (and
 * variants). The instrumentation should be inserted before the accesses.
 */
static __always_inline void
instrument_copy_from_user(const void *to, const void __user *from, unsigned long n)
{
        kasan_check_write(to, n);
        kcsan_check_write(to, n);
}

#endif /* _LINUX_INSTRUMENTED_H */

























    5 





































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H

#define HAVE_JUMP_LABEL_BATCH

#define JUMP_LABEL_NOP_SIZE 5

#ifdef CONFIG_X86_64
# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
#else
# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
#endif

#include <asm/asm.h>
#include <asm/nops.h>

#ifndef __ASSEMBLY__

#include <linux/stringify.h>
#include <linux/types.h>

/*
 * Emit a jump label site that starts out as a NOP (STATIC_KEY_INIT_NOP) and
 * record it in the __jump_table section.
 *
 * The table entry holds, each relative to its own location: the site address
 * (label 1), the branch target (l_yes), and @key + @branch emitted with
 * _ASM_PTR.
 *
 * Returns false when execution falls through the site, true when control
 * reaches l_yes.
 * NOTE(review): the site is presumably rewritten at run time by the jump
 * label code; that patching is not visible in this function.
 */
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
        asm_volatile_goto("1:"
                ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
                ".pushsection __jump_table,  \"aw\" \n\t"
                _ASM_ALIGN "\n\t"
                ".long 1b - ., %l[l_yes] - . \n\t"
                _ASM_PTR "%c0 + %c1 - .\n\t"
                ".popsection \n\t"
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

/*
 * Emit a jump label site that starts out as a jump to l_yes (0xe9 followed
 * by a 32-bit displacement relative to label 2) and record it in the
 * __jump_table section.
 *
 * The table entry has the same layout as the one emitted by
 * arch_static_branch(): site address, target, and @key + @branch, each
 * relative to its own location.
 *
 * Returns true when control reaches l_yes, false when execution falls
 * through the site.
 */
static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
{
        asm_volatile_goto("1:"
                ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
                "2:\n\t"
                ".pushsection __jump_table,  \"aw\" \n\t"
                _ASM_ALIGN "\n\t"
                ".long 1b - ., %l[l_yes] - . \n\t"
                _ASM_PTR "%c0 + %c1 - .\n\t"
                ".popsection \n\t"
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

#else        /* __ASSEMBLY__ */

/*
 * Assembler counterpart of a jump label site.
 *
 * target: label to jump to
 * key:    static key recorded in the __jump_table entry
 * def:    when non-zero the site is emitted as a jmp to \target, otherwise
 *         as STATIC_KEY_INIT_NOP
 *
 * The __jump_table entry records the site, \target and \key, each relative
 * to its own location.
 */
.macro STATIC_JUMP_IF_TRUE target, key, def
.Lstatic_jump_\@:
        .if \def
        /* Equivalent to "jmp.d32 \target" */
        .byte                0xe9
        .long                \target - .Lstatic_jump_after_\@
.Lstatic_jump_after_\@:
        .else
        .byte                STATIC_KEY_INIT_NOP
        .endif
        .pushsection __jump_table, "aw"
        _ASM_ALIGN
        .long                .Lstatic_jump_\@ - ., \target - .
        _ASM_PTR        \key - .
        .popsection
.endm

/*
 * Inverse of STATIC_JUMP_IF_TRUE.
 *
 * target: label to jump to
 * key:    static key recorded in the __jump_table entry
 * def:    when non-zero the site is emitted as STATIC_KEY_INIT_NOP,
 *         otherwise as a jmp to \target
 *
 * The __jump_table entry records the site, \target and \key + 1, each
 * relative to its own location.
 * NOTE(review): the "+ 1" presumably plays the role of the branch flag that
 * the C variants add to the key - confirm.
 */
.macro STATIC_JUMP_IF_FALSE target, key, def
.Lstatic_jump_\@:
        .if \def
        .byte                STATIC_KEY_INIT_NOP
        .else
        /* Equivalent to "jmp.d32 \target" */
        .byte                0xe9
        .long                \target - .Lstatic_jump_after_\@
.Lstatic_jump_after_\@:
        .endif
        .pushsection __jump_table, "aw"
        _ASM_ALIGN
        .long                .Lstatic_jump_\@ - ., \target - .
        _ASM_PTR        \key + 1 - .
        .popsection
.endm

#endif        /* __ASSEMBLY__ */

#endif






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 

































































    5 




































    2 

























































    1 





    1 








    1 

    1 


    1 


    1 






































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/kernel/sys.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/export.h>
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/kmod.h>
#include <linux/perf_event.h>
#include <linux/resource.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/key.h>
#include <linux/times.h>
#include <linux/posix-timers.h>
#include <linux/security.h>
#include <linux/dcookies.h>
#include <linux/suspend.h>
#include <linux/tty.h>
#include <linux/signal.h>
#include <linux/cn_proc.h>
#include <linux/getcpu.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/seccomp.h>
#include <linux/cpu.h>
#include <linux/personality.h>
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>

#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
#include <linux/time_namespace.h>
#include <linux/binfmts.h>

#include <linux/sched.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/stat.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>

#include <linux/nospec.h>

#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>

#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/unistd.h>

#include "uid16.h"

/*
 * Fallback definitions for optional control hooks.
 *
 * Each macro below is defined only when nothing included earlier has
 * already provided it; the fallback ignores its arguments and evaluates
 * to -EINVAL.
 * NOTE(review): these look like the per-architecture hooks consumed by the
 * prctl() handler later in this file -- confirm against the callers.
 */
#ifndef SET_UNALIGN_CTL
# define SET_UNALIGN_CTL(a, b)        (-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
# define GET_UNALIGN_CTL(a, b)        (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
# define SET_FPEMU_CTL(a, b)        (-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
# define GET_FPEMU_CTL(a, b)        (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
# define SET_FPEXC_CTL(a, b)        (-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
# define GET_FPEXC_CTL(a, b)        (-EINVAL)
#endif
#ifndef GET_ENDIAN
# define GET_ENDIAN(a, b)        (-EINVAL)
#endif
#ifndef SET_ENDIAN
# define SET_ENDIAN(a, b)        (-EINVAL)
#endif
#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a)                (-EINVAL)
#endif
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a)                (-EINVAL)
#endif
#ifndef GET_FP_MODE
# define GET_FP_MODE(a)                (-EINVAL)
#endif
#ifndef SET_FP_MODE
# define SET_FP_MODE(a,b)        (-EINVAL)
#endif
#ifndef SVE_SET_VL
# define SVE_SET_VL(a)                (-EINVAL)
#endif
#ifndef SVE_GET_VL
# define SVE_GET_VL()                (-EINVAL)
#endif
#ifndef PAC_RESET_KEYS
# define PAC_RESET_KEYS(a, b)        (-EINVAL)
#endif
#ifndef SET_TAGGED_ADDR_CTRL
# define SET_TAGGED_ADDR_CTRL(a)        (-EINVAL)
#endif
#ifndef GET_TAGGED_ADDR_CTRL
# define GET_TAGGED_ADDR_CTRL()                (-EINVAL)
#endif

/*
 * System-wide overflow UID and GID, for architectures that now have
 * 32-bit UID/GID but didn't in the past.  Both start out at their
 * DEFAULT_OVERFLOW* values and are exported for use outside this file.
 */

int overflowuid = DEFAULT_OVERFLOWUID;
int overflowgid = DEFAULT_OVERFLOWGID;

EXPORT_SYMBOL(overflowuid);
EXPORT_SYMBOL(overflowgid);

/*
 * Same idea as above, but for filesystems which can only store a 16-bit
 * UID and GID.  As such, these are needed on all architectures.
 */

int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;

EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);

/*
 * Permission check for changing the nice value of @p.
 *
 * The caller passes when its effective uid equals the real or the
 * effective uid of @p.  Failing that, it passes when it holds
 * CAP_SYS_NICE in the user namespace of @p's credentials.
 *
 * Called with rcu_read_lock held, which keeps @p's credentials safe to read.
 */
static bool set_one_prio_perm(struct task_struct *p)
{
        const struct cred *mine = current_cred();
        const struct cred *theirs = __task_cred(p);

        /* The capability is consulted only when neither uid comparison matches. */
        return uid_eq(theirs->uid, mine->euid) ||
               uid_eq(theirs->euid, mine->euid) ||
               ns_capable(theirs->user_ns, CAP_SYS_NICE);
}

/*
 * set the priority of a task
 * - the caller must hold the RCU read lock
 */
static int set_one_prio(struct task_struct *p, int niceval, int error)
{
        int no_nice;

        if (!set_one_prio_perm(p)) {
                error = -EPERM;
                goto out;
        }
        if (niceval < task_nice(p) && !can_nice(p, niceval)) {
                error = -EACCES;
                goto out;
        }
        no_nice = security_task_setnice(p, niceval);
        if (no_nice) {
                error = no_nice;
                goto out;
        }
        if (error == -ESRCH)
                error = 0;
        set_user_nice(p, niceval);
out:
        return error;
}

/*
 * setpriority(2): set the nice value of one task, of every task in a
 * process group, or of every task owned by a user.
 *
 * @which:   PRIO_PROCESS, PRIO_PGRP or PRIO_USER; anything else is -EINVAL
 * @who:     pid, process group id or uid, interpreted according to @which;
 *           0 selects the caller (its own task, process group or real uid)
 * @niceval: requested nice value, clamped to [MIN_NICE, MAX_NICE]
 *
 * The result starts out as -ESRCH and is threaded through set_one_prio()
 * for every matching task: it becomes 0 once a task has been updated and
 * is replaced by the error code of any task that fails afterwards.  If no
 * task matched at all, -ESRCH is returned.
 */
SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
{
        struct task_struct *g, *p;
        struct user_struct *user;
        const struct cred *cred = current_cred();
        int error = -EINVAL;
        struct pid *pgrp;
        kuid_t uid;

        if (which > PRIO_USER || which < PRIO_PROCESS)
                goto out;

        /* Clamp the requested nice value into the supported range. */
        error = -ESRCH;
        if (niceval < MIN_NICE)
                niceval = MIN_NICE;
        if (niceval > MAX_NICE)
                niceval = MAX_NICE;

        /* Task lookups and iteration below run under RCU plus tasklist_lock. */
        rcu_read_lock();
        read_lock(&tasklist_lock);
        switch (which) {
        case PRIO_PROCESS:
                if (who)
                        p = find_task_by_vpid(who);
                else
                        p = current;
                if (p)
                        error = set_one_prio(p, niceval, error);
                break;
        case PRIO_PGRP:
                if (who)
                        pgrp = find_vpid(who);
                else
                        pgrp = task_pgrp(current);
                do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
                        error = set_one_prio(p, niceval, error);
                } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                break;
        case PRIO_USER:
                uid = make_kuid(cred->user_ns, who);
                user = cred->user;
                if (!who)
                        uid = cred->uid;
                else if (!uid_eq(uid, cred->uid)) {
                        /* Foreign uid: look up its user_struct; released with free_uid() below. */
                        user = find_user(uid);
                        if (!user)
                                goto out_unlock;        /* No processes for this user */
                }
                /* Tasks for which task_pid_vnr() yields 0 are skipped. */
                do_each_thread(g, p) {
                        if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
                                error = set_one_prio(p, niceval, error);
                } while_each_thread(g, p);
                if (!uid_eq(uid, cred->uid))
                        free_uid(user);                /* For find_user() */
                break;
        }
out_unlock:
        read_unlock(&tasklist_lock);
        rcu_read_unlock();
out:
        return error;
}

/*
 * getpriority(2): report the priority of one task, a process group, or
 * all tasks owned by a user.
 *
 * Ugh. To avoid negative return values, "getpriority()" will
 * not return the normal nice-value, but a negated value that
 * has been offset by 20 (ie it returns 40..1 instead of -20..19)
 * to stay compatible.
 *
 * @which: PRIO_PROCESS, PRIO_PGRP or PRIO_USER; anything else is -EINVAL
 * @who:   pid, process group id or uid, interpreted according to @which;
 *         0 selects the caller (its own task, process group or real uid)
 *
 * When several tasks match, the largest converted value among them is
 * returned.  If no task matched, the result is -ESRCH.
 */
SYSCALL_DEFINE2(getpriority, int, which, int, who)
{
        struct task_struct *g, *p;
        struct user_struct *user;
        const struct cred *cred = current_cred();
        long niceval, retval = -ESRCH;
        struct pid *pgrp;
        kuid_t uid;

        if (which > PRIO_USER || which < PRIO_PROCESS)
                return -EINVAL;

        /* Task lookups and iteration below run under RCU plus tasklist_lock. */
        rcu_read_lock();
        read_lock(&tasklist_lock);
        switch (which) {
        case PRIO_PROCESS:
                if (who)
                        p = find_task_by_vpid(who);
                else
                        p = current;
                if (p) {
                        niceval = nice_to_rlimit(task_nice(p));
                        if (niceval > retval)
                                retval = niceval;
                }
                break;
        case PRIO_PGRP:
                if (who)
                        pgrp = find_vpid(who);
                else
                        pgrp = task_pgrp(current);
                /* Keep the maximum converted value seen across the group. */
                do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
                        niceval = nice_to_rlimit(task_nice(p));
                        if (niceval > retval)
                                retval = niceval;
                } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                break;
        case PRIO_USER:
                uid = make_kuid(cred->user_ns, who);
                user = cred->user;
                if (!who)
                        uid = cred->uid;
                else if (!uid_eq(uid, cred->uid)) {
                        /* Foreign uid: look up its user_struct; released with free_uid() below. */
                        user = find_user(uid);
                        if (!user)
                                goto out_unlock;        /* No processes for this user */
                }
                /* Tasks for which task_pid_vnr() yields 0 are skipped. */
                do_each_thread(g, p) {
                        if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
                                niceval = nice_to_rlimit(task_nice(p));
                                if (niceval > retval)
                                        retval = niceval;
                        }
                } while_each_thread(g, p);
                if (!uid_eq(uid, cred->uid))
                        free_uid(user);                /* for find_user() */
                break;
        }
out_unlock:
        read_unlock(&tasklist_lock);
        rcu_read_unlock();

        return retval;
}

/*
 * Unprivileged users may change the real gid to the effective gid
 * or vice versa.  (BSD-style)
 *
 * If you set the real gid at all, or set the effective gid to a value not
 * equal to the real gid, then the saved gid is set to the new effective gid.
 *
 * This makes it possible for a setgid program to completely drop its
 * privileges, which is often a useful assertion to make when you are doing
 * a security audit over a program.
 *
 * The general idea is that a program which uses just setregid() will be
 * 100% compatible with BSD.  A program which uses just setgid() will be
 * 100% compatible with POSIX with saved IDs.
 *
 * SMP: There are no races; the GIDs are checked only by filesystem
 *      operations (as far as semantic preservation is concerned).
 */
#ifdef CONFIG_MULTIUSER
/*
 * __sys_setregid - change the real and/or effective gid of the caller
 * @rgid: new real gid, or (gid_t)-1 to leave the real gid untouched
 * @egid: new effective gid, or (gid_t)-1 to leave the effective gid untouched
 *
 * Returns -EINVAL when a requested id has no mapping in the caller's user
 * namespace, -ENOMEM when no new credentials could be prepared, -EPERM
 * when a requested id matches none of the ids the caller may switch to
 * and the caller lacks CAP_SETGID, or a negative value from
 * security_task_fix_setgid().  Otherwise the result of commit_creds().
 */
long __sys_setregid(gid_t rgid, gid_t egid)
{
        const bool set_rgid = rgid != (gid_t) -1;
        const bool set_egid = egid != (gid_t) -1;
        struct user_namespace *ns = current_user_ns();
        kgid_t krgid = make_kgid(ns, rgid);
        kgid_t kegid = make_kgid(ns, egid);
        const struct cred *old;
        struct cred *new;
        int retval;

        if (set_rgid && !gid_valid(krgid))
                return -EINVAL;
        if (set_egid && !gid_valid(kegid))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        old = current_cred();

        retval = -EPERM;
        if (set_rgid) {
                /* The real gid may become the old real or effective gid. */
                if (!gid_eq(old->gid, krgid) &&
                    !gid_eq(old->egid, krgid) &&
                    !ns_capable_setid(old->user_ns, CAP_SETGID))
                        goto error;
                new->gid = krgid;
        }
        if (set_egid) {
                /* The effective gid may also become the old saved gid. */
                if (!gid_eq(old->gid, kegid) &&
                    !gid_eq(old->egid, kegid) &&
                    !gid_eq(old->sgid, kegid) &&
                    !ns_capable_setid(old->user_ns, CAP_SETGID))
                        goto error;
                new->egid = kegid;
        }

        /*
         * Setting the real gid at all, or setting the effective gid to
         * something other than the old real gid, also updates the saved gid.
         */
        if (set_rgid || (set_egid && !gid_eq(kegid, old->gid)))
                new->sgid = new->egid;
        new->fsgid = new->egid;

        retval = security_task_fix_setgid(new, old, LSM_SETID_RE);
        if (retval < 0)
                goto error;

        return commit_creds(new);

error:
        abort_creds(new);
        return retval;
}

/* setregid(2) entry point: passes both arguments straight to __sys_setregid(). */
SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
{
        return __sys_setregid(rgid, egid);
}

/*
 * __sys_setgid - SysV-style setgid() with saved ids
 * @gid: the gid to switch to
 *
 * With CAP_SETGID the real, effective, saved and filesystem gids are all
 * set to @gid.  Without it, only the effective and filesystem gids are
 * changed, and only when @gid equals the current real or saved gid.
 *
 * Returns -EINVAL when @gid has no mapping in the caller's user namespace,
 * -ENOMEM when no new credentials could be prepared, -EPERM when the
 * change is not allowed, or a negative value from
 * security_task_fix_setgid().  Otherwise the result of commit_creds().
 *
 * SMP: Same implicit races as for setregid().
 */
long __sys_setgid(gid_t gid)
{
        struct user_namespace *ns = current_user_ns();
        kgid_t kgid = make_kgid(ns, gid);
        const struct cred *old;
        struct cred *new;
        int retval;

        if (!gid_valid(kgid))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        old = current_cred();

        if (ns_capable_setid(old->user_ns, CAP_SETGID)) {
                new->gid = kgid;
                new->egid = kgid;
                new->sgid = kgid;
                new->fsgid = kgid;
        } else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) {
                new->egid = kgid;
                new->fsgid = kgid;
        } else {
                retval = -EPERM;
                goto error;
        }

        retval = security_task_fix_setgid(new, old, LSM_SETID_ID);
        if (retval < 0)
                goto error;

        return commit_creds(new);

error:
        abort_creds(new);
        return retval;
}

/* setgid(2) entry point: passes its argument straight to __sys_setgid(). */
SYSCALL_DEFINE1(setgid, gid_t, gid)
{
        return __sys_setgid(gid);
}

/*
 * change the user struct in a credentials set to match the new UID
 */
static int set_user(struct cred *new)
{
        struct user_struct *new_user;

        new_user = alloc_uid(new->uid);
        if (!new_user)
                return -EAGAIN;

        /*
         * We don't fail in case of NPROC limit excess here because too many
         * poorly written programs don't check set*uid() return code, assuming
         * it never fails if called by root.  We may still enforce NPROC limit
         * for programs doing set*uid()+execve() by harmlessly deferring the
         * failure to the execve() stage.
         */
        if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
                        new_user != INIT_USER)
                current->flags |= PF_NPROC_EXCEEDED;
        else
                current->flags &= ~PF_NPROC_EXCEEDED;

        free_uid(new->user);
        new->user = new_user;
        return 0;
}

/*
 * __sys_setreuid - BSD-style change of the real and/or effective uid
 * @ruid: new real uid, or (uid_t)-1 to leave the real uid untouched
 * @euid: new effective uid, or (uid_t)-1 to leave the effective uid untouched
 *
 * Unprivileged callers may set the real uid to the current real or
 * effective uid, and the effective uid to the current real, effective or
 * saved uid.  Anything else needs CAP_SETUID.
 *
 * If the real uid is set at all, or the effective uid is set to a value
 * different from the old real uid, the saved uid is set to the new
 * effective uid.  This lets a setuid program drop its privileges
 * completely, which is a useful property when auditing such a program.
 *
 * The intent is that a program using only setreuid() is 100% compatible
 * with BSD, and one using only setuid() is 100% compatible with POSIX
 * with saved IDs.
 *
 * Returns -EINVAL for an id without a mapping in the caller's user
 * namespace, -ENOMEM when no new credentials could be prepared, -EPERM
 * when the change is not allowed, or a negative value from set_user() or
 * security_task_fix_setuid().  Otherwise the result of commit_creds().
 */
long __sys_setreuid(uid_t ruid, uid_t euid)
{
        const bool set_ruid = ruid != (uid_t) -1;
        const bool set_euid = euid != (uid_t) -1;
        struct user_namespace *ns = current_user_ns();
        kuid_t kruid = make_kuid(ns, ruid);
        kuid_t keuid = make_kuid(ns, euid);
        const struct cred *old;
        struct cred *new;
        int retval;

        if (set_ruid && !uid_valid(kruid))
                return -EINVAL;
        if (set_euid && !uid_valid(keuid))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        old = current_cred();

        retval = -EPERM;
        if (set_ruid) {
                new->uid = kruid;
                if (!(uid_eq(old->uid, kruid) ||
                      uid_eq(old->euid, kruid) ||
                      ns_capable_setid(old->user_ns, CAP_SETUID)))
                        goto error;
        }

        if (set_euid) {
                new->euid = keuid;
                if (!(uid_eq(old->uid, keuid) ||
                      uid_eq(old->euid, keuid) ||
                      uid_eq(old->suid, keuid) ||
                      ns_capable_setid(old->user_ns, CAP_SETUID)))
                        goto error;
        }

        /* A changed real uid needs a matching user_struct. */
        if (!uid_eq(new->uid, old->uid)) {
                retval = set_user(new);
                if (retval < 0)
                        goto error;
        }

        if (set_ruid || (set_euid && !uid_eq(keuid, old->uid)))
                new->suid = new->euid;
        new->fsuid = new->euid;

        retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
        if (retval < 0)
                goto error;

        return commit_creds(new);

error:
        abort_creds(new);
        return retval;
}

/* setreuid(2) entry point: passes both arguments straight to __sys_setreuid(). */
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
        return __sys_setreuid(ruid, euid);
}

/*
 * __sys_setuid - SysV-style setuid() with saved ids
 * @uid: the uid to switch to
 *
 * With CAP_SETUID the real and saved uids are set to @uid as well as the
 * effective and filesystem uids.  Without it, only the effective and
 * filesystem uids change, and only when @uid equals the current real or
 * saved uid.
 *
 * Note that SAVED_IDs is deficient in that a setuid root program like
 * sendmail, for example, cannot set its uid to a normal user and then
 * switch back, because for root setuid() sets the saved uid too.  If you
 * don't like this, blame the bright people in the POSIX committee and/or
 * USG.  The BSD-style setreuid() does let a root program drop privileges
 * temporarily and regain them by swapping the real and effective uid.
 *
 * Returns -EINVAL when @uid has no mapping in the caller's user namespace,
 * -ENOMEM when no new credentials could be prepared, -EPERM when the
 * change is not allowed, or a negative value from set_user() or
 * security_task_fix_setuid().  Otherwise the result of commit_creds().
 */
long __sys_setuid(uid_t uid)
{
        struct user_namespace *ns = current_user_ns();
        kuid_t kuid = make_kuid(ns, uid);
        const struct cred *old;
        struct cred *new;
        int retval;

        if (!uid_valid(kuid))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        old = current_cred();

        if (ns_capable_setid(old->user_ns, CAP_SETUID)) {
                new->uid = kuid;
                new->suid = kuid;
                /* A changed real uid needs a matching user_struct. */
                if (!uid_eq(kuid, old->uid)) {
                        retval = set_user(new);
                        if (retval < 0)
                                goto error;
                }
        } else if (!(uid_eq(kuid, old->uid) || uid_eq(kuid, new->suid))) {
                retval = -EPERM;
                goto error;
        }

        new->euid = kuid;
        new->fsuid = kuid;

        retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
        if (retval < 0)
                goto error;

        return commit_creds(new);

error:
        abort_creds(new);
        return retval;
}

/* setuid(2) entry point: passes its argument straight to __sys_setuid(). */
SYSCALL_DEFINE1(setuid, uid_t, uid)
{
        return __sys_setuid(uid);
}


/*
 * This function implements a generic ability to update ruid, euid,
 * and suid.  This allows you to implement the 4.4 compatible seteuid().
 *
 * @ruid: new real uid, or (uid_t)-1 to keep the current one
 * @euid: new effective uid, or (uid_t)-1 to keep the current one
 * @suid: new saved uid, or (uid_t)-1 to keep the current one
 *
 * Returns 0 without touching the credentials when nothing would change,
 * -EINVAL for an id without a mapping in the caller's user namespace,
 * -EPERM when an id outside the caller's current real/effective/saved set
 * is requested without CAP_SETUID, -ENOMEM when no new credentials could
 * be prepared, or a negative value from set_user() or
 * security_task_fix_setuid().  Otherwise the result of commit_creds().
 */
long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
{
        struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
        kuid_t kruid, keuid, ksuid;
        bool ruid_new, euid_new, suid_new;

        kruid = make_kuid(ns, ruid);
        keuid = make_kuid(ns, euid);
        ksuid = make_kuid(ns, suid);

        if ((ruid != (uid_t) -1) && !uid_valid(kruid))
                return -EINVAL;

        if ((euid != (uid_t) -1) && !uid_valid(keuid))
                return -EINVAL;

        if ((suid != (uid_t) -1) && !uid_valid(ksuid))
                return -EINVAL;

        old = current_cred();

        /*
         * check for no-op: every requested id already has that value
         * (the effective uid must also match the current fsuid, since
         * fsuid is reset to euid below)
         */
        if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) &&
            (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) &&
                                    uid_eq(keuid, old->fsuid))) &&
            (suid == (uid_t) -1 || uid_eq(ksuid, old->suid)))
                return 0;

        /*
         * An id counts as "new" when it is requested and equals none of
         * the current real, effective and saved uids.  Any new id
         * requires CAP_SETUID.
         */
        ruid_new = ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
                   !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid);
        euid_new = euid != (uid_t) -1        && !uid_eq(keuid, old->uid) &&
                   !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid);
        suid_new = suid != (uid_t) -1        && !uid_eq(ksuid, old->uid) &&
                   !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid);
        if ((ruid_new || euid_new || suid_new) &&
            !ns_capable_setid(old->user_ns, CAP_SETUID))
                return -EPERM;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        if (ruid != (uid_t) -1) {
                new->uid = kruid;
                /* A changed real uid needs a matching user_struct. */
                if (!uid_eq(kruid, old->uid)) {
                        retval = set_user(new);
                        if (retval < 0)
                                goto error;
                }
        }
        if (euid != (uid_t) -1)
                new->euid = keuid;
        if (suid != (uid_t) -1)
                new->suid = ksuid;
        /* The filesystem uid always follows the effective uid. */
        new->fsuid = new->euid;

        retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
        if (retval < 0)
                goto error;

        return commit_creds(new);

error:
        abort_creds(new);
        return retval;
}

/* setresuid(2) entry point: passes all three arguments straight to __sys_setresuid(). */
SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
{
        return __sys_setresuid(ruid, euid, suid);
}

/*
 * getresuid(2): store the caller's real, effective and saved uids, as
 * seen from the caller's own user namespace, into the three user pointers.
 *
 * The values are written in the order real, effective, saved.  The first
 * put_user() failure ends the call and its error code is returned;
 * otherwise the result of the last put_user() is returned.
 */
SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
{
        const struct cred *cred = current_cred();
        uid_t ruid = from_kuid_munged(cred->user_ns, cred->uid);
        uid_t euid = from_kuid_munged(cred->user_ns, cred->euid);
        uid_t suid = from_kuid_munged(cred->user_ns, cred->suid);
        int retval;

        retval = put_user(ruid, ruidp);
        if (retval)
                return retval;

        retval = put_user(euid, euidp);
        if (retval)
                return retval;

        return put_user(suid, suidp);
}

/*
 * Generic update of the real, effective and saved gids: the group
 * counterpart of __sys_setresuid().
 *
 * @rgid: new real gid, or (gid_t)-1 to keep the current one
 * @egid: new effective gid, or (gid_t)-1 to keep the current one
 * @sgid: new saved gid, or (gid_t)-1 to keep the current one
 *
 * Returns 0 without touching the credentials when nothing would change,
 * -EINVAL for an id without a mapping in the caller's user namespace,
 * -EPERM when an id outside the caller's current real/effective/saved set
 * is requested without CAP_SETGID, -ENOMEM when no new credentials could
 * be prepared, or a negative value from security_task_fix_setgid().
 * Otherwise the result of commit_creds().
 */
long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
{
        struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
        kgid_t krgid, kegid, ksgid;
        bool rgid_new, egid_new, sgid_new;

        krgid = make_kgid(ns, rgid);
        kegid = make_kgid(ns, egid);
        ksgid = make_kgid(ns, sgid);

        if ((rgid != (gid_t) -1) && !gid_valid(krgid))
                return -EINVAL;
        if ((egid != (gid_t) -1) && !gid_valid(kegid))
                return -EINVAL;
        if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
                return -EINVAL;

        old = current_cred();

        /*
         * check for no-op: every requested id already has that value
         * (the effective gid must also match the current fsgid, since
         * fsgid is reset to egid below)
         */
        if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) &&
            (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) &&
                                    gid_eq(kegid, old->fsgid))) &&
            (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid)))
                return 0;

        /*
         * An id counts as "new" when it is requested and equals none of
         * the current real, effective and saved gids.  Any new id
         * requires CAP_SETGID.
         */
        rgid_new = rgid != (gid_t) -1        && !gid_eq(krgid, old->gid) &&
                   !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid);
        egid_new = egid != (gid_t) -1        && !gid_eq(kegid, old->gid) &&
                   !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid);
        sgid_new = sgid != (gid_t) -1        && !gid_eq(ksgid, old->gid) &&
                   !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid);
        if ((rgid_new || egid_new || sgid_new) &&
            !ns_capable_setid(old->user_ns, CAP_SETGID))
                return -EPERM;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        if (rgid != (gid_t) -1)
                new->gid = krgid;
        if (egid != (gid_t) -1)
                new->egid = kegid;
        if (sgid != (gid_t) -1)
                new->sgid = ksgid;
        /* The filesystem gid always follows the effective gid. */
        new->fsgid = new->egid;

        retval = security_task_fix_setgid(new, old, LSM_SETID_RES);
        if (retval < 0)
                goto error;

        return commit_creds(new);

error:
        abort_creds(new);
        return retval;
}

/* setresgid(2) entry point: passes all three arguments straight to __sys_setresgid(). */
SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
{
        return __sys_setresgid(rgid, egid, sgid);
}

/*
 * getresgid(2): store the caller's real, effective and saved gids, as
 * seen from the caller's own user namespace, into the three user pointers.
 *
 * The values are written in the order real, effective, saved.  The first
 * put_user() failure ends the call and its error code is returned;
 * otherwise the result of the last put_user() is returned.
 */
SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
{
        const struct cred *cred = current_cred();
        gid_t rgid = from_kgid_munged(cred->user_ns, cred->gid);
        gid_t egid = from_kgid_munged(cred->user_ns, cred->egid);
        gid_t sgid = from_kgid_munged(cred->user_ns, cred->sgid);
        int retval;

        retval = put_user(rgid, rgidp);
        if (retval)
                return retval;

        retval = put_user(egid, egidp);
        if (retval)
                return retval;

        return put_user(sgid, sgidp);
}


/*
 * __sys_setfsuid - set the fsuid, the uid used for filesystem checks
 * @uid: the requested filesystem uid
 *
 * The fsuid is used for "access()" and for the NFS daemon (letting nfsd
 * stay at whatever uid it wants to).  It normally shadows "euid", except
 * when explicitly set by setfsuid() or for access.
 *
 * The change is allowed when @uid equals the caller's current real,
 * effective, saved or filesystem uid, or when the caller has CAP_SETUID.
 *
 * This function never reports an error: it always returns the previous
 * fsuid, as seen from the caller's user namespace, whether or not the
 * change took place.
 */
long __sys_setfsuid(uid_t uid)
{
        const struct cred *old = current_cred();
        uid_t old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
        kuid_t kuid = make_kuid(old->user_ns, uid);
        struct cred *new;
        bool permitted;

        if (!uid_valid(kuid))
                return old_fsuid;

        new = prepare_creds();
        if (!new)
                return old_fsuid;

        permitted = uid_eq(kuid, old->uid)  || uid_eq(kuid, old->euid)  ||
                    uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
                    ns_capable_setid(old->user_ns, CAP_SETUID);

        /* Commit only when the value actually changes and the LSM agrees. */
        if (permitted && !uid_eq(kuid, old->fsuid)) {
                new->fsuid = kuid;
                if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) {
                        commit_creds(new);
                        return old_fsuid;
                }
        }

        abort_creds(new);
        return old_fsuid;
}

/* setfsuid(2) entry point: passes its argument straight to __sys_setfsuid(). */
SYSCALL_DEFINE1(setfsuid, uid_t, uid)
{
        return __sys_setfsuid(uid);
}

/*
 * __sys_setfsgid - set the fsgid, the gid used for filesystem checks
 * @gid: the requested filesystem gid
 *
 * Same as __sys_setfsuid(), but for the group id.
 *
 * The change is allowed when @gid equals the caller's current real,
 * effective, saved or filesystem gid, or when the caller has CAP_SETGID.
 *
 * This function never reports an error: it always returns the previous
 * fsgid, as seen from the caller's user namespace, whether or not the
 * change took place.
 */
long __sys_setfsgid(gid_t gid)
{
        const struct cred *old = current_cred();
        gid_t old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
        kgid_t kgid = make_kgid(old->user_ns, gid);
        struct cred *new;
        bool permitted;

        if (!gid_valid(kgid))
                return old_fsgid;

        new = prepare_creds();
        if (!new)
                return old_fsgid;

        permitted = gid_eq(kgid, old->gid)  || gid_eq(kgid, old->egid)  ||
                    gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
                    ns_capable_setid(old->user_ns, CAP_SETGID);

        /* Commit only when the value actually changes and the LSM agrees. */
        if (permitted && !gid_eq(kgid, old->fsgid)) {
                new->fsgid = kgid;
                if (security_task_fix_setgid(new, old, LSM_SETID_FS) == 0) {
                        commit_creds(new);
                        return old_fsgid;
                }
        }

        abort_creds(new);
        return old_fsgid;
}

/* setfsgid(2) entry point: passes its argument straight to __sys_setfsgid(). */
SYSCALL_DEFINE1(setfsgid, gid_t, gid)
{
        return __sys_setfsgid(gid);
}
#endif /* CONFIG_MULTIUSER */

/**
 * sys_getpid - return the thread group id of the current process
 *
 * Note, despite the name, this returns the tgid not the pid.  The tgid and
 * the pid are identical unless CLONE_THREAD was specified on clone() in
 * which case the tgid is the same in all threads of the same group.
 *
 * This is SMP safe as current->tgid does not change.
 *
 * Return: the value of task_tgid_vnr() for the calling task.
 */
SYSCALL_DEFINE0(getpid)
{
        return task_tgid_vnr(current);
}

/*
 * Thread ID - the internal kernel "pid".
 *
 * Unlike getpid(), which reports the thread group id, this returns
 * task_pid_vnr() of the calling task itself.
 */
SYSCALL_DEFINE0(gettid)
{
        return task_pid_vnr(current);
}

/*
 * getppid(2): return the thread group id of the caller's real parent.
 *
 * Accessing ->real_parent is not SMP-safe, it could
 * change from under us. However, we can use a stale
 * value of ->real_parent under rcu_read_lock(), see
 * release_task()->call_rcu(delayed_put_task_struct).
 */
SYSCALL_DEFINE0(getppid)
{
        int pid;

        /* The parent's task_struct is only dereferenced inside the RCU section. */
        rcu_read_lock();
        pid = task_tgid_vnr(rcu_dereference(current->real_parent));
        rcu_read_unlock();

        return pid;
}

SYSCALL_DEFINE0(getuid)
{
        /* Only we change this so SMP safe */
        return from_kuid_munged(current_user_ns(), current_uid());
}

SYSCALL_DEFINE0(geteuid)
{
        /* Only we change this so SMP safe */
        return from_kuid_munged(current_user_ns(), current_euid());
}

SYSCALL_DEFINE0(getgid)
{
        /* Only we change this so SMP safe */
        return from_kgid_munged(current_user_ns(), current_gid());
}

SYSCALL_DEFINE0(getegid)
{
        /* Only we change this so SMP safe */
        return from_kgid_munged(current_user_ns(), current_egid());
}

static void do_sys_times(struct tms *tms)
{
        u64 tgutime, tgstime, cutime, cstime;

        thread_group_cputime_adjusted(current, &tgutime, &tgstime);
        cutime = current->signal->cutime;
        cstime = current->signal->cstime;
        tms->tms_utime = nsec_to_clock_t(tgutime);
        tms->tms_stime = nsec_to_clock_t(tgstime);
        tms->tms_cutime = nsec_to_clock_t(cutime);
        tms->tms_cstime = nsec_to_clock_t(cstime);
}

/*
 * times(2): optionally copy the caller's CPU times to @tbuf and return
 * the current 64-bit jiffies counter converted to clock ticks.
 *
 * @tbuf may be NULL, in which case only the tick count is returned.
 * Returns -EFAULT when @tbuf cannot be written.
 */
SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
{
        struct tms snapshot;

        if (tbuf) {
                do_sys_times(&snapshot);
                if (copy_to_user(tbuf, &snapshot, sizeof(snapshot)))
                        return -EFAULT;
        }

        /* Mark the call as successful whatever value the tick count has. */
        force_successful_syscall_return();
        return (long) jiffies_64_to_clock_t(get_jiffies_64());
}

#ifdef CONFIG_COMPAT
/*
 * Convert a native clock_t tick count into a compat one, by going
 * through jiffies.
 */
static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
{
        unsigned long as_jiffies = clock_t_to_jiffies(x);

        return compat_jiffies_to_clock_t(as_jiffies);
}

/*
 * Compat times(2): same as the native call, but the four CPU times are
 * converted to compat_clock_t and written as a struct compat_tms.
 *
 * @tbuf may be NULL, in which case only the tick count is returned.
 * Returns -EFAULT when @tbuf cannot be written.
 */
COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
{
        struct tms native;
        struct compat_tms out;

        if (tbuf) {
                do_sys_times(&native);

                /* Translate each field of the native result for the compat caller. */
                out.tms_utime = clock_t_to_compat_clock_t(native.tms_utime);
                out.tms_stime = clock_t_to_compat_clock_t(native.tms_stime);
                out.tms_cutime = clock_t_to_compat_clock_t(native.tms_cutime);
                out.tms_cstime = clock_t_to_compat_clock_t(native.tms_cstime);

                if (copy_to_user(tbuf, &out, sizeof(out)))
                        return -EFAULT;
        }

        /* Mark the call as successful whatever value the tick count has. */
        force_successful_syscall_return();
        return compat_jiffies_to_clock_t(jiffies);
}
#endif

/*
 * setpgid(2): move task @pid into process group @pgid.
 *
 * This needs some heavy checking ...
 * I just haven't the stomach for it. I also don't fully
 * understand sessions/pgrp etc. Let somebody who does explain it.
 *
 * OK, I think I have the protection semantics right.... this is really
 * only important on a multi-user system anyway, to make sure one user
 * can't send a signal to a process owned by another.  -TYT, 12/12/91
 *
 * !PF_FORKNOEXEC check to conform completely to POSIX.
 *
 * @pid:  target task; 0 means the caller's thread group leader
 * @pgid: destination process group; 0 means "same as @pid"
 *
 * Errors, in the order they are checked:
 *   -EINVAL  @pgid is negative, or the target is not a thread group leader
 *   -ESRCH   no task with @pid, or the target is neither a child of the
 *            caller's thread group nor the caller's own group leader
 *   -EPERM   the target is a child in a different session, the target is a
 *            session leader, or @pgid names no process group in the
 *            caller's session
 *   -EACCES  the target is a child without PF_FORKNOEXEC set
 *   plus whatever security_task_setpgid() returns.
 */
SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
        struct task_struct *p;
        struct task_struct *group_leader = current->group_leader;
        struct pid *pgrp;
        int err;

        if (!pid)
                pid = task_pid_vnr(group_leader);
        if (!pgid)
                pgid = pid;
        if (pgid < 0)
                return -EINVAL;
        rcu_read_lock();

        /* From this point forward we keep holding onto the tasklist lock
         * so that our parent does not change from under us. -DaveM
         */
        write_lock_irq(&tasklist_lock);

        err = -ESRCH;
        p = find_task_by_vpid(pid);
        if (!p)
                goto out;

        err = -EINVAL;
        if (!thread_group_leader(p))
                goto out;

        if (same_thread_group(p->real_parent, group_leader)) {
                /* Target is a child of the caller's thread group. */
                err = -EPERM;
                if (task_session(p) != task_session(group_leader))
                        goto out;
                err = -EACCES;
                if (!(p->flags & PF_FORKNOEXEC))
                        goto out;
        } else {
                /* Otherwise the only acceptable target is the caller's own leader. */
                err = -ESRCH;
                if (p != group_leader)
                        goto out;
        }

        /* A session leader may not change its process group. */
        err = -EPERM;
        if (p->signal->leader)
                goto out;

        /* Default destination: the group identified by the target's own pid. */
        pgrp = task_pid(p);
        if (pgid != pid) {
                struct task_struct *g;

                /* Joining another group: it must have a member in the caller's session. */
                pgrp = find_vpid(pgid);
                g = pid_task(pgrp, PIDTYPE_PGID);
                if (!g || task_session(g) != task_session(group_leader))
                        goto out;
        }

        err = security_task_setpgid(p, pgid);
        if (err)
                goto out;

        if (task_pgrp(p) != pgrp)
                change_pid(p, PIDTYPE_PGID, pgrp);

        err = 0;
out:
        /* All paths lead to here, thus we are safe. -DaveM */
        write_unlock_irq(&tasklist_lock);
        rcu_read_unlock();
        return err;
}

/*
 * Look up the process group id of task @pid, as seen from the caller's
 * pid namespace.  A @pid of 0 means the calling task, for which no
 * security hook is consulted.
 *
 * Returns -ESRCH when no such task exists or it has no process group, or
 * the non-zero result of security_task_getpgid().
 */
static int do_getpgid(pid_t pid)
{
        struct task_struct *target;
        struct pid *pgrp;
        int retval = -ESRCH;

        rcu_read_lock();
        if (pid) {
                target = find_task_by_vpid(pid);
                if (!target)
                        goto unlock;

                pgrp = task_pgrp(target);
                if (!pgrp)
                        goto unlock;

                retval = security_task_getpgid(target);
                if (retval)
                        goto unlock;
        } else {
                pgrp = task_pgrp(current);
        }
        retval = pid_vnr(pgrp);
unlock:
        rcu_read_unlock();
        return retval;
}

/* getpgid(2) entry point: passes its argument straight to do_getpgid(). */
SYSCALL_DEFINE1(getpgid, pid_t, pid)
{
        return do_getpgid(pid);
}

#ifdef __ARCH_WANT_SYS_GETPGRP

/* getpgrp(2): the caller's own process group, i.e. do_getpgid() with pid 0. */
SYSCALL_DEFINE0(getpgrp)
{
        return do_getpgid(0);
}

#endif

/*
 * getsid syscall: return the session id of task @pid, or of the caller
 * when @pid is 0.
 *
 * For a non-zero @pid the task is looked up with find_task_by_vpid() and
 * security_task_getsid() may veto the query.  Yields -ESRCH when the task
 * or its session cannot be found, the security hook's error when it
 * refuses, and otherwise pid_vnr() of the session.
 */
SYSCALL_DEFINE1(getsid, pid_t, pid)
{
        struct task_struct *task;
        struct pid *session;
        int ret;

        rcu_read_lock();
        if (pid) {
                ret = -ESRCH;
                task = find_task_by_vpid(pid);
                if (!task)
                        goto unlock;
                session = task_session(task);
                if (!session)
                        goto unlock;

                ret = security_task_getsid(task);
                if (ret)
                        goto unlock;
        } else {
                /* Querying ourselves: no lookup, no security hook. */
                session = task_session(current);
        }
        ret = pid_vnr(session);
unlock:
        rcu_read_unlock();
        return ret;
}

static void set_special_pids(struct pid *pid)
{
        struct task_struct *curr = current->group_leader;

        if (task_session(curr) != pid)
                change_pid(curr, PIDTYPE_SID, pid);

        if (task_pgrp(curr) != pid)
                change_pid(curr, PIDTYPE_PGID, pid);
}

/*
 * Make the caller's thread group the leader of a new session.
 *
 * The proposed session id is the pid of current->group_leader.  The call
 * fails with -EPERM when the group leader is already flagged as a session
 * leader, or when some task already uses that pid as its process group id.
 * On success the group leader is flagged as session leader, its session and
 * process group are set to that pid, its tty is cleared, and the value
 * pid_vnr() reported for the pid is returned.
 */
int ksys_setsid(void)
{
        struct task_struct *group_leader = current->group_leader;
        struct pid *sid = task_pid(group_leader);
        pid_t session = pid_vnr(sid);
        int err = -EPERM;

        write_lock_irq(&tasklist_lock);
        /* Fail if I am already a session leader */
        if (group_leader->signal->leader)
                goto out;

        /* Fail if a process group id already exists that equals the
         * proposed session id.
         */
        if (pid_task(sid, PIDTYPE_PGID))
                goto out;

        group_leader->signal->leader = 1;
        set_special_pids(sid);

        proc_clear_tty(group_leader);

        err = session;
out:
        write_unlock_irq(&tasklist_lock);
        /* Success only: these two calls run after tasklist_lock is dropped. */
        if (err > 0) {
                proc_sid_connector(group_leader);
                sched_autogroup_create_attach(group_leader);
        }
        return err;
}

/* setsid syscall entry point: returns whatever ksys_setsid() returns. */
SYSCALL_DEFINE0(setsid)
{
        return ksys_setsid();
}

/* Held (for read or write) around the utsname() accesses in this file. */
DECLARE_RWSEM(uts_sem);

/*
 * override_architecture(name) - when COMPAT_UTS_MACHINE is defined and the
 * caller's personality is PER_LINUX32, overwrite name->machine in the user
 * buffer with COMPAT_UTS_MACHINE.
 *
 * Evaluates to non-zero only when the override applies and copy_to_user()
 * reports a failure; the callers in this file map that to -EFAULT.
 * Without COMPAT_UTS_MACHINE it is the constant 0.
 */
#ifdef COMPAT_UTS_MACHINE
#define override_architecture(name) \
        (personality(current->personality) == PER_LINUX32 && \
         copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
                      sizeof(COMPAT_UTS_MACHINE)))
#else
#define override_architecture(name)        0
#endif

/*
 * Work around broken programs that cannot handle "Linux 3.0" style
 * release strings.
 *
 * When the caller has UNAME26 set in its personality, the release string
 * in the user buffer @release (capacity @len bytes) is replaced by
 * "2.6.<N><tail>", where <N> is bits 8..15 of LINUX_VERSION_CODE plus 60
 * and <tail> is what remains of UTS_RELEASE after its leading run of
 * digits and at most two dots.  So 4.0/5.0/6.0/... are all reported as
 * 2.6.60.
 *
 * Returns 0 when no override is needed, otherwise the copy_to_user()
 * result (non-zero on fault).
 */
static int override_release(char __user *release, size_t len)
{
        const char *tail = UTS_RELEASE;
        char buf[65] = { 0 };
        int dots = 0;
        unsigned minor;
        size_t n;

        if (!(current->personality & UNAME26))
                return 0;

        /* Skip "a.b.c": stop at the third dot or the first other character. */
        for (; *tail; tail++) {
                if (*tail == '.' && ++dots >= 3)
                        break;
                if (!isdigit(*tail) && *tail != '.')
                        break;
        }

        minor = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;

        /* Never format more than the user buffer or our own buffer holds. */
        n = clamp_t(size_t, len, 1, sizeof(buf));
        n = scnprintf(buf, n, "2.6.%u%s", minor, tail);

        /* +1 so the terminating NUL is copied out as well. */
        return copy_to_user(release, buf, n + 1);
}

/*
 * newuname syscall: copy a snapshot of utsname(), taken under uts_sem, to
 * the user buffer @name, then apply the release and architecture overrides
 * directly on that user buffer.  Any failed user copy yields -EFAULT.
 */
SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
{
        struct new_utsname snapshot;

        down_read(&uts_sem);
        memcpy(&snapshot, utsname(), sizeof(snapshot));
        up_read(&uts_sem);

        /* Evaluated left to right; the first failure stops the chain. */
        if (copy_to_user(name, &snapshot, sizeof(snapshot)) ||
            override_release(name->release, sizeof(name->release)) ||
            override_architecture(name))
                return -EFAULT;

        return 0;
}

#ifdef __ARCH_WANT_SYS_OLD_UNAME
/*
 * Old cruft
 */
/*
 * uname syscall (old_utsname layout): reject a NULL @name, copy the
 * leading sizeof(struct old_utsname) bytes of utsname() (read under
 * uts_sem) to userspace, then apply the release and architecture
 * overrides on the user buffer.  Every failure is reported as -EFAULT.
 */
SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
{
        struct old_utsname snapshot;

        if (!name)
                return -EFAULT;

        down_read(&uts_sem);
        memcpy(&snapshot, utsname(), sizeof(snapshot));
        up_read(&uts_sem);

        /* Evaluated left to right; the first failure stops the chain. */
        if (copy_to_user(name, &snapshot, sizeof(snapshot)) ||
            override_release(name->release, sizeof(name->release)) ||
            override_architecture(name))
                return -EFAULT;

        return 0;
}

/*
 * olduname syscall (oldold_utsname layout): reject a NULL @name, then copy
 * the first __OLD_UTS_LEN bytes of each utsname() field into a zeroed
 * local structure under uts_sem and hand that to userspace.  The
 * architecture override is applied before the release override here.
 * Every failure is reported as -EFAULT.
 */
SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
{
        struct oldold_utsname out;

        if (!name)
                return -EFAULT;

        /* Start from all zeroes; only __OLD_UTS_LEN bytes per field follow. */
        memset(&out, 0, sizeof(out));

        down_read(&uts_sem);
        memcpy(&out.sysname, &utsname()->sysname, __OLD_UTS_LEN);
        memcpy(&out.nodename, &utsname()->nodename, __OLD_UTS_LEN);
        memcpy(&out.release, &utsname()->release, __OLD_UTS_LEN);
        memcpy(&out.version, &utsname()->version, __OLD_UTS_LEN);
        memcpy(&out.machine, &utsname()->machine, __OLD_UTS_LEN);
        up_read(&uts_sem);

        /* Evaluated left to right; the first failure stops the chain. */
        if (copy_to_user(name, &out, sizeof(out)) ||
            override_architecture(name) ||
            override_release(name->release, sizeof(name->release)))
                return -EFAULT;

        return 0;
}
#endif

/*
 * sethostname syscall: replace utsname()->nodename with the @len bytes at
 * user address @name, zero-filling the remainder of the field.
 *
 * Returns -EPERM without CAP_SYS_ADMIN in the user namespace of the
 * caller's uts namespace, -EINVAL when @len is negative or larger than
 * __NEW_UTS_LEN, -EFAULT when the user copy fails, and 0 on success.
 */
SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
{
        struct new_utsname *uts;
        char tmp[__NEW_UTS_LEN];

        if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (len < 0 || len > __NEW_UTS_LEN)
                return -EINVAL;

        /* Fetch the new name before taking uts_sem. */
        if (copy_from_user(tmp, name, len))
                return -EFAULT;

        down_write(&uts_sem);
        uts = utsname();
        memcpy(uts->nodename, tmp, len);
        memset(uts->nodename + len, 0, sizeof(uts->nodename) - len);
        uts_proc_notify(UTS_PROC_HOSTNAME);
        up_write(&uts_sem);

        return 0;
}

#ifdef __ARCH_WANT_SYS_GETHOSTNAME

/*
 * gethostname syscall: copy utsname()->nodename to the user buffer @name.
 *
 * At most @len bytes are written.  The terminating NUL is included only
 * when the buffer has room for it.  Returns -EINVAL for a negative @len,
 * -EFAULT when the user copy fails, and 0 otherwise.
 */
SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
{
        char tmp[__NEW_UTS_LEN + 1];
        struct new_utsname *uts;
        int count;

        if (len < 0)
                return -EINVAL;

        down_read(&uts_sem);
        uts = utsname();
        /* Name plus its NUL, capped at the caller's buffer size. */
        count = 1 + strlen(uts->nodename);
        count = (count > len) ? len : count;
        memcpy(tmp, uts->nodename, count);
        up_read(&uts_sem);

        return copy_to_user(name, tmp, count) ? -EFAULT : 0;
}

#endif

/*
 * Only setdomainname; getdomainname can be implemented by calling
 * uname()
 *
 * Replaces utsname()->domainname with the @len bytes at user address
 * @name, zero-filling the remainder of the field.  Returns -EPERM without
 * CAP_SYS_ADMIN in the user namespace of the caller's uts namespace,
 * -EINVAL when @len is negative or larger than __NEW_UTS_LEN, -EFAULT
 * when the user copy fails, and 0 on success.
 */
SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
{
        struct new_utsname *uts;
        char tmp[__NEW_UTS_LEN];

        if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
                return -EPERM;
        if (len < 0 || len > __NEW_UTS_LEN)
                return -EINVAL;

        /* Fetch the new name before taking uts_sem. */
        if (copy_from_user(tmp, name, len))
                return -EFAULT;

        down_write(&uts_sem);
        uts = utsname();
        memcpy(uts->domainname, tmp, len);
        memset(uts->domainname + len, 0, sizeof(uts->domainname) - len);
        uts_proc_notify(UTS_PROC_DOMAINNAME);
        up_write(&uts_sem);

        return 0;
}

/*
 * getrlimit syscall: read the caller's limit for @resource through
 * do_prlimit() and copy it to the user buffer @rlim.  Returns the
 * do_prlimit() error, -EFAULT when the user copy fails, or 0.
 */
SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
        struct rlimit value;
        int ret = do_prlimit(current, resource, NULL, &value);

        if (ret)
                return ret;

        if (copy_to_user(rlim, &value, sizeof(*rlim)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT

/*
 * Compat setrlimit: read a struct compat_rlimit from userspace, widen it
 * to a struct rlimit (COMPAT_RLIM_INFINITY becomes RLIM_INFINITY) and
 * apply it to the caller through do_prlimit().  Returns -EFAULT when the
 * user copy fails, otherwise the do_prlimit() result.
 */
COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
                       struct compat_rlimit __user *, rlim)
{
        struct compat_rlimit user_lim;
        struct rlimit lim;

        if (copy_from_user(&user_lim, rlim, sizeof(struct compat_rlimit)))
                return -EFAULT;

        lim.rlim_cur = (user_lim.rlim_cur == COMPAT_RLIM_INFINITY) ?
                        RLIM_INFINITY : user_lim.rlim_cur;
        lim.rlim_max = (user_lim.rlim_max == COMPAT_RLIM_INFINITY) ?
                        RLIM_INFINITY : user_lim.rlim_max;

        return do_prlimit(current, resource, &lim, NULL);
}

/*
 * Compat getrlimit: read the caller's limit for @resource through
 * do_prlimit(), narrow it to a struct compat_rlimit (values above
 * COMPAT_RLIM_INFINITY are reported as COMPAT_RLIM_INFINITY) and copy it
 * to userspace.  Returns the do_prlimit() error, -EFAULT when the user
 * copy fails, or 0.
 */
COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
                       struct compat_rlimit __user *, rlim)
{
        struct compat_rlimit out;
        struct rlimit lim;
        int ret;

        ret = do_prlimit(current, resource, NULL, &lim);
        if (ret)
                return ret;

        out.rlim_cur = (lim.rlim_cur > COMPAT_RLIM_INFINITY) ?
                        COMPAT_RLIM_INFINITY : lim.rlim_cur;
        out.rlim_max = (lim.rlim_max > COMPAT_RLIM_INFINITY) ?
                        COMPAT_RLIM_INFINITY : lim.rlim_max;

        if (copy_to_user(rlim, &out, sizeof(struct compat_rlimit)))
                return -EFAULT;

        return 0;
}

#endif

#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT

/*
 *        Back compatibility for getrlimit. Needed for some apps.
 *
 *        Reads current->signal->rlim[@resource] under the group leader's
 *        task lock, caps both values at 0x7FFFFFFF and copies the result
 *        to @rlim.  Returns -EINVAL for an out-of-range @resource, -EFAULT
 *        when the user copy fails, or 0.
 */
SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
                struct rlimit __user *, rlim)
{
        struct rlimit lim;

        if (resource >= RLIM_NLIMITS)
                return -EINVAL;

        resource = array_index_nospec(resource, RLIM_NLIMITS);

        task_lock(current->group_leader);
        lim = current->signal->rlim[resource];
        task_unlock(current->group_leader);

        if (lim.rlim_cur > 0x7FFFFFFF)
                lim.rlim_cur = 0x7FFFFFFF;
        if (lim.rlim_max > 0x7FFFFFFF)
                lim.rlim_max = 0x7FFFFFFF;

        if (copy_to_user(rlim, &lim, sizeof(lim)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of old_getrlimit: reads current->signal->rlim[@resource]
 * under the group leader's task lock, caps both values at 0x7FFFFFFF and
 * stores them field by field into the user's struct compat_rlimit.
 * Returns -EINVAL for an out-of-range @resource, -EFAULT when either
 * put_user() fails, or 0.
 */
COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
                       struct compat_rlimit __user *, rlim)
{
        struct rlimit r;

        if (resource >= RLIM_NLIMITS)
                return -EINVAL;

        /* Clamp the already-validated index before using it as an array subscript. */
        resource = array_index_nospec(resource, RLIM_NLIMITS);
        task_lock(current->group_leader);
        r = current->signal->rlim[resource];
        task_unlock(current->group_leader);
        if (r.rlim_cur > 0x7FFFFFFF)
                r.rlim_cur = 0x7FFFFFFF;
        if (r.rlim_max > 0x7FFFFFFF)
                r.rlim_max = 0x7FFFFFFF;

        if (put_user(r.rlim_cur, &rlim->rlim_cur) ||
            put_user(r.rlim_max, &rlim->rlim_max))
                return -EFAULT;
        return 0;
}
#endif

#endif

/*
 * Tell whether the 64-bit limit value @rlim64 must be treated as
 * "unlimited".  When BITS_PER_LONG < 64 every value >= ULONG_MAX counts
 * as infinity; otherwise only RLIM64_INFINITY itself does.
 */
static inline bool rlim64_is_infinity(__u64 rlim64)
{
#if BITS_PER_LONG < 64
        return rlim64 >= ULONG_MAX;
#else
        return rlim64 == RLIM64_INFINITY;
#endif
}

/*
 * Widen @rlim into @rlim64.  RLIM_INFINITY in either field is translated
 * to RLIM64_INFINITY; every other value is copied as is.
 */
static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
{
        rlim64->rlim_cur = (rlim->rlim_cur == RLIM_INFINITY) ?
                                RLIM64_INFINITY : rlim->rlim_cur;
        rlim64->rlim_max = (rlim->rlim_max == RLIM_INFINITY) ?
                                RLIM64_INFINITY : rlim->rlim_max;
}

/*
 * Narrow @rlim64 into @rlim.  A field for which rlim64_is_infinity() holds
 * becomes RLIM_INFINITY; every other value is cast to unsigned long.
 */
static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
{
        rlim->rlim_cur = rlim64_is_infinity(rlim64->rlim_cur) ?
                                RLIM_INFINITY : (unsigned long)rlim64->rlim_cur;
        rlim->rlim_max = rlim64_is_infinity(rlim64->rlim_max) ?
                                RLIM_INFINITY : (unsigned long)rlim64->rlim_max;
}

/*
 * Read and/or replace one resource limit of @tsk.
 *
 * Make sure you are allowed to change @tsk limits before calling this.
 *
 * @tsk:      task whose tsk->signal->rlim[@resource] entry is accessed
 * @resource: limit index; values >= RLIM_NLIMITS are rejected
 * @new_rlim: limit to install, or NULL to leave the limit untouched
 * @old_rlim: receives the limit as it was before the update, or NULL
 *
 * Returns 0 on success, -EINVAL for a bad @resource or when the new soft
 * limit exceeds the new hard limit, -EPERM when RLIMIT_NOFILE's hard limit
 * would exceed sysctl_nr_open or when the hard limit is raised without
 * CAP_SYS_RESOURCE, -ESRCH when @tsk has no sighand, or the error returned
 * by security_task_setrlimit().
 */
int do_prlimit(struct task_struct *tsk, unsigned int resource,
                struct rlimit *new_rlim, struct rlimit *old_rlim)
{
        struct rlimit *rlim;
        int retval = 0;

        if (resource >= RLIM_NLIMITS)
                return -EINVAL;
        /* Clamp the already-validated index before using it as an array offset. */
        resource = array_index_nospec(resource, RLIM_NLIMITS);

        /* Checks that need no locking are done up front. */
        if (new_rlim) {
                if (new_rlim->rlim_cur > new_rlim->rlim_max)
                        return -EINVAL;
                if (resource == RLIMIT_NOFILE &&
                                new_rlim->rlim_max > sysctl_nr_open)
                        return -EPERM;
        }

        /* protect tsk->signal and tsk->sighand from disappearing */
        read_lock(&tasklist_lock);
        if (!tsk->sighand) {
                retval = -ESRCH;
                goto out;
        }

        rlim = tsk->signal->rlim + resource;
        /* The limit itself is read and written under the group leader's task lock. */
        task_lock(tsk->group_leader);
        if (new_rlim) {
                /* Keep the capable check against init_user_ns until
                   cgroups can contain all limits */
                if (new_rlim->rlim_max > rlim->rlim_max &&
                                !capable(CAP_SYS_RESOURCE))
                        retval = -EPERM;
                if (!retval)
                        retval = security_task_setrlimit(tsk, resource, new_rlim);
        }
        if (!retval) {
                /* Snapshot the old value before it is overwritten. */
                if (old_rlim)
                        *old_rlim = *rlim;
                if (new_rlim)
                        *rlim = *new_rlim;
        }
        task_unlock(tsk->group_leader);

        /*
         * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
         * infinite. In case of RLIM_INFINITY the posix CPU timer code
         * ignores the rlimit.
         */
         if (!retval && new_rlim && resource == RLIMIT_CPU &&
             new_rlim->rlim_cur != RLIM_INFINITY &&
             IS_ENABLED(CONFIG_POSIX_TIMERS))
                update_rlimit_cpu(tsk, new_rlim->rlim_cur);
out:
        read_unlock(&tasklist_lock);
        return retval;
}

/*
 * Decide whether the caller may access the resource limits of @task.
 * rcu lock must be held.
 *
 * The caller is always allowed to access its own limits.  For another
 * task, the caller's uid/gid must equal all of the target's real,
 * effective and saved ids, or the caller must hold CAP_SYS_RESOURCE in
 * the target's user namespace.  If so, the decision is left to
 * security_task_prlimit(), which receives @flags.  Otherwise -EPERM.
 */
static int check_prlimit_permission(struct task_struct *task,
                                    unsigned int flags)
{
        const struct cred *cred = current_cred(), *tcred;
        bool same_ids;

        if (task == current)
                return 0;

        tcred = __task_cred(task);
        same_ids = uid_eq(cred->uid, tcred->euid) &&
                   uid_eq(cred->uid, tcred->suid) &&
                   uid_eq(cred->uid, tcred->uid)  &&
                   gid_eq(cred->gid, tcred->egid) &&
                   gid_eq(cred->gid, tcred->sgid) &&
                   gid_eq(cred->gid, tcred->gid);

        /* The capability is only consulted when the ids do not match. */
        if (same_ids || ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
                return security_task_prlimit(cred, tcred, flags);

        return -EPERM;
}

/*
 * prlimit64 syscall: read and/or set the @resource limit of task @pid
 * (the caller when @pid is 0) using the 64-bit struct rlimit64 layout.
 *
 * @new_rlim: user pointer to the limit to install, or NULL
 * @old_rlim: user pointer receiving the previous limit, or NULL
 *
 * Returns -EFAULT on a failed user copy, -ESRCH when no task matches @pid,
 * the error from check_prlimit_permission() or do_prlimit(), or 0.
 */
SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
                const struct rlimit64 __user *, new_rlim,
                struct rlimit64 __user *, old_rlim)
{
        struct rlimit64 old64, new64;
        struct rlimit old, new;
        struct task_struct *tsk;
        unsigned int checkflags = 0;
        int ret;

        /* checkflags records which kinds of access are being requested. */
        if (old_rlim)
                checkflags |= LSM_PRLIMIT_READ;

        if (new_rlim) {
                if (copy_from_user(&new64, new_rlim, sizeof(new64)))
                        return -EFAULT;
                rlim64_to_rlim(&new64, &new);
                checkflags |= LSM_PRLIMIT_WRITE;
        }

        rcu_read_lock();
        tsk = pid ? find_task_by_vpid(pid) : current;
        if (!tsk) {
                rcu_read_unlock();
                return -ESRCH;
        }
        ret = check_prlimit_permission(tsk, checkflags);
        if (ret) {
                rcu_read_unlock();
                return ret;
        }
        /* Take a task reference before leaving the RCU read-side section. */
        get_task_struct(tsk);
        rcu_read_unlock();

        ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
                        old_rlim ? &old : NULL);

        if (!ret && old_rlim) {
                rlim_to_rlim64(&old, &old64);
                if (copy_to_user(old_rlim, &old64, sizeof(old64)))
                        ret = -EFAULT;
        }

        put_task_struct(tsk);
        return ret;
}

/*
 * setrlimit syscall: copy a struct rlimit from the user pointer @rlim and
 * install it as the caller's limit for @resource through do_prlimit().
 * Returns -EFAULT when the user copy fails, otherwise do_prlimit()'s result.
 */
SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
        struct rlimit new_rlim;

        if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
                return -EFAULT;
        return do_prlimit(current, resource, &new_rlim, NULL);
}

/*
 * It would make sense to put struct rusage in the task_struct,
 * except that would make the task_struct be *really big*.  After
 * task_struct gets moved into malloc'ed memory, it would
 * make sense to do this.  It will make moving the rest of the information
 * a lot simpler!  (Which we're not doing right now because we're not
 * measuring them yet).
 *
 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
 * races with threads incrementing their own counters.  But since word
 * reads are atomic, we either get new values or old values and we don't
 * care which for the sums.  We always take the siglock to protect reading
 * the c* fields from p->signal from races with exit.c updating those
 * fields when reaping, so a sample either gets all the additions of a
 * given child after it's reaped, or none so this sample is before reaping.
 *
 * Locking:
 * We need to take the siglock for CHILDREN, SELF and BOTH
 * for the cases current multithreaded, non-current single threaded and
 * non-current multithreaded.  Thread traversal is now safe with
 * the siglock held.
 * Strictly speaking, we do not need to take the siglock if we are current and
 * single threaded, as no one else can take our signal_struct away, no one
 * else can reap the children to update signal->c* counters, and no one else
 * can race with the signal-> fields. If we do not take any lock, the
 * signal-> fields could be read out of order while another thread was just
 * exiting. So we should place a read memory barrier when we avoid the lock.
 * On the writer side, a write memory barrier is implied in __exit_signal
 * as __exit_signal releases the siglock spinlock after updating the signal->
 * fields. But we don't do this yet to keep things simple.
 *
 */

/*
 * Add the per-thread counters of @t (context switches, page faults and
 * block I/O) to the running totals in @r.  CPU time and maxrss are not
 * touched here.
 */
static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
{
        /* page faults */
        r->ru_minflt += t->min_flt;
        r->ru_majflt += t->maj_flt;

        /* voluntary / involuntary context switches */
        r->ru_nvcsw += t->nvcsw;
        r->ru_nivcsw += t->nivcsw;

        /* block I/O */
        r->ru_inblock += task_io_get_inblock(t);
        r->ru_oublock += task_io_get_oublock(t);
}

/*
 * Fill @r with resource usage for @p according to @who.
 *
 * @who selects what is reported:
 *   RUSAGE_THREAD   - the per-thread counters of @p; the CPU times are
 *                     read from current via task_cputime_adjusted()
 *   RUSAGE_CHILDREN - the c* fields accumulated in p->signal
 *   RUSAGE_SELF     - p->signal's own counters plus the counters of every
 *                     thread in the group, and the thread group CPU time
 *   RUSAGE_BOTH     - CHILDREN and SELF added together
 * Any other value hits BUG().
 *
 * @r is zeroed first.  ru_maxrss is reported in KB, and the times are
 * converted with ns_to_kernel_old_timeval().
 */
void getrusage(struct task_struct *p, int who, struct rusage *r)
{
        struct task_struct *t;
        unsigned long flags;
        u64 tgutime, tgstime, utime, stime;
        unsigned long maxrss;
        struct mm_struct *mm;
        struct signal_struct *sig = p->signal;
        unsigned int seq = 0;

retry:
        /* Everything is recomputed from scratch on each (re)try. */
        memset(r, 0, sizeof(*r));
        utime = stime = 0;
        maxrss = 0;

        if (who == RUSAGE_THREAD) {
                /* Single-thread case: sig->stats_lock is not taken. */
                task_cputime_adjusted(current, &utime, &stime);
                accumulate_thread_rusage(p, r);
                maxrss = sig->maxrss;
                goto out_thread;
        }

        flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);

        switch (who) {
        case RUSAGE_BOTH:
        case RUSAGE_CHILDREN:
                utime = sig->cutime;
                stime = sig->cstime;
                r->ru_nvcsw = sig->cnvcsw;
                r->ru_nivcsw = sig->cnivcsw;
                r->ru_minflt = sig->cmin_flt;
                r->ru_majflt = sig->cmaj_flt;
                r->ru_inblock = sig->cinblock;
                r->ru_oublock = sig->coublock;
                maxrss = sig->cmaxrss;

                if (who == RUSAGE_CHILDREN)
                        break;
                fallthrough;

        case RUSAGE_SELF:
                r->ru_nvcsw += sig->nvcsw;
                r->ru_nivcsw += sig->nivcsw;
                r->ru_minflt += sig->min_flt;
                r->ru_majflt += sig->maj_flt;
                r->ru_inblock += sig->inblock;
                r->ru_oublock += sig->oublock;
                if (maxrss < sig->maxrss)
                        maxrss = sig->maxrss;

                rcu_read_lock();
                __for_each_thread(sig, t)
                        accumulate_thread_rusage(t, r);
                rcu_read_unlock();

                break;

        default:
                BUG();
        }

        /*
         * NOTE(review): presumably setting seq to 1 makes the next
         * read_seqbegin_or_lock_irqsave() take the lock rather than read
         * locklessly again -- confirm against the seqlock API.
         */
        if (need_seqretry(&sig->stats_lock, seq)) {
                seq = 1;
                goto retry;
        }
        done_seqretry_irqrestore(&sig->stats_lock, seq, flags);

        if (who == RUSAGE_CHILDREN)
                goto out_children;

        thread_group_cputime_adjusted(p, &tgutime, &tgstime);
        utime += tgutime;
        stime += tgstime;

out_thread:
        /* Let the mm of @p, when it can be obtained, adjust maxrss. */
        mm = get_task_mm(p);
        if (mm) {
                setmax_mm_hiwater_rss(&maxrss, mm);
                mmput(mm);
        }

out_children:
        r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
        r->ru_utime = ns_to_kernel_old_timeval(utime);
        r->ru_stime = ns_to_kernel_old_timeval(stime);
}

/*
 * getrusage syscall: collect usage for the caller and copy it to @ru.
 * Only RUSAGE_SELF, RUSAGE_CHILDREN and RUSAGE_THREAD are accepted; any
 * other @who yields -EINVAL.  Returns -EFAULT when the user copy fails,
 * or 0.
 */
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
{
        struct rusage usage;

        switch (who) {
        case RUSAGE_SELF:
        case RUSAGE_CHILDREN:
        case RUSAGE_THREAD:
                break;
        default:
                return -EINVAL;
        }

        getrusage(current, who, &usage);

        if (copy_to_user(ru, &usage, sizeof(usage)))
                return -EFAULT;
        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat getrusage: same validation as the native syscall (only
 * RUSAGE_SELF, RUSAGE_CHILDREN and RUSAGE_THREAD are accepted, anything
 * else yields -EINVAL).  The result is written to userspace through
 * put_compat_rusage(), whose return value is passed on.
 */
COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
{
        struct rusage usage;

        if (!(who == RUSAGE_SELF || who == RUSAGE_CHILDREN ||
              who == RUSAGE_THREAD))
                return -EINVAL;

        getrusage(current, who, &usage);

        return put_compat_rusage(&usage, ru);
}
#endif

/*
 * umask syscall: atomically install @mask, restricted to the S_IRWXUGO
 * bits, as current->fs->umask and return the value it replaced.
 */
SYSCALL_DEFINE1(umask, int, mask)
{
        int previous = xchg(&current->fs->umask, mask & S_IRWXUGO);

        return previous;
}

/*
 * Replace mm->exe_file with the file open on descriptor @fd.
 *
 * Returns -EBADF when @fd does not resolve to a file, -EACCES when the
 * file is not a regular file or lives on a noexec path, the error from
 * file_permission(MAY_EXEC), -EBUSY when a VMA of @mm still maps the path
 * of the current exe file, or 0 after the swap.
 */
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
        struct fd exe;
        struct file *old_exe, *exe_file;
        struct inode *inode;
        int err;

        exe = fdget(fd);
        if (!exe.file)
                return -EBADF;

        inode = file_inode(exe.file);

        /*
         * Because the original mm->exe_file points to executable file, make
         * sure that this one is executable as well, to avoid breaking an
         * overall picture.
         */
        err = -EACCES;
        if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
                goto exit;

        err = file_permission(exe.file, MAY_EXEC);
        if (err)
                goto exit;

        /*
         * Forbid mm->exe_file change if old file still mapped.
         */
        exe_file = get_mm_exe_file(mm);
        err = -EBUSY;
        if (exe_file) {
                struct vm_area_struct *vma;

                /* Walk every VMA looking for one backed by the old exe path. */
                mmap_read_lock(mm);
                for (vma = mm->mmap; vma; vma = vma->vm_next) {
                        if (!vma->vm_file)
                                continue;
                        if (path_equal(&vma->vm_file->f_path,
                                       &exe_file->f_path))
                                goto exit_err;
                }

                mmap_read_unlock(mm);
                fput(exe_file);
        }

        err = 0;
        /* set the new file, lockless */
        get_file(exe.file);
        old_exe = xchg(&mm->exe_file, exe.file);
        if (old_exe)
                fput(old_exe);
exit:
        fdput(exe);
        return err;
exit_err:
        /* Still mapped: undo the lock and the get_mm_exe_file() reference. */
        mmap_read_unlock(mm);
        fput(exe_file);
        goto exit;
}

/*
 * Check arithmetic relations of passed addresses.
 *
 * WARNING: we don't require any capability here so be very careful
 * in what is allowed for modification from userspace.
 *
 * Returns 0 when every listed address lies in [mmap_min_addr, TASK_SIZE),
 * the start/end pairs are ordered and check_data_rlimit() accepts the
 * data/brk layout against RLIMIT_DATA; -EINVAL otherwise.
 */
static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
{
        unsigned long mmap_max_addr = TASK_SIZE;
        int error = -EINVAL, i;

        /*
         * Byte offsets of the address members to range-check.  They are
         * stored as unsigned char, so each offset must be below 256.
         */
        static const unsigned char offsets[] = {
                offsetof(struct prctl_mm_map, start_code),
                offsetof(struct prctl_mm_map, end_code),
                offsetof(struct prctl_mm_map, start_data),
                offsetof(struct prctl_mm_map, end_data),
                offsetof(struct prctl_mm_map, start_brk),
                offsetof(struct prctl_mm_map, brk),
                offsetof(struct prctl_mm_map, start_stack),
                offsetof(struct prctl_mm_map, arg_start),
                offsetof(struct prctl_mm_map, arg_end),
                offsetof(struct prctl_mm_map, env_start),
                offsetof(struct prctl_mm_map, env_end),
        };

        /*
         * Make sure the members are not somewhere outside
         * of allowed address space.
         */
        for (i = 0; i < ARRAY_SIZE(offsets); i++) {
                /* Each member is read as a u64 at its recorded offset. */
                u64 val = *(u64 *)((char *)prctl_map + offsets[i]);

                if ((unsigned long)val >= mmap_max_addr ||
                    (unsigned long)val < mmap_min_addr)
                        goto out;
        }

        /*
         * Make sure the pairs are ordered.
         * Each check yields 0 or -EINVAL; the results are OR-ed together so
         * a single failed relation makes error non-zero.
         */
#define __prctl_check_order(__m1, __op, __m2)                                \
        ((unsigned long)prctl_map->__m1 __op                                \
         (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
        error  = __prctl_check_order(start_code, <, end_code);
        error |= __prctl_check_order(start_data,<=, end_data);
        error |= __prctl_check_order(start_brk, <=, brk);
        error |= __prctl_check_order(arg_start, <=, arg_end);
        error |= __prctl_check_order(env_start, <=, env_end);
        if (error)
                goto out;
#undef __prctl_check_order

        error = -EINVAL;

        /*
         * Neither we should allow to override limits if they set.
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
                              prctl_map->start_brk, prctl_map->end_data,
                              prctl_map->start_data))
                        goto out;

        error = 0;
out:
        return error;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Handle PR_SET_MM_MAP and PR_SET_MM_MAP_SIZE.
 *
 * @opt:       PR_SET_MM_MAP_SIZE stores sizeof(struct prctl_mm_map) at
 *             @addr; any other value installs the map found at @addr
 * @addr:      user pointer (size output, or struct prctl_mm_map input)
 * @data_size: must equal sizeof(struct prctl_mm_map) when installing
 *
 * When installing, the addresses are validated, an optional auxv is
 * copied in, an optional exe_fd replaces mm->exe_file (this needs
 * checkpoint_restore_ns_capable()), and the fields of current->mm are
 * then overwritten.  Returns 0 or a negative errno.
 */
static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
{
        /* exe_fd of (u32)-1 means "do not change the exe file". */
        struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
        unsigned long user_auxv[AT_VECTOR_SIZE];
        struct mm_struct *mm = current->mm;
        int error;

        BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
        BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);

        if (opt == PR_SET_MM_MAP_SIZE)
                return put_user((unsigned int)sizeof(prctl_map),
                                (unsigned int __user *)addr);

        if (data_size != sizeof(prctl_map))
                return -EINVAL;

        if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
                return -EFAULT;

        error = validate_prctl_map_addr(&prctl_map);
        if (error)
                return error;

        if (prctl_map.auxv_size) {
                /*
                 * Someone is trying to cheat the auxv vector.
                 */
                if (!prctl_map.auxv ||
                                prctl_map.auxv_size > sizeof(mm->saved_auxv))
                        return -EINVAL;

                /* Zero first so a short user copy leaves no stale stack data. */
                memset(user_auxv, 0, sizeof(user_auxv));
                if (copy_from_user(user_auxv,
                                   (const void __user *)prctl_map.auxv,
                                   prctl_map.auxv_size))
                        return -EFAULT;

                /* Last entry must be AT_NULL as specification requires */
                user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
                user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
        }

        if (prctl_map.exe_fd != (u32)-1) {
                /*
                 * Check if the current user is checkpoint/restore capable.
                 * At the time of this writing, it checks for CAP_SYS_ADMIN
                 * or CAP_CHECKPOINT_RESTORE.
                 * Note that a user with access to ptrace can masquerade an
                 * arbitrary program as any executable, even setuid ones.
                 * This may have implications in the tomoyo subsystem.
                 */
                if (!checkpoint_restore_ns_capable(current_user_ns()))
                        return -EPERM;

                error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
                if (error)
                        return error;
        }

        /*
         * arg_lock protects concurrent updates but we still need mmap_lock for
         * read to exclude races with sys_brk.
         */
        mmap_read_lock(mm);

        /*
         * We don't validate if these members are pointing to
         * real present VMAs because application may have the corresponding
         * VMAs already unmapped and kernel uses these members for statistics
         * output in procfs mostly, except
         *
         *  - @start_brk/@brk which are used in do_brk_flags but kernel lookups
         *    for VMAs when updating these members so anything wrong written
         *    here cause kernel to swear at userspace program but won't lead
         *    to any problem in kernel itself
         */

        spin_lock(&mm->arg_lock);
        mm->start_code        = prctl_map.start_code;
        mm->end_code        = prctl_map.end_code;
        mm->start_data        = prctl_map.start_data;
        mm->end_data        = prctl_map.end_data;
        mm->start_brk        = prctl_map.start_brk;
        mm->brk                = prctl_map.brk;
        mm->start_stack        = prctl_map.start_stack;
        mm->arg_start        = prctl_map.arg_start;
        mm->arg_end        = prctl_map.arg_end;
        mm->env_start        = prctl_map.env_start;
        mm->env_end        = prctl_map.env_end;
        spin_unlock(&mm->arg_lock);

        /*
         * Note this update of @saved_auxv is lockless thus
         * if someone reads this member in procfs while we're
         * updating -- it may get partly updated results. It's
         * known and acceptable trade off: we leave it as is to
         * not introduce additional locks here making the kernel
         * more complex.
         */
        if (prctl_map.auxv_size)
                memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));

        mmap_read_unlock(mm);
        return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */

/*
 * Overwrite the first @len bytes of mm->saved_auxv with data read from the
 * user address @addr.
 *
 * Returns -EINVAL when @len exceeds the size of the vector, -EFAULT when
 * the user copy fails, and 0 on success.
 */
static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
                          unsigned long len)
{
        /*
         * This doesn't move the auxiliary vector itself since it's pinned to
         * mm_struct, but it permits filling the vector with new values.  It's
         * up to the caller to provide sane values here, otherwise userspace
         * tools which use this vector might be unhappy.
         *
         * The staging buffer is zero-initialised so that, whatever @len the
         * caller passes, no uninitialised kernel stack bytes are ever staged
         * for mm->saved_auxv.  prctl_set_mm_map() clears its buffer before the
         * user copy for the same reason.
         */
        unsigned long user_auxv[AT_VECTOR_SIZE] = { 0 };

        if (len > sizeof(user_auxv))
                return -EINVAL;

        if (copy_from_user(user_auxv, (const void __user *)addr, len))
                return -EFAULT;

        /* Make sure the last entry is always AT_NULL */
        user_auxv[AT_VECTOR_SIZE - 2] = 0;
        user_auxv[AT_VECTOR_SIZE - 1] = 0;

        BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));

        /* Only the @len bytes supplied by the caller are published. */
        task_lock(current);
        memcpy(mm->saved_auxv, user_auxv, len);
        task_unlock(current);

        return 0;
}

/*
 * prctl_set_mm - handle prctl(PR_SET_MM, ...) for the current task's mm
 * @opt:  PR_SET_MM_* sub-option selecting what to change
 * @addr: new value for the selected field (an fd for PR_SET_MM_EXE_FILE,
 *        a userspace pointer for the AUXV/MAP variants)
 * @arg4: size argument; only accepted for the AUXV, MAP and MAP_SIZE options
 * @arg5: must be zero
 *
 * For the single-field options the current layout is snapshotted into a
 * local prctl_mm_map, the one requested field is replaced, the result is
 * validated as a whole, and only then written back to the mm.
 *
 * Returns 0 on success or a negative errno.
 */
static int prctl_set_mm(int opt, unsigned long addr,
                        unsigned long arg4, unsigned long arg5)
{
        struct mm_struct *mm = current->mm;
        struct prctl_mm_map prctl_map = {
                .auxv = NULL,
                .auxv_size = 0,
                .exe_fd = -1,
        };
        struct vm_area_struct *vma;
        int error;

        /* arg5 is never used; arg4 is only meaningful for three options. */
        if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
                              opt != PR_SET_MM_MAP &&
                              opt != PR_SET_MM_MAP_SIZE)))
                return -EINVAL;

#ifdef CONFIG_CHECKPOINT_RESTORE
        /*
         * The map variants are dispatched before the CAP_SYS_RESOURCE check
         * below.  NOTE(review): presumably prctl_set_mm_map() does its own
         * permission checking - confirm against its definition.
         */
        if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
                return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
#endif

        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;

        if (opt == PR_SET_MM_EXE_FILE)
                return prctl_set_mm_exe_file(mm, (unsigned int)addr);

        if (opt == PR_SET_MM_AUXV)
                return prctl_set_auxv(mm, addr, arg4);

        /* Everything below treats addr as a userspace address. */
        if (addr >= TASK_SIZE || addr < mmap_min_addr)
                return -EINVAL;

        /* Default result for the "unknown option" path of the first switch. */
        error = -EINVAL;

        /*
         * arg_lock protects concurrent updates of arg boundaries, we need
         * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr
         * validation.
         */
        mmap_read_lock(mm);
        vma = find_vma(mm, addr);

        /* Snapshot the current layout so it can be validated as a whole. */
        spin_lock(&mm->arg_lock);
        prctl_map.start_code        = mm->start_code;
        prctl_map.end_code        = mm->end_code;
        prctl_map.start_data        = mm->start_data;
        prctl_map.end_data        = mm->end_data;
        prctl_map.start_brk        = mm->start_brk;
        prctl_map.brk                = mm->brk;
        prctl_map.start_stack        = mm->start_stack;
        prctl_map.arg_start        = mm->arg_start;
        prctl_map.arg_end        = mm->arg_end;
        prctl_map.env_start        = mm->env_start;
        prctl_map.env_end        = mm->env_end;

        /* Replace exactly one field of the snapshot with the new address. */
        switch (opt) {
        case PR_SET_MM_START_CODE:
                prctl_map.start_code = addr;
                break;
        case PR_SET_MM_END_CODE:
                prctl_map.end_code = addr;
                break;
        case PR_SET_MM_START_DATA:
                prctl_map.start_data = addr;
                break;
        case PR_SET_MM_END_DATA:
                prctl_map.end_data = addr;
                break;
        case PR_SET_MM_START_STACK:
                prctl_map.start_stack = addr;
                break;
        case PR_SET_MM_START_BRK:
                prctl_map.start_brk = addr;
                break;
        case PR_SET_MM_BRK:
                prctl_map.brk = addr;
                break;
        case PR_SET_MM_ARG_START:
                prctl_map.arg_start = addr;
                break;
        case PR_SET_MM_ARG_END:
                prctl_map.arg_end = addr;
                break;
        case PR_SET_MM_ENV_START:
                prctl_map.env_start = addr;
                break;
        case PR_SET_MM_ENV_END:
                prctl_map.env_end = addr;
                break;
        default:
                /* Unknown option: leave with error == -EINVAL. */
                goto out;
        }

        error = validate_prctl_map_addr(&prctl_map);
        if (error)
                goto out;

        /*
         * This switch deliberately has no default and no break: options not
         * listed here simply skip the VMA check and continue to the commit.
         */
        switch (opt) {
        /*
         * If command line arguments and environment
         * are placed somewhere else on stack, we can
         * set them up here, ARG_START/END to setup
         * command line arguments and ENV_START/END
         * for environment.
         */
        case PR_SET_MM_START_STACK:
        case PR_SET_MM_ARG_START:
        case PR_SET_MM_ARG_END:
        case PR_SET_MM_ENV_START:
        case PR_SET_MM_ENV_END:
                /* These addresses must be covered by the find_vma() result. */
                if (!vma) {
                        error = -EFAULT;
                        goto out;
                }
        }

        /* Commit the validated snapshot back to the mm. */
        mm->start_code        = prctl_map.start_code;
        mm->end_code        = prctl_map.end_code;
        mm->start_data        = prctl_map.start_data;
        mm->end_data        = prctl_map.end_data;
        mm->start_brk        = prctl_map.start_brk;
        mm->brk                = prctl_map.brk;
        mm->start_stack        = prctl_map.start_stack;
        mm->arg_start        = prctl_map.arg_start;
        mm->arg_end        = prctl_map.arg_end;
        mm->env_start        = prctl_map.env_start;
        mm->env_end        = prctl_map.env_end;

        error = 0;
out:
        /* Locks are dropped in the reverse order of acquisition. */
        spin_unlock(&mm->arg_lock);
        mmap_read_unlock(mm);
        return error;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Store @me's clear_child_tid pointer at the userspace location @tid_addr.
 * Returns whatever put_user() reports.
 */
static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
        int __user *child_tid = me->clear_child_tid;

        return put_user(child_tid, tid_addr);
}
#else
/* Without CONFIG_CHECKPOINT_RESTORE the request is always rejected. */
static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
        return -EINVAL;
}
#endif

static int propagate_has_child_subreaper(struct task_struct *p, void *data)
{
        /*
         * If task has has_child_subreaper - all its decendants
         * already have these flag too and new decendants will
         * inherit it on fork, skip them.
         *
         * If we've found child_reaper - skip descendants in
         * it's subtree as they will never get out pidns.
         */
        if (p->signal->has_child_subreaper ||
            is_child_reaper(task_pid(p)))
                return 0;

        p->signal->has_child_subreaper = 1;
        return 1;
}

/*
 * Weak default used by PR_GET_SPECULATION_CTRL: always fails with -EINVAL.
 * Being __weak, it is replaced by any non-weak definition of the same
 * symbol linked into the kernel.
 */
int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
{
        return -EINVAL;
}

/*
 * Weak default used by PR_SET_SPECULATION_CTRL: always fails with -EINVAL.
 * Being __weak, it is replaced by any non-weak definition of the same
 * symbol linked into the kernel.
 */
int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
                                    unsigned long ctrl)
{
        return -EINVAL;
}

/* The pair of task flags that PR_SET_IO_FLUSHER / PR_GET_IO_FLUSHER act on. */
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)

/*
 * prctl() system call: dispatch on @option to the matching per-task
 * operation.  The meaning of arg2..arg5 depends on @option; several
 * options insist that unused arguments are zero and fail with -EINVAL
 * otherwise.
 *
 * Returns the option-specific result (0 or a positive value for the
 * "get" style options that report through the return value) or a
 * negative errno.  Unknown options yield -EINVAL.
 */
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                unsigned long, arg4, unsigned long, arg5)
{
        struct task_struct *me = current;
        unsigned char comm[sizeof(me->comm)];
        long error;

        /*
         * The security hook gets the first look: any result other than
         * -ENOSYS is returned to the caller as-is and the switch below
         * is not reached.
         */
        error = security_task_prctl(option, arg2, arg3, arg4, arg5);
        if (error != -ENOSYS)
                return error;

        error = 0;
        switch (option) {
        case PR_SET_PDEATHSIG:
                if (!valid_signal(arg2)) {
                        error = -EINVAL;
                        break;
                }
                me->pdeath_signal = arg2;
                break;
        case PR_GET_PDEATHSIG:
                error = put_user(me->pdeath_signal, (int __user *)arg2);
                break;
        case PR_GET_DUMPABLE:
                error = get_dumpable(me->mm);
                break;
        case PR_SET_DUMPABLE:
                /* Only these two dumpable values may be set from here. */
                if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
                        error = -EINVAL;
                        break;
                }
                set_dumpable(me->mm, arg2);
                break;

        case PR_SET_UNALIGN:
                error = SET_UNALIGN_CTL(me, arg2);
                break;
        case PR_GET_UNALIGN:
                error = GET_UNALIGN_CTL(me, arg2);
                break;
        case PR_SET_FPEMU:
                error = SET_FPEMU_CTL(me, arg2);
                break;
        case PR_GET_FPEMU:
                error = GET_FPEMU_CTL(me, arg2);
                break;
        case PR_SET_FPEXC:
                error = SET_FPEXC_CTL(me, arg2);
                break;
        case PR_GET_FPEXC:
                error = GET_FPEXC_CTL(me, arg2);
                break;
        case PR_GET_TIMING:
                /* PR_TIMING_STATISTICAL is the only mode ever reported. */
                error = PR_TIMING_STATISTICAL;
                break;
        case PR_SET_TIMING:
                if (arg2 != PR_TIMING_STATISTICAL)
                        error = -EINVAL;
                break;
        case PR_SET_NAME:
                /*
                 * Terminate the buffer up front; at most sizeof(comm) - 1
                 * bytes are copied from userspace, so the final byte is
                 * never overwritten.
                 */
                comm[sizeof(me->comm) - 1] = 0;
                if (strncpy_from_user(comm, (char __user *)arg2,
                                      sizeof(me->comm) - 1) < 0)
                        return -EFAULT;
                set_task_comm(me, comm);
                proc_comm_connector(me);
                break;
        case PR_GET_NAME:
                /* The whole comm buffer is copied out, not just the string. */
                get_task_comm(comm, me);
                if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
                        return -EFAULT;
                break;
        case PR_GET_ENDIAN:
                error = GET_ENDIAN(me, arg2);
                break;
        case PR_SET_ENDIAN:
                error = SET_ENDIAN(me, arg2);
                break;
        case PR_GET_SECCOMP:
                error = prctl_get_seccomp();
                break;
        case PR_SET_SECCOMP:
                error = prctl_set_seccomp(arg2, (char __user *)arg3);
                break;
        case PR_GET_TSC:
                error = GET_TSC_CTL(arg2);
                break;
        case PR_SET_TSC:
                error = SET_TSC_CTL(arg2);
                break;
        case PR_TASK_PERF_EVENTS_DISABLE:
                error = perf_event_task_disable();
                break;
        case PR_TASK_PERF_EVENTS_ENABLE:
                error = perf_event_task_enable();
                break;
        case PR_GET_TIMERSLACK:
                /* Clamp the reported slack to what fits the return value. */
                if (current->timer_slack_ns > ULONG_MAX)
                        error = ULONG_MAX;
                else
                        error = current->timer_slack_ns;
                break;
        case PR_SET_TIMERSLACK:
                /*
                 * arg2 is unsigned, so "<= 0" is simply a test for zero:
                 * zero restores the task's default slack.
                 */
                if (arg2 <= 0)
                        current->timer_slack_ns =
                                        current->default_timer_slack_ns;
                else
                        current->timer_slack_ns = arg2;
                break;
        case PR_MCE_KILL:
                if (arg4 | arg5)
                        return -EINVAL;
                switch (arg2) {
                case PR_MCE_KILL_CLEAR:
                        if (arg3 != 0)
                                return -EINVAL;
                        current->flags &= ~PF_MCE_PROCESS;
                        break;
                case PR_MCE_KILL_SET:
                        /*
                         * PF_MCE_PROCESS is set first and then, depending
                         * on arg3, PF_MCE_EARLY is set or cleared; the
                         * DEFAULT policy clears both flags again.
                         */
                        current->flags |= PF_MCE_PROCESS;
                        if (arg3 == PR_MCE_KILL_EARLY)
                                current->flags |= PF_MCE_EARLY;
                        else if (arg3 == PR_MCE_KILL_LATE)
                                current->flags &= ~PF_MCE_EARLY;
                        else if (arg3 == PR_MCE_KILL_DEFAULT)
                                current->flags &=
                                                ~(PF_MCE_EARLY|PF_MCE_PROCESS);
                        else
                                return -EINVAL;
                        break;
                default:
                        return -EINVAL;
                }
                break;
        case PR_MCE_KILL_GET:
                if (arg2 | arg3 | arg4 | arg5)
                        return -EINVAL;
                /* Inverse of PR_MCE_KILL_SET: decode the two task flags. */
                if (current->flags & PF_MCE_PROCESS)
                        error = (current->flags & PF_MCE_EARLY) ?
                                PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
                else
                        error = PR_MCE_KILL_DEFAULT;
                break;
        case PR_SET_MM:
                error = prctl_set_mm(arg2, arg3, arg4, arg5);
                break;
        case PR_GET_TID_ADDRESS:
                error = prctl_get_tid_address(me, (int __user * __user *)arg2);
                break;
        case PR_SET_CHILD_SUBREAPER:
                me->signal->is_child_subreaper = !!arg2;
                if (!arg2)
                        break;

                /* When enabling, flag the existing process tree below us. */
                walk_process_tree(me, propagate_has_child_subreaper, NULL);
                break;
        case PR_GET_CHILD_SUBREAPER:
                error = put_user(me->signal->is_child_subreaper,
                                 (int __user *)arg2);
                break;
        case PR_SET_NO_NEW_PRIVS:
                /* The only accepted request is "set to 1"; no way to clear. */
                if (arg2 != 1 || arg3 || arg4 || arg5)
                        return -EINVAL;

                task_set_no_new_privs(current);
                break;
        case PR_GET_NO_NEW_PRIVS:
                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;
                return task_no_new_privs(current) ? 1 : 0;
        case PR_GET_THP_DISABLE:
                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;
                error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
                break;
        case PR_SET_THP_DISABLE:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                /*
                 * The flag is flipped with the mmap lock held for writing;
                 * a failed (killable) lock acquisition is reported as -EINTR.
                 */
                if (mmap_write_lock_killable(me->mm))
                        return -EINTR;
                if (arg2)
                        set_bit(MMF_DISABLE_THP, &me->mm->flags);
                else
                        clear_bit(MMF_DISABLE_THP, &me->mm->flags);
                mmap_write_unlock(me->mm);
                break;
        case PR_MPX_ENABLE_MANAGEMENT:
        case PR_MPX_DISABLE_MANAGEMENT:
                /* No longer implemented: */
                return -EINVAL;
        case PR_SET_FP_MODE:
                error = SET_FP_MODE(me, arg2);
                break;
        case PR_GET_FP_MODE:
                error = GET_FP_MODE(me);
                break;
        case PR_SVE_SET_VL:
                error = SVE_SET_VL(arg2);
                break;
        case PR_SVE_GET_VL:
                error = SVE_GET_VL();
                break;
        case PR_GET_SPECULATION_CTRL:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                error = arch_prctl_spec_ctrl_get(me, arg2);
                break;
        case PR_SET_SPECULATION_CTRL:
                if (arg4 || arg5)
                        return -EINVAL;
                error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
                break;
        case PR_PAC_RESET_KEYS:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                error = PAC_RESET_KEYS(me, arg2);
                break;
        case PR_SET_TAGGED_ADDR_CTRL:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                error = SET_TAGGED_ADDR_CTRL(arg2);
                break;
        case PR_GET_TAGGED_ADDR_CTRL:
                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;
                error = GET_TAGGED_ADDR_CTRL();
                break;
        case PR_SET_IO_FLUSHER:
                /* The capability check comes before argument validation. */
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;

                if (arg3 || arg4 || arg5)
                        return -EINVAL;

                if (arg2 == 1)
                        current->flags |= PR_IO_FLUSHER;
                else if (!arg2)
                        current->flags &= ~PR_IO_FLUSHER;
                else
                        return -EINVAL;
                break;
        case PR_GET_IO_FLUSHER:
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;

                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;

                /* Reports 1 only when both PR_IO_FLUSHER flags are set. */
                error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
                break;
        default:
                error = -EINVAL;
                break;
        }
        return error;
}

/*
 * getcpu() system call: report the CPU the caller is running on and that
 * CPU's node.  Either output pointer may be NULL, in which case the
 * corresponding value is not written.  @unused is ignored.
 *
 * Returns 0, or -EFAULT if any of the requested stores failed.
 */
SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
                struct getcpu_cache __user *, unused)
{
        int failed = 0;
        int this_cpu = raw_smp_processor_id();

        if (cpup)
                failed |= put_user(this_cpu, cpup);

        if (nodep)
                failed |= put_user(cpu_to_node(this_cpu), nodep);

        /* Both stores are attempted before the combined result is checked. */
        if (failed)
                return -EFAULT;

        return 0;
}

/**
 * do_sysinfo - fill in sysinfo struct
 * @info: pointer to buffer to fill
 */
static int do_sysinfo(struct sysinfo *info)
{
        unsigned long mem_total, sav_total;
        unsigned int mem_unit, bitcount;
        struct timespec64 tp;

        memset(info, 0, sizeof(struct sysinfo));

        ktime_get_boottime_ts64(&tp);
        timens_add_boottime(&tp);
        info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);

        get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);

        info->procs = nr_threads;

        si_meminfo(info);
        si_swapinfo(info);

        /*
         * If the sum of all the available memory (i.e. ram + swap)
         * is less than can be stored in a 32 bit unsigned long then
         * we can be binary compatible with 2.2.x kernels.  If not,
         * well, in that case 2.2.x was broken anyways...
         *
         *  -Erik Andersen <andersee@debian.org>
         */

        mem_total = info->totalram + info->totalswap;
        if (mem_total < info->totalram || mem_total < info->totalswap)
                goto out;
        bitcount = 0;
        mem_unit = info->mem_unit;
        while (mem_unit > 1) {
                bitcount++;
                mem_unit >>= 1;
                sav_total = mem_total;
                mem_total <<= 1;
                if (mem_total < sav_total)
                        goto out;
        }

        /*
         * If mem_total did not overflow, multiply all memory values by
         * info->mem_unit and set it to 1.  This leaves things compatible
         * with 2.2.x, and also retains compatibility with earlier 2.4.x
         * kernels...
         */

        info->mem_unit = 1;
        info->totalram <<= bitcount;
        info->freeram <<= bitcount;
        info->sharedram <<= bitcount;
        info->bufferram <<= bitcount;
        info->totalswap <<= bitcount;
        info->freeswap <<= bitcount;
        info->totalhigh <<= bitcount;
        info->freehigh <<= bitcount;

out:
        return 0;
}

/*
 * sysinfo() system call: build a struct sysinfo on the kernel stack via
 * do_sysinfo() and copy it out to @info.
 *
 * Returns 0, or -EFAULT if the copy to userspace fails.
 */
SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
{
        struct sysinfo snapshot;

        do_sysinfo(&snapshot);

        return copy_to_user(info, &snapshot, sizeof(struct sysinfo)) ?
                -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Layout of the sysinfo result handed to compat callers by the compat
 * sysinfo syscall below.  Each named field is filled from the struct
 * sysinfo field of the same name, narrowed to the fixed-width type
 * declared here.
 */
struct compat_sysinfo {
        s32 uptime;
        u32 loads[3];
        u32 totalram;
        u32 freeram;
        u32 sharedram;
        u32 bufferram;
        u32 totalswap;
        u32 freeswap;
        u16 procs;
        /* Never assigned by the compat syscall; stays zero from its memset. */
        u16 pad;
        u32 totalhigh;
        u32 freehigh;
        u32 mem_unit;
        /*
         * Trailing filler, also left zeroed.
         * NOTE(review): presumably sized to mirror the padding of the
         * 32-bit native struct sysinfo - confirm against the uapi header.
         */
        char _f[20-2*sizeof(u32)-sizeof(int)];
};

/*
 * Compat sysinfo() system call: obtain the native struct sysinfo, scale
 * the memory figures down when they do not fit in 32 bits, and copy the
 * result out in struct compat_sysinfo layout.
 *
 * Returns 0, or -EFAULT if the copy to userspace fails.
 */
COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
{
        struct sysinfo native;
        struct compat_sysinfo compat;

        do_sysinfo(&native);

        /* Check to see if any memory value is too large for 32-bit and scale
         *  down if needed
         */
        if (upper_32_bits(native.totalram) || upper_32_bits(native.totalswap)) {
                int shift;

                /* Grow mem_unit up to PAGE_SIZE, counting the doublings. */
                for (shift = 0; native.mem_unit < PAGE_SIZE; shift++)
                        native.mem_unit <<= 1;

                native.totalram >>= shift;
                native.freeram >>= shift;
                native.sharedram >>= shift;
                native.bufferram >>= shift;
                native.totalswap >>= shift;
                native.freeswap >>= shift;
                native.totalhigh >>= shift;
                native.freehigh >>= shift;
        }

        /* Clear everything first so unassigned members go out as zero. */
        memset(&compat, 0, sizeof(compat));

        compat.uptime = native.uptime;
        compat.loads[0] = native.loads[0];
        compat.loads[1] = native.loads[1];
        compat.loads[2] = native.loads[2];
        compat.totalram = native.totalram;
        compat.freeram = native.freeram;
        compat.sharedram = native.sharedram;
        compat.bufferram = native.bufferram;
        compat.totalswap = native.totalswap;
        compat.freeswap = native.freeswap;
        compat.procs = native.procs;
        compat.totalhigh = native.totalhigh;
        compat.freehigh = native.freehigh;
        compat.mem_unit = native.mem_unit;

        return copy_to_user(info, &compat, sizeof(compat)) ? -EFAULT : 0;
}
#endif /* CONFIG_COMPAT */















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_H
#define _LINUX_SCHED_H

/*
 * Define 'struct task_struct' and provide the main scheduler
 * APIs (schedule(), wakeup variants, etc.)
 */

#include <uapi/linux/sched.h>

#include <asm/current.h>

#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>
#include <linux/irqflags.h>
#include <linux/seccomp.h>
#include <linux/nodemask.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers.h>
#include <linux/rseq.h>
#include <linux/seqlock.h>
#include <linux/kcsan.h>

/*
 * task_struct member predeclarations (sorted alphabetically).
 *
 * Only the struct tags are introduced here, so that pointers to these
 * types can be used without pulling in the headers that define them.
 * Keep the list sorted when adding new entries.
 */
struct audit_context;
struct backing_dev_info;
struct bio_list;
struct blk_plug;
struct capture_control;
struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
struct sched_param;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state is
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */

/* Bits stored in tsk->state: */
#define TASK_RUNNING                    0x00000000
#define TASK_INTERRUPTIBLE              0x00000001
#define TASK_UNINTERRUPTIBLE            0x00000002
#define __TASK_STOPPED                  0x00000004
#define __TASK_TRACED                   0x00000008

/* Bits stored in tsk->exit_state: */
#define EXIT_DEAD                       0x00000010
#define EXIT_ZOMBIE                     0x00000020
#define EXIT_TRACE                      (EXIT_ZOMBIE | EXIT_DEAD)

/* More bits stored in tsk->state: */
#define TASK_PARKED                     0x00000040
#define TASK_DEAD                       0x00000080
#define TASK_WAKEKILL                   0x00000100
#define TASK_WAKING                     0x00000200
#define TASK_NOLOAD                     0x00000400
#define TASK_NEW                        0x00000800
/* First value above every state bit defined here: */
#define TASK_STATE_MAX                  0x00001000

/* Combined states, convenient as set_current_state() arguments: */
#define TASK_KILLABLE                   (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED                    (TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED                     (TASK_WAKEKILL | __TASK_TRACED)

#define TASK_IDLE                       (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

/* Combined mask, convenient as a wake_up() argument: */
#define TASK_NORMAL                     (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* Mask of the bits used by get_task_state(): */
#define TASK_REPORT                     (TASK_RUNNING | TASK_INTERRUPTIBLE | \
                                         TASK_UNINTERRUPTIBLE | \
                                         __TASK_STOPPED | __TASK_TRACED | \
                                         EXIT_DEAD | EXIT_ZOMBIE | \
                                         TASK_PARKED)

/*
 * Predicates on a task's ->state word.
 *
 * @task: any expression yielding a pointer to an object with a ->state
 *        member. It is parenthesised before being dereferenced, so
 *        arguments such as "&t" or "p + 1" expand correctly, and it is
 *        evaluated exactly once.
 *
 * Each macro evaluates to 1 when (any of) the tested bit(s) is set in
 * ->state and to 0 otherwise.
 */
#define task_is_traced(task)                (((task)->state & __TASK_TRACED) != 0)

#define task_is_stopped(task)                (((task)->state & __TASK_STOPPED) != 0)

#define task_is_stopped_or_traced(task)        (((task)->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP

/*
 * A state is "special" when it carries any of the bits below; such states
 * do not follow the normal wait-loop pattern. See the comment with
 * set_special_state().
 *
 * Evaluates to the subset of special bits present in @state (non-zero
 * means special), not to a normalised 0/1.
 */
#define is_special_task_state(state)                                \
        ((state) & (TASK_DEAD | TASK_PARKED | __TASK_TRACED | __TASK_STOPPED))

/*
 * CONFIG_DEBUG_ATOMIC_SLEEP variant: plain assignment of @state_value to
 * current->state.
 *
 * Before the store it trips WARN_ON_ONCE() if @state_value contains a
 * special-state bit, and records _THIS_IP_ in current->task_state_change.
 * Note that @state_value is expanded twice (once in the check, once in the
 * store), so it should be free of side effects.
 */
#define __set_current_state(state_value)                        \
        do {                                                        \
                WARN_ON_ONCE(is_special_task_state(state_value));\
                current->task_state_change = _THIS_IP_;                \
                current->state = (state_value);                        \
        } while (0)

/*
 * CONFIG_DEBUG_ATOMIC_SLEEP variant: same checks and bookkeeping as the
 * debug __set_current_state(), but the store to current->state goes
 * through smp_store_mb() instead of a plain assignment.
 * @state_value is expanded twice here as well.
 */
#define set_current_state(state_value)                                \
        do {                                                        \
                WARN_ON_ONCE(is_special_task_state(state_value));\
                current->task_state_change = _THIS_IP_;                \
                smp_store_mb(current->state, (state_value));        \
        } while (0)

/*
 * CONFIG_DEBUG_ATOMIC_SLEEP variant: store @state_value to current->state
 * with current->pi_lock held (raw_spin_lock_irqsave/irqrestore).
 *
 * The check is the inverse of the one in set_current_state(): it trips
 * WARN_ON_ONCE() when @state_value is NOT a special state. _THIS_IP_ is
 * recorded in current->task_state_change while the lock is held.
 * The local 'flags' lives in the macro's own block and may shadow a
 * variable of the same name in the caller.
 */
#define set_special_state(state_value)                                        \
        do {                                                                \
                unsigned long flags; /* may shadow */                        \
                WARN_ON_ONCE(!is_special_task_state(state_value));        \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
                current->task_state_change = _THIS_IP_;                        \
                current->state = (state_value);                                \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);        \
        } while (0)
#else
/*
 * set_current_state() includes a barrier so that the write of current->state
 * is correctly serialised wrt the caller's subsequent test of whether to
 * actually sleep:
 *
 *   for (;;) {
 *        set_current_state(TASK_UNINTERRUPTIBLE);
 *        if (CONDITION)
 *           break;
 *
 *        schedule();
 *   }
 *   __set_current_state(TASK_RUNNING);
 *
 * If the caller does not need such serialisation (because, for instance, the
 * CONDITION test and condition change and wakeup are under the same lock) then
 * use __set_current_state().
 *
 * The above is typically ordered against the wakeup, which does:
 *
 *   CONDITION = 1;
 *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
 *
 * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
 * accessing p->state.
 *
 * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
 * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
 * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
 *
 * However, with slightly different timing the wakeup TASK_RUNNING store can
 * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
 * a problem either because that will result in one extra go around the loop
 * and our @cond test will save the day.
 *
 * Also see the comments of try_to_wake_up().
 */
/*
 * Non-debug variant: plain assignment to current->state, for callers that
 * do not need the serialisation provided by set_current_state().
 * Expands to a bare assignment expression (no do/while wrapper).
 */
#define __set_current_state(state_value)                                \
        current->state = (state_value)

/*
 * Non-debug variant: store @state_value to current->state through
 * smp_store_mb(), so the store is ordered against the caller's subsequent
 * test of its sleep condition.
 */
#define set_current_state(state_value)                                        \
        smp_store_mb(current->state, (state_value))

/*
 * set_special_state() should be used for those states when the blocking task
 * can not use the regular condition based wait-loop. In that case we must
 * serialize against wakeups such that any possible in-flight TASK_RUNNING stores
 * will not collide with our state change.
 *
 * The store to current->state is done with current->pi_lock held, taken
 * with raw_spin_lock_irqsave() and released with
 * raw_spin_unlock_irqrestore(). The local 'flags' lives in the macro's own
 * block and may shadow a variable of the same name in the caller.
 */
#define set_special_state(state_value)                                        \
        do {                                                                \
                unsigned long flags; /* may shadow */                        \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
                current->state = (state_value);                                \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);        \
        } while (0)

#endif

/*
 * Task command name length.
 * NOTE(review): presumably the size of the buffer that holds a task's
 * command name, terminator included; the member that uses it is outside
 * this part of the file - confirm.
 */
#define TASK_COMM_LEN                        16

/*
 * Scheduler entry points; all of them are implemented outside this header.
 * Function prototypes have external linkage by default, so they are
 * declared without a storage-class specifier.
 */
void scheduler_tick(void);

/* Largest value a 'long timeout' argument below can carry. */
#define MAX_SCHEDULE_TIMEOUT                LONG_MAX

/*
 * NOTE(review): the _interruptible/_killable/_uninterruptible/_idle
 * suffixes presumably name the task state used while sleeping - confirm.
 */
long schedule_timeout(long timeout);
long schedule_timeout_interruptible(long timeout);
long schedule_timeout_killable(long timeout);
long schedule_timeout_uninterruptible(long timeout);
long schedule_timeout_idle(long timeout);

asmlinkage void schedule(void);
void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);

/*
 * The return value of io_schedule_prepare() must not be ignored
 * (__must_check); presumably it is the token later handed to
 * io_schedule_finish() - confirm.
 */
int __must_check io_schedule_prepare(void);
void io_schedule_finish(int token);
long io_schedule_timeout(long timeout);
void io_schedule(void);

/**
 * struct prev_cputime - snapshot of system and user cputime
 * @utime: time spent in user mode
 * @stime: time spent in system mode
 * @lock: protects the above two fields
 *
 * Stores previous user/system time values such that we can guarantee
 * monotonicity.
 *
 * All three members are compiled out when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 * is set, which leaves this an empty struct.
 */
struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        u64                                utime;
        u64                                stime;
        raw_spinlock_t                        lock;
#endif
};

/* Values of the 'state' member of struct vtime. */
enum vtime_state {
        VTIME_INACTIVE  = 0,    /* Sleeping, or running on a CPU with VTIME inactive */
        VTIME_IDLE      = 1,    /* Idle */
        VTIME_SYS       = 2,    /* In kernelspace on a CPU with VTIME active */
        VTIME_USER      = 3,    /* In userspace on a CPU with VTIME active */
        VTIME_GUEST     = 4,    /* Running as a guest on a CPU with VTIME active */
};

/*
 * Virtual CPU time accounting state.
 *
 * NOTE(review): apart from 'state', the field notes below are inferred from
 * names and types only; the code that reads and updates these fields is
 * outside this header - confirm before relying on them.
 */
struct vtime {
        seqcount_t                seqcount;        /* presumably lets readers detect concurrent updates */
        unsigned long long        starttime;
        enum vtime_state        state;                /* one of enum vtime_state */
        unsigned int                cpu;
        u64                        utime;                /* presumably user time */
        u64                        stime;                /* presumably system time */
        u64                        gtime;                /* presumably guest time */
};

/**
 * enum uclamp_id - utilization clamp constraints
 * @UCLAMP_MIN: minimum utilization
 * @UCLAMP_MAX: maximum utilization
 * @UCLAMP_CNT: number of utilization clamp constraints; used as the
 *              dimension of per-constraint arrays
 */
enum uclamp_id {
        UCLAMP_MIN,
        UCLAMP_MAX,
        UCLAMP_CNT,
};

#ifdef CONFIG_SMP
/*
 * Declarations only: both objects are defined elsewhere and are visible
 * through this header only when CONFIG_SMP is set.
 */
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
#endif

/*
 * Run count, run-queue wait time and related timestamps, embedded in
 * task_struct as 'sched_info'.
 *
 * All members are compiled out unless CONFIG_SCHED_INFO is set, which
 * leaves this an empty struct.
 */
struct sched_info {
#ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */

        /* # of times we have run on this CPU: */
        unsigned long                        pcount;

        /* Time spent waiting on a runqueue: */
        unsigned long long                run_delay;

        /* Timestamps: */

        /* When did we last run on a CPU? */
        unsigned long long                last_arrival;

        /* When were we last queued to run? */
        unsigned long long                last_queued;

#endif /* CONFIG_SCHED_INFO */
};

/*
 * Several integer scheduler metrics (sched/fair has load, load_avg,
 * util_avg, freq and capacity) are handled with fixed point arithmetic.
 *
 * Define one basic fixed point range here and express all of those
 * metrics in terms of it.
 */
#define SCHED_FIXEDPOINT_SHIFT          10
#define SCHED_FIXEDPOINT_SCALE          (1L << SCHED_FIXEDPOINT_SHIFT)

/* cpu_capacity calculations use the same increased resolution: */
#define SCHED_CAPACITY_SHIFT            SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE            (1L << SCHED_CAPACITY_SHIFT)

/*
 * A scheduling weight; embedded in struct sched_entity as 'load'.
 *
 * NOTE(review): inv_weight looks like a cached inverse of 'weight' kept to
 * avoid divisions - confirm against the users in kernel/sched/.
 */
struct load_weight {
        unsigned long                        weight;
        u32                                inv_weight;
};

/**
 * struct util_est - Estimated utilization of FAIR tasks
 * @enqueued: instantaneous estimated utilization of a task/cpu
 * @ewma:     the Exponential Weighted Moving Average (EWMA)
 *            utilization of a task
 *
 * Support data structure to track an Exponential Weighted Moving Average
 * (EWMA) of a FAIR task's utilization. New samples are added to the moving
 * average each time a task completes an activation. Sample's weight is chosen
 * so that the EWMA will be relatively insensitive to transient changes to the
 * task's workload.
 *
 * The enqueued attribute has a slightly different meaning for tasks and cpus:
 * - task:   the task's util_avg at last task dequeue time
 * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
 * Thus, the util_est.enqueued of a task represents the contribution on the
 * estimated utilization of the CPU where that task is currently enqueued.
 *
 * Only for tasks we track a moving average of the past instantaneous
 * estimated utilization. This allows sporadic drops in the utilization of
 * an otherwise almost periodic task to be absorbed.
 *
 * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 * updates. When a task is dequeued, its util_est should not be updated if its
 * util_avg has not been updated in the meantime.
 * This information is mapped into the MSB of util_est.enqueued at dequeue
 * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
 * for a task) it is safe to use MSB.
 *
 * The structure is aligned to sizeof(u64).
 */
struct util_est {
        unsigned int                        enqueued;
        unsigned int                        ewma;
/* NOTE(review): presumably the shift that sets a new sample's EWMA weight - confirm. */
#define UTIL_EST_WEIGHT_SHIFT                2
/* Flag carried in the MSB of util_est.enqueued, as described above. */
#define UTIL_AVG_UNCHANGED                0x80000000
} __attribute__((__aligned__(sizeof(u64))));

/*
 * The load/runnable/util_avg accumulates an infinite geometric series
 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 *
 * [load_avg definition]
 *
 *   load_avg = runnable% * scale_load_down(load)
 *
 * [runnable_avg definition]
 *
 *   runnable_avg = runnable% * SCHED_CAPACITY_SCALE
 *
 * [util_avg definition]
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE
 *
 * where runnable% is the time ratio that a sched_entity is runnable and
 * running% the time ratio that a sched_entity is running.
 *
 * For cfs_rq, they are the aggregated values of all runnable and blocked
 * sched_entities.
 *
 * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
 * capacity scaling. The scaling is done through the rq_clock_pelt that is used
 * for computing those signals (see update_rq_clock_pelt())
 *
 * N.B., the above ratios (runnable% and running%) themselves are in the
 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
 * to as large a range as necessary. This is for example reflected by
 * util_avg's SCHED_CAPACITY_SCALE.
 *
 * [Overflow issue]
 *
 * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
 * with the highest load (=88761), always runnable on a single cfs_rq,
 * and should not overflow as the number already hits PID_MAX_LIMIT.
 *
 * For all other cases (including 32-bit kernels), struct load_weight's
 * weight will overflow first before we do, because:
 *
 *    Max(load_avg) <= Max(load.weight)
 *
 * Then it is the load_weight's responsibility to consider overflow
 * issues.
 */
/*
 * Load tracking signals of a sched_entity (embedded there as 'avg') or of a
 * cfs_rq. load_avg, runnable_avg and util_avg are defined in the comment
 * above; the structure also embeds the util_est estimate.
 *
 * NOTE(review): the *_sum members are presumably the running sums from
 * which the matching *_avg values are derived - confirm in
 * kernel/sched/pelt.c.
 *
 * Marked ____cacheline_aligned.
 */
struct sched_avg {
        u64                                last_update_time;
        u64                                load_sum;
        u64                                runnable_sum;
        u32                                util_sum;
        u32                                period_contrib;
        unsigned long                        load_avg;
        unsigned long                        runnable_avg;
        unsigned long                        util_avg;
        struct util_est                        util_est;
} ____cacheline_aligned;

/*
 * Scheduler statistics counters, embedded in struct sched_entity as
 * 'statistics'.
 *
 * All members are compiled out unless CONFIG_SCHEDSTATS is set, which
 * leaves this an empty struct. The members fall into groups by name:
 * wait/iowait, sleep, block/exec/slice, migrations and wakeups.
 */
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
        u64                                wait_start;
        u64                                wait_max;
        u64                                wait_count;
        u64                                wait_sum;
        u64                                iowait_count;
        u64                                iowait_sum;

        u64                                sleep_start;
        u64                                sleep_max;
        s64                                sum_sleep_runtime;

        u64                                block_start;
        u64                                block_max;
        u64                                exec_max;
        u64                                slice_max;

        u64                                nr_migrations_cold;
        u64                                nr_failed_migrations_affine;
        u64                                nr_failed_migrations_running;
        u64                                nr_failed_migrations_hot;
        u64                                nr_forced_migrations;

        u64                                nr_wakeups;
        u64                                nr_wakeups_sync;
        u64                                nr_wakeups_migrate;
        u64                                nr_wakeups_local;
        u64                                nr_wakeups_remote;
        u64                                nr_wakeups_affine;
        u64                                nr_wakeups_affine_attempts;
        u64                                nr_wakeups_passive;
        u64                                nr_wakeups_idle;
#endif
};

/*
 * Scheduling entity, embedded in task_struct as 'se'.
 *
 * With CONFIG_FAIR_GROUP_SCHED an entity additionally links to a parent
 * entity, to the cfs_rq it is (to be) queued on and to the cfs_rq it owns.
 * With CONFIG_SMP it carries its per-entity load average in 'avg'.
 */
struct sched_entity {
        /* For load-balancing: */
        struct load_weight                load;
        struct rb_node                        run_node;
        struct list_head                group_node;
        unsigned int                        on_rq;

        u64                                exec_start;
        u64                                sum_exec_runtime;
        u64                                vruntime;
        u64                                prev_sum_exec_runtime;

        u64                                nr_migrations;

        /* Empty unless CONFIG_SCHEDSTATS is set: */
        struct sched_statistics                statistics;

#ifdef CONFIG_FAIR_GROUP_SCHED
        int                                depth;
        struct sched_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct cfs_rq                        *cfs_rq;
        /* rq "owned" by this entity/group: */
        struct cfs_rq                        *my_q;
        /* cached value of my_q->h_nr_running */
        unsigned long                        runnable_weight;
#endif

#ifdef CONFIG_SMP
        /*
         * Per entity load average tracking.
         *
         * Put into separate cache line so it does not
         * collide with read-mostly values above.
         */
        struct sched_avg                avg;
#endif
};

/*
 * Scheduling entity embedded in task_struct as 'rt'.
 *
 * With CONFIG_RT_GROUP_SCHED an entity additionally links to a parent
 * entity, to the rt_rq it is (to be) queued on and to the rt_rq it owns.
 * The structure is marked __randomize_layout, so code must not depend on
 * the member order written here.
 */
struct sched_rt_entity {
        struct list_head                run_list;
        unsigned long                        timeout;
        unsigned long                        watchdog_stamp;
        unsigned int                        time_slice;
        unsigned short                        on_rq;
        unsigned short                        on_list;

        struct sched_rt_entity                *back;
#ifdef CONFIG_RT_GROUP_SCHED
        struct sched_rt_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct rt_rq                        *rt_rq;
        /* rq "owned" by this entity/group: */
        struct rt_rq                        *my_q;
#endif
} __randomize_layout;

/*
 * -deadline scheduling entity, embedded in task_struct as 'dl'.
 *
 * Holds the original parameters copied from sched_attr, the actual
 * parameters that are updated while the task runs, a set of one-bit state
 * flags and two hrtimers (bandwidth enforcement and inactive timer).
 * With CONFIG_RT_MUTEXES it also carries the priority inheritance pointer.
 */
struct sched_dl_entity {
        struct rb_node                        rb_node;

        /*
         * Original scheduling parameters. Copied here from sched_attr
         * during sched_setattr(), they will remain the same until
         * the next sched_setattr().
         */
        u64                                dl_runtime;        /* Maximum runtime for each instance        */
        u64                                dl_deadline;        /* Relative deadline of each instance        */
        u64                                dl_period;        /* Separation of two instances (period) */
        u64                                dl_bw;                /* dl_runtime / dl_period                */
        u64                                dl_density;        /* dl_runtime / dl_deadline                */

        /*
         * Actual scheduling parameters. Initialized with the values above,
         * they are continuously updated during task execution. Note that
         * the remaining runtime could be < 0 in case we are in overrun.
         */
        s64                                runtime;        /* Remaining runtime for this instance        */
        u64                                deadline;        /* Absolute deadline for this instance        */
        unsigned int                        flags;                /* Specifying the scheduler behaviour        */

        /*
         * Some bool flags:
         *
         * @dl_throttled tells if we exhausted the runtime. If so, the
         * task has to wait for a replenishment to be performed at the
         * next firing of dl_timer.
         *
         * @dl_yielded tells if task gave up the CPU before consuming
         * all its available runtime during the last job.
         *
         * @dl_non_contending tells if the task is inactive while still
         * contributing to the active utilization. In other words, it
         * indicates if the inactive timer has been armed and its handler
         * has not been executed yet. This flag is useful to avoid race
         * conditions between the inactive timer handler and the wakeup
         * code.
         *
         * @dl_overrun tells if the task asked to be informed about runtime
         * overruns.
         */
        unsigned int                        dl_throttled      : 1;
        unsigned int                        dl_yielded        : 1;
        unsigned int                        dl_non_contending : 1;
        unsigned int                        dl_overrun          : 1;

        /*
         * Bandwidth enforcement timer. Each -deadline task has its
         * own bandwidth to be enforced, thus we need one timer per task.
         */
        struct hrtimer                        dl_timer;

        /*
         * Inactive timer, responsible for decreasing the active utilization
         * at the "0-lag time". When a -deadline task blocks, it contributes
         * to GRUB's active utilization until the "0-lag time", hence a
         * timer is needed to decrease the active utilization at the correct
         * time.
         */
        struct hrtimer inactive_timer;

#ifdef CONFIG_RT_MUTEXES
        /*
         * Priority Inheritance. When a DEADLINE scheduling entity is boosted
         * pi_se points to the donor, otherwise points to the dl_se it belongs
         * to (the original one/itself).
         */
        struct sched_dl_entity *pi_se;
#endif
};

#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT

/*
 * Utilization clamp for a scheduling entity
 * @value:                clamp value "assigned" to a se
 * @bucket_id:                bucket index corresponding to the "assigned" value
 * @active:                the se is currently refcounted in a rq's bucket
 * @user_defined:        the requested clamp value comes from user-space
 *
 * The bucket_id is the index of the clamp bucket matching the clamp value
 * which is pre-computed and stored to avoid expensive integer divisions in
 * the fast path.
 *
 * The active bit is set whenever a task has got an "effective" value assigned,
 * which can be different from the clamp value "requested" from user-space.
 * This makes it possible to know that a task is refcounted in the rq's bucket
 * corresponding to the "effective" bucket_id.
 *
 * The user_defined bit is set whenever a task has got a task-specific clamp
 * value requested from userspace, i.e. the system defaults apply to this task
 * just as a restriction. This allows default clamps to be relaxed when a less
 * restrictive task-specific value has been requested, which makes it possible
 * to implement a "nice" semantic. For example, a task running with a 20%
 * default boost can still drop its own boosting to 0%.
 *
 * The bit-field widths of @value and @bucket_id are derived with bits_per()
 * from SCHED_CAPACITY_SCALE and UCLAMP_BUCKETS respectively.
 */
struct uclamp_se {
        unsigned int value                : bits_per(SCHED_CAPACITY_SCALE);
        unsigned int bucket_id                : bits_per(UCLAMP_BUCKETS);
        unsigned int active                : 1;
        unsigned int user_defined        : 1;
};
#endif /* CONFIG_UCLAMP_TASK */

/*
 * Four one-byte flags that can be accessed individually through 'b' or
 * all at once, as a single 32-bit word, through 's'.
 */
union rcu_special {
        struct {
                u8                        blocked;
                u8                        need_qs;
                u8                        exp_hint; /* Hint for performance. */
                u8                        need_mb; /* Readers need smp_mb(). */
        } b; /* Bits. */
        u32 s; /* Set of bits. */
};

/*
 * Per-task perf context identifiers. The valid identifiers start at 0;
 * perf_nr_task_contexts follows the last one and so equals their count.
 */
enum perf_event_task_context {
        perf_invalid_context        = -1,
        perf_hw_context                =  0,
        perf_sw_context                =  1,
        perf_nr_task_contexts,
};

/*
 * Link for a singly linked list: the only member points to the next node
 * of the same type.
 */
struct wake_q_node {
        struct wake_q_node *next;
};

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /*
         * For reasons of header soup (see current_thread_info()), this
         * must be the first element of task_struct.
         */
        struct thread_info                thread_info;
#endif
        /* -1 unrunnable, 0 runnable, >0 stopped: */
        volatile long                        state;

        /*
         * This begins the randomizable portion of task_struct. Only
         * scheduling-critical items should be added above here.
         */
        randomized_struct_fields_start

        void                                *stack;
        refcount_t                        usage;
        /* Per task flags (PF_*), defined further below: */
        unsigned int                        flags;
        unsigned int                        ptrace;

#ifdef CONFIG_SMP
        int                                on_cpu;
        struct __call_single_node        wake_entry;
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /* Current CPU: */
        unsigned int                        cpu;
#endif
        unsigned int                        wakee_flips;
        unsigned long                        wakee_flip_decay_ts;
        struct task_struct                *last_wakee;

        /*
         * recent_used_cpu is initially set as the last CPU used by a task
         * that wakes affine another task. Waker/wakee relationships can
         * push tasks around a CPU where each wakeup moves to the next one.
         * Tracking a recently used CPU allows a quick search for a recently
         * used CPU that may be idle.
         */
        int                                recent_used_cpu;
        int                                wake_cpu;
#endif
        int                                on_rq;

        int                                prio;
        int                                static_prio;
        int                                normal_prio;
        unsigned int                        rt_priority;

        const struct sched_class        *sched_class;
        struct sched_entity                se;
        struct sched_rt_entity                rt;
#ifdef CONFIG_CGROUP_SCHED
        struct task_group                *sched_task_group;
#endif
        struct sched_dl_entity                dl;

#ifdef CONFIG_UCLAMP_TASK
        /*
         * Clamp values requested for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp_req[UCLAMP_CNT];
        /*
         * Effective clamp values used for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp[UCLAMP_CNT];
#endif

#ifdef CONFIG_PREEMPT_NOTIFIERS
        /* List of struct preempt_notifier: */
        struct hlist_head                preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
        unsigned int                        btrace_seq;
#endif

        unsigned int                        policy;
        int                                nr_cpus_allowed;
        const cpumask_t                        *cpus_ptr;
        cpumask_t                        cpus_mask;

#ifdef CONFIG_PREEMPT_RCU
        int                                rcu_read_lock_nesting;
        union rcu_special                rcu_read_unlock_special;
        struct list_head                rcu_node_entry;
        struct rcu_node                        *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TASKS_RCU
        unsigned long                        rcu_tasks_nvcsw;
        u8                                rcu_tasks_holdout;
        u8                                rcu_tasks_idx;
        int                                rcu_tasks_idle_cpu;
        struct list_head                rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_TASKS_TRACE_RCU
        int                                trc_reader_nesting;
        int                                trc_ipi_to_cpu;
        union rcu_special                trc_reader_special;
        bool                                trc_reader_checked;
        struct list_head                trc_holdout_list;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */

        struct sched_info                sched_info;

        struct list_head                tasks;
#ifdef CONFIG_SMP
        struct plist_node                pushable_tasks;
        struct rb_node                        pushable_dl_tasks;
#endif

        struct mm_struct                *mm;
        struct mm_struct                *active_mm;

        /* Per-thread vma caching: */
        struct vmacache                        vmacache;

#ifdef SPLIT_RSS_COUNTING
        struct task_rss_stat                rss_stat;
#endif
        int                                exit_state;
        int                                exit_code;
        int                                exit_signal;
        /* The signal sent when the parent dies: */
        int                                pdeath_signal;
        /* JOBCTL_*, siglock protected: */
        unsigned long                        jobctl;

        /* Used for emulating ABI behavior of previous Linux versions: */
        unsigned int                        personality;

        /* Scheduler bits, serialized by scheduler locks: */
        unsigned                        sched_reset_on_fork:1;
        unsigned                        sched_contributes_to_load:1;
        unsigned                        sched_migrated:1;
#ifdef CONFIG_PSI
        unsigned                        sched_psi_wake_requeue:1;
#endif

        /* Force alignment to the next boundary: */
        unsigned                        :0;

        /* Unserialized, strictly 'current' */

        /*
         * This field must not be in the scheduler word above due to wakelist
         * queueing no longer being serialized by p->on_cpu. However:
         *
         * p->XXX = X;                        ttwu()
         * schedule()                          if (p->on_rq && ..) // false
         *   smp_mb__after_spinlock();          if (smp_load_acquire(&p->on_cpu) && //true
         *   deactivate_task()                      ttwu_queue_wakelist())
         *     p->on_rq = 0;                        p->sched_remote_wakeup = Y;
         *
         * guarantees all stores of 'current' are visible before
         * ->sched_remote_wakeup gets used, so it can be in this word.
         */
        unsigned                        sched_remote_wakeup:1;

        /* Bit to tell LSMs we're in execve(): */
        unsigned                        in_execve:1;
        unsigned                        in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
        unsigned                        restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
        unsigned                        in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
        unsigned                        brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
        /* disallow userland-initiated cgroup migration */
        unsigned                        no_cgroup_migration:1;
        /* task is frozen/stopped (used by the cgroup freezer) */
        unsigned                        frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
        unsigned                        use_memdelay:1;
#endif
#ifdef CONFIG_PSI
        /* Stalled due to lack of memory */
        unsigned                        in_memstall:1;
#endif

        unsigned long                        atomic_flags; /* Flags requiring atomic access. */

        struct restart_block                restart_block;

        pid_t                                pid;
        pid_t                                tgid;

#ifdef CONFIG_STACKPROTECTOR
        /* Canary value for the -fstack-protector GCC feature: */
        unsigned long                        stack_canary;
#endif
        /*
         * Pointers to the (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with
         * p->real_parent->pid)
         */

        /* Real parent process: */
        struct task_struct __rcu        *real_parent;

        /* Recipient of SIGCHLD, wait4() reports: */
        struct task_struct __rcu        *parent;

        /*
         * Children/sibling form the list of natural children:
         */
        struct list_head                children;
        struct list_head                sibling;
        struct task_struct                *group_leader;

        /*
         * 'ptraced' is the list of tasks this task is using ptrace() on.
         *
         * This includes both natural children and PTRACE_ATTACH targets.
         * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
         */
        struct list_head                ptraced;
        struct list_head                ptrace_entry;

        /* PID/PID hash table linkage. */
        struct pid                        *thread_pid;
        struct hlist_node                pid_links[PIDTYPE_MAX];
        struct list_head                thread_group;
        struct list_head                thread_node;

        struct completion                *vfork_done;

        /* CLONE_CHILD_SETTID: */
        int __user                        *set_child_tid;

        /* CLONE_CHILD_CLEARTID: */
        int __user                        *clear_child_tid;

        /* PF_IO_WORKER */
        void                                *pf_io_worker;

        u64                                utime;
        u64                                stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        u64                                utimescaled;
        u64                                stimescaled;
#endif
        u64                                gtime;
        struct prev_cputime                prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        struct vtime                        vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
        atomic_t                        tick_dep_mask;
#endif
        /* Context switch counts: */
        unsigned long                        nvcsw;
        unsigned long                        nivcsw;

        /* Monotonic time in nsecs: */
        u64                                start_time;

        /* Boot based time in nsecs: */
        u64                                start_boottime;

        /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
        unsigned long                        min_flt;
        unsigned long                        maj_flt;

        /* Empty if CONFIG_POSIX_CPUTIMERS=n */
        struct posix_cputimers                posix_cputimers;

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
        struct posix_cputimers_work        posix_cputimers_work;
#endif

        /* Process credentials: */

        /* Tracer's credentials at attach: */
        const struct cred __rcu                *ptracer_cred;

        /* Objective and real subjective task credentials (COW): */
        const struct cred __rcu                *real_cred;

        /* Effective (overridable) subjective task credentials (COW): */
        const struct cred __rcu                *cred;

#ifdef CONFIG_KEYS
        /* Cached requested key. */
        struct key                        *cached_requested_key;
#endif

        /*
         * executable name, excluding path.
         *
         * - normally initialized by setup_new_exec()
         * - access it with [gs]et_task_comm()
         * - lock it with task_lock()
         */
        char                                comm[TASK_COMM_LEN];

        struct nameidata                *nameidata;

#ifdef CONFIG_SYSVIPC
        struct sysv_sem                        sysvsem;
        struct sysv_shm                        sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
        unsigned long                        last_switch_count;
        unsigned long                        last_switch_time;
#endif
        /* Filesystem information: */
        struct fs_struct                *fs;

        /* Open file information: */
        struct files_struct                *files;

#ifdef CONFIG_IO_URING
        struct io_uring_task                *io_uring;
#endif

        /* Namespaces: */
        struct nsproxy                        *nsproxy;

        /* Signal handlers: */
        struct signal_struct                *signal;
        struct sighand_struct __rcu                *sighand;
        sigset_t                        blocked;
        sigset_t                        real_blocked;
        /* Restored if set_restore_sigmask() was used: */
        sigset_t                        saved_sigmask;
        struct sigpending                pending;
        unsigned long                        sas_ss_sp;
        size_t                                sas_ss_size;
        unsigned int                        sas_ss_flags;

        struct callback_head                *task_works;

#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
        struct audit_context                *audit_context;
#endif
        kuid_t                                loginuid;
        unsigned int                        sessionid;
#endif
        struct seccomp                        seccomp;

        /* Thread group tracking: */
        u64                                parent_exec_id;
        u64                                self_exec_id;

        /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
        spinlock_t                        alloc_lock;

        /* Protection of the PI data structures: */
        raw_spinlock_t                        pi_lock;

        struct wake_q_node                wake_q;

#ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task: */
        struct rb_root_cached                pi_waiters;
        /* Updated under owner's pi_lock and rq lock */
        struct task_struct                *pi_top_task;
        /* Deadlock detection and priority inheritance handling: */
        struct rt_mutex_waiter                *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
        /* Mutex deadlock detection: */
        struct mutex_waiter                *blocked_on;
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        int                                non_block_count;
#endif

#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                irqtrace;
        unsigned int                        hardirq_threaded;
        u64                                hardirq_chain_key;
        int                                softirqs_enabled;
        int                                softirq_context;
        int                                irq_config;
#endif

#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH                        48UL
        u64                                curr_chain_key;
        int                                lockdep_depth;
        unsigned int                        lockdep_recursion;
        struct held_lock                held_locks[MAX_LOCK_DEPTH];
#endif

#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
        unsigned int                        in_ubsan;
#endif

        /* Journalling filesystem info: */
        void                                *journal_info;

        /* Stacked block device info: */
        struct bio_list                        *bio_list;

#ifdef CONFIG_BLOCK
        /* Stack plugging: */
        struct blk_plug                        *plug;
#endif

        /* VM state: */
        struct reclaim_state                *reclaim_state;

        struct backing_dev_info                *backing_dev_info;

        struct io_context                *io_context;

#ifdef CONFIG_COMPACTION
        struct capture_control                *capture_control;
#endif
        /* Ptrace state: */
        unsigned long                        ptrace_message;
        kernel_siginfo_t                *last_siginfo;

        struct task_io_accounting        ioac;
#ifdef CONFIG_PSI
        /* Pressure stall state */
        unsigned int                        psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
        /* Accumulated RSS usage: */
        u64                                acct_rss_mem1;
        /* Accumulated virtual memory usage: */
        u64                                acct_vm_mem1;
        /* stime + utime since last update: */
        u64                                acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
        /* Protected by ->alloc_lock: */
        nodemask_t                        mems_allowed;
        /* Sequence number to catch updates: */
        seqcount_spinlock_t                mems_allowed_seq;
        int                                cpuset_mem_spread_rotor;
        int                                cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
        /* Control Group info protected by css_set_lock: */
        struct css_set __rcu                *cgroups;
        /* cg_list protected by css_set_lock and tsk->alloc_lock: */
        struct list_head                cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
        u32                                closid;
        u32                                rmid;
#endif
#ifdef CONFIG_FUTEX
        struct robust_list_head __user        *robust_list;
#ifdef CONFIG_COMPAT
        struct compat_robust_list_head __user *compat_robust_list;
#endif
        struct list_head                pi_state_list;
        struct futex_pi_state                *pi_state_cache;
        struct mutex                        futex_exit_mutex;
        unsigned int                        futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
        struct perf_event_context        *perf_event_ctxp[perf_nr_task_contexts];
        struct mutex                        perf_event_mutex;
        struct list_head                perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
        unsigned long                        preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
        /* Protected by alloc_lock: */
        struct mempolicy                *mempolicy;
        short                                il_prev;
        short                                pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
        int                                numa_scan_seq;
        unsigned int                        numa_scan_period;
        unsigned int                        numa_scan_period_max;
        int                                numa_preferred_nid;
        unsigned long                        numa_migrate_retry;
        /* Migration stamp: */
        u64                                node_stamp;
        u64                                last_task_numa_placement;
        u64                                last_sum_exec_runtime;
        struct callback_head                numa_work;

        /*
         * This pointer is only modified for current in syscall and
         * pagefault context (and for tasks being destroyed), so it can be read
         * from any of the following contexts:
         *  - RCU read-side critical section
         *  - current->numa_group from everywhere
         *  - task's runqueue locked, task not running
         */
        struct numa_group __rcu                *numa_group;

        /*
         * numa_faults is an array split into four regions:
         * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
         * in this precise order.
         *
         * faults_memory: Exponential decaying average of faults on a per-node
         * basis. Scheduling placement decisions are made based on these
         * counts. The values remain static for the duration of a PTE scan.
         * faults_cpu: Track the nodes the process was running on when a NUMA
         * hinting fault was incurred.
         * faults_memory_buffer and faults_cpu_buffer: Record faults per node
         * during the current scan window. When the scan completes, the counts
         * in faults_memory and faults_cpu decay and these values are copied.
         */
        unsigned long                        *numa_faults;
        unsigned long                        total_numa_faults;

        /*
         * numa_faults_locality tracks if faults recorded during the last
         * scan window were remote/local or failed to migrate. The task scan
         * period is adapted based on the locality of the faults with different
         * weights depending on whether they were shared or private faults
         */
        unsigned long                        numa_faults_locality[3];

        unsigned long                        numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_RSEQ
        struct rseq __user *rseq;
        u32 rseq_sig;
        /*
         * RmW on rseq_event_mask must be performed atomically
         * with respect to preemption.
         */
        unsigned long rseq_event_mask;
#endif

        struct tlbflush_unmap_batch        tlb_ubc;

        union {
                refcount_t                rcu_users;
                struct rcu_head                rcu;
        };

        /* Cache last used pipe for splice(): */
        struct pipe_inode_info                *splice_pipe;

        struct page_frag                task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
        struct task_delay_info                *delays;
#endif

#ifdef CONFIG_FAULT_INJECTION
        int                                make_it_fail;
        unsigned int                        fail_nth;
#endif
        /*
         * When (nr_dirtied >= nr_dirtied_pause), it's time to call
         * balance_dirty_pages() for a dirty throttling pause:
         */
        int                                nr_dirtied;
        int                                nr_dirtied_pause;
        /* Start of a write-and-pause period: */
        unsigned long                        dirty_paused_when;

#ifdef CONFIG_LATENCYTOP
        int                                latency_record_count;
        struct latency_record                latency_record[LT_SAVECOUNT];
#endif
        /*
         * Time slack values; these are used to round up poll() and
         * select() etc timeout values. These are in nanoseconds.
         */
        u64                                timer_slack_ns;
        u64                                default_timer_slack_ns;

#ifdef CONFIG_KASAN
        unsigned int                        kasan_depth;
#endif

#ifdef CONFIG_KCSAN
        struct kcsan_ctx                kcsan_ctx;
#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                kcsan_save_irqtrace;
#endif
#endif

#if IS_ENABLED(CONFIG_KUNIT)
        struct kunit                        *kunit_test;
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        /* Index of current stored address in ret_stack: */
        int                                curr_ret_stack;
        int                                curr_ret_depth;

        /* Stack of return addresses for return function tracing: */
        struct ftrace_ret_stack                *ret_stack;

        /* Timestamp for last schedule: */
        unsigned long long                ftrace_timestamp;

        /*
         * Number of functions that haven't been traced
         * because of depth overrun:
         */
        atomic_t                        trace_overrun;

        /* Pause tracing: */
        atomic_t                        tracing_graph_pause;
#endif

#ifdef CONFIG_TRACING
        /* State flags for use by tracers: */
        unsigned long                        trace;

        /* Bitmask and counter of trace recursion: */
        unsigned long                        trace_recursion;
#endif /* CONFIG_TRACING */

#ifdef CONFIG_KCOV
        /* See kernel/kcov.c for more details. */

        /* Coverage collection mode enabled for this task (0 if disabled): */
        unsigned int                        kcov_mode;

        /* Size of the kcov_area: */
        unsigned int                        kcov_size;

        /* Buffer for coverage collection: */
        void                                *kcov_area;

        /* KCOV descriptor wired with this task or NULL: */
        struct kcov                        *kcov;

        /* KCOV common handle for remote coverage collection: */
        u64                                kcov_handle;

        /* KCOV sequence number: */
        int                                kcov_sequence;

        /* Collect coverage from softirq context: */
        unsigned int                        kcov_softirq;
#endif

#ifdef CONFIG_MEMCG
        struct mem_cgroup                *memcg_in_oom;
        gfp_t                                memcg_oom_gfp_mask;
        int                                memcg_oom_order;

        /* Number of pages to reclaim on returning to userland: */
        unsigned int                        memcg_nr_pages_over_high;

        /* Used by memcontrol for targeted memcg charge: */
        struct mem_cgroup                *active_memcg;
#endif

#ifdef CONFIG_BLK_CGROUP
        struct request_queue                *throttle_queue;
#endif

#ifdef CONFIG_UPROBES
        struct uprobe_task                *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
        unsigned int                        sequential_io;
        unsigned int                        sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        unsigned long                        task_state_change;
#endif
        int                                pagefault_disabled;
#ifdef CONFIG_MMU
        struct task_struct                *oom_reaper_list;
        struct timer_list                oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
        struct vm_struct                *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /* A live task holds one reference: */
        refcount_t                        stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
        int patch_state;
#endif
#ifdef CONFIG_SECURITY
        /* Used by LSM modules for access restriction: */
        void                                *security;
#endif

#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
        unsigned long                        lowest_stack;
        unsigned long                        prev_lowest_stack;
#endif

#ifdef CONFIG_X86_MCE
        void __user                        *mce_vaddr;
        __u64                                mce_kflags;
        u64                                mce_addr;
        __u64                                mce_ripv : 1,
                                        mce_whole_page : 1,
                                        __mce_reserved : 62;
        struct callback_head                mce_kill_me;
        int                                mce_count;
#endif

        /*
         * New fields for task_struct should be added above here, so that
         * they are included in the randomized portion of task_struct.
         */
        randomized_struct_fields_end

        /* CPU-specific state of this task: */
        struct thread_struct                thread;

        /*
         * WARNING: on x86, 'thread_struct' contains a variable-sized
         * structure.  It *MUST* be at the end of 'task_struct'.
         *
         * Do not put anything below here!
         */
};

/* Hand back the struct pid stored in @task's thread_pid field. */
static inline struct pid *task_pid(struct task_struct *task)
{
        struct pid *pid = task->thread_pid;

        return pid;
}

/*
 * the helpers to get the task's different pids as they are seen
 * from various namespaces
 *
 * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
 * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
 *                     current.
 * task_xid_nr_ns()  : id seen from the ns specified;
 *
 * "xid" is a placeholder for the kind of id being asked for: the helpers
 * in this header spell it pid, tgid, pgrp, session or ppid.
 *
 * see also pid_nr() etc in include/linux/pid.h
 *
 * __task_pid_nr_ns() is only declared here (no body in this header); the
 * namespace-aware helpers forward to it, choosing the id kind with @type
 * and the namespace with @ns.
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);

/* Read @tsk's pid field directly, without any namespace translation. */
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
        pid_t nr = tsk->pid;

        return nr;
}

/* PIDTYPE_PID id of @tsk as looked up by __task_pid_nr_ns() for namespace @ns. */
static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t nr = __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);

        return nr;
}

/* PIDTYPE_PID id of @tsk; a NULL namespace is handed to __task_pid_nr_ns(). */
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
        pid_t vnr = __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);

        return vnr;
}


/* Read @tsk's tgid field directly, without any namespace translation. */
static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
        pid_t tgid = tsk->tgid;

        return tgid;
}

/**
 * pid_alive - tell whether a task structure is still usable
 * @p: Task structure to be checked.
 *
 * A process that is not yet dead (it is a zombie at most) passes this
 * check, which looks at whether @p->thread_pid is still set. Once the
 * check fails, pointers held in the task structure may be stale and
 * must not be dereferenced.
 *
 * Return: 1 if the process is alive. 0 otherwise.
 */
static inline int pid_alive(const struct task_struct *p)
{
        if (p->thread_pid == NULL)
                return 0;

        return 1;
}

/* PIDTYPE_PGID id of @tsk as looked up by __task_pid_nr_ns() for namespace @ns. */
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);

        return pgrp;
}

/* PIDTYPE_PGID id of @tsk; a NULL namespace is handed to __task_pid_nr_ns(). */
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
        pid_t pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);

        return pgrp;
}


/* PIDTYPE_SID id of @tsk as looked up by __task_pid_nr_ns() for namespace @ns. */
static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);

        return sid;
}

/* PIDTYPE_SID id of @tsk; a NULL namespace is handed to __task_pid_nr_ns(). */
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
        pid_t sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);

        return sid;
}

/* PIDTYPE_TGID id of @tsk as looked up by __task_pid_nr_ns() for namespace @ns. */
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);

        return tgid;
}

/* PIDTYPE_TGID id of @tsk; a NULL namespace is handed to __task_pid_nr_ns(). */
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
        pid_t tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);

        return tgid;
}

/*
 * Thread-group id of @tsk's real parent as seen from @ns, or 0 when
 * pid_alive() reports that @tsk is no longer alive. The real_parent
 * pointer is sampled and used inside an RCU read-side section.
 */
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t ppid;

        rcu_read_lock();
        ppid = pid_alive(tsk) ?
                task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns) : 0;
        rcu_read_unlock();

        return ppid;
}

/* task_ppid_nr_ns() evaluated against init_pid_ns. */
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
        struct pid_namespace *ns = &init_pid_ns;

        return task_ppid_nr_ns(tsk, ns);
}

/*
 * Obsolete, do not use.
 *
 * Process-group id of @tsk, resolved against init_pid_ns.
 */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
        struct pid_namespace *ns = &init_pid_ns;

        return task_pgrp_nr_ns(tsk, ns);
}

/*
 * TASK_REPORT_IDLE is the extra reporting value that task_state_index()
 * substitutes when a task's state equals TASK_IDLE; it is defined as one
 * past TASK_REPORT. TASK_REPORT_MAX is that value shifted left by one, and
 * task_state_index() checks at build time that it is a power of two.
 */
#define TASK_REPORT_IDLE        (TASK_REPORT + 1)
#define TASK_REPORT_MAX                (TASK_REPORT_IDLE << 1)

/*
 * Collapse @tsk's scheduler state and exit state into a small index.
 *
 * The state word is sampled once with READ_ONCE(), OR-ed with exit_state
 * and masked by TASK_REPORT. A task whose sampled state is exactly
 * TASK_IDLE is reported as TASK_REPORT_IDLE instead. The result is fls()
 * of the chosen value.
 */
static inline unsigned int task_state_index(struct task_struct *tsk)
{
        unsigned int sampled = READ_ONCE(tsk->state);
        unsigned int reported = (sampled | tsk->exit_state) & TASK_REPORT;

        BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);

        return fls(sampled == TASK_IDLE ? TASK_REPORT_IDLE : reported);
}

/*
 * Map a state index to its one-letter code by indexing a fixed table.
 * A build-time check ties the number of letters in the table to
 * 1 + ilog2(TASK_REPORT_MAX); @state itself is not range-checked here.
 */
static inline char task_index_to_char(unsigned int state)
{
        static const char letters[] = "RSDTtXZPI";

        BUILD_BUG_ON(sizeof(letters) - 1 != 1 + ilog2(TASK_REPORT_MAX));

        return letters[state];
}

/* One-letter state code for @tsk: task_state_index() fed to task_index_to_char(). */
static inline char task_state_to_char(struct task_struct *tsk)
{
        unsigned int idx = task_state_index(tsk);

        return task_index_to_char(idx);
}

/**
 * is_global_init - check if a task structure is init. Since init
 * is free to have sub-threads we need to check tgid.
 * @tsk: Task structure to be checked.
 *
 * Check if a task structure is the first user space task the kernel created.
 *
 * Return: 1 if the task structure is init. 0 otherwise.
 */
static inline int is_global_init(struct task_struct *tsk)
{
        return task_tgid_nr(tsk) == 1;
}

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably the pid signalled on ctrl-alt-del -- confirm
 * against the definition.
 */
extern struct pid *cad_pid;

/*
 * Per process flags
 *
 * Each PF_* value is a single bit; they are tested and modified in a
 * task's ->flags word (see the used_math helpers and is_idle_task() in
 * this header). Bits 0x00000008, 0x00004000, 0x01000000, 0x02000000 and
 * 0x20000000 are not assigned in this list.
 */
#define PF_VCPU                        0x00000001        /* I'm a virtual CPU */
#define PF_IDLE                        0x00000002        /* I am an IDLE thread */
#define PF_EXITING                0x00000004        /* Getting shut down */
#define PF_IO_WORKER                0x00000010        /* Task is an IO worker */
#define PF_WQ_WORKER                0x00000020        /* I'm a workqueue worker */
#define PF_FORKNOEXEC                0x00000040        /* Forked but didn't exec */
#define PF_MCE_PROCESS                0x00000080      /* Process policy on mce errors */
#define PF_SUPERPRIV                0x00000100        /* Used super-user privileges */
#define PF_DUMPCORE                0x00000200        /* Dumped core */
#define PF_SIGNALED                0x00000400        /* Killed by a signal */
#define PF_MEMALLOC                0x00000800        /* Allocating memory */
#define PF_NPROC_EXCEEDED        0x00001000        /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH                0x00002000        /* If unset the fpu must be initialized before use */
#define PF_NOFREEZE                0x00008000        /* This thread should not be frozen */
#define PF_FROZEN                0x00010000        /* Frozen for system suspend */
#define PF_KSWAPD                0x00020000        /* I am kswapd */
#define PF_MEMALLOC_NOFS        0x00040000        /* All allocation requests will inherit GFP_NOFS */
#define PF_MEMALLOC_NOIO        0x00080000        /* All allocation requests will inherit GFP_NOIO */
#define PF_LOCAL_THROTTLE        0x00100000        /* Throttle writes only against the bdi I write to,
                                                 * I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD                0x00200000        /* I am a kernel thread */
#define PF_RANDOMIZE                0x00400000        /* Randomize virtual address space */
#define PF_SWAPWRITE                0x00800000        /* Allowed to write to swap */
#define PF_NO_SETAFFINITY        0x04000000        /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY                0x08000000      /* Early kill for mce process policy */
#define PF_MEMALLOC_NOCMA        0x10000000        /* All allocation requests will have _GFP_MOVABLE cleared */
#define PF_FREEZER_SKIP                0x40000000        /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK                0x80000000      /* This thread called freeze_processes() and should not be frozen */

/*
 * Only the _current_ task can read/write to tsk->flags, but other
 * tasks can access tsk->flags in readonly mode for example
 * with tsk_used_math (like during threaded core dumping).
 * There is however an exception to this rule during ptrace
 * or during fork: the ptracer task is allowed to write to the
 * child->flags of its traced child (same goes for fork, the parent
 * can write to the child->flags), because we're guaranteed the
 * child is not running and in turn not changing child->flags
 * at the same time the parent does it.
 *
 * All of the helpers below are plain (non-atomic) read-modify-write
 * operations on ->flags, which is why the rule above matters.
 */
/* Clear, respectively set, PF_USED_MATH in (child)->flags. */
#define clear_stopped_child_used_math(child)        do { (child)->flags &= ~PF_USED_MATH; } while (0)
#define set_stopped_child_used_math(child)        do { (child)->flags |= PF_USED_MATH; } while (0)
/* Same as the two macros above, applied to current. */
#define clear_used_math()                        clear_stopped_child_used_math(current)
#define set_used_math()                                set_stopped_child_used_math(current)

/*
 * Clear PF_USED_MATH, then set it again only if (condition) is non-zero.
 * Note that (child) is expanded twice, so it should be free of side effects.
 */
#define conditional_stopped_child_used_math(condition, child) \
        do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)

/* conditional_stopped_child_used_math() applied to current. */
#define conditional_used_math(condition)        conditional_stopped_child_used_math(condition, current)

/*
 * Make (child)'s PF_USED_MATH bit equal to current's: clear it, then OR in
 * current's bit. (child) is expanded twice here as well.
 */
#define copy_to_stopped_child_used_math(child) \
        do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)

/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
#define tsk_used_math(p)                        ((p)->flags & PF_USED_MATH)
#define used_math()                                tsk_used_math(current)

/*
 * Report whether current is bound to a single CPU.
 *
 * With CONFIG_SMP this requires both PF_NO_SETAFFINITY in current->flags
 * and nr_cpus_allowed equal to 1. Without CONFIG_SMP the answer is
 * always true.
 */
static __always_inline bool is_percpu_thread(void)
{
#ifdef CONFIG_SMP
        if (!(current->flags & PF_NO_SETAFFINITY))
                return false;

        return current->nr_cpus_allowed == 1;
#else
        return true;
#endif
}

/*
 * Per-process atomic flags.
 *
 * Unlike the PF_* masks, these are bit numbers: the TASK_PFA_* helper
 * generators in this header pass them to test_bit()/set_bit()/clear_bit()
 * on a task's atomic_flags word.
 */
#define PFA_NO_NEW_PRIVS                0        /* May not gain new privileges. */
#define PFA_SPREAD_PAGE                        1        /* Spread page cache over cpuset */
#define PFA_SPREAD_SLAB                        2        /* Spread some slab caches over cpuset */
#define PFA_SPEC_SSB_DISABLE                3        /* Speculative Store Bypass disabled */
#define PFA_SPEC_SSB_FORCE_DISABLE        4        /* Speculative Store Bypass force disabled */
#define PFA_SPEC_IB_DISABLE                5        /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE        6        /* Indirect branch speculation permanently restricted */
#define PFA_SPEC_SSB_NOEXEC                7        /* Speculative Store Bypass clear on execve() */

/*
 * Generators for the per-flag accessors. @name is the PFA_ suffix of the
 * bit number, @func is the suffix of the generated function name.
 */

/* Emits bool task_<func>(p): test_bit() of PFA_<name> in p->atomic_flags. */
#define TASK_PFA_TEST(name, func)                                        \
        static inline bool task_##func(struct task_struct *p)                \
        { return test_bit(PFA_##name, &p->atomic_flags); }

/* Emits void task_set_<func>(p): set_bit() of PFA_<name> in p->atomic_flags. */
#define TASK_PFA_SET(name, func)                                        \
        static inline void task_set_##func(struct task_struct *p)        \
        { set_bit(PFA_##name, &p->atomic_flags); }

/* Emits void task_clear_<func>(p): clear_bit() of PFA_<name> in p->atomic_flags. */
#define TASK_PFA_CLEAR(name, func)                                        \
        static inline void task_clear_##func(struct task_struct *p)        \
        { clear_bit(PFA_##name, &p->atomic_flags); }

/*
 * Accessor instantiations. Flags that get no TASK_PFA_CLEAR() line here
 * (NO_NEW_PRIVS, SPEC_SSB_FORCE_DISABLE, SPEC_IB_FORCE_DISABLE) have no
 * task_clear_*() helper generated in this header.
 */

/* task_no_new_privs(), task_set_no_new_privs() */
TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)

/* task_spread_page(), task_set_spread_page(), task_clear_spread_page() */
TASK_PFA_TEST(SPREAD_PAGE, spread_page)
TASK_PFA_SET(SPREAD_PAGE, spread_page)
TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)

/* task_spread_slab(), task_set_spread_slab(), task_clear_spread_slab() */
TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
TASK_PFA_SET(SPREAD_SLAB, spread_slab)
TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)

/* task_spec_ssb_disable(), task_set_spec_ssb_disable(), task_clear_spec_ssb_disable() */
TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)

/* task_spec_ssb_noexec(), task_set_spec_ssb_noexec(), task_clear_spec_ssb_noexec() */
TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)

/* task_spec_ssb_force_disable(), task_set_spec_ssb_force_disable() */
TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)

/* task_spec_ib_disable(), task_set_spec_ib_disable(), task_clear_spec_ib_disable() */
TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)

/* task_spec_ib_force_disable(), task_set_spec_ib_force_disable() */
TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)

/*
 * Put the bits selected by @flags in current->flags back to the values
 * they have in @orig_flags; bits outside @flags are left untouched.
 */
static inline void
current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
        unsigned long saved_bits = orig_flags & flags;

        /* Drop the selected bits first, then re-apply their saved values. */
        current->flags &= ~flags;
        current->flags |= saved_bits;
}

/*
 * Prototypes only; the implementations live outside this header.
 * NOTE(review): dl_bw_alloc()/dl_bw_free() look like a paired
 * reserve/release of bandwidth on @cpu -- confirm against their definitions.
 */
extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
#ifdef CONFIG_SMP
/* SMP builds: both functions are implemented outside this header. */
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
#else
/* !CONFIG_SMP stub: nothing to do. */
static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
}

/*
 * !CONFIG_SMP stub: the request is accepted (0) only when @new_mask
 * contains CPU 0, otherwise -EINVAL is returned. @p is not touched.
 */
static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
        return cpumask_test_cpu(0, new_mask) ? 0 : -EINVAL;
}
#endif

/* Prototypes only; the implementations live outside this header. */
extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);

/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * The value is derived from @p->static_prio through PRIO_TO_NICE().
 *
 * Return: The nice value [ -20 ... 0 ... 19 ].
 */
static inline int task_nice(const struct task_struct *p)
{
        int nice = PRIO_TO_NICE(p->static_prio);

        return nice;
}

extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);

/**
 * is_idle_task - check whether a task is an idle task.
 * @p: the task in question.
 *
 * Return: true if PF_IDLE is set in @p->flags, false otherwise.
 */
static __always_inline bool is_idle_task(const struct task_struct *p)
{
        return (p->flags & PF_IDLE) != 0;
}

extern struct task_struct *curr_task(int cpu);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);

void yield(void);

/*
 * Overlays a stack array of THREAD_SIZE bytes with, depending on the
 * configuration, a task_struct and/or a thread_info.
 */
union thread_union {
#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
        /* Present unless CONFIG_ARCH_TASK_STRUCT_ON_STACK is set. */
        struct task_struct task;
#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
        /* Present only when thread_info is not embedded in task_struct. */
        struct thread_info thread_info;
#endif
        /* THREAD_SIZE bytes, expressed as an array of longs. */
        unsigned long stack[THREAD_SIZE/sizeof(long)];
};

#ifndef CONFIG_THREAD_INFO_IN_TASK
extern struct thread_info init_thread_info;
#endif

extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];

#ifdef CONFIG_THREAD_INFO_IN_TASK
/* thread_info is a member of task_struct: return the address of that member. */
static inline struct thread_info *task_thread_info(struct task_struct *task)
{
        return &task->thread_info;
}
#elif !defined(__HAVE_THREAD_FUNCTIONS)
/*
 * Fallback used when __HAVE_THREAD_FUNCTIONS is not defined: the
 * task's ->stack pointer is reinterpreted as its thread_info.
 */
# define task_thread_info(task)        ((struct thread_info *)(task)->stack)
#endif

/*
 * find a task by one of its numerical ids
 *
 * find_task_by_pid_ns():
 *      finds a task by its pid in the specified namespace
 * find_task_by_vpid():
 *      finds a task by its virtual pid
 *
 * see also find_vpid() etc in include/linux/pid.h
 */

extern struct task_struct *find_task_by_vpid(pid_t nr);
extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);

/*
 * find a task by its virtual pid and get the task struct
 */
extern struct task_struct *find_get_task_by_vpid(pid_t nr);

extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);

/* kick_process() is a real function only on SMP; on UP it is an empty stub. */
#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif

extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);

/*
 * Convenience wrapper: forwards @tsk and @from to __set_task_comm() with
 * the @exec argument fixed to false.
 */
static inline void set_task_comm(struct task_struct *tsk, const char *from)
{
        __set_task_comm(tsk, from, false);
}

extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
/*
 * get_task_comm(buf, tsk) - call __get_task_comm() with sizeof(buf) as
 * the length.  The build fails unless sizeof(buf) == TASK_COMM_LEN, so
 * @buf has to be an object of exactly that size rather than a pointer
 * to one.  Evaluates to the value returned by __get_task_comm().
 */
#define get_task_comm(buf, tsk) ({                        \
        BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN);        \
        __get_task_comm(buf, sizeof(buf), tsk);                \
})

#ifdef CONFIG_SMP
/* SMP variant: the only work done here is preempt_fold_need_resched(). */
static __always_inline void scheduler_ipi(void)
{
        /*
         * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
         * TIF_NEED_RESCHED remotely (for the first time) will also send
         * this IPI.
         */
        preempt_fold_need_resched();
}
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
#else
/* !CONFIG_SMP stubs: scheduler_ipi() is empty ... */
static inline void scheduler_ipi(void) { }
/* ... and wait_task_inactive() ignores both arguments and always returns 1. */
static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
        return 1;
}
#endif

/*
 * Set thread flags in other task's structures.
 * See asm/thread_info.h for TIF_xxxx flags available:
 */
/* Set @flag in the thread_info obtained from task_thread_info(@tsk). */
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        set_ti_thread_flag(task_thread_info(tsk), flag);
}

/* Clear @flag in the thread_info obtained from task_thread_info(@tsk). */
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        clear_ti_thread_flag(task_thread_info(tsk), flag);
}

/*
 * Forward @flag and @value to update_ti_thread_flag() for @tsk's
 * thread_info.  NOTE(review): presumably sets the flag when @value is
 * true and clears it otherwise -- confirm against update_ti_thread_flag().
 */
static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag,
                                          bool value)
{
        update_ti_thread_flag(task_thread_info(tsk), flag, value);
}

/*
 * Apply test_and_set_ti_thread_flag() to @tsk's thread_info for @flag and
 * return its result.
 */
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
}

/*
 * Apply test_and_clear_ti_thread_flag() to @tsk's thread_info for @flag
 * and return its result.
 */
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
}

/* Return the result of test_ti_thread_flag() for @flag on @tsk's thread_info. */
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        return test_ti_thread_flag(task_thread_info(tsk), flag);
}

/* Raise TIF_NEED_RESCHED on @tsk. */
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
        set_tsk_thread_flag(tsk, TIF_NEED_RESCHED);
}

/* Drop TIF_NEED_RESCHED from @tsk. */
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
        clear_tsk_thread_flag(tsk, TIF_NEED_RESCHED);
}

/*
 * Report whether TIF_NEED_RESCHED is set on @tsk.  The result is wrapped
 * in unlikely(), i.e. the flag is expected to be clear in the common case.
 */
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
        return unlikely(test_tsk_thread_flag(tsk, TIF_NEED_RESCHED));
}

/*
 * cond_resched() and cond_resched_lock(): latency reduction via
 * explicit rescheduling in places that are safe. The return
 * value indicates whether a reschedule was done in fact.
 * cond_resched_lock() will drop the spinlock before scheduling.
 */
/*
 * Without CONFIG_PREEMPTION, _cond_resched() is an out-of-line function
 * defined elsewhere; with CONFIG_PREEMPTION it is an inline stub that
 * always returns 0.
 */
#ifndef CONFIG_PREEMPTION
extern int _cond_resched(void);
#else
static inline int _cond_resched(void) { return 0; }
#endif

/*
 * cond_resched() - call ___might_sleep() with the caller's file/line and
 * an offset of 0, then _cond_resched().  The expression evaluates to the
 * value returned by _cond_resched().
 */
#define cond_resched() ({                        \
        ___might_sleep(__FILE__, __LINE__, 0);        \
        _cond_resched();                        \
})

extern int __cond_resched_lock(spinlock_t *lock);

/*
 * cond_resched_lock(lock) - call ___might_sleep() with the caller's
 * file/line and PREEMPT_LOCK_OFFSET, then __cond_resched_lock(lock).
 * The expression evaluates to the value returned by __cond_resched_lock().
 */
#define cond_resched_lock(lock) ({                                \
        ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
        __cond_resched_lock(lock);                                \
})

/*
 * Leave the RCU read-side critical section, call cond_resched(), and
 * re-enter it.  The body is compiled in only when
 * CONFIG_DEBUG_ATOMIC_SLEEP is set or CONFIG_PREEMPT_RCU is not set;
 * in every other configuration this function is empty.
 */
static inline void cond_resched_rcu(void)
{
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
        rcu_read_unlock();
        cond_resched();
        rcu_read_lock();
#endif
}

/*
 * Should the holder of @lock break its critical section because another
 * task is waiting for it?  (Technically this does not depend on
 * CONFIG_PREEMPTION, but on a general need for low latency.)
 *
 * Without CONFIG_PREEMPTION the answer is always 0; with it, the answer
 * is whatever spin_is_contended() reports for @lock.
 */
static inline int spin_needbreak(spinlock_t *lock)
{
#ifndef CONFIG_PREEMPTION
        return 0;
#else
        return spin_is_contended(lock);
#endif
}

/*
 * Return the result of tif_need_resched(), annotated with unlikely()
 * so that the false case is treated as the expected one.
 */
static __always_inline bool need_resched(void)
{
        return unlikely(tif_need_resched());
}

/*
 * Wrappers for p->thread_info->cpu access. No-op on UP.
 */
#ifdef CONFIG_SMP

/*
 * SMP variant: read the CPU number recorded for @p with READ_ONCE().
 * The value is stored in @p->cpu when thread_info is embedded in
 * task_struct, and in @p's thread_info otherwise.
 */
static inline unsigned int task_cpu(const struct task_struct *p)
{
#ifndef CONFIG_THREAD_INFO_IN_TASK
        return READ_ONCE(task_thread_info(p)->cpu);
#else
        return READ_ONCE(p->cpu);
#endif
}

extern void set_task_cpu(struct task_struct *p, unsigned int cpu);

#else

/* !CONFIG_SMP variant: always reports CPU 0 regardless of @p. */
static inline unsigned int task_cpu(const struct task_struct *p)
{
        return 0;
}

/* !CONFIG_SMP stub: @p and @cpu are ignored and nothing is recorded. */
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}

#endif /* CONFIG_SMP */

/*
 * In order to reduce various lock holder preemption latencies provide an
 * interface to see if a vCPU is currently running or not.
 *
 * This allows us to terminate optimistic spin loops and block, analogous to
 * the native optimistic spin heuristic of testing if the lock owner task is
 * running or not.
 */
#ifndef vcpu_is_preempted
/*
 * Default used when no vcpu_is_preempted macro has been defined before
 * this point: @cpu is ignored and false is always returned.
 */
static inline bool vcpu_is_preempted(int cpu)
{
        return false;
}
#endif

extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);

/* Default TASK_SIZE_OF(): ignores @tsk and expands to plain TASK_SIZE. */
#ifndef TASK_SIZE_OF
#define TASK_SIZE_OF(tsk)        TASK_SIZE
#endif

#ifdef CONFIG_RSEQ

/*
 * Map the event mask on the user-space ABI enum rseq_cs_flags
 * for direct mask checks.
 *
 * Each event bit number below is defined to be equal to the matching
 * RSEQ_CS_FLAG_NO_RESTART_ON_*_BIT value.
 */
enum rseq_event_mask_bits {
        RSEQ_EVENT_PREEMPT_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
        RSEQ_EVENT_SIGNAL_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
        RSEQ_EVENT_MIGRATE_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
};

/* Single-bit mask values built from the bit numbers in rseq_event_mask_bits. */
enum rseq_event_mask {
        RSEQ_EVENT_PREEMPT        = (1U << RSEQ_EVENT_PREEMPT_BIT),
        RSEQ_EVENT_SIGNAL        = (1U << RSEQ_EVENT_SIGNAL_BIT),
        RSEQ_EVENT_MIGRATE        = (1U << RSEQ_EVENT_MIGRATE_BIT),
};

/*
 * Flag @t with TIF_NOTIFY_RESUME, but only if it has an rseq area
 * registered (@t->rseq is non-NULL).
 */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
        if (!t->rseq)
                return;

        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}

void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);

/*
 * Hand @ksig and @regs to __rseq_handle_notify_resume() when the current
 * task has an rseq area registered; otherwise do nothing.
 */
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
        if (!current->rseq)
                return;

        __rseq_handle_notify_resume(ksig, regs);
}

/*
 * Record a signal event for the current task and then run the rseq
 * notify-resume handling with @ksig and @regs.
 */
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
        /* The event mask is updated with preemption disabled. */
        preempt_disable();
        __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
        preempt_enable();
        rseq_handle_notify_resume(ksig, regs);
}

/*
 * rseq_preempt() requires preemption to be disabled.
 *
 * Marks a preempt event in @t->rseq_event_mask and then calls
 * rseq_set_notify_resume() on @t.
 */
static inline void rseq_preempt(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * rseq_migrate() requires preemption to be disabled.
 *
 * Marks a migrate event in @t->rseq_event_mask and then calls
 * rseq_set_notify_resume() on @t.
 */
static inline void rseq_migrate(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Initialise the rseq state of a newly created task @t.
 *
 * Without CLONE_VM in @clone_flags, @t inherits the rseq pointer,
 * signature and event mask of the current task.  With CLONE_VM set,
 * @t starts out unregistered: all three fields are cleared.
 */
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
        if (!(clone_flags & CLONE_VM)) {
                /* Inherit the registration from the current task. */
                t->rseq = current->rseq;
                t->rseq_sig = current->rseq_sig;
                t->rseq_event_mask = current->rseq_event_mask;
                return;
        }

        /* CLONE_VM: start with no rseq registration. */
        t->rseq = NULL;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

/*
 * Reset @t's rseq state: the rseq pointer, the signature and the event
 * mask are all cleared.
 */
static inline void rseq_execve(struct task_struct *t)
{
        t->rseq = NULL;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

#else

/*
 * !CONFIG_RSEQ: every rseq hook below is an empty inline stub so that
 * callers need no #ifdef of their own.
 */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
}
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
}
static inline void rseq_preempt(struct task_struct *t)
{
}
static inline void rseq_migrate(struct task_struct *t)
{
}
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
}
static inline void rseq_execve(struct task_struct *t)
{
}

#endif

/*
 * rseq_syscall() is an out-of-line function only with CONFIG_DEBUG_RSEQ;
 * otherwise it is an empty inline stub.
 */
#ifdef CONFIG_DEBUG_RSEQ

void rseq_syscall(struct pt_regs *regs);

#else

static inline void rseq_syscall(struct pt_regs *regs)
{
}

#endif

const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);

const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);

int sched_trace_rq_cpu(struct rq *rq);
int sched_trace_rq_cpu_capacity(struct rq *rq);
int sched_trace_rq_nr_running(struct rq *rq);

const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

#endif






























    5 

    5 
    3 








    5 

    5 
    4 



    5 













    4 
    5 


    5 








    5 


    3 


    5 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// SPDX-License-Identifier: GPL-2.0
/*
 * preemptoff and irqoff tracepoints
 *
 * Copyright (C) Joel Fernandes (Google) <joel@joelfernandes.org>
 */

#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/kprobes.h>
#include "trace.h"

#define CREATE_TRACE_POINTS
#include <trace/events/preemptirq.h>

#ifdef CONFIG_TRACE_IRQFLAGS
/* Per-cpu variable to prevent redundant calls when IRQs already off */
static DEFINE_PER_CPU(int, tracing_irq_cpu);

/*
 * Variant of trace_hardirqs_on() that leaves out the lockdep calls.  It is
 * meant for the low level entry code, where ordering against RCU matters
 * and lockdep splits its hardirq tracking into an RCU-on and an RCU-off
 * stage.
 *
 * Nothing happens unless this CPU's tracing_irq_cpu flag is set.  When it
 * is, the irq_enable tracepoint fires (skipped in NMI context), the tracer
 * hook runs, and the flag is cleared.
 */
void trace_hardirqs_on_prepare(void)
{
        if (!this_cpu_read(tracing_irq_cpu))
                return;

        if (!in_nmi())
                trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
        tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
        this_cpu_write(tracing_irq_cpu, 0);
}
EXPORT_SYMBOL(trace_hardirqs_on_prepare);
NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);

/*
 * Trace a hardirq-enable event and inform lockdep.
 *
 * The tracing part runs only if this CPU's tracing_irq_cpu flag is set:
 * the rcuidle irq_enable tracepoint fires (skipped in NMI context), the
 * tracer hook is called and the flag is cleared.  The two lockdep calls
 * are made unconditionally afterwards.
 */
void trace_hardirqs_on(void)
{
        if (this_cpu_read(tracing_irq_cpu)) {
                if (!in_nmi())
                        trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
                tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
                this_cpu_write(tracing_irq_cpu, 0);
        }

        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
NOKPROBE_SYMBOL(trace_hardirqs_on);

/*
 * Variant of trace_hardirqs_off() that leaves out the lockdep call.  It is
 * meant for the low level entry code, where ordering against RCU matters
 * and lockdep splits its hardirq tracking into an RCU-on and an RCU-off
 * stage.
 *
 * Nothing happens if this CPU's tracing_irq_cpu flag is already set.
 * Otherwise the flag is set first, then the tracer hook runs and the
 * irq_disable tracepoint fires (skipped in NMI context).
 */
void trace_hardirqs_off_finish(void)
{
        if (this_cpu_read(tracing_irq_cpu))
                return;

        this_cpu_write(tracing_irq_cpu, 1);
        tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
        if (!in_nmi())
                trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
}
EXPORT_SYMBOL(trace_hardirqs_off_finish);
NOKPROBE_SYMBOL(trace_hardirqs_off_finish);

/*
 * Inform lockdep of a hardirq-disable event and trace it.
 *
 * lockdep is called first and unconditionally.  The tracing part runs
 * only if this CPU's tracing_irq_cpu flag is not yet set: the flag is
 * set, the tracer hook is called and the rcuidle irq_disable tracepoint
 * fires (skipped in NMI context).
 */
void trace_hardirqs_off(void)
{
        lockdep_hardirqs_off(CALLER_ADDR0);

        if (!this_cpu_read(tracing_irq_cpu)) {
                this_cpu_write(tracing_irq_cpu, 1);
                tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
                if (!in_nmi())
                        trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
        }
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);

/*
 * Same sequence as trace_hardirqs_on(), except that @caller_addr is used
 * in place of CALLER_ADDR1 for the tracepoint and tracer hook, and in
 * place of CALLER_ADDR0 for the two lockdep calls.
 */
__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
        if (this_cpu_read(tracing_irq_cpu)) {
                if (!in_nmi())
                        trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
                tracer_hardirqs_on(CALLER_ADDR0, caller_addr);
                this_cpu_write(tracing_irq_cpu, 0);
        }

        lockdep_hardirqs_on_prepare(caller_addr);
        lockdep_hardirqs_on(caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
NOKPROBE_SYMBOL(trace_hardirqs_on_caller);

/*
 * Same sequence as trace_hardirqs_off(), except that @caller_addr is used
 * in place of CALLER_ADDR0 for the lockdep call, and in place of
 * CALLER_ADDR1 for the tracer hook and tracepoint.
 */
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
        lockdep_hardirqs_off(caller_addr);

        if (!this_cpu_read(tracing_irq_cpu)) {
                this_cpu_write(tracing_irq_cpu, 1);
                tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
                if (!in_nmi())
                        trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
        }
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */

#ifdef CONFIG_TRACE_PREEMPT_TOGGLE

/*
 * Fire the rcuidle preempt_enable tracepoint with @a0 and @a1 (skipped
 * in NMI context), then always call the tracer_preempt_on() hook with
 * the same two values.
 */
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
        if (!in_nmi())
                trace_preempt_enable_rcuidle(a0, a1);
        tracer_preempt_on(a0, a1);
}

/*
 * Fire the rcuidle preempt_disable tracepoint with @a0 and @a1 (skipped
 * in NMI context), then always call the tracer_preempt_off() hook with
 * the same two values.
 */
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
        if (!in_nmi())
                trace_preempt_disable_rcuidle(a0, a1);
        tracer_preempt_off(a0, a1);
}
#endif















    5 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H

#include <linux/compiler.h>
#include <asm/percpu.h>

#ifndef __ASSEMBLY__
struct task_struct;

DECLARE_PER_CPU(struct task_struct *, current_task);

/*
 * Return the task_struct pointer held in this CPU's current_task per-CPU
 * variable, read with this_cpu_read_stable().  The `current` macro
 * expands to a call of this function.
 */
static __always_inline struct task_struct *get_current(void)
{
        return this_cpu_read_stable(current_task);
}

#define current get_current()

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_CURRENT_H */
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 
    5 






    5 









































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Security plug functions
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2016 Mellanox Technologies
 */

#define pr_fmt(fmt) "LSM: " fmt

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/lsm_hooks.h>
#include <linux/integrity.h>
#include <linux/ima.h>
#include <linux/evm.h>
#include <linux/fsnotify.h>
#include <linux/mman.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/backing-dev.h>
#include <linux/string.h>
#include <linux/msg.h>
#include <net/flow.h>

#define MAX_LSM_EVM_XATTR        2

/* How many LSMs were built into the kernel? */
#define LSM_COUNT (__end_lsm_info - __start_lsm_info)

/*
 * These are descriptions of the reasons that can be passed to the
 * security_locked_down() LSM hook. Placing this array here allows
 * all security modules to use the same descriptions for auditing
 * purposes.
 *
 * The table is indexed by the LOCKDOWN_* constants through designated
 * initializers and has LOCKDOWN_CONFIDENTIALITY_MAX + 1 slots, so any
 * index without an initializer below is a NULL pointer.
 */
const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
        [LOCKDOWN_NONE] = "none",
        [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading",
        [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port",
        [LOCKDOWN_EFI_TEST] = "/dev/efi_test access",
        [LOCKDOWN_KEXEC] = "kexec of unsigned images",
        [LOCKDOWN_HIBERNATION] = "hibernation",
        [LOCKDOWN_PCI_ACCESS] = "direct PCI access",
        [LOCKDOWN_IOPORT] = "raw io port access",
        [LOCKDOWN_MSR] = "raw MSR access",
        [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables",
        [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage",
        [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO",
        [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters",
        [LOCKDOWN_MMIOTRACE] = "unsafe mmio",
        [LOCKDOWN_DEBUGFS] = "debugfs access",
        [LOCKDOWN_XMON_WR] = "xmon write access",
        [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM",
        [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM",
        [LOCKDOWN_INTEGRITY_MAX] = "integrity",
        [LOCKDOWN_KCORE] = "/proc/kcore access",
        [LOCKDOWN_KPROBES] = "use of kprobes",
        [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM",
        [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM",
        [LOCKDOWN_PERF] = "unsafe use of perf",
        [LOCKDOWN_TRACEFS] = "use of tracefs",
        [LOCKDOWN_XMON_RW] = "xmon read and write access",
        [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
};

/*
 * One hlist head per LSM hook. The heads are initialised in
 * early_security_init() and walked by the call_*_hook() macros.
 */
struct security_hook_heads security_hook_heads __lsm_ro_after_init;
/* Chain used by the {call,register,unregister}_blocking_lsm_notifier() API. */
static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);

/*
 * Slab caches for the composite file and inode blobs. Each is created in
 * ordered_lsm_init() only when the matching blob size is non-zero, so both
 * may remain NULL.
 */
static struct kmem_cache *lsm_file_cache;
static struct kmem_cache *lsm_inode_cache;

/* Comma-separated list of LSM names, built up by lsm_append(). */
char *lsm_names;
/* Running total of each blob size, accumulated by lsm_set_blob_sizes(). */
static struct lsm_blob_sizes blob_sizes __lsm_ro_after_init;

/* Boot-time LSM user choice: values of the "lsm=" and "security=" parameters. */
static __initdata const char *chosen_lsm_order;
static __initdata const char *chosen_major_lsm;

/* Build-time LSM order (CONFIG_LSM), used when "lsm=" was not given. */
static __initconst const char * const builtin_lsm_order = CONFIG_LSM;

/*
 * Ordered list of LSMs to initialize: a NULL-terminated array allocated in
 * ordered_lsm_init(). "exclusive" is the LSM_FLAG_EXCLUSIVE LSM picked by
 * prepare_lsm(), or NULL if none has been picked yet.
 */
static __initdata struct lsm_info **ordered_lsms;
static __initdata struct lsm_info *exclusive;

/* Set by the "lsm.debug" boot parameter; gates init_debug() output. */
static __initdata bool debug;
#define init_debug(...)                                                \
        do {                                                        \
                if (debug)                                        \
                        pr_info(__VA_ARGS__);                        \
        } while (0)

/*
 * Report whether @lsm is enabled. An LSM that has no enabled variable
 * attached is reported as disabled; otherwise the variable's value decides.
 */
static bool __init is_enabled(struct lsm_info *lsm)
{
        return lsm->enabled != NULL && *lsm->enabled != 0;
}

/*
 * Shared storage for the enabled state of LSMs that did not supply their
 * own enabled variable. These are never written through; set_enabled()
 * switches the LSM's pointer between them instead.
 */
static int lsm_enabled_true __initdata = 1;
static int lsm_enabled_false __initdata = 0;

/* Record @enabled as the enabled state of @lsm. */
static void __init set_enabled(struct lsm_info *lsm, bool enabled)
{
        bool uses_shared_flag = !lsm->enabled ||
                                lsm->enabled == &lsm_enabled_true ||
                                lsm->enabled == &lsm_enabled_false;

        /* The LSM owns its enable variable: store the new state in it. */
        if (!uses_shared_flag) {
                *lsm->enabled = enabled;
                return;
        }

        /*
         * No private variable: point at whichever hard-coded location
         * holds the requested state.
         */
        lsm->enabled = enabled ? &lsm_enabled_true : &lsm_enabled_false;
}

/*
 * Check whether @lsm is already present in ordered_lsms. The array is
 * scanned up to its terminating NULL entry.
 */
static bool __init exists_ordered_lsm(struct lsm_info *lsm)
{
        struct lsm_info **slot = ordered_lsms;

        while (*slot) {
                if (*slot == lsm)
                        return true;
                slot++;
        }

        return false;
}

/* Index of the next free slot in ordered_lsms. */
static int last_lsm __initdata;

/*
 * Add @lsm to the end of ordered_lsms unless it is already listed.
 * @from names the source of the request and only appears in diagnostics.
 */
static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
{
        /* A second selection of the same LSM is a no-op. */
        if (exists_ordered_lsm(lsm))
                return;

        if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from))
                return;

        /* An LSM without an enabled variable defaults to enabled. */
        if (!lsm->enabled)
                lsm->enabled = &lsm_enabled_true;

        ordered_lsms[last_lsm] = lsm;
        last_lsm++;

        init_debug("%s ordering: %s (%sabled)\n", from, lsm->name,
                   is_enabled(lsm) ? "en" : "dis");
}

/*
 * Decide whether @lsm may be initialized: it must be enabled, and if it is
 * an exclusive LSM no other exclusive LSM may have been chosen already.
 */
static bool __init lsm_allowed(struct lsm_info *lsm)
{
        if (!is_enabled(lsm))
                return false;

        /* Non-exclusive LSMs, and the first exclusive one, are fine. */
        if (!exclusive || !(lsm->flags & LSM_FLAG_EXCLUSIVE))
                return true;

        init_debug("exclusive disabled: %s\n", lsm->name);
        return false;
}

/*
 * Reserve room for one module in a composite blob.
 *
 * On entry *@need is the number of bytes the module wants and *@lbs is the
 * running size of the composite blob. If *@need is positive, *@lbs grows by
 * that amount and *@need is overwritten with the module's offset (the old
 * *@lbs). A request of zero or less leaves both values untouched.
 */
static void __init lsm_set_blob_size(int *need, int *lbs)
{
        int start;

        if (*need <= 0)
                return;

        start = *lbs;
        *lbs = start + *need;
        *need = start;
}

/*
 * Fold one module's blob requirements (@needed, may be NULL) into the
 * global blob_sizes, turning each requested size into that module's offset
 * via lsm_set_blob_size().
 */
static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
{
        if (needed == NULL)
                return;

        lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred);
        lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file);

        /*
         * The first module that wants inode space also causes an rcu_head
         * to be reserved at the front of the inode blob.
         */
        if (blob_sizes.lbs_inode == 0 && needed->lbs_inode)
                blob_sizes.lbs_inode = sizeof(struct rcu_head);
        lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);

        lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
        lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
        lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
}

/*
 * Pre-initialization step for @lsm: settle whether it will run, remember
 * the first exclusive LSM, and account for its blob space.
 */
static void __init prepare_lsm(struct lsm_info *lsm)
{
        int allowed = lsm_allowed(lsm);

        /* Store the verdict so later exclusive LSMs see it. */
        set_enabled(lsm, allowed);

        if (!allowed)
                return;

        if (!exclusive && (lsm->flags & LSM_FLAG_EXCLUSIVE)) {
                exclusive = lsm;
                init_debug("exclusive chosen: %s\n", lsm->name);
        }

        lsm_set_blob_sizes(lsm->blobs);
}

/*
 * Run the init callback of @lsm if the LSM is enabled. A non-zero return
 * from the callback triggers a WARN but is otherwise not acted upon.
 */
static void __init initialize_lsm(struct lsm_info *lsm)
{
        int err;

        if (!is_enabled(lsm))
                return;

        init_debug("initializing %s\n", lsm->name);
        err = lsm->init();
        WARN(err, "%s failed to initialize: %d\n", lsm->name, err);
}

/*
 * Populate ordered LSMs list from comma-separated LSM name list.
 *
 * @order:  comma-separated LSM names in the requested initialization order
 * @origin: label for where @order came from; used in debug messages only
 *
 * The list is built in this sequence:
 *   1. every LSM_ORDER_FIRST LSM;
 *   2. if "security=" was given, all non-matching legacy major LSMs are
 *      disabled;
 *   3. each LSM_ORDER_MUTABLE LSM named in @order, in that order;
 *   4. if "security=" was given, the LSM it names (if not yet listed);
 *   5. every LSM still not listed is disabled.
 *
 * Panics if the working copy of @order cannot be allocated.
 */
static void __init ordered_lsm_parse(const char *order, const char *origin)
{
        struct lsm_info *lsm;
        char *sep, *name, *next;

        /* LSM_ORDER_FIRST is always first. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order == LSM_ORDER_FIRST)
                        append_ordered_lsm(lsm, "first");
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                struct lsm_info *major;

                /*
                 * To match the original "security=" behavior, this
                 * explicitly does NOT fallback to another Legacy Major
                 * if the selected one was separately disabled: disable
                 * all non-matching Legacy Major LSMs.
                 */
                for (major = __start_lsm_info; major < __end_lsm_info;
                     major++) {
                        if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
                            strcmp(major->name, chosen_major_lsm) != 0) {
                                set_enabled(major, false);
                                init_debug("security=%s disabled: %s\n",
                                           chosen_major_lsm, major->name);
                        }
                }
        }

        /*
         * strsep() writes into the string it walks, so parse a private
         * copy. If the copy cannot be made, nothing from @order would be
         * appended and the final loop below would quietly disable every
         * LSM that was only named there; refuse to continue instead.
         */
        sep = kstrdup(order, GFP_KERNEL);
        if (!sep)
                panic("%s: Cannot allocate LSM order list.\n", __func__);
        next = sep;
        /* Walk the list, looking for matching LSMs. */
        while ((name = strsep(&next, ",")) != NULL) {
                bool found = false;

                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (lsm->order == LSM_ORDER_MUTABLE &&
                            strcmp(lsm->name, name) == 0) {
                                append_ordered_lsm(lsm, origin);
                                found = true;
                        }
                }

                if (!found)
                        init_debug("%s ignored: %s\n", origin, name);
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (exists_ordered_lsm(lsm))
                                continue;
                        if (strcmp(lsm->name, chosen_major_lsm) == 0)
                                append_ordered_lsm(lsm, "security=");
                }
        }

        /* Disable all LSMs not in the ordered list. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (exists_ordered_lsm(lsm))
                        continue;
                set_enabled(lsm, false);
                init_debug("%s disabled: %s\n", origin, lsm->name);
        }

        kfree(sep);
}

static void __init lsm_early_cred(struct cred *cred);
static void __init lsm_early_task(struct task_struct *task);

static int lsm_append(const char *new, char **result);

/*
 * Build the ordered LSM list, size the composite blobs, and initialize
 * every enabled LSM in order.
 *
 * The order comes from "lsm=" when given (in which case "security=" is
 * ignored), otherwise from the built-in CONFIG_LSM string. After all LSMs
 * have been prepared, the file and inode blob caches are created if needed,
 * blobs are attached to the current cred and task, and each LSM is
 * initialized. The temporary ordered list is freed before returning.
 *
 * Panics if the ordered list itself cannot be allocated.
 */
static void __init ordered_lsm_init(void)
{
        struct lsm_info **lsm;

        /* One extra, zeroed slot serves as the NULL terminator. */
        ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms),
                                GFP_KERNEL);
        /*
         * Everything below walks or stores through ordered_lsms, so a
         * failed allocation cannot be survived.
         */
        if (!ordered_lsms)
                panic("%s: Cannot allocate ordered LSM list.\n", __func__);

        if (chosen_lsm_order) {
                if (chosen_major_lsm) {
                        pr_info("security= is ignored because it is superseded by lsm=\n");
                        chosen_major_lsm = NULL;
                }
                ordered_lsm_parse(chosen_lsm_order, "cmdline");
        } else
                ordered_lsm_parse(builtin_lsm_order, "builtin");

        for (lsm = ordered_lsms; *lsm; lsm++)
                prepare_lsm(*lsm);

        init_debug("cred blob size     = %d\n", blob_sizes.lbs_cred);
        init_debug("file blob size     = %d\n", blob_sizes.lbs_file);
        init_debug("inode blob size    = %d\n", blob_sizes.lbs_inode);
        init_debug("ipc blob size      = %d\n", blob_sizes.lbs_ipc);
        init_debug("msg_msg blob size  = %d\n", blob_sizes.lbs_msg_msg);
        init_debug("task blob size     = %d\n", blob_sizes.lbs_task);

        /*
         * Create any kmem_caches needed for blobs
         */
        if (blob_sizes.lbs_file)
                lsm_file_cache = kmem_cache_create("lsm_file_cache",
                                                   blob_sizes.lbs_file, 0,
                                                   SLAB_PANIC, NULL);
        if (blob_sizes.lbs_inode)
                lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
                                                    blob_sizes.lbs_inode, 0,
                                                    SLAB_PANIC, NULL);

        /* Blob sizes are final now: attach blobs to the current cred/task. */
        lsm_early_cred((struct cred *) current->cred);
        lsm_early_task(current);
        for (lsm = ordered_lsms; *lsm; lsm++)
                initialize_lsm(*lsm);

        kfree(ordered_lsms);
}

/*
 * Initialize every hook list head in security_hook_heads, then prepare and
 * initialize each LSM found in the early LSM info section. An early LSM
 * with no enabled variable is treated as enabled. Always returns 0.
 */
int __init early_security_init(void)
{
        struct hlist_head *head = (struct hlist_head *) &security_hook_heads;
        struct hlist_head *end = head +
                sizeof(security_hook_heads) / sizeof(struct hlist_head);
        struct lsm_info *lsm;

        /* security_hook_heads is walked as a flat array of hlist heads. */
        for (; head < end; head++)
                INIT_HLIST_HEAD(head);

        lsm = __start_early_lsm_info;
        while (lsm < __end_early_lsm_info) {
                if (!lsm->enabled)
                        lsm->enabled = &lsm_enabled_true;
                prepare_lsm(lsm);
                initialize_lsm(lsm);
                lsm++;
        }

        return 0;
}

/**
 * security_init - initializes the security framework
 *
 * This should be called early in the kernel initialization sequence.
 *
 * Records the names of the enabled early LSMs in lsm_names and then
 * prepares and initializes the remaining LSMs in their configured order.
 *
 * Returns 0.
 */
int __init security_init(void)
{
        struct lsm_info *lsm;

        pr_info("Security Framework initializing\n");

        /*
         * Append the names of the early LSM modules now that kmalloc() is
         * available.
         *
         * early_security_init() leaves every early LSM with a non-NULL
         * ->enabled pointer, whether or not the LSM ended up enabled, so
         * the pointer alone says nothing: the value it points at must be
         * checked to keep disabled LSMs out of lsm_names.
         */
        for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
                if (is_enabled(lsm))
                        lsm_append(lsm->name, &lsm_names);
        }

        /* Load LSMs in specified order. */
        ordered_lsm_init();

        return 0;
}

/*
 * "security=" boot parameter handler: remember the requested major LSM
 * name in chosen_major_lsm. The string is stored by reference, not copied.
 * Always returns 1.
 */
static int __init choose_major_lsm(char *str)
{
        chosen_major_lsm = str;

        return 1;
}
__setup("security=", choose_major_lsm);

/*
 * "lsm=" boot parameter handler: remember the requested LSM order string
 * in chosen_lsm_order. The string is stored by reference, not copied.
 * Always returns 1.
 */
static int __init choose_lsm_order(char *str)
{
        chosen_lsm_order = str;

        return 1;
}
__setup("lsm=", choose_lsm_order);

/*
 * "lsm.debug" boot parameter handler: switch on init_debug() output.
 * @str is not examined. Always returns 1.
 */
static int __init enable_debug(char *str)
{
        debug = true;

        return 1;
}
__setup("lsm.debug", enable_debug);

/*
 * Return true when @lsm equals the final entry of the comma-separated
 * @list. Warns and returns false if either argument is NULL.
 */
static bool match_last_lsm(const char *list, const char *lsm)
{
        const char *tail;

        if (WARN_ON(!list || !lsm))
                return false;

        /*
         * The last entry starts right after the final comma, or at the
         * beginning of the list when there is no comma at all.
         */
        tail = strrchr(list, ',');
        tail = tail ? tail + 1 : list;

        return strcmp(tail, lsm) == 0;
}

/*
 * Append the name @new to the comma-separated, heap-allocated list in
 * *@result, allocating the list on first use. Nothing is appended when
 * @new already is the last entry. On success the previous list buffer is
 * freed and *@result points at the new one.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails (in which case an
 * existing list is left as it was).
 */
static int lsm_append(const char *new, char **result)
{
        char *joined;

        /* First name: the list is just a copy of it. */
        if (*result == NULL) {
                *result = kstrdup(new, GFP_KERNEL);
                return *result ? 0 : -ENOMEM;
        }

        /* Check if it is the last registered name */
        if (match_last_lsm(*result, new))
                return 0;

        joined = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
        if (joined == NULL)
                return -ENOMEM;

        kfree(*result);
        *result = joined;
        return 0;
}

/**
 * security_add_hooks - Add a module's hooks to the hook lists.
 * @hooks: the hooks to add
 * @count: the number of hooks to add
 * @lsm: the name of the security module
 *
 * Each LSM has to register its hooks with the infrastructure. Every entry
 * of @hooks is tagged with @lsm and added to the tail of the list its
 * ->head points to. Once the slab allocator is available, @lsm is also
 * recorded in lsm_names; failure to do so panics.
 */
void __init security_add_hooks(struct security_hook_list *hooks, int count,
                                char *lsm)
{
        int idx = 0;

        while (idx < count) {
                struct security_hook_list *entry = &hooks[idx];

                entry->lsm = lsm;
                hlist_add_tail_rcu(&entry->list, entry->head);
                idx++;
        }

        /*
         * Don't try to append during early_security_init(), we'll come back
         * and fix this up afterwards.
         */
        if (!slab_is_available())
                return;

        if (lsm_append(lsm, &lsm_names) < 0)
                panic("%s - Cannot get early memory.\n", __func__);
}

/*
 * Run the blocking LSM notifier chain for @event, handing @data to each
 * notifier. Returns whatever blocking_notifier_call_chain() returns.
 */
int call_blocking_lsm_notifier(enum lsm_event event, void *data)
{
        int rc;

        rc = blocking_notifier_call_chain(&blocking_lsm_notifier_chain,
                                          event, data);
        return rc;
}
EXPORT_SYMBOL(call_blocking_lsm_notifier);

/*
 * Add @nb to the blocking LSM notifier chain. Returns whatever
 * blocking_notifier_chain_register() returns.
 */
int register_blocking_lsm_notifier(struct notifier_block *nb)
{
        int rc;

        rc = blocking_notifier_chain_register(&blocking_lsm_notifier_chain,
                                              nb);
        return rc;
}
EXPORT_SYMBOL(register_blocking_lsm_notifier);

/*
 * Remove @nb from the blocking LSM notifier chain. Returns whatever
 * blocking_notifier_chain_unregister() returns.
 */
int unregister_blocking_lsm_notifier(struct notifier_block *nb)
{
        int rc;

        rc = blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain,
                                                nb);
        return rc;
}
EXPORT_SYMBOL(unregister_blocking_lsm_notifier);

/**
 * lsm_cred_alloc - allocate a composite cred blob
 * @cred: the cred that needs a blob
 * @gfp: allocation type
 *
 * Allocate one zeroed blob of blob_sizes.lbs_cred bytes shared by all the
 * modules and store it in @cred->security. When no module asked for cred
 * space the pointer is set to NULL and no allocation is made.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
{
        void *blob = NULL;
        int rc = 0;

        if (blob_sizes.lbs_cred != 0) {
                blob = kzalloc(blob_sizes.lbs_cred, gfp);
                if (blob == NULL)
                        rc = -ENOMEM;
        }

        cred->security = blob;
        return rc;
}

/**
 * lsm_early_cred - during initialization allocate a composite cred blob
 * @cred: the cred that needs a blob
 *
 * Allocate the cred blob for all the modules with GFP_KERNEL and panic if
 * that fails.
 */
static void __init lsm_early_cred(struct cred *cred)
{
        if (lsm_cred_alloc(cred, GFP_KERNEL))
                panic("%s: Early cred alloc failed.\n", __func__);
}

/**
 * lsm_file_alloc - allocate a composite file blob
 * @file: the file that needs a blob
 *
 * Allocate a zeroed blob from lsm_file_cache for all the modules and store
 * it in @file->f_security. When the cache was never created the pointer is
 * set to NULL and no allocation is made.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_file_alloc(struct file *file)
{
        void *blob = NULL;
        int rc = 0;

        if (lsm_file_cache) {
                blob = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL);
                if (blob == NULL)
                        rc = -ENOMEM;
        }

        file->f_security = blob;
        return rc;
}

/**
 * lsm_inode_alloc - allocate a composite inode blob
 * @inode: the inode that needs a blob
 *
 * Allocate a zeroed blob from lsm_inode_cache (using GFP_NOFS) for all the
 * modules and store it in @inode->i_security. When the cache was never
 * created the pointer is set to NULL and no allocation is made.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
int lsm_inode_alloc(struct inode *inode)
{
        void *blob = NULL;
        int rc = 0;

        if (lsm_inode_cache) {
                blob = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS);
                if (blob == NULL)
                        rc = -ENOMEM;
        }

        inode->i_security = blob;
        return rc;
}

/**
 * lsm_task_alloc - allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate one zeroed blob of blob_sizes.lbs_task bytes shared by all the
 * modules and store it in @task->security. When no module asked for task
 * space the pointer is set to NULL and no allocation is made.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_task_alloc(struct task_struct *task)
{
        void *blob = NULL;
        int rc = 0;

        if (blob_sizes.lbs_task != 0) {
                blob = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
                if (blob == NULL)
                        rc = -ENOMEM;
        }

        task->security = blob;
        return rc;
}

/**
 * lsm_ipc_alloc - allocate a composite ipc blob
 * @kip: the ipc that needs a blob
 *
 * Allocate one zeroed blob of blob_sizes.lbs_ipc bytes shared by all the
 * modules and store it in @kip->security. When no module asked for ipc
 * space the pointer is set to NULL and no allocation is made.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
{
        void *blob = NULL;
        int rc = 0;

        if (blob_sizes.lbs_ipc != 0) {
                blob = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL);
                if (blob == NULL)
                        rc = -ENOMEM;
        }

        kip->security = blob;
        return rc;
}

/**
 * lsm_msg_msg_alloc - allocate a composite msg_msg blob
 * @mp: the msg_msg that needs a blob
 *
 * Allocate one zeroed blob of blob_sizes.lbs_msg_msg bytes shared by all
 * the modules and store it in @mp->security. When no module asked for
 * msg_msg space the pointer is set to NULL and no allocation is made.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_msg_msg_alloc(struct msg_msg *mp)
{
        void *blob = NULL;
        int rc = 0;

        if (blob_sizes.lbs_msg_msg != 0) {
                blob = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL);
                if (blob == NULL)
                        rc = -ENOMEM;
        }

        mp->security = blob;
        return rc;
}

/**
 * lsm_early_task - during initialization allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate the task blob for all the modules and panic if that fails.
 */
static void __init lsm_early_task(struct task_struct *task)
{
        if (lsm_task_alloc(task))
                panic("%s: Early task alloc failed.\n", __func__);
}

/*
 * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
 * can be accessed with:
 *
 *        LSM_RET_DEFAULT(<hook_name>)
 *
 * The macros below define static constants for the default value of each
 * LSM hook.
 *
 * How it works: LSM_HOOK() is defined just for the include below, so that
 * every LSM_HOOK(RET, DEFAULT, NAME, ...) entry in the header expands to
 * DECLARE_LSM_RET_DEFAULT_<RET>(DEFAULT, NAME). The "void" variant expands
 * to nothing; the "int" variant defines "static const int <NAME>_default".
 * LSM_HOOK is undefined again right after the include.
 */
#define LSM_RET_DEFAULT(NAME) (NAME##_default)
#define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME)
#define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \
        static const int LSM_RET_DEFAULT(NAME) = (DEFAULT);
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME)

#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/*
 * Hook list operation macros.
 *
 * call_void_hook:
 *        This is a hook that does not return a value.
 *        Every entry on security_hook_heads.FUNC is called in list order
 *        with the given arguments.
 *
 * call_int_hook:
 *        This is a hook that returns a value.
 *        The result starts out as IRC. Entries on security_hook_heads.FUNC
 *        are called in list order and each return value replaces the
 *        result; the walk stops at the first non-zero return. The macro
 *        therefore evaluates to IRC when the list is empty, to the first
 *        non-zero hook return otherwise, or to 0 when every hook returned 0.
 *
 * In both macros the arguments are expanded inside the loop body, so they
 * are evaluated once per hook called.
 */

#define call_void_hook(FUNC, ...)                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \
                        P->hook.FUNC(__VA_ARGS__);                \
        } while (0)

#define call_int_hook(FUNC, IRC, ...) ({                        \
        int RC = IRC;                                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \
                        RC = P->hook.FUNC(__VA_ARGS__);                \
                        if (RC != 0)                                \
                                break;                                \
                }                                                \
        } while (0);                                                \
        RC;                                                        \
})

/* Security operations */

/*
 * Call the binder_set_context_mgr hook of each registered LSM for @mgr.
 * Returns the first non-zero hook result, or 0 if there is none.
 */
int security_binder_set_context_mgr(const struct cred *mgr)
{
        int rc;

        rc = call_int_hook(binder_set_context_mgr, 0, mgr);
        return rc;
}

/*
 * Call the binder_transaction hook of each registered LSM with @from and
 * @to. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_binder_transaction(const struct cred *from,
                                const struct cred *to)
{
        int rc;

        rc = call_int_hook(binder_transaction, 0, from, to);
        return rc;
}

/*
 * Call the binder_transfer_binder hook of each registered LSM with @from
 * and @to. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_binder_transfer_binder(const struct cred *from,
                                    const struct cred *to)
{
        int rc;

        rc = call_int_hook(binder_transfer_binder, 0, from, to);
        return rc;
}

/*
 * Call the binder_transfer_file hook of each registered LSM with @from,
 * @to and @file. Returns the first non-zero hook result, or 0 if there is
 * none.
 */
int security_binder_transfer_file(const struct cred *from,
                                  const struct cred *to, struct file *file)
{
        int rc;

        rc = call_int_hook(binder_transfer_file, 0, from, to, file);
        return rc;
}

/*
 * Call the ptrace_access_check hook of each registered LSM with @child and
 * @mode. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        int rc;

        rc = call_int_hook(ptrace_access_check, 0, child, mode);
        return rc;
}

/*
 * Call the ptrace_traceme hook of each registered LSM with @parent.
 * Returns the first non-zero hook result, or 0 if there is none.
 */
int security_ptrace_traceme(struct task_struct *parent)
{
        int rc;

        rc = call_int_hook(ptrace_traceme, 0, parent);
        return rc;
}

/*
 * Call the capget hook of each registered LSM, passing @target and the
 * three capability set pointers through unchanged. Returns the first
 * non-zero hook result, or 0 if there is none.
 */
int security_capget(struct task_struct *target,
                     kernel_cap_t *effective,
                     kernel_cap_t *inheritable,
                     kernel_cap_t *permitted)
{
        int rc;

        rc = call_int_hook(capget, 0, target,
                           effective, inheritable, permitted);
        return rc;
}

/*
 * Call the capset hook of each registered LSM, passing @new, @old and the
 * three capability set pointers through unchanged. Returns the first
 * non-zero hook result, or 0 if there is none.
 */
int security_capset(struct cred *new, const struct cred *old,
                    const kernel_cap_t *effective,
                    const kernel_cap_t *inheritable,
                    const kernel_cap_t *permitted)
{
        int rc;

        rc = call_int_hook(capset, 0, new, old,
                           effective, inheritable, permitted);
        return rc;
}

/*
 * Call the capable hook of each registered LSM with @cred, @ns, @cap and
 * @opts. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_capable(const struct cred *cred,
                     struct user_namespace *ns,
                     int cap,
                     unsigned int opts)
{
        int rc;

        rc = call_int_hook(capable, 0, cred, ns, cap, opts);
        return rc;
}

/*
 * Call the quotactl hook of each registered LSM with @cmds, @type, @id and
 * @sb. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_quotactl(int cmds, int type, int id, struct super_block *sb)
{
        int rc;

        rc = call_int_hook(quotactl, 0, cmds, type, id, sb);
        return rc;
}

/*
 * Call the quota_on hook of each registered LSM with @dentry. Returns the
 * first non-zero hook result, or 0 if there is none.
 */
int security_quota_on(struct dentry *dentry)
{
        int rc;

        rc = call_int_hook(quota_on, 0, dentry);
        return rc;
}

/*
 * Call the syslog hook of each registered LSM with @type. Returns the
 * first non-zero hook result, or 0 if there is none.
 */
int security_syslog(int type)
{
        int rc;

        rc = call_int_hook(syslog, 0, type);
        return rc;
}

/*
 * Call the settime hook of each registered LSM with @ts and @tz. Returns
 * the first non-zero hook result, or 0 if there is none.
 */
int security_settime64(const struct timespec64 *ts, const struct timezone *tz)
{
        int rc;

        rc = call_int_hook(settime, 0, ts, tz);
        return rc;
}

/*
 * Poll the vm_enough_memory hooks and then call __vm_enough_memory() for
 * @mm and @pages. Returns the result of __vm_enough_memory().
 */
int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
{
        struct security_hook_list *hp;
        int grant_sys_admin = 1;

        /*
         * A module answers with a positive value when it thinks
         * __vm_enough_memory() should be called with cap_sys_admin set.
         * The flag stays set only if every module agrees; the first
         * module answering zero or less clears it and ends the poll.
         */
        hlist_for_each_entry(hp, &security_hook_heads.vm_enough_memory, list) {
                if (hp->hook.vm_enough_memory(mm, pages) <= 0) {
                        grant_sys_admin = 0;
                        break;
                }
        }

        return __vm_enough_memory(mm, pages, grant_sys_admin);
}

/*
 * Call the bprm_creds_for_exec hook of each registered LSM with @bprm.
 * Returns the first non-zero hook result, or 0 if there is none.
 */
int security_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        int rc;

        rc = call_int_hook(bprm_creds_for_exec, 0, bprm);
        return rc;
}

/*
 * Call the bprm_creds_from_file hook of each registered LSM with @bprm and
 * @file. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
{
        int rc;

        rc = call_int_hook(bprm_creds_from_file, 0, bprm, file);
        return rc;
}

/*
 * Call the bprm_check_security hook of each registered LSM with @bprm.
 * A non-zero hook result is returned as is; only when all hooks pass is
 * ima_bprm_check() consulted, and its result returned.
 */
int security_bprm_check(struct linux_binprm *bprm)
{
        int ret = call_int_hook(bprm_check_security, 0, bprm);

        return ret ? ret : ima_bprm_check(bprm);
}

/*
 * Call the bprm_committing_creds hook of every registered LSM with @bprm.
 * The hooks return nothing, so all of them are always called.
 */
void security_bprm_committing_creds(struct linux_binprm *bprm)
{
        call_void_hook(bprm_committing_creds, bprm);
}

/*
 * Call the bprm_committed_creds hook of every registered LSM with @bprm.
 * The hooks return nothing, so all of them are always called.
 */
void security_bprm_committed_creds(struct linux_binprm *bprm)
{
        call_void_hook(bprm_committed_creds, bprm);
}

/*
 * Call the fs_context_dup hook of each registered LSM with @fc and
 * @src_fc. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
        int rc;

        rc = call_int_hook(fs_context_dup, 0, fc, src_fc);
        return rc;
}

/*
 * Offer @param of @fc to the fs_context_parse_param hook of every
 * registered LSM.
 *
 * A hook returning anything other than 0 or -ENOPARAM ends the walk and
 * that value is returned at once. Otherwise the result is 0 if at least
 * one hook returned 0, and -ENOPARAM if none did (including when no hook
 * is registered).
 */
int security_fs_context_parse_param(struct fs_context *fc,
                                    struct fs_parameter *param)
{
        struct security_hook_list *hp;
        int rc = -ENOPARAM;

        hlist_for_each_entry(hp, &security_hook_heads.fs_context_parse_param,
                             list) {
                int trc = hp->hook.fs_context_parse_param(fc, param);

                if (trc != 0 && trc != -ENOPARAM)
                        return trc;
                if (trc == 0)
                        rc = 0;
        }

        return rc;
}

/*
 * Call the sb_alloc_security hook of each registered LSM with @sb.
 * Returns the first non-zero hook result, or 0 if there is none.
 */
int security_sb_alloc(struct super_block *sb)
{
        int rc;

        rc = call_int_hook(sb_alloc_security, 0, sb);
        return rc;
}

/*
 * Call the sb_free_security hook of every registered LSM with @sb.
 * The hooks return nothing, so all of them are always called.
 */
void security_sb_free(struct super_block *sb)
{
        call_void_hook(sb_free_security, sb);
}

/*
 * Hand *@mnt_opts to the sb_free_mnt_opts hook of every registered LSM and
 * then reset *@mnt_opts to NULL. Does nothing when *@mnt_opts is already
 * NULL.
 */
void security_free_mnt_opts(void **mnt_opts)
{
        if (*mnt_opts == NULL)
                return;

        call_void_hook(sb_free_mnt_opts, *mnt_opts);

        /* Clear the caller's pointer so a second call is a no-op. */
        *mnt_opts = NULL;
}
EXPORT_SYMBOL(security_free_mnt_opts);

/*
 * Call the sb_eat_lsm_opts hook of each registered LSM with @options and
 * @mnt_opts. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        int rc;

        rc = call_int_hook(sb_eat_lsm_opts, 0, options, mnt_opts);
        return rc;
}
EXPORT_SYMBOL(security_sb_eat_lsm_opts);

/*
 * Call the sb_remount hook of each registered LSM with @sb and @mnt_opts.
 * Returns the first non-zero hook result, or 0 if there is none.
 */
int security_sb_remount(struct super_block *sb,
                        void *mnt_opts)
{
        int rc;

        rc = call_int_hook(sb_remount, 0, sb, mnt_opts);
        return rc;
}
EXPORT_SYMBOL(security_sb_remount);

/*
 * Call the sb_kern_mount hook of each registered LSM with @sb. Returns the
 * first non-zero hook result, or 0 if there is none.
 */
int security_sb_kern_mount(struct super_block *sb)
{
        int rc;

        rc = call_int_hook(sb_kern_mount, 0, sb);
        return rc;
}

/*
 * Call the sb_show_options hook of each registered LSM with @m and @sb.
 * Returns the first non-zero hook result, or 0 if there is none.
 */
int security_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        int rc;

        rc = call_int_hook(sb_show_options, 0, m, sb);
        return rc;
}

/*
 * Call the sb_statfs hook of each registered LSM with @dentry. Returns the
 * first non-zero hook result, or 0 if there is none.
 */
int security_sb_statfs(struct dentry *dentry)
{
        int rc;

        rc = call_int_hook(sb_statfs, 0, dentry);
        return rc;
}

/*
 * Call the sb_mount hook of each registered LSM, passing all five
 * arguments through unchanged. Returns the first non-zero hook result, or
 * 0 if there is none.
 */
int security_sb_mount(const char *dev_name, const struct path *path,
                       const char *type, unsigned long flags, void *data)
{
        int rc;

        rc = call_int_hook(sb_mount, 0, dev_name, path, type, flags, data);
        return rc;
}

/*
 * Call the sb_umount hook of each registered LSM with @mnt and @flags.
 * Returns the first non-zero hook result, or 0 if there is none.
 */
int security_sb_umount(struct vfsmount *mnt, int flags)
{
        int rc;

        rc = call_int_hook(sb_umount, 0, mnt, flags);
        return rc;
}

/*
 * Call the sb_pivotroot hook of each registered LSM with @old_path and
 * @new_path. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_sb_pivotroot(const struct path *old_path, const struct path *new_path)
{
        int rc;

        rc = call_int_hook(sb_pivotroot, 0, old_path, new_path);
        return rc;
}

/*
 * Call the sb_set_mnt_opts hook of each registered LSM, passing all four
 * arguments through unchanged. Returns the first non-zero hook result, or
 * 0 if every hook returns 0.
 *
 * When no LSM registers the hook the result is -EOPNOTSUPP if @mnt_opts is
 * non-NULL, and 0 otherwise.
 */
int security_sb_set_mnt_opts(struct super_block *sb,
                                void *mnt_opts,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        int rc;

        rc = call_int_hook(sb_set_mnt_opts,
                           mnt_opts ? -EOPNOTSUPP : 0, sb,
                           mnt_opts, kern_flags, set_kern_flags);
        return rc;
}
EXPORT_SYMBOL(security_sb_set_mnt_opts);

/*
 * Call the sb_clone_mnt_opts hook of each registered LSM, passing all four
 * arguments through unchanged. Returns the first non-zero hook result, or
 * 0 if there is none.
 */
int security_sb_clone_mnt_opts(const struct super_block *oldsb,
                                struct super_block *newsb,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        int rc;

        rc = call_int_hook(sb_clone_mnt_opts, 0, oldsb, newsb,
                           kern_flags, set_kern_flags);
        return rc;
}
EXPORT_SYMBOL(security_sb_clone_mnt_opts);

/*
 * Call the sb_add_mnt_opt hook of each registered LSM with @option, @val,
 * @len and @mnt_opts. Returns the first non-zero hook result, or 0 if
 * every hook returns 0. When no LSM registers the hook the result is
 * -EINVAL.
 */
int security_add_mnt_opt(const char *option, const char *val, int len,
                         void **mnt_opts)
{
        int rc;

        rc = call_int_hook(sb_add_mnt_opt, -EINVAL,
                           option, val, len, mnt_opts);
        return rc;
}
EXPORT_SYMBOL(security_add_mnt_opt);

/*
 * Call the move_mount hook of each registered LSM with @from_path and
 * @to_path. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_move_mount(const struct path *from_path, const struct path *to_path)
{
        int rc;

        rc = call_int_hook(move_mount, 0, from_path, to_path);
        return rc;
}

/*
 * Call the path_notify hook of each registered LSM with @path, @mask and
 * @obj_type. Returns the first non-zero hook result, or 0 if there is none.
 */
int security_path_notify(const struct path *path, u64 mask,
                                unsigned int obj_type)
{
        int rc;

        rc = call_int_hook(path_notify, 0, path, mask, obj_type);
        return rc;
}

/*
 * Allocate the composite inode blob for @inode and then let each LSM set
 * up its part through the inode_alloc_security hook. If a hook fails,
 * security_inode_free() is called to undo the work.
 *
 * Returns 0 on success, the error from lsm_inode_alloc(), or the first
 * non-zero hook result.
 */
int security_inode_alloc(struct inode *inode)
{
        int rc;

        rc = lsm_inode_alloc(inode);
        if (unlikely(rc))
                goto out;

        rc = call_int_hook(inode_alloc_security, 0, inode);
        if (unlikely(rc))
                security_inode_free(inode);
out:
        return rc;
}

/*
 * RCU callback that returns an inode blob to lsm_inode_cache. @head is
 * passed to kmem_cache_free() directly because the rcu head sits at the
 * start of the inode blob.
 */
static void inode_free_by_rcu(struct rcu_head *head)
{
        void *blob = head;

        kmem_cache_free(lsm_inode_cache, blob);
}

/*
 * Release the security state of @inode: call integrity_inode_free(), run
 * the inode_free_security hook of every LSM, and queue the composite blob
 * (if there is one) for freeing via RCU.
 */
void security_inode_free(struct inode *inode)
{
        integrity_inode_free(inode);
        call_void_hook(inode_free_security, inode);

        if (!inode->i_security)
                return;

        /*
         * The inode may still be referenced in a path walk and
         * a call to security_inode_permission() can be made
         * after inode_free_security() is called. Ideally, the VFS
         * wouldn't do this, but fixing that is a much harder
         * job. For now, simply free the i_security via RCU, and
         * leave the current inode->i_security pointer intact.
         * The inode will be freed after the RCU grace period too.
         */
        call_rcu((struct rcu_head *)inode->i_security, inode_free_by_rcu);
}

/*
 * Invoke the dentry_init_security LSM hook with @dentry, @mode, @name, @ctx
 * and @ctxlen passed through unchanged.  Returns the result of
 * call_int_hook(), which is given -EOPNOTSUPP as its default value.
 */
int security_dentry_init_security(struct dentry *dentry, int mode,
                                        const struct qstr *name, void **ctx,
                                        u32 *ctxlen)
{
        return call_int_hook(dentry_init_security, -EOPNOTSUPP, dentry, mode,
                                name, ctx, ctxlen);
}
EXPORT_SYMBOL(security_dentry_init_security);

/*
 * Invoke the dentry_create_files_as LSM hook with @dentry, @mode, @name,
 * @old and @new passed through unchanged.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_dentry_create_files_as(struct dentry *dentry, int mode,
                                    struct qstr *name,
                                    const struct cred *old, struct cred *new)
{
        return call_int_hook(dentry_create_files_as, 0, dentry, mode,
                                name, old, new);
}
EXPORT_SYMBOL(security_dentry_create_files_as);

/**
 * security_inode_init_security() - collect and install initial security xattrs
 * @inode: inode being created
 * @dir: parent directory inode
 * @qstr: name component, forwarded to the hook
 * @initxattrs: callback that receives the collected xattr array; may be NULL
 * @fs_data: opaque cookie handed to @initxattrs
 *
 * Private inodes are skipped.  Without an @initxattrs callback only the
 * inode_init_security hook is run, with NULL output pointers, and its result
 * is returned unmodified.  Otherwise the hook fills slot 0 of a zeroed local
 * xattr array, evm_inode_init_security() is given slot 1, and the array is
 * passed to @initxattrs.  Each step runs only if the previous one returned 0.
 * Every value stored in the array is kfree()d before returning.
 *
 * Return: 0 on success or when the final status is -EOPNOTSUPP, otherwise the
 * first error encountered.
 */
int security_inode_init_security(struct inode *inode, struct inode *dir,
                                 const struct qstr *qstr,
                                 const initxattrs initxattrs, void *fs_data)
{
        struct xattr new_xattrs[MAX_LSM_EVM_XATTR + 1];
        struct xattr *lsm_xattr, *evm_xattr, *xattr;
        int ret;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        if (!initxattrs)
                return call_int_hook(inode_init_security, -EOPNOTSUPP, inode,
                                     dir, qstr, NULL, NULL, NULL);

        /* The zero fill doubles as the terminator for the cleanup loop. */
        memset(new_xattrs, 0, sizeof(new_xattrs));
        lsm_xattr = &new_xattrs[0];
        evm_xattr = &new_xattrs[1];

        ret = call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir, qstr,
                            &lsm_xattr->name,
                            &lsm_xattr->value,
                            &lsm_xattr->value_len);
        if (!ret)
                ret = evm_inode_init_security(inode, lsm_xattr, evm_xattr);
        if (!ret)
                ret = initxattrs(inode, new_xattrs, fs_data);

        /* Release every value that was stored, on success and failure alike. */
        for (xattr = new_xattrs; xattr->value != NULL; xattr++)
                kfree(xattr->value);

        if (ret == -EOPNOTSUPP)
                return 0;
        return ret;
}
EXPORT_SYMBOL(security_inode_init_security);

/**
 * security_old_inode_init_security() - run inode_init_security for one xattr
 * @inode: inode being created
 * @dir: parent directory inode
 * @qstr: name component, forwarded to the hook
 * @name: output pointer forwarded to the hook
 * @value: output pointer forwarded to the hook
 * @len: output pointer forwarded to the hook
 *
 * Return: -EOPNOTSUPP for private inodes, otherwise the result of
 * call_int_hook(), which is given -EOPNOTSUPP as its default value.
 */
int security_old_inode_init_security(struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr, const char **name,
                                     void **value, size_t *len)
{
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                rc = -EOPNOTSUPP;
        else
                rc = call_int_hook(inode_init_security, -EOPNOTSUPP, inode,
                                   dir, qstr, name, value, len);
        return rc;
}
EXPORT_SYMBOL(security_old_inode_init_security);

#ifdef CONFIG_SECURITY_PATH
/*
 * Invoke the path_mknod LSM hook unless the backing inode of @dir is
 * private, in which case 0 is returned without consulting any module.
 */
int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode,
                        unsigned int dev)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_mknod, 0, dir, dentry, mode, dev);
        return rc;
}
EXPORT_SYMBOL(security_path_mknod);

/*
 * Invoke the path_mkdir LSM hook unless the backing inode of @dir is
 * private, in which case 0 is returned without consulting any module.
 */
int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_mkdir, 0, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL(security_path_mkdir);

/*
 * Invoke the path_rmdir LSM hook unless the backing inode of @dir is
 * private, in which case 0 is returned without consulting any module.
 */
int security_path_rmdir(const struct path *dir, struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_rmdir, 0, dir, dentry);
        return rc;
}

/*
 * Invoke the path_unlink LSM hook unless the backing inode of @dir is
 * private, in which case 0 is returned without consulting any module.
 */
int security_path_unlink(const struct path *dir, struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_unlink, 0, dir, dentry);
        return rc;
}
EXPORT_SYMBOL(security_path_unlink);

/*
 * Invoke the path_symlink LSM hook unless the backing inode of @dir is
 * private, in which case 0 is returned without consulting any module.
 */
int security_path_symlink(const struct path *dir, struct dentry *dentry,
                          const char *old_name)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_symlink, 0, dir, dentry, old_name);
        return rc;
}

/*
 * Invoke the path_link LSM hook unless the backing inode of @old_dentry is
 * private, in which case 0 is returned without consulting any module.
 */
int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
                       struct dentry *new_dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_link, 0, old_dentry, new_dir,
                                   new_dentry);
        return rc;
}

/**
 * security_path_rename() - run the path_rename hook for a rename
 * @old_dir: parent path of the source
 * @old_dentry: source dentry
 * @new_dir: parent path of the target
 * @new_dentry: target dentry
 * @flags: rename flags; only RENAME_EXCHANGE is examined here
 *
 * Nothing is checked when the source inode is private, or when the target is
 * positive and its inode is private.  For RENAME_EXCHANGE the hook is first
 * run with source and target swapped; a non-zero result from that call is
 * returned immediately.
 *
 * Return: 0 if skipped, otherwise the result of the hook chain.
 */
int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
                         const struct path *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        if (flags & RENAME_EXCHANGE) {
                /* An exchange also moves the target, so check that direction. */
                rc = call_int_hook(path_rename, 0, new_dir, new_dentry,
                                   old_dir, old_dentry);
                if (rc)
                        return rc;
        }

        rc = call_int_hook(path_rename, 0, old_dir, old_dentry, new_dir,
                           new_dentry);
        return rc;
}
EXPORT_SYMBOL(security_path_rename);

/*
 * Invoke the path_truncate LSM hook unless the backing inode of @path is
 * private, in which case 0 is returned without consulting any module.
 */
int security_path_truncate(const struct path *path)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_truncate, 0, path);
        return rc;
}

/*
 * Invoke the path_chmod LSM hook unless the backing inode of @path is
 * private, in which case 0 is returned without consulting any module.
 */
int security_path_chmod(const struct path *path, umode_t mode)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_chmod, 0, path, mode);
        return rc;
}

/*
 * Invoke the path_chown LSM hook unless the backing inode of @path is
 * private, in which case 0 is returned without consulting any module.
 */
int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = 0;
        else
                rc = call_int_hook(path_chown, 0, path, uid, gid);
        return rc;
}

/*
 * Invoke the path_chroot LSM hook for @path.  Unlike the neighbouring path
 * wrappers there is no private-inode shortcut here.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_path_chroot(const struct path *path)
{
        return call_int_hook(path_chroot, 0, path);
}
#endif

/*
 * Invoke the inode_create LSM hook unless @dir is a private inode, in which
 * case 0 is returned without consulting any module.
 */
int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int rc;

        if (unlikely(IS_PRIVATE(dir)))
                rc = 0;
        else
                rc = call_int_hook(inode_create, 0, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_create);

/*
 * Invoke the inode_link LSM hook unless the backing inode of @old_dentry is
 * private, in which case 0 is returned without consulting any module.
 */
int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                         struct dentry *new_dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_link, 0, old_dentry, dir, new_dentry);
        return rc;
}

/*
 * Invoke the inode_unlink LSM hook unless the backing inode of @dentry is
 * private, in which case 0 is returned without consulting any module.
 */
int security_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_unlink, 0, dir, dentry);
        return rc;
}

/*
 * Invoke the inode_symlink LSM hook unless @dir is a private inode, in which
 * case 0 is returned without consulting any module.
 */
int security_inode_symlink(struct inode *dir, struct dentry *dentry,
                            const char *old_name)
{
        int rc;

        if (unlikely(IS_PRIVATE(dir)))
                rc = 0;
        else
                rc = call_int_hook(inode_symlink, 0, dir, dentry, old_name);
        return rc;
}

/*
 * Invoke the inode_mkdir LSM hook unless @dir is a private inode, in which
 * case 0 is returned without consulting any module.
 */
int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int rc;

        if (unlikely(IS_PRIVATE(dir)))
                rc = 0;
        else
                rc = call_int_hook(inode_mkdir, 0, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_mkdir);

/*
 * Invoke the inode_rmdir LSM hook unless the backing inode of @dentry is
 * private, in which case 0 is returned without consulting any module.
 */
int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_rmdir, 0, dir, dentry);
        return rc;
}

/*
 * Invoke the inode_mknod LSM hook unless @dir is a private inode, in which
 * case 0 is returned without consulting any module.
 */
int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        int rc;

        if (unlikely(IS_PRIVATE(dir)))
                rc = 0;
        else
                rc = call_int_hook(inode_mknod, 0, dir, dentry, mode, dev);
        return rc;
}

/**
 * security_inode_rename() - run the inode_rename hook for a rename
 * @old_dir: parent inode of the source
 * @old_dentry: source dentry
 * @new_dir: parent inode of the target
 * @new_dentry: target dentry
 * @flags: rename flags; only RENAME_EXCHANGE is examined here
 *
 * Nothing is checked when the source inode is private, or when the target is
 * positive and its inode is private.  For RENAME_EXCHANGE the hook is first
 * run with source and target swapped; a non-zero result from that call is
 * returned immediately.
 *
 * Return: 0 if skipped, otherwise the result of the hook chain.
 */
int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry,
                           unsigned int flags)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        if (flags & RENAME_EXCHANGE) {
                /* An exchange also moves the target, so check that direction. */
                rc = call_int_hook(inode_rename, 0, new_dir, new_dentry,
                                   old_dir, old_dentry);
                if (rc)
                        return rc;
        }

        rc = call_int_hook(inode_rename, 0, old_dir, old_dentry,
                           new_dir, new_dentry);
        return rc;
}

/*
 * Invoke the inode_readlink LSM hook unless the backing inode of @dentry is
 * private, in which case 0 is returned without consulting any module.
 */
int security_inode_readlink(struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_readlink, 0, dentry);
        return rc;
}

/*
 * Invoke the inode_follow_link LSM hook with @dentry, @inode and @rcu unless
 * @inode is private, in which case 0 is returned without consulting any
 * module.
 */
int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
                               bool rcu)
{
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                rc = 0;
        else
                rc = call_int_hook(inode_follow_link, 0, dentry, inode, rcu);
        return rc;
}

/*
 * Invoke the inode_permission LSM hook with @inode and @mask unless @inode
 * is private, in which case 0 is returned without consulting any module.
 */
int security_inode_permission(struct inode *inode, int mask)
{
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                rc = 0;
        else
                rc = call_int_hook(inode_permission, 0, inode, mask);
        return rc;
}

/**
 * security_inode_setattr() - run the setattr checks for a dentry
 * @dentry: dentry whose attributes are being changed
 * @attr: attribute change description, forwarded unchanged
 *
 * Private inodes are skipped.  Otherwise the inode_setattr hook runs first
 * and evm_inode_setattr() is consulted only if the hook chain returned 0.
 *
 * Return: 0 if skipped, otherwise the first non-zero result, or the result
 * of evm_inode_setattr().
 */
int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        rc = call_int_hook(inode_setattr, 0, dentry, attr);
        if (!rc)
                rc = evm_inode_setattr(dentry, attr);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_setattr);

/*
 * Invoke the inode_getattr LSM hook unless the backing inode of @path is
 * private, in which case 0 is returned without consulting any module.
 */
int security_inode_getattr(const struct path *path)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_getattr, 0, path);
        return rc;
}

/**
 * security_inode_setxattr() - run the checks for setting an xattr
 * @dentry: dentry the xattr is set on
 * @name: xattr name
 * @value: xattr value
 * @size: size of @value
 * @flags: flags forwarded to the hook and the capability check
 *
 * Private inodes are skipped.  The inode_setxattr hook is run with 1 as its
 * default; a result of exactly 1 causes cap_inode_setxattr() to be called
 * instead.  If that stage yields 0, ima_inode_setxattr() and then
 * evm_inode_setxattr() are consulted, stopping at the first non-zero result.
 *
 * Return: 0 if skipped or allowed, otherwise the first non-zero result.
 */
int security_inode_setxattr(struct dentry *dentry, const char *name,
                            const void *value, size_t size, int flags)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /*
         * SELinux and Smack fold the capability check into their own
         * hook, so every LSM providing this hook is assumed to do the
         * same.  The capability check therefore only runs when the
         * hook chain hands back the default of 1.
         */
        rc = call_int_hook(inode_setxattr, 1, dentry, name, value, size,
                           flags);
        if (rc == 1)
                rc = cap_inode_setxattr(dentry, name, value, size, flags);

        if (!rc)
                rc = ima_inode_setxattr(dentry, name, value, size);
        if (!rc)
                rc = evm_inode_setxattr(dentry, name, value, size);
        return rc;
}

/*
 * After an xattr has been set: invoke the inode_post_setxattr LSM hook and
 * then evm_inode_post_setxattr().  Nothing is done when the backing inode
 * of @dentry is private.
 */
void security_inode_post_setxattr(struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry)))) {
                call_void_hook(inode_post_setxattr, dentry, name, value, size,
                               flags);
                evm_inode_post_setxattr(dentry, name, value, size);
        }
}

/*
 * Invoke the inode_getxattr LSM hook with @dentry and @name unless the
 * backing inode of @dentry is private, in which case 0 is returned.
 */
int security_inode_getxattr(struct dentry *dentry, const char *name)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_getxattr, 0, dentry, name);
        return rc;
}

/*
 * Invoke the inode_listxattr LSM hook unless the backing inode of @dentry
 * is private, in which case 0 is returned without consulting any module.
 */
int security_inode_listxattr(struct dentry *dentry)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = 0;
        else
                rc = call_int_hook(inode_listxattr, 0, dentry);
        return rc;
}

/**
 * security_inode_removexattr() - run the checks for removing an xattr
 * @dentry: dentry the xattr is removed from
 * @name: xattr name
 *
 * Private inodes are skipped.  The inode_removexattr hook is run with 1 as
 * its default; a result of exactly 1 causes cap_inode_removexattr() to be
 * called instead.  If that stage yields 0, ima_inode_removexattr() and then
 * evm_inode_removexattr() are consulted, stopping at the first non-zero
 * result.
 *
 * Return: 0 if skipped or allowed, otherwise the first non-zero result.
 */
int security_inode_removexattr(struct dentry *dentry, const char *name)
{
        int rc;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /*
         * SELinux and Smack fold the capability check into their own
         * hook, so every LSM providing this hook is assumed to do the
         * same.  The capability check therefore only runs when the
         * hook chain hands back the default of 1.
         */
        rc = call_int_hook(inode_removexattr, 1, dentry, name);
        if (rc == 1)
                rc = cap_inode_removexattr(dentry, name);

        if (!rc)
                rc = ima_inode_removexattr(dentry, name);
        if (!rc)
                rc = evm_inode_removexattr(dentry, name);
        return rc;
}

/*
 * Invoke the inode_need_killpriv LSM hook for @dentry.  Returns the result
 * of call_int_hook(), which is given 0 as its default value.
 */
int security_inode_need_killpriv(struct dentry *dentry)
{
        return call_int_hook(inode_need_killpriv, 0, dentry);
}

/*
 * Invoke the inode_killpriv LSM hook for @dentry.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_inode_killpriv(struct dentry *dentry)
{
        return call_int_hook(inode_killpriv, 0, dentry);
}

/**
 * security_inode_getsecurity() - query a security attribute of an inode
 * @inode: inode to query
 * @name: attribute name, forwarded to the hooks
 * @buffer: output pointer, forwarded to the hooks
 * @alloc: forwarded to the hooks unchanged
 *
 * Walks the inode_getsecurity hooks and returns the first result that is
 * not the hook's default value.
 *
 * Return: LSM_RET_DEFAULT(inode_getsecurity) for private inodes or when no
 * hook produced anything else, otherwise that hook's result.
 */
int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
{
        struct security_hook_list *hp;
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                return LSM_RET_DEFAULT(inode_getsecurity);

        /* At most one module provides an attribute with a given name. */
        hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) {
                rc = hp->hook.inode_getsecurity(inode, name, buffer, alloc);
                if (rc == LSM_RET_DEFAULT(inode_getsecurity))
                        continue;
                return rc;
        }

        return LSM_RET_DEFAULT(inode_getsecurity);
}

/**
 * security_inode_setsecurity() - set a security attribute on an inode
 * @inode: inode to modify
 * @name: attribute name, forwarded to the hooks
 * @value: attribute value, forwarded to the hooks
 * @size: size of @value
 * @flags: forwarded to the hooks unchanged
 *
 * Walks the inode_setsecurity hooks and returns the first result that is
 * not the hook's default value.
 *
 * Return: LSM_RET_DEFAULT(inode_setsecurity) for private inodes or when no
 * hook produced anything else, otherwise that hook's result.
 */
int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
{
        struct security_hook_list *hp;
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                return LSM_RET_DEFAULT(inode_setsecurity);

        /* At most one module provides an attribute with a given name. */
        hlist_for_each_entry(hp, &security_hook_heads.inode_setsecurity, list) {
                rc = hp->hook.inode_setsecurity(inode, name, value, size,
                                                flags);
                if (rc == LSM_RET_DEFAULT(inode_setsecurity))
                        continue;
                return rc;
        }

        return LSM_RET_DEFAULT(inode_setsecurity);
}

/*
 * Invoke the inode_listsecurity LSM hook with @inode, @buffer and
 * @buffer_size unless @inode is private, in which case 0 is returned.
 */
int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
{
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                rc = 0;
        else
                rc = call_int_hook(inode_listsecurity, 0, inode, buffer,
                                   buffer_size);
        return rc;
}
EXPORT_SYMBOL(security_inode_listsecurity);

/**
 * security_inode_getsecid() - fetch the secid of an inode from the LSMs
 * @inode: inode to query
 * @secid: output; cleared to 0 before the inode_getsecid hooks run
 *
 * *@secid is zeroed first so that a caller never reads an uninitialized
 * value when no module fills it in.  This matches security_cred_getsecid(),
 * security_task_getsecid() and security_ipc_getsecid().
 */
void security_inode_getsecid(struct inode *inode, u32 *secid)
{
        *secid = 0;
        call_void_hook(inode_getsecid, inode, secid);
}

/*
 * Invoke the inode_copy_up LSM hook with @src and @new.  Returns the result
 * of call_int_hook(), which is given 0 as its default value.
 */
int security_inode_copy_up(struct dentry *src, struct cred **new)
{
        return call_int_hook(inode_copy_up, 0, src, new);
}
EXPORT_SYMBOL(security_inode_copy_up);

/**
 * security_inode_copy_up_xattr() - ask the LSMs about an xattr during copy-up
 * @name: xattr name
 *
 * Walks the inode_copy_up_xattr hooks and returns the first result that is
 * not the hook's default value.
 *
 * Return: that hook's result, or LSM_RET_DEFAULT(inode_copy_up_xattr) when
 * no hook produced anything else.
 */
int security_inode_copy_up_xattr(const char *name)
{
        struct security_hook_list *hp;
        int rc;

        /*
         * An implementation can return 0 (accept the xattr), 1 (discard
         * the xattr), -EOPNOTSUPP if it knows nothing about the xattr,
         * or any other error code in case of an error.
         */
        hlist_for_each_entry(hp,
                &security_hook_heads.inode_copy_up_xattr, list) {
                rc = hp->hook.inode_copy_up_xattr(name);
                if (rc == LSM_RET_DEFAULT(inode_copy_up_xattr))
                        continue;
                return rc;
        }

        return LSM_RET_DEFAULT(inode_copy_up_xattr);
}
EXPORT_SYMBOL(security_inode_copy_up_xattr);

/*
 * Invoke the kernfs_init_security LSM hook with @kn_dir and @kn.  Returns
 * the result of call_int_hook(), which is given 0 as its default value.
 */
int security_kernfs_init_security(struct kernfs_node *kn_dir,
                                  struct kernfs_node *kn)
{
        return call_int_hook(kernfs_init_security, 0, kn_dir, kn);
}

/**
 * security_file_permission() - run the file_permission checks
 * @file: file being accessed
 * @mask: access mask, forwarded unchanged
 *
 * The file_permission hook runs first; fsnotify_perm() is consulted only if
 * the hook chain returned 0.
 *
 * Return: the first non-zero result, or the result of fsnotify_perm().
 */
int security_file_permission(struct file *file, int mask)
{
        int rc;

        rc = call_int_hook(file_permission, 0, file, mask);
        if (!rc)
                rc = fsnotify_perm(file, mask);
        return rc;
}

/**
 * security_file_alloc() - set up the LSM state of a new file
 * @file: file being initialised
 *
 * lsm_file_alloc() runs first; only if it succeeds is the
 * file_alloc_security hook invoked.  If the hook reports an error, the
 * state is torn down again through security_file_free().
 *
 * Return: 0 on success, otherwise the error from lsm_file_alloc() or from
 * the hook.
 */
int security_file_alloc(struct file *file)
{
        int err;

        err = lsm_file_alloc(file);
        if (err)
                return err;

        err = call_int_hook(file_alloc_security, 0, file);
        if (!err)
                return 0;

        /* The hook refused the file: undo what has been set up so far. */
        security_file_free(file);
        return err;
}

/**
 * security_file_free() - release the LSM state attached to a file
 * @file: file being destroyed
 *
 * Invokes the file_free_security hook, then clears file->f_security and
 * returns the blob (if any) to lsm_file_cache.
 */
void security_file_free(struct file *file)
{
        void *blob;

        call_void_hook(file_free_security, file);

        blob = file->f_security;
        if (!blob)
                return;

        /* Drop the pointer before the memory goes back to the cache. */
        file->f_security = NULL;
        kmem_cache_free(lsm_file_cache, blob);
}

/*
 * Invoke the file_ioctl LSM hook with @file, @cmd and @arg.  Returns the
 * result of call_int_hook(), which is given 0 as its default value.
 */
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        return call_int_hook(file_ioctl, 0, file, cmd, arg);
}
EXPORT_SYMBOL_GPL(security_file_ioctl);

/**
 * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Compat version of security_file_ioctl() that correctly handles 32-bit
 * processes running on 64-bit kernels.  It invokes the file_ioctl_compat
 * hook rather than file_ioctl.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl_compat(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        return call_int_hook(file_ioctl_compat, 0, file, cmd, arg);
}
EXPORT_SYMBOL_GPL(security_file_ioctl_compat);

/**
 * mmap_prot() - work out the effective protection for a mapping
 * @file: file being mapped, or NULL
 * @prot: protection bits requested by the caller
 *
 * PROT_EXEC is added only when all of the following hold: @prot has
 * PROT_READ but not PROT_EXEC, the current task's personality has
 * READ_IMPLIES_EXEC set, and either @file is NULL or path_noexec() is false
 * for @file's path.  On !CONFIG_MMU builds a file that provides
 * mmap_capabilities() must additionally report NOMMU_MAP_EXEC.
 *
 * Return: @prot, possibly with PROT_EXEC added.
 */
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
        /*
         * Nothing to adjust unless PROT_READ was requested without
         * PROT_EXEC and the application expects read to imply exec.
         */
        if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ ||
            !(current->personality & READ_IMPLIES_EXEC))
                return prot;

        /* Anonymous mappings always get the implied PROT_EXEC. */
        if (!file)
                return prot | PROT_EXEC;

        /* Anything on a noexec mount never gets PROT_EXEC. */
        if (path_noexec(&file->f_path))
                return prot;

#ifndef CONFIG_MMU
        /* On !MMU we also need NOMMU_MAP_EXEC (== VM_MAYEXEC). */
        if (file->f_op->mmap_capabilities) {
                unsigned caps = file->f_op->mmap_capabilities(file);

                if (!(caps & NOMMU_MAP_EXEC))
                        return prot;
        }
#endif
        return prot | PROT_EXEC;
}

/**
 * security_mmap_file() - run the mmap checks for a file mapping
 * @file: file being mapped, may be NULL (forwarded as is)
 * @prot: protection requested by the caller
 * @flags: mapping flags, forwarded unchanged
 *
 * The mmap_file hook receives both the requested protection and the value
 * adjusted by mmap_prot().  ima_file_mmap() is consulted only if the hook
 * chain returned 0.
 *
 * Return: the first non-zero result, or the result of ima_file_mmap().
 */
int security_mmap_file(struct file *file, unsigned long prot,
                        unsigned long flags)
{
        unsigned long prot_adj;
        int rc;

        prot_adj = mmap_prot(file, prot);

        rc = call_int_hook(mmap_file, 0, file, prot, prot_adj, flags);
        if (!rc)
                rc = ima_file_mmap(file, prot, prot_adj, flags);
        return rc;
}

/*
 * Invoke the mmap_addr LSM hook for @addr.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_mmap_addr(unsigned long addr)
{
        return call_int_hook(mmap_addr, 0, addr);
}

/**
 * security_file_mprotect() - run the mprotect checks for a VMA
 * @vma: memory area being changed
 * @reqprot: protection forwarded to the hook only
 * @prot: protection forwarded to the hook and to IMA
 *
 * The file_mprotect hook runs first; ima_file_mprotect() is consulted only
 * if the hook chain returned 0.
 *
 * Return: the first non-zero result, or the result of ima_file_mprotect().
 */
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                            unsigned long prot)
{
        int rc;

        rc = call_int_hook(file_mprotect, 0, vma, reqprot, prot);
        if (!rc)
                rc = ima_file_mprotect(vma, prot);
        return rc;
}

/*
 * Invoke the file_lock LSM hook with @file and @cmd.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_file_lock(struct file *file, unsigned int cmd)
{
        return call_int_hook(file_lock, 0, file, cmd);
}

/*
 * Invoke the file_fcntl LSM hook with @file, @cmd and @arg.  Returns the
 * result of call_int_hook(), which is given 0 as its default value.
 */
int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        return call_int_hook(file_fcntl, 0, file, cmd, arg);
}

/* Invoke the file_set_fowner LSM hook for @file; no value is returned. */
void security_file_set_fowner(struct file *file)
{
        call_void_hook(file_set_fowner, file);
}

/*
 * Invoke the file_send_sigiotask LSM hook with @tsk, @fown and @sig.
 * Returns the result of call_int_hook(), which is given 0 as its default
 * value.
 */
int security_file_send_sigiotask(struct task_struct *tsk,
                                  struct fown_struct *fown, int sig)
{
        return call_int_hook(file_send_sigiotask, 0, tsk, fown, sig);
}

/*
 * Invoke the file_receive LSM hook for @file.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_file_receive(struct file *file)
{
        return call_int_hook(file_receive, 0, file);
}

/**
 * security_file_open() - run the checks for opening a file
 * @file: file being opened
 *
 * The file_open hook runs first; fsnotify_perm() is consulted with MAY_OPEN
 * only if the hook chain returned 0.
 *
 * Return: the first non-zero result, or the result of fsnotify_perm().
 */
int security_file_open(struct file *file)
{
        int rc;

        rc = call_int_hook(file_open, 0, file);
        if (!rc)
                rc = fsnotify_perm(file, MAY_OPEN);
        return rc;
}

/**
 * security_task_alloc() - set up the LSM state of a new task
 * @task: task being created
 * @clone_flags: forwarded to the task_alloc hook
 *
 * lsm_task_alloc() runs first; only if it succeeds is the task_alloc hook
 * invoked.  If the hook reports an error, the state is torn down again
 * through security_task_free().
 *
 * Return: 0 on success, otherwise the error from lsm_task_alloc() or from
 * the hook.
 */
int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
{
        int err;

        err = lsm_task_alloc(task);
        if (err)
                return err;

        err = call_int_hook(task_alloc, 0, task, clone_flags);
        if (!err)
                return 0;

        /* The hook refused the task: undo what has been set up so far. */
        security_task_free(task);
        return err;
}

/*
 * Invoke the task_free LSM hook, then kfree() task->security and reset the
 * pointer to NULL.
 */
void security_task_free(struct task_struct *task)
{
        void *blob;

        call_void_hook(task_free, task);

        blob = task->security;
        kfree(blob);
        task->security = NULL;
}

/**
 * security_cred_alloc_blank() - set up the LSM state of a blank cred
 * @cred: credentials being initialised
 * @gfp: allocation flags, forwarded to lsm_cred_alloc() and the hook
 *
 * lsm_cred_alloc() runs first; only if it succeeds is the cred_alloc_blank
 * hook invoked.  If the hook reports an error, the state is torn down again
 * through security_cred_free().
 *
 * Return: 0 on success, otherwise the error from lsm_cred_alloc() or from
 * the hook.
 */
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
        int err;

        err = lsm_cred_alloc(cred, gfp);
        if (err)
                return err;

        err = call_int_hook(cred_alloc_blank, 0, cred, gfp);
        if (!err)
                return 0;

        /* The hook refused the cred: undo what has been set up so far. */
        security_cred_free(cred);
        return err;
}

/**
 * security_cred_free() - release the LSM state attached to a cred
 * @cred: credentials being destroyed
 *
 * Does nothing when cred->security is NULL.  Otherwise the cred_free hook
 * is invoked, the blob is kfree()d and the pointer reset to NULL.
 */
void security_cred_free(struct cred *cred)
{
        /*
         * A failure path in prepare_creds() can end up here before
         * ->security has been set; only proceed when a blob exists.
         */
        if (!unlikely(cred->security == NULL)) {
                call_void_hook(cred_free, cred);

                kfree(cred->security);
                cred->security = NULL;
        }
}

/**
 * security_prepare_creds() - set up the LSM state of a new cred from an old one
 * @new: credentials being prepared
 * @old: credentials forwarded to the hook as the source
 * @gfp: allocation flags, forwarded to lsm_cred_alloc() and the hook
 *
 * lsm_cred_alloc() runs first for @new; only if it succeeds is the
 * cred_prepare hook invoked.  If the hook reports an error, @new's state is
 * torn down again through security_cred_free().
 *
 * Return: 0 on success, otherwise the error from lsm_cred_alloc() or from
 * the hook.
 */
int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
{
        int err;

        err = lsm_cred_alloc(new, gfp);
        if (err)
                return err;

        err = call_int_hook(cred_prepare, 0, new, old, gfp);
        if (!err)
                return 0;

        /* The hook refused the cred: undo what has been set up so far. */
        security_cred_free(new);
        return err;
}

/* Invoke the cred_transfer LSM hook with @new and @old; no value is returned. */
void security_transfer_creds(struct cred *new, const struct cred *old)
{
        call_void_hook(cred_transfer, new, old);
}

/*
 * Fetch the secid for credentials @c.  *@secid is cleared to 0 before the
 * cred_getsecid hook is invoked, so it stays 0 if nothing writes to it.
 */
void security_cred_getsecid(const struct cred *c, u32 *secid)
{
        *secid = 0;
        call_void_hook(cred_getsecid, c, secid);
}
EXPORT_SYMBOL(security_cred_getsecid);

/*
 * Invoke the kernel_act_as LSM hook with @new and @secid.  Returns the
 * result of call_int_hook(), which is given 0 as its default value.
 */
int security_kernel_act_as(struct cred *new, u32 secid)
{
        return call_int_hook(kernel_act_as, 0, new, secid);
}

/*
 * Invoke the kernel_create_files_as LSM hook with @new and @inode.  Returns
 * the result of call_int_hook(), which is given 0 as its default value.
 */
int security_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        return call_int_hook(kernel_create_files_as, 0, new, inode);
}

/**
 * security_kernel_module_request() - run the checks for a module request
 * @kmod_name: module name, forwarded unchanged
 *
 * The kernel_module_request hook runs first;
 * integrity_kernel_module_request() is consulted only if the hook chain
 * returned 0.
 *
 * Return: the first non-zero result, or the result of
 * integrity_kernel_module_request().
 */
int security_kernel_module_request(char *kmod_name)
{
        int rc;

        rc = call_int_hook(kernel_module_request, 0, kmod_name);
        if (!rc)
                rc = integrity_kernel_module_request(kmod_name);
        return rc;
}

/**
 * security_kernel_read_file() - run the checks before the kernel reads a file
 * @file: file about to be read
 * @id: kind of file being read
 * @contents: forwarded unchanged to the hook and to IMA
 *
 * The kernel_read_file hook runs first; ima_read_file() is consulted only
 * if the hook chain returned 0.
 *
 * Return: the first non-zero result, or the result of ima_read_file().
 */
int security_kernel_read_file(struct file *file, enum kernel_read_file_id id,
                              bool contents)
{
        int rc;

        rc = call_int_hook(kernel_read_file, 0, file, id, contents);
        if (!rc)
                rc = ima_read_file(file, id, contents);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_read_file);

/**
 * security_kernel_post_read_file() - run the checks after the kernel read a file
 * @file: file that was read
 * @buf: buffer forwarded to the hook and to IMA
 * @size: size forwarded together with @buf
 * @id: kind of file that was read
 *
 * The kernel_post_read_file hook runs first; ima_post_read_file() is
 * consulted only if the hook chain returned 0.
 *
 * Return: the first non-zero result, or the result of ima_post_read_file().
 */
int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
                                   enum kernel_read_file_id id)
{
        int rc;

        rc = call_int_hook(kernel_post_read_file, 0, file, buf, size, id);
        if (!rc)
                rc = ima_post_read_file(file, buf, size, id);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_read_file);

/**
 * security_kernel_load_data() - run the checks before the kernel loads data
 * @id: kind of data being loaded
 * @contents: forwarded unchanged to the hook and to IMA
 *
 * The kernel_load_data hook runs first; ima_load_data() is consulted only
 * if the hook chain returned 0.
 *
 * Return: the first non-zero result, or the result of ima_load_data().
 */
int security_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        int rc;

        rc = call_int_hook(kernel_load_data, 0, id, contents);
        if (!rc)
                rc = ima_load_data(id, contents);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_load_data);

/**
 * security_kernel_post_load_data() - run the checks after the kernel loaded data
 * @buf: buffer forwarded to the hook and to IMA
 * @size: size forwarded together with @buf
 * @id: kind of data that was loaded
 * @description: forwarded unchanged to the hook and to IMA
 *
 * The kernel_post_load_data hook runs first; ima_post_load_data() is
 * consulted only if the hook chain returned 0.
 *
 * Return: the first non-zero result, or the result of ima_post_load_data().
 */
int security_kernel_post_load_data(char *buf, loff_t size,
                                   enum kernel_load_data_id id,
                                   char *description)
{
        int rc;

        rc = call_int_hook(kernel_post_load_data, 0, buf, size, id,
                           description);
        if (!rc)
                rc = ima_post_load_data(buf, size, id, description);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_load_data);

/*
 * Invoke the task_fix_setuid LSM hook with @new, @old and @flags.  Returns
 * the result of call_int_hook(), which is given 0 as its default value.
 */
int security_task_fix_setuid(struct cred *new, const struct cred *old,
                             int flags)
{
        return call_int_hook(task_fix_setuid, 0, new, old, flags);
}

/*
 * Invoke the task_fix_setgid LSM hook with @new, @old and @flags.  Returns
 * the result of call_int_hook(), which is given 0 as its default value.
 */
int security_task_fix_setgid(struct cred *new, const struct cred *old,
                                 int flags)
{
        return call_int_hook(task_fix_setgid, 0, new, old, flags);
}

/*
 * Invoke the task_setpgid LSM hook with @p and @pgid.  Returns the result
 * of call_int_hook(), which is given 0 as its default value.
 */
int security_task_setpgid(struct task_struct *p, pid_t pgid)
{
        return call_int_hook(task_setpgid, 0, p, pgid);
}

/*
 * Invoke the task_getpgid LSM hook for @p.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_task_getpgid(struct task_struct *p)
{
        return call_int_hook(task_getpgid, 0, p);
}

/*
 * Invoke the task_getsid LSM hook for @p.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_task_getsid(struct task_struct *p)
{
        return call_int_hook(task_getsid, 0, p);
}

/*
 * Fetch the secid for task @p.  *@secid is cleared to 0 before the
 * task_getsecid hook is invoked, so it stays 0 if nothing writes to it.
 */
void security_task_getsecid(struct task_struct *p, u32 *secid)
{
        *secid = 0;
        call_void_hook(task_getsecid, p, secid);
}
EXPORT_SYMBOL(security_task_getsecid);

/*
 * Invoke the task_setnice LSM hook with @p and @nice.  Returns the result
 * of call_int_hook(), which is given 0 as its default value.
 */
int security_task_setnice(struct task_struct *p, int nice)
{
        return call_int_hook(task_setnice, 0, p, nice);
}

/*
 * Invoke the task_setioprio LSM hook with @p and @ioprio.  Returns the
 * result of call_int_hook(), which is given 0 as its default value.
 */
int security_task_setioprio(struct task_struct *p, int ioprio)
{
        return call_int_hook(task_setioprio, 0, p, ioprio);
}

/*
 * Invoke the task_getioprio LSM hook for @p.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_task_getioprio(struct task_struct *p)
{
        return call_int_hook(task_getioprio, 0, p);
}

/*
 * Invoke the task_prlimit LSM hook with @cred, @tcred and @flags.  Returns
 * the result of call_int_hook(), which is given 0 as its default value.
 */
int security_task_prlimit(const struct cred *cred, const struct cred *tcred,
                          unsigned int flags)
{
        return call_int_hook(task_prlimit, 0, cred, tcred, flags);
}

/*
 * Invoke the task_setrlimit LSM hook with @p, @resource and @new_rlim.
 * Returns the result of call_int_hook(), which is given 0 as its default
 * value.
 */
int security_task_setrlimit(struct task_struct *p, unsigned int resource,
                struct rlimit *new_rlim)
{
        return call_int_hook(task_setrlimit, 0, p, resource, new_rlim);
}

/*
 * Invoke the task_setscheduler LSM hook for @p.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_task_setscheduler(struct task_struct *p)
{
        return call_int_hook(task_setscheduler, 0, p);
}

/*
 * Invoke the task_getscheduler LSM hook for @p.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_task_getscheduler(struct task_struct *p)
{
        return call_int_hook(task_getscheduler, 0, p);
}

/*
 * Invoke the task_movememory LSM hook for @p.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_task_movememory(struct task_struct *p)
{
        return call_int_hook(task_movememory, 0, p);
}

/*
 * Invoke the task_kill LSM hook with @p, @info, @sig and @cred.  Returns
 * the result of call_int_hook(), which is given 0 as its default value.
 */
int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                        int sig, const struct cred *cred)
{
        return call_int_hook(task_kill, 0, p, info, sig, cred);
}

/**
 * security_task_prctl() - let the LSMs handle a prctl() request
 * @option: prctl option
 * @arg2: second prctl argument
 * @arg3: third prctl argument
 * @arg4: fourth prctl argument
 * @arg5: fifth prctl argument
 *
 * Every task_prctl hook is called in list order.  A hook that returns the
 * hook's default value is ignored.  Any other result is recorded, and the
 * walk stops as soon as a recorded result is non-zero.
 *
 * Return: the last recorded result, or LSM_RET_DEFAULT(task_prctl) when
 * every hook returned the default.
 */
int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                         unsigned long arg4, unsigned long arg5)
{
        struct security_hook_list *hp;
        int rc = LSM_RET_DEFAULT(task_prctl);
        int thisrc;

        hlist_for_each_entry(hp, &security_hook_heads.task_prctl, list) {
                thisrc = hp->hook.task_prctl(option, arg2, arg3, arg4, arg5);
                if (thisrc == LSM_RET_DEFAULT(task_prctl))
                        continue;

                rc = thisrc;
                if (thisrc != 0)
                        break;
        }

        return rc;
}

/* Invoke the task_to_inode LSM hook with @p and @inode; no value is returned. */
void security_task_to_inode(struct task_struct *p, struct inode *inode)
{
        call_void_hook(task_to_inode, p, inode);
}

/*
 * Invoke the ipc_permission LSM hook with @ipcp and @flag.  Returns the
 * result of call_int_hook(), which is given 0 as its default value.
 */
int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        return call_int_hook(ipc_permission, 0, ipcp, flag);
}

/*
 * Fetch the secid for IPC object @ipcp.  *@secid is cleared to 0 before the
 * ipc_getsecid hook is invoked, so it stays 0 if nothing writes to it.
 */
void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
{
        *secid = 0;
        call_void_hook(ipc_getsecid, ipcp, secid);
}

/**
 * security_msg_msg_alloc() - set up the LSM state of a new message
 * @msg: message being initialised
 *
 * lsm_msg_msg_alloc() runs first; only if it succeeds is the
 * msg_msg_alloc_security hook invoked.  If the hook reports an error, the
 * state is torn down again through security_msg_msg_free().
 *
 * Return: 0 on success, otherwise the error from lsm_msg_msg_alloc() or
 * from the hook.
 */
int security_msg_msg_alloc(struct msg_msg *msg)
{
        int err;

        err = lsm_msg_msg_alloc(msg);
        if (unlikely(err))
                return err;

        err = call_int_hook(msg_msg_alloc_security, 0, msg);
        if (!err)
                return 0;

        /* The hook refused the message: undo what has been set up so far. */
        security_msg_msg_free(msg);
        return err;
}

/*
 * Invoke the msg_msg_free_security LSM hook, then kfree() msg->security and
 * reset the pointer to NULL.
 */
void security_msg_msg_free(struct msg_msg *msg)
{
        void *blob;

        call_void_hook(msg_msg_free_security, msg);

        blob = msg->security;
        kfree(blob);
        msg->security = NULL;
}

/**
 * security_msg_queue_alloc() - set up the LSM state of a new message queue
 * @msq: IPC permission structure of the queue
 *
 * lsm_ipc_alloc() runs first; only if it succeeds is the
 * msg_queue_alloc_security hook invoked.  If the hook reports an error, the
 * state is torn down again through security_msg_queue_free().
 *
 * Return: 0 on success, otherwise the error from lsm_ipc_alloc() or from
 * the hook.
 */
int security_msg_queue_alloc(struct kern_ipc_perm *msq)
{
        int err;

        err = lsm_ipc_alloc(msq);
        if (unlikely(err))
                return err;

        err = call_int_hook(msg_queue_alloc_security, 0, msq);
        if (!err)
                return 0;

        /* The hook refused the queue: undo what has been set up so far. */
        security_msg_queue_free(msq);
        return err;
}

/*
 * Invoke the msg_queue_free_security LSM hook, then kfree() msq->security
 * and reset the pointer to NULL.
 */
void security_msg_queue_free(struct kern_ipc_perm *msq)
{
        void *blob;

        call_void_hook(msg_queue_free_security, msq);

        blob = msq->security;
        kfree(blob);
        msq->security = NULL;
}

/*
 * Invoke the msg_queue_associate LSM hook with @msq and @msqflg.  Returns
 * the result of call_int_hook(), which is given 0 as its default value.
 */
int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        return call_int_hook(msg_queue_associate, 0, msq, msqflg);
}

/*
 * Invoke the msg_queue_msgctl LSM hook with @msq and @cmd.  Returns the
 * result of call_int_hook(), which is given 0 as its default value.
 */
int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        return call_int_hook(msg_queue_msgctl, 0, msq, cmd);
}

/*
 * Invoke the msg_queue_msgsnd LSM hook with @msq, @msg and @msqflg.
 * Returns the result of call_int_hook(), which is given 0 as its default
 * value.
 */
int security_msg_queue_msgsnd(struct kern_ipc_perm *msq,
                               struct msg_msg *msg, int msqflg)
{
        return call_int_hook(msg_queue_msgsnd, 0, msq, msg, msqflg);
}

/*
 * Invoke the msg_queue_msgrcv LSM hook with @msq, @msg, @target, @type and
 * @mode.  Returns the result of call_int_hook(), which is given 0 as its
 * default value.
 */
int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                               struct task_struct *target, long type, int mode)
{
        return call_int_hook(msg_queue_msgrcv, 0, msq, msg, target, type, mode);
}

/**
 * security_shm_alloc() - set up the LSM state of a new shared memory segment
 * @shp: IPC permission structure of the segment
 *
 * lsm_ipc_alloc() runs first; only if it succeeds is the shm_alloc_security
 * hook invoked.  If the hook reports an error, the state is torn down again
 * through security_shm_free().
 *
 * Return: 0 on success, otherwise the error from lsm_ipc_alloc() or from
 * the hook.
 */
int security_shm_alloc(struct kern_ipc_perm *shp)
{
        int err;

        err = lsm_ipc_alloc(shp);
        if (unlikely(err))
                return err;

        err = call_int_hook(shm_alloc_security, 0, shp);
        if (!err)
                return 0;

        /* The hook refused the segment: undo what has been set up so far. */
        security_shm_free(shp);
        return err;
}

/*
 * Invoke the shm_free_security LSM hook, then kfree() shp->security and
 * reset the pointer to NULL.
 */
void security_shm_free(struct kern_ipc_perm *shp)
{
        void *blob;

        call_void_hook(shm_free_security, shp);

        blob = shp->security;
        kfree(blob);
        shp->security = NULL;
}

/*
 * Invoke the shm_associate LSM hook with @shp and @shmflg.  Returns the
 * result of call_int_hook(), which is given 0 as its default value.
 */
int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        return call_int_hook(shm_associate, 0, shp, shmflg);
}

/*
 * Invoke the shm_shmctl LSM hook with @shp and @cmd.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        return call_int_hook(shm_shmctl, 0, shp, cmd);
}

/*
 * Invoke the shm_shmat LSM hook with @shp, @shmaddr and @shmflg.  Returns
 * the result of call_int_hook(), which is given 0 as its default value.
 */
int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg)
{
        return call_int_hook(shm_shmat, 0, shp, shmaddr, shmflg);
}

/**
 * security_sem_alloc() - set up the LSM state of a new semaphore set
 * @sma: IPC permission structure of the semaphore set
 *
 * lsm_ipc_alloc() runs first; only if it succeeds is the sem_alloc_security
 * hook invoked.  If the hook reports an error, the state is torn down again
 * through security_sem_free().
 *
 * Return: 0 on success, otherwise the error from lsm_ipc_alloc() or from
 * the hook.
 */
int security_sem_alloc(struct kern_ipc_perm *sma)
{
        int err;

        err = lsm_ipc_alloc(sma);
        if (unlikely(err))
                return err;

        err = call_int_hook(sem_alloc_security, 0, sma);
        if (!err)
                return 0;

        /* The hook refused the set: undo what has been set up so far. */
        security_sem_free(sma);
        return err;
}

/*
 * Invoke the sem_free_security LSM hook, then kfree() sma->security and
 * reset the pointer to NULL.
 */
void security_sem_free(struct kern_ipc_perm *sma)
{
        void *blob;

        call_void_hook(sem_free_security, sma);

        blob = sma->security;
        kfree(blob);
        sma->security = NULL;
}

/*
 * Invoke the sem_associate LSM hook with @sma and @semflg.  Returns the
 * result of call_int_hook(), which is given 0 as its default value.
 */
int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        return call_int_hook(sem_associate, 0, sma, semflg);
}

/*
 * Invoke the sem_semctl LSM hook with @sma and @cmd.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        return call_int_hook(sem_semctl, 0, sma, cmd);
}

/*
 * Invoke the sem_semop LSM hook with @sma, @sops, @nsops and @alter.
 * Returns the result of call_int_hook(), which is given 0 as its default
 * value.
 */
int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
                        unsigned nsops, int alter)
{
        return call_int_hook(sem_semop, 0, sma, sops, nsops, alter);
}

/*
 * Invoke the d_instantiate LSM hook with @dentry and @inode.  The hook is
 * skipped only when @inode is non-NULL and private; a NULL @inode is passed
 * on to the hook.
 */
void security_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        if (!unlikely(inode && IS_PRIVATE(inode)))
                call_void_hook(d_instantiate, dentry, inode);
}
EXPORT_SYMBOL(security_d_instantiate);

/**
 * security_getprocattr() - read a process attribute from one LSM
 * @p: task to query
 * @lsm: name of the module to ask, or NULL to take the first one listed
 * @name: attribute name
 * @value: output pointer, forwarded to the hook
 *
 * Only the first getprocattr hook whose module name matches @lsm is called
 * (any hook matches when @lsm is NULL).
 *
 * Return: that hook's result, or LSM_RET_DEFAULT(getprocattr) when no hook
 * matched.
 */
int security_getprocattr(struct task_struct *p, const char *lsm, char *name,
                                char **value)
{
        struct security_hook_list *hp;

        hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) {
                if (lsm == NULL || strcmp(lsm, hp->lsm) == 0)
                        return hp->hook.getprocattr(p, name, value);
        }

        return LSM_RET_DEFAULT(getprocattr);
}

/**
 * security_setprocattr() - write a process attribute through one LSM
 * @lsm: name of the module to ask, or NULL to take the first one listed
 * @name: attribute name
 * @value: attribute value
 * @size: size of @value
 *
 * Only the first setprocattr hook whose module name matches @lsm is called
 * (any hook matches when @lsm is NULL).
 *
 * Return: that hook's result, or LSM_RET_DEFAULT(setprocattr) when no hook
 * matched.
 */
int security_setprocattr(const char *lsm, const char *name, void *value,
                         size_t size)
{
        struct security_hook_list *hp;

        hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) {
                if (lsm == NULL || strcmp(lsm, hp->lsm) == 0)
                        return hp->hook.setprocattr(name, value, size);
        }

        return LSM_RET_DEFAULT(setprocattr);
}

/*
 * Invoke the netlink_send LSM hook with @sk and @skb.  Returns the result
 * of call_int_hook(), which is given 0 as its default value.
 */
int security_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        return call_int_hook(netlink_send, 0, sk, skb);
}

/*
 * Invoke the ismaclabel LSM hook for @name.  Returns the result of
 * call_int_hook(), which is given 0 as its default value.
 */
int security_ismaclabel(const char *name)
{
        return call_int_hook(ismaclabel, 0, name);
}
EXPORT_SYMBOL(security_ismaclabel);

/**
 * security_secid_to_secctx() - translate a secid into a security context
 * @secid: secid to translate
 * @secdata: output pointer, forwarded to the hook
 * @seclen: output pointer, forwarded to the hook
 *
 * Walks the secid_to_secctx hooks and returns the first result that is not
 * the hook's default value.
 *
 * Return: that hook's result, or LSM_RET_DEFAULT(secid_to_secctx) when no
 * hook produced anything else.
 */
int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
{
        struct security_hook_list *hp;
        int rc;

        /*
         * Currently only one LSM can implement secid_to_secctx, i.e.
         * this hook is not "stackable".
         */
        hlist_for_each_entry(hp, &security_hook_heads.secid_to_secctx, list) {
                rc = hp->hook.secid_to_secctx(secid, secdata, seclen);
                if (rc == LSM_RET_DEFAULT(secid_to_secctx))
                        continue;
                return rc;
        }

        return LSM_RET_DEFAULT(secid_to_secctx);
}
EXPORT_SYMBOL(security_secid_to_secctx);

/*
 * Translate a security context into a secid.  *@secid is cleared to 0
 * before the secctx_to_secid hook is invoked with @secdata, @seclen and
 * @secid.  Returns the result of call_int_hook(), which is given 0 as its
 * default value.
 */
int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        *secid = 0;
        return call_int_hook(secctx_to_secid, 0, secdata, seclen, secid);
}
EXPORT_SYMBOL(security_secctx_to_secid);

/*
 * Pass @secdata and @seclen to call_void_hook() for the release_secctx
 * LSM hook.  Nothing is reported back to the caller.
 */
void security_release_secctx(char *secdata, u32 seclen)
{
        call_void_hook(release_secctx, secdata, seclen);
}
EXPORT_SYMBOL(security_release_secctx);

/*
 * Pass @inode to call_void_hook() for the inode_invalidate_secctx LSM hook.
 * Nothing is reported back to the caller.
 */
void security_inode_invalidate_secctx(struct inode *inode)
{
        call_void_hook(inode_invalidate_secctx, inode);
}
EXPORT_SYMBOL(security_inode_invalidate_secctx);

/*
 * Run the inode_notifysecctx LSM hook with @inode, @ctx and @ctxlen and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        int rc;

        rc = call_int_hook(inode_notifysecctx, 0, inode, ctx, ctxlen);
        return rc;
}
EXPORT_SYMBOL(security_inode_notifysecctx);

/*
 * Run the inode_setsecctx LSM hook with @dentry, @ctx and @ctxlen and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        int rc;

        rc = call_int_hook(inode_setsecctx, 0, dentry, ctx, ctxlen);
        return rc;
}
EXPORT_SYMBOL(security_inode_setsecctx);

/*
 * security_inode_getsecctx() - obtain a security context for @inode.
 *
 * Each inode_getsecctx hook is tried in list order; the first return value
 * that differs from LSM_RET_DEFAULT(inode_getsecctx) ends the walk and is
 * returned.  Otherwise the default value is returned.
 */
int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
        struct security_hook_list *entry;
        int ret = LSM_RET_DEFAULT(inode_getsecctx);

        /* A security context is supplied by one module only. */
        hlist_for_each_entry(entry, &security_hook_heads.inode_getsecctx,
                             list) {
                ret = entry->hook.inode_getsecctx(inode, ctx, ctxlen);
                if (ret != LSM_RET_DEFAULT(inode_getsecctx))
                        break;
        }

        return ret;
}
EXPORT_SYMBOL(security_inode_getsecctx);

#ifdef CONFIG_WATCH_QUEUE
/*
 * Run the post_notification LSM hook with @w_cred, @cred and @n and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_post_notification(const struct cred *w_cred,
                               const struct cred *cred,
                               struct watch_notification *n)
{
        int rc;

        rc = call_int_hook(post_notification, 0, w_cred, cred, n);
        return rc;
}
#endif /* CONFIG_WATCH_QUEUE */

#ifdef CONFIG_KEY_NOTIFICATIONS
/*
 * Run the watch_key LSM hook on @key and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_watch_key(struct key *key)
{
        int rc = call_int_hook(watch_key, 0, key);

        return rc;
}
#endif

#ifdef CONFIG_SECURITY_NETWORK

/*
 * Run the unix_stream_connect LSM hook with @sock, @other and @newsk and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_unix_stream_connect(struct sock *sock, struct sock *other,
                                 struct sock *newsk)
{
        int rc;

        rc = call_int_hook(unix_stream_connect, 0, sock, other, newsk);
        return rc;
}
EXPORT_SYMBOL(security_unix_stream_connect);

/*
 * Run the unix_may_send LSM hook with @sock and @other and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_unix_may_send(struct socket *sock, struct socket *other)
{
        int rc;

        rc = call_int_hook(unix_may_send, 0, sock, other);
        return rc;
}
EXPORT_SYMBOL(security_unix_may_send);

/*
 * Run the socket_create LSM hook with @family, @type, @protocol and @kern
 * and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_create(int family, int type, int protocol, int kern)
{
        int rc;

        rc = call_int_hook(socket_create, 0, family, type, protocol, kern);
        return rc;
}

/*
 * Run the socket_post_create LSM hook with @sock plus the same @family,
 * @type, @protocol and @kern arguments, returning call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_post_create(struct socket *sock, int family,
                                int type, int protocol, int kern)
{
        int rc;

        rc = call_int_hook(socket_post_create, 0, sock, family, type,
                           protocol, kern);
        return rc;
}

/*
 * Run the socket_socketpair LSM hook with @socka and @sockb and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_socketpair(struct socket *socka, struct socket *sockb)
{
        int rc;

        rc = call_int_hook(socket_socketpair, 0, socka, sockb);
        return rc;
}
EXPORT_SYMBOL(security_socket_socketpair);

/*
 * Run the socket_bind LSM hook with @sock, @address and @addrlen and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_bind(struct socket *sock, struct sockaddr *address,
                         int addrlen)
{
        int rc;

        rc = call_int_hook(socket_bind, 0, sock, address, addrlen);
        return rc;
}

/*
 * Run the socket_connect LSM hook with @sock, @address and @addrlen and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_connect(struct socket *sock, struct sockaddr *address,
                            int addrlen)
{
        int rc;

        rc = call_int_hook(socket_connect, 0, sock, address, addrlen);
        return rc;
}

/*
 * Run the socket_listen LSM hook with @sock and @backlog and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_listen(struct socket *sock, int backlog)
{
        int rc;

        rc = call_int_hook(socket_listen, 0, sock, backlog);
        return rc;
}

/*
 * Run the socket_accept LSM hook with @sock and @newsock and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_accept(struct socket *sock, struct socket *newsock)
{
        int rc;

        rc = call_int_hook(socket_accept, 0, sock, newsock);
        return rc;
}

/*
 * Run the socket_sendmsg LSM hook with @sock, @msg and @size and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
        int rc;

        rc = call_int_hook(socket_sendmsg, 0, sock, msg, size);
        return rc;
}

/*
 * Run the socket_recvmsg LSM hook with @sock, @msg, @size and @flags and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                            int size, int flags)
{
        int rc;

        rc = call_int_hook(socket_recvmsg, 0, sock, msg, size, flags);
        return rc;
}

/*
 * Run the socket_getsockname LSM hook on @sock and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_getsockname(struct socket *sock)
{
        int rc = call_int_hook(socket_getsockname, 0, sock);

        return rc;
}

/*
 * Run the socket_getpeername LSM hook on @sock and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_getpeername(struct socket *sock)
{
        int rc = call_int_hook(socket_getpeername, 0, sock);

        return rc;
}

/*
 * Run the socket_getsockopt LSM hook with @sock, @level and @optname and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_getsockopt(struct socket *sock, int level, int optname)
{
        int rc;

        rc = call_int_hook(socket_getsockopt, 0, sock, level, optname);
        return rc;
}

/*
 * Run the socket_setsockopt LSM hook with @sock, @level and @optname and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_setsockopt(struct socket *sock, int level, int optname)
{
        int rc;

        rc = call_int_hook(socket_setsockopt, 0, sock, level, optname);
        return rc;
}

/*
 * Run the socket_shutdown LSM hook with @sock and @how and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_socket_shutdown(struct socket *sock, int how)
{
        int rc;

        rc = call_int_hook(socket_shutdown, 0, sock, how);
        return rc;
}

/*
 * Run the socket_sock_rcv_skb LSM hook (note the hook name carries a
 * "socket_" prefix that this wrapper's name does not) with @sk and @skb,
 * returning call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int rc;

        rc = call_int_hook(socket_sock_rcv_skb, 0, sk, skb);
        return rc;
}
EXPORT_SYMBOL(security_sock_rcv_skb);

/*
 * security_socket_getpeersec_stream() - query the peer security context.
 *
 * Each socket_getpeersec_stream hook is tried in list order with @sock,
 * @optval, @optlen and @len; the first return value that differs from
 * LSM_RET_DEFAULT(socket_getpeersec_stream) ends the walk and is returned.
 * Otherwise the default value is returned.
 */
int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
                                      sockptr_t optlen, unsigned int len)
{
        struct security_hook_list *entry;
        int ret = LSM_RET_DEFAULT(socket_getpeersec_stream);

        /* A security context is supplied by one module only. */
        hlist_for_each_entry(entry,
                             &security_hook_heads.socket_getpeersec_stream,
                             list) {
                ret = entry->hook.socket_getpeersec_stream(sock, optval,
                                                           optlen, len);
                if (ret != LSM_RET_DEFAULT(socket_getpeersec_stream))
                        break;
        }

        return ret;
}

/*
 * security_socket_getpeersec_dgram() - query the peer secid.
 *
 * Each socket_getpeersec_dgram hook is tried in list order with @sock, @skb
 * and @secid; the first return value that differs from
 * LSM_RET_DEFAULT(socket_getpeersec_dgram) ends the walk and is returned.
 * Otherwise the default value is returned.
 */
int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb,
                                     u32 *secid)
{
        struct security_hook_list *entry;
        int ret = LSM_RET_DEFAULT(socket_getpeersec_dgram);

        /* A security context is supplied by one module only. */
        hlist_for_each_entry(entry,
                             &security_hook_heads.socket_getpeersec_dgram,
                             list) {
                ret = entry->hook.socket_getpeersec_dgram(sock, skb, secid);
                if (ret != LSM_RET_DEFAULT(socket_getpeersec_dgram))
                        break;
        }

        return ret;
}
EXPORT_SYMBOL(security_socket_getpeersec_dgram);

/*
 * Run the sk_alloc_security LSM hook with @sk, @family and @priority and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
{
        int rc;

        rc = call_int_hook(sk_alloc_security, 0, sk, family, priority);
        return rc;
}

/*
 * Pass @sk to call_void_hook() for the sk_free_security LSM hook.
 * Nothing is reported back to the caller.
 */
void security_sk_free(struct sock *sk)
{
        call_void_hook(sk_free_security, sk);
}

/*
 * Pass @sk and @newsk to call_void_hook() for the sk_clone_security LSM
 * hook.  Nothing is reported back to the caller.
 */
void security_sk_clone(const struct sock *sk, struct sock *newsk)
{
        call_void_hook(sk_clone_security, sk, newsk);
}
EXPORT_SYMBOL(security_sk_clone);

/*
 * Invoke the sk_getsecid LSM hook with @sk and a pointer to
 * @flic->flowic_secid.
 * NOTE(review): presumably the hook stores the socket's secid into that
 * field; the hook implementation is not visible here - confirm.
 */
void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic)
{
        call_void_hook(sk_getsecid, sk, &flic->flowic_secid);
}
EXPORT_SYMBOL(security_sk_classify_flow);

/*
 * Pass @req and @flic to call_void_hook() for the req_classify_flow LSM
 * hook.  Nothing is reported back to the caller.
 */
void security_req_classify_flow(const struct request_sock *req,
                                struct flowi_common *flic)
{
        call_void_hook(req_classify_flow, req, flic);
}
EXPORT_SYMBOL(security_req_classify_flow);

/*
 * Pass @sk and @parent to call_void_hook() for the sock_graft LSM hook.
 * Nothing is reported back to the caller.
 */
void security_sock_graft(struct sock *sk, struct socket *parent)
{
        call_void_hook(sock_graft, sk, parent);
}
EXPORT_SYMBOL(security_sock_graft);

/*
 * Run the inet_conn_request LSM hook with @sk, @skb and @req and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_inet_conn_request(struct sock *sk, struct sk_buff *skb,
                               struct request_sock *req)
{
        int rc;

        rc = call_int_hook(inet_conn_request, 0, sk, skb, req);
        return rc;
}
EXPORT_SYMBOL(security_inet_conn_request);

/*
 * Pass @newsk and @req to call_void_hook() for the inet_csk_clone LSM hook.
 * Nothing is reported back to the caller.
 */
void security_inet_csk_clone(struct sock *newsk,
                        const struct request_sock *req)
{
        call_void_hook(inet_csk_clone, newsk, req);
}

/*
 * Pass @sk and @skb to call_void_hook() for the inet_conn_established LSM
 * hook.  Nothing is reported back to the caller.
 */
void security_inet_conn_established(struct sock *sk,
                        struct sk_buff *skb)
{
        call_void_hook(inet_conn_established, sk, skb);
}
EXPORT_SYMBOL(security_inet_conn_established);

/*
 * Run the secmark_relabel_packet LSM hook on @secid and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_secmark_relabel_packet(u32 secid)
{
        int rc = call_int_hook(secmark_relabel_packet, 0, secid);

        return rc;
}
EXPORT_SYMBOL(security_secmark_relabel_packet);

/*
 * Invoke the secmark_refcount_inc LSM hook through call_void_hook(); the
 * hook takes no arguments and nothing is reported back.
 */
void security_secmark_refcount_inc(void)
{
        call_void_hook(secmark_refcount_inc);
}
EXPORT_SYMBOL(security_secmark_refcount_inc);

/*
 * Invoke the secmark_refcount_dec LSM hook through call_void_hook(); the
 * hook takes no arguments and nothing is reported back.
 */
void security_secmark_refcount_dec(void)
{
        call_void_hook(secmark_refcount_dec);
}
EXPORT_SYMBOL(security_secmark_refcount_dec);

/*
 * Run the tun_dev_alloc_security LSM hook with the @security out-pointer
 * and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_tun_dev_alloc_security(void **security)
{
        int rc = call_int_hook(tun_dev_alloc_security, 0, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_alloc_security);

/*
 * Pass @security to call_void_hook() for the tun_dev_free_security LSM
 * hook.  Nothing is reported back to the caller.
 */
void security_tun_dev_free_security(void *security)
{
        call_void_hook(tun_dev_free_security, security);
}
EXPORT_SYMBOL(security_tun_dev_free_security);

/*
 * Run the argument-less tun_dev_create LSM hook and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_tun_dev_create(void)
{
        int rc = call_int_hook(tun_dev_create, 0);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_create);

/*
 * Run the tun_dev_attach_queue LSM hook on @security and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_tun_dev_attach_queue(void *security)
{
        int rc = call_int_hook(tun_dev_attach_queue, 0, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach_queue);

/*
 * Run the tun_dev_attach LSM hook with @sk and @security and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_tun_dev_attach(struct sock *sk, void *security)
{
        int rc;

        rc = call_int_hook(tun_dev_attach, 0, sk, security);
        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach);

/*
 * Run the tun_dev_open LSM hook on @security and return call_int_hook()'s
 * result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_tun_dev_open(void *security)
{
        int rc = call_int_hook(tun_dev_open, 0, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_open);

/*
 * Run the sctp_assoc_request LSM hook with @ep and @skb and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb)
{
        int rc;

        rc = call_int_hook(sctp_assoc_request, 0, ep, skb);
        return rc;
}
EXPORT_SYMBOL(security_sctp_assoc_request);

/*
 * Run the sctp_bind_connect LSM hook with @sk, @optname, @address and
 * @addrlen and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_sctp_bind_connect(struct sock *sk, int optname,
                               struct sockaddr *address, int addrlen)
{
        int rc;

        rc = call_int_hook(sctp_bind_connect, 0, sk, optname, address,
                           addrlen);
        return rc;
}
EXPORT_SYMBOL(security_sctp_bind_connect);

/*
 * Pass @ep, @sk and @newsk to call_void_hook() for the sctp_sk_clone LSM
 * hook.  Nothing is reported back to the caller.
 */
void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk,
                            struct sock *newsk)
{
        call_void_hook(sctp_sk_clone, ep, sk, newsk);
}
EXPORT_SYMBOL(security_sctp_sk_clone);

#endif        /* CONFIG_SECURITY_NETWORK */

#ifdef CONFIG_SECURITY_INFINIBAND

/*
 * Run the ib_pkey_access LSM hook with @sec, @subnet_prefix and @pkey and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey)
{
        int rc;

        rc = call_int_hook(ib_pkey_access, 0, sec, subnet_prefix, pkey);
        return rc;
}
EXPORT_SYMBOL(security_ib_pkey_access);

/*
 * Run the ib_endport_manage_subnet LSM hook with @sec, @dev_name and
 * @port_num and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_ib_endport_manage_subnet(void *sec, const char *dev_name,
                                      u8 port_num)
{
        int rc;

        rc = call_int_hook(ib_endport_manage_subnet, 0, sec, dev_name,
                           port_num);
        return rc;
}
EXPORT_SYMBOL(security_ib_endport_manage_subnet);

/*
 * Run the ib_alloc_security LSM hook with the @sec out-pointer and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_ib_alloc_security(void **sec)
{
        int rc = call_int_hook(ib_alloc_security, 0, sec);

        return rc;
}
EXPORT_SYMBOL(security_ib_alloc_security);

/*
 * Pass @sec to call_void_hook() for the ib_free_security LSM hook.
 * Nothing is reported back to the caller.
 */
void security_ib_free_security(void *sec)
{
        call_void_hook(ib_free_security, sec);
}
EXPORT_SYMBOL(security_ib_free_security);
#endif        /* CONFIG_SECURITY_INFINIBAND */

#ifdef CONFIG_SECURITY_NETWORK_XFRM

/*
 * Run the xfrm_policy_alloc_security LSM hook with @ctxp, @sec_ctx and @gfp
 * and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
                               struct xfrm_user_sec_ctx *sec_ctx,
                               gfp_t gfp)
{
        int rc;

        rc = call_int_hook(xfrm_policy_alloc_security, 0, ctxp, sec_ctx, gfp);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_policy_alloc);

/*
 * Run the xfrm_policy_clone_security LSM hook with @old_ctx and @new_ctxp
 * and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                               struct xfrm_sec_ctx **new_ctxp)
{
        int rc;

        rc = call_int_hook(xfrm_policy_clone_security, 0, old_ctx, new_ctxp);
        return rc;
}

/*
 * Pass @ctx to call_void_hook() for the xfrm_policy_free_security LSM hook.
 * Nothing is reported back to the caller.
 */
void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
{
        call_void_hook(xfrm_policy_free_security, ctx);
}
EXPORT_SYMBOL(security_xfrm_policy_free);

/*
 * Run the xfrm_policy_delete_security LSM hook on @ctx and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
{
        int rc = call_int_hook(xfrm_policy_delete_security, 0, ctx);

        return rc;
}

/*
 * Run the xfrm_state_alloc LSM hook with @x and @sec_ctx and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_xfrm_state_alloc(struct xfrm_state *x,
                              struct xfrm_user_sec_ctx *sec_ctx)
{
        int rc;

        rc = call_int_hook(xfrm_state_alloc, 0, x, sec_ctx);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_alloc);

/*
 * Run the xfrm_state_alloc_acquire LSM hook with @x, @polsec and @secid and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                      struct xfrm_sec_ctx *polsec, u32 secid)
{
        int rc;

        rc = call_int_hook(xfrm_state_alloc_acquire, 0, x, polsec, secid);
        return rc;
}

/*
 * Run the xfrm_state_delete_security LSM hook on @x and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_xfrm_state_delete(struct xfrm_state *x)
{
        int rc = call_int_hook(xfrm_state_delete_security, 0, x);

        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_delete);

/*
 * Pass @x to call_void_hook() for the xfrm_state_free_security LSM hook.
 * Nothing is reported back to the caller.
 */
void security_xfrm_state_free(struct xfrm_state *x)
{
        call_void_hook(xfrm_state_free_security, x);
}

/*
 * Run the xfrm_policy_lookup LSM hook with @ctx, @fl_secid and @dir and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir)
{
        int rc;

        rc = call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid, dir);
        return rc;
}

/*
 * security_xfrm_state_pol_flow_match() - match @x against @xp and @flic.
 *
 * Only the first entry on the xfrm_state_pol_flow_match hook list is
 * consulted and its answer is returned as-is.  With an empty list the
 * result is LSM_RET_DEFAULT(xfrm_state_pol_flow_match).
 */
int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
                                       struct xfrm_policy *xp,
                                       const struct flowi_common *flic)
{
        struct security_hook_list *entry;

        /*
         * The expected result is 0 or 1, which would be hard to combine
         * if several LSMs supplied this call.  Since currently SELinux is
         * the only one that does, the first LSM's judgment is used.
         *
         * For speed, the walk is cut short by hand after the first entry
         * instead of going through the generic macro.
         */
        hlist_for_each_entry(entry,
                             &security_hook_heads.xfrm_state_pol_flow_match,
                             list) {
                return entry->hook.xfrm_state_pol_flow_match(x, xp, flic);
        }

        return LSM_RET_DEFAULT(xfrm_state_pol_flow_match);
}

/*
 * Run the xfrm_decode_session LSM hook with @skb, @secid and a trailing 1
 * and return call_int_hook()'s result.
 * NOTE(review): the meaning of the trailing 1 is defined by the hook, which
 * is not visible here; security_skb_classify_flow() passes 0 in the same
 * position.  The first 0 is presumably call_int_hook()'s default verdict.
 */
int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
{
        int rc;

        rc = call_int_hook(xfrm_decode_session, 0, skb, secid, 1);
        return rc;
}

/*
 * Run the xfrm_decode_session LSM hook with @skb, a pointer to
 * @flic->flowic_secid and a trailing 0, then BUG_ON() any non-zero result.
 * NOTE(review): the meaning of the trailing 0 is defined by the hook, which
 * is not visible here; security_xfrm_decode_session() passes 1 instead.
 */
void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic)
{
        int rc;

        rc = call_int_hook(xfrm_decode_session, 0, skb,
                           &flic->flowic_secid, 0);
        BUG_ON(rc);
}
EXPORT_SYMBOL(security_skb_classify_flow);

#endif        /* CONFIG_SECURITY_NETWORK_XFRM */

#ifdef CONFIG_KEYS

/*
 * Run the key_alloc LSM hook with @key, @cred and @flags and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_key_alloc(struct key *key, const struct cred *cred,
                       unsigned long flags)
{
        int rc;

        rc = call_int_hook(key_alloc, 0, key, cred, flags);
        return rc;
}

/*
 * Pass @key to call_void_hook() for the key_free LSM hook.
 * Nothing is reported back to the caller.
 */
void security_key_free(struct key *key)
{
        call_void_hook(key_free, key);
}

/*
 * Run the key_permission LSM hook with @key_ref, @cred and @need_perm and
 * return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_key_permission(key_ref_t key_ref, const struct cred *cred,
                            enum key_need_perm need_perm)
{
        int rc;

        rc = call_int_hook(key_permission, 0, key_ref, cred, need_perm);
        return rc;
}

/*
 * Clear *@_buffer to NULL, then run the key_getsecurity LSM hook with @key
 * and @_buffer and return call_int_hook()'s result.  Because of the
 * up-front store, *@_buffer is NULL unless a hook overwrites it.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_key_getsecurity(struct key *key, char **_buffer)
{
        int rc;

        *_buffer = NULL;
        rc = call_int_hook(key_getsecurity, 0, key, _buffer);
        return rc;
}

#endif        /* CONFIG_KEYS */

#ifdef CONFIG_AUDIT

/*
 * Run the audit_rule_init LSM hook with @field, @op, @rulestr, @lsmrule and
 * @gfp and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule,
                             gfp_t gfp)
{
        int rc;

        rc = call_int_hook(audit_rule_init, 0, field, op, rulestr, lsmrule,
                           gfp);
        return rc;
}

/*
 * Run the audit_rule_known LSM hook on @krule and return call_int_hook()'s
 * result.
 * NOTE(review): 0 is presumably the default answer; macro not visible here.
 */
int security_audit_rule_known(struct audit_krule *krule)
{
        int rc = call_int_hook(audit_rule_known, 0, krule);

        return rc;
}

/*
 * Pass @lsmrule to call_void_hook() for the audit_rule_free LSM hook.
 * Nothing is reported back to the caller.
 */
void security_audit_rule_free(void *lsmrule)
{
        call_void_hook(audit_rule_free, lsmrule);
}

/*
 * Run the audit_rule_match LSM hook with @secid, @field, @op and @lsmrule
 * and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default answer; macro not visible here.
 */
int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
{
        int rc;

        rc = call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule);
        return rc;
}
#endif /* CONFIG_AUDIT */

#ifdef CONFIG_BPF_SYSCALL
/*
 * Run the bpf LSM hook with @cmd, @attr and @size and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        int rc;

        rc = call_int_hook(bpf, 0, cmd, attr, size);
        return rc;
}
/*
 * Run the bpf_map LSM hook with @map and @fmode and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        int rc;

        rc = call_int_hook(bpf_map, 0, map, fmode);
        return rc;
}
/*
 * Run the bpf_prog LSM hook on @prog and return call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_bpf_prog(struct bpf_prog *prog)
{
        int rc = call_int_hook(bpf_prog, 0, prog);

        return rc;
}
/*
 * Run the bpf_map_alloc_security LSM hook on @map and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_bpf_map_alloc(struct bpf_map *map)
{
        int rc = call_int_hook(bpf_map_alloc_security, 0, map);

        return rc;
}
/*
 * Run the bpf_prog_alloc_security LSM hook on @aux and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
{
        int rc = call_int_hook(bpf_prog_alloc_security, 0, aux);

        return rc;
}
/*
 * Pass @map to call_void_hook() for the bpf_map_free_security LSM hook.
 * Nothing is reported back to the caller.
 */
void security_bpf_map_free(struct bpf_map *map)
{
        call_void_hook(bpf_map_free_security, map);
}
/*
 * Pass @aux to call_void_hook() for the bpf_prog_free_security LSM hook.
 * Nothing is reported back to the caller.
 */
void security_bpf_prog_free(struct bpf_prog_aux *aux)
{
        call_void_hook(bpf_prog_free_security, aux);
}
#endif /* CONFIG_BPF_SYSCALL */

/*
 * Run the locked_down LSM hook for the lockdown reason @what and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_locked_down(enum lockdown_reason what)
{
        int rc = call_int_hook(locked_down, 0, what);

        return rc;
}
EXPORT_SYMBOL(security_locked_down);

#ifdef CONFIG_PERF_EVENTS
/*
 * Run the perf_event_open LSM hook with @attr and @type and return
 * call_int_hook()'s result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_perf_event_open(struct perf_event_attr *attr, int type)
{
        int rc;

        rc = call_int_hook(perf_event_open, 0, attr, type);
        return rc;
}

/*
 * Run the perf_event_alloc LSM hook on @event and return call_int_hook()'s
 * result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_perf_event_alloc(struct perf_event *event)
{
        int rc = call_int_hook(perf_event_alloc, 0, event);

        return rc;
}

/*
 * Pass @event to call_void_hook() for the perf_event_free LSM hook.
 * Nothing is reported back to the caller.
 */
void security_perf_event_free(struct perf_event *event)
{
        call_void_hook(perf_event_free, event);
}

/*
 * Run the perf_event_read LSM hook on @event and return call_int_hook()'s
 * result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_perf_event_read(struct perf_event *event)
{
        int rc = call_int_hook(perf_event_read, 0, event);

        return rc;
}

/*
 * Run the perf_event_write LSM hook on @event and return call_int_hook()'s
 * result.
 * NOTE(review): 0 is presumably the default verdict; macro not visible here.
 */
int security_perf_event_write(struct perf_event *event)
{
        int rc = call_int_hook(perf_event_write, 0, event);

        return rc;
}
#endif /* CONFIG_PERF_EVENTS */

prctl$PR_SET_TIMERSLACK(0x26, 0x0)

prctl$PR_SET_TIMERSLACK(0x1d, 0xfffffffffffffffa)
syz_mount_image$vfat(0x0, 0x0, 0x0, 0x0, &(0x7f0000001240), 0x0, 0x0)

prctl$PR_SET_TIMERSLACK(0x27, 0x0)

prctl$PR_SET_TIMERSLACK(0x1d, 0x0)

prctl$PR_SET_TIMERSLACK(0x29, 0x0)

cap_bprm_creds_from_file---of 93
cap_bprm_creds_from_file.cold---of 2
cap_capable---of 9
cap_capget---of 18
cap_capset---of 24
cap_convert_nscap---of 14
cap_inode_getsecurity---of 34
cap_inode_killpriv---of 2
cap_inode_need_killpriv---of 1
cap_inode_removexattr---of 7
cap_inode_setxattr---of 5
cap_mmap_addr---of 11
cap_mmap_file---of 1
cap_ptrace_access_check---of 27
cap_ptrace_traceme---of 24
cap_safe_nice---of 28
cap_settime---of 1
cap_task_fix_setuid---of 30
cap_task_prctl5%of 45
cap_task_setioprio---of 1
cap_task_setnice---of 1
cap_task_setscheduler---of 1
cap_vm_enough_memory---of 9
get_vfs_caps_from_disk---of 20

__traceiter_x86_fpu_after_restore---of 4
__traceiter_x86_fpu_after_save---of 4
__traceiter_x86_fpu_before_restore---of 4
__traceiter_x86_fpu_before_save---of 4
__traceiter_x86_fpu_copy_dst---of 4
__traceiter_x86_fpu_copy_src---of 4
__traceiter_x86_fpu_dropped---of 4
__traceiter_x86_fpu_init_state---of 4
__traceiter_x86_fpu_regs_activated---of 4
__traceiter_x86_fpu_regs_deactivated---of 4
__traceiter_x86_fpu_xstate_check_failed---of 4
copy_fpregs_to_fpstate---of 10
copy_init_fpstate_to_fpregs---of 6
copy_kernel_to_fpregs---of 8
fpregs_assert_state_consistent75%of 4
fpregs_mark_activate---of 10
fpstate_init---of 4
fpu__clear---of 12
fpu__clear_all---of 1
fpu__clear_user_states---of 1
fpu__copy---of 26
fpu__drop---of 20
fpu__exception_code---of 9
fpu__prepare_read---of 2
fpu__prepare_write---of 3
fpu__save---of 25
irq_fpu_usable---of 7
kernel_fpu_begin_mask---of 12
kernel_fpu_end---of 3
perf_trace_x86_fpu---of 8
switch_fpu_return---of 15
trace_event_raw_event_x86_fpu---of 12
trace_raw_output_x86_fpu---of 4

__do_compat_sys_getrusage---of 4
__do_compat_sys_sysinfo---of 7
__do_sys_geteuid---of 1
__do_sys_getgid---of 1
__do_sys_getpgrp---of 1
__do_sys_getppid---of 18
__do_sys_getpriority---of 63
__do_sys_getrusage---of 4
__do_sys_newuname---of 6
__do_sys_olduname---of 7
__do_sys_prctl22%of 66
__do_sys_prlimit64---of 65
__do_sys_setpgid---of 34
__do_sys_setpriority---of 65
__do_sys_sysinfo---of 2
__do_sys_uname---of 7
__ia32_compat_sys_getrlimit---of 6
__ia32_compat_sys_getrusage---of 1
__ia32_compat_sys_old_getrlimit---of 4
__ia32_compat_sys_setrlimit---of 7
__ia32_compat_sys_sysinfo---of 1
__ia32_compat_sys_times---of 6
__ia32_sys_getcpu---of 6
__ia32_sys_getegid---of 1
__ia32_sys_gethostname---of 4
__ia32_sys_gethostname.cold---of 1
__ia32_sys_getpgid---of 1
__ia32_sys_getpid---of 1
__ia32_sys_getpriority---of 1
__ia32_sys_getresgid---of 4
__ia32_sys_getresuid---of 5
__ia32_sys_getrlimit---of 4
__ia32_sys_getrusage---of 1
__ia32_sys_getsid---of 20
__ia32_sys_newuname---of 1
__ia32_sys_old_getrlimit---of 7
__ia32_sys_olduname---of 1
__ia32_sys_prctl---of 1
__ia32_sys_prlimit64---of 1
__ia32_sys_setdomainname---of 5
__ia32_sys_setfsgid---of 1
__ia32_sys_setfsuid---of 1
__ia32_sys_setgid---of 1
__ia32_sys_sethostname---of 5
__ia32_sys_setpgid---of 1
__ia32_sys_setpriority---of 1
__ia32_sys_setregid---of 1
__ia32_sys_setresgid---of 1
__ia32_sys_setresuid---of 1
__ia32_sys_setreuid---of 1
__ia32_sys_setrlimit---of 3
__ia32_sys_setuid---of 1
__ia32_sys_sysinfo---of 1
__ia32_sys_times---of 6
__ia32_sys_umask---of 1
__ia32_sys_uname---of 1
__sys_setfsgid---of 14
__sys_setfsuid---of 14
__sys_setgid---of 13
__sys_setregid---of 26
__sys_setresgid---of 39
__sys_setresuid---of 40
__sys_setreuid---of 31
__sys_setuid---of 13
__x64_sys_getcpu---of 6
__x64_sys_gethostname---of 4
__x64_sys_gethostname.cold---of 1
__x64_sys_getpgid---of 1
__x64_sys_getpriority---of 1
__x64_sys_getresgid---of 4
__x64_sys_getresuid---of 5
__x64_sys_getrlimit---of 4
__x64_sys_getrusage---of 1
__x64_sys_getsid---of 20
__x64_sys_gettid---of 1
__x64_sys_getuid---of 1
__x64_sys_newuname---of 1
__x64_sys_old_getrlimit---of 7
__x64_sys_olduname---of 1
__x64_sys_prctl100%of 1
__x64_sys_prlimit64---of 1
__x64_sys_setdomainname---of 5
__x64_sys_setfsgid---of 1
__x64_sys_setfsuid---of 1
__x64_sys_setgid---of 1
__x64_sys_sethostname---of 5
__x64_sys_setpgid---of 1
__x64_sys_setpriority---of 1
__x64_sys_setregid---of 1
__x64_sys_setresgid---of 1
__x64_sys_setresuid---of 1
__x64_sys_setreuid---of 1
__x64_sys_setrlimit---of 3
__x64_sys_setsid---of 1
__x64_sys_setuid---of 1
__x64_sys_sysinfo---of 1
__x64_sys_times---of 6
__x64_sys_umask---of 1
__x64_sys_uname---of 1
do_getpgid---of 19
do_prlimit---of 21
do_sys_times---of 1
do_sysinfo.isra.0---of 7
getrusage---of 39
ksys_setsid---of 10
override_release.part.0---of 9
prctl_set_auxv---of 4
prctl_set_mm---of 68
propagate_has_child_subreaper---of 5
set_one_prio---of 14
set_user---of 7

__traceiter_irq_disable---of 4
__traceiter_irq_enable---of 4
perf_trace_preemptirq_template---of 6
trace_event_raw_event_preemptirq_template---of 10
trace_hardirqs_off40%of 10
trace_hardirqs_off_caller---of 10
trace_hardirqs_off_finish55%of 11
trace_hardirqs_on50%of 12
trace_hardirqs_on_caller---of 12
trace_hardirqs_on_prepare59%of 12
trace_raw_output_preemptirq_template---of 4

call_blocking_lsm_notifier---of 1
fsnotify_perm.part.0---of 21
get_order---of 1
inode_free_by_rcu---of 1
lsm_append.constprop.0---of 10
lsm_inode_alloc---of 5
register_blocking_lsm_notifier---of 1
security_add_mnt_opt---of 4
security_audit_rule_free---of 2
security_audit_rule_init---of 4
security_audit_rule_known---of 4
security_audit_rule_match---of 4
security_binder_set_context_mgr---of 4
security_binder_transaction---of 4
security_binder_transfer_binder---of 4
security_binder_transfer_file---of 4
security_bprm_check---of 5
security_bprm_committed_creds---of 2
security_bprm_committing_creds---of 2
security_bprm_creds_for_exec---of 4
security_bprm_creds_from_file---of 4
security_capable---of 4
security_capget---of 4
security_capset---of 4
security_cred_alloc_blank---of 9
security_cred_free---of 5
security_cred_getsecid---of 2
security_d_instantiate---of 4
security_dentry_create_files_as---of 4
security_dentry_init_security---of 4
security_file_alloc---of 9
security_file_fcntl---of 4
security_file_free---of 4
security_file_ioctl---of 4
security_file_ioctl_compat---of 4
security_file_lock---of 4
security_file_mprotect---of 5
security_file_open---of 6
security_file_permission---of 7
security_file_receive---of 4
security_file_send_sigiotask---of 4
security_file_set_fowner---of 2
security_free_mnt_opts---of 5
security_fs_context_dup---of 4
security_fs_context_parse_param---of 5
security_getprocattr---of 6
security_inet_conn_established---of 2
security_inet_conn_request---of 4
security_inet_csk_clone---of 2
security_inode_alloc---of 9
security_inode_copy_up---of 4
security_inode_copy_up_xattr---of 5
security_inode_create---of 6
security_inode_follow_link---of 6
security_inode_free---of 4
security_inode_getattr---of 5
security_inode_getsecctx---of 4
security_inode_getsecid---of 2
security_inode_getsecurity---of 6
security_inode_getxattr---of 5
security_inode_init_security---of 15
security_inode_invalidate_secctx---of 2
security_inode_killpriv---of 4
security_inode_link---of 5
security_inode_listsecurity---of 5
security_inode_listxattr---of 5
security_inode_mkdir---of 6
security_inode_mknod---of 6
security_inode_need_killpriv---of 4
security_inode_notifysecctx---of 4
security_inode_permission---of 5
security_inode_post_setxattr---of 3
security_inode_readlink---of 5
security_inode_removexattr---of 8
security_inode_rename---of 12
security_inode_rmdir---of 5
security_inode_setattr---of 6
security_inode_setsecctx---of 4
security_inode_setsecurity---of 5
security_inode_setxattr---of 8
security_inode_symlink---of 5
security_inode_unlink---of 5
security_ipc_getsecid---of 2
security_ipc_permission---of 5
security_ismaclabel---of 4
security_kernel_act_as---of 4
security_kernel_create_files_as---of 4
security_kernel_load_data---of 5
security_kernel_module_request---of 5
security_kernel_post_load_data---of 5
security_kernel_post_read_file---of 5
security_kernel_read_file---of 5
security_kernfs_init_security---of 4
security_key_alloc---of 4
security_key_free---of 2
security_key_getsecurity---of 4
security_key_permission---of 4
security_locked_down---of 4
security_mmap_addr---of 4
security_mmap_file---of 10
security_move_mount---of 4
security_msg_msg_alloc---of 11
security_msg_msg_free---of 3
security_msg_queue_alloc---of 11
security_msg_queue_associate---of 4
security_msg_queue_free---of 3
security_msg_queue_msgctl---of 4
security_msg_queue_msgrcv---of 4
security_msg_queue_msgsnd---of 4
security_netlink_send---of 4
security_old_inode_init_security---of 5
security_path_notify---of 4
security_perf_event_alloc---of 4
security_perf_event_free---of 2
security_perf_event_open---of 4
security_perf_event_read---of 4
security_perf_event_write---of 4
security_prepare_creds---of 9
security_ptrace_access_check---of 4
security_ptrace_traceme---of 4
security_quota_on---of 4
security_quotactl---of 4
security_release_secctx---of 2
security_req_classify_flow---of 2
security_sb_alloc---of 4
security_sb_clone_mnt_opts---of 4
security_sb_eat_lsm_opts---of 4
security_sb_free---of 2
security_sb_kern_mount---of 4
security_sb_mount---of 4
security_sb_pivotroot---of 4
security_sb_remount---of 4
security_sb_set_mnt_opts---of 5
security_sb_show_options---of 4
security_sb_statfs---of 4
security_sb_umount---of 4
security_sctp_assoc_request---of 4
security_sctp_bind_connect---of 4
security_sctp_sk_clone---of 2
security_secctx_to_secid---of 4
security_secid_to_secctx---of 4
security_secmark_refcount_dec---of 2
security_secmark_refcount_inc---of 2
security_secmark_relabel_packet---of 4
security_sem_alloc---of 11
security_sem_associate---of 4
security_sem_free---of 3
security_sem_semctl---of 4
security_sem_semop---of 4
security_setprocattr---of 6
security_settime64---of 4
security_shm_alloc---of 11
security_shm_associate---of 4
security_shm_free---of 3
security_shm_shmat---of 4
security_shm_shmctl---of 4
security_sk_alloc---of 4
security_sk_classify_flow---of 3
security_sk_clone---of 2
security_sk_free---of 2
security_sock_graft---of 2
security_sock_rcv_skb---of 4
security_socket_accept---of 4
security_socket_bind---of 4
security_socket_connect---of 4
security_socket_create---of 4
security_socket_getpeername---of 4
security_socket_getpeersec_dgram---of 4
security_socket_getpeersec_stream---of 4
security_socket_getsockname---of 4
security_socket_getsockopt---of 4
security_socket_listen---of 4
security_socket_post_create---of 4
security_socket_recvmsg---of 4
security_socket_sendmsg---of 4
security_socket_setsockopt---of 4
security_socket_shutdown---of 4
security_socket_socketpair---of 4
security_syslog---of 4
security_task_alloc---of 11
security_task_fix_setgid---of 4
security_task_fix_setuid---of 4
security_task_free---of 3
security_task_getioprio---of 4
security_task_getpgid---of 4
security_task_getscheduler---of 4
security_task_getsecid---of 2
security_task_getsid---of 4
security_task_kill---of 4
security_task_movememory---of 4
security_task_prctl80%of 5
security_task_prlimit---of 4
security_task_setioprio---of 4
security_task_setnice---of 4
security_task_setpgid---of 4
security_task_setrlimit---of 4
security_task_setscheduler---of 4
security_task_to_inode---of 2
security_transfer_creds---of 2
security_tun_dev_alloc_security---of 4
security_tun_dev_attach---of 4
security_tun_dev_attach_queue---of 4
security_tun_dev_create---of 4
security_tun_dev_free_security---of 2
security_tun_dev_open---of 4
security_unix_may_send---of 4
security_unix_stream_connect---of 4
security_vm_enough_memory_mm---of 4
unregister_blocking_lsm_notifier---of 1